Example #1
    def tokenize_lang(self, path):
        # 0 for ZH, 1 for EN, 2 for EOS
        assert os.path.exists(path)

        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                line = line.strip()
                line = line.replace("  ", " ")
                words = line.split() + ['<eos>']
                tokens += len(words)

        with open(path, 'r') as f:
            # one label per token: 0 = Chinese, 1 = English, 2 = <eos>
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                line = line.strip()
                line = line.replace("  ", " ")

                for word in line.split():
                    if texthelper.is_contain_chinese_word(word):
                        ids[token] = 0
                    else:
                        ids[token] = 1
                    token += 1

                # add EOS (2)
                ids[token] = 2
                token += 1

        return ids
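
Note: `texthelper.is_contain_chinese_word` is not shown on this page. Below is a minimal sketch of what such a helper might look like, assuming it only needs to flag tokens that contain CJK Unified Ideographs; the actual implementation in `texthelper` may differ.

import re

# Hypothetical sketch; the real texthelper.is_contain_chinese_word may differ.
_CHINESE_CHAR_RE = re.compile(u'[\u4e00-\u9fff]')

def is_contain_chinese_word(word):
    """Return True if the token contains at least one CJK Unified Ideograph."""
    return _CHINESE_CHAR_RE.search(word) is not None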
Example #2
    def tokenize(self, path, save, randomize=False):
        """Tokenizes a text file."""
        assert os.path.exists(path)

        # Add words to the dictionary
        self.dictionary.add_word("<oov>")

        data = []
        with open(path, 'r') as f:
            for line in f:
                data.append(line)

        # Optionally shuffle the lines before building the vocabulary
        if randomize:
            # assumes `random` is imported at module level
            random.shuffle(data)

        # First pass: count tokens and (optionally) grow the dictionary
        tokens = 0
        for line in data:
            line = line.strip().lower()
            line = line.replace("  ", " ")
            words = line.split() + ['<eos>']
            tokens += len(words)
            if save:
                for word in words:
                    self.dictionary.add_word(word)

        # Second pass: map every token to a word id and a language id
        ids = torch.LongTensor(tokens)
        langs = torch.LongTensor(tokens)
        token = 0
        for line in data:
            line = line.strip().lower()
            line = line.replace("  ", " ")
            words = line.split() + ['<eos>']
            for word in words:
                if word not in self.dictionary.word2idx:
                    ids[token] = self.dictionary.word2idx["<oov>"]
                else:
                    ids[token] = self.dictionary.word2idx[word]

                # 1 = token contains Chinese characters, 0 = otherwise
                if texthelper.is_contain_chinese_word(word):
                    langs[token] = 1
                else:
                    langs[token] = 0

                token += 1

        return ids, langs
Example #3
    def create_language_mask(self):
        # One entry per vocabulary word: 1 in exactly one of the three masks
        # (Chinese, English, or other/<eos>), 0 in the remaining two.
        for word in self.dictionary.idx2word:
            if texthelper.is_contain_chinese_word(word):
                self.zh_mask.append(1)
                self.en_mask.append(0)
                self.other_mask.append(0)
            elif word == "<eos>":
                self.zh_mask.append(0)
                self.en_mask.append(0)
                self.other_mask.append(1)
            else:
                self.zh_mask.append(0)
                self.en_mask.append(1)
                self.other_mask.append(0)
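
These per-vocabulary masks are typically converted to tensors so they can gate an output distribution. The sketch below is a hedged usage example, not part of the snippet: `restrict_logits_to_chinese` and the masking strategy are assumptions, and only the mask lists themselves come from `create_language_mask`.

import torch

# Hypothetical usage sketch (not part of the snippets above): turn the
# Python-list masks produced by create_language_mask() into a boolean tensor
# and use it to suppress non-Chinese words in a logits vector before softmax.
def restrict_logits_to_chinese(logits, zh_mask_list):
    zh_mask = torch.tensor(zh_mask_list, dtype=torch.bool, device=logits.device)
    # Non-Chinese entries get -inf so softmax assigns them zero probability.
    return logits.masked_fill(~zh_mask, float('-inf'))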
Example #4
    def tokenize(self, path, save):
        """Tokenizes a text file."""
        assert os.path.exists(path)

        # Add words to the dictionary
        self.dictionary.add_word("<oov>")

        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                line = line.strip()
                line = line.replace("  ", " ")
                words = line.split() + ['<eos>']
                tokens += len(words)
                if save:
                    for word in words:
                        self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            langs = torch.LongTensor(tokens)
            token = 0
            for line in f:
                line = line.strip()
                line = line.replace("  ", " ")
                words = line.split() + ['<eos>']
                for word in words:
                    if word not in self.dictionary.word2idx:
                        ids[token] = self.dictionary.word2idx["<oov>"]
                    else:
                        ids[token] = self.dictionary.word2idx[word]

                    if texthelper.is_contain_chinese_word(word):
                        langs[token] = 1
                    else:
                        langs[token] = 0

                    token += 1

        return ids, langs
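
The `(ids, langs)` pair returned here is usually reshaped into batches before training. The helper below is a minimal sketch in the style of the common word-level language-model recipe; `batchify`, the batch size of 20, and the call site are assumptions and not part of the corpus class shown above.

import torch

# Hypothetical helper, modeled on the usual word-level language-model recipe;
# it is not part of the corpus class shown above.
def batchify(data, batch_size):
    # Drop leftover tokens that would not fill a complete column.
    nbatch = data.size(0) // batch_size
    data = data.narrow(0, 0, nbatch * batch_size)
    # Reshape into (nbatch, batch_size): each column is a contiguous text stream.
    return data.view(batch_size, -1).t().contiguous()

# Example usage, assuming the tokenize() method above and a training file path:
# ids, langs = corpus.tokenize("train.txt", save=True)
# train_ids = batchify(ids, 20)
# train_langs = batchify(langs, 20)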
Example #5
def evaluate_test(data_source, type_evaluation="val"):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    # hidden_lang = model.init_hidden(eval_batch_size)
    criterion = nn.CrossEntropyLoss()

    total_en_en_loss, total_en_zh_loss, total_zh_en_loss, total_zh_zh_loss = 0, 0, 0, 0  # test
    total_data_en_en, total_data_en_zh, total_data_zh_en, total_data_zh_zh = 0, 0, 0, 0

    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, evaluation=True)

        output, hidden = model(data, hidden)

        if type_evaluation == "test":
            # print("data>", data.squeeze().size())
            # print("targets>", targets.size())
            source_string, target_string = [], []
            source_lang, target_lang = [], []

            for j in range(len(data.squeeze())):
                word = corpus.dictionary.idx2word[data.squeeze()[j].item()]
                source_string.append(word)
                # True means the token contains at least one Chinese character
                if is_contain_chinese_word(word):
                    source_lang.append(True)
                else:
                    source_lang.append(False)

            for j in range(len(targets)):
                word = corpus.dictionary.idx2word[targets[j].item()]
                target_string.append(word)
                if is_contain_chinese_word(word):
                    target_lang.append(True)
                else:
                    target_lang.append(False)

            # print("source:", source_string)
            # print("target", target_string)

            output_en_en, output_en_zh, output_zh_en, output_zh_zh = [], [], [], []
            target_en_en, target_en_zh, target_zh_en, target_zh_zh = [], [], [], []

            # print("output:", output.size())

            batch_size = output.size(1)
            seq_len = output.size(0)

            for j in range(len(source_lang)):
                source_word = corpus.dictionary.idx2word[data.squeeze()[j].item()]
                target_word = corpus.dictionary.idx2word[targets[j].item()]

                if source_word == "<eos>" or target_word == "<eos>":
                    # print("skip <eos>")
                    continue

                # Bucket each position by (source language, target language)
                if source_lang[j] and target_lang[j]:
                    output_en_en.append(output[j])
                    target_en_en.append(targets[j])
                elif source_lang[j] and not target_lang[j]:
                    output_en_zh.append(output[j])
                    target_en_zh.append(targets[j])
                elif not source_lang[j] and target_lang[j]:
                    output_zh_en.append(output[j])
                    target_zh_en.append(targets[j])
                else:
                    output_zh_zh.append(output[j])
                    target_zh_zh.append(targets[j])

            total_data_en_en += len(output_en_en)
            total_data_en_zh += len(output_en_zh)
            total_data_zh_en += len(output_zh_en)
            total_data_zh_zh += len(output_zh_zh)

            if len(output_en_en) > 0:
                output_en_en = torch.stack(output_en_en)
                target_en_en = torch.LongTensor(target_en_en)
                if args.cuda:
                    output_en_en = output_en_en.cuda()
                    target_en_en = target_en_en.cuda()
                output_en_en_flat = output_en_en.view(-1, ntokens)
                total_en_en_loss += len(output_en_en) * criterion(
                    output_en_en_flat, target_en_en).data
            if len(output_en_zh) > 0:
                output_en_zh = torch.stack(output_en_zh)
                target_en_zh = torch.LongTensor(target_en_zh)
                if args.cuda:
                    output_en_zh = output_en_zh.cuda()
                    target_en_zh = target_en_zh.cuda()
                output_en_zh_flat = output_en_zh.view(-1, ntokens)
                total_en_zh_loss += len(output_en_zh) * criterion(
                    output_en_zh_flat, target_en_zh).data
            if len(output_zh_en) > 0:
                output_zh_en = torch.stack(output_zh_en)
                target_zh_en = torch.LongTensor(target_zh_en)
                if args.cuda:
                    output_zh_en = output_zh_en.cuda()
                    target_zh_en = target_zh_en.cuda()
                output_zh_en_flat = output_zh_en.view(-1, ntokens)
                total_zh_en_loss += len(output_zh_en) * criterion(
                    output_zh_en_flat, target_zh_en).data
            if len(output_zh_zh) > 0:
                output_zh_zh = torch.stack(output_zh_zh)
                target_zh_zh = torch.LongTensor(target_zh_zh)
                if args.cuda:
                    output_zh_zh = output_zh_zh.cuda()
                    target_zh_zh = target_zh_zh.cuda()
                output_zh_zh_flat = output_zh_zh.view(-1, ntokens)
                total_zh_zh_loss += len(output_zh_zh) * criterion(
                    output_zh_zh_flat, target_zh_zh).data

        # Overall loss and hidden-state detach, shared by both evaluation types
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
        # hidden_lang = repackage_hidden(hidden_lang)

    if type_evaluation == "test":
        print(total_data_en_en, total_data_en_zh, total_data_zh_en,
              total_data_zh_zh)
        print(total_en_en_loss.item(), total_en_zh_loss.item(),
              total_zh_en_loss.item(), total_zh_zh_loss.item())
        return (total_loss.item() / len(data_source),
                total_en_en_loss.item() / total_data_en_en,
                total_en_zh_loss.item() / total_data_en_zh,
                total_zh_en_loss.item() / total_data_zh_en,
                total_zh_zh_loss.item() / total_data_zh_zh,
                (total_en_zh_loss.item() + total_zh_en_loss.item()) / (total_data_en_zh + total_data_zh_en))
    else:
        return total_loss.item() / len(data_source)
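
The values returned for the "test" case are average cross-entropy losses per language-pair bucket, so the usual perplexities are obtained by exponentiating them. A small usage sketch follows; `test_data` and this call site are assumptions for illustration and do not appear on this page.

import math

# Hypothetical call site: evaluate_test() is defined above, but `test_data`
# and this usage are assumptions for illustration only.
val_loss, en_en, en_zh, zh_en, zh_zh, mixed = evaluate_test(test_data, "test")
print("overall ppl: {:.2f}".format(math.exp(val_loss)))
# the last value combines the two mixed-language buckets (en_zh and zh_en)
print("mixed-pair ppl: {:.2f}".format(math.exp(mixed)))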
Example #6
    else:
        data = Variable(source[i:i + seq_len])
    target = Variable(source[i + 1:i + 1 + seq_len].view(-1))
    return data, target


# print the result
word2idx = corpus.dictionary.word2idx
idx2word = corpus.dictionary.idx2word

num_word = len(corpus.dictionary.idx2word)
english_word = {}
chinese_word = {}
for j, word in enumerate(idx2word):
    if is_contain_chinese_word(word):
        chinese_word[j] = True
    else:
        english_word[j] = True


def evaluate(data_source, type_evaluation="val"):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(10)
    # hidden_lang = model.init_hidden(eval_batch_size)
    criterion = nn.CrossEntropyLoss()

    # if type_evaluation == "test":