def add_aux_corpus(self):
    """Augment the training data with an auxiliary parallel corpus.

    Pipeline:
      1. Collect rare words from the current src/trg vocabularies and
         scrape parallel sentences containing them.
      2. Build vocabularies over the scraped corpus and drop sentence
         pairs that contain words which are rare even in the auxiliary
         corpus and absent from the main vocabularies.
      3. Append the surviving sentences to the training data files.

    Side effects: writes ``aux_taiyaku.tsv`` and the per-language
    sentence files under ``./../data/``.
    """

    def _write_lines(path, lines):
        # Write one entry per line; `with` guarantees the handle is closed
        # even if a write fails (the original leaked the file object).
        with open(path, 'w') as f:
            for s in lines:
                f.write(s + '\n')

    src_rare_vocab = rare_vocab_create(self.src_vocab)
    trg_rare_vocab = rare_vocab_create(self.trg_vocab)

    # Scrape parallel sentences that contain the rare words.
    aux_taiyaku = []
    for src in src_rare_vocab:
        aux_taiyaku.append(scraping(src, "en"))
    for trg in trg_rare_vocab:
        aux_taiyaku.append(scraping(trg, "ja"))

    aux_corpus = './../data/aux_taiyaku.tsv'
    _write_lines(aux_corpus, aux_taiyaku)

    PrepareData.extract_each_sentence(aux_corpus, './../data/ja_aux_sentences.tsv', 'ja')
    PrepareData.extract_each_sentence(aux_corpus, './../data/en_aux_sentences.tsv', 'en')

    # Build vocabularies over the auxiliary corpus as well.
    aux_src_vocab = Tokenizer.en_vocab_create('./../data/en_aux_sentences.tsv')
    aux_trg_vocab = Tokenizer.ja_vocab_create('./../data/ja_aux_sentences.tsv')

    # Words that are rare (freq < threshold) even in the auxiliary corpus
    # AND not covered by the main vocabularies are useless — sentences
    # containing them get dropped.
    # NOTE: the original mutated these lists with `del lst[i]` while
    # iterating `range(len(lst))`, which skips elements and raises
    # IndexError once items are removed; filtering comprehensions are the
    # correct equivalent.
    no_use_src_vocab = [w for w in rare_vocab_create(aux_src_vocab)
                        if w not in self.src_vocab]
    no_use_trg_vocab = [w for w in rare_vocab_create(aux_trg_vocab)
                        if w not in self.trg_vocab]
    no_use_vocab = no_use_src_vocab + no_use_trg_vocab

    # Keep only sentence pairs free of useless words (substring test, as
    # in the original). Building a new list avoids the delete-while-
    # iterating IndexError of the original loop.
    aux_taiyaku = [pair for pair in aux_taiyaku
                   if not any(word in pair for word in no_use_vocab)]

    _write_lines(aux_corpus, aux_taiyaku)

    # Append the auxiliary corpus to the training data.
    PrepareData.extract_each_sentence(aux_corpus, './../data/ja_sentences.tsv', 'ja')
    PrepareData.extract_each_sentence(aux_corpus, './../data/en_sentences.tsv', 'en')
def __init__(self):
    """Initialize source (English) and target (Japanese) vocabularies.

    Both vocabularies are built via the project's ``Tokenizer`` helpers;
    no arguments are passed, so the tokenizer's defaults apply.
    """
    # Source side: English vocabulary.
    self.src_vocab = Tokenizer.en_vocab_create()
    # Target side: Japanese vocabulary.
    self.trg_vocab = Tokenizer.ja_vocab_create()