def __init__(self, source_with_start=False, source_with_end=False, source_with_unk=False,
             target_with_start=False, target_with_end=False, target_with_unk=False,
             same_length=False):
    """Create an empty paired corpus with independent source and target sides.

    The three flags for each side are stored on the instance and are also
    forwarded positionally, in the order (with_start, with_end, with_unk),
    to a fresh ``SequenceCorpus`` for that side.

    Args:
        source_with_start: First flag handed to the source ``SequenceCorpus``.
        source_with_end: Second flag handed to the source ``SequenceCorpus``.
        source_with_unk: Third flag handed to the source ``SequenceCorpus``.
        target_with_start: First flag handed to the target ``SequenceCorpus``.
        target_with_end: Second flag handed to the target ``SequenceCorpus``.
        target_with_unk: Third flag handed to the target ``SequenceCorpus``.
        same_length: Stored as ``self.same_length``; not otherwise used here.

    NOTE(review): the flag names suggest they toggle start / end / unknown
    marker tokens -- confirm against ``SequenceCorpus``.
    """
    # Bundle the per-side options once so the stored attributes and the
    # SequenceCorpus arguments are guaranteed to be the same values.
    source_flags = (source_with_start, source_with_end, source_with_unk)
    target_flags = (target_with_start, target_with_end, target_with_unk)

    (self.source_with_start,
     self.source_with_end,
     self.source_with_unk) = source_flags
    (self.target_with_start,
     self.target_with_end,
     self.target_with_unk) = target_flags

    # Source corpus is constructed before the target corpus.
    self.source_corpus = SequenceCorpus(*source_flags)
    self.target_corpus = SequenceCorpus(*target_flags)

    self.same_length = same_length
    # Starts empty; nothing is loaded at construction time.
    self.corpus = []
from neural_machine.tasks.language.common.corpus.segmentor import * from neural_machine.tasks.language.common.corpus.sequence_corpus import SequenceCorpus from neural_machine.tasks.language.common.data_reader.bucket_iter import * import sys import logging if __name__ == '__main__': head = '%(asctime)-15s %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) segmenter = SpaceSegmenter() corpus = SequenceCorpus() corpus.build(open(sys.argv[1], 'r'), segmenter) cell_num = corpus.cell_num() problem = LanguageModelProblem(corpus) batch_size = 32 data_train = BucketIter(problem, batch_size) val_corpus = corpus.make(open(sys.argv[2], 'r'), segmenter) val_problem = LanguageModelProblem(val_corpus) data_val = BucketIter(val_problem, batch_size) arch_param = LanguageModelArchParam(num_hidden=200,