def train_model(training_data, validating_data, batch_size, max_pad):
    """Build the corpora from two files and train a sequence-tagging machine.

    Args:
        training_data: Path of the training file; it is read as UTF-8.
        validating_data: Path of the validation file; it is read as UTF-8.
        batch_size: Forwarded unchanged to ``LearnParam(batch_size=...)``.
        max_pad: Forwarded unchanged to ``LearnParam(max_pad=...)``.

    Returns:
        None. The function configures root logging and calls ``lm.train``.
    """
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    # The same segmenter instance is used for both the source and target side.
    segmenter = CharacterSegmenter()

    train_corpus = SequencePairCorpus(source_with_unk=True, same_length=True)
    # Close the file deterministically instead of leaking the handle.
    # NOTE(review): assumes build() reads the stream fully before returning.
    with codecs.open(training_data, 'r', encoding="utf8") as train_file:
        train_corpus.build(train_file, segmenter, segmenter)
    logging.debug("Train corpus built : %s", train_corpus.corpus_size())

    # Id of the "U" tag in the target vocabulary, handed to the model below.
    unlabeled_tag_id = train_corpus.target_corpus.id("U")

    # NOTE(review): assumes make() reads the stream fully before returning.
    with codecs.open(validating_data, 'r', encoding="utf8") as val_file:
        val_corpus = train_corpus.make(val_file, segmenter, segmenter)
    logging.debug("Validate corpus built")

    learning_param = LearnParam(
        num_epoch=25,
        learning_rate=0.05,
        momentum=0.0,
        batch_size=batch_size,
        max_pad=max_pad,
        device=None,
        nworker=None,
    )

    lm = SequenceTaggingMachine(unlabeled_tag_id)
    logging.log(logging.INFO, "Begin to train ...")
    lm.train(train_corpus, val_corpus, learning_param)
def train_model(training_data, validating_data, batch_size, max_pad):
    """Train a ``SequenceTaggingMachine`` on a training/validation file pair.

    Args:
        training_data: Path of the training file, opened with UTF-8 decoding.
        validating_data: Path of the validation file, opened with UTF-8
            decoding.
        batch_size: Passed through as ``LearnParam.batch_size``.
        max_pad: Passed through as ``LearnParam.max_pad``.

    Returns:
        None. Training is started through ``lm.train``; nothing is returned.
    """
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    segmenter = CharacterSegmenter()
    train_corpus = SequencePairCorpus(source_with_unk=True, same_length=True)

    # Use context managers so the input files are closed even when corpus
    # construction raises; the original left both handles open.
    # NOTE(review): this relies on build()/make() consuming the stream
    # eagerly -- confirm against SequencePairCorpus.
    with codecs.open(training_data, 'r', encoding="utf8") as training_stream:
        train_corpus.build(training_stream, segmenter, segmenter)
    logging.debug("Train corpus built : %s", train_corpus.corpus_size())

    # Looked up before the validation corpus is made, as in the original.
    unlabeled_tag_id = train_corpus.target_corpus.id("U")

    with codecs.open(validating_data, 'r',
                     encoding="utf8") as validating_stream:
        val_corpus = train_corpus.make(validating_stream, segmenter, segmenter)
    logging.debug("Validate corpus built")

    learning_param = LearnParam(num_epoch=25,
                                learning_rate=0.05,
                                momentum=0.0,
                                batch_size=batch_size,
                                max_pad=max_pad,
                                device=None,
                                nworker=None)

    lm = SequenceTaggingMachine(unlabeled_tag_id)
    logging.log(logging.INFO, "Begin to train ...")
    lm.train(train_corpus, val_corpus, learning_param)
def train_model(training_data, validating_data, batch_size, max_pad, dev,
                nworker):
    """Train a partially-labeled sequence-tagging model from two files.

    Args:
        training_data: Path of the training file; it is read as UTF-8.
        validating_data: Path of the validation file; it is read as UTF-8.
        batch_size: Batch size given to both ``BucketIter`` objects and to
            ``LearnParam``.
        max_pad: Given to both ``BucketIter`` objects as ``max_pad_num`` and
            to ``LearnParam`` as ``max_pad``.
        dev: Forwarded unchanged as ``LearnParam(device=...)``.
        nworker: Forwarded unchanged as ``LearnParam(nworker=...)``.

    Returns:
        None. The function configures root logging and calls ``lm.train``.
    """
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    # The same segmenter instance is used for both the source and target side.
    segmenter = CharacterSegmenter()

    corpus = SequencePairCorpus(source_with_unk=True, same_length=True)
    # Close the file deterministically instead of leaking the handle.
    # NOTE(review): assumes build() reads the stream fully before returning.
    with codecs.open(training_data, 'r', encoding="utf8") as train_file:
        corpus.build(train_file, segmenter, segmenter)

    # Id of the "U" tag in the target vocabulary, handed to the model below.
    unlabeled_tag_id = corpus.target_corpus.id("U")

    problem = SequenceTaggingProblem(corpus)
    data_train = BucketIter(problem, batch_size, max_pad_num=max_pad)

    # NOTE(review): assumes make() reads the stream fully before returning.
    with codecs.open(validating_data, 'r', encoding="utf8") as val_file:
        val_corpus = corpus.make(val_file, segmenter, segmenter)
    val_problem = SequenceTaggingProblem(val_corpus)
    data_val = BucketIter(val_problem, batch_size, max_pad_num=max_pad)

    arch_param = ArchParam(
        num_hidden=200,
        num_embed=200,
        num_lstm_layer=2,
        input_cell_num=corpus.source_cell_num(),
        output_cell_num=corpus.target_cell_num(),
    )

    learning_param = LearnParam(
        num_epoch=25,
        learning_rate=0.05,
        momentum=0.0,
        batch_size=batch_size,
        max_pad=max_pad,
        device=dev,
        nworker=nworker,
    )

    lm = PartialLabeledSenquenceTaggingModel(arch_param, unlabeled_tag_id)

    # Log the numeric id of every tag so the label mapping can be checked.
    tag_ids = [corpus.target_corpus.id(tag)
               for tag in ("O", "S", "B", "I", "E", "U")]
    logging.debug(
        "O = {0}, S = {1}, B = {2}, I = {3}, E = {4} U = {5}".format(*tag_ids))

    logging.log(logging.INFO, "Begin to train ...")
    lm.train(data_train, data_val, learning_param)