def _create_vocab(self):
    assert self.split == 'train', "Vocabulary can ONLY be built from trainset"

    tokenizer = TweetTokenizer(preserve_case=False)

    c = Counter()
    w2i = OrderedDict()
    i2w = OrderedDict()

    tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
    for i, st in enumerate(tokens):
        i2w[i] = st
        w2i[st] = i

    with open(self.get_path('src-' + self.split + '.txt'), 'r') as file:
        for line in file:
            c.update(tokenizer.tokenize(line))

    if self.mt:
        with open(self.get_path('tgt-' + self.split + '.txt'), 'r') as file:
            for line in file:
                c.update(tokenizer.tokenize(line))

    # collection of the vocabulary and its counts
    vocab_counts = utils.vocab(c)
    for i, (word, counts) in enumerate(vocab_counts):
        if counts > self.min_occ and word not in tokens:
            i2w[len(w2i)] = word
            w2i[word] = len(w2i)

    assert len(w2i) == len(i2w)

    vocab = dict(w2i=w2i, i2w=i2w)
    with io.open(self.get_path('vocab.json'), 'wb') as vocab_file:
        data = json.dumps(vocab, ensure_ascii=False)
        vocab_file.write(data.encode('utf8', 'replace'))

    self.load_vocab()
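# A minimal sketch of the matching load step, assuming the vocab.json layout
# written above (keys 'w2i' and 'i2w'). The helper name load_vocab_file is
# hypothetical, not part of the original class.
import io
import json

def load_vocab_file(path):
    with io.open(path, 'r', encoding='utf8') as vocab_file:
        vocab = json.load(vocab_file)
    w2i, i2w = vocab['w2i'], vocab['i2w']

    def encode(word):
        # unseen words fall back to the <unk> index reserved at creation time
        return w2i.get(word, w2i['<unk>'])

    return w2i, i2w, encode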
        score = float(poslasCount) * 100 / count
        if score >= highestScore:
            parser.Save(
                os.path.join(options.output, os.path.basename(options.model)))
            highestScore = score

        print "POS&LAS of the previous saved model: %.2f" % (highestScore)
else:
    ner_epoch = 1
    dep_epoch = 1

    print 'Extracting vocabulary'
    wordsdep, w2idep, c2idep, posdep, relsdep, capsdep = utils.vocab(
        options.conll_train)
    wordsner, w2iner, c2iner, posner, relsner, capsner = utils.vocab_ner(
        options.conll_trainner)
    words, c2i, pos, rels, caps = merge_counters(
        wordsdep, wordsner), merge_c2i_dicts(c2idep,
                                             c2iner), posner, relsdep, capsdep
    w2i = {w: i for i, w in enumerate(words.keys())}

    with open(os.path.join(options.output, options.params), 'w') as paramsfp:
        pickle.dump((words, w2i, c2i, pos, rels, options), paramsfp)

    # print 'Initializing joint model'
    parser = learner.jPosDepLearner(words, pos, rels, w2i, c2i, caps, options)
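# The merge helpers above are referenced but not shown. A plausible sketch,
# assuming the words* objects are collections.Counter instances and the c2i*
# dicts map characters to indices; both implementations here are assumptions,
# not the original code.
from collections import Counter

def merge_counters(a, b):
    merged = Counter()
    merged.update(a)
    merged.update(b)  # counts for words present in both are summed
    return merged

def merge_c2i_dicts(a, b):
    merged = dict(a)
    for ch in b:
        if ch not in merged:
            merged[ch] = len(merged)  # append unseen chars with fresh indices
    return merged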
def run(om, options, i):
    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training

        print 'Preparing vocab'
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir=True)
        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        paramsfile = os.path.join(outdir, options.params)
        with open(paramsfile, 'w') as paramsfp:
            print 'Saving params to ' + paramsfile
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
        print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                               options)
        durations = []

        for epoch in xrange(options.first_epoch,
                            options.first_epoch + options.epochs):
            print 'Starting epoch ' + str(epoch)
            start_time = time.time()

            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.max_sentences))
            else:
                traindata = list(
                    utils.read_conll(cur_treebank.trainfile,
                                     cur_treebank.iso_id,
                                     options.max_sentences))

            parser.Train(traindata)
            print 'Finished epoch ' + str(epoch)

            if not options.overwrite_model:
                model_file = os.path.join(outdir, options.model + str(epoch))
                parser.Save(model_file)

            if options.pred_dev:  # use the model to predict on dev data
                if options.multiling:
                    # languages which have dev data on which to predict
                    pred_langs = [lang for lang in om.languages
                                  if lang.pred_dev]
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(
                            lang.outdir,
                            'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            utils.evaluate(lang.dev_gold, lang.outfilename,
                                           om.conllu)
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile,
                                                   cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(
                            outdir, 'dev_epoch_' + str(epoch) +
                            ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            score = utils.evaluate(cur_treebank.dev_gold,
                                                   cur_treebank.outfilename,
                                                   om.conllu)
                            if options.model_selection:
                                if score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, score]
                                    if options.overwrite_model:
                                        print "Overwriting model due to higher dev score"
                                        model_file = os.path.join(
                                            cur_treebank.outdir,
                                            options.model)
                                        parser.Save(model_file)

            if options.deadline:
                # keep track of duration of training+eval
                now = time.time()
                duration = now - start_time
                durations.append(duration)
                # estimate when next epoch will finish
                last_five_durations = durations[-5:]
                eta = time.time() + max(last_five_durations)
                print 'Deadline in %.1f seconds' % (options.deadline - now)
                print 'ETA of next epoch in %.1f seconds' % (eta - now)
                # does it exceed the deadline?
                exceeds_deadline = eta > options.deadline
            else:  # no deadline
                exceeds_deadline = False

            if exceeds_deadline or epoch == options.epochs:
                # at the last epoch copy the best model to barchybrid.model
                if not options.model_selection:
                    # model selection off completely (for example multilingual case)
                    # --> take the final epoch, i.e. the current epoch
                    best_epoch = epoch
                else:
                    # will be final epoch by default if model selection not
                    # on for this treebank
                    best_epoch = cur_treebank.dev_best[0]
                    if cur_treebank.model_selection:
                        print "Best dev score of " + str(
                            cur_treebank.dev_best[1]
                        ) + " found at epoch " + str(cur_treebank.dev_best[0])

                if not options.overwrite_model:
                    bestmodel_file = os.path.join(
                        outdir, "barchybrid.model" + str(best_epoch))
                    model_file = os.path.join(outdir, "barchybrid.model")
                    print "Copying " + bestmodel_file + " to " + model_file
                    copyfile(bestmodel_file, model_file)

                if exceeds_deadline and epoch < options.epochs:
                    print 'Leaving epoch loop early to avoid exceeding deadline'
                    break

    else:  # if predict - so
        if options.multiling:
            modeldir = options.modeldir
        else:
            modeldir = om.languages[i].modeldir

        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                               stored_opt)
        model = os.path.join(modeldir, options.model)
        parser.Load(model)

        if options.multiling:
            testdata = utils.read_conll_dir(om.languages, "test")
        else:
            testdata = utils.read_conll(cur_treebank.testfile,
                                        cur_treebank.iso_id)

        ts = time.time()

        if options.multiling:
            for l in om.languages:
                l.outfilename = os.path.join(outdir, l.outfilename)
            pred = list(parser.Predict(testdata))
            utils.write_conll_multiling(pred, om.languages)
        else:
            if cur_treebank.outfilename:
                cur_treebank.outfilename = os.path.join(
                    outdir, cur_treebank.outfilename)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir,
                    'out' + ('.conll' if not om.conllu else '.conllu'))
            utils.write_conll(cur_treebank.outfilename,
                              parser.Predict(testdata))

        te = time.time()

        if options.pred_eval:
            if options.multiling:
                for l in om.languages:
                    print "Evaluating on " + l.name
                    score = utils.evaluate(l.test_gold, l.outfilename,
                                           om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (score,
                                                                   l.name)
            else:
                print "Evaluating on " + cur_treebank.name
                score = utils.evaluate(cur_treebank.test_gold,
                                       cur_treebank.outfilename, om.conllu)
                print "Obtained LAS F1 score of %.2f on %s" % (
                    score, cur_treebank.name)

        print 'Finished predicting'
import os
import sys
sys.path.insert(0, 'src')
import utils
import jamo as jpack

jamos_train, j2i_train, chars_train, c2i_train, words_train, w2i_train, pos_train, rels_train = utils.vocab(
    sys.argv[1])
jamos_dev, j2i_dev, chars_dev, c2i_dev, words_dev, w2i_dev, pos_dev, rels_dev = utils.vocab(
    sys.argv[2])

oov_word = 0
for word in words_dev:
    if not word in words_train:
        oov_word += 1
print 'OOV word: ', oov_word, ' / ', len(
    words_dev), ' ', float(oov_word) / len(words_dev) * 100

hangul_chars_train = {}
for char in chars_train:
    if len(jpack.decompose(char)) > 1:
        hangul_chars_train[char] = True

hangul_chars_dev = {}
for char in chars_dev:
    if len(jpack.decompose(char)) > 1:
        hangul_chars_dev[char] = True

oov_char = 0
for char in hangul_chars_dev:
    if not char in hangul_chars_train:
        oov_char += 1
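# The snippet stops after counting; a sketch of the matching report line,
# mirroring the OOV-word print above (an assumption, not the original output).
print 'OOV char: ', oov_char, ' / ', len(
    hangul_chars_dev), ' ', float(oov_char) / len(hangul_chars_dev) * 100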
        test_res = list(parser.Predict(options.conll_test))
        te = time.time()
        print 'Finished predicting test.', te - ts, 'seconds.'
        utils.write_conll(tespath, test_res)

        if not conllu:
            os.system('perl conll/eval.pl -g ' + options.conll_test + ' -s ' +
                      tespath + ' > ' + tespath + '.txt')
        else:
            os.system(
                'python conll/evaluation_script/conll17_ud_eval.py -v -w conll/evaluation_script/weights.clas '
                + options.conll_test + ' ' + tespath + ' > ' + tespath +
                '.txt')
else:
    print 'Preparing vocab'
    words, w2i, pos, rels = utils.vocab(options.conll_train)

    with open(os.path.join(options.output, options.params), 'w') as paramsfp:
        pickle.dump((words, w2i, pos, rels, options), paramsfp)
    print 'Finished collecting vocab'

    print 'Initializing lstm mstparser:'
    parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, options)

    for epoch in xrange(options.epochs):
        print 'Starting epoch', epoch
        parser.Train(options.conll_train)
        conllu = (os.path.splitext(
            options.conll_dev.lower())[1] == '.conllu')
        devpath = os.path.join(
            options.output, 'dev_epoch_' + str(epoch + 1) +
            ('.conll' if not conllu else '.conllu'))
    if options.conll_dev != "N/A":
        devPredSents = parser.predict(options.conll_dev)

        count = 0
        for idSent, devSent in enumerate(devPredSents):
            conll_devSent = [
                entry for entry in devSent
                if isinstance(entry, utils.ConllEntry)
            ]

            for entry in conll_devSent:
                if entry.id <= 0:
                    continue
                count += 1
else:
    print 'Extracting vocabulary'
    c2i = utils.vocab(options.conll_train)

    with open(os.path.join(options.output, options.params), 'w') as paramsfp:
        pickle.dump((c2i, options), paramsfp)

    parser = learner.Learner(c2i, options)

    highestScore = 0.0
    eId = 0

    for epoch in xrange(options.epochs):
        print '\n-----------------\nStarting epoch', epoch + 1

        if epoch % 10 == 0:
            if epoch == 0:
                parser.trainer.restart(learning_rate=0.001)
            elif epoch == 10:
                parser.trainer.restart(learning_rate=0.0005)
                  help='Use predicate boolean flag.')
parser.add_option("--dynet-gpu", action="store_true", dest="dynet_gpu",
                  default=False, help='Use GPU instead of CPU.')

(options, args) = parser.parse_args()
print 'Using external embedding:', options.external_embedding

from srl import SRLLSTM

if options.conll_train:
    print 'Preparing vocab'
    print options
    train_data = list(utils.read_conll(options.conll_train))
    words, lemmas, pos, roles, chars = utils.vocab(train_data)

    with open(os.path.join(options.outdir, options.params), 'w') as paramsfp:
        pickle.dump((words, lemmas, pos, roles, chars, options), paramsfp)
    print 'Finished collecting vocab'

    print 'Initializing blstm srl:'
    parser = SRLLSTM(words, lemmas, pos, roles, chars, options)

    max_len = max([len(d) for d in train_data])
    min_len = min([len(d) for d in train_data])
    # one bucket per sentence length from min_len to max_len inclusive
    buckets = [list() for i in range(min_len, max_len + 1)]
    for d in train_data:
        buckets[len(d) - min_len].append(d)
    buckets = [x for x in buckets if x != []]
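# A minimal, self-contained sketch of the same length-bucketing idea used
# above (and again in later snippets), assuming only that training items are
# sequences; all names here are illustrative.
def bucket_by_length(items):
    lengths = [len(d) for d in items]
    min_len, max_len = min(lengths), max(lengths)
    buckets = [[] for _ in range(min_len, max_len + 1)]
    for d in items:
        buckets[len(d) - min_len].append(d)  # same-length items share a bucket
    return [b for b in buckets if b]

# e.g. bucket_by_length([[1], [1, 2], [3, 4]]) -> [[[1]], [[1, 2], [3, 4]]]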
print 'Using external embedding:', options.external_embedding, "textual file:", options.external_embedding_Textual

if not options.predictFlag:  # Training
    if not (options.rlFlag or options.rlMostFlag or options.headFlag):
        print 'You must use either --userlmost or --userl or --usehead (you can use multiple)'
        sys.exit()

    print 'Preparing vocab'
    if WITHCPOS:
        words, w2i, pos, cpos, GENDER, NUMBER, PERSON, CASE, rels = utils.vocab(
            options.conll_train, True)
    else:
        words, w2i, pos, rels = utils.vocab(options.conll_train, False)

    # print words
    print pos
    # print cpos

    if WITHCPOS:
        with open(os.path.join(options.output, options.params),
                  'w') as paramsfp:
            pickle.dump((words, w2i, pos, cpos, GENDER, NUMBER, PERSON, CASE,
                         rels, options), paramsfp)
    else:
        with open(os.path.join(options.output, options.params),
                  'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, options), paramsfp)
    print 'Finished collecting vocab'
import dynet as dy
import random
from utils import *
import utils

# encoding=utf8
import sys
reload(sys)
sys.setdefaultencoding('utf8')

conll_train = "/Users/huseyinalecakir/NLP_LAB/data/tr_imst-ud-train.conllu"
c2i, w2i, features = utils.vocab(conll_train)

EOS = '<s>'
characters = list("abcdefghijklmnopqrstuvwxyz ")
characters.append(EOS)

int2char = {c2i[i]: i for i in c2i}
char2int = c2i

VOCAB_SIZE = len(c2i)

LSTM_NUM_OF_LAYERS = 2
EMBEDDINGS_SIZE = 128
STATE_SIZE = 256
ATTENTION_SIZE = 64

model = dy.Model()

enc_fwd_lstm = dy.LSTMBuilder(LSTM_NUM_OF_LAYERS, EMBEDDINGS_SIZE, STATE_SIZE,
                              model)
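# The constants above match DyNet's encoder-decoder-with-attention example;
# a sketch of the remaining builders and parameters under that assumption
# (the exact dimensions mirror that example and may differ in the original
# file).
enc_bwd_lstm = dy.LSTMBuilder(LSTM_NUM_OF_LAYERS, EMBEDDINGS_SIZE, STATE_SIZE,
                              model)
dec_lstm = dy.LSTMBuilder(LSTM_NUM_OF_LAYERS,
                          STATE_SIZE * 2 + EMBEDDINGS_SIZE, STATE_SIZE, model)

input_lookup = model.add_lookup_parameters((VOCAB_SIZE, EMBEDDINGS_SIZE))
attention_w1 = model.add_parameters((ATTENTION_SIZE, STATE_SIZE * 2))
attention_w2 = model.add_parameters(
    (ATTENTION_SIZE, STATE_SIZE * LSTM_NUM_OF_LAYERS * 2))
attention_v = model.add_parameters((1, ATTENTION_SIZE))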
    print 'Initializing lstm mstparser:'
    parser = mstlstm.MSTParserLSTM(pos, rels, w2i, chars, stored_opt)
    parser.Load(options.model)

    ts = time.time()
    print 'loading buckets'
    test_buckets = [list()]
    test_data = list(utils.read_conll(open(options.conll_test, 'r')))
    for d in test_data:
        test_buckets[0].append(d)

    print 'parsing'
    test(parser, test_buckets, options.conll_test, options.conll_output)
    te = time.time()
    print 'Finished predicting test.', te - ts, 'seconds.'
else:
    print 'Preparing vocab'
    w2i, pos, rels, chars = utils.vocab(options.conll_train)

    if not os.path.isdir(options.output):
        os.mkdir(options.output)
    with open(os.path.join(options.output, options.params), 'w') as paramsfp:
        pickle.dump((w2i, pos, rels, chars, options), paramsfp)
    print 'Finished collecting vocab'

    print 'Initializing lstm mstparser:'
    parser = mstlstm.MSTParserLSTM(pos, rels, w2i, chars, options)

    best_acc = -float('inf')
    t, epoch = 0, 1
    train_data = list(utils.read_conll(open(options.conll_train, 'r')))
    max_len = max([len(d) for d in train_data])
    min_len = min([len(d) for d in train_data])
    # one bucket per sentence length from min_len to max_len inclusive
    buckets = [list() for i in range(min_len, max_len + 1)]
    for d in train_data:
parser.add_option("--usehead", action="store_true", dest="headFlag", default=False) parser.add_option("--userlmost", action="store_true", dest="rlFlag", default=False) parser.add_option("--userl", action="store_true", dest="rlMostFlag", default=False) parser.add_option("--predict", action="store_true", dest="predictFlag", default=False) parser.add_option("--dynet-mem", type="int", dest="cnn_mem", default=512) (options, args) = parser.parse_args() print 'Using external embedding:', options.external_embedding if not options.predictFlag: if not (options.rlFlag or options.rlMostFlag or options.headFlag): print 'You must use either --userlmost or --userl or --usehead (you can use multiple)' sys.exit() print 'Preparing vocab' words, w2i, pos, rels = utils.vocab(options.conll_train) with open(os.path.join(options.output, options.params), 'w') as paramsfp: pickle.dump((words, w2i, pos, rels, options), paramsfp) print 'Finished collecting vocab' print 'Initializing blstm arc hybrid:' parser = ArcHybridLSTM(words, pos, rels, w2i, options) for epoch in xrange(options.epochs): print 'Starting epoch', epoch parser.Train(options.conll_train) conllu = (os.path.splitext(options.conll_dev.lower())[1] == '.conllu') devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch+1) + ('.conll' if not conllu else '.conllu')) utils.write_conll(devpath, parser.Predict(options.conll_dev))
def train():
    vocab = utils.vocab()
    textcnn = TextCNN(sequence_length=100,
                      num_classes=19,
                      vocab_size=len(vocab),
                      embedding_size=100,
                      filter_sizes=[3, 5, 7, 9],
                      num_filters=64,
                      pool_k=1,
                      trainable=True)
    textcnn.fit(['yq_train.txt', 'yq_valid.txt', 'yq_test.txt'],
                30,
                dropout=0.5,
                save_path='./model/yq/lncnn',
                everyEpochNum=500,
                batch_size=32)
action="store_true", dest="predictFlag", default=False) parser.add_option("--dynet-mem", type="int", dest="dynet_mem", default=1000) # Doesn't work, must provide command line (options, args) = parser.parse_args() if not options.predictFlag: # Training if not (options.rlFlag or options.rlMostFlag or options.headFlag): print 'You must use either --userlmost or --userl or --usehead (you can use multiple)' sys.exit() jamos, j2i, chars, c2i, words, w2i, pos, rels = utils.vocab( options.conll_train) print '----------------------------' print len(words), 'wtypes,', len(chars), 'ctypes,', len( jamos), 'jtypes' print 'Use word?', not options.noword print 'Use char?', options.usechar print 'Use jamo?', options.usejamo print 'word dim:', options.wembedding_dims print 'char dim:', options.cembedding_dims print 'pos dim:', options.pembedding_dims print '----------------------------' external_embedding = {} if options.external_embedding is not None: with open(options.external_embedding,
path_amrs_dev = args.amr_dev + ".graphs"

with codecs.open(path_amrs, 'rb') as f:
    amr_graphs = pickle.load(f)
with codecs.open(path_amrs_dev, 'rb') as f:
    dev_amr_graphs = pickle.load(f)
with codecs.open(path_amr_templates, 'rb') as ft:
    amr_graph_templates = pickle.load(ft)
with codecs.open(path_multiword_templates, 'rb') as ft:
    amr_multiword_graph_templates = pickle.load(ft)

words, lemmas, pos, rels, nodes, entities, deps = utils.vocab(amr_graphs)
_, _, _, dev_rels, dev_nodes, _, _ = utils.vocab(dev_amr_graphs)

with open(os.path.join(args.output, args.params), 'wb') as paramsfp:
    pickle.dump((words, lemmas, pos, rels, nodes, entities, deps, args),
                paramsfp)

parser = mlp.PerceptronAMR(words, pos, rels, nodes, entities, deps,
                           args.external_embedding,
                           None,  # args.pos_external_embedding
                           None, None, None,
                           amr_graph_templates,
                           amr_multiword_graph_templates, None, args)
def run(om, options, i):
    outdir = options.output
    if options.multi_monoling:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
        modelDir = cur_treebank.modelDir
    else:
        outdir = options.output
        modelDir = om.languages[i].modelDir
    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.include:
        cur_treebank = om.treebank

    if not options.predictFlag:

        print 'Preparing vocab'
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir=True)
        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        with open(os.path.join(outdir, options.params), 'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
        print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                               options)

        for epoch in xrange(options.first_epoch - 1,
                            options.first_epoch - 1 + options.epochs):

            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.drop_proj,
                                         options.maxCorpus))
                devdata = enumerate(utils.read_conll_dir(om.languages, "dev"))
            else:
                conllFP = open(cur_treebank.trainfile, 'r')
                traindata = list(
                    utils.read_conll(conllFP, options.drop_proj,
                                     cur_treebank.iso_id))
                if os.path.exists(cur_treebank.devfile):
                    conllFP = open(cur_treebank.devfile, 'r')
                    devdata = enumerate(
                        utils.read_conll(conllFP, False,
                                         cur_treebank.iso_id))
                else:
                    tot_sen = len(traindata)
                    # take a bit less than 5% of train sentences for dev
                    if tot_sen > 1000:
                        import random
                        random.shuffle(traindata)
                        dev_len = int(0.05 * tot_sen)
                        # gen object * 2
                        devdata, dev_gold = itertools.tee(traindata[:dev_len])
                        devdata = enumerate(devdata)
                        dev_gold_f = os.path.join(outdir,
                                                  'dev_gold' + '.conllu')
                        utils.write_conll(dev_gold_f, dev_gold)
                        cur_treebank.dev_gold = dev_gold_f
                        traindata = traindata[dev_len:]
                    else:
                        devdata = None

            print 'Starting epoch', epoch
            parser.Train(traindata)

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(
                        l.outdir, 'dev_epoch_' + str(epoch + 1) + '.conllu')
                pred = list(parser.Predict(devdata))
                if len(pred) > 0:
                    utils.write_conll_multiling(pred, om.languages)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir, 'dev_epoch_' + str(epoch + 1) +
                    ('.conll' if not om.conllu else '.conllu'))
                if devdata:
                    pred = list(parser.Predict(devdata))
                    utils.write_conll(cur_treebank.outfilename, pred)

            if options.multiling:
                for l in om.languages:
                    utils.evaluate(l.dev_gold, l.outfilename, om.conllu)
            else:
                utils.evaluate(cur_treebank.dev_gold,
                               cur_treebank.outfilename, om.conllu)
            print 'Finished predicting dev'

            parser.Save(os.path.join(outdir, options.model + str(epoch + 1)))

    else:  # if predict - so
        params = os.path.join(modelDir, options.params)
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                               stored_opt)
        model = os.path.join(modelDir, options.model)
        parser.Load(model)

        if options.multiling:
            testdata = enumerate(utils.read_conll_dir(om.languages, "test"))
        if not options.multiling:
            conllFP = open(cur_treebank.testfile, 'r')
            testdata = enumerate(
                utils.read_conll(conllFP, False, cur_treebank.iso_id))

        ts = time.time()

        if options.multiling:
            for l in om.languages:
                l.outfilename = os.path.join(outdir, l.outfilename)
            pred = list(parser.Predict(testdata))
            utils.write_conll_multiling(pred, om.languages)
        else:
            cur_treebank.outfilename = os.path.join(outdir,
                                                    cur_treebank.outfilename)
            utils.write_conll(cur_treebank.outfilename,
                              parser.Predict(testdata))

        te = time.time()

        if options.predEval:
            if options.multiling:
                for l in om.languages:
                    utils.evaluate(l.test_gold, l.outfilename, om.conllu)
            else:
                utils.evaluate(cur_treebank.test_gold,
                               cur_treebank.outfilename, om.conllu)

        print 'Finished predicting test', te - ts
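# Why itertools.tee in the dev-split branch above: the held-out 5% is consumed
# twice, once by parser.Predict and once when writing the gold file, so one
# iterable is duplicated into two independent iterators. A minimal
# illustration with placeholder data:
import itertools

sentences = iter(['sent1', 'sent2', 'sent3'])
devdata, dev_gold = itertools.tee(sentences)
assert list(devdata) == ['sent1', 'sent2', 'sent3']
assert list(dev_gold) == ['sent1', 'sent2', 'sent3']  # still fully available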
import sys
import utils

# Find % of OOVs on dev or test dataset
# Usage: python OOVs.py path_to_train path_to_dev/test
train = sys.argv[1]
dev_test = sys.argv[2]

words, w2i, c2i, pos, rels = utils.vocab(train)
words_dev, w2i_dev, c2i_dev, pos_dev, rels_dev = utils.vocab(dev_test)

OOVs = 0
for k, v in words_dev.items():
    if not (k in words.keys()):
        OOVs += 1

print str(format(float(OOVs) / (len(words_dev)) * 100,
                 '.2f')) + "% OOVs on test/dev dataset"
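# The membership test above rescans words.keys() (a list in Python 2) for
# every dev word; an equivalent sketch using set difference, assuming both
# vocab returns are dict-like keyed by word:
def oov_rate(train_words, dev_words):
    oov = set(dev_words) - set(train_words)  # dev types never seen in train
    return 100.0 * len(oov) / len(dev_words)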
def run(om, options, i):
    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training

        fineTune = False
        start_from = 1
        if options.continueModel is None:
            continueTraining = False
        else:
            continueTraining = True
            trainedModel = options.continueModel
            if options.fineTune:
                fineTune = True
            else:
                start_from = options.first_epoch - 1

        if not continueTraining:
            print 'Preparing vocab'
            if options.multiling:
                path_is_dir = True
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                    om.languages, path_is_dir, options.shareWordLookup,
                    options.shareCharLookup)
            else:
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                    cur_treebank.trainfile)

            paramsfile = os.path.join(outdir, options.params)
            with open(paramsfile, 'w') as paramsfp:
                print 'Saving params to ' + paramsfile
                pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                            paramsfp)
            print 'Finished collecting vocab'
        else:
            paramsfile = os.path.join(outdir, options.params)
            with open(paramsfile, 'rb') as paramsfp:
                print 'Load params from ' + paramsfile
                words, w2i, pos, rels, cpos, langs, options, ch = pickle.load(
                    paramsfp)
            print 'Finished loading vocab'

        max_epochs = options.first_epoch + options.epochs

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                               options)

        if continueTraining:
            if not fineTune:
                # continue training only, not doing fine tuning
                options.first_epoch = start_from + 1
                max_epochs = options.epochs
            else:
                # fine tune model
                options.first_epoch = options.epochs + 1
                max_epochs = options.first_epoch + 15
                print 'Fine tune model for another', max_epochs - options.first_epoch, 'epochs'
            parser.Load(trainedModel)

        best_multi_las = -1
        best_multi_epoch = 0

        if continueTraining:
            train_stats = codecs.open(os.path.join(outdir, 'train.stats'),
                                      'a', encoding='utf-8')
        else:
            train_stats = codecs.open(os.path.join(outdir, 'train.stats'),
                                      'w', encoding='utf-8')

        for epoch in xrange(options.first_epoch, max_epochs + 1):
            print 'Starting epoch ' + str(epoch)

            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.max_sentences))
            else:
                traindata = list(
                    utils.read_conll(cur_treebank.trainfile,
                                     cur_treebank.iso_id,
                                     options.max_sentences))

            parser.Train(traindata)
            train_stats.write(unicode('Epoch ' + str(epoch) + '\n'))
            print 'Finished epoch ' + str(epoch)

            model_file = os.path.join(outdir, options.model + '.tmp')
            parser.Save(model_file)

            if options.pred_dev:  # use the model to predict on dev data
                if options.multiling:
                    # languages which have dev data on which to predict
                    pred_langs = [lang for lang in om.languages
                                  if lang.pred_dev]
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(
                            lang.outdir,
                            'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        total_las = 0
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            las_score = utils.evaluate(lang.dev_gold,
                                                       lang.outfilename,
                                                       om.conllu)
                            total_las += las_score
                            train_stats.write(
                                unicode('Dev LAS ' + lang.name + ': ' +
                                        str(las_score) + '\n'))
                        if options.model_selection:
                            if total_las > best_multi_las:
                                best_multi_las = total_las
                                best_multi_epoch = epoch
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile,
                                                   cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(
                            outdir, 'dev_epoch_' + str(epoch) +
                            ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            las_score = utils.evaluate(
                                cur_treebank.dev_gold,
                                cur_treebank.outfilename, om.conllu)
                            if options.model_selection:
                                if las_score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, las_score]
                            train_stats.write(
                                unicode('Dev LAS ' + cur_treebank.name +
                                        ': ' + str(las_score) + '\n'))

            if epoch == max_epochs:
                # at the last epoch choose which model to copy to barchybrid.model
                if not options.model_selection:
                    # take the final epoch if model selection off completely
                    # (for example multilingual case)
                    best_epoch = options.epochs
                else:
                    if options.multiling:
                        best_epoch = best_multi_epoch
                    else:
                        # will be final epoch by default if model selection
                        # not on for this treebank
                        best_epoch = cur_treebank.dev_best[0]
                        if cur_treebank.model_selection:
                            print "Best dev score of " + str(
                                cur_treebank.dev_best[1]
                            ) + " found at epoch " + str(
                                cur_treebank.dev_best[0])

                bestmodel_file = os.path.join(outdir, "barchybrid.model.tmp")
                model_file = os.path.join(outdir, "barchybrid.model")
                if fineTune:
                    model_file = os.path.join(outdir,
                                              "barchybrid.tuned.model")
                print "Best epoch: " + str(best_epoch)
                print "Copying " + bestmodel_file + " to " + model_file
                copyfile(bestmodel_file, model_file)

        train_stats.close()

    else:  # if predict - so
        # import pdb;pdb.set_trace()
        eval_type = options.evaltype
        print "Eval type: ", eval_type

        if eval_type == "train":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'train')
            else:
                cur_treebank.testfile = cur_treebank.trainfile
                cur_treebank.test_gold = cur_treebank.trainfile
        elif eval_type == "dev":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'dev')
            else:
                cur_treebank.testfile = cur_treebank.devfile
                cur_treebank.test_gold = cur_treebank.devfile

        if options.multiling:
            modeldir = options.modeldir
            if options.fineTune:
                prefix = [
                    os.path.join(outdir,
                                 os.path.basename(l.test_gold) + '-tuned')
                    for l in om.languages
                ]
            else:
                prefix = [
                    os.path.join(outdir, os.path.basename(l.test_gold))
                    for l in om.languages
                ]
        else:
            modeldir = om.languages[i].modeldir
            if options.fineTune:
                prefix = os.path.join(
                    outdir,
                    os.path.basename(cur_treebank.testfile)) + '-tuned'
            else:
                prefix = os.path.join(
                    outdir, os.path.basename(cur_treebank.testfile))

        if not options.extract_vectors:
            prefix = None

        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                               stored_opt)

        if options.fineTune:
            options.model = options.model.replace('.model', '.tuned.model')
        model = os.path.join(modeldir, options.model)
        parser.Load(model)

        if options.multiling:
            testdata = utils.read_conll_dir(om.languages, eval_type)
        else:
            testdata = utils.read_conll(cur_treebank.testfile,
                                        cur_treebank.iso_id)

        ts = time.time()

        if options.multiling:
            for l in om.languages:
                l.outfilename = os.path.join(outdir,
                                             eval_type + "-" + l.outfilename)
            pred = list(parser.Predict(testdata, prefix))
            utils.write_conll_multiling(pred, om.languages)
        else:
            if cur_treebank.outfilename:
                cur_treebank.outfilename = os.path.join(
                    outdir, eval_type + "-" + cur_treebank.outfilename)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir,
                    'out' + ('.conll' if not om.conllu else '.conllu'))
            utils.write_conll(cur_treebank.outfilename,
                              parser.Predict(testdata, prefix))

        te = time.time()

        if options.pred_eval:
            if options.multiling:
                for l in om.languages:
                    print "Evaluating on " + l.name
                    score = utils.evaluate(l.test_gold, l.outfilename,
                                           om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (score,
                                                                   l.name)
            else:
                print "Evaluating on " + cur_treebank.name
                score = utils.evaluate(cur_treebank.test_gold,
                                       cur_treebank.outfilename, om.conllu)
                print "Obtained LAS F1 score of %.2f on %s" % (
                    score, cur_treebank.name)

        print 'Finished predicting'
    8: '军事',    # military
    9: '民生',    # livelihood
    10: '时尚',   # fashion
    11: '娱乐',   # entertainment
    12: '游戏',   # gaming
    13: '时事',   # current affairs
    14: '股票',   # stocks
    15: '历史',   # history
    16: '美食',   # food
    17: '房产',   # real estate
    18: '汽车'    # automobiles
}

app = Flask(__name__)
topK = 100
MODEL_NAME = './model/yq/'

vocab = utils.vocab()
textcnn = TextCNN(sequence_length=100,
                  num_classes=len(label_names),
                  vocab_size=len(vocab),
                  embedding_size=100,
                  filter_sizes=[3, 5, 7, 9],
                  num_filters=64,
                  trainable=False,
                  pool_k=1)
textcnn.load_model(MODEL_NAME)


@app.route('/')
def demo():
    return render_template('index.html')
def __init__(self, new_options):
    print '1. Init Parser'
    print '1-1. Preparing vocab'
    if not new_options.predictFlag:
        if new_options.train_multilingual:
            vocab, w2i, pos, xpos, rels = utils.vocab_multilingual(
                new_options.conll_train)
            # new_options.xpembedding_dims = 0
        else:
            vocab, w2i, pos, xpos, rels = utils.vocab(
                new_options.conll_train, new_options.conll_train_language)
            new_options.xpembedding_dims = 0 if len(
                xpos) < 5 else new_options.xpembedding_dims
        options = new_options
    else:
        with open(new_options.params, 'r') as paramsfp:
            vocab, w2i, pos, xpos, rels, ex_trn, exc_trnd, stored_opt = pickle.load(
                paramsfp)
        self.extrnd = ex_trn
        self.exctrnd = exc_trnd
        stored_opt.conll_test = new_options.conll_test
        stored_opt.conll_test_language = new_options.conll_test_language
        stored_opt.predictFlag = new_options.predictFlag
        stored_opt.lang_vec_file = new_options.lang_vec_file
        options = stored_opt

    print " Is it using multilingual embedding?:", options.multilingual_emb
    print '1-1. End of preparing vocab'

    self.model = Model()
    random.seed(1)
    self.trainer = AdamTrainer(self.model)

    self.activations = {
        'tanh': tanh,
        'sigmoid': logistic,
        'relu': rectify,
        'tanh3': (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))
    }
    self.activation = self.activations[options.activation]

    self.blstmFlag = options.blstmFlag
    self.labelsFlag = options.labelsFlag
    self.costaugFlag = options.costaugFlag
    self.bibiFlag = options.bibiFlag
    self.extConcateFlag = options.extConcateFlag

    self.ldims = options.lstm_dims
    self.wdims = options.wembedding_dims
    self.pdims = options.pembedding_dims
    self.xpdims = options.xpembedding_dims
    self.rdims = options.rembedding_dims
    self.layers = options.lstm_layers
    self.wordsCount = vocab
    self.vocab = {word: ind + 3 for word, ind in w2i.iteritems()}
    self.pos = {word: ind + 3 for ind, word in enumerate(pos)}
    self.xpos = {word: ind + 3 for ind, word in enumerate(xpos)}
    self.rels = {word: ind for ind, word in enumerate(rels)}
    self.irels = rels

    self.train_multilingual = options.train_multilingual
    self.lang_vec_file = options.lang_vec_file
    self.multilingual_emb = options.multilingual_emb
    self.add_lang_vec = options.add_lang_vec
    self.languageVec_dic = read_languageVec(
        self.lang_vec_file)  # read language_vec.csv file
    self.landims = len(self.languageVec_dic.values()[0].lang_vec)
    self.conll_test_language = options.conll_test_language
    self.conll_train_language = options.conll_train_language if options.conll_train_language is not None else "Unknown"
    print " Training language: ", self.conll_train_language
    print " Load Language vector, Dimensions: ", self.landims

    print "1-2. Load external embedding"
    self.external_embedding, self.edim = None, 0
    if options.predictFlag:
        # set temporary variable for model; it will be reset by model.load
        # automatically later
        self.elookup = self.model.add_lookup_parameters((3, 1))
        if options.external_embedding is not None:
            self.external_embedding = options.external_embedding
            self.edim = options.edim
    else:
        if options.external_embedding is not None:
            external_embedding_fp = open(options.external_embedding, 'r')
            external_embedding_fp.readline()
            self.external_embedding = {
                line.split(' ')[0]:
                [float(f) for f in line.strip().split(' ')[1:]]
                for line in external_embedding_fp
            }
            external_embedding_fp.close()

            self.edim = len(self.external_embedding.values()[0])
            self.noextrn = [0.0 for _ in xrange(self.edim)]
            self.extrnd = {
                word: i + 3
                for i, word in enumerate(self.external_embedding)
            }
            self.elookup = self.model.add_lookup_parameters(
                (len(self.external_embedding) + 3, self.edim))
            for word, i in self.extrnd.iteritems():
                self.elookup.init_row(i, self.external_embedding[word])
            self.extrnd['*PAD*'] = 1
            self.extrnd['*INITIAL*'] = 2

            if options.extConcateFlag:
                print ' Load external embedding. It will be used for an additional embedding', self.edim
            else:
                self.wdims = self.edim
                print ' Load external embedding. It will be used for the word dimension vector', self.edim
        else:
            # set temporary variable for model; it will be reset by
            # model.load automatically later
            self.elookup = self.model.add_lookup_parameters((3, 1))
            self.extrnd = {}
    print "1-2. End of loading external embedding"

    print "1-3. Load external cluster embedding"
    self.external_cluster_embedding, self.ecdim = None, 0
    if options.predictFlag:
        self.eclookup = self.model.add_lookup_parameters((3, 1))
        if options.external_cluster_embedding is not None:
            self.external_cluster_embedding = options.external_cluster_embedding
            self.ecdim = options.ecdim
    else:
        if options.external_cluster_embedding is not None:
            external_cluster_embedding_fp = open(
                options.external_cluster_embedding, 'r')
            external_cluster_embedding_fp.readline()
            self.external_cluster_embedding = {
                line.split(' ')[0]:
                [float(f) for f in line.strip().split(' ')[1:]]
                for line in external_cluster_embedding_fp
            }
            external_cluster_embedding_fp.close()

            self.ecdim = len(self.external_cluster_embedding.values()[0])
            self.noexctrn = [0.0 for _ in xrange(self.ecdim)]
            self.exctrnd = {
                word: i + 3
                for i, word in enumerate(self.external_cluster_embedding)
            }
            self.eclookup = self.model.add_lookup_parameters(
                (len(self.external_cluster_embedding) + 3, self.ecdim))
            for word, i in self.exctrnd.iteritems():
                self.eclookup.init_row(
                    i, self.external_cluster_embedding[word])
            self.exctrnd['*PAD*'] = 1
            self.exctrnd['*INITIAL*'] = 2
            print ' Load external cluster embedding. It will be used for an additional embedding', self.ecdim
        else:
            self.eclookup = self.model.add_lookup_parameters((3, 1))
            self.exctrnd = {}
    print "1-3. End of loading external cluster embedding"

    # Add language embedding
    if self.add_lang_vec:
        print "Add Language Vector", "language dims: ", self.landims
        self.llookup = self.model.add_lookup_parameters(
            (self.landims + 3, self.landims))
        for key in self.languageVec_dic.keys():
            self.llookup.init_row(
                self.languageVec_dic.get(key).lang_num,
                self.languageVec_dic.get(key).lang_vec)
    # Finish language embedding

    self.dims = self.wdims + self.pdims + self.xpdims + (
        self.landims if self.add_lang_vec else 0) + (
            self.edim if options.extConcateFlag else 0) + (
                self.ecdim if self.external_cluster_embedding else 0)
    print "Total dims: ", self.dims, "word dims: ", self.wdims

    if self.bibiFlag:
        self.builders = [
            VanillaLSTMBuilder(1, self.dims, self.ldims, self.model),
            VanillaLSTMBuilder(1, self.dims, self.ldims, self.model)
        ]
        self.bbuilders = [
            VanillaLSTMBuilder(1, self.ldims * 2, self.ldims, self.model),
            VanillaLSTMBuilder(1, self.ldims * 2, self.ldims, self.model)
        ]
    elif self.layers > 0:
        self.builders = [
            VanillaLSTMBuilder(self.layers, self.dims, self.ldims,
                               self.model),
            VanillaLSTMBuilder(self.layers, self.dims, self.ldims,
                               self.model)
        ]
    else:
        self.builders = [
            SimpleRNNBuilder(1, self.dims, self.ldims, self.model),
            SimpleRNNBuilder(1, self.dims, self.ldims, self.model)
        ]

    self.hidden_units = options.hidden_units
    self.hidden2_units = options.hidden2_units

    self.vocab['*PAD*'] = 1
    self.pos['*PAD*'] = 1
    self.xpos['*PAD*'] = 1

    self.vocab['*INITIAL*'] = 2
    self.pos['*INITIAL*'] = 2
    self.xpos['*INITIAL*'] = 2

    self.wlookup = self.model.add_lookup_parameters(
        (len(vocab) + 3, self.wdims))
    self.plookup = self.model.add_lookup_parameters(
        (len(pos) + 3, self.pdims))
    self.xplookup = self.model.add_lookup_parameters(
        (len(xpos) + 3, self.xpdims))
    self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims))

    self.hidLayerFOH = self.model.add_parameters(
        (self.hidden_units, self.ldims * 2))
    self.hidLayerFOM = self.model.add_parameters(
        (self.hidden_units, self.ldims * 2))
    self.hidBias = self.model.add_parameters((self.hidden_units))
    self.hid2Layer = self.model.add_parameters(
        (self.hidden2_units, self.hidden_units))
    self.hid2Bias = self.model.add_parameters((self.hidden2_units))
    self.outLayer = self.model.add_parameters(
        (1, self.hidden2_units
         if self.hidden2_units > 0 else self.hidden_units))

    if self.labelsFlag:
        self.rhidLayerFOH = self.model.add_parameters(
            (self.hidden_units, 2 * self.ldims))
        self.rhidLayerFOM = self.model.add_parameters(
            (self.hidden_units, 2 * self.ldims))
        self.rhidBias = self.model.add_parameters((self.hidden_units))
        self.rhid2Layer = self.model.add_parameters(
            (self.hidden2_units, self.hidden_units))
        self.rhid2Bias = self.model.add_parameters((self.hidden2_units))
        self.routLayer = self.model.add_parameters(
            (len(self.irels), self.hidden2_units
             if self.hidden2_units > 0 else self.hidden_units))
        self.routBias = self.model.add_parameters((len(self.irels)))

    if not new_options.predictFlag:
        options.edim = self.edim
        options.ecdim = self.ecdim
        with open(os.path.join(new_options.output, new_options.params),
                  'w') as paramsfp:
            pickle.dump((vocab, w2i, pos, xpos, rels, self.extrnd,
                         self.exctrnd, options), paramsfp)
        print 'Finished collecting vocab'
        print('Predicting POS XPOS tags')
        ts = time.time()
        test_res = list(tagger.Predict(conll_sentences, True))
        te = time.time()
        print('Finished in', te - ts, 'seconds.')
        utils.write_conll(tespath, test_res)
else:
    ext_words_train = utils.ext_vocab(options.conll_train,
                                      options.external_embedding_voc)
    ext_words_dev = utils.ext_vocab(options.conll_dev,
                                    options.external_embedding_voc)

    print('Extracting vocabulary')
    words, w2i, c2i, pos, xpos = utils.vocab(options.conll_train)

    with open(os.path.join(options.output, options.params),
              'wb') as paramsfp:
        pickle.dump((words, w2i, c2i, pos, xpos, options), paramsfp)

    print('Initializing model')
    tagger = learner.Affine_tagger(words, pos, xpos, w2i, c2i,
                                   ext_words_train, ext_words_dev, options)

    with open(options.conll_dev, 'r') as conllFP:
        devData = list(utils.read_conll(conllFP, tagger.c2i))

    conll_sentences = []
    for sentence in devData:
        conll_sentence = [
            entry for entry in sentence
            if isinstance(entry, utils.ConllEntry)
        ]
        conll_sentences.append(conll_sentence)
def run(om, options, i):
    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training

        print 'Preparing vocab'
        if options.multiling:
            path_is_dir = True
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir, options.shareWordLookup,
                options.shareCharLookup)
        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        paramsfile = os.path.join(outdir, options.params)
        with open(paramsfile, 'w') as paramsfp:
            print 'Saving params to ' + paramsfile
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
        print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                               options)

        if options.continueModel is not None:
            parser.Load(options.continueModel)

        for epoch in xrange(options.first_epoch,
                            options.first_epoch + options.epochs):
            print 'Starting epoch ' + str(epoch)

            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.max_sentences))
            else:
                traindata = list(
                    utils.read_conll(cur_treebank.trainfile,
                                     cur_treebank.iso_id,
                                     options.max_sentences))

            parser.Train(traindata)
            print 'Finished epoch ' + str(epoch)

            model_file = os.path.join(outdir, options.model + str(epoch))
            parser.Save(model_file)

            if options.pred_dev:  # use the model to predict on dev data
                if options.multiling:
                    # languages which have dev data on which to predict
                    pred_langs = [lang for lang in om.languages
                                  if lang.pred_dev]
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(
                            lang.outdir,
                            'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            utils.evaluate(lang.dev_gold, lang.outfilename,
                                           om.conllu)
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile,
                                                   cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(
                            outdir, 'dev_epoch_' + str(epoch) +
                            ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            score = utils.evaluate(cur_treebank.dev_gold,
                                                   cur_treebank.outfilename,
                                                   om.conllu)
                            if options.model_selection:
                                if score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, score]

            if epoch == options.epochs:
                # at the last epoch choose which model to copy to barchybrid.model
                if not options.model_selection:
                    # take the final epoch if model selection off completely
                    # (for example multilingual case)
                    best_epoch = options.epochs
                else:
                    # will be final epoch by default if model selection not
                    # on for this treebank
                    best_epoch = cur_treebank.dev_best[0]
                    if cur_treebank.model_selection:
                        print "Best dev score of " + str(
                            cur_treebank.dev_best[1]
                        ) + " found at epoch " + str(cur_treebank.dev_best[0])

                bestmodel_file = os.path.join(
                    outdir, "barchybrid.model" + str(best_epoch))
                model_file = os.path.join(outdir, "barchybrid.model")
                print "Copying " + bestmodel_file + " to " + model_file
                copyfile(bestmodel_file, model_file)

    else:  # if predict - so
        if options.multiling:
            modeldir = options.modeldir
        else:
            modeldir = om.languages[i].modeldir

        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                               stored_opt)
        model = os.path.join(modeldir, options.model)
        parser.Load(model)

        if options.multiling:
            testdata = utils.read_conll_dir(om.languages, "test")
        else:
            testdata = utils.read_conll(cur_treebank.testfile,
                                        cur_treebank.iso_id)

        ts = time.time()

        if options.multiling:
            for l in om.languages:
                l.outfilename = os.path.join(outdir, l.outfilename)
            pred = list(parser.Predict(testdata))
            utils.write_conll_multiling(pred, om.languages)
        else:
            if cur_treebank.outfilename:
                cur_treebank.outfilename = os.path.join(
                    outdir, cur_treebank.outfilename)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir,
                    'out' + ('.conll' if not om.conllu else '.conllu'))
            utils.write_conll(cur_treebank.outfilename,
                              parser.Predict(testdata))

        te = time.time()

        if options.pred_eval:
            if options.multiling:
                for l in om.languages:
                    print "Evaluating on " + l.name
                    score = utils.evaluate(l.test_gold, l.outfilename,
                                           om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (score,
                                                                   l.name)
            else:
                print "Evaluating on " + cur_treebank.name
                score = utils.evaluate(cur_treebank.test_gold,
                                       cur_treebank.outfilename, om.conllu)
                print "Obtained LAS F1 score of %.2f on %s" % (
                    score, cur_treebank.name)

        print 'Finished predicting'
    print('Predicting POS tags and parsing dependencies')
    with open(testoutpath, 'w') as fh:
        for sentence in parser.Predict(options.conll_test):
            for entry in sentence[1:]:
                fh.write(str(entry) + '\n')
            fh.write('\n')
else:
    print("Training file: " + options.conll_train)

    highestScore = 0.0
    eId = 0

    print('Extracting vocabulary')
    morph_dict = utils.get_morph_dict(options.segmentation_path,
                                      options.lowerCase)
    words, w2i, c2i, m2i, t2i, pos, rels = utils.vocab(
        options.conll_train, morph_dict)

    with open(os.path.join(options.output, options.params),
              'wb') as paramsfp:
        pickle.dump(
            (words, w2i, c2i, m2i, t2i, morph_dict, pos, rels, options),
            paramsfp)

    # print 'Initializing joint model'
    parser = learner.jPosDepLearner(words, pos, rels, w2i, c2i, m2i, t2i,
                                    morph_dict, options)

    if options.pipeline and options.morphFlag and not pretrained_flag:
        for epoch in range(5):
            print('\n-----------------\nStarting Morph2Vec epoch', epoch + 1)
print("POS accuracy:\t%.2f" % (posCount * 100 / count)) print("POS&LAS:\t%.2f" % (poslasCount * 100 / count)) score = poslasCount * 100 / count if score >= highestScore: parser.Save( os.path.join(args.output, os.path.basename(args.model))) highestScore = score print("POS&LAS of the previous saved model: %.2f" % (highestScore)) else: print('Extracting vocabulary') words, w2i, c2i, pos, rels = utils.vocab(args.conll_train) with open(os.path.join(args.output, args.params), 'wb') as paramsfp: pickle.dump((words, w2i, c2i, pos, rels, args), paramsfp, protocol=2) #print 'Initializing joint model' parser = oldslavdep.OldSlavDep(words, pos, rels, w2i, c2i, args) for epoch in range(args.epochs): print('\n-----------------\nStarting epoch', epoch + 1) if epoch % 10 == 0: if epoch == 0:
parser.add_option("--dynet-mem", type="int", dest="mem", default=0) parser.add_option( "--model-type", type="int", dest="model_type", default=0 ) # 0 none -1 simple char rnn - 2 simple char bilstm - 3 simple prevec (options, args) = parser.parse_args() print("Training file: " + options.conll_train) if options.conll_dev != "N/A": print("Development file: " + options.conll_dev) highestScore = 0.0 eId = 0 print 'Extracting vocabulary' c2i, w2i, features = utils.vocab(options.conll_train) parser = learner.Learner(c2i, w2i, features, options) highestScore = 0.0 eId = 0 for epoch in xrange(options.epochs): print '\n-----------------\nStarting epoch', epoch + 1 if epoch % 10 == 0: if epoch == 0: parser.trainer.restart(learning_rate=0.001) elif epoch == 10: parser.trainer.restart(learning_rate=0.0005) else: parser.trainer.restart(learning_rate=0.00025)
        for l in f:
            if l.startswith('UAS'):
                print('UAS:%s' % l.strip().split()[-1])
            elif l.startswith('LAS'):
                print('LAS:%s' % l.strip().split()[-1])
else:  # Training classifier
    print(f'Training with file {options.conll_train}')

    # Added to run from IntelliJ
    train_file = os.getcwd() + options.conll_train
    dev_file = os.getcwd() + options.conll_dev  # Added to run from IntelliJ

    print('Preparing vocabulary table')
    words, enum_word, pos, rels, onto, cpos = list(utils.vocab(train_file))

    with open(os.path.join(output_file, options.params), 'wb') as paramsfp:
        pickle.dump((words, enum_word, pos, rels, onto, cpos, options),
                    paramsfp)
    print('Finished collecting vocabulary')

    print('Initializing mst-parser:')
    parser = mstlstm.MSTParserLSTM(words, pos, rels, enum_word, options,
                                   onto, cpos)

    for epoch in range(options.epochs):
        print('Starting epoch', epoch)
        parser.train(train_file)
        parser.save(
            os.path.join(output_file,
                         os.path.basename(model_path) + str(epoch + 1)))
        # evaluate_model()
type="int", dest="dynet-autobatch", default=0) parser.add_option("--dynet-l2", type="float", dest="dynet-l2", default=0) parser.add_option("--dynet-gpus", action="store_true", dest="dynet-gpus", default=False, help='Use GPU instead of cpu.') (options, args) = parser.parse_args() if options.train_file: train_data, dev_data = utils.split_data(options.train_file, options.train_t, options.dev_percent) words, tags, chars = utils.vocab(train_data, options.min_freq) max_len = max([len(d[1]) for d in train_data]) min_len = min([len(d[1]) for d in train_data]) buckets = [list() for i in range(min_len, max_len)] for d in train_data: buckets[len(d[1]) - min_len - 1].append(d) dev_buckets = [list()] for d in dev_data: dev_buckets[0].append(d) with open(os.path.join(options.outdir, options.params), 'w') as paramsfp: pickle.dump((words, tags, chars, options), paramsfp) t = MT(options, words, tags, chars) dev_batches = utils.get_batches(dev_buckets, t, False) best_dev = 0