def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
    if sys.platform.startswith("linux"):
        set_proc_name(options.title)
    ensure_dir(options.output)

    # log this run to a timestamped file under the output directory
    path = os.path.join(options.output,
                        "{}_{}_train.log".format(options.title, int(time.time())))
    log_to_file(path)
    logger.name = options.title

    cls.options_hook(options)
    DataFormatClass = cls.get_data_formats()[options.data_format]

    # load datasets only when the caller has not supplied them
    if data_train is None:
        data_train = DataFormatClass.from_file(options.conll_train)
    if data_dev is None:
        data_dev = {i: DataFormatClass.from_file(i, False)
                    for i in options.conll_dev}
    if data_test is None and options.conll_test is not None:
        data_test = DataFormatClass.from_file(options.conll_test, False)

    try:
        os.makedirs(options.output)
    except OSError:
        pass

    return cls.repeat_train_and_validate(data_train, data_dev, data_test, options)

def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
    set_proc_name(options.title)
    ensure_dir(options.output)
    path = os.path.join(options.output,
                        "{}_{}_train.log".format(options.title, int(time.time())))
    log_to_file(path)
    logger.name = options.title

    cls.options_hook(options)
    DataFormatClass = cls.get_data_formats()[options.data_format]

    if data_train is None:
        data_train = DataFormatClass.from_file(options.conll_train)
    if data_dev is None:
        data_dev = {i: DataFormatClass.from_file(i, False)
                    for i in options.conll_dev}

    try:
        os.makedirs(options.output)
    except OSError:
        pass

    parser = cls(options, data_train)
    random_obj = random.Random(1)

    for epoch in range(options.epochs):
        logger.info('Starting epoch %d', epoch)
        random_obj.shuffle(data_train)
        options.is_train = True
        parser.train(data_train)

        # save model and delete old model
        for i in range(0, epoch - options.max_save):
            path = os.path.join(options.output, os.path.basename(options.model)) + str(i + 1)
            if os.path.exists(path):
                os.remove(path)
        path = os.path.join(options.output, os.path.basename(options.model)) + str(epoch + 1)
        parser.save(path)

        def predict(sentences, gold_file, output_file):
            options.is_train = False
            with open(output_file, "w") as f_output:
                if hasattr(DataFormatClass, "file_header"):
                    f_output.write(DataFormatClass.file_header + "\n")
                for i in parser.predict(sentences):
                    f_output.write(i.to_string())
            # script_path = os.path.join(os.path.dirname(__file__), "main.py")
            # p = subprocess.Popen([sys.executable, script_path, "mst+empty", "predict",
            #                       "--model", path, "--test", gold_file,
            #                       "--output", output_file], stdout=sys.stdout)
            # p.wait()
            DataFormatClass.evaluate_with_external_program(gold_file, output_file)

        for file_name, file_content in data_dev.items():
            try:
                prefix, suffix = os.path.basename(file_name).rsplit(".", 1)
            except ValueError:
                prefix = os.path.basename(file_name)
                suffix = ""
            dev_output = os.path.join(options.output,
                                      '{}_epoch_{}.{}'.format(prefix, epoch + 1, suffix))
            predict(file_content, file_name, dev_output)

def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
    set_proc_name(options.title)
    ensure_dir(options.output)
    path = os.path.join(options.output,
                        "{}_{}_train.log".format(options.title, int(time.time())))
    log_to_file(path)
    logger.name = options.title
    logger.info('Options:\n%s', pformat(options.__dict__))

    if data_train is None:
        data_train = cls.DataType.from_file(options.conll_train)
    if data_dev is None:
        data_dev = {i: cls.DataType.from_file(i, False)
                    for i in options.conll_dev}

    try:
        os.makedirs(options.output)
    except OSError:
        pass

    parser = cls(options, data_train)
    random_obj = random.Random(1)

    def do_predict(epoch):
        for file_name, dev_sentences in data_dev.items():
            try:
                prefix, suffix = os.path.basename(file_name).rsplit(".", 1)
            except ValueError:
                prefix = file_name
                suffix = ""
            dev_output = os.path.join(options.output,
                                      '{}_epoch_{}.{}'.format(prefix, epoch, suffix))
            cls.predict_and_output(parser, options, dev_sentences, dev_output)

    if options.epochs == 0:
        print("Predict directly.")
        do_predict(0)

    for epoch in range(options.epochs):
        logger.info('Starting epoch %d', epoch)
        random_obj.shuffle(data_train)
        parser.train(data_train)

        # save model and delete old model
        for i in range(0, epoch - options.max_save):
            path = os.path.join(options.output, os.path.basename(options.model)) + str(i + 1)
            if os.path.exists(path):
                os.remove(path)
        path = os.path.join(options.output, os.path.basename(options.model)) + str(epoch + 1)
        parser.save(path)

        do_predict(epoch)

def train_parser(options, sentences_train=None, sentences_dev=None, sentences_test=None):
    current_path = os.path.dirname(__file__)
    set_proc_name(options.title)

    if not (options.rlFlag or options.rlMostFlag or options.headFlag):
        print('You must use at least one of --userlmost, --userl or --usehead '
              '(they can be combined)')
        sys.exit()

    if not sentences_train:
        sentences_train = get_sentences(options.conll_train)
    if not sentences_dev:
        sentences_dev = (get_sentences(options.conll_dev)
                         if options.conll_dev is not None else None)
    if not sentences_test:
        sentences_test = (get_sentences(options.conll_test)
                          if options.conll_test is not None else None)

    print('Preparing vocab')
    words, w2i, pos, rels = tree_utils.vocab(sentences_train)

    if not os.path.exists(options.output):
        os.mkdir(options.output)
    with open(os.path.join(options.output, options.params), 'wb') as paramsfp:
        pickle.dump((words, w2i, pos, rels, options), paramsfp)
    print('Finished collecting vocab')

    print('Initializing blstm arc hybrid:')
    parser = ArcHybridLSTM(words, pos, rels, w2i, options)

    for epoch in range(options.epochs):
        print('Starting epoch', epoch)
        parser.Train(sentences_train)

        def predict(sentences, gold_file, output_file):
            with open(output_file, "w") as f:
                result = parser.Predict(sentences)
                for i in result:
                    f.write(i.to_string())
            eval_script = os.path.join(current_path,
                                       "utils/evaluation_script/conll17_ud_eval.py")
            weight_file = os.path.join(current_path,
                                       "utils/evaluation_script/weights.clas")
            eval_process = sh.python(eval_script, "-v", "-w", weight_file,
                                     gold_file, output_file,
                                     _out=output_file + '.txt')
            eval_process.wait()
            sh.cat(output_file + '.txt', _out=sys.stdout)
            print('Finished predicting {}'.format(gold_file))

        if sentences_dev:
            dev_output = os.path.join(options.output,
                                      'dev_epoch_' + str(epoch + 1) + '.conllu')
            predict(sentences_dev, options.conll_dev, dev_output)
        if sentences_test:
            test_output = os.path.join(options.output,
                                       'test_epoch_' + str(epoch + 1) + '.conllu')
            predict(sentences_test, options.conll_test, test_output)

        for i in range(epoch + 1 - options.max_model):
            filename = os.path.join(options.output, options.model + str(i))
            if os.path.exists(filename):
                os.remove(filename)
        parser.Save(os.path.join(options.output, options.model + str(epoch + 1)))

if __name__ == '__main__':
    parser = get_parser()
    (options, args) = parser.parse_args()
    print('Using external embedding:', options.external_embedding)

    current_path = os.path.dirname(__file__)
    set_proc_name(options.title)

    if not options.predictFlag:
        train_parser(options)
    else:
        # load the vocabulary and stored options saved during training
        with open(options.params, 'rb') as paramsfp:
            words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)
        stored_opt.external_embedding = options.external_embedding

        parser = ArcHybridLSTM(words, pos, rels, w2i, stored_opt)
        parser.Load(options.model)

        conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu')
        tespath = os.path.join(options.output,
                               'test_pred.conll' if not conllu else 'test_pred.conllu')