def generate_spacy_factor_corpus(text_file, output_dir, lang_code, prefix,
                                 factor_separator=u'|'):
    """Tokenize `text_file` with Spacy and write the token text and its
    linguistic factors to `<prefix>.<lang_code>.tok` and
    `<prefix>.<lang_code>.factors` in `output_dir`."""
    mkdir_p(output_dir)
    text_output = codecs.open(
        os.path.join(output_dir, prefix + '.{}.'.format(lang_code) + 'tok'),
        'w', encoding='utf8')
    factor_output = codecs.open(
        os.path.join(output_dir, prefix + '.{}.'.format(lang_code) + 'factors'),
        'w', encoding='utf8')

    nlp = spacy.load(lang_code)
    logger.info('Loaded Spacy {} model'.format(lang_code))

    with codecs.open(text_file, encoding='utf8') as inp:
        for count, line in enumerate(inp):
            row = extract_factors(line, nlp)
            # the first element of each tuple is the token text,
            # the rest are its factors
            text, factors = zip(*[(factor_tup[0], factor_tup[1:])
                                  for factor_tup in row])
            text_output.write(u' '.join(text) + '\n')
            factor_output.write(
                u' '.join([factor_separator.join(f) for f in factors]) + '\n')
            if (count + 1) % 1000 == 0:
                logger.info('Processed {} rows'.format(count + 1))

    logger.info('Wrote new files: {} and {}'.format(text_output.name,
                                                    factor_output.name))
    text_output.close()
    factor_output.close()
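# Example usage (a hypothetical sketch; the paths, language code, and prefix
# below are illustrative assumptions, not files shipped with this repo):
#
#     generate_spacy_factor_corpus('corpus/train.en', 'corpus/factors',
#                                  lang_code='en', prefix='train')
#
# This writes `train.en.tok` (one whitespace-joined tokenized sentence per
# line) and `train.en.factors` (the corresponding per-token factors joined by
# `factor_separator`) into the output directory.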
def extract_ter_alignment(hyps_file, refs_file, output_path, src_lang,
                          trg_lang, tercom_path):
    tercom_jar = os.path.join(tercom_path, 'tercom.7.25.jar')
    mkdir_p(output_path)
    output_prefix = '{}-{}.tercom.out'.format(src_lang, trg_lang)

    # Tercom expects each segment on its own line, followed by a tab and a
    # parenthesized segment id, so rewrite hyps and refs into that format
    hyps_file_iter = codecs.open(hyps_file, encoding='utf8')
    refs_file_iter = codecs.open(refs_file, encoding='utf8')
    hyp_ref_iter = parallel_iterator(hyps_file_iter, refs_file_iter)

    temp_hyps_file = hyps_file + '.ter.temp'
    temp_refs_file = refs_file + '.ter.temp'
    with codecs.open(temp_hyps_file, 'w', encoding='utf8') as f_hyp:
        with codecs.open(temp_refs_file, 'w', encoding='utf8') as f_ref:
            for i, (hyp, ref) in enumerate(hyp_ref_iter):
                # escape XML entities so Tercom's XML output stays well-formed
                f_hyp.write('%s\t(%.12d)\n'
                            % (u' '.join([cgi.escape(w) for w in hyp]), i))
                f_ref.write('%s\t(%.12d)\n'
                            % (u' '.join([cgi.escape(w) for w in ref]), i))

    # Run Tercom
    cmd = 'java -jar {} -r {} -h {} -n {} -d 0'.format(tercom_jar,
                                                       temp_refs_file,
                                                       temp_hyps_file,
                                                       output_prefix)
    p = subprocess.Popen(cmd, shell=True, stderr=sys.stderr, stdout=sys.stdout)
    p.wait()

    os.remove(temp_hyps_file)
    os.remove(temp_refs_file)

    # Parse Tercom's output XML and map edit operations to OK/BAD tags
    mt_tokens, pe_tokens, edits, hters = \
        parse_pra_xml.parse_file('{}.xml'.format(output_prefix))

    tags_map = {'C': 'OK', 'S': 'BAD', 'I': 'BAD', 'D': 'BAD'}
    tags = [parse_pra_xml.get_tags(edit, tags_map, keep_inserts=False)
            for edit in edits]

    tags_output_file = os.path.join(output_path, output_prefix + '.tags')
    with codecs.open(tags_output_file, 'w', encoding='utf8') as out:
        for row in tags:
            out.write(u' '.join(row) + u'\n')
    logger.info('Wrote tags to: {}'.format(tags_output_file))
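# Example usage (a hypothetical sketch; the paths below, including the
# location of the Tercom jar, are assumptions for illustration only):
#
#     extract_ter_alignment(hyps_file='dev.mt', refs_file='dev.pe',
#                           output_path='ter_output', src_lang='en',
#                           trg_lang='de', tercom_path='/path/to/tercom')
#
# The OK/BAD tag sequences derived from Tercom's edit operations end up in
# `ter_output/en-de.tercom.out.tags`, one whitespace-separated row per segment.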
VOCAB_DIR = "/media/1tb_drive/nematus_ape_experiments/amunmt_ape_pretrained/system/models"
SRC_VOCAB = os.path.join(VOCAB_DIR, 'src-pe/vocab.src.json')
MT_VOCAB = os.path.join(VOCAB_DIR, 'mt-pe/vocab.mt.json')
PE_VOCAB = os.path.join(VOCAB_DIR, 'mt-pe/vocab.pe.json')

TRAIN_DATA_DIR = "/media/1tb_drive/Dropbox/data/qe/amunmt_artificial_ape_2016/data/500K_and_20x_task_internal"
SRC_TRAIN = os.path.join(TRAIN_DATA_DIR, 'train.mt.factor_corpus')
TRG_TRAIN = os.path.join(TRAIN_DATA_DIR, 'train.pe.prepped')

# WMT 16 EN-DE QE/APE dev data
QE_DATA_DIR = "/media/1tb_drive/Dropbox/data/qe/ape/concat_wmt_2016_2017"
SRC_DEV = os.path.join(QE_DATA_DIR, 'dev.mt.factor_corpus')
TRG_DEV = os.path.join(QE_DATA_DIR, 'dev.pe.prepped')

mkdir_p('model')

# start training from the best model of the previous experiment
STARTING_MODEL = '/media/1tb_drive/nematus_ape_experiments/ape_qe/en-de/model/model.npz.npz.best_bleu'

if __name__ == '__main__':
    validerr = train(saveto='model/model.npz',
                     prior_model=STARTING_MODEL,
                     reload_=True,
                     dim_word=256,
                     dim=512,
                     n_words=PE_VOCAB_SIZE,
                     n_words_src=SRC_VOCAB_SIZE,
                     decay_c=0.,
                     clip_c=1.,
                     lrate=0.0001,
        tokens = freq_dict.keys()
        freqs = freq_dict.values()
        sorted_idx = numpy.argsort(freqs)
        sorted_words = [tokens[ii] for ii in sorted_idx[::-1]]

        # reserve 0 and 1 for the special tokens, then assign ids
        # by descending frequency
        token_dicts[i]['eos'] = 0
        token_dicts[i]['UNK'] = 1
        for ii, ww in enumerate(sorted_words):
            token_dicts[i][ww] = ii + 2

    return token_dicts


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input",
                        help="Input file with factor tuples separated by `factor_separator`")
    parser.add_argument("-o", "--output",
                        help="Directory where output files will be written")
    parser.add_argument("-n", "--num_factors", type=int,
                        help="the number of factors")
    args = parser.parse_args()

    factor_iterator = factor_iter(open(args.input, 'r'), args.num_factors)
    factor_dicts = vocab_dictionaries_from_factor_iterator(
        factor_iterator, num_factors=args.num_factors)

    mkdir_p(args.output)
    for idx, filename in enumerate(['factor_{}'.format(i + 1)
                                    for i in range(args.num_factors)]):
        with open('%s.json' % os.path.join(args.output, filename), 'wb') as f:
            json.dump(factor_dicts[idx], f, indent=2, ensure_ascii=False)
        logger.info('Wrote index to: {}'.format(filename))
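# Example invocation (a hypothetical sketch; the script and file names are
# assumptions, not part of this repo):
#
#     python make_factor_vocabs.py -i train.en.factors -o factor_vocabs -n 3
#
# Each output file (`factor_1.json` ... `factor_n.json`) maps factor values to
# integer ids, reserving 0 for 'eos' and 1 for 'UNK' and assigning the
# remaining ids in order of descending frequency.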
    def train(self, train_iter_func, dev_iter_func, restore_from=None,
              auto_log_suffix=True, start_iteration=0, shuffle=True):
        """
        Train a QE sequence model, periodically evaluating on the dev set.

        Params:
          train_iter_func: function which returns an iterable over
            (source, mt, labels) instances
          dev_iter_func: function which returns an iterable over
            (source, mt, labels) instances
        """
        logdir = os.path.join(self.storage, 'logs')
        persist_dir = os.path.join(self.storage, 'model')
        mkdir_p(persist_dir)
        evaluation_logdir = os.path.join(self.storage, 'evaluation_reports')
        mkdir_p(evaluation_logdir)

        training_iter = train_iter_func()
        training_iter = itertools.cycle(training_iter)

        # wrap the data iter to add shuffling
        if shuffle:
            shuffle_factor = self.config.get('shuffle_factor', 5000)
            training_iter = shuffle_instances_iterator(
                training_iter, shuffle_factor=shuffle_factor)

        # load pretrained source word embeddings
        source_embeddings = None
        if self.config.get('source_embeddings') is not None:
            source_embeddings = np.load(open(self.config['source_embeddings']))
        # TODO: support pretrained target and output vocabulary embeddings

        if auto_log_suffix:
            log_suffix = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            output_logdir = os.path.join(logdir, log_suffix)
        else:
            output_logdir = logdir

        dev_perfs = OrderedDict()

        mkdir_p(output_logdir)
        train_writer = tf.summary.FileWriter(output_logdir, self.graph)
        logger.info('Running session, logdir is: {}'.format(output_logdir))

        with tf.Session(graph=self.graph, config=config) as session:
            # Initialization ops
            if restore_from is None:
                tf.initialize_all_variables().run()
                logger.info('Initialized')
                # pretrained word embeddings
                if source_embeddings is not None:
                    session.run(
                        tf.assign(self.word_embeddings, source_embeddings))
                    logger.info(
                        'Source word embeddings loaded from: {}'.format(
                            self.config['source_embeddings']))
            else:
                self.saver.restore(session, restore_from)
                logger.info(
                    'Restored trained model from: {}'.format(restore_from))

            average_loss = 0
            val_freq = self.config['validation_freq']

            # SGD loop
            for step in range(self.config['num_steps']):
                if step % 10 == 0:
                    logger.info('running step: {}'.format(step))

                data_cols = self.get_batch(
                    training_iter, self.config['batch_size'],
                    sample_prob=self.config['sample_prob'])
                source, source_mask, target, target_mask, output, output_mask = data_cols

                feed_dict = {
                    self.source: source,
                    self.source_mask: source_mask,
                    self.target: target,
                    self.target_mask: target_mask,
                    self.output: output,
                    self.output_mask: output_mask,
                    self.dropout_prob: self.config['dropout_prob']
                }

                # if step < self.config['training_transition_cutoff']:
                _, l, summary = session.run(
                    [
                        self.full_graph_optimizer,
                        self.cost,
                        # self.accuracy,
                        self.merged
                    ],
                    feed_dict=feed_dict)
                # else:
                #     _, l, summary = session.run(
                #         [self.entity_representation_optimizer,
                #          self.cost,
                #          # self.accuracy,
                #          self.merged], feed_dict=feed_dict)
                train_writer.add_summary(summary, step)
                average_loss += l

                # Validation
                if step % val_freq == 0:
                    logger.info('Running validation...')
                    logger.info('Training loss on last batch: {}'.format(l))
                    dev_iter = dev_iter_func()
                    dev_batch_len = self.config['batch_size']
                    total_correct = 0
                    total_instances = 0
                    source_out = []
                    mt_out = []
                    output_out = []
                    pred_out = []
                    acc_out = []
                    dev_batch = 0
                    while dev_batch_len > 0:
                        data_cols = self.get_batch(dev_iter, dev_batch_len,
                                                   sample_prob=1.0)
                        # this will be zero once the iterator has finished
                        dev_batch_len = len(data_cols[0])
                        if dev_batch_len == 0:
                            continue
                        source, source_mask, target, target_mask, output, output_mask = data_cols
                        feed_dict = {
                            self.source: source,
                            self.source_mask: source_mask,
                            self.target: target,
                            self.target_mask: target_mask,
                            self.output: output,
                            self.output_mask: output_mask,
                            self.dropout_prob: 1.0
                        }
                        preds = session.run(self.predictions,
                                            feed_dict=feed_dict)
                        preds = np.argmax(preds, axis=2)

                        for s, t, p, o, m in zip(source, target, preds,
                                                 output, output_mask):
                            dev_source = [self.src_vocab_idict[w] for w in s]
                            # strip padding using the output mask
                            output_len = np.count_nonzero(m)
                            mt_actual = t[:output_len]
                            pred_actual = p[:output_len]
                            output_actual = o[:output_len]
                            dev_mt = [self.trg_vocab_idict[w]
                                      for w in mt_actual]
                            dev_pred = [self.output_vocab_idict[w]
                                        for w in pred_actual]
                            dev_output = [self.output_vocab_idict[w]
                                          for w in output_actual]
                            num_correct = sum(
                                [1 for p, a in zip(pred_actual, output_actual)
                                 if p == a])
                            acc = num_correct / float(output_len)

                            source_out.append(dev_source)
                            mt_out.append(dev_mt)
                            pred_out.append(dev_pred)
                            output_out.append(dev_output)
                            acc_out.append(acc)

                            total_correct += num_correct
                            total_instances += output_len
                        dev_batch += 1

                    dev_reports = []
                    for s, m, p, o, a in zip(source_out, mt_out, pred_out,
                                             output_out, acc_out):
                        dev_report = {
                            'source': u' '.join(s),
                            'mt': u' '.join(m),
                            'pred': u' '.join(p),
                            'output': u' '.join(o),
                            'acc': a
                        }
                        dev_reports.append(dev_report)

                    evaluation_report = qe_output_evaluation(
                        mt_out, pred_out, output_out,
                        expanded_tagset=self.config['expanded_output_tagset'])
                    logger.info(u'Evaluation report at step: {} -- {}'.format(
                        step, evaluation_report))

                    dev_report_file = os.path.join(logdir,
                                                   'dev_{}.out'.format(step))
                    with codecs.open(dev_report_file, 'w',
                                     encoding='utf8') as dev_out:
                        dev_out.write(json.dumps(dev_reports, indent=2))
                    logger.info('Wrote validation report to: {}'.format(
                        dev_report_file))

                    evaluation_logfile = 'f1-product-{}.step-{}.json'.format(
                        evaluation_report['f1_product'], step)
                    evaluation_logfile = os.path.join(evaluation_logdir,
                                                      evaluation_logfile)
                    with codecs.open(evaluation_logfile, 'w',
                                     encoding='utf8') as eval_out:
                        eval_out.write(json.dumps(evaluation_report, indent=2))
                    logger.info('Wrote evaluation log to: {}'.format(
                        evaluation_logfile))

                    # persist the model whenever it achieves the best
                    # F1-product seen so far
                    dev_perf = evaluation_report['f1_product']
                    dev_perfs[step] = dev_perf
                    if dev_perf == max(
                            v for k, v in dev_perfs.items()) and step > 0:
                        save_path = self.saver.save(
                            session,
                            os.path.join(persist_dir, 'best_model.ckpt'))
                        logger.info(
                            "Step: {} -- {} is the best score so far, "
                            "model saved in file: {}".format(
                                step, dev_perf, save_path))

                # periodic checkpoint, independent of validation performance
                if step > 0 and step % 10000 == 0:
                    save_path = self.saver.save(
                        session,
                        os.path.join(persist_dir,
                                     'model_{}.ckpt'.format(step)))
                    logger.info(
                        "Step: {} -- checkpoint model saved in file: {}"
                        .format(step, save_path))

            logger.info("Step: {} -- Finished Training".format(step))
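# Example usage (a hypothetical sketch; the model class name, config keys, and
# iterator factories are assumptions based on how `train` is used above):
#
#     model = QESequenceModel(config)
#     model.train(train_iter_func=lambda: dataset_iter('train'),
#                 dev_iter_func=lambda: dataset_iter('dev'))
#
# Passing `restore_from=os.path.join(model.storage, 'model', 'best_model.ckpt')`
# resumes training from the best checkpoint saved by an earlier run.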