def encode_document(doc, vocab_2_idx, sos='<sos>', eos='<eos>'):
    '''
    Encodes a document (string) into a list of vocabulary indices using the
    given mapping (vocab_2_idx).  Tokens are assumed to be separated by a
    single space.  Each token is segmented greedily: trigram pieces first,
    then bigrams for the still-unmatched parts, and finally single characters.

    Params:
    * doc : string, document to encode (space-separated tokens)
    * vocab_2_idx : dictionary, string piece to index
    * sos (optional) : string, Start Of Sentence token
    * eos (optional) : string, End Of Sentence token

    Returns:
    * list of int: [sos index] + piece indices (with the space index between
      tokens) + [eos index]
    '''
    indices = []
    for token in doc.split(' '):
        # First pass: trigram segmentation of the token.
        segments = u.process_word(token, vocab_2_idx)
        # Second pass: re-segment every unresolved piece as bigrams.
        for pos, seg in enumerate(segments):
            if not seg[1]:
                segments[pos] = u.process_word(seg[0], vocab_2_idx, n=2)
        segments = u.flat_list(segments)
        # Final pass: fall back to single characters for anything still
        # unresolved; resolved pieces are kept untouched.
        segments = [seg if seg[1] else [(ch, True) for ch in seg[0]]
                    for seg in segments]
        segments = u.flat_list(segments)
        indices.extend(vocab_2_idx[piece] for piece, _ in segments)
        # Separate consecutive tokens with the index of the space symbol.
        indices.append(vocab_2_idx[' '])
    # Drop the trailing space index and wrap the document with sos/eos.
    return [vocab_2_idx[sos]] + indices[:-1] + [vocab_2_idx[eos]]
#We read the time that it took to process the samples from the NCRF++ log file. log_lines = codecs.open(path_tagger_log).readlines() raw_time = float([l for l in log_lines if l.startswith("raw: time:") ][0].split(",")[0].replace("raw: time:", "").replace("s", "")) raw_unary_time = 0 #If we applied the retagging strategy, we also need to consider the time that it took to execute the retagger if args.retagger: log_lines_unary = codecs.open(path_tagger_log_unary).readlines() raw_unary_time = float([ l for l in log_lines_unary if l.startswith("raw: time:") ][0].split(",")[0].replace("raw: time:", "").replace("s", "")) os.remove(decode_unary_fid) os.remove(+fid) os.system(" ".join( [args.evalb, "-p", args.evalb_param, args.gold, tmpfile.name])) os.remove(decode_fid) total_time = raw_time + raw_unary_time + end_posprocess_time + end_parenthesized_time + end_merge_retags_time print "Total time:", round(total_time, 2) print "Sents/s: ", round(len(gold_trees) / (total_time), 2) #Computing additional metrics: accuracy if args.retagger: enriched_preds = get_enriched_labels_for_retagger( preds, new_unary_preds) flat_preds = flat_list(enriched_preds) else: flat_preds = flat_list(preds) print "Accuracy: ", round(accuracy_score(gold_labels, flat_preds), 4)
help="Path to the gold sequences of the dataset")  # tail of an arg_parser.add_argument(...) call that starts above this chunk
args = arg_parser.parse_args()

# Both files are CoNLL-like TSV: one token per line, sentences separated by
# blank lines; the label is assumed to be the last tab-separated column.
with codecs.open(args.input) as f_input:
    with codecs.open(args.gold) as f_gold:
        pred_sentences = [
            s.split("\n") for s in f_input.read().split("\n\n")
        ]
        gold_sentences = [
            s.split("\n") for s in f_gold.read().split("\n\n")
        ]

# Flatten to one label per token, skipping empty lines.
pred_sequences = flat_list(
    [[line.split("\t")[-1] for line in s if line != ""]
     for s in pred_sentences])
gold_sequences = flat_list(
    [[line.split("\t")[-1] for line in s if line != ""]
     for s in gold_sentences])

# Accumulators for the three components of each label
# (level, nonterminal label, unary chain), predicted vs gold.
pred_levels, gold_levels = [], []
pred_labels, gold_labels = [], []
pred_unaries, gold_unaries = [], []
# Predicted and gold sequences must be aligned token by token.
assert (len(pred_sequences) == len(gold_sequences))
# NOTE(review): this loop body continues past the visible chunk.
for p, g in zip(pred_sequences, gold_sequences):
    plevel, plabel, punary = split_label(p)
    glevel, glabel, gunary = split_label(g)
type=int,  # tail of an arg_parser.add_argument(...) call that starts above this chunk
default=10,
help="Prune unaries that occur less than a threshold")
args = arg_parser.parse_args()

# Output handle is kept open here; presumably it is written and closed
# further below this chunk — TODO confirm.
f_out = codecs.open(args.output, "w")
with codecs.open(args.input) as f_input:
    # CoNLL-like TSV input: sentences separated by blank lines; the label
    # lives in the third tab-separated column of each token line.
    sentences = [
        sentence.split("\n")
        for sentence in f_input.read().split("\n\n")
        if sentence != ""
    ]
    raw_labels = [
        split_label(e.split("\t")[2]) for e in flat_list(sentences)
    ]
# Split each (level, label, unary) triple into parallel lists so each
# component can be counted independently.
levels, labels, unaries = [], [], []
for label in raw_labels:
    levels.append(label[0])
    labels.append(label[1])
    unaries.append(label[2])
# Frequency counts used to decide which rare entries to prune.
counter_levels = Counter(levels)
counter_labels = Counter(labels)
counter_unaries = Counter(unaries)
# Records of the replacements made, keyed per component (filled below
# this chunk).
log_changes_level = {}
log_changes_label = {}
# Dump the retagged sequences (word, postag, retag) in CoNLL-like TSV,
# one sentence per blank-line-separated block.  Refuse to overwrite.
if not os.path.exists(args.output_unary):
    with codecs.open(args.output_unary, "w") as f:
        for j, sentence in enumerate(sentences):
            for (word, postag), retag in zip(sentence, preds[j]):
                f.write("\t".join([word, postag, retag]) + "\n")
            f.write("\n")
else:
    raise ValueError("File already exist:", args.output_unary)
# NOTE(review): the statements below exit() are unreachable as reconstructed;
# exit() most likely sits inside a conditional whose header lies outside this
# chunk — verify against the full file.
exit()

# Rebuild parenthesized trees from the predicted label sequences and time it.
parenthesized_trees = sequence_to_parenthesis(new_sentences, preds)
final_time = time.time()
tmpfile.write("\n".join(parenthesized_trees) + "\n")
# Score the predicted trees against the gold trees with EVALB.
os.system(" ".join([args.evalb, args.gold, tmpfile.name]))
gold_labels = [e[2] for e in flat_list(gold_samples)]
# Computing accuracy over the flattened label sequences.
if args.retagger:
    # Merge the unary retagger output into the primary predictions first.
    enriched_preds = get_enriched_labels_for_retagger(
        preds, unary_preds)
    flat_preds = flat_list(enriched_preds)
else:
    flat_preds = flat_list(preds)
print "Accuracy", round(accuracy_score(gold_labels, flat_preds), 4)
total_time = final_time - init_time
print "Total time:", round(total_time, 4)
print "Sents/s", round(len(gold_samples) / (total_time), 2)
#########################################################
#