Example #1
import utils as u  # assumed import: the module providing process_word and flat_list


def encode_document(doc, vocab_2_idx, sos='<sos>', eos='<eos>'):
    '''
    Encodes a document (string) using the given mapping (vocab_2_idx).

    Params:
      * doc : string, document to encode; tokens are assumed to be
        separated by single spaces
      * vocab_2_idx : dictionary, mapping from token string to index
      * sos (optional) : string, Start Of Sentence token
      * eos (optional) : string, End Of Sentence token

    Returns:
      * doc_encoded : list of int
    '''
    doc_encoded = []
    for w in doc.split(' '):
        # Try to encode the word as trigrams first.
        encoded = u.process_word(w, vocab_2_idx)
        # Fall back to bigrams for the pieces not found in the vocabulary.
        for i, wp in enumerate(encoded):
            if not wp[1]:
                encoded[i] = u.process_word(wp[0], vocab_2_idx, n=2)

        encoded = u.flat_list(encoded)
        # Finally, fall back to single characters (unigrams).
        encoded = [[(c, True) for c in wp[0]] if not wp[1] else wp
                   for wp in encoded]
        encoded = u.flat_list(encoded)

        doc_encoded += [vocab_2_idx[wp] for wp, _ in encoded]
        doc_encoded.append(vocab_2_idx[' '])

    # Drop the trailing space index and wrap the document with SOS/EOS.
    doc_encoded = [vocab_2_idx[sos]] + doc_encoded[:-1] + [vocab_2_idx[eos]]
    return doc_encoded
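
encode_document depends on u.process_word for the n-gram lookups, which this page does not show. The sketch below is a guess at its contract based purely on how it is used above; the greedy fixed-size splitting and the membership flag are assumptions:

def process_word(word, vocab_2_idx, n=3):
    # Split `word` into consecutive n-grams and pair each piece with a
    # flag saying whether that piece exists in the vocabulary.
    pieces = [word[i:i + n] for i in range(0, len(word), n)]
    return [(piece, piece in vocab_2_idx) for piece in pieces]

Under this reading, the bigram and unigram passes in encode_document progressively re-split only the pieces whose flag came back False.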
Example #2
        # Read the time it took to process the samples from the NCRF++ log file.
        log_lines = codecs.open(path_tagger_log).readlines()
        raw_time = float([l for l in log_lines if l.startswith("raw: time:")
                          ][0].split(",")[0].replace("raw: time:",
                                                     "").replace("s", ""))
        raw_unary_time = 0
        # If the retagging strategy was applied, also add the time it took
        # to run the retagger.
        if args.retagger:
            log_lines_unary = codecs.open(path_tagger_log_unary).readlines()
            raw_unary_time = float([
                l for l in log_lines_unary if l.startswith("raw: time:")
            ][0].split(",")[0].replace("raw: time:", "").replace("s", ""))
            os.remove(decode_unary_fid)
            os.remove(fid)  # assumed identifier: garbled as '+fid' in the source
        os.system(" ".join(
            [args.evalb, "-p", args.evalb_param, args.gold, tmpfile.name]))
        os.remove(decode_fid)

        total_time = (raw_time + raw_unary_time + end_posprocess_time +
                      end_parenthesized_time + end_merge_retags_time)
        print "Total time:", round(total_time, 2)
        print "Sents/s:   ", round(len(gold_trees) / total_time, 2)

        # Compute an additional metric: accuracy.
        if args.retagger:
            enriched_preds = get_enriched_labels_for_retagger(
                preds, new_unary_preds)
            flat_preds = flat_list(enriched_preds)
        else:
            flat_preds = flat_list(preds)
        print "Accuracy:  ", round(accuracy_score(gold_labels, flat_preds), 4)
Example #3
                            help="Path to the gold sequences of the dataset")

    args = arg_parser.parse_args()

    with codecs.open(args.input) as f_input:
        with codecs.open(args.gold) as f_gold:

            # Sentences are separated by blank lines, one token per line.
            pred_sentences = [
                s.split("\n") for s in f_input.read().split("\n\n")
            ]
            gold_sentences = [
                s.split("\n") for s in f_gold.read().split("\n\n")
            ]

            # The encoded label is the last tab-separated field of each line.
            pred_sequences = flat_list(
                [[line.split("\t")[-1] for line in s if line != ""]
                 for s in pred_sentences])
            gold_sequences = flat_list(
                [[line.split("\t")[-1] for line in s if line != ""]
                 for s in gold_sentences])

            pred_levels, gold_levels = [], []
            pred_labels, gold_labels = [], []
            pred_unaries, gold_unaries = [], []

            assert len(pred_sequences) == len(gold_sequences)

            for p, g in zip(pred_sequences, gold_sequences):

                plevel, plabel, punary = split_label(p)
                glevel, glabel, gunary = split_label(g)
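
This example and the next both rely on split_label to take an encoded tag apart into its level, constituent label, and unary-chain parts. The tag format is not shown on this page, so the sketch below rests entirely on an assumption: tags of the form '<level>_<label>_<unary-chain>' with an underscore separator.

def split_label(tag, sep="_"):
    # Split an encoded tag into (level, label, unary_chain); the unary
    # part may be missing, in which case it is returned as "".
    parts = tag.split(sep)
    level = parts[0]
    label = parts[1] if len(parts) > 1 else ""
    unary = sep.join(parts[2:]) if len(parts) > 2 else ""
    return level, label, unary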
Example #4
        type=int,
        default=10,
        help="Prune unaries that occur less than a threshold")
    args = arg_parser.parse_args()

    f_out = codecs.open(args.output, "w")

    with codecs.open(args.input) as f_input:

        sentences = [
            sentence.split("\n") for sentence in f_input.read().split("\n\n")
            if sentence != ""
        ]

        # The encoded tag is the third tab-separated column.
        raw_labels = [
            split_label(e.split("\t")[2]) for e in flat_list(sentences)
        ]

        levels, labels, unaries = [], [], []
        for label in raw_labels:

            levels.append(label[0])
            labels.append(label[1])
            unaries.append(label[2])

        counter_levels = Counter(levels)
        counter_labels = Counter(labels)
        counter_unaries = Counter(unaries)

        log_changes_level = {}
        log_changes_label = {}
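
The snippet cuts off before the counters are used, but the argparse help text ("Prune unaries that occur less than a threshold") says what comes next. A minimal sketch of that pruning step, with the placeholder value and every name invented for illustration:

from collections import Counter


def prune_rare(items, threshold, placeholder="-EMPTY-"):
    # Replace every item that occurs fewer than `threshold` times with
    # a placeholder and log which items were rewritten.
    counts = Counter(items)
    log_changes = {}
    pruned = []
    for item in items:
        if counts[item] < threshold:
            log_changes[item] = placeholder
            pruned.append(placeholder)
        else:
            pruned.append(item)
    return pruned, log_changes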
Example #5
            # Dump the retagged sentences in CoNLL-like format: one
            # "word<TAB>postag<TAB>retag" line per token, with a blank
            # line between sentences.
            if not os.path.exists(args.output_unary):
                with codecs.open(args.output_unary, "w") as f:
                    for j, sentence in enumerate(sentences):

                        for (word, postag), retag in zip(sentence, preds[j]):
                            f.write("\t".join([word, postag, retag]) + "\n")
                        f.write("\n")
            else:
                raise ValueError("File already exists: " + args.output_unary)
            exit()

        # Convert the predicted label sequences back into parenthesized
        # trees and score them against the gold trees with EVALB.
        parenthesized_trees = sequence_to_parenthesis(new_sentences, preds)
        final_time = time.time()
        tmpfile.write("\n".join(parenthesized_trees) + "\n")
        os.system(" ".join([args.evalb, args.gold, tmpfile.name]))
        gold_labels = [e[2] for e in flat_list(gold_samples)]

        if args.retagger:
            enriched_preds = get_enriched_labels_for_retagger(
                preds, unary_preds)
            flat_preds = flat_list(enriched_preds)
        else:
            flat_preds = flat_list(preds)

        print "Accuracy", round(accuracy_score(gold_labels, flat_preds), 4)
        total_time = final_time - init_time
        print "Total time:", round(total_time, 4)
        print "Sents/s", round(len(gold_samples) / (total_time), 2)

    #########################################################
    #
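
Every example on this page flattens nested lists with flat_list (aliased as u.flat_list in Example #1). Its implementation is not shown; a common one-line version, which may differ from the real helper:

def flat_list(nested):
    # Flatten one level of nesting: [[1, 2], [3]] -> [1, 2, 3].
    return [item for sublist in nested for item in sublist]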