Example #1
    def __init__(self, vocabs, lex_map, filename, batch_size, for_train):
        self.data = []
        for amr, token, lemma, pos, ner in zip(*read_file(filename)):

            if for_train:
                # skip graphs that cannot be root-centered sorted
                _, _, not_ok = amr.root_centered_sort()
                if not_ok:
                    continue
                # drop one known-bad instance in the training data
                if ' '.join(token) == "https://www.com.html https://www.com.html </a>":
                    continue
            cp_seq, mp_seq, token2idx, idx2token = lex_map.get_concepts(
                lemma, token, vocabs['predictable_concept'])

            #print("datum",len(pos),len(token),len(ner),len(lemma))
            #print("datum",pos,token,ner)
            if len(pos) != len(token):
                exit()
            datum = {'amr': amr, 'tok': token, 'lem': lemma, 'pos': pos,
                     'ner': ner, 'cp_seq': cp_seq, 'mp_seq': mp_seq,
                     'token2idx': token2idx, 'idx2token': idx2token}
            self.data.append(datum)
        print("Get %d AMRs from %s" % (len(self.data), filename))
        self.vocabs = vocabs
        self.batch_size = batch_size
        self.train = for_train
        self.unk_rate = 0.
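
A hedged usage sketch for the constructor above; the enclosing class name (assumed here to be DataLoader) and the argument values are illustrative assumptions, not part of the original example.

# hypothetical usage; class name and argument values are assumed,
# vocabs and lex_map are prepared elsewhere
train_data = DataLoader(vocabs, lex_map, 'train.amr.features',
                        batch_size=64, for_train=True)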
Example #2
    def __init__(self, vocabs, lex_map, filename, batch_size, for_train):
        self.data = read_file(filename)
        self.vocabs = vocabs
        self.lex_map = lex_map
        self.batch_size = batch_size
        self.train = for_train
        self.unk_rate = 0.
        self.nprocessors = 8
        self.record_flag = False
Example #3
def validate(model,
             test_data,
             golden_file,
             beam_size=8,
             alpha=0.6,
             max_time_step=100):
    """For development Only"""
    pp = PostProcess()

    ref_stream = []
    with open(golden_file + '.input_clean') as f:
        for line in f:
            if line.startswith('# ::tokens '):
                o = json.loads(line[len('# ::tokens '):].strip())
                ref_stream.append(' '.join(o).lower())
    # gold graphs, reference tokens, and abstraction maps from the preprocessed file
    graph, gold_sys_stream, _, abstract = read_file(golden_file + '.preproc')
    ref_streams = [ref_stream]

    sys_stream = []
    for batch in test_data:
        res = generate_batch(model, batch, beam_size, alpha, max_time_step)
        sys_stream.extend(res['token'])

    assert len(sys_stream) == len(ref_stream)
    sys_stream = [
        pp.post_process(o, abstract[i], graph[i])
        for i, o in enumerate(sys_stream)
    ]

    bleu = sacrebleu.corpus_bleu(sys_stream,
                                 ref_streams,
                                 force=True,
                                 lowercase=True,
                                 tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)

    return bleu, chrf
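
A minimal sketch of how this development-only validator might be invoked; model, dev_batches, and the dev-file prefix are assumptions made for illustration, not taken from the snippet.

# hypothetical call; assumes a trained model and batched dev data
bleu, chrf = validate(model, dev_batches, 'data/dev.txt',
                      beam_size=8, alpha=0.6, max_time_step=100)
print('BLEU %.2f, chrF %s' % (bleu, chrf))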
Example #4
if __name__ == '__main__':
    import json
    from extract import read_file
    import sacrebleu
    args = parse_config()
    pp = PostProcess(retokenize=args.retokenize,
                     span=args.span,
                     compound_map_file=args.compound_map_file)

    ref_stream = []
    with open(args.golden_file) as f:
        for line in f:
            if line.startswith('# ::original '):
                o = json.loads(line[len('# ::original '):].strip())
                ref_stream.append(' '.join(o).lower())
    # gold graphs, reference tokens, and abstraction maps from the preprocessed file
    graph, gold_sys_stream, _, abstract = read_file(args.golden_file +
                                                    '.preproc')
    ref_streams = [ref_stream]

    pred_sys_stream = []
    with open(args.pred_file) as f:
        for line in f:
            if line.startswith('#model output:'):
                ans = line[len('#model output:'):].strip().split()
                pred_sys_stream.append(ans)

    prev = [' '.join(o) for o in pred_sys_stream]

    # choose one (gold or pred) and postprocess
    sys_stream = pred_sys_stream
    sys_stream = [
        pp.post_process(o, abstract[i], graph[i])
        for i, o in enumerate(sys_stream)
    ]
Example #5
File: test_simple.py, Project: Ystwryth/DBS
    def setUp(self):
        self.data = read_file('changes_python.txt')
Example #6
import csv
import statistics

import matplotlib.pyplot as plt

import extract

DATA_FILE = "data.txt"
OUTPUT_FILE = "processed_data.csv"


def read_data_file(file_name):
    # read a CSV file back in as a list of row dicts
    with open(file_name, 'r') as csv_file:
        return list(csv.DictReader(csv_file))


# extract the raw records and export them as CSV
data = extract.read_file(DATA_FILE)
extract.write_file(data, OUTPUT_FILE)
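
A hedged follow-up sketch: reloading the exported CSV with the read_data_file helper defined above; the 'value' column name is a hypothetical assumption about the file's schema.

# hypothetical: reload the exported CSV and summarize one assumed column
rows = read_data_file(OUTPUT_FILE)
values = [float(row['value']) for row in rows]  # 'value' is an assumed column name
print('mean:', statistics.mean(values))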