示例#1
0
def process_file(i, path_file, full, wk, sk):
    """Run the configured extraction system on one document and save results.

    The extracted keyphrases are written to ``out_keys_dir + <file name>``
    and the extracted summary to ``out_abs_dir + <file name>`` via
    ``seq2file``.  Nothing is returned.

    Args:
        i: value echoed in the progress line (presumably the document's
            position in the batch -- confirm against the caller).
        path_file: path of the document to process.
        full: when ``DIRECT`` is off, whether the abstract section is
            included in the text handed to the extraction system.
        wk: size argument forwarded to the backend (presumably the number
            of keyphrases -- confirm).  With ``match_sizes`` on, it is
            replaced by the newline count of the gold key file when that
            count exceeds 1.
        sk: size argument forwarded to the backend (presumably the number
            of summary sentences -- confirm).  With ``match_sizes`` on, it
            is replaced by the count of '.' in the gold abstract when that
            count exceeds 1.

    Raises:
        ValueError: if ``SYSTEM`` names none of the supported backends.
    """
    doc_file = dr.path2fname(path_file)
    kf = out_keys_dir + doc_file
    af = out_abs_dir + doc_file

    # Decide whether to skip before touching the gold files, so documents
    # that were already processed cost no extra reads.
    if not force and exists_file(kf) and exists_file(af):
        print('SKIPPING ALREADY PROCESSED:', doc_file)
        return

    if match_sizes:
        # file2string may hand back None (the DIRECT branch below guards
        # against exactly that), so a missing gold file must not crash here;
        # in that case the caller-supplied sizes are kept.
        if keyfiles_count > 0:
            gold_kf = keys_dir + doc_file.replace('.txt', '.key')
            gold_keys = file2string(gold_kf)
            if gold_keys:
                gold_k = gold_keys.count('\n')
                if gold_k > 1:
                    wk = gold_k
        gold_abs = file2string(abs_dir + doc_file)
        if gold_abs:
            gold_a = gold_abs.count('.')
            if gold_a > 1:
                sk = gold_a

    if DIRECT:
        text = file2string(path_file)
        if text is None:
            return
    else:
        d = disect_doc(path_file)
        title = d['TITLE']
        body = d['BODY']
        if full:
            abstract = d['ABSTRACT']
            text = ''.join(title + [' '] + abstract + [' '] + body)
        else:
            text = ''.join(title + [' '] + body)

    if SYSTEM == "TEXTRANK":
        (keys, exabs) = keys_and_abs(text, wk, sk)
        print(i, ':', doc_file)
    else:
        # The temp file is written for every non-TEXTRANK system, even
        # though TEXTCRAFT is given the text itself rather than the path.
        temp_file = temp_dir + doc_file
        string2file(temp_file, text)
        if SYSTEM == "STANZAGRAPHS":
            (keys, exabs) = runWithText_StanzaGraphs(temp_file, wk, sk)
            print('keys:\n', keys)
            print('abs:\n', exabs)
        else:
            if SYSTEM == "DOCTALK":
                (keys, xss, nk, ek) = runWithTextAlt(
                    temp_file, wk, sk, dr.isWord)
            elif SYSTEM == "TEXTCRAFT":
                (keys, xss, nk, ek) = runWithText(text, wk, sk, dr.isWord)
            else:
                # Previously this fell through to an UnboundLocalError on
                # nk; fail with a message that names the real problem.
                raise ValueError('unsupported SYSTEM: %r' % (SYSTEM,))
            print(i, ':', doc_file, 'nodes:', nk, 'edges:', ek)
            exabs = [interleave(' ', x) for x in xss]

    seq2file(kf, keys)
    seq2file(af, exabs)
示例#2
0
def fill_out_abs():
    """Save the abstract of every document in ``doc_files`` under ``abs_dir``.

    Each document is split with ``disect_doc``; the pieces of its
    'ABSTRACT' entry are concatenated and written to a file in ``abs_dir``
    named after the document.
    """
    for source_path in doc_files:
        sections = disect_doc(source_path)
        abstract_text = ''.join(sections['ABSTRACT'])
        target = abs_dir + dr.path2fname(source_path)
        print('abstract extraced to: ', target)
        string2file(target, abstract_text)
示例#3
0
def keys_with_rouge(i):
    """Score extracted keyphrases against gold keys with one ROUGE variant.

    For every document in ``doc_files`` the gold key file (``keys_dir``,
    '.txt' replaced by '.key') and the extracted key file (``out_keys_dir``)
    are compared with ``rs.rstat``.  Only the result at position ``i`` is
    used.  Averages are printed and the per-file scores are written to
    'KeysRouge.csv' in ``out_keys_dir``.

    Args:
        i: position of the wanted result in the sequence produced by
            ``rs.rstat``; also indexes the label tuple (1, 2, 'l', 'w')
            used in the summary line.
    """
    files = []
    f = []
    p = []
    r = []
    for doc_file in doc_files:
        fname = dr.path2fname(doc_file)
        ref_name = (keys_dir + fname).replace('.txt', '.key')
        keys_name = out_keys_dir + fname
        gold = file2string(ref_name)
        silver = file2string(keys_name)
        if not gold:
            print('gold file missing:', ref_name)
            continue
        if not silver:
            print('silver file missing:', keys_name)
            continue
        for k, res in enumerate(rs.rstat(silver, gold)):
            if k < i:
                continue
            if k == i:
                d = res[0]
                files.append(fname)
                p.append(d['p'][0])
                r.append(d['r'][0])
                f.append(d['f'][0])
            # Stop as soon as entry i has been consumed (or was never
            # reachable, for negative i) instead of pulling one more result.
            break
        if trace_mode:
            # Label says KEYS: this function scores keyphrases, not abstracts.
            print('  KEYS ROUGE MOV. AVG', i, fname, avg(p), avg(r), avg(f))
    rouge_name = (1, 2, 'l', 'w')
    print("KEYS ROUGE", rouge_name[i], ':', avg(p), '  ', avg(r), '  ', avg(f))

    # Save the per-file KEYS ROUGE scores.
    content = 'fileName, Precision, Recall, F-Measure' + '\n'
    content += score2txt(files, p, r, f)
    string2file(out_keys_dir + 'KeysRouge.csv', content)
示例#4
0
def eval_keys():
    """Score extracted keyphrases against gold keys using ``ks.kstat``.

    For every document in ``doc_files`` the gold key file (the path under
    ``keys_dir`` mapped through ``txt2key``) is compared with the extracted
    key file in ``out_keys_dir``.  Documents with a missing file or an empty
    ``kstat`` result are reported and skipped.  Averages are printed and
    the per-file scores are written to 'KeysScores.csv' in ``out_keys_dir``.
    """
    files = []
    f = []
    p = []
    r = []
    for doc_file in doc_files:
        fname = dr.path2fname(doc_file)
        # Keep the path that is actually read, so the "missing" message
        # below names the real file rather than its '.txt' counterpart.
        ref_name = txt2key(keys_dir + fname)
        keys_name = out_keys_dir + fname
        gold = file2string(ref_name)
        silver = file2string(keys_name)
        if not gold:
            print('gold file missing:', ref_name)
            continue
        if not silver:
            print('silver file missing:', keys_name)
            continue
        d = ks.kstat(silver, gold)
        if not d:
            print('FAILING on', fname)
            print('SILVER', silver)
            print('GOLD', gold)
            continue
        if trace_mode:
            print('  KEYS', d)
        files.append(fname)
        p.append(d['p'])
        r.append(d['r'])
        f.append(d['f'])
    print('KEYS SCORES :', avg(p), avg(r), avg(f))

    # Save the per-file keys scores.
    content = 'fileName, Precision, Recall, F-Measure' + '\n'
    content += score2txt(files, p, r, f)
    string2file(out_keys_dir + 'KeysScores.csv', content)
示例#5
0
def eval_abs():
    """Score extracted abstracts against gold abstracts using ``ks.kstat``.

    Every document in ``doc_files`` has its gold abstract (``abs_dir``)
    compared with the extracted one (``out_abs_dir``).  Documents with a
    missing file or an empty ``kstat`` result are reported and skipped.
    Averages are printed and the per-file scores are written to
    'AbsScores.csv' in ``out_abs_dir``.
    """
    names, precisions, recalls, f_scores = [], [], [], []

    for doc_path in doc_files:
        fname = dr.path2fname(doc_path)
        gold_path = abs_dir + fname
        silver_path = out_abs_dir + fname
        gold = file2string(gold_path)
        silver = file2string(silver_path)

        if not gold:
            print('gold file missing:', gold_path)
            continue
        if not silver:
            print('silver file missing:', silver_path)
            continue

        stats = ks.kstat(silver, gold)
        if not stats:
            print('FAILING on', fname)
            continue

        if trace_mode:
            print('  ABS SCORE:', stats)

        scores = (stats['p'], stats['r'], stats['f'])
        # NOTE(review): a document is recorded only when all three scores
        # are truthy, so zero scores are left out of the averages -- confirm
        # this is intended.
        if all(scores):
            names.append(fname)
            precisions.append(scores[0])
            recalls.append(scores[1])
            f_scores.append(scores[2])

        if trace_mode:
            print('  ABS MOV. AVG', fname,
                  avg(precisions), avg(recalls), avg(f_scores))

    print("ABS SCORES  :", avg(precisions), avg(recalls), avg(f_scores))

    # Save the per-file ABS scores.
    header = 'fileName, Precision, Recall, F-Measure\n'
    rows = score2txt(names, precisions, recalls, f_scores)
    string2file(out_abs_dir + 'AbsScores.csv', header + rows)