def process_file(i, path_file, full, wk, sk):
    """Extract keyphrases and a summary from one document and save both.

    The keyphrases go to ``out_keys_dir + <file name>`` and the summary to
    ``out_abs_dir + <file name>``, both written with ``seq2file``.

    Args:
        i: Index of the document; only used in progress messages.
        path_file: Path of the document to process.
        full: When the document is dissected (``DIRECT`` is false), include
            the abstract in the text handed to the extraction system.
        wk: Keyphrase-count argument forwarded to the extraction system
            (presumably the number of keyphrases wanted -- confirm).
        sk: Summary-size argument forwarded to the extraction system
            (presumably the number of sentences wanted -- confirm).

    Returns:
        None. Returns early, writing nothing, when both outputs already
        exist and ``force`` is off, or when ``DIRECT`` is on and the
        document cannot be read.
    """
    doc_file = dr.path2fname(path_file)
    kf = out_keys_dir + doc_file
    af = out_abs_dir + doc_file

    if match_sizes:
        # Override the requested sizes with those of the gold standard:
        # newline count of the gold key file, period count of the gold
        # abstract. file2string may yield None (the DIRECT branch below
        # checks for it), so a missing gold file keeps the caller's size
        # instead of failing on None.count().
        if keyfiles_count > 0:
            gold_kf = keys_dir + doc_file.replace('.txt', '.key')
            gold_k = (file2string(gold_kf) or '').count('\n')
            if gold_k > 1:
                wk = gold_k
        gold_af = abs_dir + doc_file
        gold_a = (file2string(gold_af) or '').count('.')
        if gold_a > 1:
            sk = gold_a

    if not force and exists_file(kf) and exists_file(af):
        print('SKIPPING ALREADY PROCESSED:', doc_file)
        return

    if DIRECT:
        # Use the raw file content as-is.
        text = file2string(path_file)
        if text is None:
            return
    else:
        # Rebuild the text from the dissected sections; each section is a
        # list of strings that is concatenated with a single-space separator
        # element between sections.
        d = disect_doc(path_file)
        title = d['TITLE']
        abstract = d['ABSTRACT']
        body = d['BODY']
        if full:
            text = ''.join(title + [' '] + abstract + [' '] + body)
        else:
            text = ''.join(title + [' '] + body)

    if SYSTEM == "TEXTRANK":
        (keys, exabs) = keys_and_abs(text, wk, sk)
        print(i, ':', doc_file)
    else:
        # The remaining systems get the text through a temporary file
        # (TEXTCRAFT receives the string itself, but the file is written
        # regardless).
        temp_file = temp_dir + doc_file
        string2file(temp_file, text)
        if SYSTEM == "DOCTALK":
            (keys, xss, nk, ek) = runWithTextAlt(temp_file, wk, sk, dr.isWord)
        elif SYSTEM == "STANZAGRAPHS":
            (keys, xss) = runWithText_StanzaGraphs(temp_file, wk, sk)
            exabs = xss
            print('keys:\n', keys)
            print('abs:\n', xss)
        elif SYSTEM == "TEXTCRAFT":
            (keys, xss, nk, ek) = runWithText(text, wk, sk, dr.isWord)
        # NOTE(review): any other SYSTEM value leaves keys/xss/nk/ek unbound
        # and fails at the progress print below.
        if SYSTEM != "STANZAGRAPHS":
            print(i, ':', doc_file, 'nodes:', nk, 'edges:', ek)
            # Each summary item is a sequence of tokens; join them with
            # spaces before writing.
            exabs = map(lambda x: interleave(' ', x), xss)

    seq2file(kf, keys)
    seq2file(af, exabs)
def fill_out_abs():
    """Save the abstract of every document in ``doc_files`` under ``abs_dir``.

    Each document is dissected with ``disect_doc``; the pieces of its
    'ABSTRACT' section are concatenated and written to
    ``abs_dir + <file name>``, with one progress line printed per document.
    """
    for source_path in doc_files:
        abstract_text = ''.join(disect_doc(source_path)['ABSTRACT'])
        target = abs_dir + dr.path2fname(source_path)
        print('abstract extraced to: ', target)
        string2file(target, abstract_text)
def keys_with_rouge(i):
    """Score extracted keyphrases against gold keys with one ROUGE variant.

    For every document in ``doc_files`` the gold key file (``.key`` under
    ``keys_dir``) and the extracted keys (under ``out_keys_dir``) are read;
    documents with either side missing or empty are reported and skipped.
    The ``i``-th result yielded by ``rs.rstat`` supplies precision, recall
    and F-measure. Averages are printed and the per-file scores are written
    to ``out_keys_dir + 'KeysRouge.csv'``.

    Args:
        i: Position of the wanted result in the ``rs.rstat`` output; also
            indexes the labels ``(1, 2, 'l', 'w')`` in the summary line.
    """
    names, precisions, recalls, f_scores = [], [], [], []

    for doc_path in doc_files:
        fname = dr.path2fname(doc_path)
        gold_path = (keys_dir + fname).replace('.txt', '.key')
        silver_path = out_keys_dir + fname

        gold = file2string(gold_path)
        silver = file2string(silver_path)
        if not gold:
            print('gold file missing:', gold_path)
            continue
        if not silver:
            print('silver file missing:', silver_path)
            continue

        for idx, res in enumerate(rs.rstat(silver, gold)):
            if idx > i:
                # Past the requested variant; nothing more to collect.
                break
            if idx == i:
                scores = res[0]
                px = scores['p'][0]
                rx = scores['r'][0]
                fx = scores['f'][0]
                names.append(fname)
                precisions.append(px)
                recalls.append(rx)
                f_scores.append(fx)

        if trace_mode:
            print(' ABS ROUGE MOV. AVG', i, fname,
                  avg(precisions), avg(recalls), avg(f_scores))

    rouge_name = (1, 2, 'l', 'w')
    print("KEYS ROUGE", rouge_name[i], ':',
          avg(precisions), ' ', avg(recalls), ' ', avg(f_scores))

    # Persist the per-file scores as CSV.
    header = 'fileName, Precision, Recall, F-Measure\n'
    rows = score2txt(names, precisions, recalls, f_scores)
    string2file(out_keys_dir + 'KeysRouge.csv', header + rows)
def eval_keys():
    """Score extracted keyphrases against the gold keys with ``ks.kstat``.

    For every document in ``doc_files`` the gold key file and the extracted
    keys (under ``out_keys_dir``) are read; documents with either side
    missing or empty, or for which ``ks.kstat`` yields nothing, are reported
    and skipped. Average precision, recall and F-measure are printed and the
    per-file scores are written to ``out_keys_dir + 'KeysScores.csv'``.
    """
    files = []
    p = []
    r = []
    f = []

    for doc_file in doc_files:
        fname = dr.path2fname(doc_file)
        # Resolve the gold path once so the diagnostic below names the file
        # that was actually looked up, not its '.txt' counterpart.
        ref_name = txt2key(keys_dir + fname)
        keys_name = out_keys_dir + fname

        gold = file2string(ref_name)
        silver = file2string(keys_name)
        if not gold:
            print('gold file missing:', ref_name)
            continue
        if not silver:
            print('silver file missing:', keys_name)
            continue

        d = ks.kstat(silver, gold)
        if not d:
            # Dump both sides to make the failure diagnosable.
            print('FAILING on', fname)
            print('SILVER', silver)
            print('GOLD', gold)
            continue
        if trace_mode:
            print(' KEYS', d)

        px = d['p']
        rx = d['r']
        fx = d['f']
        files.append(fname)
        p.append(px)
        r.append(rx)
        f.append(fx)

    print('KEYS SCORES :', avg(p), avg(r), avg(f))

    # Persist the per-file scores as CSV.
    content = 'fileName, Precision, Recall, F-Measure' + '\n'
    content += score2txt(files, p, r, f)
    string2file(out_keys_dir + 'KeysScores.csv', content)
def eval_abs():
    """Score extracted summaries against the gold abstracts with ``ks.kstat``.

    For every document in ``doc_files`` the gold abstract (under ``abs_dir``)
    and the extracted summary (under ``out_abs_dir``) are read; documents
    with either side missing or empty, or for which ``ks.kstat`` yields
    nothing, are reported and skipped. A document only contributes to the
    averages and the CSV when its precision, recall and F-measure are all
    truthy. Averages are printed and the per-file scores are written to
    ``out_abs_dir + 'AbsScores.csv'``.
    """
    names, precisions, recalls, f_scores = [], [], [], []

    for doc_path in doc_files:
        fname = dr.path2fname(doc_path)
        gold_path = abs_dir + fname
        silver_path = out_abs_dir + fname

        gold = file2string(gold_path)
        silver = file2string(silver_path)
        if not gold:
            print('gold file missing:', gold_path)
            continue
        if not silver:
            print('silver file missing:', silver_path)
            continue

        stats = ks.kstat(silver, gold)
        if not stats:
            print('FAILING on', fname)
            continue
        if trace_mode:
            print(' ABS SCORE:', stats)

        px = stats['p']
        rx = stats['r']
        fx = stats['f']
        # Zero or otherwise falsy scores are left out of the statistics.
        if px and rx and fx:
            names.append(fname)
            precisions.append(px)
            recalls.append(rx)
            f_scores.append(fx)

        if trace_mode:
            print(' ABS MOV. AVG', fname,
                  avg(precisions), avg(recalls), avg(f_scores))

    print("ABS SCORES :", avg(precisions), avg(recalls), avg(f_scores))

    # Persist the per-file scores as CSV.
    header = 'fileName, Precision, Recall, F-Measure\n'
    rows = score2txt(names, precisions, recalls, f_scores)
    string2file(out_abs_dir + 'AbsScores.csv', header + rows)