def relatedTermByLexicon(TermJson, Word2TermJson, Save2Json):
    relatedTerms = {}  # TERM -> [related TERM]
    dfTermOfTtl = fif.load_json(TermJson)
    idxW2TTtl = fif.load_json(Word2TermJson)
    print "find related terms"
    for t in dfTermOfTtl:
        words = t.split()
        wordSet = set(words)
        if not len(words) > 1:
            continue
        # find other terms that also contain the same $words
        related = {}
        for w in words:
            containingTerms = idxW2TTtl.get(w, None)
            if not containingTerms:
                continue
            # a candidate must share at least two whole words with this term
            for ct in containingTerms:
                nCommonWords = len(wordSet & set(ct.split()))
                if nCommonWords >= 2:
                    counter.inc(related, ct)
        if t in related:
            del related[t]  # delete self-relatedness
        top = sorted(related.items(), key=lambda (k, v): v, reverse=True)[:10]
        # sort again by DF
        top = [(k, dfTermOfTtl.get(k, 0)) for (k, v) in top]
        top_s = sorted(top, key=lambda (k, v): v, reverse=True)[:5]
        rel = [i[0] for i in top_s]
        if not rel:
            continue
        #print "%s related to %s" % (t, top_s)
        relatedTerms[t] = rel
    fif.save_json(Save2Json, relatedTerms)

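# NOTE: counter.inc (above) and counter.inc_2d (in wordchain below) are
# assumed to be small dict-counting helpers. A minimal sketch of the
# interface this module relies on -- the real counter module may differ:
#
#   def inc(d, key):
#       d[key] = d.get(key, 0) + 1
#
#   def inc_2d(k1, k2, d):
#       d.setdefault(k1, {})
#       inc(d[k1], k2)
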
def genRelTermTbl_sem(TermTbl):
    print "generate related-terms table"
    cdf_ttl2ttl = fif.load_json('pdb/pm_cdf_ttl2ttl.json')
    cdf_ttl2abs = fif.load_json('pdb/pm_cdf_ttl2abs.json')
    FILE_REL_CSV = 'pdb/pm_relatedterms_semantic.csv'
    fif.resetFile(FILE_REL_CSV)
    for ref_term, df in cdf_ttl2ttl.items():
        # related terms by CDF of TITLE
        relTtl = sorted(df.items(), key=lambda (k, v): v, reverse=True)[:8]
        relTtl = [t for (t, cnt) in relTtl]
        relTtl_str = '|'.join(
            ['%s;%s' % (t, TermTbl.get(t, 0)) for t in relTtl])
        # related terms by CDF of ABSTRACT
        # (look up ref_term; the previous .get(t, ...) used t leaked from
        # the list comprehension above)
        dfa = cdf_ttl2abs.get(ref_term, None)
        if not dfa:
            continue
        relAbs = sorted(dfa.items(), key=lambda (k, v): v, reverse=True)[:15]
        relAbs = [t for (t, cnt) in relAbs]
        relAbs_str = '|'.join(
            ['%s;%s' % (t, TermTbl.get(t, 0)) for t in relAbs])
        line = '%s,%d,%s,%s' % (ref_term, TermTbl.get(ref_term, 0),
                                relTtl_str, relAbs_str)
        fif.addLineToFile(FILE_REL_CSV, line)

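# Illustrative row of pm_relatedterms_semantic.csv (made-up values):
# ref_term, its TermTbl count, then '|'-joined 'term;count' pairs for the
# title-related and abstract-related terms:
#
#   gene expression,1042,microarray;311|transcription;205,rna;118|dna;97
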
def genTblPaperTerm(TermTbl):
    print "generate paper->term table"
    FILE_PT_CSV = 'pdb/SqlPaperTerm.csv'
    fif.resetFile(FILE_PT_CSV)
    sig_t_abs = fif.load_json('pdb/pm_df_t_abs_3.json')
    print "dump records to", FILE_PT_CSV
    for pt in readin.paperterms():
        pid, t_ttl, t_abs = pt[0], pt[1], pt[2]
        # keep only abstract terms with a moderate document frequency
        t_abs_good = [
            t for t in t_abs
            if sig_t_abs.get(t, 0) > 5 and sig_t_abs[t] < 2000
        ]
        # rank the 1-grams by DF (lower is better)
        t_abs_ngram = [t for t in t_abs_good if len(t.split()) > 1]
        t_abs_1gram = [t for t in t_abs_good if t not in t_abs_ngram]
        t_abs_1gram = sorted(t_abs_1gram, key=lambda k: sig_t_abs[k])[:2]
        # TODO: better to check abbreviations, e.g. whether they appear
        # often in titles or among frequent terms

        def __termCount(term):
            return '%s:%s' % (term, TermTbl.get(term, 0))

        ttl_term_str = ';'.join([__termCount(t) for t in t_ttl])
        abs_term_str = ';'.join(
            [__termCount(t) for t in t_abs_ngram + t_abs_1gram])
        line = '%s,%s,%s' % (pid, ttl_term_str, abs_term_str)
        fif.addLineToFile(FILE_PT_CSV, line)

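# readin.paperterms() is assumed to yield one record per paper shaped as
# (paper_id, title_terms, abstract_terms); an illustrative record, with
# made-up values:
#
#   ('12345678', ['gene expression'], ['microarray', 'rna', 'cell line'])
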
def makeTable():
    dftt = fif.load_json(FILE_TERM_DFT)
    dfta = fif.load_json(FILE_TERM_DFA)
    terms = dftt.keys() + dfta.keys()
    tid, this_id = {}, 1
    for t in terms:
        if t not in tid:
            tid[t] = this_id
            this_id += 1
    print "termtbl size", len(tid)
    fif.resetFile(FILE_TERM_TBL_CSV)
    fif.addLineToFile(FILE_TERM_TBL_CSV, "termid, term")
    for t, _id in tid.items():
        fif.addLineToFile(FILE_TERM_TBL_CSV, '%d,"%s"' % (_id, t))
    fif.save_json(FILE_TERM_TBL_JSON, tid)

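# NOTE: the fif helpers used throughout this module are assumed to be thin
# wrappers around json and line-oriented file I/O. A minimal sketch of the
# expected behavior -- the real fif module may differ:
#
#   import json
#
#   def load_json(path):
#       with open(path) as f:
#           return json.load(f)
#
#   def save_json(path, obj):
#       with open(path, 'w') as f:
#           json.dump(obj, f)
#
#   def resetFile(path):  # truncate, so addLineToFile starts from empty
#       open(path, 'w').close()
#
#   def addLineToFile(path, line):
#       with open(path, 'a') as f:
#           f.write(line + '\n')
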
def genRelTermTbl_lex(TermTbl):
    rel_lex = fif.load_json('pdb/pm_rel_t_ttl.json')
    FILE_REL_LEX_CSV = 'pdb/pm_relatedterms_lexical.csv'
    fif.resetFile(FILE_REL_LEX_CSV)
    for ref_term, terms in rel_lex.items():
        if TermTbl.get(ref_term, 0) == 0:
            continue
        term_str = '|'.join(
            ['%s;%s' % (t, TermTbl.get(t, 0)) for t in terms])
        line = '%s,%d,%s' % (ref_term, TermTbl.get(ref_term, 0), term_str)
        fif.addLineToFile(FILE_REL_LEX_CSV, line)

def genTerm2Paper(termtbl):
    print "generate term2paper table"
    # load existing term -> paper_id index
    t2p = fif.load_json('pdb/pm_idx_t2p_ttl.json')
    FILE_T2P_CSV = 'pdb/pm_index_t2p.csv'
    fif.resetFile(FILE_T2P_CSV)
    for t, plist in t2p.items():
        if not t:
            continue
        if not len(plist) > 1:
            continue
        if len(plist) > 2000:
            continue  # TODO: think about this cap
        tid = termtbl[t]
        plist_str = ','.join(plist)
        line = '%s,%d,"%s"' % (t, tid, plist_str)
        fif.addLineToFile(FILE_T2P_CSV, line)

def headwording(TermJson, Save2Json):
    print "\nfind index from headword to terms"
    dfTermOfTtl = fif.load_json(TermJson)
    indexHeading_pre = {}
    indexHeading = {}
    for t, count in dfTermOfTtl.items():
        words = t.split()
        if not len(words) > 1:
            continue
        headword = words[0]
        # collect every multi-word term under its headword
        indexHeading_pre.setdefault(headword, []).append((t, count))
    for hw, tlist in indexHeading_pre.items():
        indexHeading[hw] = sorted(
            tlist, key=lambda (term, count): count, reverse=True)
    fif.save_json(Save2Json, indexHeading)

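# The saved index maps each headword to its multi-word terms sorted by
# descending count; illustrative output (json turns the tuples into lists):
#
#   {"gene": [["gene expression", 1042], ["gene therapy", 388]], ...}
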
def wordchain():
    print "find wordchain"
    terms = fif.load_json(cfg.FILE_TERM_DF_ABOVE2_JSON)
    chain_nex = {}  # this_word -> next_word -> count
    chain_pre = {}  # this_word -> previous_word -> count
    for t in terms:
        words = t.split()
        for i, w in enumerate(words[:-1]):
            counter.inc_2d(w, words[i + 1], chain_nex)
            counter.inc_2d(words[i + 1], w, chain_pre)
    print " - total term size", len(terms)
    print " - total nex_wordchain size", util.dictsize(chain_nex)
    print " - total pre_wordchain size", util.dictsize(chain_pre)
    for w in chain_nex.keys()[:5]:
        print w, "-->", chain_nex[w]
    for w in chain_pre.keys()[:5]:
        print w, "<--", chain_pre[w]
    # interactive lookup loop
    while True:
        w = raw_input('type a word: ')
        print ""
        print " --> ", chain_nex.get(w, {})
        print " <-- ", chain_pre.get(w, {})

def readPaperHash2Id():
    return fif.load_json(FileHash2Id)


def loadDFTermTtl():
    return fif.load_json("pdb/pm_df_t_ttl_2.json")


def loadDFTermAbs():
    return fif.load_json("pdb/pm_df_t_abs_2.json")


def loadIndexW2T():
    return fif.load_json(__IDX_WORD2TERM_FILE)


def loadIndexT2P():
    return fif.load_json(__IDX_TERM2PAPER_FILE)

def generateTables():
    ttbl = fif.load_json('pdb/pm_term_tbl.json')
    genTblPaperTerm(ttbl)

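# A plausible end-to-end call order, assuming the intermediate pdb/*.json
# inputs already exist -- a sketch, not the canonical driver:
#
#   if __name__ == '__main__':
#       makeTable()
#       ttbl = fif.load_json('pdb/pm_term_tbl.json')
#       genTblPaperTerm(ttbl)
#       genRelTermTbl_lex(ttbl)
#       genRelTermTbl_sem(ttbl)
#       genTerm2Paper(ttbl)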