Example #1
def relatedTermByLexicon(TermJson, Word2TermJson, Save2Json):

    relatedTerms = {}  # TERM -> [related TERM]

    dfTermOfTtl = fif.load_json(TermJson)
    idxW2TTtl = fif.load_json(Word2TermJson)
    print "find related terms"
    for t in dfTermOfTtl:
        words = t.split()
        wordSet = set(words)
        if not len(words) > 1: continue
        # find other terms that also contain the same $words
        related = {}
        for w in words:
            containingTerms = idxW2TTtl.get(w, None)
            if not containingTerms: continue
            # the candidate term must share at least two of $t's words (substring match)
            for ct in containingTerms:
                __nCommonWords = 0
                for w2 in wordSet:
                    if w2 in ct: __nCommonWords += 1
                if __nCommonWords >= 2: counter.inc(related, ct)
        if t in related: del related[t]  # delete self relatedness
        top = sorted(related.items(), key=lambda (k, v): v, reverse=True)[:10]
        # sort again by DF
        top = [(k, dfTermOfTtl.get(k, 0)) for (k, v) in top]
        top_s = sorted(top, key=lambda (k, v): v, reverse=True)[:5]
        rel = [i[0] for i in top_s]
        if not rel: continue
        #print "%s related to %s"%(t, top_s)
        relatedTerms[t] = rel
    fif.save_json(Save2Json, relatedTerms)
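
# Note: fif and counter are project-internal helpers that are not shown in this
# example. A minimal sketch of what counter.inc presumably does, inferred from
# the call counter.inc(related, ct) above (an assumption, not the actual module):
def inc(d, key, step=1):
    # bump the count stored under $key in dict $d, creating the entry on first use
    d[key] = d.get(key, 0) + step
# With a counter like this, related[ct] grows once per shared index word, and the
# top candidates are then re-ranked by document frequency before being saved.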
Example #2
def hashcode2PubmedId():

    print "make dict:  id --> year, pub, ttl"
    pidYearPubTtl = getPaperIdYearPubTtl()

    print "make dict:  hash --> year, ttl"
    phashYearTtl = getPaperHashYearTTl()  # old data with hash-id
    badRecords = []
    hash2Pid = {}

    for pid, fields in pidYearPubTtl.items():
        year, ttl = fields[0], fields[2]

        # compute the hash code and make mapping hashcode -> pid
        h = makeHash(ttl, year)

        # check that the hash-code hashed to the same paper
        if not h in phashYearTtl:
            badRecords.append('%s!%s!%s' % (pid, year, ttl))
        else:
            oldfields = phashYearTtl[h]
            oldyear = str(oldfields[0])
            oldttl = oldfields[1]
            test1 = oldyear == year
            test2 = oldttl[:5] == ttl[:5]
            if not test1:
                print "different year"
                print "old year:", oldyear, "type=", type(oldyear)
                print "year:", year, "type=", type(year)
                raw_input()
        hash2Pid[h] = pid

    # save them to a file
    fif.save_json(FileHash2Id, hash2Pid)
    fif.saveIterable('TblHash2IdBad.txt', badRecords)
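
# Note: makeHash is defined elsewhere in the project. The sketch below only
# illustrates its assumed shape (a digest over a normalized title plus the year),
# which is what the consistency check above relies on; it is not the real code.
import hashlib

def make_hash(ttl, year):
    key = '%s|%s' % (str(year).strip(), ttl.strip().lower())
    return hashlib.md5(key.encode('utf-8')).hexdigest()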
Example #3
def cdf(term_getter, jsonfile, threshold_ppr=0, threshold_occ=2, debug=0):

    print "find cdf: using term getter", term_getter, "ppr>", threshold_ppr, "coocr>", threshold_occ
    _conditionalDF = {}
    mapT2P = cmpl.loadIndexT2P()
    # TODO - load only the ">2" terms
    mapP2T = impt.loadP2T(term_getter)

    print "find cdf: loop all terms"
    # for each term check all papers whose title containing this term
    for t in mapT2P:
        if not t: continue
        if not len(t.split()) > 1: continue
        pidlist = mapT2P[t]
        if not len(pidlist) > threshold_ppr: continue

        # for each paper, count the terms based on $term_getter
        for pid in pidlist:
            targetTerms = [i for i in mapP2T[pid] if len(i.split()) > 1]

            counter.count_2d(t, targetTerms, _conditionalDF)
            #print t, " in:", pid, "co-terms:", targetTerms

        #delete terms that do not co-occur more than $threshold times
        util.dictfilter2d_inplace(_conditionalDF, threshold_occ)

        #if(len(_conditionalDF)>10): break

    util.dictfilter2d_nonempty_inplace(_conditionalDF)

    for k in _conditionalDF.keys()[:10]:
        print "Conditional DF - %s => %s" % (k, _conditionalDF[k])
    print "Conditional DF: %s" % util.dictsize(_conditionalDF)

    fif.save_json(jsonfile, _conditionalDF)
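
# Note: the counter and util helpers above are project-internal. Their behaviour,
# as inferred from the calls in this function, is roughly the following
# approximation (not the real modules):
def count_2d(key, items, d2):
    # d2 is a nested dict: d2[key][item] -> co-occurrence count
    row = d2.setdefault(key, {})
    for item in items:
        row[item] = row.get(item, 0) + 1

def dictfilter2d_inplace(d2, threshold):
    # drop inner entries whose count does not exceed $threshold
    for key in list(d2.keys()):
        row = d2[key]
        for item in list(row.keys()):
            if row[item] <= threshold:
                del row[item]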
Example #4
def headwording(TermJson, Save2Json):
    print "\nfind index from headword to terms"
    dfTermOfTtl = fif.load_json(TermJson)
    indexHeading_pre = {}
    indexHeading = {}
    for t, count in dfTermOfTtl.items():
        words = t.split()
        if not len(words) > 1: continue
        headword = words[0]
        if not headword in indexHeading_pre:
            indexHeading_pre[headword] = []
        indexHeading_pre[headword].append((t, count))  # collect every term for this headword
    for hw, pairs in indexHeading_pre.items():
        indexHeading[hw] = sorted(pairs,
                                  key=lambda (term, count): count,
                                  reverse=True)

    fif.save_json(Save2Json, indexHeading)
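
# A self-contained illustration of the headword grouping on made-up data,
# without the project's fif module:
dfTermOfTtl_demo = {"gene expression": 12, "gene therapy": 30, "cell line": 7}
index_pre = {}
for term, count in dfTermOfTtl_demo.items():
    words = term.split()
    if len(words) > 1:
        index_pre.setdefault(words[0], []).append((term, count))
index_demo = dict((hw, sorted(pairs, key=lambda p: p[1], reverse=True))
                  for hw, pairs in index_pre.items())
# index_demo -> {'gene': [('gene therapy', 30), ('gene expression', 12)],
#                'cell': [('cell line', 7)]}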
Example #5
def makeTable():

    dftt = fif.load_json(FILE_TERM_DFT)
    dfta = fif.load_json(FILE_TERM_DFA)
    terms = dftt.keys() + dfta.keys()

    tid, this_id = {}, 1
    for t in terms:
        if not t in tid:
            tid[t] = this_id
            this_id += 1
    print "termtbl size", len(tid)

    fif.resetFile(FILE_TERM_TBL_CSV)
    fif.addLineToFile(FILE_TERM_TBL_CSV, "termid,term")
    for t, _id in tid.items():
        fif.addLineToFile(FILE_TERM_TBL_CSV, '%d,"%s"' % (_id, t))
    fif.save_json(FILE_TERM_TBL_JSON, tid)
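
# Note: the rows above are written by hand through fif.addLineToFile, so a term
# containing a comma or double quote would break the CSV line. A sketch of the
# same table written with the standard csv module instead (an alternative, not
# what the project uses):
import csv

def write_term_table(path, tid):
    # open with 'wb' on Python 2 or newline='' on Python 3 to avoid extra blank rows
    with open(path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['termid', 'term'])
        for term, term_id in tid.items():
            writer.writerow([term_id, term])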
Example #6
def indexing():
    IndexT2P_InTtl = {}
    IndexW2T_InTtl = {}

    print "indexing using data from", psr.PM_TERM_FILE
    with open(psr.PM_TERM_FILE, 'r') as f:
        for line in f:
            pid, termTtl, termAbs, wordTtl, wordAbs = psr.parseTermLine(line)
            if not pid: continue
            for t in termTtl:
                if not t in IndexT2P_InTtl: IndexT2P_InTtl[t] = []
                IndexT2P_InTtl[t].append(pid)

                for word in t.split():
                    counter.count_2d(word, [t], IndexW2T_InTtl)

        # save the indexes
        print "Size of Index TermTtl2Paper", util.dictsize(IndexT2P_InTtl)
        print "Size of Index Word2TermOfTtl", util.dictsize(IndexW2T_InTtl)
        fif.save_json(__IDX_TERM2PAPER_FILE, IndexT2P_InTtl)
        fif.save_json(__IDX_WORD2TERM_FILE, IndexW2T_InTtl)
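
# Assuming fif.save_json writes plain JSON, the saved term index can be read back
# and queried like this (a hypothetical reader, not part of the module above):
import json

def papers_with_term(term, index_file):
    # IndexT2P_InTtl maps a term to the list of paper ids whose title contains it
    with open(index_file) as f:
        index_t2p = json.load(f)
    return index_t2p.get(term, [])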
Example #7
def df():

    DFTermTtl, DFTermAbs, DFWordTtl, DFWordAbs = {}, {}, {}, {}

    print "df calculation using data from", psr.PM_TERM_FILE
    with open(psr.PM_TERM_FILE, 'r') as f:
        for line in f:
            pid, termTtl, termAbs, wordTtl, wordAbs = psr.parseTermLine(line)
            counter.count(termTtl, DFTermTtl)
            counter.count(termAbs, DFTermAbs)
            counter.count(wordTtl, DFWordTtl)
            counter.count(wordAbs, DFWordAbs)

    print "Size of DF TermTtl", util.dictsize(DFTermTtl)
    print "Size of DF TermAbs", util.dictsize(DFTermAbs)
    print "Size of DF WordTtl", util.dictsize(DFWordTtl)
    print "Size of DF WordAbs", util.dictsize(DFWordAbs)
    fif.save_json("pdb/pm_df_t_ttl.json", DFTermTtl)
    fif.save_json("pdb/pm_df_t_abs.json", DFTermAbs)
    fif.save_json("pdb/pm_df_w_ttl.json", DFWordTtl)
    fif.save_json("pdb/pm_df_w_abs.json", DFWordAbs)

    DFTermTtl2 = util.dictfilter(DFTermTtl, 2)
    DFTermAbs2 = util.dictfilter(DFTermAbs, 2)
    DFWordTtl2 = util.dictfilter(DFWordTtl, 2)
    DFWordAbs2 = util.dictfilter(DFWordAbs, 2)
    print "Size of DF TermTtl above 2", util.dictsize(DFTermTtl2)
    print "Size of DF TermAbs above 2", util.dictsize(DFTermAbs2)
    print "Size of DF WordTtl above 2", util.dictsize(DFWordTtl2)
    print "Size of DF WordAbs above 2", util.dictsize(DFWordAbs2)
    fif.save_json("pdb/pm_df_t_ttl_2.json", DFTermTtl2)
    fif.save_json("pdb/pm_df_t_abs_2.json", DFTermAbs2)
    fif.save_json("pdb/pm_df_w_ttl_2.json", DFWordTtl2)
    fif.save_json("pdb/pm_df_w_abs_2.json", DFWordAbs2)

    DFTermTtl3 = util.dictfilter(DFTermTtl2, 3)
    DFTermAbs3 = util.dictfilter(DFTermAbs2, 3)
    DFWordTtl3 = util.dictfilter(DFWordTtl2, 3)
    DFWordAbs3 = util.dictfilter(DFWordAbs2, 3)
    print "Size of DF TermTtl above 3", util.dictsize(DFTermTtl3)
    print "Size of DF TermAbs above 3", util.dictsize(DFTermAbs3)
    print "Size of DF WordTtl above 3", util.dictsize(DFWordTtl3)
    print "Size of DF WordAbs above 3", util.dictsize(DFWordAbs3)
    fif.save_json("pdb/pm_df_t_ttl_3.json", DFTermTtl3)
    fif.save_json("pdb/pm_df_t_abs_3.json", DFTermAbs3)
    fif.save_json("pdb/pm_df_w_ttl_3.json", DFWordTtl3)
    fif.save_json("pdb/pm_df_w_abs_3.json", DFWordAbs3)