예제 #1
0
def postprocess():

    print "make paper table: paperid, year, journal, title, authors, authorids"

    # load existing author table
    AuthTblFile       ='pdb/pm_author_count.csv'
    atbl = fif.loadCSV(AuthTblFile)

    FILE_PPRMETA_CSV = "pdb/pm_paper_meta.csv"
    fif.resetFile(FILE_PPRMETA_CSV)
    fif.addLineToFile(FILE_PPRMETA_CSV, "paperid,year,journal,title,authors")

    #TODO-put journal table generation from compiler to importer
    jnl_name2id = fif.loadCSV('pdb/pm_jnl.csv')

    # load and process each paper
    for pt in papermeta():
        _id, _year, _jnl, authors, ttl, abstract = pt
        if not _id: continue
        if not ttl: ttl='-'

        _idstr=str(_id).strip('L')
        jnlid=jnl_name2id.get(_jnl,0)
        _linemeta = '%s,%s,%s,"%s",'%(_idstr, _year, jnlid, ttl)


        if not authors:
            authortext='"-;0"'
        else:
            authortext='"%s"'%'|'.join(['%s;%s'%(a, atbl.get(a,0)) for a in authors])

        _linemeta += authortext
        fif.addLineToFile(FILE_PPRMETA_CSV, _linemeta)
예제 #2
0
def genTblPaperTerm(TermTbl):
    print "generate paper->term table"

    FILE_PT_CSV = 'pdb/SqlPaperTerm.csv'
    fif.resetFile(FILE_PT_CSV)

    sig_t_abs = fif.load_json('pdb/pm_df_t_abs_3.json')

    print "dump records to", FILE_PT_CSV
    for pt in readin.paperterms():
        pid, t_ttl, t_abs = pt[0], pt[1], pt[2]

        # remove some terms of abstract
        t_abs_good = [
            t for t in t_abs if sig_t_abs.get(t, 0) > 5 and sig_t_abs[t] < 2000
        ]

        # rank the 1gram by df (lower is better)
        t_abs_ngram = [t for t in t_abs_good if len(t.split()) > 1]
        t_abs_1gram = [t for t in t_abs_good if not t in t_abs_ngram]
        t_abs_1gram = sorted(t_abs_1gram, key=lambda k: sig_t_abs[k])[:2]

        #TODO: better to check abbreviation, if offen in title or frequent terms, etc

        def __termCount(term):
            return '%s:%s' % (term, TermTbl.get(term, 0))

        ttl_term_str = ';'.join([__termCount(t) for t in t_ttl])
        abs_term_str = ';'.join(
            [__termCount(t) for t in t_abs_ngram + t_abs_1gram])

        line = '%s,%s,%s' % (pid, ttl_term_str, abs_term_str)
        fif.addLineToFile(FILE_PT_CSV, line)
예제 #3
0
def genRelTermTbl_sem(TermTbl):
    print "generate related-terms table"
    cdf_ttl2ttl = fif.load_json('pdb/pm_cdf_ttl2ttl.json')
    cdf_ttl2abs = fif.load_json('pdb/pm_cdf_ttl2abs.json')
    FILE_REL_CSV = 'pdb/pm_relatedterms_semantic.csv'
    fif.resetFile(FILE_REL_CSV)
    for ref_term, df in cdf_ttl2ttl.items():

        # related terms by CDF of TITLE
        relTtl = sorted(df.items(), key=lambda (k, v): v, reverse=True)[:8]
        relTtl = [t for (t, cnt) in relTtl]
        relTtl_str = '|'.join(
            ['%s;%s' % (t, TermTbl.get(t, 0)) for t in relTtl])

        # related terms by CDF of ABSTRACT
        dfa = cdf_ttl2abs.get(t, None)
        if not dfa: continue
        relAbs = sorted(dfa.items(), key=lambda (k, v): v, reverse=True)[:15]
        relAbs = [t for (t, cnt) in relAbs]
        relAbs_str = '|'.join(
            ['%s;%s' % (t, TermTbl.get(t, 0)) for t in relAbs])

        line = '%s,%d,%s,%s' % (ref_term, TermTbl.get(
            ref_term, 0), relTtl_str, relAbs_str)
        fif.addLineToFile(FILE_REL_CSV, line)
예제 #4
0
def __dumpId2IdList(filename, id2IdList):
    fif.resetFile(filename)
    print "dump each record line ..."
    for iid, idlist in id2IdList.items():
        if not idlist: continue  # could be empty for author->org
        for iiid in idlist:
            line = '%s,%s' % (iid, iiid)
            fif.addToFile(filename, line, isline=1)
예제 #5
0
def genRelTermTbl_lex(TermTbl):
    """Write the lexical related-terms CSV; terms with id 0 are skipped."""
    rel_lex = fif.load_json('pdb/pm_rel_t_ttl.json')
    out_csv = 'pdb/pm_relatedterms_lexical.csv'
    fif.resetFile(out_csv)
    for ref_term, terms in rel_lex.items():
        ref_id = TermTbl.get(ref_term, 0)
        if ref_id == 0:
            continue
        term_str = '|'.join('%s;%s' % (t, TermTbl.get(t, 0)) for t in terms)
        fif.addLineToFile(out_csv, '%s,%d,%s' % (ref_term, ref_id, term_str))
예제 #6
0
def extractPapers(pubmedFile='pmed/pubmed_1996.txt'):
    pubmedFile = 'pubmed_result.txt'
    pubmedFile = 'pmed/pubmed_neurosci.txt'
    print "extract papers from", pubmedFile

    fif.resetFile(FileTblPaper)
    fif.resetFile(FileTblAuthor)
    fif.resetFile(FileTblDepart)
    fif.resetFile(FileTblPub)

    papertext = ''
    print "crunching", pubmedFile
    papers = []
    with open(pubmedFile, 'r') as fh:
        for i, line in enumerate(fh):
            if line[0:5] == 'PMID-':
                p = parse(papertext)
                papers.append(p)
                papertext = ''
            papertext += line

            if Publisher_Name2Id.len() > 5e10:
                break
    print "crunching done"

    with open(FilePaperPkl, 'w') as fh:
        pickle.dump(papers[1:], fh)
    print "saved to", FilePaperPkl

    #dump2file()
    dump2pickle()
    print "extract papers - ALL DONE\n"
예제 #7
0
def genTerm2Paper(termtbl):
    print "generate term2paper table"
    # load exiting term->paper_id
    t2p = fif.load_json('pdb/pm_idx_t2p_ttl.json')
    FILE_T2P_CSV = 'pdb/pm_index_t2p.csv'
    fif.resetFile(FILE_T2P_CSV)
    for t, plist in t2p.items():
        if not t: continue
        if not len(plist) > 1: continue
        if len(plist) > 2000: continue  # TODO: think about this
        tid = termtbl[t]
        plist_str = ','.join(plist)
        line = '%s,%d,"%s"' % (t, tid, plist_str)
        fif.addLineToFile(FILE_T2P_CSV, line)
예제 #8
0
def overwrite():
    """Rewrite the pid->terms file; hashes missing from hash2pid are logged."""
    hash2pid = readPaperHash2Id()
    missing = '!NO HASH IN HASH2PID!'
    fif.resetFile(FilePid2Term)
    for rec in psr.paperterms():
        hashid, termTtl, termAbs = rec[0], rec[1], rec[2]
        if hashid not in hash2pid:
            missing += "%s\n" % hashid
            continue
        text = '%s!%s!%s' % (hash2pid[hashid],
                             ';'.join(termTtl), ';'.join(termAbs))
        fif.addToFile(FilePid2Term, text, isline=1)
    with open('BadHash2Pid.txt', 'w') as f:
        f.write(missing)
예제 #9
0
def makeTable():

    dftt = fif.load_json(FILE_TERM_DFT)
    dfta = fif.load_json(FILE_TERM_DFA)
    terms = dftt.keys() + dfta.keys()

    tid, this_id = {}, 1
    for t in terms:
        if not t in tid:
            tid[t] = this_id
            this_id += 1
    print "termtbl size", len(tid)

    fif.resetFile(FILE_TERM_TBL_CSV)
    fif.addLineToFile(FILE_TERM_TBL_CSV, "termid, term")
    for t, _id in tid.items():
        fif.addLineToFile(FILE_TERM_TBL_CSV, '%d,"%s"' % (_id, t))
    fif.save_json(FILE_TERM_TBL_JSON, tid)
예제 #10
0
def makeSqlTblPaper():

    print "\nmake === PAPER TABLE ==="

    # paperid, year, publisher, title, abstract, authors
    fif.resetFile(FileSqlPaper)
    print "dump each record line ..."
    for meta in readin.papermeta():
        iid = meta[1]
        year = meta[2]
        pub = sqlization(meta[3])
        pubid = sqlization(meta[4])
        ttl = sqlization(meta[5])
        abstr = sqlization(meta[6])
        authors = sqlization(meta[7])
        # save them to the csv
        line = ','.join([iid, year, pub, pubid, ttl, abstr, authors])
        fif.addToFile(FileSqlPaper, line, isline=1)
예제 #11
0
def makeSqlTblOrg():
    fif.resetFile(FileSqlOrg)
    # id, org-department, orgnization, city, country, georawtext
    for data in readin.getOrgId2Data():
        try:
            iid, depart, org, city, country, geo = data
            #print "orgline", iid, depart
            #print "       > org= ", org
            #print "       > city=", city
            #print "       > co=  ", country
            #print "       > geo= ", geo
            line = '%s,%s,%s,%s,%s,%s' % (iid, sqlization(depart),
                                          sqlization(org), city, country,
                                          sqlization(geo))
            line = '%s,%s,%s,%s,%s' % (iid, sqlization(depart),
                                       sqlization(org), city, country)
            fif.addToFile(FileSqlOrg, line, isline=1)
        except:
            print("BADLINE when making sql for organization", line)
    print "- DONE"
예제 #12
0
def _dumpId2Name(id2name, filename):
    fif.resetFile(filename)
    print "dump each record line ..."
    for iid, name in id2name.items():
        line = '%s,%s' % (iid, sqlization(name))
        fif.addToFile(filename, line, isline=1)