Example #1
def parsePermissions(LICENSETABLE):
    " return dict with publisher name lower cased -> permission color (green or red) "
    pubToPermission = {}
    for row in maxCommon.iterTsvRows(LICENSETABLE):
        pubName = row.pubName.lower()
        if int(row.havePermission) == 1:
            pubToPermission[pubName] = "green"
        else:
            pubToPermission[pubName] = "red"

    for row in maxCommon.iterTsvRows(OATABLE):
        pubToPermission[row.pubName.lower()] = "blue"
        #print pubName.lower()
    return pubToPermission
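Every example on this page iterates over maxCommon.iterTsvRows. A minimal sketch of the behavior these snippets assume from it, for orientation only; the helper below is an assumption, not the real maxCommon code:

# Assumed behavior: yield one namedtuple per data line of a tab-separated file,
# with attribute names taken from the (possibly "#"-prefixed) header line.
import collections, gzip

def iterTsvRowsSketch(fname, encoding=None, isGzip=False, commentPrefix=None):
    fh = gzip.open(fname) if isGzip or fname.endswith(".gz") else open(fname)
    headers = fh.readline().rstrip("\n").lstrip("#").split("\t")
    Row = collections.namedtuple("tsvRec", headers)
    for line in fh:
        if commentPrefix and line.startswith(commentPrefix):
            continue
        fields = line.rstrip("\n").split("\t")
        if encoding is not None:
            fields = [f.decode(encoding) for f in fields]
        yield Row(*fields)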
Example #2
def parseTabPublisherFile(fname):
    " parse a file with columns eIssn, publisher (optional) and urls into a list of records "
    logging.info("Parsing %s" % fname)
    journals = list(maxCommon.iterTsvRows(fname, encoding="latin1"))
    # modify publisher field
    datasetName = splitext(basename(fname))[0]
    headers = list(journals[0]._fields)
    addPubField = False
    if "publisher" not in headers:
        headers.insert(0, "publisher")
        addPubField = True
    JRec = collections.namedtuple("Journal", headers)
    newJournals = []
    for j in journals:
        if j.eIssn.lower()=="print only" or j.eIssn.lower()=="unknown":
            logging.debug("Skipping journal %s, no eIssn" % j.title)
            continue
        if addPubField:
            newJ = [datasetName]
            newJ.extend(j)
            newJRec = JRec(*newJ)
        else:
            newJRec = j
        newJournals.append(newJRec)
    return newJournals
Example #3
def updatePmids(medlineDir, crawlDir, updateIds, minYear=None):
    """ go over subdirs of crawlDir, for each: read the ISSNs, and add new
    PMIDs we have in medlineDir to subdir/pmids.txt

    We never remove a PMID from pmids.txt.
    """ 
    logging.info("Now updating crawler directories with the new PMIDs")
    eIssnToPIssn = getEIssnToPIssn(pubConf.publisherIssnTable)
    issnToPmid, issnToJournal = getIssnPmidDict(medlineDir, updateIds, minYear)
    for subdir in getSubdirs(crawlDir):
        pmidFname = join(crawlDir, subdir, "pmids.txt")
        issnFname = join(crawlDir, subdir, "issns.tab")
        if not isfile(issnFname) or not isfile(pmidFname):
            continue
        logging.debug("reading subdir %s: %s and %s" % (subdir, pmidFname, issnFname))
        issns = [row.issn.strip() for row in maxCommon.iterTsvRows(issnFname)]
        logging.debug("ISSNs: %s" % ",".join(issns))
        # read old pmids
        oldPmids = set([int(line.rstrip()) for line in open(pmidFname)])
        newPmids = set()
        # add new pmids, for each issn
        for issn in issns:
            if issn not in issnToPmid:
                if issn in eIssnToPIssn:
                    logging.debug("Looks like eISSN, mapped to printISSN %s" % issn)
                    issn = eIssnToPIssn[issn]
                else:
                    logging.debug("No Pmids for ISSN %s and not eIssn for it" % issn)

            issnPmids = issnToPmid.get(issn, None)
            if issnPmids==None:
                logging.debug("No Pmids for ISSN %s" % issn)
                continue
            logging.debug("Issn %s, %d PMIDs" % (issn, len(issnPmids)))
            newPmids.update(issnPmids)
        # get some counts and output to user
        oldCount = len(oldPmids)
        updateCount = len(newPmids)
        oldPmids.update(newPmids) # faster to add new to old set than old to new set
        pmids = oldPmids
        newCount = len(pmids)
        addCount = newCount - oldCount
        logging.info("crawl dir %s: old PMID count %d, update has %d, new total %d, added %d" % \
            (subdir, oldCount, updateCount, newCount, addCount))
        # write new pmids
        pmids = [str(x) for x in pmids]
        # randomize order, to distribute errors
        random.shuffle(pmids)

        # write all pmids to a tmp file
        pmidTmpFname = pmidFname+".new"
        pmidFh = open(pmidTmpFname, "w")
        pmidFh.write("\n".join(pmids))
        pmidFh.close()

        # keep a copy of the original pmid file
        shutil.copy(pmidFname, pmidFname+".bak")
        # rename  the tmp file to the original file
        # to make sure that an intact pmid file always exists
        os.rename(pmidTmpFname, pmidFname)
Example #4
def iterArticleDataDir(textDir, type="articles", filterFname=None, updateIds=None):
    """ yields all articleData from all files in textDir 
        Can filter to yield only a set of filenames or files for a 
        given list of updateIds.
    """
    fcount = 0
    if type=="articles":
        baseMask = "*.articles.gz"
    elif type=="files":
        baseMask = "*.files.gz"
    elif type=="annots":
        baseMask = "*.tab.gz"
    else:
        logging.error("Article type %s not valid" % type)
        sys.exit(1)
        
    if isfile(textDir):
        fileNames = [textDir]
        logging.debug("Found 1 file, %s" % textDir)
    else:
        fileMask = os.path.join(textDir, baseMask)
        fileNames = glob.glob(fileMask)
        logging.debug("Looking for all fulltext files in %s, found %d files" % \
            (fileMask, len(fileNames)))
        if updateIds!=None and len(updateIds)!=0:
            logging.debug("Restricting fulltext files to updateIds %s" % str(updateIds))
            filteredFiles = []
            for updateId in updateIds:
                for fname in fileNames:
                    if basename(fname).startswith(str(updateId)+"_"):
                        filteredFiles.append(fname)
                logging.debug("Update Id %s, %d files" % (str(updateId), len(filteredFiles)))
            fileNames = list(filteredFiles)

        logging.debug("Found %d files in input dir %s" % (len(fileNames), textDir))

    pm = maxCommon.ProgressMeter(len(fileNames), stepCount=100)
    for textFname in fileNames:
        if filterFname!=None and not filterFname in textFname:
            logging.warn("Skipping %s, because file filter is set" % textFname)
            continue
        reader = PubReaderFile(textFname)
        logging.debug("Reading %s, %d files left" % (textFname, len(fileNames)-fcount))
        fcount+=1
        if type=="articles":
            for articleData in reader.articleRows:
                if "publisher" not in articleData._fields: # XX temporary bugfix as I have some old files
                    articleData = list(articleData)
                    articleData.insert(2, "")
                    articleData[3] = ""
                yield articleData
        elif type=="files":
            for fileData in reader.fileRows:
                yield fileData
        elif type=="annots":
            for row in maxCommon.iterTsvRows(textFname):
                yield row
        else:
            assert(False) # illegal type parameter
        pm.taskCompleted()
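A hypothetical way to drive the generator above; the directory path is made up for illustration:

# Count the article records found under a text directory.
count = 0
for articleData in iterArticleDataDir("/path/to/textDir"):
    count += 1
print "found %d article records" % count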
Example #5
def parseHighwire():
    """ create two dicts 
    printIssn -> url to pmidlookup-cgi of highwire 
    and 
    publisherName -> top-level hostnames
    >>> temps, domains = parseHighwire()
    >>> temps['0270-6474']
    u'http://www.jneurosci.org/cgi/pmidlookup?view=long&pmid=%(pmid)s'
    >>> domains["Society for Neuroscience"]
    set([u'jneurosci.org'])
    >>> domains["American Society for Biochemistry and Molecular Biology"]
    set([u'jbc.org', u'mcponline.org', u'jlr.org'])
    >>> temps["1535-9476"]
    u'http://www.mcponline.org/cgi/pmidlookup?view=long&pmid=%(pmid)s'
    """
    templates = {}
    domains = {}
    pubFname = pubConf.publisherIssnTable
    logging.info("Parsing %s to find highwire ISSNs/webservers" % pubFname)
    for row in maxCommon.iterTsvRows(pubFname):
        if not row.pubName.startswith("HIGHWIRE"):
            continue
        pubName = row.pubName.replace("HIGHWIRE ","")
        issns = [i.strip() for i in row.journalIssns.split("|")]
        servers = row.webservers.split("|")
        for issn, server in zip(issns, servers):
            template = "http://www."+server+"/cgi/pmidlookup?view=long&pmid=%(pmid)s" 
            templates[issn] = template
            domains.setdefault(pubName, set()).add(server)
            #logging.debug("HIGHWIRE CONFIG %s, %s, %s" % (pubName, template, domains[pubName]))
    return templates, domains
Example #6
    def __init__(self, fname):
        " fname can end in .articles.gz, reader will still read both articles and files "
        logging.debug("Reading data from file with prefix %s (.articles.gz, .files.gz)" % fname)
        baseDir = dirname(fname)
        base = basename(fname).split('.')[0]
        articleFn = join(baseDir, base+".articles.gz")
        fileFn = join(baseDir, base+".files.gz")
        logging.debug("Reading %s and %s" % (articleFn, fileFn))

        self.articleRows = None
        if isfile(articleFn) and getsize(articleFn)!=0:
            self.articleRows = maxCommon.iterTsvRows(articleFn, encoding="utf8")
                
        self.fileRows = None
        if isfile(fileFn) and getsize(fileFn)!=0:
            self.fileRows  = maxCommon.iterTsvRows(fileFn, encoding="utf8")
Example #7
def parseHighwire():
    """ create two dicts
    printIssn -> url to pmidlookup-cgi of highwire
    and
    publisherName -> top-level hostnames
    >>> temps, domains = parseHighwire()
    >>> temps['0270-6474']
    u'http://www.jneurosci.org/cgi/pmidlookup?view=long&pmid=%(pmid)s'
    >>> domains["Society for Neuroscience"]
    set([u'jneurosci.org'])
    >>> domains["American Society for Biochemistry and Molecular Biology"]
    set([u'jbc.org', u'mcponline.org', u'jlr.org'])
    >>> temps["1535-9476"]
    u'http://www.mcponline.org/cgi/pmidlookup?view=long&pmid=%(pmid)s'
    """
    templates = {}
    domains = {}
    pubFname = pubConf.publisherIssnTable
    logging.info("Parsing %s to find highwire ISSNs/webservers" % pubFname)
    for row in maxCommon.iterTsvRows(pubFname):
        if not row.pubName.startswith("HIGHWIRE"):
            continue
        pubName = row.pubName.replace("HIGHWIRE ", "")
        issns = [i.strip() for i in row.journalIssns.split("|")]
        servers = row.webservers.split("|")
        for issn, server in zip(issns, servers):
            template = "http://www." + server + "/cgi/pmidlookup?view=long&pmid=%(pmid)s"
            templates[issn] = template
            domains.setdefault(pubName, set()).add(server)
            #logging.debug("HIGHWIRE CONFIG %s, %s, %s" % (pubName, template, domains[pubName]))
    return templates, domains
Example #8
def concatIdentifiers(inDir, outDir, outFname):
    " concat all identifiers of *_ids.tab files in inDir to outFname, append if exists "
    outPath = join(outDir, outFname)
    inMask = join(inDir, "*_ids.tab")
    idFnames = glob.glob(inMask)
    logging.debug("Concatting exernalIds from %s to %s" % (inMask, outPath))
    extIds = []
    for inFname in idFnames:
        if os.path.getsize(inFname)==0:
            logging.warn("file %s has zero size")
            continue
        for row in maxCommon.iterTsvRows(inFname):
            extIds.append(row.externalId)

    if isfile(outPath):
        ofh = open(outPath, "a")
    else:
        ofh = open(outPath, "w")
        ofh.write("#externalId\n")

    for extId in extIds:
        ofh.write("%s\n" % extId)
    ofh.close()

    return outPath
Example #9
def parseTabPublisherFile(fname):
    " parse a file with columns eIssn, publisher (optional) and urls into a list of records "
    logging.info("Parsing %s" % fname)
    journals = list(maxCommon.iterTsvRows(fname, encoding="latin1"))
    # modify publisher field
    datasetName = splitext(basename(fname))[0]
    headers = list(journals[0]._fields)
    addPubField = False
    if "publisher" not in headers:
        headers.insert(0, "publisher")
        addPubField = True
    JRec = collections.namedtuple("Journal", headers)
    newJournals = []
    for j in journals:
        if j.eIssn.lower() == "print only" or j.eIssn.lower() == "unknown":
            logging.debug("Skipping journal %s, no eIssn" % j.title)
            continue
        if addPubField:
            newJ = [datasetName]
            newJ.extend(j)
            newJRec = JRec(*newJ)
        else:
            newJRec = j
        newJournals.append(newJRec)
    return newJournals
Example #10
    def __init__(self, taxId):
        " open db files, compile patterns, parse input as far as possible "
        mutDataDir = pubConf.varDataDir
        geneDataDir = pubConf.geneDataDir
        if mutDataDir==None:
            return
        self.mutDataDir = mutDataDir
        self.entrez2sym, self.entrez2refprots = parseEntrez(join(geneDataDir, "entrez.tab"))

        # refseq sequences
        fname = join(mutDataDir, "seqs")
        logging.info("opening %s" % fname)
        seqs = pubKeyVal.SqliteKvDb(fname)
        self.seqs = seqs
        
        # refprot to refseqId
        # refseq to CDS Start
        fname = join(mutDataDir, "refseqInfo.tab")
        logging.debug("Reading %s" % fname)
        self.refProtToRefSeq = {}
        self.refSeqCds = {}
        for row in maxCommon.iterTsvRows(fname):
            self.refProtToRefSeq[row.refProt] = row.refSeq
            self.refSeqCds[row.refSeq] = int(row.cdsStart)-1 # NCBI is 1-based

        # refseq to genome
        self.pslCache = {}
        self.refGenePsls      = openIndexedPsls(mutDataDir, "refGenePsls.9606")

        # dbsnp db
        fname = join(self.mutDataDir, "dbSnp.sqlite")
        self.snpDb = sqlite3.connect(fname)


        logging.info("Reading of data finished")
Example #11
def readArticleChunkAssignment(inDir, updateIds):
    "read the assignment of articleId -> chunkId from text directory"

    if updateIds == None:
        inFiles = glob.glob(os.path.join(inDir, "*_index.tab"))
    else:
        inFiles = []
        for updateId in updateIds:
            updateId = str(updateId)
            indexFname = "%s_index.tab" % updateId
            indexPath = os.path.join(inDir, indexFname)
            if isfile(indexPath):
                inFiles.append(indexPath)

    if len(inFiles) == 0:
        logging.warn("No article chunk assignment")
        return None

    logging.debug("Input files for article -> chunk assignment: %s" % inFiles)

    articleChunks = {}
    for inFile in inFiles:
        logging.info("Parsing %s" % inFile)
        for row in maxCommon.iterTsvRows(inFile):
            chunkId = int(row.chunkId.split("_")[1])
            articleChunks[int(row.articleId)] = int(chunkId)
    return articleChunks
Example #12
def concatIdentifiers(inDir, outDir, outFname):
    " concat all identifiers of *_ids.tab files in inDir to outFname, append if exists "
    outPath = join(outDir, outFname)
    inMask = join(inDir, "*_ids.tab")
    idFnames = glob.glob(inMask)
    logging.debug("Concatting exernalIds from %s to %s" % (inMask, outPath))
    extIds = []
    for inFname in idFnames:
        if os.path.getsize(inFname) == 0:
            logging.warn("file %s has zero size")
            continue
        for row in maxCommon.iterTsvRows(inFname):
            extIds.append(row.externalId)

    if isfile(outPath):
        ofh = open(outPath, "a")
    else:
        ofh = open(outPath, "w")
        ofh.write("#externalId\n")

    for extId in extIds:
        ofh.write("%s\n" % extId)
    ofh.close()

    return outPath
Example #13
def readArticleChunkAssignment(inDir, updateIds):
    "read the assignment of articleId -> chunkId from text directory"

    if updateIds == None:
        inFiles = glob.glob(os.path.join(inDir, "*_index.tab"))
    else:
        inFiles = []
        for updateId in updateIds:
            updateId = str(updateId)
            indexFname = "%s_index.tab" % updateId
            indexPath = os.path.join(inDir, indexFname)
            if isfile(indexPath):
                inFiles.append(indexPath)

    if len(inFiles) == 0:
        logging.warn("No article chunk assignment")
        return None

    logging.debug("Input files for article -> chunk assignment: %s" % inFiles)

    articleChunks = {}
    for inFile in inFiles:
        logging.info("Parsing %s" % inFile)
        for row in maxCommon.iterTsvRows(inFile):
            chunkId = int(row.chunkId.split("_")[1])
            articleChunks[int(row.articleId)] = int(chunkId)
    return articleChunks
Example #14
def updatePmids(medlineDir, crawlDir, updateIds, minYear=None):
    """ go over subdirs of crawlDir, for each: read the ISSNs, and add new
    PMIDs we have in medlineDir to subdir/pmids.txt
    We never remove a PMID from pmids.txt.
    """ 
    logging.info("Now updating crawler directories with the new PMIDs")
    eIssnToPIssn = getEIssnToPIssn(pubConf.publisherIssnTable)
    subDirs = getSubdirs(crawlDir)
    con, cur = pubStore.openArticleDb("medline", mustOpen=True, useRamdisk=True)
    for subdir in subDirs:
        if subdir.endswith(".tmp"):
            continue
        subPath = join(crawlDir, subdir)
        logging.info("Processing subdirectory %s" % subPath)
        if isfile(pubCrawlLib.getLockFname(subPath)):
            logging.warn("Found lockfile, looks like a crawl is going on in %s, skipping" % subPath)
            continue

        pmidFname = join(crawlDir, subdir, "pmids.txt")
        issnFname = join(crawlDir, subdir, "issns.tab")
        if not isfile(issnFname) or not isfile(pmidFname):
            logging.info("Skipping %s, ISSN or docId file not found" % subPath)
            continue
        logging.debug("reading subdir %s: %s and %s" % (subdir, pmidFname, issnFname))
        issns = [row.issn.strip() for row in maxCommon.iterTsvRows(issnFname)]
        logging.debug("ISSNs: %s" % ",".join(issns))
        # read old pmids
        oldPmids = set([int(line.rstrip()) for line in open(pmidFname)])
        #newPmids = set()
        # add new pmids, for each issn
        newPmids = getPmidsForIssns(con, cur, issns, minYear)

        logging.debug("%d PMIDs" % (len(newPmids)))
        oldCount = len(oldPmids)
        updateCount = len(newPmids)
        oldPmids.update(newPmids) # faster to add new to old set than old to new set

        pmids = oldPmids
        newCount = len(pmids)
        addCount = newCount - oldCount
        logging.info("crawl dir %s: old PMID count %d, update has %d, new total %d, added %d" % \
            (subdir, oldCount, updateCount, newCount, addCount))

        # write new pmids
        pmids = [str(x) for x in pmids]
        # randomize order, to distribute errors
        random.shuffle(pmids)

        # write all pmids to a tmp file
        pmidTmpFname = pmidFname+".new"
        pmidFh = open(pmidTmpFname, "w")
        pmidFh.write("\n".join(pmids))
        pmidFh.close()

        # keep a copy of the original pmid file
        shutil.copy(pmidFname, pmidFname+".bak")
        # atomic rename  the tmp file to the original file
        # to make sure that an intact pmid file always exists
        os.rename(pmidTmpFname, pmidFname)
Example #15
def getAllBatchIds(outDir):
    """ parse batches.tab and return all available batchIds
    """
    batchIds = []
    for row in maxCommon.iterTsvRows(join(outDir, "batches.tab")):
        batchIds.append(row.batchId)
    logging.debug("Found batchIds %s in directory %s" % (batchIds, outDir))
    return batchIds
Example #16
def loadTsvSqlite(dbFname, tableName, tsvFnames, headers=None, intFields=[], \
    primKey=None, idxFields=[], dropTable=True):
    " load tabsep file into sqlLite db table "
    # if first parameter is string, make it to a list
    if len(tsvFnames)==0:
        logging.debug("No filenames to load")
        return
    if isinstance(tsvFnames, basestring):
        tsvFnames = [tsvFnames]
    if os.path.isfile(dbFname):
        lockDb = False
        finalDbFname = None
    else:
        lockDb = True
        finalDbFname = dbFname
        dbFname = pubGeneric.getFastUniqueTempFname()
        logging.info("writing first to db on ramdisk %s" % dbFname)
    con, cur = openSqlite(dbFname, lockDb=lockDb)

    # drop old table 
    if dropTable:
        logging.debug("dropping old sqlite table")
        cur.execute('DROP TABLE IF EXISTS %s;'% tableName)
        con.commit()

    # create table
    createSql, idxSqls = makeTableCreateStatement(tableName, headers, \
        intFields=intFields, idxFields=idxFields, primKey=primKey)
    logging.log(5, "creating table with %s" % createSql)
    cur.execute(createSql)
    con.commit()

    logging.info("Loading data into table")
    tp = maxCommon.ProgressMeter(len(tsvFnames))
    sql = "INSERT INTO %s (%s) VALUES (%s)" % (tableName, ", ".join(headers), ", ".join(["?"]*len(headers)))
    for tsvName in tsvFnames:
        logging.debug("Importing %s" % tsvName)
        if os.path.getsize(tsvName)==0:
            logging.debug("Skipping %s, zero size" % tsvName)
            continue
        rows = list(maxCommon.iterTsvRows(tsvName))
        logging.log(5, "Running Sql %s against %d rows" % (sql, len(rows)))
        cur.executemany(sql, rows)
        con.commit()
        tp.taskCompleted()

    logging.info("Adding indexes to table")
    for idxSql in idxSqls:
        cur.execute(idxSql)
        con.commit()

    con.close()

    if finalDbFname!=None:
        logging.info("moving over ramdisk db to %s" % dbFname)
        shutil.move(dbFname, finalDbFname)
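A hypothetical call to the loader above; the database, table and column names are made up for illustration:

# Load journals.tab into an sqlite table "journals": "year" becomes an integer
# column, "issn" the primary key, and "title" gets an extra index.
loadTsvSqlite("journals.db", "journals", ["journals.tab"],
              headers=["issn", "title", "year"],
              intFields=["year"], primKey="issn", idxFields=["title"])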
Example #17
def loadTsvSqlite(dbFname, tableName, tsvFnames, headers=None, intFields=[], \
    primKey=None, idxFields=[], dropTable=True):
    " load tabsep file into sqlLite db table "
    # if first parameter is string, make it to a list
    if len(tsvFnames) == 0:
        logging.debug("No filenames to load")
        return
    if isinstance(tsvFnames, basestring):
        tsvFnames = [tsvFnames]

    if os.path.isfile(dbFname):
        lockDb = False
        finalDbFname = None
    else:
        lockDb = True
        finalDbFname = dbFname
        dbFname = pubGeneric.getFastUniqueTempFname()
        logging.info("writing first to db on ramdisk %s" % dbFname)
    con, cur = openSqlite(dbFname, lockDb=lockDb)

    # drop old table
    if dropTable:
        logging.debug("dropping old sqlite table")
        cur.execute('DROP TABLE IF EXISTS %s;' % tableName)
        con.commit()

    # create table
    createSql, idxSqls = makeTableCreateStatement(tableName, headers, \
        intFields=intFields, idxFields=idxFields, primKey=primKey)
    logging.log(5, "creating table with %s" % createSql)
    cur.execute(createSql)
    con.commit()

    logging.info("Loading data into table")
    tp = maxCommon.ProgressMeter(len(tsvFnames))
    sql = "INSERT INTO %s (%s) VALUES (%s)" % (tableName, ", ".join(headers),
                                               ", ".join(["?"] * len(headers)))
    for tsvName in tsvFnames:
        logging.debug("Importing %s" % tsvName)
        if os.path.getsize(tsvName) == 0:
            logging.debug("Skipping %s, zero size" % tsvName)
            continue
        rows = list(maxCommon.iterTsvRows(tsvName))
        logging.log(5, "Running Sql %s against %d rows" % (sql, len(rows)))
        cur.executemany(sql, rows)
        con.commit()
        tp.taskCompleted()

    logging.info("Adding indexes to table")
    for idxSql in idxSqls:
        cur.execute(idxSql)
        con.commit()

    con.close()

    if finalDbFname != None:
        logging.info("moving over ramdisk db to %s" % dbFname)
        shutil.move(dbFname, finalDbFname)
Example #18
def parseUidToCounts(fname):
    res = {}
    for row in maxCommon.iterTsvRows(fname):
        total = int(row.total)
        geneProtCount = int(row.geneProtCount)

        res[row.uid] = (total, geneProtCount)
    logging.info('Found "gene/protein"-counts for %d journals in %s' %
                 (len(res), fname))
    return res
Example #19
def parseDoneIds(fname):
    " parse all already converted identifiers from inDir "
    doneIds = set()
    if os.path.getsize(fname) == 0:
        return doneIds

    for row in maxCommon.iterTsvRows(fname):
        doneIds.add(row.doi)
    logging.info("Found %d identifiers of already parsed files" % len(doneIds))
    return doneIds
Example #20
def parseDoneIds(fname):
    " parse all already converted identifiers from inDir "
    doneIds = set()
    if os.path.getsize(fname)==0:
        return doneIds

    for row in maxCommon.iterTsvRows(fname):
        doneIds.add(row.doi)
    logging.info("Found %d identifiers of already parsed files" % len(doneIds))
    return doneIds
Example #21
def convertOneChunk(inIndexFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """ 
    store = pubStore.PubWriterFile(outFile)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    doi2pmid = None
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    for row in inRows:
        # read line
        i+=1
        articleId, baseDir = row.articleId, row.baseDir
        zipFilename, filename = row.zipFilename, row.filename
        articleId=int(articleId)

        # open file from zipfile
        fullZipPath = join(baseDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows)-i))
        if doi2pmid==None:
            doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        xmlTree   = pubXml.etreeFromXml(xmlString)

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"]="consyn://"+zipFilename+"/"+filename
        if articleData["doi"] in doi2pmid:
           articleData["pmid"] = doi2pmid[articleData["doi"]]

        pii = splitext(basename(filename))[0]
        articleData["externalId"]="PII"+pii
        articleData["fulltextUrl"]="http://www.sciencedirect.com/science/svapps/pii/"+pii

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString==None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files" % convCount)
    store.close()
Example #22
def convertOneChunk(gzDir, idFname, inIndexFile, outFile):
    # for each row in index:
    store = pubStore.PubWriterFile(outFile)
    donePiis = pubGeneric.parseDoneIds(idFname)

    # log to file
    outBase = join(dirname(outFile), basename(outFile).split(".")[0])
    logFname = outBase + ".log"
    pubGeneric.setupLogging(__file__, None, logFileName=logFname)

    idFname = outBase + "_ids.tab"
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\texternalId\n")

    lastTsvFname = None
    tsvFile = None
    pmidFinder = pubCompare.PmidFinder()
    for row in maxCommon.iterTsvRows(inIndexFile, encoding=None):
        # open file and seek, if necessary
        if tsvFile == None or lastTsvFname != row.tsvFile:
            logging.debug("Seeking to %s in tsvfile %s" %
                          (row.offset, row.tsvFile))
            tsvFile = gzip.open(join(gzDir, row.tsvFile))
            tsvFile.seek(int(row.offset))
        lastTsvFname = row.tsvFile

        line = tsvFile.readline()

        if row.url.startswith("!"):
            logging.info("Ignoring %s, marked as duplicated" % row.url)
            continue
        #fields are: ["articleId", "tsvFile", "url", "offset"]
        fields = line.split("\t")
        url = fields[0]
        logging.debug("Replacing weird bing chars")
        content = fields[-1]
        assert (url == row.url)
        assert (len(content) != 0)
        url = url.decode("utf8")

        logging.debug("Converting to text")
        content = convertMicrosoft(content)
        artDict, fileDict = convertHtmlToDicts(url, content)
        if artDict == None:
            artDict, fileDict = minimalHtmlToDicts(url, content)
        if artDict == None:
            continue
        artDict["pmid"] = pmidFinder.lookupPmid(artDict)
        # write file
        articleId = int(row.articleId)
        fileId = articleId * 1000
        store.writeFile(articleId, fileId, fileDict)
        store.writeArticle(articleId, artDict)
    store.close()
Example #23
def convertOneChunk(gzDir, idFname, inIndexFile, outFile):
    # for each row in index:
    store = pubStore.PubWriterFile(outFile)
    donePiis = pubGeneric.parseDoneIds(idFname)

    # log to file
    outBase = join(dirname(outFile), basename(outFile).split(".")[0])
    logFname = outBase+".log"
    pubGeneric.setupLogging(__file__, None, logFileName=logFname)

    idFname = outBase+"_ids.tab"
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\texternalId\n")

    lastTsvFname = None
    tsvFile = None
    pmidFinder = pubCompare.PmidFinder()
    for row in maxCommon.iterTsvRows(inIndexFile, encoding=None):
        # open file and seek, if necessary
        if tsvFile==None or lastTsvFname!=row.tsvFile:
            logging.debug("Seeking to %s in tsvfile %s" % (row.offset, row.tsvFile))
            tsvFile = gzip.open(join(gzDir, row.tsvFile))
            tsvFile.seek(int(row.offset))
        lastTsvFname = row.tsvFile

        line = tsvFile.readline()

        if row.url.startswith("!"):
            logging.info("Ignoring %s, marked as duplicated" % row.url)
            continue
        #fields are: ["articleId", "tsvFile", "url", "offset"]
        fields = line.split("\t")
        url = fields[0]
        logging.debug("Replacing weird bing chars")
        content = fields[-1]
        assert(url==row.url)
        assert(len(content)!=0)
        url = url.decode("utf8")

        logging.debug("Converting to text")
        content = convertMicrosoft(content)
        artDict, fileDict = convertHtmlToDicts(url, content)
        if artDict==None:
            artDict, fileDict = minimalHtmlToDicts(url, content)
        if artDict==None:
            continue
        artDict["pmid"]  = pmidFinder.lookupPmid(artDict)
        # write file
        articleId = int(row.articleId)
        fileId = articleId*1000
        store.writeFile(articleId, fileId, fileDict)
        store.writeArticle(articleId, artDict)
    store.close()
Example #24
def startup(paramDict):
    global geneIds
    fname = join(dirname(__file__), "data", "wormFinder", "wormIds.tab.gz")
    geneCount = 0
    for row in maxCommon.iterTsvRows(fname):
        if row.locus!="":
            geneIds[row.locus] = row.geneId
        if row.seqId!="":
            geneIds[row.seqId] = row.geneId
        geneCount +=1
        #if row.geneId!="":
            #geneIds[row.geneId] = row.geneId
    logging.info("Loaded %d words mapped to %d genes" % (len(geneIds), geneCount))
Example #25
def parseHighwire():
    """ create two dicts 
    printIssn -> url to pmidlookup-cgi of highwire 
    and 
    publisherName -> top-level hostnames
    >>> temps, domains = parseHighwire()
    >>> temps['0270-6474']
    u'http://www.jneurosci.org/cgi/pmidlookup?view=long&pmid=%(pmid)s'
    >>> domains["Society for Neuroscience"]
    set([u'jneurosci'])
    """
    # highwire's publisher names are not resolved ("SAGE", "SAGE Pub", etc)
    # so: first get dict printIssn -> resolved publisherName from publishers.tab
    pubFname = join(pubConf.publisherDir, "publishers.tab")
    pIssnToPub = {}
    for row in maxCommon.iterTsvRows(pubFname):
        if not row.pubName.startswith("HIGHWIRE"):
            continue
        for issn in row.journalIssns.split("|"):
            issn = issn.rstrip(" ")
            pIssnToPub[issn] = row.pubName.replace("HIGHWIRE ","").strip()

    # go over highwire table and make dict printIssn -> template
    # and dict pubName -> domains
    fname = join(pubConf.journalListDir, "highwire.tab")
    templates = {}
    domains = {}
    for row in maxCommon.iterTsvRows(fname, encoding="latin1"):
        if row.eIssn.strip()=="Unknown":
            continue
        pubName = pIssnToPub[row.pIssn.strip()].strip()
        templates[row.pIssn.strip()] = row.urls.strip()+"/cgi/pmidlookup?view=long&pmid=%(pmid)s"

        host = urlparse.urlparse(row.urls).hostname
        domain = ".".join(host.split('.')[-2:]).strip()
        domains.setdefault(pubName, set()).add(domain)

    return templates, domains
Example #26
def startup(paramDict):
    global geneIds
    fname = join(dirname(__file__), "data", "wormFinder", "wormIds.tab.gz")
    geneCount = 0
    for row in maxCommon.iterTsvRows(fname):
        if row.locus != "":
            geneIds[row.locus] = row.geneId
        if row.seqId != "":
            geneIds[row.seqId] = row.geneId
        geneCount += 1
        #if row.geneId!="":
        #geneIds[row.geneId] = row.geneId
    logging.info("Loaded %d words mapped to %d genes" %
                 (len(geneIds), geneCount))
Example #27
def getEIssnToPIssn(journalFname):
    """ return a dict that maps from eIssn to pIssn """
    logging.info("Parsing %s to get eIssn -> pIssn mapping" % journalFname)
    ret = {}
    for row in maxCommon.iterTsvRows(journalFname):
        eStr = row.journalEIssns
        pStr = row.journalIssns
        if eStr=="" or pStr=="":
            continue
        eIssns = eStr.split("|")
        pIssns = pStr.split("|")
        assert(len(eIssns)==len(pIssns))
        for eIs, pIs in zip(eIssns, pIssns):
            if eIs!="" and pIs!="":
                ret[eIs] = pIs
    return ret
Example #28
def getEIssnToPIssn(journalFname):
    """ return a dict that maps from eIssn to pIssn """
    logging.info("Parsing %s to get eIssn -> pIssn mapping" % journalFname)
    ret = {}
    for row in maxCommon.iterTsvRows(journalFname):
        eStr = row.journalEIssns
        pStr = row.journalIssns
        if eStr == "" or pStr == "":
            continue
        eIssns = eStr.split("|")
        pIssns = pStr.split("|")
        assert (len(eIssns) == len(pIssns))
        for eIs, pIs in zip(eIssns, pIssns):
            if eIs != "" and pIs != "":
                ret[eIs] = pIs
    return ret
Example #29
def getAllUpdateIds(datasets):
    " collect all available text dataset updateIds for all datasets "
    textUpdateIds = {}
    for dataset in datasets:
        textDir = pubConf.resolveTextDir(dataset)
        updateFname = join(textDir, "updates.tab")
        logging.debug("Reading %s" % updateFname)
        updateIds = []
        for row in maxCommon.iterTsvRows(updateFname):
            updateIds.append(row.updateId)
        textUpdateIds[dataset] = updateIds
    return textUpdateIds

    # also save to file, so we don't have to do this again
    # (note: unreachable as written, because of the return statement above)
    outFname = join(batchDir, "updateIds.json")
    json.dump(textUpdateIds, open(outFname, "w"), sort_keys=True, indent=4)
    return textUpdateIds
Example #30
def runProcessRow(inName, alg, paramDict, outName):
    " run the rows from inName through alg and write to outName "
    tmpFnames = []
    outFh, tmpFnames = newTempOutFile(tmpFnames, outName, alg, None)
    for row in maxCommon.iterTsvRows(inName):
        newRow = alg.processRow(row)
        if newRow!=None and len(newRow)!=0:
            writeRow(newRow, outFh)

    if "allResults" in dir(alg):
        logging.debug("running allResults() function")
        rows = alg.allResults()
        if rows!=None:
            for row in rows:
                writeRow(row, outFh)
    outFh.close()

    moveTempToFinal(tmpFnames[0], outName)
Example #31
def concatDois(inDir, outDir, outFname):
    " concat all dois of id files in inDir to outFname "
    outPath = join(outDir, outFname)
    inMask = join(inDir, "*ids.tab")
    idFnames = glob.glob(inMask)
    logging.debug("Concatting DOIs from %s to %s" % (inMask, outPath))
    dois = []
    for inFname in idFnames:
        for row in maxCommon.iterTsvRows(inFname):
            dois.append(row.doi)

    ofh = open(outPath, "w")
    ofh.write("#doi\n")
    for doi in dois:
        ofh.write("%s\n" % doi)
    ofh.close()

    return outPath
Example #32
def concatDois(inDir, outDir, outFname):
    " concat all dois of id files in inDir to outFname "
    outPath = join(outDir, outFname)
    inMask = join(inDir, "*ids.tab")
    idFnames = glob.glob(inMask)
    logging.debug("Concatting DOIs from %s to %s" % (inMask, outPath))
    dois = []
    for inFname in idFnames:
        for row in maxCommon.iterTsvRows(inFname):
            dois.append(row.doi)

    ofh = open(outPath, "w")
    ofh.write("#doi\n")
    for doi in dois:
        ofh.write("%s\n" % doi)
    ofh.close()

    return outPath
Example #33
def parseEntrez(fname):
    """ parse a tab-sep table with headers and return one dict with entrez to refprots
    and another dict with entrez to symbol
    """
    entrez2Sym = dict()
    entrez2RefseqProts = dict()

    for row in maxCommon.iterTsvRows(fname):
        entrez2Sym[int(row.entrezId)] = row.sym
        #refseqs = row.refseqIds.split(",")
        if row.refseqProtIds=="":
            refProts = None
        else:
            refProts = row.refseqProtIds.split(",")
            #assert(len(refProts)==len(refseqs))

        entrez2RefseqProts[int(row.entrezId)] = refProts
    return entrez2Sym, entrez2RefseqProts
Example #34
def splitTabFileOnChunkId(filename, outDir, chunkSize=None, chunkCount=None):
    """ 
    use the chunkId field of a tab-sep file as the output filename.
    if chunkSize is specified, ignore the chunkId field and make sure that each piece
    has chunkSize lines.
    """
    if isdir(outDir):
        logging.info("Deleting %s" % outDir)
        shutil.rmtree(outDir)

    if not os.path.isdir(outDir):
        logging.info("Creating directory %s" % outDir)
        os.makedirs(outDir)
    maxCommon.mustBeEmptyDir(outDir)

    # read data into data dict and split by "chunkId" field
    headerLine = open(filename).readline()
    logging.info("Reading %s, splitting into pieces" % filename)
    data = {}
    i = 0
    for row in maxCommon.iterTsvRows(filename, encoding=None):
        if chunkSize==None and chunkCount==None:
            chunkId = row.chunkId
        elif chunkSize!=None:
            chunkId = "%05d" % (i / chunkSize)
        elif chunkCount!=None:
            chunkId = "%05d" % (i % chunkSize)
        data.setdefault(str(chunkId), []).append("\t".join(row)+"\n")
        i += 1

    # write to outDir
    logging.info("Splitting file data, Writing to %d files in %s/xxxx.tgz" % (len(data), outDir))
    pm = maxCommon.ProgressMeter(len(data))
    for chunkIdString, lines in data.iteritems():
        outfname = os.path.join(outDir, chunkIdString)
        logging.debug("Writing to %s" % outfname)
        fh = open(outfname, "w")
        fh.write(headerLine)
        for line in lines:
            fh.write(line)
        fh.close()
        pm.taskCompleted()

    return data.keys()
Example #35
def getTargetJournals(journalFname):
    " get english journals with eIssn "
    logging.info("Parsing %s" % journalFname)
    data = {}
    #issnToUid = {}
    for row in maxCommon.iterTsvRows(journalFname):
        if not row.source.startswith("NLM") or row.uniqueId == "":
            continue
        if row.language == "eng" and row.eIssn != "":
            #data.add(row.uniqueId)
            data[row.uniqueId] = row
        #if row.uniqueId!="":
        #issnToUid[row.pIssn] = row.uniqueId
        #issnToUid[row.eIssn] = row.uniqueId
    logging.info(
        "In NLM Catalog, found %d journals that are English and have an eIssn and a UID" %
        len(data))
    #return data, issnToUid
    return data
Example #36
def iterCdr3Rows(fname):
    for row in maxCommon.iterTsvRows(fname):
        seq = row.seq
        logging.debug("seq %s" % seq)

        if not (row.prefixFilterAccept == "Y"
                and row.suffixFilterAccept == "Y"):
            logging.debug("didn't pass prefix or suffix filter")
            continue

        if "CLASS" in seq:
            logging.debug("contains CLASS")
            continue

        if seq in blackList:
            logging.debug("blacklisted")
            continue

        if not hasCdr3Prefix(seq):
            logging.debug("prefix not OK")
            continue

        if hasCdr3Len(seq):
            logging.debug("and length OK")
            yield row
        else:
            # trying to split cdr3s that got fused into separate seqs again
            # note that this makes the annotation ID longer: it adds three additional digits for the sub-parts
            logging.debug("Length not OK, trying to split")
            parts = splitAndKeep(row.seq, cdr3Regex)

            okParts = []
            for p in parts:
                if hasCdr3Prefix(p) and hasCdr3Len(p):
                    okParts.append(p)

            if len(parts) - len(okParts) < len(parts) / 3:  # we tolerate a few bad pieces
                for num, p in enumerate(okParts):
                    numStr = "%03d" % num
                    newRow = row._replace(annotId=row.annotId + numStr, seq=p)
                    yield newRow
Example #37
def parseRegex(mutDataDir):
    """ parse and compile regexes to list (seqType, mutType, patName, pat) """
    # read regexes, translate placeholders to long form and compile
    replDict = {
    "sep"         : r"""(?:^|[\s\(\[\'"/,\-])""",
    "fromPos"     : r'(?P<fromPos>[1-9][0-9]+)',
    "toPos"       : r'(?P<toPos>[1-9][0-9]+)',
    "pos"         : r'(?P<pos>[1-9][0-9]+)',
    "origAaShort" : r'(?P<origAaShort>[CISQMNPKDTFAGHLRWVEYX])',
    "mutAaShort"  : r'(?P<mutAaShort>[CISQMNPKDTFAGHLRWVEYX*])',
    "skipAa"  : r'(CYS|ILE|SER|GLN|MET|ASN|PRO|LYS|ASP|THR|PHE|ALA|GLY|HIS|LEU|ARG|TRP|VAL|GLU|TYR|TER|GLUTAMINE|GLUTAMIC ACID|LEUCINE|VALINE|ISOLEUCINE|LYSINE|ALANINE|GLYCINE|ASPARTATE|METHIONINE|THREONINE|HISTIDINE|ASPARTIC ACID|ARGININE|ASPARAGINE|TRYPTOPHAN|PROLINE|PHENYLALANINE|CYSTEINE|SERINE|GLUTAMATE|TYROSINE|STOP|X)',
    "origAaLong"  : r'(?P<origAaLong>(CYS|ILE|SER|GLN|MET|ASN|PRO|LYS|ASP|THR|PHE|ALA|GLY|HIS|LEU|ARG|TRP|VAL|GLU|TYR|TER|GLUTAMINE|GLUTAMIC ACID|LEUCINE|VALINE|ISOLEUCINE|LYSINE|ALANINE|GLYCINE|ASPARTATE|METHIONINE|THREONINE|HISTIDINE|ASPARTIC ACID|ARGININE|ASPARAGINE|TRYPTOPHAN|PROLINE|PHENYLALANINE|CYSTEINE|SERINE|GLUTAMATE|TYROSINE|STOP|X))',
    "mutAaLong"  : r'(?P<mutAaLong>(CYS|ILE|SER|GLN|MET|ASN|PRO|LYS|ASP|THR|PHE|ALA|GLY|HIS|LEU|ARG|TRP|VAL|GLU|TYR|TER|GLUTAMINE|GLUTAMIC ACID|LEUCINE|VALINE|ISOLEUCINE|LYSINE|ALANINE|GLYCINE|ASPARTATE|METHIONINE|THREONINE|HISTIDINE|ASPARTIC ACID|ARGININE|ASPARAGINE|TRYPTOPHAN|PROLINE|PHENYLALANINE|CYSTEINE|SERINE|GLUTAMATE|TYROSINE|STOP|X))',
    "dna"         : r'(?P<dna>[actgACTG])',
    "origDna"     : r'(?P<origDna>[actgACTG])',
    "mutDna"      : r'(?P<mutDna>[actgACTG])',
    "fs"          : r'(?P<fs>(fs\*?[0-9]*)|fs\*|fs|)?',
    }
    regexTab = join(mutDataDir, "regex.txt")
    logging.info("Parsing regexes from %s" % regexTab)
    regexList = []
    counts = defaultdict(int)
    for row in maxCommon.iterTsvRows(regexTab, commentPrefix="#"):
        logging.log(5, "Translating %s" % row.pat)
        patName = row.patName
        if patName=="":
            patName = row.pat
        patFull = row.pat.format(**replDict)
        logging.log(5, "full pattern is %s" % patFull)
        flags = 0
        if "Long}" in row.pat:
            flags = re.IGNORECASE
            logging.log(5, "ignoring case for this pattern")
        patComp = re.compile(patFull, flags=flags)
        regexList.append((row.seqType, row.mutType, patName, patComp))
        counts[(row.seqType, row.mutType)] += 1

    for regexType, count in counts.iteritems():
            logging.info("regexType %s, found %d regexes" % (str(regexType), count))
    return regexList
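For illustration, a sketch of how the (seqType, mutType, patName, pattern) tuples returned above could be scanned against text; the helper and the sample call are assumptions, not part of the original pipeline:

def findMutationMentions(regexList, text):
    " run every compiled pattern over text and collect the named groups of each hit "
    hits = []
    for seqType, mutType, patName, patComp in regexList:
        for match in patComp.finditer(text):
            hits.append((seqType, mutType, patName, match.groupdict()))
    return hits

# e.g. findMutationMentions(parseRegex(mutDataDir), "... carrying the V600E substitution ...")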
Example #38
def iterCdr3Rows(fname):
    for row in maxCommon.iterTsvRows(fname):
        seq = row.seq
        logging.debug("seq %s" % seq)

        if not (row.prefixFilterAccept=="Y" and row.suffixFilterAccept=="Y"):
            logging.debug("didn't pass prefix or suffix filter")
            continue

        if "CLASS" in seq:
            logging.debug("contains CLASS")
            continue

        if seq in blackList:
            logging.debug("blacklisted")
            continue

        if not hasCdr3Prefix(seq):
            logging.debug("prefix not OK")
            continue

        if hasCdr3Len(seq):
            logging.debug("and length OK")
            yield row
        else:
            # trying to split cdr3s that got fused into separate seqs again
            # note that this makes the annotation ID longer: it adds three additional digits for the sub-parts
            logging.debug("Length not OK, trying to split")
            parts = splitAndKeep(row.seq, cdr3Regex)

            okParts = []
            for p in parts:
                if hasCdr3Prefix(p) and hasCdr3Len(p):
                    okParts.append(p)

            if len(parts) - len(okParts)< len(parts)/3: # we tolerate a few bad pieces
                for num, p in enumerate(okParts):
                    numStr = "%03d" % num
                    newRow = row._replace(annotId=row.annotId+numStr, seq=p)
                    yield newRow
Example #39
def getAllUpdateIds(datasets):
    " collect all available text dataset updateIds for all datasets "
    textUpdateIds = {}
    for dataset in datasets:
        textDir = pubConf.resolveTextDir(dataset)
        updateFname = join(textDir, "updates.tab")
        logging.debug("Reading %s" % updateFname)
        updateIds = []
        for row in maxCommon.iterTsvRows(updateFname):
            updateIds.append(row.updateId)
        textUpdateIds[dataset] = updateIds
    return textUpdateIds

    # also save to file, so we don't have to do this again
    # (note: unreachable as written, because of the return statement above)
    outFname = join(batchDir, "updateIds.json")
    json.dump(
        textUpdateIds,
        open(outFname, "w"),
        sort_keys=True,
        indent=4,
    )
    return textUpdateIds
Example #40
def addPmids(datasetString):
    " for a given dataset, add the pmids from the pubFingerprint output file to the article files "
    #datasetString = args[0]

    textDir = pubConf.resolveTextDir(datasetString)
    logging.info("Changing article files in %s" % textDir)
    aToPfn = join(textDir, pubConf.idFname)
    logging.info("Reading art -> pmid mapping from %s" % aToPfn)
    artToPmid = parseIdFname(aToPfn)
    fnames = glob.glob(join(textDir, "*.articles.gz"))
    logging.info("Running on %d article files" % len(fnames))
    pm = maxCommon.ProgressMeter(len(fnames), stepCount=100)
    updateSqliteIds(textDir, artToPmid.items())
    #sys.exit(0)

    logging.info("Updating tab sep files")
    for fname in fnames:
        # write headers
        newFname = join(pubConf.TEMPDIR, basename(fname))
        logging.debug("reading %s, writing %s" % (fname, newFname))
        newF = gzip.open(newFname, "w")
        newF.write(gzip.open(fname).readline())

        # write rows, replacing pmids on the way
        for row in maxCommon.iterTsvRows(fname):
            artId = int(row.articleId)
            if int(row.articleId) in artToPmid:
                row = row._replace(pmid=artToPmid[artId])
            newF.write((u'\t'.join(row)).encode("utf8"))
            newF.write("\n")
        newF.close()

        # rename old, move over the new one
        shutil.move(fname, fname+".bak")
        shutil.move(newFname, fname)
        pm.taskCompleted()
Example #41
def addPmids(datasetString):
    " for a given dataset, add the pmids from the pubFingerprint output file to the article files "
    #datasetString = args[0]

    textDir = pubConf.resolveTextDir(datasetString)
    logging.info("Changing article files in %s" % textDir)
    aToPfn = join(textDir, pubConf.idFname)
    logging.info("Reading art -> pmid mapping from %s" % aToPfn)
    artToPmid = parseIdFname(aToPfn)
    fnames = glob.glob(join(textDir, "*.articles.gz"))
    logging.info("Running on %d article files" % len(fnames))
    pm = maxCommon.ProgressMeter(len(fnames), stepCount=100)
    updateSqliteIds(textDir, artToPmid.items())
    #sys.exit(0)

    logging.info("Updating tab sep files")
    for fname in fnames:
        # write headers
        newFname = join(pubConf.TEMPDIR, basename(fname))
        logging.debug("reading %s, writing %s" % (fname, newFname))
        newF = gzip.open(newFname, "w")
        newF.write(gzip.open(fname).readline())

        # write rows, replacing pmids on the way
        for row in maxCommon.iterTsvRows(fname):
            artId = int(row.articleId)
            if int(row.articleId) in artToPmid:
                row = row._replace(pmid=artToPmid[artId])
            newF.write((u'\t'.join(row)).encode("utf8"))
            newF.write("\n")
        newF.close()

        # rename old, move over the new one
        shutil.move(fname, fname + ".bak")
        shutil.move(newFname, fname)
        pm.taskCompleted()
Example #42
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """ 
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0]+".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i+=1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u'\xbf' in filename:
            logging.info("Found weird character, skipping file")
            continue
        
        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename=="":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename+":"+filename

        if pdfString==None:
            pdfNotFound+=1
            logging.error("Could not open pdf or xml file")
            continue

        articleId=int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree   = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)

        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"]=zipFilename+"/"+filename
        articleData["externalId"]=articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file 
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()
Example #43
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """
    store = pubStore.PubWriterFile(outFile)
    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile),
                   basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\texternalId\tpmid\n")

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    #doi2pmid = None
    convCount = 0
    skipCount = 0
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % len(inRows))
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        articleId = int(articleId)

        pii = splitext(basename(filename))[0]
        if pii in donePiis:
            logging.debug("PII %s has already been converted, skipping" % pii)
            skipCount += 1
            continue
        donePiis.add(pii)

        # open file from zipfile
        fullZipPath = join(zipDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" %
                      (fullZipPath, filename, len(inRows) - i))
        #if doi2pmid==None:
        #doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" %
                          (zipFilename, filename))
            continue

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData == None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"] = zipFilename + ":" + filename
        #if articleData["doi"] in doi2pmid:
        #articleData["pmid"] = doi2pmid[articleData["doi"]]

        articleData["externalId"] = pii
        articleData["fulltextUrl"] = "http://www.sciencedirect.com/science/svapps/pii/" + pii
        #articleData["pmid"]  = pmidFinder.lookupPmid(articleData)

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString == None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write IDs to separate file
        idRow = [
            str(articleData["articleId"]), articleData["doi"],
            articleData["externalId"],
            str(articleData["pmid"])
        ]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000 * (articleId)) + 1,
                        fileData,
                        externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
    store.close()
    idFh.close()
Example #44
def parseIdFname(fname):
    res = {}
    for row in maxCommon.iterTsvRows(fname):
        res[int(row.artId1)] = row.pmid
    return res
Example #45
    logging.info("Reading %s" % inFname)
    headerLine = gzip.open(inFname).readline()
    if "publisher" in headerLine:
        logging.info("%s is OK" % inFname)
        continue

    bakFname = inFname+".bak"
    if isfile(bakFname):
        logging.info("%s exists" % bakFname)
        sys.exit(1)
    logging.info("Renaming %s to %s" % (inFname, bakFname))
    shutil.move(inFname, bakFname)
    headers = headerLine.strip().split("\t")
    headers.insert(3, "publisher")

    outFname = inFname
    inFname = bakFname

    logging.info("Writing %s" % outFname)
    ofh = gzip.open(outFname, "w")
    ofh.write("\t".join(headers)+"\n")
    for row in maxCommon.iterTsvRows(inFname, isGzip=True):
        row = list(row)
        row.insert(3, publisher)
        row = [r.encode("utf8") for r in row]
        line = "\t".join(row)+"\n"
        ofh.write(line)



Example #46
        s.add(word)
        #s.add(word[:3])
        #s.add(word[:4])
    return s


uniprotFname = join(pubConf.dbRefDir, "uniprot.tab")
print "Reading %s" % uniprotFname

dictFh = open("uniProt.dict.tab", "w")

print ("parsing BNC")
bncWords = parseBnc()

print ("constructing dictionary")
for row in maxCommon.iterTsvRows(uniprotFname):
    if row.taxonId!="9606":
        continue

    accs = set()
    #accs = appendAll(accs, row.accList.split("|"))
    #accs = appendAll(accs, [x.split(".")[0] for x in row.refSeq.split("|")]) # remove version number
    #accs = appendAll(accs, row.ensemblProt.split("|"))
    #accs = appendAll(accs, row.ensemblGene.split("|"))
    #accs = appendAll(accs, row.embl.split("|"))
    #accs = appendAll(accs, row.pdb.split("|"))
    #accs = appendAll(accs, row.uniGene.split("|"))
    #accs = appendAll(accs, row.omim.split("|"), prefixList=["omim ", "OMIM ", "MIM "])
    #accs = list(set(accs))
    #for delChar in ["*", ",", ".", "/", "(", ")"]: 
        #accs = [acc.replace(delChar," ").replace("  ", " ") for acc in accs]
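# Note (added): this fragment assumes the helpers parseBnc() and appendAll() as
# well as a uniprot.tab table under pubConf.dbRefDir; it keeps only human rows
# (taxonId 9606), and most of the accession sources are commented out here.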
Example #47
inFnames = glob.glob(join(inDir, "*.articles.gz"))

for inFname in inFnames:
    logging.info("Reading %s" % inFname)
    headerLine = gzip.open(inFname).readline()
    if "publisher" in headerLine:
        logging.info("%s is OK" % inFname)
        continue

    bakFname = inFname + ".bak"
    if isfile(bakFname):
        logging.info("%s exists" % bakFname)
        sys.exit(1)
    logging.info("Renaming %s to %s" % (inFname, bakFname))
    shutil.move(inFname, bakFname)
    headers = headerLine.strip().split("\t")
    headers.insert(3, "publisher")

    outFname = inFname
    inFname = bakFname

    logging.info("Writing %s" % outFname)
    ofh = gzip.open(outFname, "w")
    ofh.write("\t".join(headers) + "\n")
    for row in maxCommon.iterTsvRows(inFname, isGzip=True):
        row = list(row)
        row.insert(3, publisher)
        row = [r.encode("utf8") for r in row]
        line = "\t".join(row) + "\n"
        ofh.write(line)
    # close explicitly so the gzip trailer is written
    ofh.close()
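# A minimal, self-contained sketch of the same idea (added for illustration, not
# from the original code): insert a constant column into a gzipped tab-separated
# file, keeping a .bak copy of the input. Function and file names are made up.
import gzip
import shutil

def addColumn(fname, colIdx, colName, value):
    " insert column colName with a constant value at position colIdx of a gzipped tab file "
    bakFname = fname + ".bak"
    shutil.move(fname, bakFname)
    ifh = gzip.open(bakFname)
    ofh = gzip.open(fname, "w")
    headers = ifh.readline().rstrip("\n").split("\t")
    headers.insert(colIdx, colName)
    ofh.write("\t".join(headers) + "\n")
    for line in ifh:
        fields = line.rstrip("\n").split("\t")
        fields.insert(colIdx, value)
        ofh.write("\t".join(fields) + "\n")
    ofh.close()

# e.g. addColumn("0_00000.articles.gz", 3, "publisher", "elsevier")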
Example #48
        "uniProt" : "http://www.uniprot.org/uniprot/",
        "pubmed" : "http://www.ncbi.nlm.nih.gov/pubmed/"
        }

def htmlLink(urlType, acc):
    return '<a href="%s%s">%s</a>' % (urls[urlType], acc, acc)
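# Usage sketch (added; "12345" is a made-up accession):
#   htmlLink("pubmed", "12345")
#   -> '<a href="http://www.ncbi.nlm.nih.gov/pubmed/12345">12345</a>'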

if __name__ == '__main__':

    psls = indexPsls("uniProtVsGenome.psl")
    ofh = open("uniprotMutations.bed", "w")
    ofh2 = open("temp.bed", "w")
    uniProtMutFname = join(pubConf.dbRefDir, "uniprot.mut.tab")
    count = 0
    notMapped = []
    for mut in maxCommon.iterTsvRows(uniProtMutFname):
        mapper = PslMapBedMaker()
        if mut.acc not in psls:
            notMapped.append(mut.acc)
            continue
        mutPos = 3*(int(mut.position)-1)
        ofh2.write("\t".join([mut.acc, str(mutPos), str(mutPos+3), mut.acc+":"+mut.origAa+mut.position+mut.mutAa])+"\n")

        mapPsls = psls[mut.acc]
        for psl in mapPsls:
            bed = mapper.mapQuery(psl, mutPos, mutPos+3)
            if bed==None:
                print("Could not map: ", mut)
                continue
            bed[3] = " ".join((mut.disease.split("|")[0]).replace("-", " ").replace(" type", "").split()[:3])
            bed.append(mut.disease)
Example #49
def updatePmids(medlineDir, crawlDir, updateIds, minYear=None):
    """ go over subdirs of crawlDir, for each: read the ISSNs, and add new
    PMIDs we have in medlineDir to subdir/pmids.txt
    We never remove a PMID from pmids.txt.
    """
    logging.info("Now updating crawler directories with the new PMIDs")
    eIssnToPIssn = getEIssnToPIssn(pubConf.publisherIssnTable)
    subDirs = getSubdirs(crawlDir)
    con, cur = pubStore.openArticleDb("medline",
                                      mustOpen=True,
                                      useRamdisk=True)
    for subdir in subDirs:
        if subdir.endswith(".tmp"):
            continue
        subPath = join(crawlDir, subdir)
        logging.info("Processing subdirectory %s" % subPath)
        if isfile(pubCrawlLib.getLockFname(subPath)):
            logging.warn(
                "Found lockfile, looks like a crawl is going on in %s, skipping"
                % subPath)
            continue

        pmidFname = join(crawlDir, subdir, "pmids.txt")
        issnFname = join(crawlDir, subdir, "issns.tab")
        if not isfile(issnFname) or not isfile(pmidFname):
            logging.info("Skipping %s, ISSN or docId file not found" % subPath)
            continue
        logging.debug("reading subdir %s: %s and %s" %
                      (subdir, pmidFname, issnFname))
        issns = [row.issn.strip() for row in maxCommon.iterTsvRows(issnFname)]
        logging.debug("ISSNs: %s" % ",".join(issns))
        # read old pmids
        oldPmids = set([int(line.rstrip()) for line in open(pmidFname)])
        #newPmids = set()
        # add new pmids, for each issn
        newPmids = getPmidsForIssns(con, cur, issns, minYear)

        logging.debug("%d PMIDs" % (len(newPmids)))
        oldCount = len(oldPmids)
        updateCount = len(newPmids)
        oldPmids.update(
            newPmids)  # faster to add new to old set than old to new set

        pmids = oldPmids
        newCount = len(pmids)
        addCount = newCount - oldCount
        logging.info("crawl dir %s: old PMID count %d, update has %d, new total %d, added %d" % \
            (subdir, oldCount, updateCount, newCount, addCount))

        # write new pmids
        pmids = [str(x) for x in pmids]
        # randomize order, to distribute errors
        random.shuffle(pmids)

        # write all pmids to a tmp file
        pmidTmpFname = pmidFname + ".new"
        pmidFh = open(pmidTmpFname, "w")
        pmidFh.write("\n".join(pmids))
        pmidFh.close()

        # keep a copy of the original pmid file
        shutil.copy(pmidFname, pmidFname + ".bak")
        # atomic rename  the tmp file to the original file
        # to make sure that an intact pmid file always exists
        os.rename(pmidTmpFname, pmidFname)
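# The tail of updatePmids above follows a common "write to a temp file, then
# atomically rename it over the original" pattern. A minimal standalone sketch of
# that pattern (function and file names are illustrative, not from the source):
import os
import shutil

def atomicWriteLines(fname, lines):
    " rewrite fname from lines, keeping a .bak copy and never leaving a half-written file "
    tmpFname = fname + ".new"
    tmpFh = open(tmpFname, "w")
    tmpFh.write("\n".join(lines))
    tmpFh.close()
    shutil.copy(fname, fname + ".bak")
    # os.rename is atomic on POSIX when source and target are on the same file
    # system, so readers see either the complete old or the complete new file
    os.rename(tmpFname, fname)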
Example #51
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """ 
    store = pubStore.PubWriterFile(outFile)
    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0]+".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\texternalId\tpmid\n")

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    #doi2pmid = None
    convCount = 0
    skipCount = 0
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % len(inRows))
    for row in inRows:
        # read line
        i+=1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        articleId=int(articleId)

        pii = splitext(basename(filename))[0]
        if pii in donePiis:
            logging.debug("PII %s has already been converted, skipping" % pii)
            skipCount += 1
            continue
        donePiis.add(pii)

        # open file from zipfile
        fullZipPath = join(zipDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows)-i))
        #if doi2pmid==None:
            #doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        try:
            xmlTree   = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"]=zipFilename+":"+filename
        #if articleData["doi"] in doi2pmid:
           #articleData["pmid"] = doi2pmid[articleData["doi"]]

        articleData["externalId"]=pii
        articleData["fulltextUrl"]="http://www.sciencedirect.com/science/svapps/pii/"+pii
        #articleData["pmid"]  = pmidFinder.lookupPmid(articleData)

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString==None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write IDs to separate file 
        idRow = [str(articleData["articleId"]), articleData["doi"], articleData["externalId"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
    store.close()
    idFh.close()
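# Usage sketch (added). All paths are made up; the index and ID chunk files are
# presumably produced by an earlier indexing step of the pipeline:
if __name__ == "__main__":
    convertOneChunk("/data/elsevier/zips", "chunks/0_00000.index",
                    "chunks/0_00000.ids.tab", "build/0_00000")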
Example #53
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u"\xbf" in filename:
            logging.info("Found weird character, skipping file")
            continue

        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename == "":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename + ":" + filename

        if pdfString == None:
            pdfNotFound += 1
            logging.error("Could not open pdf or xml file")
            continue

        articleId = int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)

        if articleData == None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"] = zipFilename + "/" + filename
        articleData["externalId"] = articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000 * (articleId)) + 1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()
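# Note (added): unlike the Elsevier converter above, this variant resolves
# diskDir = abspath(join(zipDir, "..", "disk")), i.e. it expects the initial
# "disk" delivery in a sibling directory of the update zips, and it converts the
# PDF (not the parsed XML tree) to ASCII via pubGeneric.toAscii.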