def parsePermissions(LICENSETABLE):
    " return dict with publisher name lower cased -> permission color (green or red; blue for entries from OATABLE) "
    pubToPermission = {}
    for row in maxCommon.iterTsvRows(LICENSETABLE):
        pubName = row.pubName.lower()
        if int(row.havePermission) == 1:
            pubToPermission[pubName] = "green"
        else:
            pubToPermission[pubName] = "red"
    # OATABLE is expected to be defined at module level
    for row in maxCommon.iterTsvRows(OATABLE):
        pubToPermission[row.pubName.lower()] = "blue"
        #print pubName.lower()
    return pubToPermission
def parseTabPublisherFile(fname):
    " parse a file with columns eIssn, publisher (optional) and urls into a list of records "
    logging.info("Parsing %s" % fname)
    journals = list(maxCommon.iterTsvRows(fname, encoding="latin1"))

    # modify publisher field
    datasetName = splitext(basename(fname))[0]
    headers = list(journals[0]._fields)
    addPubField = False
    if "publisher" not in headers:
        headers.insert(0, "publisher")
        addPubField = True
    JRec = collections.namedtuple("Journal", headers)
    newJournals = []
    for j in journals:
        if j.eIssn.lower() == "print only" or j.eIssn.lower() == "unknown":
            logging.debug("Skipping journal %s, no eIssn" % j.title)
            continue
        if addPubField:
            newJ = [datasetName]
            newJ.extend(j)
            newJRec = JRec(*newJ)
        else:
            newJRec = j
        newJournals.append(newJRec)
    return newJournals
def updatePmids(medlineDir, crawlDir, updateIds, minYear=None):
    """ go over subdirs of crawlDir, for each: read the ISSNs, and add new
    PMIDs we have in medlineDir to subdir/pmids.txt
    We never remove a PMID from pmids.txt.
    """
    logging.info("Now updating crawler directories with the new PMIDs")
    eIssnToPIssn = getEIssnToPIssn(pubConf.publisherIssnTable)
    issnToPmid, issnToJournal = getIssnPmidDict(medlineDir, updateIds, minYear)
    for subdir in getSubdirs(crawlDir):
        pmidFname = join(crawlDir, subdir, "pmids.txt")
        issnFname = join(crawlDir, subdir, "issns.tab")
        if not isfile(issnFname) or not isfile(pmidFname):
            continue
        logging.debug("reading subdir %s: %s and %s" % (subdir, pmidFname, issnFname))
        issns = [row.issn.strip() for row in maxCommon.iterTsvRows(issnFname)]
        logging.debug("ISSNs: %s" % ",".join(issns))

        # read old pmids
        oldPmids = set([int(line.rstrip()) for line in open(pmidFname)])
        newPmids = set()

        # add new pmids, for each issn
        for issn in issns:
            if issn not in issnToPmid:
                if issn in eIssnToPIssn:
                    logging.debug("Looks like eISSN, mapped to printISSN %s" % issn)
                    issn = eIssnToPIssn[issn]
                else:
                    logging.debug("No PMIDs for ISSN %s and no eISSN for it" % issn)
            issnPmids = issnToPmid.get(issn, None)
            if issnPmids == None:
                logging.debug("No PMIDs for ISSN %s" % issn)
                continue
            logging.debug("ISSN %s, %d PMIDs" % (issn, len(issnPmids)))
            newPmids.update(issnPmids)

        # get some counts and output to user
        oldCount = len(oldPmids)
        updateCount = len(newPmids)
        oldPmids.update(newPmids) # faster to add new to old set than old to new set
        pmids = oldPmids
        newCount = len(pmids)
        addCount = newCount - oldCount
        logging.info("crawl dir %s: old PMID count %d, update has %d, new total %d, added %d" % \
            (subdir, oldCount, updateCount, newCount, addCount))

        # write new pmids
        pmids = [str(x) for x in pmids]
        # randomize order, to distribute errors
        random.shuffle(pmids)

        # write all pmids to a tmp file
        pmidTmpFname = pmidFname + ".new"
        pmidFh = open(pmidTmpFname, "w")
        pmidFh.write("\n".join(pmids))
        pmidFh.close()

        # keep a copy of the original pmid file
        shutil.copy(pmidFname, pmidFname + ".bak")

        # rename the tmp file to the original file
        # to make sure that an intact pmid file always exists
        os.rename(pmidTmpFname, pmidFname)
def iterArticleDataDir(textDir, type="articles", filterFname=None, updateIds=None):
    """ yields all articleData from all files in textDir
    Can filter to yield only a set of filenames or files for a given list of updateIds.
    """
    fcount = 0
    if type == "articles":
        baseMask = "*.articles.gz"
    elif type == "files":
        baseMask = "*.files.gz"
    elif type == "annots":
        baseMask = "*.tab.gz"
    else:
        logging.error("Article type %s not valid" % type)
        sys.exit(1)

    if isfile(textDir):
        fileNames = [textDir]
        logging.debug("Found 1 file, %s" % textDir)
    else:
        fileMask = os.path.join(textDir, baseMask)
        fileNames = glob.glob(fileMask)
        logging.debug("Looking for all fulltext files in %s, found %d files" % \
            (fileMask, len(fileNames)))
        if updateIds != None and len(updateIds) != 0:
            logging.debug("Restricting fulltext files to updateIds %s" % str(updateIds))
            filteredFiles = []
            for updateId in updateIds:
                for fname in fileNames:
                    if basename(fname).startswith(str(updateId) + "_"):
                        filteredFiles.append(fname)
                logging.debug("Update Id %s, %d files" % (str(updateId), len(filteredFiles)))
            fileNames = list(filteredFiles)
        logging.debug("Found %d files in input dir %s" % (len(fileNames), textDir))

    pm = maxCommon.ProgressMeter(len(fileNames), stepCount=100)
    for textFname in fileNames:
        if filterFname != None and not filterFname in textFname:
            logging.warn("Skipping %s, because file filter is set" % textFname)
            continue
        reader = PubReaderFile(textFname)
        logging.debug("Reading %s, %d files left" % (textFname, len(fileNames) - fcount))
        fcount += 1
        if type == "articles":
            for articleData in reader.articleRows:
                if "publisher" not in articleData._fields:
                    # XX temporary bugfix as I have some old files
                    articleData = list(articleData)
                    articleData.insert(2, "")
                    articleData[3] = ""
                yield articleData
        elif type == "files":
            for fileData in reader.fileRows:
                yield fileData
        elif type == "annots":
            for row in maxCommon.iterTsvRows(textFname):
                yield row
        else:
            assert(False) # illegal type parameter
        pm.taskCompleted()
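# Usage sketch, not part of the original module: iterate over all article records
# in a text directory. The directory path and the printed fields are illustrative
# assumptions only.
#
# for articleData in iterArticleDataDir("/data/text/pmc", type="articles"):
#     print articleData.externalId, articleData.title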
def parseHighwire():
    """ create two dicts
    printIssn -> url to pmidlookup-cgi of highwire
    and publisherName -> top-level hostnames
    >>> temps, domains = parseHighwire()
    >>> temps['0270-6474']
    u'http://www.jneurosci.org/cgi/pmidlookup?view=long&pmid=%(pmid)s'
    >>> domains["Society for Neuroscience"]
    set([u'jneurosci.org'])
    >>> domains["American Society for Biochemistry and Molecular Biology"]
    set([u'jbc.org', u'mcponline.org', u'jlr.org'])
    >>> temps["1535-9476"]
    u'http://www.mcponline.org/cgi/pmidlookup?view=long&pmid=%(pmid)s'
    """
    templates = {}
    domains = {}
    pubFname = pubConf.publisherIssnTable
    logging.info("Parsing %s to find highwire ISSNs/webservers" % pubFname)
    for row in maxCommon.iterTsvRows(pubFname):
        if not row.pubName.startswith("HIGHWIRE"):
            continue
        pubName = row.pubName.replace("HIGHWIRE ", "")
        issns = [i.strip() for i in row.journalIssns.split("|")]
        servers = row.webservers.split("|")
        for issn, server in zip(issns, servers):
            template = "http://www." + server + "/cgi/pmidlookup?view=long&pmid=%(pmid)s"
            templates[issn] = template
            domains.setdefault(pubName, set()).add(server)
            #logging.debug("HIGHWIRE CONFIG %s, %s, %s" % (pubName, template, domains[pubName]))
    return templates, domains
def __init__(self, fname):
    " fname can end in .articles.gz, reader will still read both articles and files "
    logging.debug("Reading data from file with prefix %s (.articles.gz, .files.gz)" % fname)
    baseDir = dirname(fname)
    base = basename(fname).split('.')[0]
    articleFn = join(baseDir, base + ".articles.gz")
    fileFn = join(baseDir, base + ".files.gz")
    logging.debug("Reading %s and %s" % (articleFn, fileFn))

    self.articleRows = None
    if isfile(articleFn) and getsize(articleFn) != 0:
        self.articleRows = maxCommon.iterTsvRows(articleFn, encoding="utf8")

    self.fileRows = None
    if isfile(fileFn) and getsize(fileFn) != 0:
        self.fileRows = maxCommon.iterTsvRows(fileFn, encoding="utf8")
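# Usage sketch, not part of the original module: articleRows/fileRows stay None when the
# corresponding .articles.gz/.files.gz file is missing or empty, so callers should guard
# against None. The file name below is a hypothetical example.
#
# reader = PubReaderFile("/data/text/pmc/0_00000.articles.gz")
# if reader.articleRows is not None:
#     for art in reader.articleRows:
#         print art.articleId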
def concatIdentifiers(inDir, outDir, outFname):
    " concat all identifiers of *_ids.tab files in inDir to outFname, append if exists "
    outPath = join(outDir, outFname)
    inMask = join(inDir, "*_ids.tab")
    idFnames = glob.glob(inMask)
    logging.debug("Concatting externalIds from %s to %s" % (inMask, outPath))
    extIds = []
    for inFname in idFnames:
        if os.path.getsize(inFname) == 0:
            logging.warn("file %s has zero size" % inFname)
            continue
        for row in maxCommon.iterTsvRows(inFname):
            extIds.append(row.externalId)

    if isfile(outPath):
        ofh = open(outPath, "a")
    else:
        ofh = open(outPath, "w")
        ofh.write("#externalId\n")

    for extId in extIds:
        ofh.write("%s\n" % extId)
    ofh.close()
    return outPath
def __init__(self, taxId):
    " open db files, compile patterns, parse input as far as possible "
    mutDataDir = pubConf.varDataDir
    geneDataDir = pubConf.geneDataDir
    if mutDataDir == None:
        return
    self.mutDataDir = mutDataDir
    self.entrez2sym, self.entrez2refprots = parseEntrez(join(geneDataDir, "entrez.tab"))

    # refseq sequences
    fname = join(mutDataDir, "seqs")
    logging.info("opening %s" % fname)
    seqs = pubKeyVal.SqliteKvDb(fname)
    self.seqs = seqs

    # refprot to refseqId
    # refseq to CDS Start
    fname = join(mutDataDir, "refseqInfo.tab")
    logging.debug("Reading %s" % fname)
    self.refProtToRefSeq = {}
    self.refSeqCds = {}
    for row in maxCommon.iterTsvRows(fname):
        self.refProtToRefSeq[row.refProt] = row.refSeq
        self.refSeqCds[row.refSeq] = int(row.cdsStart) - 1 # NCBI is 1-based

    # refseq to genome
    self.pslCache = {}
    self.refGenePsls = openIndexedPsls(mutDataDir, "refGenePsls.9606")

    # dbsnp db
    fname = join(self.mutDataDir, "dbSnp.sqlite")
    self.snpDb = sqlite3.connect(fname)
    logging.info("Reading of data finished")
def readArticleChunkAssignment(inDir, updateIds):
    " read the assignment of articleId -> chunkId from text directory "
    if updateIds == None:
        inFiles = glob.glob(os.path.join(inDir, "*_index.tab"))
    else:
        inFiles = []
        for updateId in updateIds:
            updateId = str(updateId)
            # check for the index file inside inDir, not the current directory
            indexFname = os.path.join(inDir, "%s_index.tab" % updateId)
            if isfile(indexFname):
                inFiles.append(indexFname)

    if len(inFiles) == 0:
        logging.warn("No article chunk assignment")
        return None

    logging.debug("Input files for article -> chunk assignment: %s" % inFiles)
    articleChunks = {}
    for inFile in inFiles:
        logging.info("Parsing %s" % inFile)
        for row in maxCommon.iterTsvRows(inFile):
            chunkId = int(row.chunkId.split("_")[1])
            articleChunks[int(row.articleId)] = int(chunkId)
    return articleChunks
def updatePmids(medlineDir, crawlDir, updateIds, minYear=None):
    """ go over subdirs of crawlDir, for each: read the ISSNs, and add new
    PMIDs we have in medlineDir to subdir/pmids.txt
    We never remove a PMID from pmids.txt.
    """
    logging.info("Now updating crawler directories with the new PMIDs")
    eIssnToPIssn = getEIssnToPIssn(pubConf.publisherIssnTable)
    subDirs = getSubdirs(crawlDir)
    con, cur = pubStore.openArticleDb("medline", mustOpen=True, useRamdisk=True)
    for subdir in subDirs:
        if subdir.endswith(".tmp"):
            continue
        subPath = join(crawlDir, subdir)
        logging.info("Processing subdirectory %s" % subPath)
        if isfile(pubCrawlLib.getLockFname(subPath)):
            logging.warn("Found lockfile, looks like a crawl is going on in %s, skipping" % subPath)
            continue

        pmidFname = join(crawlDir, subdir, "pmids.txt")
        issnFname = join(crawlDir, subdir, "issns.tab")
        if not isfile(issnFname) or not isfile(pmidFname):
            logging.info("Skipping %s, ISSN or docId file not found" % subPath)
            continue
        logging.debug("reading subdir %s: %s and %s" % (subdir, pmidFname, issnFname))
        issns = [row.issn.strip() for row in maxCommon.iterTsvRows(issnFname)]
        logging.debug("ISSNs: %s" % ",".join(issns))

        # read old pmids
        oldPmids = set([int(line.rstrip()) for line in open(pmidFname)])
        #newPmids = set()

        # add new pmids, for each issn
        newPmids = getPmidsForIssns(con, cur, issns, minYear)
        logging.debug("%d PMIDs" % (len(newPmids)))

        oldCount = len(oldPmids)
        updateCount = len(newPmids)
        oldPmids.update(newPmids) # faster to add new to old set than old to new set
        pmids = oldPmids
        newCount = len(pmids)
        addCount = newCount - oldCount
        logging.info("crawl dir %s: old PMID count %d, update has %d, new total %d, added %d" % \
            (subdir, oldCount, updateCount, newCount, addCount))

        # write new pmids
        pmids = [str(x) for x in pmids]
        # randomize order, to distribute errors
        random.shuffle(pmids)

        # write all pmids to a tmp file
        pmidTmpFname = pmidFname + ".new"
        pmidFh = open(pmidTmpFname, "w")
        pmidFh.write("\n".join(pmids))
        pmidFh.close()

        # keep a copy of the original pmid file
        shutil.copy(pmidFname, pmidFname + ".bak")

        # atomic rename the tmp file to the original file
        # to make sure that an intact pmid file always exists
        os.rename(pmidTmpFname, pmidFname)
def getAllBatchIds(outDir):
    """ parse batches.tab and return all available batchIds """
    batchIds = []
    for row in maxCommon.iterTsvRows(join(outDir, "batches.tab")):
        batchIds.append(row.batchId)
    logging.debug("Found batchIds %s in directory %s" % (batchIds, outDir))
    return batchIds
def loadTsvSqlite(dbFname, tableName, tsvFnames, headers=None, intFields=[], \
        primKey=None, idxFields=[], dropTable=True):
    " load tabsep file into sqlLite db table "
    # if tsvFnames is a single string, wrap it in a list
    if len(tsvFnames) == 0:
        logging.debug("No filenames to load")
        return
    if isinstance(tsvFnames, basestring):
        tsvFnames = [tsvFnames]

    if os.path.isfile(dbFname):
        lockDb = False
        finalDbFname = None
    else:
        lockDb = True
        finalDbFname = dbFname
        dbFname = pubGeneric.getFastUniqueTempFname()
        logging.info("writing first to db on ramdisk %s" % dbFname)
    con, cur = openSqlite(dbFname, lockDb=lockDb)

    # drop old table
    if dropTable:
        logging.debug("dropping old sqlite table")
        cur.execute('DROP TABLE IF EXISTS %s;' % tableName)
        con.commit()

    # create table
    createSql, idxSqls = makeTableCreateStatement(tableName, headers, \
        intFields=intFields, idxFields=idxFields, primKey=primKey)
    logging.log(5, "creating table with %s" % createSql)
    cur.execute(createSql)
    con.commit()

    logging.info("Loading data into table")
    tp = maxCommon.ProgressMeter(len(tsvFnames))
    sql = "INSERT INTO %s (%s) VALUES (%s)" % (tableName, ", ".join(headers), ", ".join(["?"] * len(headers)))
    for tsvName in tsvFnames:
        logging.debug("Importing %s" % tsvName)
        if os.path.getsize(tsvName) == 0:
            logging.debug("Skipping %s, zero size" % tsvName)
            continue
        rows = list(maxCommon.iterTsvRows(tsvName))
        logging.log(5, "Running Sql %s against %d rows" % (sql, len(rows)))
        cur.executemany(sql, rows)
        con.commit()
        tp.taskCompleted()

    logging.info("Adding indexes to table")
    for idxSql in idxSqls:
        cur.execute(idxSql)
        con.commit()
    con.close()

    if finalDbFname != None:
        logging.info("moving over ramdisk db to %s" % finalDbFname)
        shutil.move(dbFname, finalDbFname)
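# Usage sketch, not part of the original module: load a single TSV file into an sqlite
# table. All file, table and column names here are hypothetical; "headers" must list the
# TSV columns in their file order.
#
# loadTsvSqlite("articles.db", "articleIds", ["articleIds.tab"],
#     headers=["articleId", "pmid", "doi"], intFields=["articleId"],
#     primKey="articleId", idxFields=["pmid"])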
def parseUidToCounts(fname):
    res = {}
    for row in maxCommon.iterTsvRows(fname):
        total = int(row.total)
        geneProtCount = int(row.geneProtCount)
        res[row.uid] = (total, geneProtCount)
    logging.info('Found "gene/protein"-counts for %d journals in %s' % (len(res), fname))
    return res
def parseDoneIds(fname):
    " parse all already converted identifiers from the given file "
    doneIds = set()
    if os.path.getsize(fname) == 0:
        return doneIds
    for row in maxCommon.iterTsvRows(fname):
        doneIds.add(row.doi)
    logging.info("Found %d identifiers of already parsed files" % len(doneIds))
    return doneIds
def convertOneChunk(inIndexFile, outFile):
    """ get files from inIndexFile, parse Xml,
        write everything to outfile in ascii format
    """
    store = pubStore.PubWriterFile(outFile)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    doi2pmid = None
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    for row in inRows:
        # read line
        i += 1
        articleId, baseDir = row.articleId, row.baseDir
        zipFilename, filename = row.zipFilename, row.filename
        articleId = int(articleId)

        # open file from zipfile
        fullZipPath = join(baseDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows) - i))
        if doi2pmid == None:
            doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        xmlTree = pubXml.etreeFromXml(xmlString)

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData == None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"] = "consyn://" + zipFilename + "/" + filename
        if articleData["doi"] in doi2pmid:
            articleData["pmid"] = doi2pmid[articleData["doi"]]

        pii = splitext(basename(filename))[0]
        articleData["externalId"] = "PII" + pii
        articleData["fulltextUrl"] = "http://www.sciencedirect.com/science/svapps/pii/" + pii

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString == None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000 * (articleId)) + 1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files" % convCount)
    store.close()
def convertOneChunk(gzDir, idFname, inIndexFile, outFile):
    # for each row in index:
    store = pubStore.PubWriterFile(outFile)
    donePiis = pubGeneric.parseDoneIds(idFname)

    # log to file
    outBase = join(dirname(outFile), basename(outFile).split(".")[0])
    logFname = outBase + ".log"
    pubGeneric.setupLogging(__file__, None, logFileName=logFname)

    idFname = outBase + "_ids.tab"
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\texternalId\n")

    lastTsvFname = None
    tsvFile = None
    pmidFinder = pubCompare.PmidFinder()
    for row in maxCommon.iterTsvRows(inIndexFile, encoding=None):
        # open file and seek, if necessary
        if tsvFile == None or lastTsvFname != row.tsvFile:
            logging.debug("Seeking to %s in tsvfile %s" % (row.offset, row.tsvFile))
            tsvFile = gzip.open(join(gzDir, row.tsvFile))
            tsvFile.seek(int(row.offset))
        lastTsvFname = row.tsvFile

        line = tsvFile.readline()

        if row.url.startswith("!"):
            logging.info("Ignoring %s, marked as duplicated" % row.url)
            continue

        # fields are: ["articleId", "tsvFile", "url", "offset"]
        fields = line.split("\t")
        url = fields[0]
        logging.debug("Replacing weird bing chars")
        content = fields[-1]
        assert url == row.url
        assert len(content) != 0
        url = url.decode("utf8")

        logging.debug("Converting to text")
        content = convertMicrosoft(content)
        artDict, fileDict = convertHtmlToDicts(url, content)
        if artDict == None:
            artDict, fileDict = minimalHtmlToDicts(url, content)
        if artDict == None:
            continue
        artDict["pmid"] = pmidFinder.lookupPmid(artDict)

        # write file
        articleId = int(row.articleId)
        fileId = articleId * 1000
        store.writeFile(articleId, fileId, fileDict)
        store.writeArticle(articleId, artDict)
    store.close()
def startup(paramDict):
    global geneIds
    fname = join(dirname(__file__), "data", "wormFinder", "wormIds.tab.gz")
    geneCount = 0
    for row in maxCommon.iterTsvRows(fname):
        if row.locus != "":
            geneIds[row.locus] = row.geneId
        if row.seqId != "":
            geneIds[row.seqId] = row.geneId
        geneCount += 1
        #if row.geneId!="":
            #geneIds[row.geneId] = row.geneId
    logging.info("Loaded %d words mapped to %d genes" % (len(geneIds), geneCount))
def parseHighwire():
    """ create two dicts
    printIssn -> url to pmidlookup-cgi of highwire
    and publisherName -> top-level hostnames
    >>> temps, domains = parseHighwire()
    >>> temps['0270-6474']
    u'http://www.jneurosci.org/cgi/pmidlookup?view=long&pmid=%(pmid)s'
    >>> domains["Society for Neuroscience"]
    set([u'jneurosci'])
    """
    # highwire's publisher names are not resolved ("SAGE", "SAGE Pub", etc)
    # so: first get dict printIssn -> resolved publisherName from publishers.tab
    pubFname = join(pubConf.publisherDir, "publishers.tab")
    pIssnToPub = {}
    for row in maxCommon.iterTsvRows(pubFname):
        if not row.pubName.startswith("HIGHWIRE"):
            continue
        for issn in row.journalIssns.split("|"):
            issn = issn.rstrip(" ")
            pIssnToPub[issn] = row.pubName.replace("HIGHWIRE ", "").strip()

    # go over highwire table and make dict pubName -> issn -> templates
    # and dict pubName -> domains
    fname = join(pubConf.journalListDir, "highwire.tab")
    templates = {}
    domains = {}
    for row in maxCommon.iterTsvRows(fname, encoding="latin1"):
        if row.eIssn.strip() == "Unknown":
            continue
        pubName = pIssnToPub[row.pIssn.strip()].strip()
        templates.setdefault(pubName, {})
        templates[row.pIssn.strip()] = row.urls.strip() + "/cgi/pmidlookup?view=long&pmid=%(pmid)s"
        host = urlparse.urlparse(row.urls).hostname
        domain = ".".join(host.split('.')[-2:]).strip()
        domains.setdefault(pubName, set()).add(domain)
    return templates, domains
def getEIssnToPIssn(journalFname):
    """ return a dict that maps from eIssn to pIssn """
    logging.info("Parsing %s to get eIssn -> pIssn mapping" % journalFname)
    ret = {}
    for row in maxCommon.iterTsvRows(journalFname):
        eStr = row.journalEIssns
        pStr = row.journalIssns
        if eStr == "" or pStr == "":
            continue
        eIssns = eStr.split("|")
        pIssns = pStr.split("|")
        assert len(eIssns) == len(pIssns)
        for eIs, pIs in zip(eIssns, pIssns):
            if eIs != "" and pIs != "":
                ret[eIs] = pIs
    return ret
def getAllUpdateIds(datasets):
    " collect all available text dataset updateIds for all datasets "
    textUpdateIds = {}
    for dataset in datasets:
        textDir = pubConf.resolveTextDir(dataset)
        updateFname = join(textDir, "updates.tab")
        logging.debug("Reading %s" % updateFname)
        updateIds = []
        for row in maxCommon.iterTsvRows(updateFname):
            updateIds.append(row.updateId)
        textUpdateIds[dataset] = updateIds
    return textUpdateIds

    # NOTE: the code below is unreachable because of the return above; it was
    # apparently meant to cache the result as JSON (and would need json.dump,
    # not json.dumps, to actually write the file).
    # also save to file, so we don't have to do this again
    outFname = join(batchDir, "updateIds.json")
    json.dumps(textUpdateIds, open(outFname, "w"), sort_keys=True, indent=4)
    return textUpdateIds
def runProcessRow(inName, alg, paramDict, outName):
    " run the rows from inName through alg and write to outName "
    tmpFnames = []
    outFh, tmpFnames = newTempOutFile(tmpFnames, outName, alg, None)
    for row in maxCommon.iterTsvRows(inName):
        newRow = alg.processRow(row)
        if newRow != None and len(newRow) != 0:
            writeRow(newRow, outFh)

    if "allResults" in dir(alg):
        logging.debug("running allResults() function")
        rows = alg.allResults()
        if rows != None:
            for row in rows:
                writeRow(row, outFh)
    outFh.close()
    moveTempToFinal(tmpFnames[0], outName)
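# Minimal sketch of the "alg" interface that runProcessRow() relies on; this class is an
# assumption derived from the calls above, not taken from the original source. processRow()
# is called once per TSV row and returns the output fields or None; allResults(), if
# present, is called once at the end for extra rows; the headers attribute is assumed to
# be read by the output-file setup.
class ExampleRowAlg(object):
    headers = ["articleId", "pmid"]

    def processRow(self, row):
        # skip rows without a PMID, otherwise emit two output fields
        if row.pmid == "":
            return None
        return [row.articleId, row.pmid]

    def allResults(self):
        # optional: rows appended after the last input row
        return []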
def concatDois(inDir, outDir, outFname):
    " concat all dois of id files in inDir to outFname "
    outPath = join(outDir, outFname)
    inMask = join(inDir, "*ids.tab")
    idFnames = glob.glob(inMask)
    logging.debug("Concatting DOIs from %s to %s" % (inMask, outPath))
    dois = []
    for inFname in idFnames:
        for row in maxCommon.iterTsvRows(inFname):
            dois.append(row.doi)

    ofh = open(outPath, "w")
    ofh.write("#doi\n")
    for doi in dois:
        ofh.write("%s\n" % doi)
    ofh.close()
    return outPath
def parseEntrez(fname):
    """ parse a tab-sep table with headers and return one dict with entrez to refprots
    and another dict with entrez to symbol
    """
    entrez2Sym = dict()
    entrez2RefseqProts = dict()
    for row in maxCommon.iterTsvRows(fname):
        entrez2Sym[int(row.entrezId)] = row.sym
        #refseqs = row.refseqIds.split(",")
        if row.refseqProtIds == "":
            refProts = None
        else:
            refProts = row.refseqProtIds.split(",")
            #assert(len(refProts)==len(refseqs))
        entrez2RefseqProts[int(row.entrezId)] = refProts
    return entrez2Sym, entrez2RefseqProts
def splitTabFileOnChunkId(filename, outDir, chunkSize=None, chunkCount=None):
    """ use the chunkId field of a tab-sep file as the output filename.
    if chunkSize is specified, ignore the chunkId field and make sure that each piece
    has chunkSize lines.
    """
    if isdir(outDir):
        logging.info("Deleting %s" % outDir)
        shutil.rmtree(outDir)

    if not os.path.isdir(outDir):
        logging.info("Creating directory %s" % outDir)
        os.makedirs(outDir)
    maxCommon.mustBeEmptyDir(outDir)

    # read data into data dict and split by "chunkId" field
    headerLine = open(filename).readline()
    logging.info("Reading %s, splitting into pieces" % filename)
    data = {}
    i = 0
    for row in maxCommon.iterTsvRows(filename, encoding=None):
        if chunkSize == None and chunkCount == None:
            chunkId = row.chunkId
        elif chunkSize != None:
            chunkId = "%05d" % (i / chunkSize)
        elif chunkCount != None:
            chunkId = "%05d" % (i % chunkCount) # distribute rows round-robin over chunkCount pieces
        data.setdefault(str(chunkId), []).append("\t".join(row) + "\n")
        i += 1

    # write to outDir
    logging.info("Splitting file data, Writing to %d files in %s/xxxx.tgz" % (len(data), outDir))
    pm = maxCommon.ProgressMeter(len(data))
    for chunkIdString, lines in data.iteritems():
        outfname = os.path.join(outDir, chunkIdString)
        logging.debug("Writing to %s" % outfname)
        fh = open(outfname, "w")
        fh.write(headerLine)
        for line in lines:
            fh.write(line)
        fh.close()
        pm.taskCompleted()

    return data.keys()
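# Usage sketch, not from the original source: with chunkSize=1000, rows 0-999 go to file
# "00000", rows 1000-1999 to "00001", and so on; with chunkCount=8, rows are dealt out
# round-robin over "00000".."00007". The file names below are hypothetical.
#
# chunkIds = splitTabFileOnChunkId("annots.tab", "splitDir", chunkSize=1000)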
def getTargetJournals(journalFname):
    " get english journals with eIssn "
    logging.info("Parsing %s" % journalFname)
    data = {}
    #issnToUid = {}
    for row in maxCommon.iterTsvRows(journalFname):
        if not row.source.startswith("NLM") or row.uniqueId == "":
            continue
        if row.language == "eng" and row.eIssn != "":
            #data.add(row.uniqueId)
            data[row.uniqueId] = row
        #if row.uniqueId!="":
            #issnToUid[row.pIssn] = row.uniqueId
            #issnToUid[row.eIssn] = row.uniqueId
    logging.info("In NLM Catalog, found %d journals with eIssn, english and with UID" % len(data))
    #return data, issnToUid
    return data
def iterCdr3Rows(fname):
    for row in maxCommon.iterTsvRows(fname):
        seq = row.seq
        logging.debug("seq %s" % seq)
        if not (row.prefixFilterAccept == "Y" and row.suffixFilterAccept == "Y"):
            logging.debug("didn't pass prefix or suffix filter")
            continue
        if "CLASS" in seq:
            logging.debug("contains CLASS")
            continue
        if seq in blackList:
            logging.debug("blacklisted")
            continue
        if not hasCdr3Prefix(seq):
            logging.debug("prefix not OK")
            continue
        if hasCdr3Len(seq):
            logging.debug("and length OK")
            yield row
        else:
            # trying to split cdr3s that got fused into separate seqs again
            # note that this makes the annotation ID longer: it adds three additional digits for the sub-parts
            logging.debug("Length not OK, trying to split")
            parts = splitAndKeep(row.seq, cdr3Regex)
            okParts = []
            for p in parts:
                if hasCdr3Prefix(p) and hasCdr3Len(p):
                    okParts.append(p)
            if len(parts) - len(okParts) < len(parts) / 3: # we tolerate a few bad pieces
                for num, p in enumerate(okParts):
                    numStr = "%03d" % num
                    newRow = row._replace(annotId=row.annotId + numStr, seq=p)
                    yield newRow
def parseRegex(mutDataDir):
    """ parse and compile regexes to list (seqType, mutType, patName, pat) """
    # read regexes, translate placeholders to long form and compile
    replDict = {
    "sep"         : r"""(?:^|[\s\(\[\'"/,\-])""",
    "fromPos"     : r'(?P<fromPos>[1-9][0-9]+)',
    "toPos"       : r'(?P<toPos>[1-9][0-9]+)',
    "pos"         : r'(?P<pos>[1-9][0-9]+)',
    "origAaShort" : r'(?P<origAaShort>[CISQMNPKDTFAGHLRWVEYX])',
    "mutAaShort"  : r'(?P<mutAaShort>[CISQMNPKDTFAGHLRWVEYX*])',
    "skipAa"      : r'(CYS|ILE|SER|GLN|MET|ASN|PRO|LYS|ASP|THR|PHE|ALA|GLY|HIS|LEU|ARG|TRP|VAL|GLU|TYR|TER|GLUTAMINE|GLUTAMIC ACID|LEUCINE|VALINE|ISOLEUCINE|LYSINE|ALANINE|GLYCINE|ASPARTATE|METHIONINE|THREONINE|HISTIDINE|ASPARTIC ACID|ARGININE|ASPARAGINE|TRYPTOPHAN|PROLINE|PHENYLALANINE|CYSTEINE|SERINE|GLUTAMATE|TYROSINE|STOP|X)',
    "origAaLong"  : r'(?P<origAaLong>(CYS|ILE|SER|GLN|MET|ASN|PRO|LYS|ASP|THR|PHE|ALA|GLY|HIS|LEU|ARG|TRP|VAL|GLU|TYR|TER|GLUTAMINE|GLUTAMIC ACID|LEUCINE|VALINE|ISOLEUCINE|LYSINE|ALANINE|GLYCINE|ASPARTATE|METHIONINE|THREONINE|HISTIDINE|ASPARTIC ACID|ARGININE|ASPARAGINE|TRYPTOPHAN|PROLINE|PHENYLALANINE|CYSTEINE|SERINE|GLUTAMATE|TYROSINE|STOP|X))',
    "mutAaLong"   : r'(?P<mutAaLong>(CYS|ILE|SER|GLN|MET|ASN|PRO|LYS|ASP|THR|PHE|ALA|GLY|HIS|LEU|ARG|TRP|VAL|GLU|TYR|TER|GLUTAMINE|GLUTAMIC ACID|LEUCINE|VALINE|ISOLEUCINE|LYSINE|ALANINE|GLYCINE|ASPARTATE|METHIONINE|THREONINE|HISTIDINE|ASPARTIC ACID|ARGININE|ASPARAGINE|TRYPTOPHAN|PROLINE|PHENYLALANINE|CYSTEINE|SERINE|GLUTAMATE|TYROSINE|STOP|X))',
    "dna"         : r'(?P<dna>[actgACTG])',
    "origDna"     : r'(?P<origDna>[actgACTG])',
    "mutDna"      : r'(?P<mutDna>[actgACTG])',
    "fs"          : r'(?P<fs>(fs\*?[0-9]*)|fs\*|fs|)?',
    }
    regexTab = join(mutDataDir, "regex.txt")
    logging.info("Parsing regexes from %s" % regexTab)
    regexList = []
    counts = defaultdict(int)
    for row in maxCommon.iterTsvRows(regexTab, commentPrefix="#"):
        logging.log(5, "Translating %s" % row.pat)
        patName = row.patName
        if patName == "":
            patName = row.pat
        patFull = row.pat.format(**replDict)
        logging.log(5, "full pattern is %s" % patFull)
        flags = 0
        if "Long}" in row.pat:
            flags = re.IGNORECASE
            logging.log(5, "ignoring case for this pattern")
        patComp = re.compile(patFull, flags=flags)
        regexList.append((row.seqType, row.mutType, patName, patComp))
        counts[(row.seqType, row.mutType)] += 1

    for regexType, count in counts.iteritems():
        logging.info("regexType %s, found %d regexes" % (str(regexType), count))
    return regexList
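# Sketch of how the (seqType, mutType, patName, pattern) tuples returned by parseRegex()
# might be applied to a piece of text; this helper is an illustration only and is not part
# of the original module.
def iterMutationMatches(regexList, text):
    " yield (seqType, mutType, patName, match) for every regex hit in text "
    for seqType, mutType, patName, patComp in regexList:
        for match in patComp.finditer(text):
            yield seqType, mutType, patName, match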
def addPmids(datasetString):
    " for a given dataset, add the pmids from the pubFingerprint output file to the article files "
    #datasetString = args[0]
    textDir = pubConf.resolveTextDir(datasetString)
    logging.info("Changing article files in %s" % textDir)
    aToPfn = join(textDir, pubConf.idFname)
    logging.info("Reading art -> pmid mapping from %s" % aToPfn)
    artToPmid = parseIdFname(aToPfn)
    fnames = glob.glob(join(textDir, "*.articles.gz"))
    logging.info("Running on %d article files" % len(fnames))
    pm = maxCommon.ProgressMeter(len(fnames), stepCount=100)
    updateSqliteIds(textDir, artToPmid.items())
    #sys.exit(0)

    logging.info("Updating tab sep files")
    for fname in fnames:
        # write headers
        newFname = join(pubConf.TEMPDIR, basename(fname))
        logging.debug("reading %s, writing %s" % (fname, newFname))
        newF = gzip.open(newFname, "w")
        newF.write(gzip.open(fname).readline())

        # write rows, replacing pmids on the way
        for row in maxCommon.iterTsvRows(fname):
            artId = int(row.articleId)
            if int(row.articleId) in artToPmid:
                row = row._replace(pmid=artToPmid[artId])
            newF.write((u'\t'.join(row)).encode("utf8"))
            newF.write("\n")
        newF.close()

        # rename old, move over the new one
        shutil.move(fname, fname + ".bak")
        shutil.move(newFname, fname)
        pm.taskCompleted()
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ get files from inIndexFile, parse Xml,
        write everything to outfile in ascii format
    """
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u'\xbf' in filename:
            logging.info("Found weird character, skipping file")
            continue

        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename == "":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename + ":" + filename

        if pdfString == None:
            pdfNotFound += 1
            logging.error("Could not open pdf or xml file")
            continue

        articleId = int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue
        articleData = parseXml(xmlTree, articleData)

        if articleData == None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"] = zipFilename + "/" + filename
        articleData["externalId"] = articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000 * (articleId)) + 1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ get files from inIndexFile, parse Xml,
        write everything to outfile in ascii format
    """
    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\texternalId\tpmid\n")

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    #doi2pmid = None
    convCount = 0
    skipCount = 0
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % len(inRows))
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        articleId = int(articleId)

        pii = splitext(basename(filename))[0]
        if pii in donePiis:
            logging.debug("PII %s has already been converted, skipping" % pii)
            skipCount += 1
            continue
        donePiis.add(pii)

        # open file from zipfile
        fullZipPath = join(zipDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows) - i))
        #if doi2pmid==None:
            #doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData == None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"] = zipFilename + ":" + filename
        #if articleData["doi"] in doi2pmid:
            #articleData["pmid"] = doi2pmid[articleData["doi"]]

        articleData["externalId"] = pii
        articleData["fulltextUrl"] = "http://www.sciencedirect.com/science/svapps/pii/" + pii
        #articleData["pmid"] = pmidFinder.lookupPmid(articleData)

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString == None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], articleData["externalId"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000 * (articleId)) + 1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
    store.close()
    idFh.close()
def parseIdFname(fname):
    res = {}
    for row in maxCommon.iterTsvRows(fname):
        res[int(row.artId1)] = row.pmid
    return res
        s.add(word)
        #s.add(word[:3])
        #s.add(word[:4])
    return s

uniprotFname = join(pubConf.dbRefDir, "uniprot.tab")
print "Reading %s" % uniprotFname
dictFh = open("uniProt.dict.tab", "w")

print ("parsing BNC")
bncWords = parseBnc()

print ("constructing dictionary")
for row in maxCommon.iterTsvRows(uniprotFname):
    if row.taxonId != "9606":
        continue
    accs = set()
    #accs = appendAll(accs, row.accList.split("|"))
    #accs = appendAll(accs, [x.split(".")[0] for x in row.refSeq.split("|")]) # remove version number
    #accs = appendAll(accs, row.ensemblProt.split("|"))
    #accs = appendAll(accs, row.ensemblGene.split("|"))
    #accs = appendAll(accs, row.embl.split("|"))
    #accs = appendAll(accs, row.pdb.split("|"))
    #accs = appendAll(accs, row.uniGene.split("|"))
    #accs = appendAll(accs, row.omim.split("|"), prefixList=["omim ", "OMIM ", "MIM "])
    #accs = list(set(accs))
    #for delChar in ["*", ",", ".", "/", "(", ")"]:
        #accs = [acc.replace(delChar," ").replace(" ", " ") for acc in accs]
# note: inDir and publisher are defined earlier in the surrounding script (snippet excerpt)
inFnames = glob.glob(join(inDir, "*.articles.gz"))
for inFname in inFnames:
    logging.info("Reading %s" % inFname)
    headerLine = gzip.open(inFname).readline()
    if "publisher" in headerLine:
        logging.info("%s is OK" % inFname)
        continue

    bakFname = inFname + ".bak"
    if isfile(bakFname):
        logging.info("%s exists" % bakFname)
        sys.exit(1)

    logging.info("Renaming %s to %s" % (inFname, bakFname))
    shutil.move(inFname, bakFname)

    headers = headerLine.strip().split("\t")
    headers.insert(3, "publisher")
    outFname = inFname
    inFname = bakFname

    logging.info("Writing %s" % outFname)
    ofh = gzip.open(outFname, "w")
    ofh.write("\t".join(headers) + "\n")
    for row in maxCommon.iterTsvRows(inFname, isGzip=True):
        row = list(row)
        row.insert(3, publisher)
        row = [r.encode("utf8") for r in row]
        line = "\t".join(row) + "\n"
        ofh.write(line)
"uniProt" : "http://www.uniprot.org/uniprot/", "pubmed" : "http://www.ncbi.nlm.nih.gov/pubmed/" } def htmlLink(urlType, acc): return '<a href="%s%s">%s</a>' % (urls[urlType], acc, acc) if __name__ == '__main__': psls = indexPsls("uniProtVsGenome.psl") ofh = open("uniprotMutations.bed", "w") ofh2 = open("temp.bed", "w") uniProtMutFname = join(pubConf.dbRefDir, "uniprot.mut.tab") count = 0 notMapped = [] for mut in maxCommon.iterTsvRows(uniProtMutFname): mapper = PslMapBedMaker() if mut.acc not in psls: notMapped.append(mut.acc) continue mutPos = 3*(int(mut.position)-1) ofh2.write("\t".join([mut.acc, str(mutPos), str(mutPos+3), mut.acc+":"+mut.origAa+mut.position+mut.mutAa])+"\n") mapPsls = psls[mut.acc] for psl in mapPsls: bed = mapper.mapQuery(psl, mutPos, mutPos+3) if bed==None: print("Could not map: ", mut) continue bed[3] = " ".join((mut.disease.split("|")[0]).replace("-", " ").replace(" type", "").split()[:3]) bed.append(mut.disease)