def getArtDbPath(datasetName): """ return the sqlite database name with meta info of a dataset """ dataDir = pubConf.resolveTextDir(datasetName, mustFind=False) if dataDir==None: return None dbPath = join(dataDir, "articles.db") return dbPath
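A minimal usage sketch of the path returned above. It only assumes what the docstring states, namely that the file is a sqlite database; the table layout is not shown in these snippets, so the example just connects and lists the tables. The dataset name "pmc" is an example taken from resolveDatasetDesc's docstring.

# usage sketch: open the per-dataset articles.db and list its tables
import sqlite3

dbPath = getArtDbPath("pmc")   # "pmc" is an example dataset name
if dbPath is not None:
    conn = sqlite3.connect(dbPath)
    # sqlite_master holds the schema; we only inspect it, table names are unknown here
    tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
    print(tables)
    conn.close()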
def resolveDatasetDesc(descs):
    " resolve a comma-sep list of dataset identifiers like pmc or elsevier to a list of directories "
    dirs = []
    for desc in descs.split(","):
        descDir = pubConf.resolveTextDir(desc)
        if descDir == None:
            raise Exception("Unknown dataset: %s" % desc)
        dirs.append(descDir)
    return dirs
def __init__(self, dataset):
    self.dataset = dataset
    self.textDir = pubConf.resolveTextDir(dataset)
    if self.textDir == None:
        raise Exception("dataset %s can not be resolved to a directory" % dataset)
    self.pubMapBaseDir = pubConf.pubMapBaseDir
    maxCommon.mustExistDir(pubConf.pubMapBaseDir, makeDir=True)
    self._defineBatchDirectories()
def findFiles(dataset):
    """ return the full paths of all *.articles.gz files of a dataset;
    the dataset name is resolved to a directory via pubConf.resolveTextDir
    """
    #assert(type(datasets)==types.ListType)
    fnames = []
    dataDir = pubConf.resolveTextDir(dataset)
    if dataDir == None:
        raise Exception("error in input data spec")
    fnames.extend(glob.glob(join(dataDir, "*.articles.gz")))
    if len(fnames) == 0:
        raise Exception("Could not find any *.articles.gz files in %s" % dataDir)
    return fnames
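A short usage sketch for the file list returned above. It assumes, as addPmids below does, that maxCommon.iterTsvRows can read the gzipped tab-separated *.articles.gz files directly and that the rows carry an articleId column; "pmc" is again just an example dataset name.

# usage sketch: iterate over all article rows of one dataset
for fname in findFiles("pmc"):
    for row in maxCommon.iterTsvRows(fname):
        print(row.articleId)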
def __init__(self, dataset):
    self.markerCountsBase = MARKERCOUNTSBASE
    self.markerDirBase = MARKERDIRBASE
    self.pubMapBaseDir = pubConf.pubMapBaseDir
    maxCommon.mustExistDir(pubConf.pubMapBaseDir, makeDir=True)
    self.dataset = dataset
    if "," in dataset:
        logging.debug("comma in dataset description, deferring config")
        return
    self.textDir = pubConf.resolveTextDir(dataset)
    if self.textDir == None:
        raise Exception("dataset %s can not be resolved to a directory" % dataset)
    self._defineBatchDirectories()
def getAllUpdateIds(datasets):
    " collect all available text dataset updateIds for all datasets "
    textUpdateIds = {}
    for dataset in datasets:
        textDir = pubConf.resolveTextDir(dataset)
        updateFname = join(textDir, "updates.tab")
        logging.debug("Reading %s" % updateFname)
        updateIds = []
        for row in maxCommon.iterTsvRows(updateFname):
            updateIds.append(row.updateId)
        textUpdateIds[dataset] = updateIds
    return textUpdateIds

    # the block below is unreachable as written (early return above) and
    # batchDir is not defined in this scope; kept for reference
    # also save to file, so we don't have to do this again
    outFname = join(batchDir, "updateIds.json")
    json.dump(textUpdateIds, open(outFname, "w"), sort_keys=True, indent=4)
    return textUpdateIds
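A usage sketch for the collector above. The dataset names "pmc" and "elsevier" are examples from resolveDatasetDesc's docstring; each resolved dataset directory is assumed to contain an updates.tab file with an updateId column, as the function expects.

# usage sketch: report how many updates each dataset has
textUpdateIds = getAllUpdateIds(["pmc", "elsevier"])
for dataset, ids in textUpdateIds.items():
    logging.info("%s has %d updates" % (dataset, len(ids)))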
def __init__(self, dataset, outDir):
    self.markerCountsBase = MARKERCOUNTSBASE
    self.markerDirBase = MARKERDIRBASE
    assert(outDir != None and outDir != "")
    self.pubMapBaseDir = outDir
    maxCommon.mustExistDir(self.pubMapBaseDir, makeDir=True)
    logging.debug("Main pipeline outdir is %s" % outDir)
    self.dataset = dataset
    if "," in dataset:
        logging.debug("comma in dataset description, deferring config")
        return
    self.textDir = pubConf.resolveTextDir(dataset)
    if self.textDir == None:
        raise Exception("dataset %s can not be resolved to a directory" % dataset)
    # base dir for dataset
    self.baseDir = join(self.pubMapBaseDir, self.dataset)
    self.batchId = self._findCurrentBatchDir()
    self.batchDir = join(self.baseDirBatches, str(self.batchId))
    self._defineBatchDirectories()
def addPmids(datasetString):
    " for a given dataset, add the pmids from the pubFingerprint output file to the article files "
    #datasetString = args[0]
    textDir = pubConf.resolveTextDir(datasetString)
    logging.info("Changing article files in %s" % textDir)
    aToPfn = join(textDir, pubConf.idFname)
    logging.info("Reading art -> pmid mapping from %s" % aToPfn)
    artToPmid = parseIdFname(aToPfn)
    fnames = glob.glob(join(textDir, "*.articles.gz"))
    logging.info("Running on %d article files" % len(fnames))
    pm = maxCommon.ProgressMeter(len(fnames), stepCount=100)
    updateSqliteIds(textDir, artToPmid.items())
    #sys.exit(0)
    logging.info("Updating tab sep files")
    for fname in fnames:
        # write headers
        newFname = join(pubConf.TEMPDIR, basename(fname))
        logging.debug("reading %s, writing %s" % (fname, newFname))
        newF = gzip.open(newFname, "w")
        newF.write(gzip.open(fname).readline())
        # write rows, replacing pmids on the way
        for row in maxCommon.iterTsvRows(fname):
            artId = int(row.articleId)
            if artId in artToPmid:
                row = row._replace(pmid=artToPmid[artId])
            newF.write((u'\t'.join(row)).encode("utf8"))
            newF.write("\n")
        newF.close()
        # rename old, move over the new one
        shutil.move(fname, fname + ".bak")
        shutil.move(newFname, fname)
        pm.taskCompleted()
def getArtDbPath(datasetName): " return the sqlite database name with meta info of a dataset " dataDir = pubConf.resolveTextDir(datasetName) dbPath = join(dataDir, "articles.db") return dbPath
def __init__(self):
    textDir = pubConf.resolveTextDir("medline")
    fname = join(textDir, FINGERPRINTFNAME)
    self.db = pubGeneric.openKeyValDb(fname)
    self.noPrints = []
    self.noMatches = []
def filterCmd(inSpec, searchSpec, outSpec, options):
    outDir = pubConf.resolveTextDir(outSpec)
    assert(outDir != None)
    maxCommon.mustBeEmptyDir(outDir)
    return submitJobs(inSpec, searchSpec, outDir)