def expData(inDirs, pmidListFname, outBase):
    """ split article main texts into two tab-sep files: articles whose PMID is
    listed in pmidListFname go to outBase.pos.tab, all others to outBase.neg.tab,
    one line per article: pmid <tab> pos|neg <tab> text """
    logging.info("Reading %s" % pmidListFname)
    pmids = set([int(x) for x in tabfile.slurplist(pmidListFname)])
    logging.info("Read %d pmids" % len(pmids))

    posFname = outBase + ".pos.tab"
    negFname = outBase + ".neg.tab"
    posFh = codecs.open(posFname, "w", encoding="utf8")
    negFh = codecs.open(negFname, "w", encoding="utf8")

    for dataDir in inDirs:
        logging.debug(dataDir)
        for article, fileList in pubStore.iterArticleDirList(dataDir):
            if article.pmid == "":
                continue
            # articles on the PMID list are positives, everything else is a negative
            if int(article.pmid) in pmids:
                ofh = posFh
                txtClass = "pos"
            else:
                ofh = negFh
                txtClass = "neg"
            for fileData in fileList:
                if fileData.fileType == "main":
                    text = fileData.content
                    # replace ASCII bell control characters with spaces
                    text = text.replace("\a", " ")
                    ofh.write("%s\t%s\t%s\n" % (article.pmid, txtClass, text))
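# Usage sketch for expData (not part of the original code); the pubStore
# directories, the PMID list file and the output prefix are hypothetical examples.
def _expDataExample():
    # split two pubStore directories into positive/negative training text,
    # using the PMIDs listed one per line in relevantPmids.txt
    expData(["/data/pubStore/pmc", "/data/pubStore/elsevier"],
            "relevantPmids.txt", "textClassifier/train")
    # this writes textClassifier/train.pos.tab and textClassifier/train.neg.tab,
    # one line per article: pmid <tab> pos|neg <tab> main text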
def paramFile(self, param, value, cgiMode=False):
    """ return value OR contents of value (name-of-file) if conditions are met
    to read in the file """
    if value != "":
        if (param.ptype.lower() == "textlinesarg" and not cgiMode) or \
                param.ptype.lower() == "textlines":
            if param.textFormat == "":
                return tabfile.slurplist(value)
            elif param.textFormat == "dictlist":
                return tabfile.slurpdictlist(value)
            elif param.textFormat == "dict":
                return tabfile.slurpdict(value)
            else:
                return value
        else:
            return value
def startup(paramDict, resultDict):
    # load the list of DOIs from data/doi.tab and prepare an empty result set for each
    global doiSet
    doiFname = join(dirname(__file__), "data/doi.tab")
    doiSet = set(tabfile.slurplist(doiFname))
    for doi in doiSet:
        resultDict[doi] = set()
def tabToFasta(inFilename, outFilename, linnaeusFilename, genomesFilename,
               numArtIdField=0, seqIdField=1, artIdField=2, seqField=5,
               maxTotalSeqLen=1000000, maxSeqLen=100000, maxSeqCount=100):
    """ convert tab-sep file from fttools to fasta file, linnaeusFilename has
    docId<tab>taxonId format and genomesFilename has <genomeId> format """
    if linnaeusFilename is not None:
        logging.debug("Reading LINNAEUS files")
        docToTaxon = tabfile.slurpdictset(linnaeusFilename)
        logging.debug("Reading sequenced genome list")
        sequencedTaxons = set(tabfile.slurplist(genomesFilename))
        taxonFileObjCache = {}

        logging.debug("Filtering LINNAEUS data by sequenced genome list")
        # filter the taxonIds by sequenced genome Ids
        filteredDocToTaxon = {}
        allSequencedTaxonIds = set()
        for docId, taxonIds in docToTaxon.iteritems():
            filtTaxons = set()
            for taxonId in taxonIds:
                if taxonId in sequencedTaxons:
                    filtTaxons.add(taxonId)
                    allSequencedTaxonIds.add(taxonId)
            filteredDocToTaxon[docId] = filtTaxons
        docToTaxon = {}  # no need to waste memory with this
        logging.debug("Got data for %d sequenced genomes" % len(allSequencedTaxonIds))

        # open filehandles for these
        taxonToFile = {}
        for taxonId in allSequencedTaxonIds:
            taxonToFile[taxonId] = open(outFilename + "." + str(taxonId) + ".fa", "w")
    else:
        docToTaxon = None
        outFileObj = open(outFilename, "w")

    colCount = len(open(inFilename).readline().split("\t"))
    br = maxTables.BlockReader(open(inFilename), 0, mustBeSorted=False)
    for artId, block in br.readNext():
        sequences = []
        seqSet = set()
        internalId = None
        externalId = None
        for fs in block:
            internalId = fs[numArtIdField]
            externalId = fs[artIdField]
            seq = fs[seqField]
            seqId = fs[seqIdField]
            letterCount = len(set(seq))
            if letterCount <= 2:
                logging.debug("Skipping sequence %s of article %s, not more than 2 letters in string: %s" % (seqId, externalId, seq))
            elif seq in seqSet:
                logging.debug("Skipping sequence %s of article %s, already seen before: %s" % (seqId, externalId, seq))
            elif len(seq) > maxSeqLen:
                logging.debug("Skipping sequence %s of article %s; longer than %d bp: %s" % (seqId, externalId, maxSeqLen, seq))
            else:
                sequences.append((seqId, seq))
                seqSet.add(seq)

        seqCount = len(sequences)
        totalSeqLen = sum([len(y) for x, y in sequences])
        if totalSeqLen > maxTotalSeqLen:
            logging.debug("Skipping article %s, total sequence length %d > %d" % (externalId, totalSeqLen, maxTotalSeqLen))
        elif seqCount > maxSeqCount:
            logging.debug("Skipping article %s, total sequence count %d > %d" % (externalId, seqCount, maxSeqCount))
        else:
            for seqId, seq in sequences:
                if docToTaxon is None:
                    outFileObj.write(">%s|%s\n%s\n" % (internalId, seqId, seq))
                else:
                    taxonIds = filteredDocToTaxon.get(internalId, None)
                    #logging.debug("TaxonIds are %s" % str(taxonIds))
                    if taxonIds is None:
                        taxonIds = allSequencedTaxonIds
                    for taxonId in taxonIds:
                        outFileObj = taxonToFile[taxonId]
                        outFileObj.write(">%s|%s\n%s\n" % (internalId, seqId, seq))
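# Usage sketch for tabToFasta (not part of the original code); file names are
# hypothetical and the column layout is the default assumed above (numeric
# article id in column 0, sequence id in column 1, external article id in
# column 2, sequence in column 5).
def _tabToFastaExample():
    # seqs.tab      : tab-sep fttools output, one row per extracted sequence
    # doc2taxon.tab : docId <tab> taxonId  (LINNAEUS species assignments)
    # genomes.txt   : one taxonId per line, the taxa with a sequenced genome
    tabToFasta("seqs.tab", "out/seqs.fa", "doc2taxon.tab", "genomes.txt",
               maxSeqLen=50000, maxSeqCount=50)
    # writes out/seqs.fa.<taxonId>.fa for every taxonId that occurs both in
    # genomes.txt and in doc2taxon.tab; articles without a species assignment
    # are written to all of these per-taxon files

    # without LINNAEUS data, all sequences go into a single fasta file
    tabToFasta("seqs.tab", "out/seqs.fa", None, None)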