def load(self, fname):
    """Restore self.logProbs from the pickle file *fname*.

    Also sets self.name to the file's base name without extension.
    GC is disabled around unpickling to speed up loading of large
    object graphs; the original left GC disabled (and the file open)
    if cPickle.load raised — both are now cleaned up in a finally.
    """
    self.name = splitext(basename(fname))[0]
    fileObj = openFile(fname)
    logging.info("reading dimers from file...")
    gc.disable()
    try:
        # NOTE(review): pickle is unsafe on untrusted input — only load trusted files
        self.logProbs = cPickle.load(fileObj)
    finally:
        # re-enable GC and release the handle even if unpickling fails
        gc.enable()
        fileObj.close()
def save(self, fname):
    """Pickle self.logProbs to the file *fname*.

    GC is disabled around pickling to speed up dumping of large object
    graphs; the original left GC disabled (and the file open) if
    cPickle.dump raised — both are now cleaned up in a finally.
    """
    fileObj = openFile(fname, "w")
    logging.info("writing dimers to file...")
    gc.disable()
    try:
        cPickle.dump(self.logProbs, fileObj)
    finally:
        # re-enable GC and flush/close the file even if pickling fails
        gc.enable()
        fileObj.close()
def convertBlastFiles(blastDirs, genomeToTaxFile, tempFilename, outFilename, fileFormat):
    """Collect all alignment files from blastDirs (and their subdirs), map each
    file's organism to a genome id via genomeToTaxFile, and write one
    tab-separated hit line per alignment to tempFilename; finally sort/uniq the
    result into outFilename with the UNIX sort command.

    Output columns: pmcId, genomeId, seqId, chrom, start, end, score, percentId.
    fileFormat selects the parser: "blast" (*.blast), "blat" (*.psl) or
    "bwa" (*.sam). Raises ValueError on any other value.
    """
    outfh = maxbio.openFile(tempFilename, "w")

    # read genome -> taxid map; genome names are normalized to lowercase_with_underscores
    if genomeToTaxFile:
        orgToNum = {}
        logger.info("Reading genome -> number table from %s" % genomeToTaxFile)
        logger.info("Expected input fields are: (GenomeName, other field,..., other field, genomeId)")
        for l in open(genomeToTaxFile):
            if l.startswith("#"):
                continue
            fs = l.strip().split("\t")
            genome = fs[0]
            num = int(fs[-1])
            genome = genome.lower().replace(" ", "_")
            orgToNum[genome] = num
    else:
        logger.info("No genome map specified, results are not genome-based")
        orgToNum = None

    dropFileCount = 0
    lineCount = 0
    finishedTaxons = set()
    lastDir = None

    # map file format to the extension we search for
    extensions = {"blast": ".blast", "blat": ".psl", "bwa": ".sam"}
    if fileFormat not in extensions:
        # was assert(False): raise instead so the check survives `python -O`
        raise ValueError("illegal fileFormat parameter: %r" % fileFormat)
    ext = extensions[fileFormat]

    for blastDir in blastDirs:
        logger.info("Searching for files with extension %s in directory %s" % (ext, blastDir))
        files = list(util.findSubdirFiles(blastDir, ext))
        logger.info("Found %d files" % len(files))

        for fname in files:
            # resolve the organism (parent directory name) to a taxon id,
            # drop the file if that is not possible
            dirname = os.path.dirname(fname)
            org = os.path.basename(dirname).lower()
            if orgToNum is not None:
                if org in orgToNum:
                    orgNum = orgToNum[org]
                else:
                    # fallback: search for any known genome name inside the filename
                    found = False
                    for dbOrg in orgToNum:
                        if dbOrg.replace(" ", "_").lower() in fname.lower():
                            orgNum = orgToNum[dbOrg]
                            logger.info("Found orgName %s in filename %s, using organism id %s" % (dbOrg, fname, str(orgNum)))
                            found = True
                            break
                    if not found:
                        logger.warning("warning: could not resolve filename %s to taxid, dropping this file (recognized organism %s)" % (fname, org))
                        dropFileCount += 1
                        continue
            else:
                orgNum = -1

            # check if not already processed AND in different directory
            # (blast creates several indices per directory), skip if yes
            #if orgNum in finishedTaxons and dirname != lastDir:
            #    print("warning: already processed this taxon id %d, skipping input file %s)" % (orgNum, fname))
            #    continue
            finishedTaxons.add(orgNum)
            lastDir = dirname

            if orgNum != -1:
                logger.info("Reading %s, genomeID %d" % (fname, orgNum))
            else:
                logger.info("Reading %s, not linked to any genome id" % (fname))

            if fileFormat == "bwa":
                tp = maxTables.TableParser(fileType="sam")

            # convert lines; the original leaked this handle, close it in a finally
            f = open(fname, "r")
            try:
                for l in f:
                    lineCount += 1
                    fs = l.strip().split("\t")
                    if fileFormat == "blast":
                        # example:
                        # 11495631 chr1 100.00 23 0 0 1 23 25500772 25500750 2e-05 46.1
                        srcId, trgId, perc, length, dummy, dummy, dummy, length, trgStart, trgEnd, eVal, score = fs
                    elif fileFormat == "blat":
                        # psl format, field list documented at
                        # http://genome.ucsc.edu/FAQ/FAQformat.html#format2
                        # example:
                        # 23 0 0 0 0 0 0 0 + 11075971|1 2299 2248 2271 scaffold_281 111378 17336 17359 1 23, 2248, 17336,
                        matches, misMatches, repMatches, nCount, qNumInsert, qBaseInsert, \
                            tNumInsert, tBaseInsert, strand, qName, qSize, qStart, qEnd, \
                            tName, tSize, tStart, tEnd, blockCount, blockSizes, qStarts, tStarts = fs
                        score = matches
                        # percent identity over aligned bases; repeat matches count as matches
                        perc = "%2.2f" % ((float(matches) + float(repMatches)) /
                                          (float(matches) + float(misMatches) + float(repMatches)) * 100.0)
                        trgId = tName
                        srcId = qName
                        trgStart = tStart
                        trgEnd = tEnd
                    else:  # fileFormat == "bwa", guaranteed by the check above
                        if fs[0].startswith("@"):
                            # sam header line
                            continue
                        if len(fs) == 11:
                            # pad the optional sam field so the parser sees 12 columns
                            fs.append("")
                        row = tp.parseTuple(fs)
                        bedTuple = maxTables.samToBed(row)  # renamed: `tuple` shadowed the builtin
                        if bedTuple is None:
                            continue
                        chrom, start, end, name, score, strand = bedTuple
                        srcId = name
                        trgId = chrom
                        perc = "0"
                        length = start - end
                        trgStart, trgEnd = start, end

                    # normalize coordinates so that start <= end
                    trgEnd = int(trgEnd)
                    trgStart = int(trgStart)
                    if trgEnd < trgStart:
                        trgStart, trgEnd = trgEnd, trgStart

                    # query ids look like "PMC<id>[.txt|.pdf]|<seqNo>" — TODO confirm
                    fs = srcId.split("|")
                    srcId = fs[0]
                    srcSeq = fs[1]
                    pmcId = srcId.replace("PMC", "")
                    pmcId = pmcId.replace(".txt", "")
                    pmcId = pmcId.replace(".pdf", "")

                    data = [pmcId, str(orgNum), srcSeq, trgId, str(trgStart), str(trgEnd), str(score), str(perc)]
                    outfh.write("\t".join(data) + "\n")
            finally:
                f.close()

    outfh.close()
    logger.info("BlastHit output table format is (pmcId, genomeId, seqId, chrom, start, end, score, percentId)")
    logger.info("blastConvert : blast files dropped because of unresolvable species name %d, filesDropped=%d" % (dropFileCount, dropFileCount))
    logger.info("blastConvert : processed %d blast matches, blastMatches=%d" % (lineCount, lineCount))

    logger.info("Now sorting the file with the UNIX sort command")
    # NOTE(review): filenames are interpolated into a shell command unquoted;
    # paths with spaces/metacharacters will break or be unsafe — consider
    # subprocess with shell=False and explicit argument lists
    cmdLine = "sort -n %s | uniq > %s" % (tempFilename, outFilename)
    logger.info(cmdLine)
    ret = os.system(cmdLine)
    if ret == 0:
        logger.info("Sorting finished, no error")
    else:
        logger.info("Error occured while sorting")
# NOTE(review): this is a second, byte-equivalent definition of
# convertBlastFiles — at import time it silently shadows the earlier one.
# One of the two copies should be removed.
def convertBlastFiles(blastDirs, genomeToTaxFile, tempFilename, outFilename, fileFormat):
    """Collect all alignment files from blastDirs (and their subdirs), map each
    file's organism to a genome id via genomeToTaxFile, and write one
    tab-separated hit line per alignment to tempFilename; finally sort/uniq the
    result into outFilename with the UNIX sort command.

    Output columns: pmcId, genomeId, seqId, chrom, start, end, score, percentId.
    fileFormat selects the parser: "blast" (*.blast), "blat" (*.psl) or
    "bwa" (*.sam). Raises ValueError on any other value.
    """
    outfh = maxbio.openFile(tempFilename, "w")

    # read genome -> taxid map; genome names are normalized to lowercase_with_underscores
    if genomeToTaxFile:
        orgToNum = {}
        logger.info("Reading genome -> number table from %s" % genomeToTaxFile)
        logger.info("Expected input fields are: (GenomeName, other field,..., other field, genomeId)")
        for l in open(genomeToTaxFile):
            if l.startswith("#"):
                continue
            fs = l.strip().split("\t")
            genome = fs[0]
            num = int(fs[-1])
            genome = genome.lower().replace(" ", "_")
            orgToNum[genome] = num
    else:
        logger.info("No genome map specified, results are not genome-based")
        orgToNum = None

    dropFileCount = 0
    lineCount = 0
    finishedTaxons = set()
    lastDir = None

    # map file format to the extension we search for
    extensions = {"blast": ".blast", "blat": ".psl", "bwa": ".sam"}
    if fileFormat not in extensions:
        # was assert(False): raise instead so the check survives `python -O`
        raise ValueError("illegal fileFormat parameter: %r" % fileFormat)
    ext = extensions[fileFormat]

    for blastDir in blastDirs:
        logger.info("Searching for files with extension %s in directory %s" % (ext, blastDir))
        files = list(util.findSubdirFiles(blastDir, ext))
        logger.info("Found %d files" % len(files))

        for fname in files:
            # resolve the organism (parent directory name) to a taxon id,
            # drop the file if that is not possible
            dirname = os.path.dirname(fname)
            org = os.path.basename(dirname).lower()
            if orgToNum is not None:
                if org in orgToNum:
                    orgNum = orgToNum[org]
                else:
                    # fallback: search for any known genome name inside the filename
                    found = False
                    for dbOrg in orgToNum:
                        if dbOrg.replace(" ", "_").lower() in fname.lower():
                            orgNum = orgToNum[dbOrg]
                            logger.info("Found orgName %s in filename %s, using organism id %s" % (dbOrg, fname, str(orgNum)))
                            found = True
                            break
                    if not found:
                        logger.warning("warning: could not resolve filename %s to taxid, dropping this file (recognized organism %s)" % (fname, org))
                        dropFileCount += 1
                        continue
            else:
                orgNum = -1

            # check if not already processed AND in different directory
            # (blast creates several indices per directory), skip if yes
            #if orgNum in finishedTaxons and dirname != lastDir:
            #    print("warning: already processed this taxon id %d, skipping input file %s)" % (orgNum, fname))
            #    continue
            finishedTaxons.add(orgNum)
            lastDir = dirname

            if orgNum != -1:
                logger.info("Reading %s, genomeID %d" % (fname, orgNum))
            else:
                logger.info("Reading %s, not linked to any genome id" % (fname))

            if fileFormat == "bwa":
                tp = maxTables.TableParser(fileType="sam")

            # convert lines; the original leaked this handle, close it in a finally
            f = open(fname, "r")
            try:
                for l in f:
                    lineCount += 1
                    fs = l.strip().split("\t")
                    if fileFormat == "blast":
                        # example:
                        # 11495631 chr1 100.00 23 0 0 1 23 25500772 25500750 2e-05 46.1
                        srcId, trgId, perc, length, dummy, dummy, dummy, length, trgStart, trgEnd, eVal, score = fs
                    elif fileFormat == "blat":
                        # psl format, field list documented at
                        # http://genome.ucsc.edu/FAQ/FAQformat.html#format2
                        # example:
                        # 23 0 0 0 0 0 0 0 + 11075971|1 2299 2248 2271 scaffold_281 111378 17336 17359 1 23, 2248, 17336,
                        matches, misMatches, repMatches, nCount, qNumInsert, qBaseInsert, \
                            tNumInsert, tBaseInsert, strand, qName, qSize, qStart, qEnd, \
                            tName, tSize, tStart, tEnd, blockCount, blockSizes, qStarts, tStarts = fs
                        score = matches
                        # percent identity over aligned bases; repeat matches count as matches
                        perc = "%2.2f" % ((float(matches) + float(repMatches)) /
                                          (float(matches) + float(misMatches) + float(repMatches)) * 100.0)
                        trgId = tName
                        srcId = qName
                        trgStart = tStart
                        trgEnd = tEnd
                    else:  # fileFormat == "bwa", guaranteed by the check above
                        if fs[0].startswith("@"):
                            # sam header line
                            continue
                        if len(fs) == 11:
                            # pad the optional sam field so the parser sees 12 columns
                            fs.append("")
                        row = tp.parseTuple(fs)
                        bedTuple = maxTables.samToBed(row)  # renamed: `tuple` shadowed the builtin
                        if bedTuple is None:
                            continue
                        chrom, start, end, name, score, strand = bedTuple
                        srcId = name
                        trgId = chrom
                        perc = "0"
                        length = start - end
                        trgStart, trgEnd = start, end

                    # normalize coordinates so that start <= end
                    trgEnd = int(trgEnd)
                    trgStart = int(trgStart)
                    if trgEnd < trgStart:
                        trgStart, trgEnd = trgEnd, trgStart

                    # query ids look like "PMC<id>[.txt|.pdf]|<seqNo>" — TODO confirm
                    fs = srcId.split("|")
                    srcId = fs[0]
                    srcSeq = fs[1]
                    pmcId = srcId.replace("PMC", "")
                    pmcId = pmcId.replace(".txt", "")
                    pmcId = pmcId.replace(".pdf", "")

                    data = [pmcId, str(orgNum), srcSeq, trgId, str(trgStart), str(trgEnd), str(score), str(perc)]
                    outfh.write("\t".join(data) + "\n")
            finally:
                f.close()

    outfh.close()
    logger.info("BlastHit output table format is (pmcId, genomeId, seqId, chrom, start, end, score, percentId)")
    logger.info("blastConvert : blast files dropped because of unresolvable species name %d, filesDropped=%d" % (dropFileCount, dropFileCount))
    logger.info("blastConvert : processed %d blast matches, blastMatches=%d" % (lineCount, lineCount))

    logger.info("Now sorting the file with the UNIX sort command")
    # NOTE(review): filenames are interpolated into a shell command unquoted;
    # paths with spaces/metacharacters will break or be unsafe — consider
    # subprocess with shell=False and explicit argument lists
    cmdLine = "sort -n %s | uniq > %s" % (tempFilename, outFilename)
    logger.info(cmdLine)
    ret = os.system(cmdLine)
    if ret == 0:
        logger.info("Sorting finished, no error")
    else:
        logger.info("Error occured while sorting")