예제 #1
0
 def load(self, fname):
     """Load the pickled log-probability table from fname.

     Sets self.name to the file's basename without extension and
     self.logProbs to the unpickled object.
     """
     self.name = splitext(basename(fname))[0]
     fileObj = openFile(fname)
     logging.info("reading dimers from file...")
     # Disabling the GC speeds up unpickling of large object graphs.
     # try/finally guarantees the GC is re-enabled and the file handle is
     # closed even if unpickling raises (the original leaked both).
     gc.disable()
     try:
         self.logProbs = cPickle.load(fileObj)
     finally:
         gc.enable()
         fileObj.close()
예제 #2
0
 def load(self, fname):
     """Load the pickled log-probability table from fname.

     Sets self.name to the file's basename without extension and
     self.logProbs to the unpickled object.
     """
     self.name = splitext(basename(fname))[0]
     fileObj = openFile(fname)
     logging.info("reading dimers from file...")
     # Disabling the GC speeds up unpickling of large object graphs.
     # try/finally guarantees the GC is re-enabled and the file handle is
     # closed even if unpickling raises (the original leaked both).
     gc.disable()
     try:
         self.logProbs = cPickle.load(fileObj)
     finally:
         gc.enable()
         fileObj.close()
예제 #3
0
 def save(self, fname):
     """Pickle self.logProbs to fname.

     Opens fname for writing via openFile, dumps the table, and closes the
     file so buffered data is actually flushed to disk.
     """
     fileObj = openFile(fname, "w")
     logging.info("writing dimers to file...")
     # Disabling the GC speeds up pickling of large object graphs.
     # try/finally guarantees the GC is re-enabled and the file is closed
     # (the original never closed it, so the pickle might never be flushed).
     gc.disable()
     try:
         cPickle.dump(self.logProbs, fileObj)
     finally:
         gc.enable()
         fileObj.close()
예제 #4
0
def convertBlastFiles(blastDirs, genomeToTaxFile, tempFilename, outFilename, fileFormat):
    """Collect alignment files from blastDirs (and subdirs), map each file to a
    numeric genome/taxon id via genomeToTaxFile, and write one tab-separated
    hit per line to tempFilename; finally sort/uniq the result into
    outFilename with the UNIX sort command.

    Output columns: pmcId, genomeId, seqId, chrom, start, end, score, percentId.
    fileFormat selects parser and extension: "blast" (*.blast), "blat" (*.psl)
    or "bwa" (*.sam). If genomeToTaxFile is falsy, every hit gets genomeId -1.
    """

    outfh               = maxbio.openFile(tempFilename, "w")

    # read genome -> taxid map: tab-separated, first field is the genome name,
    # last field the numeric id; '#' lines are comments
    if genomeToTaxFile:
        orgToNum = {}
        logger.info( "Reading genome -> number table from %s" % genomeToTaxFile)
        logger.info( "Expected input fields are: (GenomeName, other field,..., other field, genomeId)")

        for l in open(genomeToTaxFile):
            if l.startswith("#"):
                continue
            fs = l.strip().split("\t")
            genome = fs[0]
            num = fs[-1]
            num = int(num)
            # normalize names to lowercase_with_underscores for lookup
            genome=genome.lower().replace(" ", "_")
            orgToNum[genome]=num
    else:
        logger.info( "No genome map specified, results are not genome-based")
        orgToNum=None

    dropFileCount=0   # files skipped because the organism could not be resolved
    lineCount=0       # total alignment lines read
    finishedTaxons = set()
    lastDir = None

    # pick the file extension to search for, based on the requested format
    if fileFormat=="blast":
        ext = ".blast"
    elif fileFormat=="blat":
        ext = ".psl"
    elif fileFormat=="bwa":
        ext = ".sam"
    else:
        # NOTE(review): assert is stripped under "python -O"; an explicit
        # exception would validate fileFormat more safely
        assert(False) # wrong file format parameter

    for blastDir in blastDirs:
        logger.info("Searching for files with extension %s in directory %s" % (ext, blastDir)) 
        files = list(util.findSubdirFiles(blastDir, ext))
        logger.info("Found %d files" % len(files))

        # convert blast files
        files = list (files)
        for fname in files:
            # convert organism to taxid, skip if not possible; the organism
            # name is taken from the directory that contains the file
            dirname = os.path.dirname(fname)
            org = os.path.basename(dirname)
            org = org.lower()

            if orgToNum!=None:
                if org in orgToNum:
                    orgNum = orgToNum[org]
                else:
                    # try to find any organism from genome list in filename
                    found = False
                    for dbOrg in orgToNum:
                        if dbOrg.replace(" ", "_").lower() in fname.lower():
                            orgNum = orgToNum[dbOrg]
                            logger.info("Found orgName %s in filename %s, using organism id %s" % (dbOrg, fname, str(orgNum)))
                            found=True
                            break

                    if not found:
                        logger.warn("warning: could not resolve filename %s to taxid, dropping this file (recognized organism %s)" % (fname, org))
                        dropFileCount+=1
                        continue
            else:
                orgNum=-1

            # check if not already processed AND in different directory (blast creates several indices per directory), skip if yes
            # NOTE(review): the skip below is disabled, so finishedTaxons and
            # lastDir are only bookkeeping at the moment
            #if orgNum in finishedTaxons and dirname!=lastDir:
                #print("warning: already processed this taxon id %d, skipping input file %s)" % (orgNum, fname))
                #continue
            finishedTaxons.add(orgNum)
            lastDir = dirname

            # convert lines
            # NOTE(review): f is never explicitly closed; relies on GC
            f = open(fname, "r")
            #print "Reading %s, writing hits to %s"%(fname,outFile)
            if orgNum!=-1:
                logger.info( "Reading %s, genomeID %d"%(fname, orgNum))
            else:
                logger.info( "Reading %s, not linked to any genome id" % (fname))

            if fileFormat=="bwa":
                tp = maxTables.TableParser(fileType="sam")

            for l in f:
                lineCount+=1
                # parse blast line
                fs = l.strip().split("\t")
                if fileFormat=="blast":
                    # example
                    # 11495631        chr1    100.00  23      0       0       1       23      25500772        25500750        2e-05   46.1
                    # duplicate targets (dummy, length) intentionally overwrite
                    # each other: those columns are not needed
                    srcId, trgId, perc, length, dummy, dummy, dummy, length, trgStart, trgEnd, eVal, score = fs
                elif fileFormat=="blat":
                    # psl-format from http://genome.ucsc.edu/FAQ/FAQformat.html#format2
                    # matches - Number of bases that match that aren't repeats
                    # misMatches - Number of bases that don't match
                    # repMatches - Number of bases that match but are part of repeats
                    # nCount - Number of 'N' bases
                    # qNumInsert - Number of inserts in query
                    # qBaseInsert - Number of bases inserted in query
                    # tNumInsert - Number of inserts in target
                    # tBaseInsert - Number of bases inserted in target
                    # strand - '+' or '-' for query strand. For translated alignments, second '+'or '-' is for genomic strand
                    # qName - Query sequence name
                    # qSize - Query sequence size
                    # qStart - Alignment start position in query
                    # qEnd - Alignment end position in query
                    # tName - Target sequence name
                    # tSize - Target sequence size
                    # tStart - Alignment start position in target
                    # tEnd - Alignment end position in target
                    # blockCount - Number of blocks in the alignment (a block contains no gaps)
                    # blockSizes - Comma-separated list of sizes of each block
                    # qStarts - Comma-separated list of starting positions of each block in query
                    # tStarts - Comma-separated list of starting positions of each block in target
                    # 23      0       0       0       0       0       0       0       +       11075971|1      2299    2248    2271    scaffold_281    111378  17336   17359   1       23,     2248,   17336,
                    matches, misMatches, repMatches, nCount, qNumInsert, qBaseInsert, tNumInsert, tBaseInsert, strand, qName, qSize, qStart, qEnd, tName, tSize, tStart, tEnd, blockCount, blockSizes, qStarts, tStarts = fs

                    score = matches
                    # percent identity over all aligned non-N bases
                    perc = "%2.2f" % ((float(matches)+float(repMatches)) / (float(matches)+float(misMatches)+float(repMatches)) * 100.0)

                    trgId = tName
                    srcId = qName
                    trgStart = tStart
                    trgEnd   = tEnd

                elif fileFormat=="bwa":
                    # skip SAM header lines
                    if fs[0].startswith("@"):
                        continue
                    # pad the optional 12th SAM column so the parser always
                    # receives a full tuple
                    if len(fs)==11:
                        fs.append("")
                    row = tp.parseTuple(fs)
                    tuple = maxTables.samToBed(row)
                    if tuple==None:
                        continue

                    chrom, start, end, name, score, strand = tuple
                    srcId = name
                    trgId = chrom
                    perc = "0"
                    # NOTE(review): start-end looks inverted (negative for
                    # forward hits); length is unused downstream here, so it
                    # has no effect -- confirm before relying on it
                    length = start-end
                    trgStart, trgEnd=start, end
                    
                else:
                    assert(False) # file format not found?


                # normalize coordinates: integers with start <= end
                trgEnd = int(trgEnd)
                trgStart = int(trgStart)
                if trgEnd < trgStart:
                    trgStart, trgEnd = trgEnd, trgStart


                # query ids are expected to look like "<docId>|<seqId>";
                # raises IndexError if there is no "|" -- presumably
                # guaranteed by the upstream pipeline
                fs = srcId.split("|")
                srcId = fs[0]
                srcSeq = fs[1]
                # strip "PMC" prefix and file extensions to get the bare id
                pmcId = srcId.replace("PMC", "")
                pmcId = pmcId.replace(".txt", "")
                pmcId = pmcId.replace(".pdf", "")
                data = [pmcId, str(orgNum), srcSeq, trgId, str(trgStart), str(trgEnd), str(score), str(perc)]
                outfh.write("\t".join(data)+"\n")

    outfh.close()

    logger.info("BlastHit output table format is (pmcId, genomeId, seqId, chrom, start, end, score, percentId)")
    logger.info("blastConvert : blast files dropped because of unresolvable species name %d, filesDropped=%d" % (dropFileCount, dropFileCount))
    logger.info("blastConvert : processed %d blast matches, blastMatches=%d" % (lineCount, lineCount))
    logger.info("Now sorting the file with the UNIX sort command")

    # NOTE(review): filenames are interpolated into a shell command line;
    # unsafe if tempFilename/outFilename may contain shell metacharacters --
    # confirm they are program-controlled
    cmdLine = "sort -n %s | uniq > %s" % (tempFilename, outFilename)
    logger.info(cmdLine)
    ret = os.system(cmdLine)

    if ret==0:
        logger.info("Sorting finished, no error")
    else:
        logger.info("Error occured while sorting")
예제 #5
0
 def save(self, fname):
     """Pickle self.logProbs to fname.

     Opens fname for writing via openFile, dumps the table, and closes the
     file so buffered data is actually flushed to disk.
     """
     fileObj = openFile(fname, "w")
     logging.info("writing dimers to file...")
     # Disabling the GC speeds up pickling of large object graphs.
     # try/finally guarantees the GC is re-enabled and the file is closed
     # (the original never closed it, so the pickle might never be flushed).
     gc.disable()
     try:
         cPickle.dump(self.logProbs, fileObj)
     finally:
         gc.enable()
         fileObj.close()
예제 #6
0
def convertBlastFiles(blastDirs, genomeToTaxFile, tempFilename, outFilename,
                      fileFormat):
    """Collect alignment files from blastDirs (and subdirs), map each file to a
    numeric genome/taxon id via genomeToTaxFile, and write one tab-separated
    hit per line to tempFilename; finally sort/uniq the result into
    outFilename with the UNIX sort command.

    Output columns: pmcId, genomeId, seqId, chrom, start, end, score, percentId.
    fileFormat selects parser and extension: "blast" (*.blast), "blat" (*.psl)
    or "bwa" (*.sam). If genomeToTaxFile is falsy, every hit gets genomeId -1.
    """

    outfh = maxbio.openFile(tempFilename, "w")

    # read genome -> taxid map: tab-separated, first field is the genome name,
    # last field the numeric id; '#' lines are comments
    if genomeToTaxFile:
        orgToNum = {}
        logger.info("Reading genome -> number table from %s" % genomeToTaxFile)
        logger.info(
            "Expected input fields are: (GenomeName, other field,..., other field, genomeId)"
        )

        for l in open(genomeToTaxFile):
            if l.startswith("#"):
                continue
            fs = l.strip().split("\t")
            genome = fs[0]
            num = fs[-1]
            num = int(num)
            # normalize names to lowercase_with_underscores for lookup
            genome = genome.lower().replace(" ", "_")
            orgToNum[genome] = num
    else:
        logger.info("No genome map specified, results are not genome-based")
        orgToNum = None

    dropFileCount = 0  # files skipped because the organism could not be resolved
    lineCount = 0  # total alignment lines read
    finishedTaxons = set()
    lastDir = None

    # pick the file extension to search for, based on the requested format
    if fileFormat == "blast":
        ext = ".blast"
    elif fileFormat == "blat":
        ext = ".psl"
    elif fileFormat == "bwa":
        ext = ".sam"
    else:
        # NOTE(review): assert is stripped under "python -O"; an explicit
        # exception would validate fileFormat more safely
        assert (False)  # wrong file format parameter

    for blastDir in blastDirs:
        logger.info("Searching for files with extension %s in directory %s" %
                    (ext, blastDir))
        files = list(util.findSubdirFiles(blastDir, ext))
        logger.info("Found %d files" % len(files))

        # convert blast files
        files = list(files)
        for fname in files:
            # convert organism to taxid, skip if not possible; the organism
            # name is taken from the directory that contains the file
            dirname = os.path.dirname(fname)
            org = os.path.basename(dirname)
            org = org.lower()

            if orgToNum != None:
                if org in orgToNum:
                    orgNum = orgToNum[org]
                else:
                    # try to find any organism from genome list in filename
                    found = False
                    for dbOrg in orgToNum:
                        if dbOrg.replace(" ", "_").lower() in fname.lower():
                            orgNum = orgToNum[dbOrg]
                            logger.info(
                                "Found orgName %s in filename %s, using organism id %s"
                                % (dbOrg, fname, str(orgNum)))
                            found = True
                            break

                    if not found:
                        logger.warn(
                            "warning: could not resolve filename %s to taxid, dropping this file (recognized organism %s)"
                            % (fname, org))
                        dropFileCount += 1
                        continue
            else:
                orgNum = -1

            # check if not already processed AND in different directory (blast creates several indices per directory), skip if yes
            # NOTE(review): the skip below is disabled, so finishedTaxons and
            # lastDir are only bookkeeping at the moment
            #if orgNum in finishedTaxons and dirname!=lastDir:
            #print("warning: already processed this taxon id %d, skipping input file %s)" % (orgNum, fname))
            #continue
            finishedTaxons.add(orgNum)
            lastDir = dirname

            # convert lines
            # NOTE(review): f is never explicitly closed; relies on GC
            f = open(fname, "r")
            #print "Reading %s, writing hits to %s"%(fname,outFile)
            if orgNum != -1:
                logger.info("Reading %s, genomeID %d" % (fname, orgNum))
            else:
                logger.info("Reading %s, not linked to any genome id" %
                            (fname))

            if fileFormat == "bwa":
                tp = maxTables.TableParser(fileType="sam")

            for l in f:
                lineCount += 1
                # parse blast line
                fs = l.strip().split("\t")
                if fileFormat == "blast":
                    # example
                    # 11495631        chr1    100.00  23      0       0       1       23      25500772        25500750        2e-05   46.1
                    # duplicate targets (dummy, length) intentionally overwrite
                    # each other: those columns are not needed
                    srcId, trgId, perc, length, dummy, dummy, dummy, length, trgStart, trgEnd, eVal, score = fs
                elif fileFormat == "blat":
                    # psl-format from http://genome.ucsc.edu/FAQ/FAQformat.html#format2
                    # matches - Number of bases that match that aren't repeats
                    # misMatches - Number of bases that don't match
                    # repMatches - Number of bases that match but are part of repeats
                    # nCount - Number of 'N' bases
                    # qNumInsert - Number of inserts in query
                    # qBaseInsert - Number of bases inserted in query
                    # tNumInsert - Number of inserts in target
                    # tBaseInsert - Number of bases inserted in target
                    # strand - '+' or '-' for query strand. For translated alignments, second '+'or '-' is for genomic strand
                    # qName - Query sequence name
                    # qSize - Query sequence size
                    # qStart - Alignment start position in query
                    # qEnd - Alignment end position in query
                    # tName - Target sequence name
                    # tSize - Target sequence size
                    # tStart - Alignment start position in target
                    # tEnd - Alignment end position in target
                    # blockCount - Number of blocks in the alignment (a block contains no gaps)
                    # blockSizes - Comma-separated list of sizes of each block
                    # qStarts - Comma-separated list of starting positions of each block in query
                    # tStarts - Comma-separated list of starting positions of each block in target
                    # 23      0       0       0       0       0       0       0       +       11075971|1      2299    2248    2271    scaffold_281    111378  17336   17359   1       23,     2248,   17336,
                    matches, misMatches, repMatches, nCount, qNumInsert, qBaseInsert, tNumInsert, tBaseInsert, strand, qName, qSize, qStart, qEnd, tName, tSize, tStart, tEnd, blockCount, blockSizes, qStarts, tStarts = fs

                    score = matches
                    # percent identity over all aligned non-N bases
                    perc = "%2.2f" % ((float(matches) + float(repMatches)) /
                                      (float(matches) + float(misMatches) +
                                       float(repMatches)) * 100.0)

                    trgId = tName
                    srcId = qName
                    trgStart = tStart
                    trgEnd = tEnd

                elif fileFormat == "bwa":
                    # skip SAM header lines
                    if fs[0].startswith("@"):
                        continue
                    # pad the optional 12th SAM column so the parser always
                    # receives a full tuple
                    if len(fs) == 11:
                        fs.append("")
                    row = tp.parseTuple(fs)
                    tuple = maxTables.samToBed(row)
                    if tuple == None:
                        continue

                    chrom, start, end, name, score, strand = tuple
                    srcId = name
                    trgId = chrom
                    perc = "0"
                    # NOTE(review): start-end looks inverted (negative for
                    # forward hits); length is unused downstream here, so it
                    # has no effect -- confirm before relying on it
                    length = start - end
                    trgStart, trgEnd = start, end

                else:
                    assert (False)  # file format not found?

                # normalize coordinates: integers with start <= end
                trgEnd = int(trgEnd)
                trgStart = int(trgStart)
                if trgEnd < trgStart:
                    trgStart, trgEnd = trgEnd, trgStart

                # query ids are expected to look like "<docId>|<seqId>";
                # raises IndexError if there is no "|" -- presumably
                # guaranteed by the upstream pipeline
                fs = srcId.split("|")
                srcId = fs[0]
                srcSeq = fs[1]
                # strip "PMC" prefix and file extensions to get the bare id
                pmcId = srcId.replace("PMC", "")
                pmcId = pmcId.replace(".txt", "")
                pmcId = pmcId.replace(".pdf", "")
                data = [
                    pmcId,
                    str(orgNum), srcSeq, trgId,
                    str(trgStart),
                    str(trgEnd),
                    str(score),
                    str(perc)
                ]
                outfh.write("\t".join(data) + "\n")

    outfh.close()

    logger.info(
        "BlastHit output table format is (pmcId, genomeId, seqId, chrom, start, end, score, percentId)"
    )
    logger.info(
        "blastConvert : blast files dropped because of unresolvable species name %d, filesDropped=%d"
        % (dropFileCount, dropFileCount))
    logger.info("blastConvert : processed %d blast matches, blastMatches=%d" %
                (lineCount, lineCount))
    logger.info("Now sorting the file with the UNIX sort command")

    # NOTE(review): filenames are interpolated into a shell command line;
    # unsafe if tempFilename/outFilename may contain shell metacharacters --
    # confirm they are program-controlled
    cmdLine = "sort -n %s | uniq > %s" % (tempFilename, outFilename)
    logger.info(cmdLine)
    ret = os.system(cmdLine)

    if ret == 0:
        logger.info("Sorting finished, no error")
    else:
        logger.info("Error occured while sorting")