Exemplo n.º 1
0
    def parseFasta(self, fasta_f, fasta_type):
        """Load every record of a FASTA file into this object as a blob.

        Args:
            fasta_f: Path of the FASTA file; its absolute form is stored in
                ``self.assembly_f``.
            fasta_type: Falsy when no coverage should be taken from the
                headers. Otherwise it is used as the key of the coverage
                library registered in ``self.covLibs`` and is passed to
                ``BtIO.parseCovFromHeader`` together with each blob name.

        Side effects:
            Updates ``self.seqs``, ``self.length``, ``self.n_count``,
            ``self.order_of_blobs`` and ``self.dict_of_blobs``. A repeated
            sequence name is reported through ``BtLog.error('5', ...)``; an
            input without sequences (or of total length 0) through
            ``BtLog.error('1')``.
        """
        # print(...) with a single parenthesised expression behaves exactly
        # like the former print statement on Python 2 and is valid Python 3.
        print(BtLog.status_d['1'] % ('FASTA', fasta_f))
        self.assembly_f = abspath(fasta_f)
        if fasta_type:
            # Set up CovLibObj for coverage in assembly header
            self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)

        for name, seq in BtIO.readFasta(fasta_f):
            blObj = BlObj(name, seq)
            if blObj.name in self.dict_of_blobs:
                # Duplicate header: report it and move on to the next record.
                BtLog.error('5', blObj.name)
                continue

            self.seqs += 1
            self.length += blObj.length
            self.n_count += blObj.n_count

            if fasta_type:
                cov = BtIO.parseCovFromHeader(fasta_type, blObj.name)
                self.covLibs[fasta_type].cov_sum += cov
                blObj.addCov(fasta_type, cov)

            self.order_of_blobs.append(blObj.name)
            self.dict_of_blobs[blObj.name] = blObj

        if self.seqs == 0 or self.length == 0:
            BtLog.error('1')
Exemplo n.º 2
0
    def parseFasta(self, fasta_f, fasta_type):
        """Read a FASTA file and register each sequence as a blob.

        Args:
            fasta_f: Path of the FASTA file. ``self.assembly_f`` is set to its
                absolute path.
            fasta_type: When truthy, a ``CovLibObj`` is stored under this key
                in ``self.covLibs`` and a coverage value is obtained for each
                blob via ``BtIO.parseCovFromHeader(fasta_type, name)``.

        Side effects:
            Increments ``self.seqs``, ``self.length`` and ``self.n_count`` and
            fills ``self.order_of_blobs`` / ``self.dict_of_blobs``. Calls
            ``BtLog.error('5', name)`` for a name that was already seen and
            ``BtLog.error('1')`` when nothing usable was read.
        """
        # A parenthesised single-argument print works identically under the
        # Python 2 print statement and the Python 3 print function.
        print(BtLog.status_d['1'] % ('FASTA', fasta_f))
        self.assembly_f = abspath(fasta_f)
        if fasta_type:
            # Set up CovLibObj for coverage in assembly header
            self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type,
                                                 fasta_f)

        for name, seq in BtIO.readFasta(fasta_f):
            blObj = BlObj(name, seq)
            if blObj.name not in self.dict_of_blobs:
                self.seqs += 1
                self.length += blObj.length
                self.n_count += blObj.n_count

                if fasta_type:
                    cov = BtIO.parseCovFromHeader(fasta_type, blObj.name)
                    self.covLibs[fasta_type].cov_sum += cov
                    blObj.addCov(fasta_type, cov)

                self.order_of_blobs.append(blObj.name)
                self.dict_of_blobs[blObj.name] = blObj
            else:
                # Same sequence name encountered twice.
                BtLog.error('5', blObj.name)

        if self.seqs == 0 or self.length == 0:
            BtLog.error('1')
Exemplo n.º 3
0
 def __init__(self, name='experimental', view_dir=''):
     """Create an empty view and make sure its output directory exists.

     Args:
         name: Label stored on the instance as ``self.name``.
         view_dir: Directory path stored as ``self.view_dir`` and handed to
             ``BtIO.create_dir``.
     """
     self.name, self.view_dir = name, view_dir
     # List-valued attributes start out empty.
     self.length, self.gc, self.names = [], [], []
     # Dict-valued attributes start out empty.
     self.tax, self.covs, self.meta = {}, {}, {}
     BtIO.create_dir(self.view_dir)
Exemplo n.º 4
0
def main():
    """Command-line entry point: filter a BAM file by sequence list.

    Reads the docopt arguments, validates ``--read_format`` (must be ``fq``
    or ``fa``), and calls ``BtIO.parseBamForFilter`` once with either an
    include list, an exclude list, or neither. Supplying both ``--include``
    and ``--exclude`` is reported through ``BtLog.error('43')`` and no
    filtering is started.
    """
    args = docopt(__doc__)
    bam_f = args['--bam']
    include_f = args['--include']
    exclude_f = args['--exclude']
    out_prefix = args['--out']
    read_format = args['--read_format']
    if read_format not in {'fq', 'fa'}:
        sys.exit("[X] Read format must be fq or fa!")
    noninterleaved = args['--noninterleaved']
    # Unmapped reads are kept unless the user explicitly excludes them.
    include_unmapped = not args['--exclude_unmapped']
    out_f = BtIO.getOutFile(bam_f, out_prefix, None)

    if include_f and exclude_f:
        print(BtLog.error('43'))
        return

    # At most one of the two list files is set from here on.
    include_list = BtIO.parseList(include_f) if include_f else None
    exclude_list = BtIO.parseList(exclude_f) if exclude_f else None
    BtIO.parseBamForFilter(bam_f, include_unmapped, noninterleaved, out_f,
                           include_list, exclude_list, read_format)
0
    def mapping():
        """Annotate a similarity-search hit file with TaxIDs from a mapping file.

        The hit file is taken from the enclosing-scope name
        ``megablast_output``; the mapping file path and all column indices
        are fixed inside this function. For every line of the hit file a row
        ``query ID <TAB> TaxID <TAB> score <TAB> subject ID`` is collected,
        using ``"N/A"`` as TaxID when the subject ID is missing from the
        mapping. The rows are written to the path returned by
        ``BtIO.getOutFile("taxified", hit_f, "out")`` if there is at least
        one row.
        """
        out_f, taxid_d = None, {}
        hit_f = megablast_output  # hit file: BLAST similarity search result (TSV format)
        map_f = "/home/nancy/assembly_app/blobtools/blobtools-master/taxon_n"  # mapping file (TSV format), in which one column lists a sequence ID (of a subject) and another the NCBI TaxID
        map_col_sseqid = "0"  # column of mapping file containing sequence IDs (of the subject)
        map_col_taxid = "2"  # column of mapping file containing the TaxID of the subject
        hit_col_qseqid = "0"  # column of the hit file containing query ID
        hit_col_sseqid = "1"  # column of the hit file containing subject ID
        hit_col_score = "11"  # column of the hit file containing (bit)score

        try:
            hit_col_qseqid = int(hit_col_qseqid)
            hit_col_sseqid = int(hit_col_sseqid)
            hit_col_score = int(hit_col_score)
        except ValueError:
            # The option names are passed as a separate argument. The former
            # `'41' % (...)` raised TypeError before BtLog.error was reached.
            # NOTE(review): assumes BtLog.error formats extra arguments into
            # the message, as its multi-argument call sites suggest - confirm.
            BtLog.error(
                '41',
                "--hit_column_qseqid, --hit_column_sseqid and --hit_column_score"
            )

        if map_f:
            if map_col_sseqid and map_col_taxid:
                try:
                    map_col_sseqid = int(map_col_sseqid)
                    map_col_taxid = int(map_col_taxid)
                except ValueError:
                    BtLog.error('44')
                print(BtLog.status_d['1'] % ("Mapping file", map_f))
                taxid_d = BtIO.parseDict(map_f, map_col_sseqid, map_col_taxid)
                out_f = BtIO.getOutFile("taxified", hit_f, "out")
            else:
                BtLog.error('44')
        else:
            BtLog.error('41')

        output = []
        print(BtLog.status_d['1'] % ("similarity search result", hit_f))
        with open(hit_f) as fh:
            for line in fh:
                col = line.rstrip("\n").split()
                qseqid = col[hit_col_qseqid]
                sseqid = col[hit_col_sseqid]
                score = col[hit_col_score]
                if sseqid not in taxid_d:
                    # The warning used to be formatted and then discarded;
                    # it is now actually shown.
                    print(BtLog.warn_d['12'] % (sseqid, map_f))
                tax_id = taxid_d.get(sseqid, "N/A")
                output.append("%s\t%s\t%s\t%s" %
                              (qseqid, tax_id, score, sseqid))
        if output:
            with open(out_f, "w") as fh:
                print(BtLog.status_d['24'] % out_f)
                fh.write("\n".join(output) + "\n")
Exemplo n.º 6
0
 def __init__(self, name='experimental', view_dir='', blobDb=None):
     """Create an empty view bound to a blob database.

     Args:
         name: Label stored as ``self.name``.
         view_dir: Output directory. Every match of the pattern
             ``.blobDB`` is removed before the path is stored and created
             through ``BtIO.create_dir``.
         blobDb: Object stored as ``self.blobDb``. When omitted, each
             instance gets its own new empty dict instead of one dict
             shared by all instances (the previous ``blobDb={}`` default).
     """
     self.name = name
     # NOTE(review): the pattern's dot is unescaped, so it matches any
     # character followed by "blobDB". Kept as-is to preserve behaviour.
     self.view_dir = re.sub(".blobDB", "", view_dir)
     self.length = []
     self.gc = []
     self.n_count = []
     self.names = []
     self.tax = {}
     self.covs = {}
     self.read_covs = defaultdict(list)
     self.tax_scores = {}
     self.blobDb = {} if blobDb is None else blobDb
     self.meta = {}
     BtIO.create_dir(self.view_dir)
Exemplo n.º 7
0
 def _write_output(self):
     """Dump the attributes listed in ``self.files`` as JSON into a tar archive.

     For each key of ``self.files`` that is a "primary" name, the attribute
     of the same name is paired with ``self.files[key]``; for "secondary"
     names, ``self.files[key]`` is iterated and each sub-key selects an item
     of the attribute. Other keys are ignored.

     NOTE(review): as written this block cannot run to completion when
     ``directory`` is truthy - see the notes below. The intended archive
     path is not recoverable from this block, so the code is left untouched.
     """
     # Keys whose value in self.files is used directly as a file name.
     primary = [
         'meta', 'blob_id', 'length', 'gc', 'n_count', 'agct_count',
         'tax_id'
     ]
     # Keys whose value in self.files is iterated for sub-keys.
     secondary = ['cov_base', 'cov_read', 'tax', 'tax_hit']
     directory = BtIO.create_dir(self.path)
     out_fs = []
     if (directory):
         out_fs = []  # repeats the initialisation just above
         strings = []
         for key in self.files:
             if key in primary:
                 # NOTE(review): `out_f` is a local name (it is the loop
                 # target further down) that has no value yet, so this line
                 # raises UnboundLocalError; `out_fs` was presumably meant.
                 out_f.append(self.files[key])
                 # NOTE(review): `data` is not assigned anywhere in this
                 # block; `strings` was presumably meant - confirm.
                 data.append(getattr(self, key))
             elif key in secondary:
                 for key2 in self.files[key]:
                     # NOTE(review): same `out_f` / `data` problem as above.
                     out_f.append(self.files[key][key2])
                     data.append(getattr(self, key)[key2])
             else:
                 pass
         # NOTE(review): `out_f` is still unbound here, so the archive path
         # is undefined. Also, the tarfile documentation lists 'a:gz' as not
         # possible - verify the intended mode.
         with tarfile.open(out_f, "a:gz") as tar:
             for out_f, string in zip(out_fs, strings):
                 # Write one JSON file, then add it to the archive.
                 with open(out_f, 'w') as fh:
                     json.dump(string,
                               fh,
                               indent=1,
                               separators=(',', ' : '))
                 tar.add(out_f)
Exemplo n.º 8
0
 def parse_data(self, key, exp_count):
     """Load the JSON list registered under *key* and store it on ``self``.

     The list is read from ``self.files[key]`` through
     ``BtIO.read_json_list``. It is assigned to the attribute named *key*
     only when it holds exactly *exp_count* entries; a list of any other
     length is dropped without raising or logging.
     """
     loaded = BtIO.read_json_list(self.files[key])
     if len(loaded) == exp_count:
         setattr(self, key, loaded)
Exemplo n.º 9
0
 def parseCovs(self, covLibObjs):
     """Read each coverage library and attach its coverage to the blobs.

     Args:
         covLibObjs: Iterable of coverage-library objects. This method reads
             their ``name``, ``f`` and ``fmt`` attributes and writes
             ``reads_total``, ``reads_mapped``, ``cov_sum`` and ``mean_cov``.

     Supported ``fmt`` values are ``'bam'``, ``'sam'``, ``'cas'`` and
     ``'cov'``; any other format contributes no coverage. For bam/sam the
     per-blob value is the base coverage returned by the reader divided by
     the blob's ``agct_count``. Each library is finally stored in
     ``self.covLibs`` under its name.
     """
     for covLib in covLibObjs:
         self.addCovLib(covLib)
         # Parenthesised single-argument print: same output on Python 2,
         # valid syntax on Python 3.
         print(BtLog.status_d['1'] % (covLib.name, covLib.f))
         if covLib.fmt in ('bam', 'sam'):
             # Both readers take the same arguments and return the same
             # 4-tuple, so only the function differs.
             read_alignment = BtIO.readBam if covLib.fmt == 'bam' else BtIO.readSam
             (base_cov_dict, covLib.reads_total, covLib.reads_mapped,
              read_cov_dict) = read_alignment(covLib.f,
                                              set(self.dict_of_blobs))
             if covLib.reads_total == 0:
                 print(BtLog.warn_d['4'] % covLib.f)
             for name, base_cov in base_cov_dict.items():
                 blob = self.dict_of_blobs[name]
                 cov = base_cov / blob.agct_count
                 covLib.cov_sum += cov
                 blob.addCov(covLib.name, cov)
                 blob.read_cov = {covLib.name: read_cov_dict[name]}
         elif covLib.fmt == 'cas':
             (cov_dict, covLib.reads_total, covLib.reads_mapped,
              read_cov_dict) = BtIO.readCas(covLib.f, self.order_of_blobs)
             if covLib.reads_total == 0:
                 print(BtLog.warn_d['4'] % covLib.f)
             for name, cov in cov_dict.items():
                 blob = self.dict_of_blobs[name]
                 covLib.cov_sum += cov
                 blob.addCov(covLib.name, cov)
                 blob.read_cov = {covLib.name: read_cov_dict[name]}
         elif covLib.fmt == 'cov':
             cov_dict = BtIO.readCov(covLib.f, set(self.dict_of_blobs))
             # Warn when the file does not provide one value per sequence.
             if len(cov_dict) != self.seqs:
                 print(BtLog.warn_d['4'] % covLib.f)
             for name, cov in cov_dict.items():
                 covLib.cov_sum += cov
                 self.dict_of_blobs[name].addCov(covLib.name, cov)
         covLib.mean_cov = covLib.cov_sum / self.seqs
         self.covLibs[covLib.name] = covLib
Exemplo n.º 10
0
 def parse_meta(self, meta_f):
     """Load the metadata file and copy its entries onto this object.

     Args:
         meta_f: Path handed to ``BtIO.read_meta``, whose result is used as
             a mapping.

     Every key/value pair of the metadata becomes an attribute of ``self``.
     The keys ``name``, ``covlib``, ``taxrule``, ``taxlib``, ``files``,
     ``count`` and ``ranks`` are required; ``count`` is additionally exposed
     as ``self.blobs_count``.

     Raises:
         KeyError: If one of the required keys is missing.
     """
     meta = BtIO.read_meta(meta_f)
     for key, value in meta.items():
         # setattr needs the target object; the previous two-argument call
         # raised TypeError on the first entry.
         setattr(self, key, value)
     self.name = meta['name']
     self.covlib = meta['covlib']
     self.taxrule = meta['taxrule']
     self.taxlib = meta['taxlib']
     self.files = meta['files']
     self.blobs_count = meta['count']
     self.ranks = meta['ranks']
Exemplo n.º 11
0
 def parseHits(self, hitLibs):
     """Register hit libraries and attach their hits to the blobs.

     Args:
         hitLibs: Iterable of hit-library objects; their ``name`` and ``f``
             attributes are read.

     Each library is stored in ``self.hitLibs`` under its name. Every hit
     yielded by ``BtIO.readTax`` has its ``taxId`` added to
     ``self.set_of_taxIds`` and is passed to ``addHits`` of the blob named
     by ``hitDict['name']``. When a ``taxId`` contains ``;`` only the part
     before the first ``;`` is kept and a warning is printed.
     """
     for hitLib in hitLibs:
         self.hitLibs[hitLib.name] = hitLib
         print(BtLog.status_d['1'] % (hitLib.name, hitLib.f))
         # only accepts format 'seqID\ttaxID\tscore'
         for hitDict in BtIO.readTax(hitLib.f, set(self.dict_of_blobs)):
             if ";" in hitDict['taxId']:
                 hitDict['taxId'] = hitDict['taxId'].split(";")[0]
                 # Warning templates are looked up in BtLog.warn_d, as at
                 # every other call site; this one used `BtLog.warn`.
                 print(BtLog.warn_d['5'] % (hitDict['name'], hitLib))
             self.set_of_taxIds.add(hitDict['taxId'])
             self.dict_of_blobs[hitDict['name']].addHits(hitLib.name, hitDict)
Exemplo n.º 12
0
def main():
    """Command-line entry point: parse names.dmp / nodes.dmp into a nodesDB.

    Reads ``--names`` and ``--nodes`` from the docopt arguments and calls
    ``BtIO.parseNodesDB`` with ``../data/nodesDB.txt`` (relative to this
    file's directory) as the default database path.
    """
    cli = docopt(__doc__)
    names_path = cli['--names']
    nodes_path = cli['--nodes']

    # Default database location, relative to the directory of this module.
    module_dir = dirname(abspath(__file__))
    default_db = join(module_dir, "../data/nodesDB.txt")

    nodesDB, nodesDB_f = BtIO.parseNodesDB(
        nodes=nodes_path,
        names=names_path,
        nodesDB=None,
        nodesDBdefault=default_db,
    )
Exemplo n.º 13
0
def main():
    """Command-line entry point: parse names.dmp / nodes.dmp into a nodesDB.

    Reads ``--names`` and ``--nodes`` from the docopt arguments and calls
    ``BtIO.parseNodesDB`` with ``nodesDB.txt`` inside ``blobtools.DATADIR``
    as the default database path.
    """
    cli = docopt(__doc__)
    names_path = cli['--names']
    nodes_path = cli['--nodes']

    # Default database location inside the package data directory.
    default_db = join(blobtools.DATADIR, "nodesDB.txt")

    nodesDB, nodesDB_f = BtIO.parseNodesDB(
        nodes=nodes_path,
        names=names_path,
        nodesDB=None,
        nodesDBdefault=default_db,
    )
Exemplo n.º 14
0
def main():
    """Command-line entry point: filter a FASTA file by a list of headers.

    Sequences whose header is in the ``--list`` file are written to the
    output file; with ``--invert`` the sequences NOT in the list are written
    instead. A summary line and, if a written header occurs more than once,
    a duplicate warning are printed.
    """
    # Local import keeps this entry point self-contained.
    from collections import Counter

    args = docopt(__doc__)
    fasta_f = args['--infile']
    list_f = args['--list']
    invert = args['--invert']
    prefix = args['--out']

    output = []
    out_f = BtIO.getOutFile(fasta_f, prefix, "filtered.fna")

    print(BtLog.status_d['1'] % ("list", list_f))
    items = BtIO.parseSet(list_f)
    items_count = len(items)
    print(BtLog.status_d['22'] % fasta_f)
    items_parsed = []
    sequences = 0
    for header, sequence in BtIO.readFasta(fasta_f):
        sequences += 1
        # Keep listed headers normally, unlisted headers when inverting.
        if (header in items) != bool(invert):
            items_parsed.append(header)
            output.append(">%s\n%s\n" % (header, sequence))
        BtLog.progress(len(output), 10, items_count, no_limit=True)
    BtLog.progress(items_count, 10, items_count)

    items_parsed_count = len(items_parsed)
    # Float division: int/int truncates to 0 on Python 2, and an empty FASTA
    # would otherwise raise ZeroDivisionError.
    fraction = float(items_parsed_count) / sequences if sequences else 0.0
    print(BtLog.status_d['23'] % ('{:.2%}'.format(fraction),
                                  "{:,}".format(items_count),
                                  "{:,}".format(items_parsed_count),
                                  "{:,}".format(sequences)))

    # One counting pass instead of list.count() per element (O(n^2)).
    duplicates = [
        header for header, seen in Counter(items_parsed).items() if seen > 1
    ]
    if duplicates:
        print(BtLog.warn_d['8'] % "\n\t\t\t".join(duplicates))

    with open(out_f, "w") as fh:
        print(BtLog.status_d['24'] % out_f)
        fh.write("".join(output))
Exemplo n.º 15
0
def main():
    """Command-line entry point: filter a FASTA file by a list of headers.

    Sequences whose header is in the ``--list`` file are written to the
    output file; with ``--invert`` the sequences NOT in the list are written
    instead. A warning is printed if a written header occurs more than once.
    """
    # Local import keeps this entry point self-contained.
    from collections import Counter

    args = docopt(__doc__)
    fasta_f = args['--infile']
    list_f = args['--list']
    invert = args['--invert']
    prefix = args['--out']

    output = []
    out_f = BtIO.getOutFile(fasta_f, prefix, "filtered.fna")

    print(BtLog.status_d['1'] % ("list", list_f))
    items = BtIO.parseSet(list_f)
    items_count = len(items)
    print(BtLog.status_d['22'] % fasta_f)
    items_parsed = []

    with tqdm(total=items_count, desc="[%] ", ncols=200, unit_scale=True) as pbar:
        for header, sequence in BtIO.readFasta(fasta_f):
            # Keep listed headers normally, unlisted headers when inverting.
            if (header in items) != bool(invert):
                items_parsed.append(header)
                output.append(">%s\n%s\n" % (header, sequence))
                # Advance once per written record. The update used to sit
                # after the loop, so the bar moved a single step in total.
                # NOTE(review): the total is the list size, so progress is
                # counted in written records - confirm this is intended
                # for --invert.
                pbar.update()

    # One counting pass instead of list.count() per element (O(n^2)).
    duplicates = [
        header for header, seen in Counter(items_parsed).items() if seen > 1
    ]
    if duplicates:
        print(BtLog.warn_d['8'] % "\n\t\t\t".join(duplicates))

    with open(out_f, "w") as fh:
        print(BtLog.status_d['24'] % out_f)
        fh.write("".join(output))
Exemplo n.º 16
0
 def load(self, BlobDb_f):
     """Populate this object from a BlobDb JSON file.

     Args:
         BlobDb_f: Path handed to ``BtIO.readJson``, whose result is indexed
             by the keys used below.

     Most entries are copied to the attribute of the same name;
     ``length``, ``seqs`` and ``n_count`` are converted with ``int`` and
     ``set_of_taxIds`` is taken from the keys of ``lineages``.
     """
     blobDict = BtIO.readJson(BlobDb_f)
     for field in ('title', 'assembly_f', 'nodesDB_f', 'lineages'):
         setattr(self, field, blobDict[field])
     self.set_of_taxIds = blobDict['lineages'].keys()
     for field in ('order_of_blobs', 'dict_of_blobs'):
         setattr(self, field, blobDict[field])
     # Counters are stored as integers.
     for field in ('length', 'seqs', 'n_count'):
         setattr(self, field, int(blobDict[field]))
     for field in ('covLibs', 'hitLibs', 'taxrules'):
         setattr(self, field, blobDict[field])
Exemplo n.º 17
0
 def parseCovs(self, covLibObjs):
     """Parse every coverage library and record coverage on the blobs.

     Args:
         covLibObjs: Iterable of coverage-library objects. Their ``name``,
             ``f`` and ``fmt`` attributes are read; ``reads_total``,
             ``reads_mapped``, ``cov_sum`` and ``mean_cov`` are written.

     Formats ``'bam'`` and ``'sam'`` divide the base coverage returned by
     the reader by the blob's ``agct_count``; ``'cas'`` and ``'cov'`` use
     the returned coverage directly. Unknown formats add no coverage. Each
     library ends up in ``self.covLibs`` keyed by its name.
     """
     for covLib in covLibObjs:
         self.addCovLib(covLib)
         # print(...) with one parenthesised argument is equivalent to the
         # Python 2 print statement and also parses on Python 3.
         print(BtLog.status_d['1'] % (covLib.name, covLib.f))
         if covLib.fmt in ('bam', 'sam'):
             # The two readers share signature and return shape.
             reader = BtIO.readBam if covLib.fmt == 'bam' else BtIO.readSam
             parsed = reader(covLib.f, set(self.dict_of_blobs))
             base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = parsed
             if covLib.reads_total == 0:
                 print(BtLog.warn_d['4'] % covLib.f)
             for name, base_cov in base_cov_dict.items():
                 cov = base_cov / self.dict_of_blobs[name].agct_count
                 covLib.cov_sum += cov
                 self.dict_of_blobs[name].addCov(covLib.name, cov)
                 self.dict_of_blobs[name].read_cov = {covLib.name: read_cov_dict[name]}
         elif covLib.fmt == 'cas':
             parsed = BtIO.readCas(covLib.f, self.order_of_blobs)
             cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = parsed
             if covLib.reads_total == 0:
                 print(BtLog.warn_d['4'] % covLib.f)
             for name, cov in cov_dict.items():
                 covLib.cov_sum += cov
                 self.dict_of_blobs[name].addCov(covLib.name, cov)
                 self.dict_of_blobs[name].read_cov = {covLib.name: read_cov_dict[name]}
         elif covLib.fmt == 'cov':
             cov_dict = BtIO.readCov(covLib.f, set(self.dict_of_blobs))
             # Warn when the file does not cover every sequence.
             if len(cov_dict) != self.seqs:
                 print(BtLog.warn_d['4'] % covLib.f)
             for name, cov in cov_dict.items():
                 covLib.cov_sum += cov
                 self.dict_of_blobs[name].addCov(covLib.name, cov)
         covLib.mean_cov = covLib.cov_sum / self.seqs
         self.covLibs[covLib.name] = covLib
Exemplo n.º 18
0
 def load(self, BlobDb_f):
     """Restore this object's state from a BlobDb JSON file.

     Args:
         BlobDb_f: Path passed to ``BtIO.readJson``.

     Each row of the table below names the attribute to set, the JSON key
     to read and the conversion applied to the value; rows are processed in
     order.
     """
     blobDict = BtIO.readJson(BlobDb_f)

     def as_is(value):
         return value

     def key_view(mapping):
         return mapping.keys()

     layout = (
         ('title', 'title', as_is),
         ('assembly_f', 'assembly_f', as_is),
         ('nodesDB_f', 'nodesDB_f', as_is),
         ('lineages', 'lineages', as_is),
         ('set_of_taxIds', 'lineages', key_view),
         ('order_of_blobs', 'order_of_blobs', as_is),
         ('dict_of_blobs', 'dict_of_blobs', as_is),
         ('length', 'length', int),
         ('seqs', 'seqs', int),
         ('n_count', 'n_count', int),
         ('covLibs', 'covLibs', as_is),
         ('hitLibs', 'hitLibs', as_is),
         ('taxrules', 'taxrules', as_is),
     )
     for attribute, json_key, convert in layout:
         setattr(self, attribute, convert(blobDict[json_key]))
Exemplo n.º 19
0
def main():
    """Command-line entry point: filter a BAM file by sequence list.

    Reads the docopt arguments and calls ``BtIO.parseBamForFilter`` once,
    passing an include list, an exclude list, or neither. Supplying both
    ``--include`` and ``--exclude`` is reported through
    ``BtLog.error('43')`` and no filtering is started.
    """
    args = docopt(__doc__)
    bam_f = args['--bam']
    include_f = args['--include']
    exclude_f = args['--exclude']
    out_prefix = args['--out']
    include_unmapped = args['--include_unmapped']
    gzip = None
    do_sort = args['--sort']
    keep_sorted = args['--keep']
    sort_threads = int(args['--threads'])
    out_f = BtIO.getOutFile(bam_f, out_prefix, None)

    if include_f and exclude_f:
        # Function-call form: same output on Python 2, valid on Python 3.
        print(BtLog.error('43'))
        return

    # At most one of the two list files is set from here on.
    include_list = BtIO.parseList(include_f) if include_f else None
    exclude_list = BtIO.parseList(exclude_f) if exclude_f else None
    BtIO.parseBamForFilter(bam_f, include_unmapped, out_f, include_list,
                           exclude_list, gzip, do_sort, keep_sorted,
                           sort_threads)
Exemplo n.º 20
0
 def relabel_and_colour(self, colour_f, user_labels):
     """Assign labels, colours and plot order to every group.

     Args:
         colour_f: Colour file parsed with ``BtIO.parseColourDict``. When
             falsy, colours are generated for the first
             ``self.max_group_plot`` groups (after relabelling).
         user_labels: Mapping from group name to a user-chosen label.

     Per group, in order of precedence:
       * listed in ``self.exclude_groups``: labelled ``'other'``, coloured
         ``WHITE``;
       * present in ``user_labels``: labelled with the user label and its
         own name, coloured by the label, label added to the plot order;
       * present in the colour dict: labelled and coloured by its own
         name, added to the plot order;
       * anything else: folded into ``'other'``.
     Every group additionally receives the label ``'all'``.
     """
     if colour_f:
         colour_dict = BtIO.parseColourDict(colour_f)
     else:
         groups = self.group_order[0:self.max_group_plot]
         colour_groups = [
             user_labels[group] if group in user_labels else group
             for group in groups
         ]
         colour_dict = generateColourDict(colour_groups)
     for group in self.group_order:
         # The exclusion test used to be nested inside
         # `if self.exclude_groups:`, so with any exclusion configured all
         # non-excluded groups skipped the branches below and never got a
         # colour or a place in the plot order.
         if self.exclude_groups and group in self.exclude_groups:
             self.group_labels[group].add('other')
             self.colours[group] = WHITE
         elif group in user_labels:
             label = user_labels[group]
             self.group_labels[group].add(label)
             self.group_labels[group].add(group)
             self.colours[label] = colour_dict[label]
             if label not in self.plot_order:
                 self.plot_order.append(label)
         elif group in colour_dict:
             self.group_labels[group].add(group)
             self.colours[group] = colour_dict[group]
             self.plot_order.append(group)
         else:
             # Covers both former fallbacks (index beyond max_group_plot
             # and the final else), whose bodies were identical.
             self.group_labels[group].add('other')
             self.group_labels[group].add(group)
             self.colours['other'] = WHITE
             self.labels.add('other')
         self.group_labels[group].add('all')
     if 'other' in self.labels:
         self.plot_order.append('other')
Exemplo n.º 21
0
 def relabel_and_colour(self, colour_f, user_labels):
     """Work out label, colour and plot order for each group.

     Args:
         colour_f: Path of a colour file read via ``BtIO.parseColourDict``;
             if falsy, ``generateColourDict`` is called for the first
             ``self.max_group_plot`` groups, using user labels where given.
         user_labels: Mapping of group name to replacement label.

     A group listed in ``self.exclude_groups`` becomes ``'other'`` with
     colour ``WHITE``. Otherwise a user label wins over an entry in the
     colour dict, and groups with neither are collected under ``'other'``.
     All groups also get the label ``'all'``; ``'other'`` is appended to
     the plot order last when it is in ``self.labels``.
     """
     if colour_f:
         colour_dict = BtIO.parseColourDict(colour_f)
     else:
         groups = self.group_order[0:self.max_group_plot]
         colour_groups = [user_labels[group] if group in user_labels else group for group in groups]
         colour_dict = generateColourDict(colour_groups)
     for group in self.group_order:
         labels_of_group = self.group_labels[group]
         # Test the exclusion in one condition. Previously a non-empty
         # exclude list made every non-excluded group skip the remaining
         # branches, leaving it without colour or plot-order entry.
         if self.exclude_groups and group in self.exclude_groups:
             labels_of_group.add('other')
             self.colours[group] = WHITE
         elif group in user_labels:
             label = user_labels[group]
             labels_of_group.add(label)
             labels_of_group.add(group)
             self.colours[label] = colour_dict[label]
             if label not in self.plot_order:
                 self.plot_order.append(label)
         elif group in colour_dict:
             labels_of_group.add(group)
             self.colours[group] = colour_dict[group]
             self.plot_order.append(group)
         else:
             # The former `idx > max_group_plot` branch and the final else
             # did exactly the same thing; they are merged here.
             labels_of_group.add('other')
             labels_of_group.add(group)
             self.colours['other'] = WHITE
             self.labels.add('other')
         labels_of_group.add('all')
     if 'other' in self.labels:
         self.plot_order.append('other')
Exemplo n.º 22
0
def main():
    """Command-line entry point: generate views of a BlobDb.

    Validates the input file, ranks and optional sequence list, loads the
    BlobDb and then builds the requested views: the table (unless
    ``--notable``), the experimental view (unless ``--experimental`` is the
    string ``'False'``), the two CONCOCT views (``--concoct``) and one
    coverage view per coverage library (``--cov``).
    """
    args = docopt(__doc__)
    blobdb_f = args['--input']
    prefix = args['--out']
    ranks = args['--rank']
    taxrule = args['--taxrule']
    hits_flag = args['--hits']
    seq_list_f = args['--list']
    concoct = args['--concoct']
    cov = args['--cov']
    notable = args['--notable']
    experimental = args['--experimental']

    # The BlobDb file has to exist.
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    out_f = BtIO.getOutFile(blobdb_f, prefix, None)

    # 'all' expands to every rank but the last one, in reverse order;
    # otherwise each requested rank must be a known one.
    if 'all' in ranks:
        ranks = RANKS[0:-1][::-1]
    else:
        for rank in ranks:
            if rank not in RANKS:
                BtLog.error('9', rank)

    # Optional list of sequences to restrict the views to.
    seqs = []
    if seq_list_f:
        if not isfile(seq_list_f):
            BtLog.error('0', seq_list_f)
        else:
            seqs = BtIO.parseList(seq_list_f)

    # Load the database.
    blobDb = BtCore.BlobDb('new')
    print(BtLog.status_d['9'] % (blobdb_f))
    blobDb.load(blobdb_f)
    blobDb.version = interface.__version__

    # The taxrule must have been computed when hits are present.
    if blobDb.hitLibs and taxrule not in blobDb.taxrules:
        BtLog.error('11', taxrule, blobDb.taxrules)

    def suffix_for(base):
        # With more than one hit library the taxrule prefixes the suffix.
        if len(blobDb.hitLibs) > 1:
            return "%s.%s" % (taxrule, base)
        return base

    viewObjs = []
    print(BtLog.status_d['14'])

    if not notable:
        viewObjs.append(
            BtCore.ViewObj(name="table",
                           out_f=out_f,
                           suffix=suffix_for("table.txt"),
                           body=[]))

    if experimental != 'False':
        # Metadata is only read when the option names an existing file.
        meta = BtIO.readYaml(experimental) if isfile(experimental) else {}
        viewObjs.append(
            BtCore.ExperimentalViewObj(name="experimental",
                                       view_dir=out_f,
                                       blobDb=blobDb,
                                       meta=meta))

    if concoct:
        viewObjs.append(
            BtCore.ViewObj(name="concoct_tax",
                           out_f=out_f,
                           suffix=suffix_for("concoct_taxonomy_info.csv"),
                           body=dict()))
        viewObjs.append(
            BtCore.ViewObj(name="concoct_cov",
                           out_f=out_f,
                           suffix=suffix_for("concoct_coverage_info.tsv"),
                           body=[]))

    if cov:
        # Coverage views are rendered immediately, one per library.
        for cov_lib_name, covLibDict in blobDb.covLibs.items():
            cov_out_f = BtIO.getOutFile(covLibDict['f'], prefix, None)
            covView = BtCore.ViewObj(name="covlib",
                                     out_f=cov_out_f,
                                     suffix="cov",
                                     body=[])
            blobDb.view(viewObjs=[covView],
                        ranks=None,
                        taxrule=None,
                        hits_flag=None,
                        seqs=None,
                        cov_libs=[cov_lib_name],
                        progressbar=True)

    if viewObjs:
        blobDb.view(viewObjs=viewObjs,
                    ranks=ranks,
                    taxrule=taxrule,
                    hits_flag=hits_flag,
                    seqs=seqs,
                    cov_libs=[],
                    progressbar=True)
    print(BtLog.status_d['19'])
Exemplo n.º 23
0
    # NOTE(review): this is the tail of a function whose header is not part
    # of the visible source; blobdb_f, ranks, seq_list, taxrule, hits_flag
    # and out_f are bound before this point.

    # The BlobDb file has to exist; otherwise error '0' is reported.
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    # Every requested rank must be a member of RANKS (error '9' if not).
    # NOTE(review): the check runs before 'all' is expanded, so 'all'
    # itself has to be an entry of RANKS - presumably the last one, which
    # the slice below drops; confirm.
    for rank in ranks:
        if rank not in RANKS:
            BtLog.error('9', rank)
    if 'all' in ranks:
        ranks = RANKS[0:-1]

    # seq_list is interpreted, in this order, as: a path to an existing
    # file of names, a comma-separated list of names, or a single name.
    seqs = []
    if (seq_list):
        if isfile(seq_list):
            seqs = BtIO.parseList(seq_list)
        elif "," in seq_list:
            seqs = seq_list.split(",")
        else:
            seqs = [seq_list]

    # Load the database from blobdb_f.
    blobDB = bt.BlobDb('new')
    blobDB.load(blobdb_f)

    # With hit libraries present, the taxrule must be one of the computed
    # ones (error '11' if not); then the view is generated.
    if (blobDB.hitLibs) and taxrule not in blobDB.taxrules:
        BtLog.error('11', taxrule, blobDB.taxrules)
    blobDB.view(out_f, ranks, taxrule, hits_flag, seqs)

Exemplo n.º 24
0
 def output(self):
     """Write the collected view data as JSON files into ``self.view_dir``.

     Files written: ``meta.json``, ``gc.json``, ``length.json``,
     ``ncount.json`` (only when some N count is above 0),
     ``identifiers.json``, one ``<name>_cov.json`` and
     ``<name>_read_cov.json`` per coverage library, and per taxrule/rank a
     ``<taxrule>_<rank>.json``, ``..._score.json`` and ``..._cindex.json``.
     N counts, read coverages and scores are written with a lower bound of
     0.2.
     """
     # print(...) with one parenthesised argument is equivalent to the
     # Python 2 print statement and also parses on Python 3.
     # meta
     meta = self.get_meta()
     meta_f = join(self.view_dir, "meta.json")
     BtIO.writeJson(meta, meta_f, indent=2)
     # gc
     gc_f = join(self.view_dir, "gc.json")
     print(BtLog.status_d['13'] % (gc_f))
     BtIO.writeJson({"values": self._format_float(self.gc)}, gc_f, indent=1)
     # length
     length_f = join(self.view_dir, "length.json")
     print(BtLog.status_d['13'] % (length_f))
     BtIO.writeJson({"values": self.length}, length_f, indent=1)
     # Ns - the emptiness check keeps max() from raising on an empty list
     if self.n_count and max(self.n_count) > 0:
         n_f = join(self.view_dir, "ncount.json")
         print(BtLog.status_d['13'] % (n_f))
         # List comprehension instead of map(): map() yields an iterator
         # on Python 3, which the JSON encoder cannot serialise.
         BtIO.writeJson({"values": [max(x, 0.2) for x in self.n_count]},
                        n_f,
                        indent=1)
     # identifiers
     ids_f = join(self.view_dir, "identifiers.json")
     print(BtLog.status_d['13'] % (ids_f))
     BtIO.writeJson(self.names, ids_f, indent=1)
     # cov
     for cov_name, cov in self.covs.items():
         name = self._remove_cov_suffix(cov_name, self.blobDb.covLibs)
         cov_f = join(self.view_dir, "%s_cov.json" % name)
         print(BtLog.status_d['13'] % (cov_f))
         BtIO.writeJson({"values": self._format_float(cov, 0.02)},
                        cov_f,
                        indent=1)
     # read_cov
     for cov_name, cov in self.read_covs.items():
         name = self._remove_cov_suffix(cov_name, self.blobDb.covLibs)
         cov_f = join(self.view_dir, "%s_read_cov.json" % name)
         print(BtLog.status_d['13'] % (cov_f))
         BtIO.writeJson({"values": [max(x, 0.2) for x in cov]},
                        cov_f,
                        indent=1)
     # tax
     for taxrule in self.tax:
         for rank in self.tax[taxrule]:
             tax = self._keyed_list(self.tax[taxrule][rank])
             rank_f = join(self.view_dir, "%s_%s.json" % (taxrule, rank))
             BtIO.writeJson(tax, rank_f, indent=1)
             score = self.tax_scores[taxrule][rank]['score']
             score_f = join(self.view_dir,
                            "%s_%s_score.json" % (taxrule, rank))
             BtIO.writeJson({"values": [max(x, 0.2) for x in score]},
                            score_f,
                            indent=1)
             cindex = self.tax_scores[taxrule][rank]['c_index']
             cindex_f = join(self.view_dir,
                             "%s_%s_cindex.json" % (taxrule, rank))
             BtIO.writeJson({"values": cindex}, cindex_f, indent=1)
Exemplo n.º 25
0
def main():
    """Draw blobplots (and read-coverage bar plots) from a BlobDb file.

    Reads the docopt options (after ``BtPlot.check_input``), loads the
    BlobDb, builds the plot data and configures a ``BtPlot.PlotObj``. For
    every coverage library of the plot object an output name is assembled
    from the chosen options, then the scatter plot and the bar plot are
    drawn unless disabled (the bar plot is also skipped when the library
    has no reads). Statistics are written once at the end, using the
    output name computed for the last coverage library.
    """
    args = BtPlot.check_input(docopt(__doc__))

    blobdb_f = args['--infile']
    rank = args['--rank']
    min_length = int(args['--length'])
    max_group_plot = int(args['--plotgroups'])
    hide_nohits = args['--nohit']
    taxrule = args['--taxrule']
    c_index = args['--cindex']
    exclude_arg = args['--exclude']
    labels = args['--label']
    colour_f = args['--colours']
    refcov_f = args['--refcov']
    catcolour_f = args['--catcolour']

    multiplot = args['--multiplot']
    out_prefix = args['--out']
    sort_order = args['--sort']
    sort_first = args['--sort_first']
    hist_type = args['--hist']
    no_title = args['--notitle']
    ignore_contig_length = args['--noscale']
    format_plot = args['--format']
    no_plot_blobs = args['--noblobs']
    no_plot_reads = args['--noreads']
    legend_flag = args['--legend']
    cumulative_flag = args['--cumulative']
    cov_lib_selection = args['--lib']

    filelabel = args['--filelabel']

    # Turn the raw option strings/files into usable structures
    exclude_groups = BtIO.parseCmdlist(exclude_arg)
    refcov_dict = BtIO.parseReferenceCov(refcov_f)
    user_labels = BtIO.parseCmdLabels(labels)
    catcolour_dict = BtIO.parseCatColour(catcolour_f)
    colour_dict = BtIO.parseColours(colour_f)

    # Load BlobDb
    print(BtLog.status_d['9'] % blobdb_f)
    blobDb = BtCore.BlobDb('blobplot')
    blobDb.version = blobtools.__version__
    blobDb.load(blobdb_f)

    # Generate plot data
    print(BtLog.status_d['18'])
    data_dict, min_cov, max_cov, cov_lib_dict = blobDb.getPlotData(
        rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict)
    plotObj = BtPlot.PlotObj(data_dict, cov_lib_dict, cov_lib_selection,
                             'blobplot', sort_first)

    # Plot settings, applied in a fixed order
    plot_settings = (
        ('exclude_groups', exclude_groups),
        ('version', blobDb.version),
        ('format', format_plot),
        ('max_cov', max_cov),
        ('min_cov', min_cov),
        ('no_title', no_title),
        ('multiplot', multiplot),
        ('hist_type', hist_type),
        ('ignore_contig_length', ignore_contig_length),
        ('max_group_plot', max_group_plot),
        ('legend_flag', legend_flag),
        ('cumulative_flag', cumulative_flag),
    )
    for setting, value in plot_settings:
        setattr(plotObj, setting, value)

    # order by which to plot (should know about user label)
    plotObj.group_order = BtPlot.getSortedGroups(data_dict, sort_order,
                                                 sort_first)
    # labels for each level of stats
    plotObj.labels.update(plotObj.group_order)
    # plotObj.group_labels maps each group to its labels: all/other/user_label
    if user_labels:
        for user_label in user_labels.values():
            plotObj.labels.add(user_label)
    plotObj.group_labels = dict(
        (group_name, set()) for group_name in plotObj.group_order)
    plotObj.relabel_and_colour(colour_dict, user_labels)
    plotObj.compute_stats()
    plotObj.refcov_dict = refcov_dict

    # Plotting
    info_flag = 1
    out_f = ''
    for cov_lib in plotObj.cov_libs:
        plotObj.ylabel = "Coverage"
        plotObj.xlabel = "GC proportion"
        if filelabel:
            plotObj.ylabel = basename(cov_lib_dict[cov_lib]['f'])

        # Collect the dot-separated components of the output name
        name_parts = [
            "%s.%s.%s.p%s.%s.%s" % (blobDb.title, taxrule, rank,
                                    max_group_plot, hist_type, min_length)
        ]
        if catcolour_dict:
            name_parts.append("catcolour")
        if ignore_contig_length:
            name_parts.append("noscale")
        if c_index:
            name_parts.append("c_index")
        if exclude_groups:
            name_parts.append("exclude_" + "_".join(exclude_groups))
        if labels:
            name_parts.append(
                "userlabel_" + "_".join(set(user_labels.values())))
        name_parts.append("blobplot")
        if plotObj.cumulative_flag:
            name_parts.append("cumulative")
        if plotObj.multiplot:
            name_parts.append("multiplot")
        out_f = BtIO.getOutFile(".".join(name_parts), out_prefix, None)

        if not no_plot_blobs:
            plotObj.plotScatter(cov_lib, info_flag, out_f)
            info_flag = 0
        # prevent plotting if --noreads or total_reads == 0
        if not no_plot_reads and plotObj.cov_libs_total_reads_dict[cov_lib]:
            plotObj.plotBar(cov_lib, out_f)
    plotObj.write_stats(out_f)
Exemplo n.º 26
0
def main():
    """Build a BlobDb from an assembly plus coverage/hit files and dump it.

    Steps, in order: read the docopt options, create the coverage and hit
    library objects, parse the FASTA file, load the nodes database, parse
    the similarity hits and compute the taxonomy (only when hit files were
    given), parse the coverage libraries and write the BlobDb as JSON.

    ``BtLog.error('45')`` is called when ``--min_diff`` or ``--min_score``
    cannot be converted to a float; ``BtLog.error('1')`` is called when
    neither a FASTA type nor any BAM/COV/CAS file was supplied.
    """
    args = docopt(__doc__)
    fasta_f = args['--infile']
    fasta_type = args['--type']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--hitsfile']
    prefix = args['--out']
    nodesDB_f = args['--db']
    names_f = args['--names']
    # Coverage is estimated unless --calculate_cov was requested
    estimate_cov_flag = not args['--calculate_cov']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    try:
        min_bitscore_diff = float(args['--min_diff'])
        min_score = float(args['--min_score'])
    except ValueError:
        # The except clause must name the exception class, not an instance,
        # otherwise a malformed number is never routed to BtLog.error.
        BtLog.error('45')
    tax_collision_random = args['--tax_collision_random']
    title = args['--title']

    # outfile
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    if not title:
        title = out_f

    # coverage
    if not fasta_type and not bam_fs and not cov_fs and not cas_fs:
        BtLog.error('1')
    cov_libs = (
        [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f)
         for idx, lib_f in enumerate(bam_fs)] +
        [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f)
         for idx, lib_f in enumerate(cas_fs)] +
        [BtCore.CovLibObj('cov' + str(idx), 'cov', lib_f)
         for idx, lib_f in enumerate(cov_fs)]
    )

    # taxonomy
    hit_libs = [
        BtCore.HitLibObj('tax' + str(idx), 'tax', lib_f)
        for idx, lib_f in enumerate(hit_fs)
    ]

    # Create BlobDB object
    blobDb = BtCore.BlobDb(title)
    blobDb.version = interface.__version__
    # Parse FASTA
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp, nodes.dmp
    nodesDB_default = join(dirname(abspath(__file__)), "../data/nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(nodes=nodes_f,
                                           names=names_f,
                                           nodesDB=nodesDB_f,
                                           nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Parse similarity hits
    if hit_libs:
        blobDb.parseHits(hit_libs)
        if not taxrules:
            # Default taxrules depend on how many hit files were supplied
            if len(hit_libs) > 1:
                taxrules = ['bestsum', 'bestsumorder']
            else:
                taxrules = ['bestsum']
        blobDb.computeTaxonomy(taxrules, nodesDB, min_score, min_bitscore_diff,
                               tax_collision_random)
    else:
        print(BtLog.warn_d['0'])

    # Parse coverage
    blobDb.parseCoverage(covLibObjs=cov_libs,
                         estimate_cov=estimate_cov_flag,
                         prefix=prefix)

    # Generating BlobDB and writing to file
    print(BtLog.status_d['7'] % out_f)
    BtIO.writeJson(blobDb.dump(), out_f)
Exemplo n.º 27
0
               [bt.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
               [bt.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
               [bt.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)] 
               
    # Create BlobDB object              
    blobDb = bt.BlobDb(title)

    # Parse FASTA
    blobDb.parseFasta(fasta_f, fasta_type)
    # Parse coverage
    blobDb.parseCovs(cov_libs)

    # Parse Tax
    hitLibs = [bt.hitLibObj('tax' + str(idx), 'tax', lib_f) for idx, lib_f in enumerate(hit_fs)]
    blobDb.parseHits(hitLibs)
    
    # Parse nodesDB
    nodesDB, nodesDB_f = BtIO.getNodesDB(nodes=nodes_f, names=names_f, nodesDB=nodesDB_f)
    blobDb.nodesDB_f = nodesDB_f
        
    if not os.path.isfile(nodesDB_f):
        print BtLog.status_d['5'] % nodesDB_f
        BtIO.writeNodesDB(nodesDB, nodesDB_f)

    # Computing taxonomy based on taxrules
    print BtLog.status_d['6'] % ",".join(taxrules)
    blobDb.computeTaxonomy(taxrules, nodesDB)

    # Generating BlobDB and writing to file
    print BtLog.status_d['7'] % out_f
    BtIO.writeJson(blobDb.dump(), out_f)
Exemplo n.º 28
0
 def load(self, BlobDb_f):
     """Populate this object from the JSON file ``BlobDb_f``.

     Every top-level key of the parsed JSON becomes an attribute of the
     same name; ``set_of_taxIds`` is then set to the keys of the
     ``'lineages'`` entry.
     """
     stored = BtIO.readJson(BlobDb_f)
     for attr_name in stored:
         setattr(self, attr_name, stored[attr_name])
     self.set_of_taxIds = stored['lineages'].keys()
Exemplo n.º 29
0
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    # Are ranks sane ?
    for rank in ranks:
        if rank not in RANKS:
            BtLog.error('9', rank)
    if 'all' in ranks:
        temp_ranks = RANKS[0:-1]
        ranks = temp_ranks[::-1]           

    # Is list a list of sequence names or a file?
    seqs = []
    if (seq_list):
        if isfile(seq_list):
            seqs = BtIO.parseList(seq_list)
        elif "," in seq_list:
            seqs = seq_list.split(",")
        else:
            seqs = [seq_list]

    # Load BlobDb
    blobDB = bt.BlobDb('new')
    blobDB.load(blobdb_f)

    # Is taxrule sane and was it computed?
    if (blobDB.hitLibs) and taxrule not in blobDB.taxrules:
        BtLog.error('11', taxrule, blobDB.taxrules)
    blobDB.view(out_f, ranks, taxrule, hits_flag, seqs)

Exemplo n.º 30
0
def main():
    """Add taxonomy IDs to a similarity-search result file.

    Each non-blank line of the hit file is split on whitespace and the
    query ID, subject ID and score are taken from the user-selected
    columns. The taxonomy ID is either one custom value applied to every
    hit (``--custom_taxid``) or looked up per subject ID in a mapping file
    (``--taxid_mapping_file``); subject IDs missing from the mapping get
    "N/A" and a warning is printed once per missing ID. Output lines are
    ``qseqid<TAB>taxid<TAB>score<TAB>sseqid`` and the output file is only
    written when at least one hit was read.

    Errors are reported through ``BtLog.error``: '41' for non-integer hit
    columns or when neither a custom taxid nor a mapping file is given,
    '26' for a non-integer custom taxid, and '44' for missing or
    non-integer mapping columns.
    """
    args = docopt(__doc__)
    hit_f = args['--hit_file']
    hit_col_qseqid = args['--hit_column_qseqid']
    hit_col_sseqid = args['--hit_column_sseqid']
    hit_col_score = args['--hit_column_score']
    map_f = args['--taxid_mapping_file']
    map_col_sseqid = args['--map_col_sseqid']
    map_col_taxid = args['--map_col_taxid']
    custom_taxid = args['--custom_taxid']
    prefix = args['--out']
    out_f = None
    taxid_d = {}

    try:
        hit_col_qseqid = int(hit_col_qseqid)
        hit_col_sseqid = int(hit_col_sseqid)
        hit_col_score = int(hit_col_score)
    except ValueError:
        # The error code and its argument are passed separately; applying
        # '%' to the code string itself raises a TypeError.
        # NOTE(review): '41' is also called without arguments below --
        # confirm that its message template accepts one argument.
        BtLog.error(
            '41',
            "--hit_column_qseqid, --hit_column_sseqid and --hit_column_score")

    # Decide the mode before converting, so a custom taxid of 0 is not
    # mistaken for "no custom taxid" once it has become an int.
    use_custom_taxid = bool(custom_taxid)
    if use_custom_taxid:
        try:
            custom_taxid = int(custom_taxid)
        except (TypeError, ValueError):
            # int() raises ValueError for a non-numeric string
            BtLog.error('26')
        out_f = BtIO.getOutFile(hit_f, prefix, "taxID_%s.out" % custom_taxid)
    elif map_f:
        if map_col_sseqid and map_col_taxid:
            try:
                map_col_sseqid = int(map_col_sseqid)
                map_col_taxid = int(map_col_taxid)
            except ValueError:
                BtLog.error('44')
            print(BtLog.status_d['1'] % ("Mapping file", map_f))
            taxid_d = BtIO.parseDict(map_f, map_col_sseqid, map_col_taxid)
            out_f = BtIO.getOutFile(hit_f, prefix, "taxified.out")
        else:
            BtLog.error('44')
    else:
        BtLog.error('41')

    output = []
    unmapped = set()  # subject IDs already warned about
    print(BtLog.status_d['1'] % ("similarity search result", hit_f))
    with open(hit_f) as fh:
        for line in fh:
            col = line.split()
            if not col:
                # blank line: nothing to annotate
                continue
            qseqid = col[hit_col_qseqid]
            sseqid = col[hit_col_sseqid]
            score = col[hit_col_score]
            if use_custom_taxid:
                tax_id = custom_taxid
            else:
                if sseqid not in taxid_d and sseqid not in unmapped:
                    unmapped.add(sseqid)
                    # Actually emit the warning instead of discarding it
                    print(BtLog.warn_d['12'] % (sseqid, map_f))
                tax_id = taxid_d.get(sseqid, "N/A")
            output.append("%s\t%s\t%s\t%s" % (qseqid, tax_id, score, sseqid))
    if output:
        with open(out_f, "w") as fh:
            print(BtLog.status_d['24'] % out_f)
            fh.write("\n".join(output) + "\n")
Exemplo n.º 31
0
 def output(self):
     """Write the view's data as JSON files below ``self.view_dir``.

     Files written: ``meta.json`` (from ``self.get_meta()``), ``gc.json``,
     ``length.json`` and ``names.json`` (from the attributes of the same
     name), one ``covs/<cov_lib>.json`` per entry of ``self.covs`` and one
     ``taxrule/<taxrule>/<rank>.json`` per taxrule and rank of
     ``self.tax``. A status line is printed for every file except
     ``meta.json`` and the taxonomy files.
     """
     # meta
     BtIO.writeJson(self.get_meta(),
                    join(self.view_dir, "meta.json"),
                    indent=2)
     # gc, length, names: one file per attribute
     for attr in ("gc", "length", "names"):
         attr_f = join(self.view_dir, "%s.json" % attr)
         print(BtLog.status_d['13'] % (attr_f))
         BtIO.writeJson(getattr(self, attr), attr_f, indent=1)
     # cov
     cov_d = join(self.view_dir, "covs")
     BtIO.create_dir(directory=cov_d)
     for lib_name, lib_cov in self.covs.items():
         lib_f = join(cov_d, "%s.json" % lib_name)
         print(BtLog.status_d['13'] % (lib_f))
         BtIO.writeJson(lib_cov, lib_f, indent=1)
     # tax
     taxrule_d = join(self.view_dir, "taxrule")
     BtIO.create_dir(directory=taxrule_d)
     for taxrule in self.tax:
         rule_d = join(taxrule_d, taxrule)
         BtIO.create_dir(directory=rule_d)
         by_rank = self.tax[taxrule]
         for rank in by_rank:
             BtIO.writeJson(by_rank[rank],
                            join(rule_d, "%s.json" % rank),
                            indent=1)
Exemplo n.º 32
0
    def parseCoverage(self, **kwargs):
        """Parse coverage libraries and attach their coverage to the blobs.

        Keyword arguments:
            covLibObjs: coverage library objects to parse, in order.
            no_base_cov: passed through to the BAM/SAM parsers.
            prefix (optional): handed to ``BtIO.getOutFile`` when naming
                the COV file written for bam/sam/cas libraries.

        For each library the per-sequence coverage and read coverage are
        added to the matching blob, the library's ``cov_sum`` and
        ``mean_cov`` are updated and the library is stored in
        ``self.covLibs``. Libraries with an unrecognised format are only
        registered (their coverage sum stays untouched).
        """
        lib_objs = kwargs['covLibObjs']
        no_base_cov = kwargs['no_base_cov']

        def write_cov_file(lib):
            # Create COV file for future use
            cov_out_f = BtIO.getOutFile(lib.f, kwargs.get('prefix', None),
                                        None)
            cov_view = ViewObj(name="covlib",
                               out_f=cov_out_f,
                               suffix="cov",
                               header="",
                               body=[])
            self.view(viewObjs=[cov_view],
                      ranks=None,
                      taxrule=None,
                      hits_flag=None,
                      seqs=None,
                      cov_libs=[lib.name],
                      progressbar=False)

        for covLib in lib_objs:
            self.addCovLib(covLib)
            print(BtLog.status_d['1'] % (covLib.name, covLib.f))

            if covLib.fmt in ('bam', 'sam'):
                parse_reads = (BtIO.parseBam
                               if covLib.fmt == 'bam' else BtIO.parseSam)
                base_covs, reads_total, reads_mapped, read_covs = parse_reads(
                    covLib.f, set(self.dict_of_blobs), no_base_cov)
                covLib.reads_total = reads_total
                covLib.reads_mapped = reads_mapped

                if covLib.reads_total == 0:
                    print(BtLog.warn_d['4'] % covLib.f)

                for seq_name, base_cov in base_covs.items():
                    blob = self.dict_of_blobs[seq_name]
                    # base coverage divided by the blob's agct_count
                    seq_cov = base_cov / blob.agct_count
                    covLib.cov_sum += seq_cov
                    blob.addCov(covLib.name, seq_cov)
                    blob.addReadCov(covLib.name, read_covs[seq_name])
                write_cov_file(covLib)

            elif covLib.fmt == 'cas':
                seq_covs, reads_total, reads_mapped, read_covs = BtIO.parseCas(
                    covLib.f, self.order_of_blobs)
                covLib.reads_total = reads_total
                covLib.reads_mapped = reads_mapped

                if covLib.reads_total == 0:
                    print(BtLog.warn_d['4'] % covLib.f)

                for seq_name, seq_cov in seq_covs.items():
                    blob = self.dict_of_blobs[seq_name]
                    covLib.cov_sum += seq_cov
                    blob.addCov(covLib.name, seq_cov)
                    blob.addReadCov(covLib.name, read_covs[seq_name])
                write_cov_file(covLib)

            elif covLib.fmt == 'cov':
                (seq_covs, reads_total, reads_mapped, reads_unmapped,
                 read_covs) = BtIO.parseCov(covLib.f, set(self.dict_of_blobs))
                covLib.reads_total = reads_total
                covLib.reads_mapped = reads_mapped
                covLib.reads_unmapped = reads_unmapped

                # warn when the COV file does not cover every sequence
                if len(seq_covs) != self.seqs:
                    print(BtLog.warn_d['4'] % covLib.f)

                for seq_name, seq_cov in seq_covs.items():
                    blob = self.dict_of_blobs[seq_name]
                    covLib.cov_sum += seq_cov
                    blob.addCov(covLib.name, seq_cov)
                    if seq_name in read_covs:
                        blob.addReadCov(covLib.name, read_covs[seq_name])

            covLib.mean_cov = covLib.cov_sum / self.seqs
            if covLib.cov_sum == 0.0:
                print(BtLog.warn_d['6'] % (covLib.name))
            self.covLibs[covLib.name] = covLib