Exemplo n.º 1
0
def list_organisms(ofus, hclus, nt_cat, typetable, outpath, cut_h):
    """Write one organism-annotation file per requested OFU.

    Args:
        ofus: comma-separated OFU identifiers (e.g. 'ofu00001,2,ofu_3').
        hclus: DataFrame whose itertuples yield (bgc_name, cluster_number).
        nt_cat: path to an NT catalog TSV, or '-' when unavailable.
        typetable: DataFrame of predicted cluster types indexed by BGC
            name, or False when no type predictions exist.
        outpath: directory the per-OFU files are written into.
        cut_h: cluster-cut height string embedded in the output filenames.

    Returns:
        dict mapping zero-padded cluster id -> list of member BGC names.
    """
    bgc_dd = defaultdict(list)
    for value, key in hclus.itertuples(index=True):
        key = str('%05d' % key)
        bgc_dd[key].append(value)
    ofu_list = ofus.split(',')
    # Preload the Database and Tree once; they are expensive to build.
    db = RefSeqDatabase()
    nt = NCBITree()
    written = 0
    for ofu in ofu_list:
        ofu = str(ofu)
        # Accept 'ofu_00001', 'ofu00001' or a bare number. The longer
        # prefix must be tested first: startswith('ofu') also matches
        # 'ofu_', which made the original elif unreachable and left a
        # stray underscore in the lookup key.
        if ofu.startswith('ofu_'):
            ofu_n = ofu[len('ofu_'):]
        elif ofu.startswith('ofu'):
            ofu_n = ofu[len('ofu'):]
        else:
            ofu_n = ofu
        bgcs = bgc_dd[ofu_n]
        name_dict = defaultdict(list)
        with suppress_stdout():
            for bgc in bgcs:
                refseqid = None  # only set on the RefSeq-header path below
                if bgc.startswith('ncbi_tid'):
                    ncbi_tid = bgc.split('|')[1]
                    if ncbi_tid == 'na':
                        name = bgc.split('|')[3]
                    else:
                        name = nt.green_genes_lineage(int(ncbi_tid),
                                                      depth=8,
                                                      depth_force=True)
                elif '|genbank|' in bgc:
                    gbk_id = bgc.split('|')[3].split('_cluster')[0]
                    if nt_cat == '-':
                        sys.exit(
                            'Genbank ID BGC headers require an NT Catalog for annotation... see --help'
                        )
                    tid, organism = genbank_id_to_tid(gbk_id, nt_cat)
                    name = organism
                else:
                    refseqid = '_'.join(bgc.split('_')[:2])
                    name = refseq_to_name(refseqid, db=db, nt=nt)
                if typetable is not False:
                    ctype = str(typetable.filter(like=bgc, axis=0).iloc[0, 0])
                else:
                    ctype = 'NA'
                # If the lookup returned the input unchanged, prefer the
                # bare RefSeq accession as the label.  The original read
                # 'refseqid' unconditionally here, which raised NameError
                # when the lookup echo happened on a non-RefSeq header.
                if bgc == name and refseqid is not None:
                    name_dict[bgc] = [ctype, refseqid]
                else:
                    name_dict[bgc] = [ctype, name]
        ofu_file = ''.join(['ofu', ofu_n, '_id', cut_h, '.txt'])
        with open(os.path.join(outpath, ofu_file), 'w') as outf:
            outdf = pd.DataFrame.from_dict(name_dict, orient='index')
            outdf.columns = ['predicted_type', 'organism']
            outdf.to_csv(outf, sep='\t')
        written += 1
    print('\nOrganism information for %d OFUs written to file.\n' % written)
    return bgc_dd
Exemplo n.º 2
0
def genbank_id_to_tid(gbk_id, nt_cat):
    """Look up the NCBI taxon id and lineage for a GenBank accession.

    Scans the NT catalog (TSV with one header row; column 1 holds the
    GenBank id, column 2 the NCBI taxon id) for *gbk_id*.

    Args:
        gbk_id: GenBank accession to look up.
        nt_cat: path to the NT catalog TSV file.

    Returns:
        (tid, organism): integer taxon id and its greengenes-style
        lineage, or ('na', <all-None lineage>) when the id is not found.
    """
    # Defaults returned when the accession is absent from the catalog.
    # The original assigned these inside the loop's else-branch, so they
    # clobbered a real match on every later non-matching row (a match was
    # only kept if it happened to be the last line), and the function
    # raised NameError on a header-only catalog.
    tid = 'na'
    organism = 'k__None;p__None;c__None;o__None;f__None;g__None;s__None;t__None'
    with open(nt_cat, 'r') as nt_catalog:
        reader = csv.reader(nt_catalog, delimiter='\t')
        next(reader)  # skip header row
        for line in reader:
            if line[1] == gbk_id:
                tid = int(line[2])
                # Build the tree lazily: it is only needed on a hit.
                organism = NCBITree().green_genes_lineage(tid, depth=8,
                                                          depth_force=True)
                break
    return tid, organism
Exemplo n.º 3
0
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    """Align .fna query files with bowtie2 and, optionally, build
    per-sample LCA taxon counts into taxon_counts.csv.

    Args:
        input: directory of *.fna query files.
        output: directory for the .sam files and taxon_counts.csv.
        bt2_indx: bowtie2 index prefix.
        extract_ncbi_tid: 'begin,end' delimiter pair surrounding the
            taxon id inside each reference name.
        depth: taxonomy depth, 1..len(tree.lineage_ranks).
        threads: bowtie2 thread count.
        annotate_lineage: if True count greengenes lineages; otherwise
            count raw taxon ids filtered to the rank at *depth*.
        run_lca: skip the counting phase entirely when False.

    Raises:
        ValueError: if depth is outside the valid rank range.
    """
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        # Reuse an existing alignment rather than re-running bowtie2.
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        ranks = list(tree.lineage_ranks.keys())
        # Validate *before* indexing.  The original indexed first, so an
        # out-of-range depth raised IndexError, and depth=0 silently used
        # the last rank via the -1 index — its 'if not rank_name' check
        # could never fire for those cases.
        if not 1 <= depth <= len(ranks):
            raise ValueError('Depth must be between 1 and %d, it was %d' % (len(ranks), depth))
        rank_name = ranks[depth - 1]

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    # Collapse multiple hits for one read to their LCA.
                    if current_ncbi_tid and current_ncbi_tid != ncbi_tid:
                        lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
            else:
                # Keep only taxa whose rank matches the requested depth.
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
            taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
Exemplo n.º 4
0
def main():
    """Resolve an NCBI taxon id from an --assembly/--refseq/--tid argument
    and write a '>ncbi_tid|<tid>|organism|<lineage>' header line."""
    parser = make_arg_parser()
    args = parser.parse_args()

    db = RefSeqDatabase()
    nt = NCBITree()
    # Resolve the taxon id from whichever identifier was supplied
    # ('-' marks an unused option).
    if args.assembly != '-':
        ncbi_tid = db.get_ncbi_tid_from_assembly_accession_version(args.assembly)[0]
    elif args.refseq != '-':
        ncbi_tid = db.get_ncbi_tid_from_refseq_accession(args.refseq)[0]
    elif args.tid != '-':
        ncbi_tid = int(args.tid)
    else:
        # The original fell through here with ncbi_tid unbound and died
        # with a NameError; fail with a clear message instead.
        sys.exit('One of --assembly, --refseq or --tid is required.')
    organism = nt.green_genes_lineage(ncbi_tid)

    outf = open(args.output, 'w') if args.output != '-' else sys.stdout
    try:
        outf.write('>ncbi_tid|%d|organism|%s\n' % (ncbi_tid, organism))
        outf.write('\n')
    finally:
        # Close only the file we opened; the original 'with' also closed
        # sys.stdout on exit, breaking any later printing.
        if outf is not sys.stdout:
            outf.close()
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta, reference_map, extract_ncbi_tid, depth, threads):
    """Align queries with bowtie2, group reads by LCA lineage, then
    re-align each group against its matching reference strains with
    embalmer and emit a query-by-taxon alignment matrix.

    Args:
        input: directory of *.fna query files.
        output: directory for .sam files, embalmer_out.txt and
            strain_alignments.csv.
        bt2_indx: bowtie2 index prefix.
        reference_fasta: FASTA of reference strain sequences.
        reference_map: TSV mapping reference id -> '; '-joined lineage.
        extract_ncbi_tid: 'begin,end' delimiter pair surrounding the
            taxon id inside reference names.
        depth: lineage depth passed to green_genes_lineage.
        threads: bowtie2 thread count.
    """
    verify_make_dir(output)

    fna_files = [os.path.join(input, filename) for filename in os.listdir(input) if filename.endswith('.fna')]

    for fna_file in fna_files:
        sam_outf = os.path.join(output, '.'.join(str(os.path.basename(fna_file)).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))

    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')

    sam_files = [os.path.join(output, filename) for filename in os.listdir(output) if filename.endswith('.sam')]
    lca_maps = {}
    for sam_file in sam_files:
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                # Collapse multiple hits for one read to their LCA.
                if current_ncbi_tid and current_ncbi_tid != ncbi_tid:
                    lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid

        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
        # filter out null values
        lca_maps['.'.join(os.path.basename(sam_file).split('.')[:-1])] = reverse_collision_dict(lca_map)

    # Tag each header list with its sample basename so groups can be
    # written back out per-sample below.
    for basename in lca_maps.keys():
        lca_maps[basename] = valmap(lambda val: (basename, val), lca_maps[basename])

    # Merge the per-sample maps: lineage -> [(basename, headers), ...].
    lca_map_2 = defaultdict(list)
    for basename in lca_maps.keys():
        for key, val in lca_maps[basename].items():
            if key:
                lca_map_2[key].append(val)

    # Random-access handles to the query FASTAs, keyed by sample basename.
    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)

    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

    # reverse the dict to feed into embalmer
    references_faidx = pyfaidx.Fasta(reference_fasta)

    tmpdir = tempfile.mkdtemp()
    with open(os.path.join(output, 'embalmer_out.txt'), 'w') as embalmer_cat:
        for key in lca_map_2.keys():

            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')

            with open(queries_fna_filename, 'w') as queries_fna:
                for basename, headers in lca_map_2[key]:
                    for header in headers:
                        record = fna_faidx[basename][header][:]
                        queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

            with open(references_fna_filename, 'w') as references_fna:
                for i in dict_reference_map[key]:
                    record = references_faidx[i][:]
                    references_fna.write('>%s\n%s\n' % (record.name, record.seq))

            embalmer_align(queries_fna_filename, references_fna_filename, output_filename)

            # Concatenate this group's alignments into the combined file.
            with open(output_filename) as embalmer_out:
                for line in embalmer_out:
                    embalmer_cat.write(line)

            os.remove(queries_fna_filename)
            os.remove(references_fna_filename)
            os.remove(output_filename)

    os.rmdir(tmpdir)

    sparse_ncbi_dict = defaultdict(dict)

    # build query by NCBI_TID DataFrame
    with open(os.path.join(output, 'embalmer_out.txt')) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            # np.int / np.float were aliases for the builtins and were
            # removed in NumPy 1.24; use the builtins directly.
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
def main():
    """Build an OFU profile table from hierarchical clustering of BGCs,
    optionally annotating each strain row with organism information."""
    parser = make_arg_parser()
    args = parser.parse_args()

    # Parse command line
    method = args.method
    height = 1 - (args.height / 100)
    with open(args.input, 'r') as inf:
        if args.clusterme:
            print('...performing hierarchical clustering, tree cut at height of %s...\n' % args.height)
            hclus = process_hierarchy(inf, height, method)
        else:
            hclus = pd.read_csv(inf, sep=',', header=0, index_col=0)
        size = hclus.max(0)[0]  # get the total number of clustered OFUs (depends on height cut)
        print('\n...Preparing OFU profile for %s OFUs...\n' % size)
        size += 1
        fill = outer(size)
        dd = defaultdict(fill)  # Initialize the dict with all zeros
        # Collapse into an OFU reference table, strains vs OFUs
        if args.clusterme:
            hclus.to_csv('hcsv_temp.csv')
            with open('hcsv_temp.csv', 'r') as inf2:
                df = cluster_ofus(inf2, dd)
        else:
            with open(args.input, 'r') as inf2:
                df = cluster_ofus(inf2, dd)
        j = 0  # clusters with no NCBI tid at all
        k = 0  # clusters whose taxonomy annotation is entirely unnamed
        if args.annotate:
            # Preload the Database and Tree
            db = RefSeqDatabase()
            nt = NCBITree()
            strain_label = []
            for refseq_id in list(df.index):
                if refseq_id.startswith('ncbi_tid'):
                    ncbi_tid = refseq_id.split('|')[1]
                    if ncbi_tid == 'na':
                        # No taxon id: fall back to the genbank identifier.
                        # (The original consulted 'organism' before testing
                        # for 'na', which raised NameError on the first
                        # 'na' row and silently reused a stale lineage on
                        # later ones.)
                        genbank = '|'.join(refseq_id.split('_')[1].split('|')[2:4])
                        strain_label.append(genbank)
                        j += 1
                    else:
                        ncbi_tid = int(ncbi_tid)
                        organism = nt.green_genes_lineage(ncbi_tid, depth=8, depth_force=True)
                        if organism == 'k__;p__;c__;o__;f__;g__;s__;t__':
                            # Lineage lookup produced no names; keep the tid.
                            strain_label.append('|'.join(['ncbi_tid', str(ncbi_tid)]))
                            k += 1
                        else:
                            strain_label.append(organism)
                else:
                    organism = refseq_to_name(refseq_id, db=db, nt=nt)
                    ncbi_tid = str(refseq_to_tid(refseq_id, db=db))
                    # sometimes DOJO can't look up the refseq accession; in
                    # that case both lookups echo the input, so just keep
                    # the refseq id as the label.
                    if ncbi_tid == organism:
                        strain_label.append(refseq_id)
                    elif args.taxonomy:
                        strain_label.append(organism)
                    elif args.ncbitid:
                        strain_label.append(ncbi_tid)
                    elif organism.endswith('None') or organism.endswith('t__'):
                        # No strain-level name; use the genus/species field.
                        genus_species = organism.split(';')[-2]
                        strain_label.append('ncbi_tid|%s|ref|%s|organism|%s' % (ncbi_tid, refseq_id, genus_species))
                    else:
                        strain = organism.split(';')[-1]
                        strain_label.append('ncbi_tid|%s|ref|%s|organism|%s' % (ncbi_tid, refseq_id, strain))
            df.index = strain_label
            df.sort_index(axis=0, inplace=True)
            if j > 0 or k > 0:
                print('Note: Organism information was not obtained for all clusters:\n')
                if j > 0:
                    print('%s clusters had no NCBI tid...\n' % j)
                if k > 0:
                    print('%s clusters did not match a full named taxonomy annotation\n' % k)

    outf = open(args.output, 'w') if args.output != '-' else sys.stdout
    try:
        df.to_csv(outf)
    finally:
        # Close only the file we opened; the original 'with' also closed
        # sys.stdout, silencing the message below.
        if outf is not sys.stdout:
            outf.close()
    print('...all done, cleaning up...\n')
    # The temp file exists only when clustering ran here; the original
    # unconditional remove raised FileNotFoundError otherwise.
    if args.clusterme and os.path.exists('hcsv_temp.csv'):
        os.remove('hcsv_temp.csv')