Exemplo n.º 1
0
def subset_fasta():
    """Write a filtered subset of the input FASTA to a file or stdout.

    Command-line arguments come from ``make_arg_parser()``; this function
    reads ``args.input``, ``args.keep`` and ``args.output``. The input file
    is read twice: once to count its records and once to stream them
    through ``filter_fasta`` together with that count and ``args.keep``.
    Each record that comes back is written as a ``>title`` line followed by
    its sequence line, to ``args.output`` or to standard output when
    ``args.output`` is empty.
    """
    import contextlib

    args = make_arg_parser().parse_args()

    # First pass: count the records, because the total is handed to
    # filter_fasta before any record is emitted.
    with open(args.input) as inf:
        num_reads = sum(1 for _ in FASTA(inf).read())

    # Second pass: stream the records through the filter.
    with contextlib.ExitStack() as stack:
        inf = stack.enter_context(open(args.input))
        filtered_fasta_gen = filter_fasta(FASTA(inf).read(), num_reads,
                                          args.keep)
        # Only a file opened here is registered for closing; sys.stdout is
        # left open so later output from the same process still works.
        if args.output:
            outf = stack.enter_context(open(args.output, 'w'))
        else:
            outf = sys.stdout
        for title, data in filtered_fasta_gen:
            outf.write('>%s\n%s\n' % (title, data))
Exemplo n.º 2
0
def subset_fasta():
    """Write a filtered subset of the input FASTA to a file or stdout.

    Command-line arguments come from ``make_arg_parser()``; this function
    reads ``args.input``, ``args.keep`` and ``args.output``. The input file
    is read twice: once to count its records and once to stream them
    through ``filter_fasta`` together with that count and ``args.keep``.
    Each record that comes back is written as a ``>title`` line followed by
    its sequence line, to ``args.output`` or to standard output when
    ``args.output`` is empty.
    """
    import contextlib

    args = make_arg_parser().parse_args()

    # First pass: count the records, because the total is handed to
    # filter_fasta before any record is emitted.
    with open(args.input) as inf:
        num_reads = sum(1 for _ in FASTA(inf).read())

    # Second pass: stream the records through the filter.
    with contextlib.ExitStack() as stack:
        inf = stack.enter_context(open(args.input))
        filtered_fasta_gen = filter_fasta(FASTA(inf).read(), num_reads,
                                          args.keep)
        # Only a file opened here is registered for closing; sys.stdout is
        # left open so later output from the same process still works.
        if args.output:
            outf = stack.enter_context(open(args.output, 'w'))
        else:
            outf = sys.stdout
        for title, data in filtered_fasta_gen:
            outf.write('>%s\n%s\n' % (title, data))
Exemplo n.º 3
0
def shogun_bt2_db(input, output, annotater, extract_id, prefixes, depth, depth_force):
    """Annotate a FASTA file and build a Bowtie2 database from the result.

    The annotated FASTA and its map are written to
    ``<output>/<name>.annotated.fna`` and ``<output>/<name>.annotated.map``.
    When both files already exist the annotation phase is skipped. The
    Bowtie2 build is then run on the annotated FASTA with the index prefix
    ``<output>/bt2/<name>`` and its return value is printed.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read standard input
            (the output name is then ``stdin``).
        output: Directory that receives the annotated files and the ``bt2``
            subdirectory.
        annotater: ``'refseq'`` selects ``RefSeqAnnotater`` and ``'nt'``
            selects ``NTAnnotater``; any other value selects ``GIAnnotater``.
        extract_id: Forwarded to the annotater constructor.
        prefixes: Forwarded to the RefSeq and NT annotaters only.
        depth: Forwarded to the annotater constructor.
        depth_force: Forwarded to the annotater constructor.
    """
    import contextlib
    import sys

    verify_make_dir(output)
    if input == '-':
        output_fn = 'stdin'
    else:
        # splitext strips only the last extension and, unlike splitting on
        # '.', does not produce an empty name for an extension-less file.
        output_fn = os.path.splitext(os.path.basename(input))[0]

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        with contextlib.ExitStack() as stack:
            # Open the input before creating the outputs: a bad input path
            # must not leave empty output files behind, because their mere
            # existence makes the next run skip the annotation phase.
            if input == '-':
                inf = sys.stdin
            else:
                inf = stack.enter_context(open(input))
            output_fna = stack.enter_context(open(outf_fasta, 'w'))
            output_map = stack.enter_context(open(outf_map, 'w'))
            for lines_fna, lines_map in annotater_class.annotate(FASTA(inf).read()):
                output_fna.write(lines_fna)
                output_map.write(lines_map)
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))

    # Build the output BT2 database
    verify_make_dir(os.path.join(output, 'bt2'))
    print(bowtie2_build(outf_fasta, os.path.join(output, 'bt2', output_fn)))
def main():
    """Rewrite FASTA headers with the NCBI taxonomy ID and lineage.

    Reads FASTA records from ``args.input`` (standard input when it is
    ``'-'``) and writes them to ``args.output`` (standard output when it is
    ``'-'``). In each header ``.cluster`` is replaced by ``_cluster``, and
    the text before ``_cluster`` is looked up as a RefSeq accession. When
    the lookup returns a truthy result, its first element is used as the
    taxonomy ID and the header becomes
    ``>ncbi_tid|<tid>|ref|<header>|organism|<lineage>|``, where the lineage
    comes from ``NCBITree.gg_lineage``; otherwise the header becomes
    ``>ref|<header>|``.
    """
    import contextlib

    args = make_arg_parser().parse_args()

    db = RefSeqDatabase()
    nt = NCBITree()
    with contextlib.ExitStack() as stack:
        # Only files opened here are registered for closing; the standard
        # streams are left open for the rest of the process.
        if args.input != '-':
            inf = stack.enter_context(open(args.input, 'r'))
        else:
            inf = sys.stdin
        if args.output != '-':
            outf = stack.enter_context(open(args.output, 'w'))
        else:
            outf = sys.stdout

        for header, sequence in FASTA(inf).read():
            # str.replace is a no-op when '.cluster' is absent.
            header = header.replace('.cluster', '_cluster')
            ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(header.split('_cluster')[0])
            if ncbi_tid:
                ncbi_tid = ncbi_tid[0]
                organism = nt.gg_lineage(ncbi_tid)
                title_line = '>ncbi_tid|%d|ref|%s|organism|%s|\n' % (ncbi_tid, header, organism)
            else:
                title_line = '>ref|%s|\n' % (header)
            outf.write(title_line)
            outf.write(sequence + '\n')
def main():
    """Rewrite FASTA headers with the NCBI taxonomy ID and species name.

    Reads FASTA records from ``args.input`` (standard input when it is
    ``'-'``) and writes them to ``args.output`` (standard output when it is
    ``'-'``). In each header ``.cluster`` is replaced by ``_cluster``, and
    the text before ``_cluster`` is looked up as a RefSeq accession. When
    the lookup returns a truthy result, its first element is used as the
    taxonomy ID and the header becomes
    ``>ncbi_tid|<tid>|ref|<header>|organism|<name>|``, where the name is the
    last ``;``-separated field of ``NCBITree.gg_lineage`` with ``s__``
    removed; otherwise the header becomes ``>ref|<header>|``.
    """
    import contextlib

    args = make_arg_parser().parse_args()

    db = RefSeqDatabase()
    nt = NCBITree()
    with contextlib.ExitStack() as stack:
        # Only files opened here are registered for closing; the standard
        # streams are left open for the rest of the process.
        if args.input != '-':
            inf = stack.enter_context(open(args.input, 'r'))
        else:
            inf = sys.stdin
        if args.output != '-':
            outf = stack.enter_context(open(args.output, 'w'))
        else:
            outf = sys.stdout

        for header, sequence in FASTA(inf).read():
            # str.replace is a no-op when '.cluster' is absent.
            header = header.replace('.cluster', '_cluster')
            ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(header.split('_cluster')[0])
            if ncbi_tid:
                ncbi_tid = ncbi_tid[0]
                organism = nt.gg_lineage(ncbi_tid)
                genus_species = organism.split(';')[-1].replace('s__', '')
                title_line = '>ncbi_tid|%d|ref|%s|organism|%s|\n' % (ncbi_tid, header, genus_species)
            else:
                title_line = '>ref|%s|\n' % (header)
            outf.write(title_line)
            outf.write(sequence + '\n')
Exemplo n.º 6
0
def extract_genome_lengths(input, map, output):
    """Sum sequence lengths per map label and write them as tab-separated text.

    Args:
        input: Source handed to ``FASTA`` for reading records.
        map: Iterator of tab-separated lines, advanced once per FASTA
            record; the second column (with ``'; '`` collapsed to ``';'``)
            is the label the record's length is added to.
        output: Object with a ``write`` method; receives one
            ``label<TAB>total`` line per distinct label.
    """
    totals = defaultdict(int)
    for _header, seq in FASTA(input).read():
        # One map line is consumed for every FASTA record, in order.
        columns = next(map).rstrip().split('\t')
        label = columns[1].replace('; ', ';')
        totals[label] += len(seq)

    for label, total in totals.items():
        output.write('%s\t%s\n' % (label, total))
def extract_genome_lengths(input, map, output):
    """Sum sequence lengths per map label and write them as tab-separated text.

    Args:
        input: Source handed to ``FASTA`` for reading records.
        map: Iterator of tab-separated lines, advanced once per FASTA
            record; the second column (with ``'; '`` collapsed to ``';'``)
            is the label the record's length is added to.
        output: Object with a ``write`` method; receives one
            ``label<TAB>total`` line per distinct label.
    """
    totals = defaultdict(int)
    for _header, seq in FASTA(input).read():
        # One map line is consumed for every FASTA record, in order.
        columns = next(map).rstrip().split('\t')
        label = columns[1].replace('; ', ';')
        totals[label] += len(seq)

    for label, total in totals.items():
        output.write('%s\t%s\n' % (label, total))
def filter_dusted_fasta(input, threshold, output):
    """Mask each record and keep those with enough unmasked characters.

    Every character outside ``A-Z`` is replaced by ``N``. A record is
    written to ``output`` (a ``>title`` line, then the masked sequence) only
    when the fraction of its characters that are not ``N`` is strictly
    greater than ``threshold``. Empty sequences and sequences consisting
    only of ``N`` are always dropped.

    Args:
        input: Source handed to ``FASTA`` for reading records.
        threshold: Minimum (exclusive) fraction of non-``N`` characters.
        output: Object with a ``write`` method that receives kept records.
    """
    non_upper = re.compile('[^A-Z]')

    for title, seq in FASTA(input).read():
        seq = non_upper.sub('N', seq)
        if not seq:
            continue
        # float() keeps the ratio below a true division on Python 2 as well.
        hits = float(len(seq) - seq.count('N'))
        if hits and hits / len(seq) > threshold:
            output.write('>%s\n' % title)
            # seq is already masked, so it is written as-is.
            output.write('%s\n' % seq)
Exemplo n.º 9
0
def filter_dusted_fasta(input, threshold, output):
    """Mask each record and keep those with enough unmasked characters.

    Every character outside ``A-Z`` is replaced by ``N``. A record is
    written to ``output`` (a ``>title`` line, then the masked sequence) only
    when the fraction of its characters that are not ``N`` is strictly
    greater than ``threshold``. Empty sequences and sequences consisting
    only of ``N`` are always dropped.

    Args:
        input: Source handed to ``FASTA`` for reading records.
        threshold: Minimum (exclusive) fraction of non-``N`` characters.
        output: Object with a ``write`` method that receives kept records.
    """
    non_upper = re.compile('[^A-Z]')

    for title, seq in FASTA(input).read():
        seq = non_upper.sub('N', seq)
        if not seq:
            continue
        # float() keeps the ratio below a true division on Python 2 as well.
        hits = float(len(seq) - seq.count('N'))
        if hits and hits / len(seq) > threshold:
            output.write('>%s\n' % title)
            # seq is already masked, so it is written as-is.
            output.write('%s\n' % seq)
Exemplo n.º 10
0
def compile_ofu_dnasequences(inf_d, bgc, dna_outf):
    """Copy the records whose header contains ``bgc`` to ``dna_outf``.

    Headers have ``.cluster`` rewritten to ``_cluster`` before the
    substring test and before being written.

    Args:
        inf_d: Source handed to ``FASTA`` for reading records.
        bgc: Value whose ``str()`` form must occur in the header.
        dna_outf: Object with a ``write`` method; receives a ``>header``
            line and a sequence line per matching record.

    Returns:
        ``dna_outf``, the same object that was passed in.
    """
    wanted = str(bgc)
    for header, sequence in FASTA(inf_d).read():
        # A no-op for headers that do not contain '.cluster'.
        header = header.replace('.cluster', '_cluster')
        if wanted not in header:
            continue
        dna_outf.write('>' + header + '\n')
        dna_outf.write(sequence + '\n')
    return dna_outf
Exemplo n.º 11
0
def compile_ofu_sequences(inf_m, bgc, aa_outf):
    """Copy the records whose header contains ``bgc`` to ``aa_outf``.

    Headers have ``.cluster`` rewritten to ``_cluster`` before the
    substring test and before being written.

    Args:
        inf_m: Source handed to ``FASTA`` for reading records.
        bgc: Value whose ``str()`` form must occur in the header.
        aa_outf: Object with a ``write`` method; receives a ``>header``
            line and a sequence line per matching record.

    Returns:
        ``aa_outf``, the same object that was passed in.
    """
    wanted = str(bgc)
    for header, sequence in FASTA(inf_m).read():
        # A no-op for headers that do not contain '.cluster'.
        header = header.replace('.cluster', '_cluster')
        if wanted not in header:
            continue
        aa_outf.write('>' + header + '\n')
        aa_outf.write(sequence + '\n')
    return aa_outf
Exemplo n.º 12
0
def cmd_open_fasta(fastas):
    """Open one or more FASTA files and yield a record reader for each.

    For every path an ``Opening`` message is echoed, the path is opened
    with ``click.open_file`` and the result of ``FASTA(handle).read()`` is
    yielded. Any ``Exception`` raised along the way is reported through
    ``click.echo(..., err=True)`` and the loop moves on to the next path.

    NOTE(review): the handle returned by ``click.open_file`` is never closed
    here; confirm whether consumers rely on it staying open.
    """
    for path in fastas:
        try:
            click.echo('Opening "%s"' % path)
            reader = FASTA(click.open_file(path))
            yield reader.read()
        except Exception as exc:
            click.echo('Could not open FASTA "%s": %s' % (path, exc), err=True)
Exemplo n.º 13
0
def shogun_bt2_db(input, output, annotater, extract_id, prefixes, depth,
                  depth_force):
    """Annotate a FASTA file and build a Bowtie2 database from the result.

    The annotated FASTA and its map are written to
    ``<output>/<name>.annotated.fna`` and ``<output>/<name>.annotated.map``.
    When both files already exist the annotation phase is skipped. The
    Bowtie2 build is then run on the annotated FASTA with the index prefix
    ``<output>/bt2/<name>`` and its return value is printed.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read standard input
            (the output name is then ``stdin``).
        output: Directory that receives the annotated files and the ``bt2``
            subdirectory.
        annotater: ``'refseq'`` selects ``RefSeqAnnotater`` and ``'nt'``
            selects ``NTAnnotater``; any other value selects ``GIAnnotater``.
        extract_id: Forwarded to the annotater constructor.
        prefixes: Forwarded to the RefSeq and NT annotaters only.
        depth: Forwarded to the annotater constructor.
        depth_force: Forwarded to the annotater constructor.
    """
    import contextlib
    import sys

    verify_make_dir(output)
    if input == '-':
        output_fn = 'stdin'
    else:
        # splitext strips only the last extension and, unlike splitting on
        # '.', does not produce an empty name for an extension-less file.
        output_fn = os.path.splitext(os.path.basename(input))[0]

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id,
                                              prefixes,
                                              db,
                                              tree,
                                              depth=depth,
                                              depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id,
                                          prefixes,
                                          db,
                                          tree,
                                          depth=depth,
                                          depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id,
                                          db,
                                          tree,
                                          depth=depth,
                                          depth_force=depth_force)

        with contextlib.ExitStack() as stack:
            # Open the input before creating the outputs: a bad input path
            # must not leave empty output files behind, because their mere
            # existence makes the next run skip the annotation phase.
            if input == '-':
                inf = sys.stdin
            else:
                inf = stack.enter_context(open(input))
            output_fna = stack.enter_context(open(outf_fasta, 'w'))
            output_map = stack.enter_context(open(outf_map, 'w'))
            for lines_fna, lines_map in annotater_class.annotate(
                    FASTA(inf).read()):
                output_fna.write(lines_fna)
                output_map.write(lines_map)
    else:
        print(
            "Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file."
            % (outf_fasta, outf_map))

    # Build the output BT2 database
    verify_make_dir(os.path.join(output, 'bt2'))
    print(bowtie2_build(outf_fasta, os.path.join(output, 'bt2', output_fn)))
Exemplo n.º 14
0
def build_cluster_map(inf, bread='ref|,|'):
    """Group cluster labels by the leading fields of each reference ID.

    For every FASTA header, ``.cluster`` is rewritten to ``_cluster`` and
    the reference ID is extracted with ``find_between`` using the two
    delimiters in ``bread``. The ID is split on ``_``: its first three
    fields (re-joined with ``_``) form the key, and its last two fields
    (re-joined with ``_``) form the value added to that key's set.

    Args:
        inf: Source handed to ``FASTA`` for reading records.
        bread: ``'<begin>,<end>'`` delimiter pair passed to ``find_between``.

    Returns:
        A ``defaultdict(set)`` mapping each key to its set of values.
    """
    begin, end = bread.split(',')
    cluster_map = defaultdict(set)
    for header, _sequence in FASTA(inf).read():
        # A no-op for headers that do not contain '.cluster'.
        normalized = header.replace('.cluster', '_cluster')
        fields = find_between(normalized, begin, end).split('_')
        cluster_map['_'.join(fields[:3])].add('_'.join(fields[-2:]))
    return cluster_map
def build_cluster_map(inf, bread='ref|,|'):
    """Group cluster labels by the leading fields of each reference ID.

    For every FASTA header, ``.cluster`` is rewritten to ``_cluster`` and
    the reference ID is extracted with ``find_between`` using the two
    delimiters in ``bread``. The ID is split on ``_``: its first three
    fields (re-joined with ``_``) form the key, and its last field is the
    value added to that key's set.

    Args:
        inf: Source handed to ``FASTA`` for reading records.
        bread: ``'<begin>,<end>'`` delimiter pair passed to ``find_between``.

    Returns:
        A ``defaultdict(set)`` mapping each key to its set of values.
    """
    begin, end = bread.split(',')
    cluster_map = defaultdict(set)
    for header, _sequence in FASTA(inf).read():
        # A no-op for headers that do not contain '.cluster'.
        normalized = header.replace('.cluster', '_cluster')
        fields = find_between(normalized, begin, end).split('_')
        cluster_map['_'.join(fields[:3])].add(fields[-1])
    return cluster_map
def _concatenate_fasta_dir(src_dir, dest_path):
    """Rewrite every FASTA file found in ``src_dir`` into ``dest_path``."""
    with open(dest_path, 'w') as outfile:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # combined file reproducible from run to run.
        for name in sorted(os.listdir(src_dir)):
            with open(os.path.join(src_dir, name), 'r') as infile:
                for header, sequence in FASTA(infile).read():
                    outfile.write('>' + header + '\n')
                    outfile.write(sequence + '\n')


def compile_files(outpath):
    """Combine the per-file protein and DNA FASTA files under ``outpath``.

    Every file in ``<outpath>/antismash_db_protein_seqs`` is re-emitted into
    ``<outpath>/asDB_protein_seqs.mpfa``, and every file in
    ``<outpath>/antismash_db_dna_seqs`` into ``<outpath>/asDB_dna_seqs.fna``.
    Each record is written as a ``>header`` line followed by one sequence
    line. Existing output files are overwritten.

    Args:
        outpath: Directory holding the two input subdirectories; also where
            the two combined files are written.

    Returns:
        None.
    """
    _concatenate_fasta_dir(
        os.path.join(outpath, 'antismash_db_protein_seqs'),
        os.path.join(outpath, 'asDB_protein_seqs.mpfa'))
    _concatenate_fasta_dir(
        os.path.join(outpath, 'antismash_db_dna_seqs'),
        os.path.join(outpath, 'asDB_dna_seqs.fna'))
    return None
def main():
    """Prefix FASTA headers with the NCBI taxonomy ID and assembly label.

    Reads FASTA records from ``args.input`` (standard input when it is
    ``'-'``) and writes them to ``args.output`` (standard output when it is
    ``'-'``). The first whitespace-separated token of each header is looked
    up as a RefSeq accession, and the header is rewritten as
    ``>ncbi_tid|<tid>|assembly|<label>|ref|<header>``, where the label is
    the part of the input file name before ``_genomic``.
    """
    import contextlib

    args = make_arg_parser().parse_args()

    db = RefSeqDatabase()
    assembly_version = os.path.basename(args.input).split('_genomic')[0]
    with contextlib.ExitStack() as stack:
        # Only files opened here are registered for closing; the standard
        # streams are left open for the rest of the process.
        if args.input != '-':
            inf = stack.enter_context(open(args.input, 'r'))
        else:
            inf = sys.stdin
        if args.output != '-':
            outf = stack.enter_context(open(args.output, 'w'))
        else:
            outf = sys.stdout

        for header, sequence in FASTA(inf).read():
            accession = header.split()[0]
            # NOTE(review): the lookup result is indexed directly, so an
            # accession the database cannot resolve raises here; confirm
            # that aborting is the intended behaviour.
            ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(accession)[0]
            outf.write('>ncbi_tid|%d|assembly|%s|ref|%s\n' % (ncbi_tid, assembly_version, header))
            outf.write(sequence + '\n')
def main():
    """Prefix FASTA headers with the NCBI taxonomy ID and assembly label.

    Reads FASTA records from ``args.input`` (standard input when it is
    ``'-'``) and writes them to ``args.output`` (standard output when it is
    ``'-'``). The first whitespace-separated token of each header is looked
    up as a RefSeq accession, and the header is rewritten as
    ``>ncbi_tid|<tid>|assembly|<label>|ref|<header>``, where the label is
    the part of the input file name before ``_genomic``.
    """
    import contextlib

    args = make_arg_parser().parse_args()

    db = RefSeqDatabase()
    assembly_version = os.path.basename(args.input).split('_genomic')[0]
    with contextlib.ExitStack() as stack:
        # Only files opened here are registered for closing; the standard
        # streams are left open for the rest of the process.
        if args.input != '-':
            inf = stack.enter_context(open(args.input, 'r'))
        else:
            inf = sys.stdin
        if args.output != '-':
            outf = stack.enter_context(open(args.output, 'w'))
        else:
            outf = sys.stdout

        for header, sequence in FASTA(inf).read():
            accession = header.split()[0]
            # NOTE(review): the lookup result is indexed directly, so an
            # accession the database cannot resolve raises here; confirm
            # that aborting is the intended behaviour.
            ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(
                accession)[0]
            outf.write('>ncbi_tid|%d|assembly|%s|ref|%s\n' %
                       (ncbi_tid, assembly_version, header))
            outf.write(sequence + '\n')
Exemplo n.º 19
0
def pluckify_mibig(inf, outaa, gbk_dd):
    """Rewrite MIBiG FASTA headers with taxonomy and organism fields.

    Each header is split on ``|``; its first field is the key used to look
    up ``gbk_dd``. Records whose looked-up first element equals the string
    ``'None'`` are skipped. The remaining records are written to ``outaa``
    with a header assembled from the taxonomy ID, the key, header fields 1,
    2 and 6, and the looked-up second element.

    Args:
        inf: Source handed to ``FASTA`` for reading records.
        outaa: Object with a ``write`` method that receives the output.
        gbk_dd: Mapping from the header's first field to an indexable whose
            element 0 is the taxonomy ID and element 1 the organism text.

    Returns:
        None.
    """
    for header, sequence in FASTA(inf).read():
        fields = header.split('|')
        bgc_id = fields[0]
        bgc_info = gbk_dd[bgc_id]
        bgc_tid = bgc_info[0]
        # The missing-ID marker is the literal string 'None', not None.
        if bgc_tid == 'None':
            continue
        bgc_bug = bgc_info[1]
        title_parts = [
            '>', 'ncbi_tid|', str(bgc_tid), '|mibig|', bgc_id,
            '.1_cluster001', '_', fields[1], '_', fields[2],
            '|genbank|', fields[6], '|organism|', bgc_bug, '\n',
        ]
        outaa.write(''.join(title_parts))
        outaa.write(sequence + '\n')
    return None
Exemplo n.º 20
0
def shogun_utree_db(input, output, annotater, extract_id, threads, prefixes, depth, depth_force):
    """Annotate a FASTA file and build a compressed UTree database from it.

    The annotated FASTA and its map are written to
    ``<output>/<name>.annotated.fna`` and ``<output>/<name>.annotated.map``.
    When both files already exist the annotation phase is skipped. Unless
    ``<output>/utree/<name>.ctr`` already exists, the uncompressed tree
    ``<name>.utr`` is built (if missing), compressed into ``<name>.ctr`` and
    then removed.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read standard input
            (the output name is then ``stdin``).
        output: Directory that receives the annotated files and the
            ``utree`` subdirectory.
        annotater: ``'refseq'``, ``'nt'`` or ``'ncbi'`` select the matching
            annotater class; any other value selects ``GIAnnotater``.
        extract_id: Forwarded to the annotater constructor.
        threads: Forwarded to ``utree_build``.
        prefixes: Forwarded to the RefSeq and NT annotaters only.
        depth: Forwarded to the annotater constructor.
        depth_force: Forwarded to the annotater constructor.
    """
    import contextlib
    import sys

    verify_make_dir(output)
    if input == '-':
        output_fn = 'stdin'
    else:
        # splitext strips only the last extension and, unlike splitting on
        # '.', does not produce an empty name for an extension-less file.
        output_fn = os.path.splitext(os.path.basename(input))[0]

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'ncbi':
            annotater_class = NCBIAnnotater(extract_id, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        with contextlib.ExitStack() as stack:
            # Open the input before creating the outputs: a bad input path
            # must not leave empty output files behind, because their mere
            # existence makes the next run skip the annotation phase.
            if input == '-':
                inf = sys.stdin
            else:
                inf = stack.enter_context(open(input))
            output_fna = stack.enter_context(open(outf_fasta, 'w'))
            output_map = stack.enter_context(open(outf_map, 'w'))
            for lines_fna, lines_map in annotater_class.annotate(FASTA(inf).read()):
                output_fna.write(lines_fna)
                output_map.write(lines_map)
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))

    # Build the output CTR
    verify_make_dir(os.path.join(output, 'utree'))
    path_uncompressed_tree = os.path.join(output, 'utree', output_fn + '.utr')
    path_compressed_tree = os.path.join(output, 'utree', output_fn + '.ctr')
    if os.path.exists(path_compressed_tree):
        print('Compressed tree database file %s exists, skipping this step.' % path_compressed_tree)
    else:
        # An existing uncompressed tree is reused rather than rebuilt.
        if not os.path.exists(path_uncompressed_tree):
            print(utree_build(outf_fasta, outf_map, path_uncompressed_tree, threads=threads))
        print(utree_compress(path_uncompressed_tree, path_compressed_tree))
        os.remove(path_uncompressed_tree)
Exemplo n.º 21
0
def shogun_utree_db(input, output, annotater, extract_id, threads, prefixes, depth, depth_force):
    """Annotate a FASTA file and build a compressed UTree database from it.

    The annotated FASTA and its map are written to
    ``<output>/<name>.annotated.fna`` and ``<output>/<name>.annotated.map``.
    When both files already exist the annotation phase is skipped. Unless
    ``<output>/utree/<name>.ctr`` already exists, the uncompressed tree
    ``<name>.utr`` is built (if missing), compressed into ``<name>.ctr`` and
    then removed.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read standard input
            (the output name is then ``stdin``).
        output: Directory that receives the annotated files and the
            ``utree`` subdirectory.
        annotater: ``'refseq'`` selects ``RefSeqAnnotater`` and ``'nt'``
            selects ``NTAnnotater``; any other value selects ``GIAnnotater``.
        extract_id: Forwarded to the annotater constructor.
        threads: Forwarded to ``utree_build``.
        prefixes: Forwarded to the RefSeq and NT annotaters only.
        depth: Forwarded to the annotater constructor.
        depth_force: Forwarded to the annotater constructor.
    """
    import contextlib
    import sys

    verify_make_dir(output)
    if input == '-':
        output_fn = 'stdin'
    else:
        # splitext strips only the last extension and, unlike splitting on
        # '.', does not produce an empty name for an extension-less file.
        output_fn = os.path.splitext(os.path.basename(input))[0]

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        with contextlib.ExitStack() as stack:
            # Open the input before creating the outputs: a bad input path
            # must not leave empty output files behind, because their mere
            # existence makes the next run skip the annotation phase.
            if input == '-':
                inf = sys.stdin
            else:
                inf = stack.enter_context(open(input))
            output_fna = stack.enter_context(open(outf_fasta, 'w'))
            output_map = stack.enter_context(open(outf_map, 'w'))
            for lines_fna, lines_map in annotater_class.annotate(FASTA(inf).read()):
                output_fna.write(lines_fna)
                output_map.write(lines_map)
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))

    # Build the output CTR
    verify_make_dir(os.path.join(output, 'utree'))
    path_uncompressed_tree = os.path.join(output, 'utree', output_fn + '.utr')
    path_compressed_tree = os.path.join(output, 'utree', output_fn + '.ctr')
    if os.path.exists(path_compressed_tree):
        print('Compressed tree database file %s exists, skipping this step.' % path_compressed_tree)
    else:
        # An existing uncompressed tree is reused rather than rebuilt.
        if not os.path.exists(path_uncompressed_tree):
            print(utree_build(outf_fasta, outf_map, path_uncompressed_tree, threads=threads))
        print(utree_compress(path_uncompressed_tree, path_compressed_tree))
        os.remove(path_uncompressed_tree)
Exemplo n.º 22
0
def refseq_annotate(input, output, extract_refseq_id, prefixes):
    """Keep records with a selected accession prefix and tag their taxonomy ID.

    The accession is extracted from ``'>' + title`` with ``find_between``.
    Records whose accession does not start with one of the selected
    two-character prefixes are dropped. For the rest, the title is prefixed
    with ``ncbi_tid|<tid>|`` when the database lookup returns a truthy
    result, and the record is written to ``output``.

    Args:
        input: Source handed to ``FASTA`` for reading records.
        output: Object with a ``write`` method that receives kept records.
        extract_refseq_id: ``'<begin>,<end>'`` delimiter pair passed to
            ``find_between``.
        prefixes: Comma-separated prefixes; if ``'*'`` is among them, every
            key of ``db.refseq_prefix_mapper`` is selected.
    """
    db = RefSeqDatabase()

    requested = prefixes.split(',')
    begin, end = extract_refseq_id.split(',')

    # A '*' entry selects every prefix the database knows about.
    if '*' in requested:
        prefix_set = set(db.refseq_prefix_mapper.keys())
    else:
        prefix_set = set(requested)

    for title, seq in FASTA(input).read():
        # The delimiters are matched against the title including its '>'.
        accession = find_between('>' + title, begin, end)
        if accession[:2] not in prefix_set:
            continue
        ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(accession)
        if ncbi_tid:
            title_line = '>ncbi_tid|%d|%s' % (ncbi_tid[0], title)
        else:
            title_line = '>' + title
        output.write('%s\n%s\n' % (title_line, seq))
Exemplo n.º 23
0
def annotate_fasta(input, output, extract_refseq_id, prefixes, depth,
                   depth_force):
    """Annotate a FASTA file and write the annotated FASTA and map files.

    The records are passed through ``refseq_annotater``; each pair it yields
    is written to ``<output>/<name>.annotated.fna`` and
    ``<output>/<name>.annotated.map`` respectively.

    Args:
        input: Path of the FASTA file, or ``'-'`` to read standard input
            (the output name is then ``stdin``).
        output: Directory that receives the two output files.
        extract_refseq_id: Forwarded to ``refseq_annotater``.
        prefixes: Forwarded to ``refseq_annotater``.
        depth: Forwarded to ``refseq_annotater``.
        depth_force: Forwarded to ``refseq_annotater``.
    """
    import contextlib

    verify_make_dir(output)

    if input == '-':
        output_fn = 'stdin'
    else:
        # splitext strips only the last extension and, unlike splitting on
        # '.', does not produce an empty name for an extension-less file.
        output_fn = os.path.splitext(os.path.basename(input))[0]

    with contextlib.ExitStack() as stack:
        # Only files opened here are registered for closing; sys.stdin is
        # left open for the rest of the process.
        if input != '-':
            inf = stack.enter_context(open(input, 'r'))
        else:
            inf = sys.stdin
        output_fna = stack.enter_context(
            open(os.path.join(output, output_fn + '.annotated.fna'), 'w'))
        output_map = stack.enter_context(
            open(os.path.join(output, output_fn + '.annotated.map'), 'w'))

        annotater = refseq_annotater(FASTA(inf).read(),
                                     prefixes,
                                     extract_refseq_id,
                                     depth=depth,
                                     depth_force=depth_force)
        for lines_fna, lines_map in annotater:
            output_fna.write(lines_fna)
            output_map.write(lines_map)
Exemplo n.º 24
0
def refseq_annotate(input, output, extract_refseq_id, prefixes):
    """Keep records with a selected accession prefix and tag their taxonomy ID.

    The accession is extracted from ``'>' + title`` with ``find_between``.
    Records whose accession does not start with one of the selected
    two-character prefixes are dropped. For the rest, the title is prefixed
    with ``ncbi_tid|<tid>|`` when the database lookup returns a truthy
    result, and the record is written to ``output``.

    Args:
        input: Source handed to ``FASTA`` for reading records.
        output: Object with a ``write`` method that receives kept records.
        extract_refseq_id: ``'<begin>,<end>'`` delimiter pair passed to
            ``find_between``.
        prefixes: Comma-separated prefixes; if ``'*'`` is among them, every
            key of ``db.refseq_prefix_mapper`` is selected.
    """
    db = RefSeqDatabase()

    requested = prefixes.split(',')
    begin, end = extract_refseq_id.split(',')

    # A '*' entry selects every prefix the database knows about.
    if '*' in requested:
        prefix_set = set(db.refseq_prefix_mapper.keys())
    else:
        prefix_set = set(requested)

    for title, seq in FASTA(input).read():
        # The delimiters are matched against the title including its '>'.
        accession = find_between('>' + title, begin, end)
        if accession[:2] not in prefix_set:
            continue
        ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(accession)
        if ncbi_tid:
            title_line = '>ncbi_tid|%d|%s' % (ncbi_tid[0], title)
        else:
            title_line = '>' + title
        output.write('%s\n%s\n' % (title_line, seq))
Exemplo n.º 25
0
def soft_mask2hard_mask(input, output):
    """Rewrite a FASTA with every character outside ``A-Z`` replaced by ``N``.

    Args:
        input: Source handed to ``FASTA`` for reading records.
        output: Object with a ``write`` method; receives a ``>title`` line
            and the masked sequence line for every record.
    """
    non_upper = re.compile('[^A-Z]')
    for title, seq in FASTA(input).read():
        output.write('>%s\n' % title)
        output.write('%s\n' % non_upper.sub('N', seq))
Exemplo n.º 26
0
def soft_mask2hard_mask(input, output):
    """Rewrite a FASTA with every character outside ``A-Z`` replaced by ``N``.

    Args:
        input: Source handed to ``FASTA`` for reading records.
        output: Object with a ``write`` method; receives a ``>title`` line
            and the masked sequence line for every record.
    """
    non_upper = re.compile('[^A-Z]')
    for title, seq in FASTA(input).read():
        output.write('>%s\n' % title)
        output.write('%s\n' % non_upper.sub('N', seq))
Exemplo n.º 27
0
#!/usr/bin/env python
import ipdb
from ninja_utils.parsers import FASTA
import csv

# Map the first column of the tab-separated mapping file to its second column.
with open(snakemake.input.mapping) as inf:
    csv_inf = csv.reader(inf, delimiter='\t')
    next(csv_inf)  # skip the first row (presumably a header; verify)
    accession2taxid = {row[0]: row[1] for row in csv_inf}

# Re-emit only the FASTA records whose key is present in the mapping, with
# '|kraken:taxid|<taxid>' appended to the title.
with open(snakemake.output[0], 'w') as outf, \
        open(snakemake.input.fasta[0]) as inf:
    for title, seq in FASTA(inf).read():
        # The lookup key is the title text before its first '.'.
        accession = title.split(".")[0]
        if accession not in accession2taxid:
            continue
        outf.write('>%s|kraken:taxid|%s\n%s\n' % (
            title, accession2taxid[accession], seq))