Example #1
def _filter_bad_species(cursor, read2tax, bad_species, tax_tree):
    questionable_reads = []
    for read, tax in read2tax.iteritems():
        if tax in bad_species:
            questionable_reads.append(read)

    with timeit('EXTRACTING BAD SPECIES READ INFO'):
        cursor.execute(
            'SELECT * FROM alignment WHERE read_id in (%s)' %
            (','.join(['?'] * len(questionable_reads))), questionable_reads)
        data = cursor.fetchall()

    read2aln = defaultdict(list)
    for d in data:
        read2aln[d[0]].append(d[1:])

    new_read2tax = {}
    with timeit('ASSIGNING NEW OTU TO BAD SPECIES READS'):
        reads = set(map(lambda d: d[0], data))
        for read in reads:
            alns = read2aln[read]
            # alns = map(lambda d: d[1:], filter(lambda d: d[0] == read, data))
            new_read2tax[read] = assign_OTU_from_scores(
                alns, tax_tree, bad_species)

    return new_read2tax
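
Every example on this page times a block of work with a timeit context manager imported from the project's utility module; its implementation is not part of these excerpts. A minimal sketch of such a timer, assuming it simply prints the labeled wall-clock time, might look like the following (illustrative only, not the project's code):

import time
from contextlib import contextmanager

@contextmanager
def timeit(label):
    # print the elapsed wall-clock time for the enclosed block
    start = time.time()
    try:
        yield
    finally:
        print('%s: %.2f s' % (label, time.time() - start))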
Example #2
def parse_tab_delimited(input_file,
                        db_type,
                        at_once=1e5,
                        detailed=False,
                        read_alns=defaultdict(list),
                        target_seqs={},
                        filter_low_scoring=True,
                        annotate=True,
                        entry_cnt=None):

    if entry_cnt is None:
        with timeit('Line count'):
            with open(input_file) as f:
                line_count = sum(1 for line in f)
    else:
        line_count = entry_cnt
    rolls = int(line_count / at_once + 1)

    with timeit('Pure file parsing'):
        with open(input_file, 'r') as fin:
            for i in xrange(rolls):
                with oneline_timeit('Step %d/%d' % (i, rolls)):
                    new_reads = _parse_n_lines(fin, at_once, read_alns,
                                               target_seqs, db_type)
                    if filter_low_scoring:
                        _filter_low_scoring(read_alns, 100., 0.01, new_reads)

    if not annotate:
        return read_alns, target_seqs
    else:
        return annotate_targets(read_alns, target_seqs)
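
The chunked parsing above delegates to a _parse_n_lines helper that is not shown in the excerpt. Assuming it consumes a fixed number of lines from the open file on each call, a generic chunk reader in the same spirit might be written as follows (a sketch, not the project's helper):

from itertools import islice

def iter_line_chunks(fin, n):
    # yield successive lists of at most n lines from an open file object
    while True:
        chunk = list(islice(fin, int(n)))
        if not chunk:
            break
        yield chunk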
Example #3
def fill_blast_aln_table(db, cursor, data_access, aln_file):
	gis = set()
	sql = 'INSERT INTO %s_tmp(read_id, version, nucl_gi, tax_id, score, start, end, strand) VALUES (?,?,?,?,?,?,?,?)' % _ALN_TABLE_NAME_

	with timeit('PARSING BLAST'):
		for read_id, aln_data in BLASTParser().parse_file(aln_file):
			gis.add(aln_data.gi)

	with timeit('LOADING TAXIDS'):
		tax_ids = data_access.get_taxids(list(gis), format=dict)

	with timeit('INSERTING INTO DATABASE'):
		for read_id, aln_data in BLASTParser().parse_file(aln_file):
			tax_id = tax_ids.get(aln_data.gi, -1)
			cursor.execute(sql, (read_id,
								 aln_data.nucleotide_accession,
								 aln_data.gi,
								 tax_id,
								 aln_data.score,
								 aln_data.start,
								 aln_data.stop,
								 aln_data.strand))
	with timeit('REORDERING DATA'):
		cursor.execute('CREATE TABLE alignment(read_id TEXT, version TEXT, nucl_gi INTEGER, tax_id INTEGER, score REAL, start INTEGER, end INTEGER, strand TEXT)')
		cursor.execute('INSERT INTO alignment (read_id, version, nucl_gi, tax_id, score, start, end, strand) SELECT read_id, version, nucl_gi, tax_id, score, start, end, strand FROM alignment_tmp ORDER BY read_id')
		cursor.execute('DROP TABLE alignment_tmp')
		db.commit()

	db.close()
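
The reordering step above follows a two-phase SQLite pattern: bulk-insert into a temporary table, copy the rows into the final table ordered by read_id, and drop the temporary table. A self-contained toy version of that pattern, using an in-memory database and an illustrative two-column schema rather than the project's, is sketched below:

import sqlite3

db = sqlite3.connect(':memory:')
cursor = db.cursor()
cursor.execute('CREATE TABLE alignment_tmp(read_id TEXT, score REAL)')
cursor.executemany('INSERT INTO alignment_tmp(read_id, score) VALUES (?,?)',
                   [('r2', 10.0), ('r1', 42.0), ('r1', 37.5)])
cursor.execute('CREATE TABLE alignment(read_id TEXT, score REAL)')
cursor.execute('INSERT INTO alignment SELECT read_id, score '
               'FROM alignment_tmp ORDER BY read_id')
cursor.execute('DROP TABLE alignment_tmp')
db.commit()
print(cursor.execute('SELECT * FROM alignment').fetchall())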
Example #4
File: sam.py  Project: abulovic/metagenomix
def parse_cds_sam(input_file,
                  db_type,
                  binary=False,
                  annotate=True,
                  read_alns=defaultdict(list),
                  target_seqs={},
                  entry_cnt=None,
                  detailed=False):
    with timeit('Pysam load of the %s file (%s)' % (input_file.split(
            os.path.sep)[-1], get_appropriate_file_size(input_file))):
        if binary:
            samfile = pysam.Samfile(input_file, 'rb')
        else:
            samfile = pysam.Samfile(input_file, 'r')

    if entry_cnt is not None:
        _1perc_reads = max(1, entry_cnt / 100)
    data_access = DataAccess()
    unmapped = set()
    lengths = np.array(samfile.lengths)
    with timeit('Sequence iteration'):
        for i, ar in enumerate(samfile.fetch()):
            if entry_cnt is not None:
                if i % _1perc_reads == 0:
                    print '%d %%' % (i / _1perc_reads)
            else:
                if i % 10000 == 0:
                    print 'Processed %d aligned reads.' % i
            if ar.is_unmapped:
                unmapped.add(ar.qname)
                continue

            read_id = ar.qname.split()[0]
            tstart, tend = ar.aend - ar.alen, ar.aend
            qstart, qend = ar.qstart, ar.qend
            if ar.tid == -1:
                continue
            target_name = samfile.getrname(ar.tid)
            target_len = lengths[ar.tid]
            target_seq = create_sequence(target_name, target_len, db_type)
            if target_seq.get_id() not in target_seqs:
                target_seqs[target_seq.get_id()] = target_seq

            aln = SamAlignment(read_id, target_seq.get_id(), qstart, qend,
                               tstart, tend, ar.mapq)
            if not ar.is_secondary:
                aln.is_best = True
            read_alns[read_id].append(aln)

    # close the SAM/BAM file handle before returning
    samfile.close()

    if not annotate:
        return read_alns, target_seqs
    else:
        return annotate_targets(read_alns, target_seqs)
Example #5
File: sam.py  Project: abulovic/metagenomix
def parse_cds_sam(input_file, db_type, binary=False, annotate=True,
				  read_alns=defaultdict(list), target_seqs={}, entry_cnt=None, detailed=False):
	with timeit('Pysam load of the %s file (%s)' % (input_file.split(os.path.sep)[-1],
													get_appropriate_file_size(input_file))):
		if binary:
			samfile = pysam.Samfile(input_file, 'rb')
		else:
			samfile = pysam.Samfile(input_file, 'r')

	if entry_cnt is not None:
		_1perc_reads = max(1, entry_cnt / 100)
	data_access = DataAccess()
	unmapped = set()
	lengths = np.array(samfile.lengths)
	with timeit('Sequence iteration'):
		for i, ar in enumerate(samfile.fetch()):
			if entry_cnt is not None:
				if i % _1perc_reads == 0:
					print '%d %%' % (i / _1perc_reads)
			else:
				if i % 10000 == 0:
					print 'Processed %d aligned reads.' % i
			if ar.is_unmapped:
				unmapped.add(ar.qname)
				continue

			read_id = ar.qname.split()[0]
			tstart, tend = ar.aend - ar.alen, ar.aend
			qstart, qend = ar.qstart, ar.qend
			if ar.tid == -1:
				continue
			target_name = samfile.getrname(ar.tid)
			target_len = lengths[ar.tid]
			target_seq = create_sequence(target_name, target_len, db_type)
			if target_seq.get_id() not in target_seqs:
				target_seqs[target_seq.get_id()] = target_seq

			aln = SamAlignment(read_id, target_seq.get_id(), qstart, qend, tstart,
							   tend, ar.mapq)
			if not ar.is_secondary:
				aln.is_best = True
			read_alns[read_id].append(aln)

	# close the SAM/BAM file handle before returning
	samfile.close()

	if not annotate:
		return read_alns, target_seqs
	else:
		return annotate_targets(read_alns, target_seqs)
Example #6
def analyze():
	parser = get_aln_option_parser()
	namespace = parser.parse_args()
	db = sqlite3.connect(namespace.db_file)
	cursor = db.cursor()
	empty_reads_file = os.path.sep.join([namespace.output_dir, 'no_alignment_reads.fa'])
	basic_stats_file = os.path.sep.join([namespace.output_dir, 'basic_alignment_stats.txt'])
	phylum_composition_file = os.path.sep.join([namespace.output_dir, 'phylum_composition.csv'])
	no_otu_reads_file = os.path.sep.join([namespace.output_dir, 'no_OTU_assignment_reads.fa'])
	otu_csv = os.path.sep.join([namespace.output_dir, 'OTU_assignment.csv'])
	otu_json = os.path.sep.join([namespace.output_dir, 'OTU_assignment.json'])

	with timeit('LOADING TAX TREE'):
		tt = TaxTree()
	read2tax, low_count_taxa = analysis.perform_OTU_analysis_on_db(db, cursor, tt, empty_reads_file, basic_stats_file, phylum_composition_file)
	taxa = set(read2tax.values())
	num_alns = len(read2tax)

	no_otu_reads = filter(lambda r: read2tax[r] == -1, read2tax.keys())
	save_reads_to_fasta(db, cursor, no_otu_reads, no_otu_reads_file)

	export_OTU_to_csv(read2tax, tt, otu_csv)
	export_OTU_to_json(read2tax, tt, otu_json)

	db.close()
Example #7
def analyze():
    parser = get_aln_option_parser()
    namespace = parser.parse_args()
    db = sqlite3.connect(namespace.db_file)
    cursor = db.cursor()
    empty_reads_file = os.path.sep.join(
        [namespace.output_dir, 'no_alignment_reads.fa'])
    basic_stats_file = os.path.sep.join(
        [namespace.output_dir, 'basic_alignment_stats.txt'])
    phylum_composition_file = os.path.sep.join(
        [namespace.output_dir, 'phylum_composition.csv'])
    no_otu_reads_file = os.path.sep.join(
        [namespace.output_dir, 'no_OTU_assignment_reads.fa'])
    otu_csv = os.path.sep.join([namespace.output_dir, 'OTU_assignment.csv'])
    otu_json = os.path.sep.join([namespace.output_dir, 'OTU_assignment.json'])

    with timeit('LOADING TAX TREE'):
        tt = TaxTree()
    read2tax, low_count_taxa = analysis.perform_OTU_analysis_on_db(
        db, cursor, tt, empty_reads_file, basic_stats_file,
        phylum_composition_file)
    taxa = set(read2tax.values())
    num_alns = len(read2tax)

    no_otu_reads = filter(lambda r: read2tax[r] == -1, read2tax.keys())
    save_reads_to_fasta(db, cursor, no_otu_reads, no_otu_reads_file)

    export_OTU_to_csv(read2tax, tt, otu_csv)
    export_OTU_to_json(read2tax, tt, otu_json)

    db.close()
Example #8
def annotate_targets(read_alns, target_seqs):
    with timeit('Removing reads with 0 alignments'):
        zero_aln_reads = filter(lambda r: len(read_alns[r]) == 0, read_alns)
        for read in zero_aln_reads:
            read_alns.pop(read)
        print 'Removed %d reads' % len(zero_aln_reads)

    # Annotate the best alignment for all the reads.
    with timeit('Adding alignments to target sequences'):
        total_reads = len(read_alns)
        step = max(1, total_reads / 100)
        print 'Number of reads to annotate:', total_reads
        progressbar(0, start=True)
        for i, (read, alns) in enumerate(read_alns.iteritems()):
            if i % step == 0:
                progressbar(i / step)
            alns.sort(reverse=True)
            max_score = alns[0].bitscore
            for aln in alns[:max(1, int(0.1 * len(alns)))]:
                if aln.bitscore == max_score:
                    aln.is_best = True
                else:
                    aln.is_best = False
        progressbar(100, end=True)

    with timeit('Functional transcript annotation'):
        for read, alns in read_alns.iteritems():
            for aln in alns:
                target = target_seqs[aln.target_id]
                if aln.is_best:
                    target.num_of_ba_reads += 1
                target.num_of_aln_reads += 1
                target.add_alignment(aln)

    with timeit('Removing 0 alignments target seqs'):
        zero_aln_targets = filter(
            lambda t: len(target_seqs[t].get_alignments()) == 0,
            target_seqs.iterkeys())
        for target in zero_aln_targets:
            target_seqs.pop(target)
        print 'Removed %d targets.' % len(zero_aln_targets)

    for target in target_seqs.itervalues():
        target.join_regions()
        target.join_ba_regions()

    return read_alns, target_seqs
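
The best-alignment flagging above relies on the alignment objects sorting by bitscore when alns.sort(reverse=True) is called. A toy illustration of the same idea on plain tuples, using a stand-in Aln type rather than the project's alignment class, could be:

from collections import namedtuple

Aln = namedtuple('Aln', ['target', 'bitscore'])
alns = [Aln('t1', 50.0), Aln('t2', 90.0), Aln('t3', 90.0)]
# sort by descending bitscore and collect everything tied with the top score
alns.sort(key=lambda a: a.bitscore, reverse=True)
max_score = alns[0].bitscore
best = [a for a in alns if a.bitscore == max_score]
print(best)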
Example #9
def generate_consensus():
    parser = get_consensus_parser()
    args = parser.parse_args()

    if args.seq_file is not None:
        generate_seq, seq = True, args.seq_file
    else:
        generate_seq, seq = False, None

    with utils.timeit('Loading tax tree'):
        tt = tax.TaxTree()

    with utils.timeit(
            'Parsing aln file %s <%s>' %
        (args.aln_file, utils.get_appropriate_file_size(args.aln_file))):
        read_alns, target_seqs = din.parse(args.aln_file,
                                           args.db_type,
                                           detailed=True)

    with utils.timeit('Retrieving tax data'):
        utils.retrieve_tax_ids(target_seqs, args.db_type)

    with utils.timeit('Creating consensuses'):
        with open(args.output_vcf, 'w') as fout:
            filters = consensus_tools.get_filters()
            infos = consensus_tools.get_infos()
            fout.write(vcf_header(filters, infos, args.ref_db))
            fout.write('\n')
            fout.write('#CHROM\tPOS\tID\tREF\tALT\tFILTER\tINFO\n')
            for ts in target_seqs.itervalues():
                snps = 0
                for loc, cov, e, fres, infos in consensus_tools.itervariants(
                        ts.alignment, args.output_vcf, generate_seq, seq):
                    snps += 1
                    filter_output = 'PASS' if fres == [] else ';'.join(fres)
                    info_output = ';'.join('%s=%s' % (k, v)
                                           for k, v in infos.iteritems())
                    fout.write('%s\t%d\t.\t%s\t%s\t%s\t%s\n' %
                               (ts.get_id(), loc, e.target, e.query,
                                filter_output, info_output))
                print '-' * 80
                print 'Organism:        ', tt.get_org_name(ts.tax_id)
                print 'Sequence length: ', ts.alignment.length
                print 'Total coverage:  ', ts.alignment.get_coverage()
                print 'Fold:            ', ts.alignment.get_fold()
                print 'Confirmed SNPs:  ', snps
                print
Example #10
def annotate_targets(read_alns, target_seqs):
	with timeit('Removing reads with 0 alignments'):
		zero_aln_reads = filter(lambda r: len(read_alns[r])==0, read_alns)
		for read in zero_aln_reads:
			read_alns.pop(read)
		print 'Removed %d reads' % len(zero_aln_reads)

	# Annotate the best alignment for all the reads.
	with timeit('Adding alignments to target sequences'):
		total_reads = len(read_alns)
		step = max(1, total_reads/100)
		print 'Number of reads to annotate:', total_reads
		progressbar(0, start=True)
		for i, (read, alns) in enumerate(read_alns.iteritems()):
			if i % step == 0:
				progressbar(i/step)
			alns.sort(reverse=True)
			max_score = alns[0].bitscore
			for aln in alns[:max(1, int(0.1*len(alns)))]:
				if aln.bitscore == max_score:
					aln.is_best = True
				else:
					aln.is_best = False
		progressbar(100, end=True)

	with timeit('Functional transcript annotation'):
		for read, alns in read_alns.iteritems():
			for aln in alns:
				target = target_seqs[aln.target_id]
				if aln.is_best:
					target.num_of_ba_reads += 1
				target.num_of_aln_reads += 1
				target.add_alignment(aln)

	with timeit('Removing 0 alignments target seqs'):
		zero_aln_targets = filter(lambda t: len(target_seqs[t].get_alignments()) == 0, target_seqs.iterkeys())
		for target in zero_aln_targets:
			target_seqs.pop(target)
		print 'Removed %d targets.' % len(zero_aln_targets)

	for target in target_seqs.itervalues():
		target.join_regions()
		target.join_ba_regions()

	return read_alns, target_seqs
Example #11
	def __init__(self, seq_dir, freq_tool):
		super(SpeciesCorpus, self).__init__()
		self.seq_dir = seq_dir
		self.freq_tool = freq_tool
		with open(os.path.sep.join([seq_dir, 'name2tax.txt'])) as fin:
			self.name2tax = dict(map(lambda d: (d[1], int(d[0])), [l.strip().split('|') for l in fin]))
		self.index2tax = {}
		with timeit('Initializing dictionary'):
			self.dictionary = corpora.Dictionary((data for data in iter_over_species_files(self.seq_dir)), freq_tool=freq_tool)
Example #12
def lsi_corpus():
	parser = get_freq_analysis_parser()
	namespace = parser.parse_args()
	seq_dir = namespace.seq_dir
	freq_analysis_tool = namespace.freq_analysis_tool

	with timeit('Calculating species corpus'):
		sc = SpeciesCorpus(seq_dir, namespace.freq_analysis_tool)
		corpora.MmCorpus.serialize('/home/abulovic/tmp/bact_corpus.mm', sc)
		sc.dictionary.save('/home/abulovic/tmp/bact_triplets.dict')
Example #13
def parse():
    parser = get_parse_option_parser()
    namespace = parser.parse_args()
    if namespace.type != 'cds':
        raise NotImplementedError('Genome parsing will be up in a jiffy.')

    file_type = get_file_type(namespace.input_file)
    with timeit('Parsing alignment file'):
        if file_type == 'blast':
            read_alns, transcripts = parse_cds_megablast(
                namespace.input_file, namespace.output_dir)
        elif file_type == 'sam':
            read_alns, transcripts = parse_cds_sam(namespace.input_file,
                                                   namespace.output_dir,
                                                   binary=False)
        elif file_type == 'bam':
            read_alns, transcripts = parse_cds_sam(namespace.input_file,
                                                   namespace.output_dir,
                                                   binary=True)
        else:
            raise ValueError('%s alignment format not supported!' % file_type)

    with timeit('Loading tax tree'):
        tax_tree = TaxTree()
    spec2trans = get_species_transcript_distribution(transcripts, tax_tree)
    for spec, trans in spec2trans.iteritems():
        if len(filter(lambda t: t.total_coverage > 0.9, trans)) > 0:
            if spec <= 0:
                continue
            int1 = filter(lambda t: t.total_coverage > 0.9, trans)
            print tax_tree.nodes[spec].organism_name,
            print len(trans)
            print ' '.join(
                map(
                    lambda t: "(%.3f, %.3f)" %
                    (t.total_coverage, t.coverage_fold), int1))
            print
    get_read_aln_distribution(
        read_alns, os.path.sep.join([namespace.output_dir, 'plot.png']))
    get_spec_transcript_distribution(
        spec2trans, os.path.sep.join([namespace.output_dir, 'plot2.png']))
Example #14
def generate_consensus():
	parser = get_consensus_parser()
	args = parser.parse_args()

	if args.seq_file is not None:
		generate_seq, seq = True, args.seq_file
	else:
		generate_seq, seq = False, None

	with utils.timeit('Loading tax tree'):
		tt = tax.TaxTree()

	with utils.timeit('Parsing aln file %s <%s>' % (args.aln_file, utils.get_appropriate_file_size(args.aln_file))):
		read_alns, target_seqs = din.parse(args.aln_file, args.db_type, detailed=True)

	with utils.timeit('Retrieving tax data'):
		utils.retrieve_tax_ids(target_seqs, args.db_type)

	with utils.timeit('Creating consensuses'):
		with open(args.output_vcf, 'w') as fout:
			filters = consensus_tools.get_filters()
			infos = consensus_tools.get_infos()
			fout.write(vcf_header(filters, infos, args.ref_db))
			fout.write('\n')
			fout.write('#CHROM\tPOS\tID\tREF\tALT\tFILTER\tINFO\n')
			for ts in target_seqs.itervalues():
				snps = 0
				for loc, cov, e, fres, infos in consensus_tools.itervariants(ts.alignment, args.output_vcf, generate_seq, seq):
					snps += 1
					filter_output = 'PASS' if fres == [] else ';'.join(fres)
					info_output = ';'.join('%s=%s' % (k, v) for k, v in infos.iteritems())
					fout.write('%s\t%d\t.\t%s\t%s\t%s\t%s\n' % (ts.get_id(), loc, e.target, e.query, filter_output, info_output))
				print '-'*80
				print 'Organism:        ', tt.get_org_name(ts.tax_id)
				print 'Sequence length: ', ts.alignment.length
				print 'Total coverage:  ', ts.alignment.get_coverage()
				print 'Fold:            ', ts.alignment.get_fold()
				print 'Confirmed SNPs:  ', snps
				print
Example #15
def _filter_bad_species(cursor, read2tax, bad_species, tax_tree):
	questionable_reads = []
	for read, tax in read2tax.iteritems():
		if tax in bad_species:
			questionable_reads.append(read)

	with timeit('EXTRACTING BAD SPECIES READ INFO'):
		cursor.execute('SELECT * FROM alignment WHERE read_id in (%s)' % (','.join(['?']*len(questionable_reads))), questionable_reads)
		data = cursor.fetchall()

	read2aln = defaultdict(list)
	for d in data:
		read2aln[d[0]].append(d[1:])

	new_read2tax = {}
	with timeit('ASSIGNING NEW OTU TO BAD SPECIES READS'):
		reads = set(map(lambda d: d[0], data))
		for read in reads:
			alns = read2aln[read]
			# alns = map(lambda d: d[1:], filter(lambda d: d[0] == read, data))
			new_read2tax[read] = assign_OTU_from_scores(alns, tax_tree, bad_species)

	return new_read2tax
Example #16
def parse_tab_delimited(input_file, db_type, at_once=1e5, detailed=False,
						read_alns=defaultdict(list), target_seqs={},
						filter_low_scoring=True, annotate=True, entry_cnt=None):

	if entry_cnt is None:
		with timeit('Line count'):
			with open(input_file) as f:
				line_count = sum(1 for line in f)
	else:
		line_count = entry_cnt
	rolls = int(line_count / at_once + 1)

	with timeit('Pure file parsing'):
		with open(input_file, 'r') as fin:
			for i in xrange(rolls):
				with oneline_timeit('Step %d/%d' % (i, rolls)):
					new_reads = _parse_n_lines(fin, at_once, read_alns, target_seqs, db_type)
					if filter_low_scoring:
						_filter_low_scoring(read_alns, 100., 0.01, new_reads)

	if not annotate:
		return read_alns, target_seqs
	else:
		return annotate_targets(read_alns, target_seqs)
Example #17
File: sam.py  Project: abulovic/metagenomix
def annotate_targets(read_alns, target_seqs):

	with timeit('Semantic target sequence annotation'):
		for read, alns in read_alns.iteritems():
			for aln in alns:
				target = target_seqs[aln.target_id]
				if aln.is_best:
					target.num_of_ba_reads += 1
				target.num_of_aln_reads += 1
				target.add_alignment(aln)

		for target in target_seqs.itervalues():
			target.join_regions()
			target.join_ba_regions()

	return read_alns, target_seqs
Example #18
File: sam.py  Project: abulovic/metagenomix
def annotate_targets(read_alns, target_seqs):

    with timeit('Semantic target sequence annotation'):
        for read, alns in read_alns.iteritems():
            for aln in alns:
                target = target_seqs[aln.target_id]
                if aln.is_best:
                    target.num_of_ba_reads += 1
                target.num_of_aln_reads += 1
                target.add_alignment(aln)

        for target in target_seqs.itervalues():
            target.join_regions()
            target.join_ba_regions()

    return read_alns, target_seqs
Example #19
def fill_read_table(db, cursor, fasta1, fasta2, format, pair_end, store_seq):
	if pair_end:
		with nested(open(fasta1), open(fasta2)) as (fin1, fin2):
			records1 = SeqIO.parse(fin1, format)
			records2 = SeqIO.parse(fin2, format)
			for rec1, rec2 in izip(records1, records2):
				if store_seq:
					cursor.execute('INSERT INTO %s(read_id, sequence1, sequence2) VALUES (?,?,?)' % _READ_TABLE_NAME_,
									(rec1.id, str(rec1.seq), str(rec2.seq)))
				else:
					cursor.execute('INSERT INTO %s(read_id) VALUES (?)' % _READ_TABLE_NAME_, (rec1.id,))
	else:
		with timeit('STORING SEQUENCES'):
			for rec in iter_input_records(fasta1, format):
				if store_seq:
					cursor.execute('INSERT INTO %s(read_id, sequence) VALUES (?,?)' % _READ_TABLE_NAME_,
								   (rec.id, str(rec.seq)))
				else:
					cursor.execute('INSERT INTO %s(read_id) VALUES (?)' % _READ_TABLE_NAME_, (rec.id,))
			db.commit()
	db.commit()
	db.close()
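
This snippet leans on a few Python 2-era helpers that the excerpt does not import. Under that assumption, a plausible import block for it would be (the project's actual imports may differ):

from contextlib import nested   # Python 2 only; removed in Python 3
from itertools import izip      # Python 2 only; plain zip() in Python 3
from Bio import SeqIO           # Biopython sequence parsing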
Example #20
def greedy():
	parser = get_OTU_assign_option_parser()
	args = parser.parse_args()

	if args.read_count != 0:
		total_read_count = args.read_count

	all_reads = utils.reads_from_fasta(args.original_fasta)

	with Report(args.output_dir, 'microbe') as report:

		file_type = utils.get_file_type(args.input_file)

		with report.timeit('Parsing %s <%s>' % (args.input_file, utils.get_appropriate_file_size(args.input_file))):
			if file_type == 'blast':
				count_entries = blast.get_entry_cnt_tab
				parse_func = blast.parse_tab_delimited
			elif file_type in ('sam', 'bam'):
				count_entries = sam.get_entry_cnt_sam
				parse_func = sam.parse_cds_sam
			elif file_type == 'xml':
				count_entries = blast.get_entry_cnt_xml
				parse_func = blast.parse_xml
			else:
				raise ValueError('%s alignment format not supported!' % file_type)

			with utils.timeit('Retrieving entry count'):
				entry_cnt = count_entries(args.input_file)
				report.mark('\tTotal entries: %d' % entry_cnt)
			with utils.timeit('File parsing'):
				read_alns, target_seqs = parse_func(args.input_file, args.db_type, entry_cnt=entry_cnt, detailed=True)
				report.mark('\tTarget sequences: %d' % len(target_seqs))

		with report.timeit('GI2TAX database querying'):
			utils.retrieve_tax_ids(target_seqs, args.db_type)

		with report.timeit('loading tax tree'):
			tt = TaxTree()
			report.mark('Loaded %d nodes.' % len(tt.nodes))

		#profiling.get_read_overlap(target_seqs, read_alns, tt, all_reads)
		#profiling.sequential_read_set_analysis(read_alns, target_seqs, tt)

		coverage_limit = 0.6
		fold_limit = 1.
		with report.timeit('greedy transcript assignment'):
			report.mark('Coverage threshold: %.2f' % coverage_limit)
			report.mark('Fold threshold: %.2f' % fold_limit)
			report.mark('#Transcripts (before read assignment)  : %d' % len(target_seqs))
			prefilt_transcripts = otu.greedy_transcript_assign(target_seqs, read_alns)
			report.mark('#Transcripts (after read assignment)   : %d' % len(prefilt_transcripts))
			#if args.db_type == 'cds':
			final_transcripts = otu.filter_by_coverage_fold(prefilt_transcripts, coverage_limit, fold_limit)
			#else:
			#	final_transcripts = prefilt_transcripts
			report.mark('#Transcripts (after cov-fold filtering): %d' % len(final_transcripts))
			total_reads = len(read_alns)

		with report.timeit('Species assignment stats'):
			s2t_nofilt = utils.get_species_transcript_distribution(target_seqs, tt)
			s2t_greedy = utils.get_species_transcript_distribution(final_transcripts, tt)
			report.mark('#Species (pre-read-assignment) : %d' % len(s2t_nofilt))
			report.mark('#Species (post-read-assignment): %d' % len(s2t_greedy))
			report.rank_distribution(s2t_nofilt, tt, 'nofilt')
			report.rank_distribution(s2t_greedy, tt, 'greedy')
			report.tax_tree(s2t_nofilt, tt, 'nofilt')
			report.tax_tree(s2t_greedy, tt, 'greedy')

		if args.db_type == 'cds':
			new_s2t = otu.remove_orthologue_strains(s2t_greedy)
		else:
			new_s2t = s2t_greedy

		if args.db_type == 'cds':
			with report.timeit('Transcript stats'):
				report.transcript_stats(s2t_nofilt, tt, 'nofilt')
				report.transcript_stats(new_s2t, tt, 'greedy', assigned=True)
				report.tax2reads(new_s2t, tt, 'greedy', 'json')

		if args.db_type == 'cds':
			with report.timeit('Outputting gene expression'):
				report.gene_expression(s2t_nofilt, tt, 'gene_expression_nofilt', assigned=False)
				report.gene_expression(new_s2t, tt, 'gene_expression', assigned=True)

		report.summary(read_alns, s2t_nofilt, new_s2t, args.original_fasta)