Пример #1
0
    def run(self, aa_gene_files, evalue, per_identity, output_dir):
        """Apply reciprocal blast to all pairs of genomes in parallel.

        Parameters
        ----------
        aa_gene_files : list of str
            Amino acid fasta files to process via reciprocal blast.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        output_dir : str
            Directory to store blast results.
        """

        # concatenate all gene files and create a single diamond database
        self.logger.info('  Creating diamond database (be patient!).')
        gene_file = os.path.join(output_dir, 'all_genes.faa')
        concatenate_files(aa_gene_files, gene_file)
        diamond_db = os.path.join(output_dir, 'all_genes')

        diamond = Diamond(self.cpus)
        diamond.make_database(gene_file, diamond_db)

        # blast all genes against the database
        self.logger.info('')
        self.logger.info('  Identifying hits between all pairs of genomes (be patient!).')
        hits_daa_file = os.path.join(output_dir, 'all_hits')
        diamond.blastp(gene_file, diamond_db, evalue, per_identity, len(aa_gene_files) * 10, hits_daa_file)

        # create flat hits table
        self.logger.info('  Creating table with hits.')
        hits_table_file = os.path.join(output_dir, 'all_hits.tsv')
        diamond.view(hits_daa_file + '.daa', hits_table_file)
Пример #2
0
    def _run_reciprocal_diamond(self, query_gene_file, target_gene_file,
                                evalue, per_identity, per_aln_len, max_hits,
                                sensitive, high_mem, tmp_dir, output_dir):
        """Perform similarity search of query genes against target genes, and reciprocal hits.

        Parameters
        ----------
        query_gene_file : str
            File with all query proteins.
        target_gene_file : str
            File with all target proteins.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequences.
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """

        self.logger.info(
            'Creating DIAMOND database of query proteins (be patient!).')
        diamond = Diamond(self.cpus)
        query_diamond_db = os.path.join(output_dir, 'query_genes')
        diamond.make_database(query_gene_file, query_diamond_db)

        self.logger.info(
            'Creating DIAMOND database of target proteins (be patient!).')
        target_diamond_db = os.path.join(output_dir, 'target_genes')
        diamond.make_database(target_gene_file, target_diamond_db)

        # blast query genes against target proteins
        self.logger.info(
            'Performing similarity sequence between query and target proteins (be patient!).'
        )

        if tmp_dir:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', delete=False)
        tmp_query_hits_table.close()

        query_hits_daa_file = os.path.join(output_dir, 'query_hits')

        if high_mem:
            diamond.blastp(query_gene_file,
                           target_diamond_db,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_hits,
                           sensitive,
                           tmp_query_hits_table.name,
                           'standard',
                           tmp_dir,
                           chunk_size=1,
                           block_size=8)
        else:
            diamond.blastp(query_gene_file, target_diamond_db, evalue,
                           per_identity, per_aln_len, max_hits, sensitive,
                           tmp_query_hits_table.name, 'standard', tmp_dir)

        # get target genes hit by one or more query proteins
        self.logger.info(
            'Creating file with target proteins with similarity to query proteins.'
        )
        target_hit = set()
        for line in open(tmp_query_hits_table.name):
            line_split = line.split('\t')
            target_hit.add(line_split[1])

        target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
        fout = open(target_genes_hits, 'w')
        for seq_id, seq in seq_io.read_seq(target_gene_file):
            if seq_id in target_hit:
                fout.write('>' + seq_id + '\n')
                fout.write(seq + '\n')
        fout.close()

        self.logger.info(
            'Identified %d target proteins to be used in reciprocal search.' %
            len(target_hit))

        # perform reciprocal blast
        self.logger.info(
            'Performing reciprocal similarity sequence between target and query proteins (be patient!).'
        )

        if tmp_dir:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', delete=False)
        tmp_target_hits_table.close()

        if high_mem:
            diamond.blastp(target_genes_hits,
                           query_diamond_db,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_hits,
                           sensitive,
                           tmp_target_hits_table.name,
                           'standard',
                           tmp_dir,
                           chunk_size=1,
                           block_size=8)
        else:
            diamond.blastp(target_genes_hits, query_diamond_db, evalue,
                           per_identity, per_aln_len, max_hits, sensitive,
                           tmp_target_hits_table.name, 'standard', tmp_dir)

        # combine hit tables and sort
        os.system('cat %s >> %s' %
                  (tmp_target_hits_table.name, tmp_query_hits_table.name))
        os.remove(tmp_target_hits_table.name)
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_query_hits_table.name, hits_table_file)
Пример #3
0
    def _run_self_diamond(self, query_gene_file, evalue, per_identity,
                          per_aln_len, max_hits, sensitive, high_mem, tmp_dir,
                          output_dir):
        """Perform similarity search of query genes against themselves.

        Parameters
        ----------
        query_gene_file : str
            File with all query sequences.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequences.
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """

        self.logger.info('Creating DIAMOND database (be patient!).')

        diamond_db = os.path.join(output_dir, 'query_genes')
        diamond = Diamond(self.cpus)
        diamond.make_database(query_gene_file, diamond_db)

        # create flat hits table
        if tmp_dir:
            tmp_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', delete=False)
        tmp_hits_table.close()

        # blast all genes against the database
        self.logger.info(
            'Performing self similarity sequence between genomes (be patient!).'
        )

        if high_mem:
            diamond.blastp(query_gene_file,
                           diamond_db,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_hits,
                           sensitive,
                           tmp_hits_table.name,
                           'standard',
                           tmp_dir,
                           chunk_size=1,
                           block_size=8)
        else:
            diamond.blastp(query_gene_file, diamond_db, evalue, per_identity,
                           per_aln_len, max_hits, sensitive,
                           tmp_hits_table.name, 'standard', tmp_dir)

        # sort hit table
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_hits_table.name, hits_table_file)
Пример #4
0
    def _run_reciprocal_diamond(self, query_gene_file,
                                        target_gene_file,
                                        evalue, 
                                        per_identity, 
                                        per_aln_len,
                                        max_hits,
                                        sensitive,
                                        high_mem,
                                        tmp_dir,
                                        output_dir):
        """Perform similarity search of query genes against target genes, and reciprocal hits.

        Parameters
        ----------
        query_gene_file : str
            File with all query proteins.
        target_gene_file : str
            File with all target proteins.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequences.
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """
        
        self.logger.info('Creating DIAMOND database of query proteins (be patient!).')
        diamond = Diamond(self.cpus)
        query_diamond_db = os.path.join(output_dir, 'query_genes')
        diamond.make_database(query_gene_file, query_diamond_db)
        
        self.logger.info('Creating DIAMOND database of target proteins (be patient!).')
        target_diamond_db = os.path.join(output_dir, 'target_genes')
        diamond.make_database(target_gene_file, target_diamond_db)

        # blast query genes against target proteins
        self.logger.info('Performing similarity sequence between query and target proteins (be patient!).')
        
        if tmp_dir:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
        tmp_query_hits_table.close()
        
        query_hits_daa_file = os.path.join(output_dir, 'query_hits')
        
        if high_mem:
            diamond.blastp(query_gene_file, 
                            target_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_query_hits_table.name, 
                            'standard', 
                            tmp_dir, 
                            chunk_size=1, 
                            block_size=8)
        else:
            diamond.blastp(query_gene_file, 
                            target_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_query_hits_table.name, 
                            'standard', 
                            tmp_dir)
                
        # get target genes hit by one or more query proteins
        self.logger.info('Creating file with target proteins with similarity to query proteins.')
        target_hit = set()
        for line in open(tmp_query_hits_table.name):
            line_split = line.split('\t')
            target_hit.add(line_split[1])

        target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
        fout = open(target_genes_hits, 'w')
        for seq_id, seq in seq_io.read_seq(target_gene_file):
            if seq_id in target_hit:
                fout.write('>' + seq_id + '\n')
                fout.write(seq + '\n')
        fout.close()
        
        self.logger.info('Identified %d target proteins to be used in reciprocal search.' % len(target_hit))
        
        # perform reciprocal blast
        self.logger.info('Performing reciprocal similarity sequence between target and query proteins (be patient!).')

        if tmp_dir:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
        tmp_target_hits_table.close()
        
        if high_mem:
            diamond.blastp(target_genes_hits, 
                            query_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_target_hits_table.name, 
                            'standard', 
                            tmp_dir, 
                            chunk_size=1, 
                            block_size=8)
        else:
            diamond.blastp(target_genes_hits, 
                            query_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_target_hits_table.name, 
                            'standard', 
                            tmp_dir)
                
        # combine hit tables and sort
        os.system('cat %s >> %s' % (tmp_target_hits_table.name, tmp_query_hits_table.name))
        os.remove(tmp_target_hits_table.name)
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_query_hits_table.name, hits_table_file)
Пример #5
0
    def _run_self_diamond(self, query_gene_file, 
                                evalue, 
                                per_identity, 
                                per_aln_len,
                                max_hits,
                                sensitive,
                                high_mem,
                                tmp_dir,
                                output_dir):
        """Perform similarity search of query genes against themselves.

        Parameters
        ----------
        query_gene_file : str
            File with all query sequences.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequences.
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """
        
        self.logger.info('Creating DIAMOND database (be patient!).')
        
        diamond_db = os.path.join(output_dir, 'query_genes')
        diamond = Diamond(self.cpus)
        diamond.make_database(query_gene_file, diamond_db)
            
        # create flat hits table
        if tmp_dir:
            tmp_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
        tmp_hits_table.close()

        # blast all genes against the database
        self.logger.info('Performing self similarity sequence between genomes (be patient!).')

        if high_mem:
            diamond.blastp(query_gene_file, 
                            diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_hits_table.name, 
                            'standard', 
                            tmp_dir, 
                            chunk_size=1, 
                            block_size=8)
        else:
            diamond.blastp(query_gene_file, 
                            diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_hits_table.name, 
                            'standard', 
                            tmp_dir)

        # sort hit table
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_hits_table.name, hits_table_file)
Пример #6
0
    def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity, per_aln_len):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        scaffold_gene_file : str
            Fasta file of genes on scaffolds in amino acid space.
        stat_file : str
            File with statistics for individual scaffolds.
        ref_genome_gene_files : list of str
            Fasta files of called genes on reference genomes of interest.
        db_file : str
            Database of competing reference genes.
        evalue : float
            E-value threshold of valid hits.
        per_identity : float
            Percent identity threshold of valid hits [0,100].
        per_aln_len : float
            Percent query coverage of valid hits [0, 100].
        """

        # read statistics file
        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('Creating diamond database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.make_database(ref_gene_file, ref_diamond_db)

        self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, hits_ref_genomes)

        self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

        # get number of genes on each scaffold
        num_genes_on_scaffold = defaultdict(int)
        for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
            scaffold_id = seq_id[0:seq_id.rfind('_')]
            num_genes_on_scaffold[scaffold_id] += 1

        # get hits to each scaffold
        hits_to_scaffold = defaultdict(list)
        for query_id, hit in hits_to_ref.iteritems():
            gene_id = query_id[0:query_id.rfind('~')]
            scaffold_id = gene_id[0:gene_id.rfind('_')]
            hits_to_scaffold[scaffold_id].append(hit)

        # report summary stats for each scaffold
        reference_out = os.path.join(self.output_dir, 'references.tsv')
        fout = open(reference_out, 'w')
        fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
        fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.iteritems():
            aln_len = []
            perc_iden = []
            evalue = []
            bitscore = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_len.append(hit.aln_length)
                perc_iden.append(hit.perc_identity)
                evalue.append(hit.evalue)
                bitscore.append(hit.bitscore)

                subject_bin_id, subject_gene_id  = hit.subject_id.split('~')
                subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1
               

            sorted_subject_bin_ids = sorted(subject_bin_ids.items(), 
                                                key=operator.itemgetter(1),
                                                reverse=True)
            subject_bin_id_str = []
            for bin_id, num_hits in sorted_subject_bin_ids:
                subject_bin_id_str.append(bin_id + ':' + str(num_hits))
            subject_bin_id_str = ','.join(subject_bin_id_str)

            sorted_subject_scaffold_ids = sorted(subject_scaffold_ids.items(), 
                                                    key=operator.itemgetter(1),
                                                    reverse=True)
            subject_scaffold_id_str = []
            for subject_id, num_hits in sorted_subject_scaffold_ids:
                subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
            subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                                                                        scaffold_id,
                                                                        subject_bin_id_str,
                                                                        subject_scaffold_id_str,
                                                                        scaffold_stats.print_stats(scaffold_id),
                                                                        mean(scaffold_stats.coverage(scaffold_id)),
                                                                        num_genes_on_scaffold[scaffold_id],
                                                                        len(hits),
                                                                        len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                                                                        mean(aln_len),
                                                                        mean(perc_iden),
                                                                        mean(evalue),
                                                                        mean(bitscore)))

        fout.close()

        return reference_out
Пример #7
0
    def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity,):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        scaffold_gene_file : str
            Fasta file of genes on scaffolds in amino acid space.
        stat_file : str
            File with statistics for individual scaffolds.
        ref_genome_gene_files : list of str
            Fasta files of called genes on reference genomes of interest.
        db_file : str
            Database of competing reference genes.
        evalue : float
            E-value threshold used by blast.
        per_identity: float
            Percent identity threshold used by blast.
        """

        # read statistics file
        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('')
        self.logger.info('  Creating diamond database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.make_database(ref_gene_file, ref_diamond_db)

        self.logger.info('  Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes_daa = os.path.join(self.diamond_dir, 'ref_hits')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, 1, hits_ref_genomes_daa)

        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.view(hits_ref_genomes_daa + '.daa', hits_ref_genomes)

        self.logger.info('  Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes_daa = os.path.join(self.diamond_dir, 'competing_ref_hits')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, 1, hits_comp_ref_genomes_daa)

        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.view(hits_comp_ref_genomes_daa + '.daa', hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

        # get number of genes on each scaffold
        num_genes_on_scaffold = defaultdict(int)
        for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
            scaffold_id = seq_id[0:seq_id.rfind('_')]
            num_genes_on_scaffold[scaffold_id] += 1

        # get hits to each scaffold
        hits_to_scaffold = defaultdict(list)
        for query_id, hit in hits_to_ref.iteritems():
            gene_id = query_id[0:query_id.rfind('~')]
            scaffold_id = gene_id[0:gene_id.rfind('_')]
            hits_to_scaffold[scaffold_id].append(hit)

        # report summary stats for each scaffold
        reference_out = os.path.join(self.output_dir, 'references.tsv')
        fout = open(reference_out, 'w')
        fout.write('Scaffold id\tSubject scaffold ids\tSubject genome ids')
        fout.write('\tGenome id\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.iteritems():
            aln_len = []
            perc_iden = []
            evalue = []
            bitscore = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_len.append(hit.aln_length)
                perc_iden.append(hit.perc_identity)
                evalue.append(hit.evalue)
                bitscore.append(hit.bitscore)

                subject_id, subject_bin_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_id[0:subject_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1

            subject_scaffold_id_str = []
            for subject_id, num_hits in subject_scaffold_ids.iteritems():
                subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
            subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

            subject_bin_id_str = []
            for bin_id, num_hits in subject_bin_ids.iteritems():
                subject_bin_id_str.append(bin_id + ':' + str(num_hits))
            subject_bin_id_str = ','.join(subject_bin_id_str)

            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                                                                        scaffold_id,
                                                                        subject_scaffold_id_str,
                                                                        subject_bin_id_str,
                                                                        scaffold_stats.print_stats(scaffold_id),
                                                                        mean(scaffold_stats.coverage(scaffold_id)),
                                                                        num_genes_on_scaffold[scaffold_id],
                                                                        len(hits),
                                                                        len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                                                                        mean(aln_len),
                                                                        mean(perc_iden),
                                                                        mean(evalue),
                                                                        mean(bitscore)))

        fout.close()

        return reference_out