Example #1
    def parse_reference_output(self):
        """Reads and processes DIAMOND tabular output of the first DIAMOND
        search.

        Note: this function finds query sequences similar to reference
        proteins. Since a query sequence may have more than one area of
        similarity (for instance, in fusion proteins of two subunits or
        in multi-domain proteins), it will try to find as many such areas
        as possible.

        DIAMOND hits are filtered by two parameters: length of alignment
        and amino acid identity %.

        This function does not return anything. Instead, it populates the
        'reads' dictionary with AnnotatedRead objects.

        """
        tsvfile = os.path.join(
            self.options.get_project_dir(self.sample.sample_id),
            self.sample.sample_id + '_' + self.end + '_' + self.options.ref_output_name
        )
        current_sequence_read_id = ''
        hit_list = DiamondHitList(current_sequence_read_id)
        # TODO: cleanup identity_cutoff = self.config.get_identity_cutoff(self.collection)
        length_cutoff = self.config.get_length_cutoff(self.collection)
        print('Length cutoff:', length_cutoff)
        with open(tsvfile, 'r', newline='') as infile:
            tsvin = csv.reader(infile, delimiter='\t')
            for row in tsvin:
                hit = DiamondHit()
                (row[0], _) = parse_fastq_seqid(row[0])
                hit.create_hit(row)
                # filtering by length
                if hit.length < length_cutoff:
                    continue  # go to next hit

                if hit.query_id != current_sequence_read_id:
                    # when new query ID reached, process collected hits,
                    # then start over with new query identifier
                    # filtering: remove overlapping hits
                    hit_list.filter_list(self.config.get_overlap_cutoff(self.collection))
                    # if any hits left, assign function to hits and populate reads dictionary
                    hit_list.annotate_hits(self.ref_data)
                    hit_list.filter_list_by_identity(self.ref_data)
                    if hit_list.hits_number != 0:
                        read = AnnotatedRead(current_sequence_read_id)
                        read.hit_list = hit_list
                        self.reads[current_sequence_read_id] = read
                    # start over
                    current_sequence_read_id = hit.query_id
                    hit_list = DiamondHitList(current_sequence_read_id)
                hit_list.add_hit(hit)
            # when EOF reached, process collected hits
            hit_list.filter_list(self.config.get_overlap_cutoff(self.collection))
            hit_list.annotate_hits(self.ref_data)
            hit_list.filter_list_by_identity(self.ref_data)
            if hit_list.hits_number != 0:
                read = AnnotatedRead(current_sequence_read_id)
                read.hit_list = hit_list
                self.reads[current_sequence_read_id] = read
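
The loop above streams hits grouped by query ID: it flushes a finished group whenever the ID changes and once more at end of file. A minimal self-contained sketch of the same group-and-flush pattern over DIAMOND tabular output, with a plain length filter standing in for the DiamondHitList processing (the file path and cutoff are hypothetical):

import csv

def stream_hit_groups(tsv_path, length_cutoff=15):
    """Yield (query_id, rows) groups from a DIAMOND tabular file.

    Sketch only: assumes column 0 holds the query ID and column 3 the
    alignment length, as in BLAST tabular output.
    """
    current_id, group = None, []
    with open(tsv_path, 'r', newline='') as infile:
        for row in csv.reader(infile, delimiter='\t'):
            if int(row[3]) < length_cutoff:
                continue  # drop short alignments, like the length filter above
            if row[0] != current_id:
                if group:
                    yield current_id, group  # flush the finished group
                current_id, group = row[0], []
            group.append(row)
        if group:
            yield current_id, group  # EOF flush, mirroring the code above
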
Example #2
    def test_3_protein_taxonomy(self):
        self.project.import_reads_json(sample, ENDS)
        protein = 'D16-4706_contig_11213_7'
        print('D16-4706_contig_11213_7 taxonomy')
        print(self.project.samples[sample].reads[end][protein].taxonomy)
        
        parser = DiamondParser(config=self.project.config,
                               options=self.project.options,
                               taxonomy_data=self.project.taxonomy_data,
                               ref_data=self.project.ref_data,
                               sample=self.project.samples[sample],
                               end=end)
        parser.parse_reference_output()
        print(str(parser.reads[protein]))
        
#        parse_background_output(parser)
        hit_line = 'D16-4706_contig_11213_7|4|257	fig|408672.3.peg.2637	63.0	254	94	256	1	254	2	255	1.1e-97	362.1'
        hit = DiamondHit()
        hit.create_hit(tabular_output_fields=hit_line.split('\t'))
        hit_list = DiamondHitList('D16-4706_contig_11213_7|4|257')
        hit_list.add_hit(hit)
        hit_list.annotate_hits(self.project.ref_data)
        hit_list.filter_list_by_identity(self.project.ref_data)
        print('hit_list')
        print(hit_list)
        
        compare_protein_hits_lca(parser.reads[protein], 4, 257, hit_list,
                                 0.03, 1.0, 1.0, self.project.taxonomy_data,
                                 self.project.ref_data)
        print(parser.reads[protein].taxonomy)
        self.assertEqual(parser.reads[protein].taxonomy, '408672')
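
The test relies on the composite query ID convention used throughout these parsers: a read or protein ID with the aligned region appended as '|start|end' (here 'D16-4706_contig_11213_7|4|257'). Since the base ID may itself contain '|', only the last two tokens are coordinates. A small helper illustrating the convention:

def split_query_id(query_id):
    """Split a composite 'read_id|start|end' query ID, mirroring the
    '|'.join(tokens[:-2]) handling in these examples."""
    tokens = query_id.split('|')
    return '|'.join(tokens[:-2]), int(tokens[-2]), int(tokens[-1])

# split_query_id('D16-4706_contig_11213_7|4|257')
# -> ('D16-4706_contig_11213_7', 4, 257)
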
Example #3
    def parse_background_output(self):
        """Reads and processes DIAMOND tabular output of the classification
        DIAMOND search.

        Note: this function takes an existing list of hits and compares each
        of them with the results of a new similarity search (against the
        classification DB). For the comparison, it calls the
        compare_protein_hits_lca function.

        """
        tsvfile = os.path.join(self.assembly_dir,
                               'all_contigs_' + self.project.options.background_output_name)
        current_query_id = None
        hit_list = None
        length_cutoff = self.project.config.get_length_cutoff(
            self.project.options.get_collection())
        biscore_range_cutoff = self.project.config.get_biscore_range_cutoff(
            self.project.options.get_collection())
        print('Relative bit-score cutoff: ', biscore_range_cutoff,
              ', Length cutoff: ', length_cutoff)

        average_coverage = self.assembly.calculate_average_coverage()

        with open(tsvfile, 'r', newline='') as infile:
            tsvin = csv.reader(infile, delimiter='\t')
            function_id = ''
            contig_id = ''
            gene_id = ''
            coverage = ''
            for row in tsvin:
                if current_query_id is None:
                    current_query_id = row[0]
                    hit_list = DiamondHitList(current_query_id)

                hit = DiamondHit()
                hit.create_hit(row)
                # filtering by length
                if hit.length < length_cutoff:
                    continue  # skip this hit

                if hit.query_id != current_query_id:
                    hit_list.annotate_hits(self.project.ref_data)
                    hit_list.filter_list_by_identity(self.project.ref_data)
                    # compare list of hits from search in background DB with existing
                    # hit from search in reference DB
                    current_query_id_tokens = current_query_id.split('|')
                    function_id = current_query_id_tokens[0]
                    contig_id = '_'.join(current_query_id_tokens[1].split('_')[:-1])
                    gene_id = '|'.join(current_query_id_tokens[:-2])
                    coverage = self.assembly.contigs[function_id][contig_id].get_coverage()
                    try:
                        compare_protein_hits_lca(
                            self.assembly.contigs[function_id][contig_id].genes[gene_id],
                            int(current_query_id_tokens[-2]),  # hit_start
                            int(current_query_id_tokens[-1]),  # hit_end
                            hit_list,
                            biscore_range_cutoff,
                            coverage,
                            average_coverage,
                            self.project.taxonomy_data,
                            self.project.ref_data
                            )
                    except KeyError:
                        print(' '.join(['Gene not found:', gene_id, 'in', function_id, contig_id]))
                    current_query_id = hit.query_id
                    hit_list = DiamondHitList(current_query_id)
                hit_list.add_hit(hit)
            hit_list.annotate_hits(self.project.ref_data)
            hit_list.filter_list_by_identity(self.project.ref_data)
            current_query_id_tokens = current_query_id.split('|')
            function_id = current_query_id_tokens[0]
            contig_id = '_'.join(current_query_id_tokens[1].split('_')[:-1])
            gene_id = '|'.join(current_query_id_tokens[:-2])
            coverage = self.assembly.contigs[function_id][contig_id].get_coverage()
            try:
                compare_protein_hits_lca(
                    self.assembly.contigs[function_id][contig_id].genes[gene_id],
                    int(current_query_id_tokens[-2]),  # hit_start
                    int(current_query_id_tokens[-1]),  # hit_end
                    hit_list,
                    biscore_range_cutoff,
                    coverage,
                    average_coverage,
                    self.project.taxonomy_data,
                    self.project.ref_data
                )
            except KeyError:
                print(' '.join(['Gene not found:', gene_id, 'in', function_id, contig_id]))
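
For assembled contigs, the composite query ID additionally encodes the function and contig: the token logic above takes the first token as the function ID, strips the trailing gene number from the second token to get the contig ID, and keeps everything but the coordinates as the gene ID. A sketch with a made-up ID (the 'function|contig_genenum|start|end' layout is inferred from the parsing above):

def split_gene_query_id(query_id):
    tokens = query_id.split('|')
    function_id = tokens[0]                          # e.g. 'RP'
    contig_id = '_'.join(tokens[1].split('_')[:-1])  # drop gene number
    gene_id = '|'.join(tokens[:-2])                  # everything but coords
    return function_id, contig_id, gene_id, int(tokens[-2]), int(tokens[-1])

# split_gene_query_id('RP|contig_0001_12|4|257')
# -> ('RP', 'contig_0001', 'RP|contig_0001_12', 4, 257)
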
Example #4
    def parse_background_output(self):
        """Reads and processes DIAMOND tabular output of the second DIAMOND
        search.

        Note: this function takes an existing list of hits and compares each
        of them with the results of another similarity search (against a
        larger DB). For the comparison, it calls the compare_hits_erpk_lca
        function, which in turn updates entries in the 'reads' dictionary.

        Raises:
            KeyError if read identifier not found in the 'reads' dictionary

        """
        if not self.reads:
            # Something went wrong and 'reads' dictionary is empty.
            # Let's try to import list of reads from file.
            self.reads = self.import_hit_list()

        tsvfile = os.path.join(
            self.sample.work_directory,
            self.sample.sample_id + '_' + self.end + '_' + self.options.background_output_name
        )

        average_read_length = self.sample.get_avg_read_length(self.end)

        current_query_id = None
        hit_list = None
        length_cutoff = self.config.get_length_cutoff(self.collection)
        bitscore_range_cutoff = self.config.get_biscore_range_cutoff(self.collection)
        print('Relative bit-score cutoff:', bitscore_range_cutoff,
              ', Length cutoff:', length_cutoff)

        with open(tsvfile, 'r', newline='') as infile:
            tsvin = csv.reader(infile, delimiter='\t')
            for row in tsvin:
                if current_query_id is None:
                    current_query_id = row[0]
                    hit_list = DiamondHitList(current_query_id)
                hit = DiamondHit()
                hit.create_hit(row)
                # filtering by length
                if hit.length < length_cutoff:
                    continue  # skip this line

                # when new query ID reached, process collected hits,
                # then start over with new query identifier
                if hit.query_id != current_query_id:
                    # assign functions to selected hits
                    hit_list.annotate_hits(self.ref_data)
                    hit_list.filter_list_by_identity(self.ref_data)
                    # extract initial read identifier from identifier of the hit
                    current_query_id_tokens = current_query_id.split('|')
                    read_id = '|'.join(current_query_id_tokens[:-2])
                    # compare list of hits from search in background DB
                    # with existing hit from the first similarity search
                    try:
                        compare_hits_erpk_lca(
                            self.reads[read_id],
                            int(current_query_id_tokens[-2]),  # hit_start
                            int(current_query_id_tokens[-1]),  # hit_end
                            hit_list, bitscore_range_cutoff, length_cutoff,
                            average_read_length, self.taxonomy_data, self.ref_data
                            )
                    except KeyError:
                        print('Read not found: ', read_id)
                    # starting over
                    current_query_id = hit.query_id
                    hit_list = DiamondHitList(current_query_id)
                hit_list.add_hit(hit)
            # when EOF reached, process collected hits
            # assign functions to selected hits
            hit_list.annotate_hits(self.ref_data)
            hit_list.filter_list_by_identity(self.ref_data)
            # extract initial read identifier from identifier of the hit
            current_query_id_tokens = current_query_id.split('|')
            read_id = '|'.join(current_query_id_tokens[:-2])
            # compare list of hits from search in background DB with
            # existing hit from the first similarity search
            try:
                compare_hits_erpk_lca(
                    self.reads[read_id],
                    int(current_query_id_tokens[-2]),  # hit_start
                    int(current_query_id_tokens[-1]),  # hit_end
                    hit_list, bitscore_range_cutoff, length_cutoff,
                    average_read_length, self.taxonomy_data, self.ref_data
                    )
            except KeyError:
                print('Read not found: ', read_id)
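
This second pass assumes 'reads' was populated by parse_reference_output(), or restored via import_hit_list(), per the guard at the top of the method. A hedged usage sketch modeled on the test in Example #2; the project, sample_id, and end setup names are placeholders:

parser = DiamondParser(config=project.config,
                       options=project.options,
                       taxonomy_data=project.taxonomy_data,
                       ref_data=project.ref_data,
                       sample=project.samples[sample_id],
                       end=end)
parser.parse_reference_output()   # first pass: populate parser.reads
parser.parse_background_output()  # second pass: refine annotations via LCA
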
Example #5
def parse_background_output(parser):
    """Reads and processes DIAMOND tabular output of the second DIAMOND
    search.

    Args:
        parser (:obj:DiamondParser): parser object

    Note: this function takes an existing list of hits and compares each
    of them with the results of another similarity search (against a
    larger DB). For the comparison, it calls the compare_protein_hits_lca
    function, which in turn updates entries in the 'reads' dictionary.

    Raises:
        KeyError if read identifier not found in the 'reads' dictionary
    """
    tsvfile = os.path.join(
        parser.sample.work_directory, parser.sample.sample_id + '_' +
        parser.end + '_' + parser.options.background_output_name)

    coverage_data = load_coverage_data(parser)
    total_coverage = 0.0
    if coverage_data:
        for contig_id in coverage_data.keys():
            total_coverage += coverage_data[contig_id]
        average_coverage = total_coverage / len(coverage_data)
    else:
        average_coverage = 1.0

    current_query_id = None
    _hit_list = None
    length_cutoff = parser.config.get_length_cutoff(parser.collection)
    biscore_range_cutoff = parser.config.get_biscore_range_cutoff(
        parser.collection)
    print('Relative bitscore cutoff: ', biscore_range_cutoff,
          ', Length cutoff: ', length_cutoff)

    with open(tsvfile, 'r', newline='') as infile:
        tsvin = csv.reader(infile, delimiter='\t')
        for row in tsvin:
            if current_query_id is None:
                current_query_id = row[0]
                _hit_list = DiamondHitList(current_query_id)

            hit = DiamondHit()
            hit.create_hit(row)
            # filtering by length
            if hit.length < length_cutoff:
                continue

            if hit.query_id != current_query_id:
                _hit_list.annotate_hits(parser.ref_data)
                _hit_list.filter_list_by_identity(parser.ref_data)

                current_query_id_tokens = current_query_id.split('|')
                protein_id = '|'.join(current_query_id_tokens[:-2])
                hit_start = int(current_query_id_tokens[-2])
                hit_end = int(current_query_id_tokens[-1])
                # Coverage data can be used only if protein ID contains contig ID
                contig_id = '_'.join(
                    protein_id.split(' # ')[0].split('_')[:-1])[1:]
                coverage = 1.0
                if coverage_data is not None and contig_id in coverage_data:
                    coverage = coverage_data[contig_id]
                try:
                    compare_protein_hits_lca(parser.reads[protein_id],
                                             hit_start, hit_end, _hit_list,
                                             biscore_range_cutoff, coverage,
                                             average_coverage,
                                             parser.taxonomy_data,
                                             parser.ref_data)
                except KeyError:
                    print('Protein not found: ', protein_id)
                current_query_id = hit.query_id
                _hit_list = DiamondHitList(current_query_id)
            _hit_list.add_hit(hit)
        _hit_list.annotate_hits(parser.ref_data)
        _hit_list.filter_list_by_identity(parser.ref_data)
        current_query_id_tokens = current_query_id.split('|')
        protein_id = '|'.join(current_query_id_tokens[:-2])
        hit_start = int(current_query_id_tokens[-2])
        hit_end = int(current_query_id_tokens[-1])
        try:
            # Coverage data can be used only if protein ID contains contig ID
            contig_id = '_'.join(
                protein_id.split(' # ')[0].split('_')[:-1])[1:]
            coverage = 1.0
            if coverage_data is not None and contig_id in coverage_data:
                coverage = coverage_data[contig_id]
            compare_protein_hits_lca(parser.reads[protein_id], hit_start,
                                     hit_end, _hit_list, biscore_range_cutoff,
                                     coverage, average_coverage,
                                     parser.taxonomy_data, parser.ref_data)
        except KeyError:
            print('Protein not found: ', protein_id)
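
The contig ID recovery above depends on the protein ID embedding the contig name. The ' # ' split and the trailing '[1:]' slice suggest a Prodigal-style FASTA header ('>' plus '<contig>_<genenum> # start # end # strand ...'); that format is an assumption, and the ID below is made up:

def contig_id_from_protein_id(protein_id):
    # Drop the ' # ...' suffix, strip the trailing gene number,
    # then remove the leading '>' with the [1:] slice.
    return '_'.join(protein_id.split(' # ')[0].split('_')[:-1])[1:]

# contig_id_from_protein_id('>contig_0001_12 # 4 # 257 # 1 # ID=1_12')
# -> 'contig_0001'
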