예제 #1
0
    def predict(self):
        """Fill in ``self.h2_prediction`` from BLAST hits against ``FLJB_FASTA_PATH``.

        The prediction object returned by ``get_antigen_gene_blast_results``
        is stored back on ``self.h2_prediction`` and then updated in place:

        * no BLAST hits -> ``h2`` is set to ``'-'``;
        * perfect match -> ``h2`` is the antigen name of the top result;
        * imperfect match -> the hits are re-filtered and re-ranked; either a
          new top result is selected or the prediction is flagged as missing
          with ``h2 = '-'``.
        """
        prediction = self.get_antigen_gene_blast_results(
            self.h2_prediction, FLJB_FASTA_PATH)
        self.h2_prediction = prediction

        # Nothing was found by BLAST: report '-' and stop.
        if prediction.is_missing:
            prediction.h2 = '-'
            return

        if not prediction.is_perfect_match:
            best_hit = prediction.top_result
            hit_length = best_hit['length']
            hit_identity = best_hit['pident']

            # short lower %ID matches, and short matches that are not
            # truncated, are treated as missing or '-' for H2
            is_short = hit_length <= 600
            if is_short and (hit_identity < 88.0 or not prediction.is_trunc):
                prediction.h2 = '-'
                prediction.is_missing = True
                return

            all_hits = pd.DataFrame(prediction.blast_results)
            acceptable = (all_hits['mismatch'] <= 50) & (all_hits['length'] >= 700)
            candidates = all_hits[acceptable]

            if len(candidates) == 0:
                prediction.is_missing = True
                prediction.top_result = None
                prediction.h2 = '-'
                return

            # Prefer long, near-identical hits ranked by fewest mismatches;
            # otherwise rank the remaining candidates by bitscore.
            near_identical = (candidates['mismatch'] <= 5) & (candidates['length'] >= 1000)
            long_candidates = candidates[near_identical]
            if len(long_candidates) > 0:
                ranked = long_candidates.sort_values(by='mismatch')
            else:
                ranked = candidates.sort_values(by='bitscore', ascending=False)

            chosen = BlastReader.df_first_row_to_dict(ranked)
            chosen_is_trunc = BlastReader.is_blast_result_trunc(
                qstart=chosen['qstart'],
                qend=chosen['qend'],
                sstart=chosen['sstart'],
                send=chosen['send'],
                qlen=chosen['qlen'],
                slen=chosen['slen'])
            prediction.top_result = chosen
            prediction.is_trunc = chosen_is_trunc

        prediction.h2 = get_antigen_name(prediction.top_result['qseqid'])
예제 #2
0
    def get_antigen_gene_blast_results(self, model_obj, antigen_gene_fasta):
        """BLAST an antigen gene FASTA and copy the outcome onto ``model_obj``.

        Args:
            model_obj: prediction object to update in place; ``is_missing`` is
                always set, and ``blast_results``, ``top_result``,
                ``is_perfect_match`` and ``is_trunc`` are set only when the
                reader reports hits.
            antigen_gene_fasta: FASTA path handed to
                ``self.blast_runner.blast_against_query``.

        Returns:
            The same ``model_obj`` that was passed in.
        """
        reader = BlastReader(
            self.blast_runner.blast_against_query(antigen_gene_fasta))

        nothing_found = reader.is_missing
        model_obj.is_missing = nothing_found
        if nothing_found:
            return model_obj

        model_obj.blast_results = reader.df_dict()
        model_obj.top_result = reader.top_result()
        model_obj.is_perfect_match = reader.is_perfect_match
        model_obj.is_trunc = reader.is_trunc
        return model_obj
예제 #3
0
    def predict(self, filter=['N/A']):
        """Fill in ``self.h1_prediction`` from BLAST hits against ``FLIC_FASTA_PATH``.

        Args:
            filter: forwarded unchanged as the third argument of
                ``get_antigen_gene_blast_results``.
                NOTE(review): the default is a mutable list shared between
                calls and the name shadows the ``filter`` builtin; both are
                kept because they are part of the existing signature.

        When there are no hits, or no top result, the prediction is left as
        returned. For an imperfect match the hits are re-filtered and
        re-ranked; if nothing survives, the prediction is flagged as missing
        and ``h1`` is set to ``None``. Otherwise ``h1`` is the antigen name of
        the selected top result.
        """
        prediction = self.get_antigen_gene_blast_results(
            self.h1_prediction, FLIC_FASTA_PATH, filter)
        self.h1_prediction = prediction

        if prediction.is_missing or prediction.top_result is None:
            return

        if not prediction.is_perfect_match:
            all_hits = pd.DataFrame(prediction.blast_results)
            primary = (all_hits['mismatch'] <= 25) & (all_hits['length'] >= 700)
            candidates = all_hits[primary]

            if len(candidates) == 0:
                # Fallback: accept shorter hits, but only without mismatches.
                fallback = (all_hits['mismatch'] <= 0) & (all_hits['length'] >= 400)
                candidates = all_hits[fallback]
                if len(candidates) == 0:
                    prediction.is_missing = True
                    prediction.top_result = None
                    prediction.h1 = None
                    return

            # Prefer long, near-identical hits ranked by fewest mismatches;
            # otherwise rank the remaining candidates by bitscore.
            near_identical = (candidates['mismatch'] <= 5) & (candidates['length'] >= 1000)
            long_candidates = candidates[near_identical]
            if len(long_candidates) > 0:
                ranked = long_candidates.sort_values(by='mismatch')
            else:
                ranked = candidates.sort_values(by='bitscore', ascending=False)

            chosen = BlastReader.df_first_row_to_dict(ranked)
            chosen_is_trunc = BlastReader.is_blast_result_trunc(
                qstart=chosen['qstart'],
                qend=chosen['qend'],
                sstart=chosen['sstart'],
                send=chosen['send'],
                qlen=chosen['qlen'],
                slen=chosen['slen'])
            prediction.top_result = chosen
            prediction.is_trunc = chosen_is_trunc

        prediction.h1 = get_antigen_name(prediction.top_result['qseqid'])
예제 #4
0
def test_BlastReader_is_blast_result_trunc():
    """Check ``BlastReader.is_blast_result_trunc`` on hand-built alignment coordinates."""
    not_truncated_cases = [
        # match found in the middle of the subject sequence
        dict(qstart=1, qend=100, sstart=101, send=200, qlen=100, slen=1000),
        # shorter match (-10bp) found in the middle of the subject sequence
        dict(qstart=1, qend=90, sstart=101, send=190, qlen=100, slen=1000),
        # shorter match (-20bp) found in the middle of the subject sequence
        dict(qstart=1, qend=80, sstart=101, send=180, qlen=100, slen=1000),
    ]
    for coords in not_truncated_cases:
        assert not BlastReader.is_blast_result_trunc(**coords)

    truncated_cases = [
        # truncated at the start of the subject
        dict(qstart=51, qend=100, sstart=1, send=50, qlen=100, slen=1000),
        # truncated at the end of the subject
        dict(qstart=51, qend=100, sstart=951, send=1000, qlen=100, slen=1000),
    ]
    for coords in truncated_cases:
        assert BlastReader.is_blast_result_trunc(**coords)
예제 #5
0
def run_cgmlst(blast_runner, full=False):
    """Perform in silico cgMLST on an input genome

    Args:
        blast_runner (sistr.src.blast_wrapper.BlastRunner): blastn runner object with genome fasta initialized
        full (bool): when True, BLAST against ``CGMLST_FULL_FASTA_PATH`` instead of
            ``CGMLST_CENTROID_FASTA_PATH``; the flag is also forwarded to
            ``get_allele_sequences``

    Returns:
        dict: cgMLST ref genome match, distance to closest ref genome, subspecies and serovar predictions
            (keys: ``distance``, ``genome_match``, ``serovar``, ``matching_alleles``,
            ``found_loci``, ``subspecies``, ``cgmlst330_ST``)
        dict: marker allele match results (seq, allele name, blastn results)
    """
    from sistr.src.serovar_prediction.constants import genomes_to_serovar

    df_cgmlst_profiles = ref_cgmlst_profiles()

    logging.debug('{} distinct cgMLST330 profiles'.format(
        df_cgmlst_profiles.shape[0]))

    logging.info('Running BLAST on serovar predictive cgMLST330 alleles')
    cgmlst_fasta_path = CGMLST_FULL_FASTA_PATH if full else CGMLST_CENTROID_FASTA_PATH
    blast_outfile = blast_runner.blast_against_query(cgmlst_fasta_path)
    logging.info('Reading BLAST output file "{}"'.format(blast_outfile))
    blast_reader = BlastReader(blast_outfile)
    if blast_reader.df is None:
        logging.error('No cgMLST330 alleles found!')
        return (
            {
                'distance': 1.0,
                'genome_match': None,
                'serovar': None,
                'matching_alleles': 0,
                # Same key set as the normal return below, so callers can
                # read 'found_loci' without special-casing the empty result.
                'found_loci': 0,
                'subspecies': None,
                'cgmlst330_ST': None,
            },
            {},
        )
    logging.info('Found {} cgMLST330 allele BLAST results'.format(
        blast_reader.df.shape[0]))

    df_cgmlst_blastn = process_cgmlst_results(blast_reader.df)

    marker_match_results = matches_to_marker_results(
        df_cgmlst_blastn[df_cgmlst_blastn.is_match])
    contig_blastn_records = alleles_to_retrieve(df_cgmlst_blastn)
    retrieved_marker_alleles = get_allele_sequences(blast_runner.fasta_path,
                                                    contig_blastn_records,
                                                    full=full)
    logging.info('Type retrieved_marker_alleles %s',
                 type(retrieved_marker_alleles))

    # Merge direct matches with retrieved alleles (retrieved ones win), then
    # pad every profile marker that is still absent with an empty record.
    all_marker_results = marker_match_results.copy()
    for marker, res in retrieved_marker_alleles.items():
        all_marker_results[marker] = res
    for marker in df_cgmlst_profiles.columns:
        if marker not in all_marker_results:
            all_marker_results[marker] = {
                'blast_result': None,
                'name': None,
                'seq': None,
            }

    # Keep only markers whose allele name converts to an integer.
    cgmlst_results = {}
    found_cgmlst_genes = 0
    for marker, res in all_marker_results.items():
        try:
            allele_number = int(res['name'])
        except (KeyError, TypeError, ValueError):
            # Missing or non-numeric allele name; the marker is skipped.
            logging.error('Missing cgmlst_results for %s', marker)
            logging.debug(res)
        else:
            cgmlst_results[marker] = allele_number
            found_cgmlst_genes += 1

    logging.info(
        'Calculating number of matching alleles to serovar predictive cgMLST330 profiles'
    )
    df_relatives = find_closest_related_genome(cgmlst_results,
                                               df_cgmlst_profiles)
    genome_serovar_dict = genomes_to_serovar()
    df_relatives['serovar'] = [
        genome_serovar_dict[genome] for genome in df_relatives.index
    ]
    logging.debug('Top 5 serovar predictive cgMLST profiles:\n{}'.format(
        df_relatives.head()))
    spp = None
    subspeciation_tuple = cgmlst_subspecies_call(df_relatives)
    if subspeciation_tuple is not None:
        spp, distance, spp_counter = subspeciation_tuple
        logging.info(
            'Top subspecies by cgMLST is "{}" (min dist={}, Counter={})'.
            format(spp, distance, spp_counter))
    else:
        logging.warning('Subspeciation by cgMLST was not possible!')

    # Defaults are used when df_relatives has no rows.
    cgmlst_serovar = None
    cgmlst_matching_genome = None
    cgmlst_matching_alleles = 0
    cgmlst_distance = 1.0
    # Bound unconditionally: previously this was only assigned inside the
    # loop over df_relatives, so an empty frame raised NameError at return.
    cgmlst_found_loci = found_cgmlst_genes

    # Only the first row of df_relatives is used for the serovar call.
    top_relative = next(df_relatives.iterrows(), None)
    if top_relative is not None:
        idx, row = top_relative
        cgmlst_distance = row['distance']
        cgmlst_matching_alleles = row['matching']
        cgmlst_serovar = row['serovar'] if cgmlst_distance <= 1.0 else None
        cgmlst_matching_genome = idx if cgmlst_distance <= 1.0 else None
        logging.info(
            'Top serovar by cgMLST profile matching: "{}" with {} matching alleles, distance={:.1%}'
            .format(cgmlst_serovar, cgmlst_matching_alleles, cgmlst_distance))

    # The sequence type is only computed when every marker, in sorted order,
    # has a truthy allele name; stop at the first one that does not.
    cgmlst_st = None
    cgmlst_markers_sorted = sorted(all_marker_results.keys())
    cgmlst_allele_names = []
    marker = None
    for marker in cgmlst_markers_sorted:
        try:
            aname = all_marker_results[marker]['name']
        except (KeyError, TypeError):
            break
        if not aname:
            break
        cgmlst_allele_names.append(str(aname))

    if len(cgmlst_allele_names) == len(cgmlst_markers_sorted):
        cgmlst_st = allele_name('-'.join(cgmlst_allele_names))
        logging.info('cgMLST330 Sequence Type=%s', cgmlst_st)
    else:
        # `marker` is the marker at which the loop above stopped.
        logging.warning(
            'Could not compute cgMLST330 Sequence Type due to missing data (marker %s)',
            marker)
    return (
        {
            'distance': cgmlst_distance,
            'genome_match': cgmlst_matching_genome,
            'serovar': cgmlst_serovar,
            'matching_alleles': cgmlst_matching_alleles,
            'found_loci': cgmlst_found_loci,
            'subspecies': spp,
            'cgmlst330_ST': cgmlst_st,
        },
        all_marker_results,
    )
예제 #6
0
def test_BlastReader(blast_runner):
    """BLAST ``WZX_FASTA_PATH`` and check that the top hit names antigen O58."""
    reader = BlastReader(blast_runner.run_blast(WZX_FASTA_PATH))
    best_hit = reader.top_result()
    antigen = get_antigen_name(best_hit['qseqid'])
    assert antigen == 'O58'