Exemplo n.º 1
0
def remove_bad_chars_fasta(fasta):
    """Read a FASTA file and replace ';' and '=' in each sequence id with '_'.

    :param fasta: input passed straight to ``read_sequence(..., format='fasta')``
    :return: list of the sequence objects yielded by ``read_sequence``, each
        with ``metadata['id']`` rewritten
    :raises ValueError: if two sequences end up with the same id after the
        replacement
    """
    # A set gives O(1) membership tests and, unlike the previous never-filled
    # list, is actually populated so that collisions are detected.
    seen_headers = set()
    new_seqs = list()
    for seq in read_sequence(fasta, format='fasta'):
        new_header = seq.metadata['id'].replace(';', '_').replace('=', '_')
        if new_header in seen_headers:
            raise ValueError(
                'Replacement of ; or = with _ generated redundant sequence headers, must be modified '
                'manually')
        seen_headers.add(new_header)
        seq.metadata['id'] = new_header
        new_seqs.append(seq)
    return new_seqs
Exemplo n.º 2
0
def remove_bad_chars_fasta(fasta):
    """Read a FASTA file and replace ';' and '=' in each sequence id with '_'.

    :param fasta: input checked with ``is_affi_tab_not_fasta`` and then passed
        to ``read_sequence(..., format='fasta')``
    :return: list of the sequence objects yielded by ``read_sequence``, each
        with ``metadata['id']`` rewritten
    :raises ValueError: if ``is_affi_tab_not_fasta(fasta)`` is true, or if two
        sequences end up with the same id after the replacement
    """
    if is_affi_tab_not_fasta(fasta):
        raise ValueError(
            "The input file format matches virsorter affi contigs not fasta, please check "
            "that the flags match the file type: '-v' for virsorter, and '-i' for fasta."
        )

    # A set gives O(1) membership tests and, unlike the previous never-filled
    # list, is actually populated so that collisions are detected.
    seen_headers = set()
    new_seqs = list()
    for seq in read_sequence(fasta, format='fasta'):
        new_header = seq.metadata['id'].replace(';', '_').replace('=', '_')
        if new_header in seen_headers:
            raise ValueError(
                'Replacement of ; or = with _ generated redundant sequence headers, must be modified '
                'manually')
        seen_headers.add(new_header)
        seq.metadata['id'] = new_header
        new_seqs.append(seq)
    return new_seqs
Exemplo n.º 3
0
def test_add_intervals_to_gff(annotated_fake_gff_loc, tmpdir):
    """Check that add_intervals_to_gff adds an rRNA interval to a copied GFF.

    The fixture GFF is copied into a temporary directory, a one-row rRNA table
    is written next to it, and ``add_intervals_to_gff`` is run against the
    copy. The result must still exist, be readable as gff3, and match
    ``annotated_fake_gff_w_rna`` (a name defined outside this function).
    """
    add_intervals_test_loc = tmpdir.mkdir('fake_rrnas_loc')
    annotate_fake_gff_loc_w_rna = os.path.join(add_intervals_test_loc,
                                               'fake.gff')
    copy(annotated_fake_gff_loc, annotate_fake_gff_loc_w_rna)
    fake_rrnas_loc = os.path.join(add_intervals_test_loc, 'rrnas.tsv')
    # float('nan') replaces pd.np.NaN: the pandas.np alias was deprecated in
    # pandas 1.0 and removed in 2.0; the resulting cell value is the same.
    fake_rrnas = pd.DataFrame(
        [[
            'fake_NC_001422.1', 990, 1000, 101.0, '+',
            '16S ribosomal rRNA gene', float('nan')
        ]],
        columns=[
            'scaffold', 'begin', 'end', 'e-value', 'strand', 'type', 'note'
        ])
    fake_rrnas.to_csv(fake_rrnas_loc, sep='\t')
    add_intervals_to_gff(fake_rrnas_loc, annotate_fake_gff_loc_w_rna,
                         {'fake_NC_001422.1': 6000}, make_rrnas_interval,
                         'scaffold')
    assert os.path.isfile(annotate_fake_gff_loc_w_rna)
    gff = list(read_sequence(annotate_fake_gff_loc_w_rna, format='gff3'))
    assert type(gff) is list
    # Read through a context manager so the handle is closed even when the
    # assertion fails.
    with open(annotate_fake_gff_loc_w_rna) as gff_file:
        assert gff_file.read() == annotated_fake_gff_w_rna
Exemplo n.º 4
0
def annotate_vgfs(input_fasta,
                  virsorter_affi_contigs=None,
                  output_dir='.',
                  min_contig_size=2500,
                  prodigal_mode='meta',
                  trans_table='11',
                  bit_score_threshold=60,
                  rbh_bit_score_threshold=350,
                  custom_db_name=(),
                  custom_fasta_loc=(),
                  use_uniref=False,
                  low_mem_mode=False,
                  skip_trnascan=False,
                  keep_tmp_dir=True,
                  threads=10,
                  verbose=True):
    """Annotate the contigs of a FASTA file and write ``annotations.tsv``.

    Steps, in order: validate ``prodigal_mode``; load database locations;
    optionally load virsorter hits; write every contig of at least
    ``min_contig_size`` to its own FASTA under ``<output_dir>/vMAGs``; run
    ``annotate_fastas``; add virsorter category / auxiliary score columns
    (only when virsorter hits were given), transposon and AMG flag columns;
    write the table to ``<output_dir>/annotations.tsv``.

    :raises ValueError: if ``prodigal_mode`` is not one of train/meta/single,
        if a retained contig id contains '=' or ';', or if virsorter hits were
        supplied and a retained contig is not among them
    :return: None; results are written to disk and progress is printed
    """
    # set up
    start_time = datetime.now()
    print('%s: Viral annotation started' % str(datetime.now()))

    # check inputs
    prodigal_modes = ['train', 'meta', 'single']
    if prodigal_mode not in prodigal_modes:
        raise ValueError('Prodigal mode must be one of %s.' %
                         ', '.join(prodigal_modes))
    if prodigal_mode in ['normal', 'single']:
        warnings.warn(
            'When running prodigal in single mode your bins must have long contigs (average length >3 Kbp), '
            'be long enough (total length > 500 Kbp) and have very low contamination in order for prodigal '
            'training to work well.')

    # get database locations
    db_locs = get_database_locs()
    db_handler = DatabaseHandler(db_locs['description_db'])
    db_locs_anno = filter_db_locs(db_locs, low_mem_mode, use_uniref,
                                  VMAG_DBS_TO_ANNOTATE)

    virsorter_hits = (get_virsorter_hits(virsorter_affi_contigs)
                      if virsorter_affi_contigs is not None else None)

    # split sequences into separate fastas, one file per retained contig
    mkdir(output_dir)
    contig_dir = path.join(output_dir, 'vMAGs')
    mkdir(contig_dir)
    contig_locs = []
    for seq in read_sequence(input_fasta, format='fasta'):
        if len(seq) < min_contig_size:
            continue
        seq_id = seq.metadata['id']
        if '=' in seq_id or ';' in seq_id:
            raise ValueError(
                'FASTA headers must not have = or ; before the first space (%s). To run DRAM-v you '
                'must rerun VIRSorter with = and ; removed from the headers or run DRAM-v.py '
                'remove_bad_characters and then rerun DRAM-v' % seq_id)
        if virsorter_hits is not None:
            known_names = virsorter_hits['name'].values
            if get_virsorter_affi_contigs_name(seq_id) not in known_names:
                raise ValueError(
                    "No virsorter calls found in %s for scaffold %s from input fasta"
                    % (virsorter_affi_contigs, seq_id))
        contig_loc = path.join(contig_dir, '%s.fasta' % seq_id)
        # the sequence is handed over wrapped in a generator, as before
        write_sequence((record for record in [seq]),
                       format='fasta',
                       into=contig_loc)
        contig_locs.append(contig_loc)

    # annotate vMAGs
    rename_bins = False
    annotations = annotate_fastas(contig_locs, output_dir, db_locs_anno,
                                  db_handler, min_contig_size, prodigal_mode,
                                  trans_table, bit_score_threshold,
                                  rbh_bit_score_threshold, custom_db_name,
                                  custom_fasta_loc, skip_trnascan, rename_bins,
                                  keep_tmp_dir, start_time, threads, verbose)
    elapsed = datetime.now() - start_time
    print('%s: Annotations complete, processing annotations' % str(elapsed))

    # setting up scoring viral genes
    amg_database_frame = pd.read_csv(db_locs['amg_database'], sep='\t')
    genome_summary_form = pd.read_csv(db_locs['genome_summary_form'],
                                      sep='\t',
                                      index_col=0)
    potential_amg_rows = genome_summary_form.potential_amg
    genome_summary_form = genome_summary_form.loc[potential_amg_rows]

    # add auxiliary score
    if virsorter_hits is not None:
        gene_virsorter_category_dict = {}
        gene_auxiliary_score_dict = {}
        for scaffold, dram_frame in annotations.groupby('scaffold'):
            virsorter_scaffold_name = get_virsorter_affi_contigs_name(scaffold)
            on_scaffold = virsorter_hits.name == virsorter_scaffold_name
            virsorter_frame = virsorter_hits.loc[on_scaffold]
            gene_order = get_gene_order(dram_frame, virsorter_frame)
            for dram_gene, _, virsorter_category in gene_order:
                if dram_gene is not None:
                    gene_virsorter_category_dict[dram_gene] = virsorter_category
            gene_auxiliary_score_dict.update(
                calculate_auxiliary_scores(gene_order))
        annotations['virsorter_category'] = pd.Series(
            gene_virsorter_category_dict)
        annotations['auxiliary_score'] = pd.Series(gene_auxiliary_score_dict)

    # get metabolic flags
    scaffold_length_dict = {}
    for seq in read_sequence(input_fasta, format='fasta'):
        scaffold_length_dict[seq.metadata['id']] = len(seq)
    metabolic_genes = set(genome_summary_form.index)
    if 'pfam_hits' in annotations:
        annotations['is_transposon'] = list(
            map(is_transposon, annotations['pfam_hits']))
    else:
        annotations['is_transposon'] = False

    amgs = get_amg_ids(amg_database_frame)
    verified_rows = amg_database_frame.loc[amg_database_frame.verified]
    verified_amgs = get_amg_ids(verified_rows)
    metabolic_flags = get_metabolic_flags(annotations, metabolic_genes, amgs,
                                          verified_amgs, scaffold_length_dict)
    annotations['amg_flags'] = pd.Series(metabolic_flags)

    # downgrade B flag auxiliary scores: a 'B' flagged gene scoring below 4
    # is set to 4
    if virsorter_affi_contigs is not None:
        adjusted_scores = {}
        for gene, row in annotations.iterrows():
            if 'B' in row['amg_flags'] and row['auxiliary_score'] < 4:
                adjusted_scores[gene] = 4
            else:
                adjusted_scores[gene] = row['auxiliary_score']
        annotations['auxiliary_score'] = pd.Series(adjusted_scores)

    # write annotations
    annotations_loc = path.join(output_dir, 'annotations.tsv')
    annotations.to_csv(annotations_loc, sep='\t')

    print("%s: Completed annotations" % str(datetime.now() - start_time))
Exemplo n.º 5
0
def annotate_vgfs(input_fasta,
                  virsorter_affi_contigs=None,
                  output_dir='.',
                  min_contig_size=2500,
                  prodigal_mode='meta',
                  trans_table='11',
                  bit_score_threshold=60,
                  rbh_bit_score_threshold=350,
                  custom_db_name=(),
                  custom_fasta_loc=(),
                  use_uniref=False,
                  low_mem_mode=False,
                  skip_trnascan=False,
                  keep_tmp_dir=True,
                  threads=10,
                  verbose=True):
    """Annotate the contigs of a FASTA file and write ``annotations.tsv``.

    Steps, in order: validate ``prodigal_mode``; load database locations;
    optionally load virsorter hits; write every contig of at least
    ``min_contig_size`` to its own FASTA under ``<output_dir>/vMAGs``; run
    ``annotate_fastas``; pass the result through
    ``add_dramv_scores_and_flags``; write the table to
    ``<output_dir>/annotations.tsv``.

    :raises ValueError: if ``prodigal_mode`` is not one of train/meta/single,
        if a retained contig id contains '=' or ';', or if virsorter hits were
        supplied and a retained contig is not among them
    :return: None; results are written to disk and progress is printed
    """
    # set up
    start_time = datetime.now()
    print('%s: Viral annotation started' % str(datetime.now()))

    # check inputs
    prodigal_modes = ['train', 'meta', 'single']
    if prodigal_mode not in prodigal_modes:
        raise ValueError('Prodigal mode must be one of %s.' %
                         ', '.join(prodigal_modes))
    if prodigal_mode in ['normal', 'single']:
        warnings.warn(
            'When running prodigal in single mode your bins must have long contigs (average length >3 Kbp), '
            'be long enough (total length > 500 Kbp) and have very low contamination in order for prodigal '
            'training to work well.')

    # get database locations
    db_locs = get_database_locs()
    db_handler = DatabaseHandler(db_locs['description_db'])
    db_locs_anno = filter_db_locs(db_locs, low_mem_mode, use_uniref,
                                  VMAG_DBS_TO_ANNOTATE)

    virsorter_hits = (get_virsorter_hits(virsorter_affi_contigs)
                      if virsorter_affi_contigs is not None else None)

    # split sequences into separate fastas, one file per retained contig
    mkdir(output_dir)
    contig_dir = path.join(output_dir, 'vMAGs')
    mkdir(contig_dir)
    contig_locs = []
    for seq in read_sequence(input_fasta, format='fasta'):
        if len(seq) < min_contig_size:
            continue
        seq_id = seq.metadata['id']
        if '=' in seq_id or ';' in seq_id:
            raise ValueError(
                'FASTA headers must not have = or ; before the first space (%s). To run DRAM-v you '
                'must rerun VIRSorter with = and ; removed from the headers or run DRAM-v.py '
                'remove_bad_characters and then rerun DRAM-v' % seq_id)
        if virsorter_hits is not None:
            known_names = virsorter_hits['name'].values
            if get_virsorter_affi_contigs_name(seq_id) not in known_names:
                raise ValueError(
                    "No virsorter calls found in %s for scaffold %s from input fasta"
                    % (virsorter_affi_contigs, seq_id))
        contig_loc = path.join(contig_dir, '%s.fasta' % seq_id)
        # the sequence is handed over wrapped in a generator, as before
        write_sequence((record for record in [seq]),
                       format='fasta',
                       into=contig_loc)
        contig_locs.append(contig_loc)

    # annotate vMAGs
    rename_bins = False
    annotations = annotate_fastas(contig_locs, output_dir, db_locs_anno,
                                  db_handler, min_contig_size, prodigal_mode,
                                  trans_table, bit_score_threshold,
                                  rbh_bit_score_threshold, custom_db_name,
                                  custom_fasta_loc, skip_trnascan, rename_bins,
                                  keep_tmp_dir, start_time, threads, verbose)
    elapsed = datetime.now() - start_time
    print('%s: Annotations complete, assigning auxiliary scores and flags' %
          str(elapsed))

    annotations = add_dramv_scores_and_flags(annotations, db_locs,
                                             virsorter_hits, input_fasta)

    # write annotations
    annotations_loc = path.join(output_dir, 'annotations.tsv')
    annotations.to_csv(annotations_loc, sep='\t')

    print("%s: Completed annotations" % str(datetime.now() - start_time))
Exemplo n.º 6
0
def add_dramv_scores_and_flags(annotations,
                               db_locs=None,
                               virsorter_hits=None,
                               input_fasta=None):
    """Add transposon, AMG flag and (optionally) virsorter columns.

    Always sets ``is_transposon`` and ``amg_flags`` on ``annotations``. When
    ``virsorter_hits`` is given, also sets ``virsorter_category`` and
    ``auxiliary_score``, then sets the score of any gene whose flags contain
    'B' and whose score is below 4 to 4.

    :param annotations: annotations frame; modified in place and returned
    :param db_locs: mapping read for the 'amg_database' and
        'genome_summary_form' entries (tab-separated files)
    :param virsorter_hits: frame of virsorter hits, or None to skip the
        virsorter-derived columns
    :param input_fasta: FASTA read to obtain scaffold lengths
    :return: the same ``annotations`` object with the added columns
    """
    # setting up scoring viral genes
    amg_database_frame = pd.read_csv(db_locs['amg_database'], sep='\t')
    genome_summary_form = pd.read_csv(db_locs['genome_summary_form'],
                                      sep='\t',
                                      index_col=0)
    potential_amg_rows = genome_summary_form.potential_amg
    genome_summary_form = genome_summary_form.loc[potential_amg_rows]

    has_virsorter = virsorter_hits is not None

    # add auxiliary score
    if has_virsorter:
        gene_virsorter_category_dict = {}
        gene_auxiliary_score_dict = {}
        for scaffold, dram_frame in annotations.groupby('scaffold'):
            virsorter_scaffold_name = get_virsorter_affi_contigs_name(scaffold)
            on_scaffold = virsorter_hits.name == virsorter_scaffold_name
            virsorter_frame = virsorter_hits.loc[on_scaffold]
            gene_order = get_gene_order(dram_frame, virsorter_frame)
            for dram_gene, _, virsorter_category in gene_order:
                if dram_gene is not None:
                    gene_virsorter_category_dict[dram_gene] = virsorter_category
            gene_auxiliary_score_dict.update(
                calculate_auxiliary_scores(gene_order))
        annotations['virsorter_category'] = pd.Series(
            gene_virsorter_category_dict)
        annotations['auxiliary_score'] = pd.Series(gene_auxiliary_score_dict)

    # get metabolic flags
    scaffold_length_dict = {}
    for seq in read_sequence(input_fasta, format='fasta'):
        scaffold_length_dict[seq.metadata['id']] = len(seq)
    metabolic_genes = set(genome_summary_form.index)
    if 'pfam_hits' in annotations:
        annotations['is_transposon'] = list(
            map(is_transposon, annotations['pfam_hits']))
    else:
        annotations['is_transposon'] = False

    amgs = get_amg_ids(amg_database_frame)
    verified_rows = amg_database_frame.loc[amg_database_frame.verified]
    verified_amgs = get_amg_ids(verified_rows)
    metabolic_flags = get_metabolic_flags(annotations, metabolic_genes, amgs,
                                          verified_amgs, scaffold_length_dict)
    annotations['amg_flags'] = pd.Series(metabolic_flags)

    # downgrade B flag auxiliary scores: a 'B' flagged gene scoring below 4
    # is set to 4
    if has_virsorter:
        adjusted_scores = {}
        for gene, row in annotations.iterrows():
            if 'B' in row['amg_flags'] and row['auxiliary_score'] < 4:
                adjusted_scores[gene] = 4
            else:
                adjusted_scores[gene] = row['auxiliary_score']
        annotations['auxiliary_score'] = pd.Series(adjusted_scores)

    return annotations