def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxiliary_score=3,
                   remove_transposons=False, remove_fs=False):
    """Distill DRAM-v annotations into viral genome stats, an AMG summary and a heatmap.

    Reads the tab-separated annotations table, filters it down to potential
    AMGs and writes three files into ``output_dir``: ``vMAG_stats.tsv``,
    ``amg_summary.tsv`` and ``product.html``.

    Args:
        input_file: Path to the tab-separated annotations file; its first
            column is used as the index.
        output_dir: Directory to create (via ``mkdir``) and write results to.
        groupby_column: Annotation column used to group genes into viral
            genomes.
        max_auxiliary_score: Forwarded to ``filter_to_amgs`` as ``max_aux``.
        remove_transposons: Forwarded to ``filter_to_amgs``.
        remove_fs: Forwarded to ``filter_to_amgs``.

    Raises:
        ValueError: If no genome summary form location is configured.
    """
    start_time = datetime.now()

    # set up
    annotations = pd.read_csv(input_file, sep='\t', index_col=0)
    db_locs = get_database_locs()
    # An unset location can be present in the dict with a None value, so a
    # plain membership test is not enough to detect a missing form.
    if db_locs.get('genome_summary_form') is None:
        raise ValueError('Genome summary form location must be set in order to summarize genomes')
    mkdir(output_dir)
    genome_summary_form = pd.read_csv(db_locs['genome_summary_form'], sep='\t', index_col=0)
    print('%s: Retrieved database locations and descriptions' % (str(datetime.now() - start_time)))

    # get potential AMGs
    potential_amgs = filter_to_amgs(annotations.fillna(''), max_aux=max_auxiliary_score,
                                    remove_transposons=remove_transposons, remove_fs=remove_fs)
    print('%s: Determined potential amgs' % (str(datetime.now() - start_time)))

    # make distillate
    viral_genome_stats = make_viral_stats_table(annotations, potential_amgs, groupby_column)
    viral_genome_stats.to_csv(path.join(output_dir, 'vMAG_stats.tsv'), sep='\t')
    print('%s: Calculated viral genome statistics' % (str(datetime.now() - start_time)))

    viral_distillate = make_viral_distillate(potential_amgs, genome_summary_form)
    viral_distillate.to_csv(path.join(output_dir, 'amg_summary.tsv'), sep='\t', index=False)
    print('%s: Generated AMG summary' % (str(datetime.now() - start_time)))

    # make liquor
    vgf_order = make_vgf_order(potential_amgs)
    amg_column = make_amg_count_column(potential_amgs, vgf_order)
    viral_function_df = make_viral_functional_df(potential_amgs, genome_summary_form,
                                                 groupby_column=groupby_column)
    viral_functional_heatmap = make_viral_functional_heatmap(viral_function_df, vgf_order)
    product = alt.hconcat(amg_column, viral_functional_heatmap, spacing=5)
    product.save(path.join(output_dir, 'product.html'))
    print('%s: Generated product heatmap' % (str(datetime.now() - start_time)))
    print("%s: Completed distillation" % str(datetime.now() - start_time))
def populate_description_db(output_loc=None, db_dict=None, start_time=None):
    """Rebuild the description database from the configured source databases.

    Args:
        output_loc: Where to create the description database when
            ``db_dict`` has no ``description_db`` entry. Ignored when an
            entry is already set.
        db_dict: Mapping of database names to locations; defaults to the
            result of ``get_database_locs()``. Its ``description_db`` entry
            is filled in from ``output_loc`` when unset.
        start_time: Reference time for the elapsed-time log prefix; defaults
            to the time of the call.

    Raises:
        ValueError: If neither ``db_dict`` nor ``output_loc`` supplies a
            description database location.
    """
    if start_time is None:
        start_time = datetime.now()

    def elapsed():
        return str(datetime.now() - start_time)

    print('%s: Populating description database' % elapsed())

    # setup
    if db_dict is None:
        db_dict = get_database_locs()
    configured_loc = db_dict.get('description_db')
    if configured_loc is None:
        if output_loc is None:
            raise ValueError('Must provide output location if description db location is not set in configuration')
        db_dict['description_db'] = output_loc
    elif path.exists(configured_loc):
        # an already-configured database is deleted and rebuilt from scratch
        remove(configured_loc)
    create_description_db(db_dict['description_db'])
    db_handler = DatabaseHandler(db_dict['description_db'])
    print('%s: Database connection established' % elapsed())

    # fill database: (source key, target table, description parser, log line)
    sources = (
        ('kegg', 'kegg_description', make_header_dict_from_mmseqs_db,
         '%s: KEGG descriptions added to description database'),
        ('uniref', 'uniref_description', make_header_dict_from_mmseqs_db,
         '%s: UniRef descriptions added to description database'),
        ('pfam_hmm_dat', 'pfam_description', process_pfam_descriptions,
         '%s: PFAM descriptions added to description database'),
        ('dbcan_fam_activities', 'dbcan_description', process_dbcan_descriptions,
         '%s: dbCAN descriptions added to description database'),
        ('viral', 'viral_description', make_header_dict_from_mmseqs_db,
         '%s: RefSeq viral descriptions added to description database'),
        ('peptidase', 'peptidase_description', make_header_dict_from_mmseqs_db,
         '%s: MEROPS descriptions added to description database'),
        ('vog_annotations', 'vogdb_description', process_vogdb_descriptions,
         '%s: VOGdb descriptions added to description database'),
    )
    for source_key, table_name, parser, log_line in sources:
        add_to_description_db(db_dict[source_key], table_name, parser, db_handler)
        print(log_line % elapsed())

    print('%s: Description database populated' % elapsed())
def test_set_database_paths(tmpdir):
    """Exercise set_database_paths against a throwaway CONFIG file.

    Every call that writes is pointed at a CONFIG inside ``tmpdir``, so the
    installed configuration is never modified, even if an assertion fails.
    """
    test_config_dir = tmpdir.mkdir('test_config')
    test_config = os.path.join(test_config_dir, 'CONFIG')

    # first test that adding nothing doesn't change CONFIG
    pretest_db_dict = get_database_locs()
    set_database_paths(config_loc=test_config)
    test_db_dict = get_database_locs(test_config)
    assert isinstance(test_db_dict, dict)
    assert pretest_db_dict == test_db_dict

    # test that adding something that doesn't exist throws error; config_loc
    # is passed so a regression cannot overwrite the real CONFIG
    test_fake_database = os.path.join(test_config_dir, 'fake_database.mmsdb')
    with pytest.raises(ValueError):
        set_database_paths(kegg_db_loc=test_fake_database, config_loc=test_config)

    # test that adding something real is really added
    kegg_loc = os.path.join('tests', 'data', 'fake_gff.gff')
    set_database_paths(kegg_db_loc=kegg_loc, config_loc=test_config)
    test_db_dict = get_database_locs(test_config)
    assert test_db_dict['kegg'] == os.path.realpath(kegg_loc)

    # test that adding something with use_current_locs False works
    set_database_paths(kegg_db_loc=kegg_loc, config_loc=test_config, use_current_locs=False)
    test_db_dict = get_database_locs(test_config)
    assert test_db_dict['kegg'] == os.path.realpath(kegg_loc)
    assert test_db_dict['description_db'] is None
def print_database_locations(db_locs=None):
    """Print one labelled line per known database or form location.

    Args:
        db_locs: Mapping of database names to locations; defaults to the
            result of ``get_database_locs()``. Names absent from the mapping
            are printed as ``None``.
    """
    if db_locs is None:
        db_locs = get_database_locs()

    # (output template, key in db_locs), in display order
    report_lines = (
        ('KEGG db: %s', 'kegg'),
        ('KOfam db: %s', 'kofam'),
        ('KOfam KO list: %s', 'kofam_ko_list'),
        ('UniRef db: %s', 'uniref'),
        ('Pfam db: %s', 'pfam'),
        ('Pfam hmm dat: %s', 'pfam_hmm_dat'),
        ('dbCAN db: %s', 'dbcan'),
        ('dbCAN family activities: %s', 'dbcan_fam_activities'),
        ('RefSeq Viral db: %s', 'viral'),
        ('MEROPS peptidase db: %s', 'peptidase'),
        ('VOGDB db: %s', 'vogdb'),
        ('VOG annotations: %s', 'vog_annotations'),
        ('Description db: %s', 'description_db'),
        ('Genome summary form: %s', 'genome_summary_form'),
        ('Module step form: %s', 'module_step_form'),
        ('ETC module database: %s', 'etc_module_database'),
        ('Function heatmap form: %s', 'function_heatmap_form'),
        ('AMG database: %s', 'amg_database'),
    )
    for template, key in report_lines:
        print(template % db_locs.get(key))
def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_contig_size=2500,
                  prodigal_mode='meta', trans_table='11', bit_score_threshold=60,
                  rbh_bit_score_threshold=350, custom_db_name=(), custom_fasta_loc=(), use_uniref=False,
                  low_mem_mode=False, skip_trnascan=False, keep_tmp_dir=True, threads=10, verbose=True):
    """Annotate viral contigs and write ``annotations.tsv`` to ``output_dir``.

    Each sequence in ``input_fasta`` of at least ``min_contig_size`` is
    written to its own FASTA file under ``<output_dir>/vMAGs`` and annotated
    with ``annotate_fastas``. The annotations then gain ``is_transposon`` and
    ``amg_flags`` columns and, when VirSorter calls are supplied,
    ``virsorter_category`` and ``auxiliary_score`` columns.

    Args:
        input_fasta: FASTA file of viral contigs.
        virsorter_affi_contigs: Optional VirSorter affi-contigs file; when
            given, every retained contig must have a call in it.
        output_dir: Directory to create and write results to.
        min_contig_size: Sequences shorter than this are skipped.
        prodigal_mode: One of ``'train'``, ``'meta'`` or ``'single'``.
        trans_table, bit_score_threshold, rbh_bit_score_threshold,
        custom_db_name, custom_fasta_loc, skip_trnascan, keep_tmp_dir,
        threads, verbose: Forwarded to ``annotate_fastas``.
        use_uniref, low_mem_mode: Forwarded to ``filter_db_locs`` to choose
            which databases are searched.

    Raises:
        ValueError: If ``prodigal_mode`` is invalid, a retained FASTA id
            contains ``=`` or ``;``, or a retained contig has no VirSorter
            call.
    """
    # set up
    start_time = datetime.now()
    print('%s: Viral annotation started' % str(datetime.now()))

    # check inputs
    prodigal_modes = ['train', 'meta', 'single']
    if prodigal_mode not in prodigal_modes:
        raise ValueError('Prodigal mode must be one of %s.' % ', '.join(prodigal_modes))
    elif prodigal_mode == 'single':
        warnings.warn('When running prodigal in single mode your bins must have long contigs (average length >3 Kbp), '
                      'be long enough (total length > 500 Kbp) and have very low contamination in order for prodigal '
                      'training to work well.')

    # get database locations
    db_locs = get_database_locs()
    db_handler = DatabaseHandler(db_locs['description_db'])
    db_locs_anno = filter_db_locs(db_locs, low_mem_mode, use_uniref, VMAG_DBS_TO_ANNOTATE)

    if virsorter_affi_contigs is not None:
        virsorter_hits = get_virsorter_hits(virsorter_affi_contigs)
        # built once so the per-contig membership test below is not a linear scan
        virsorter_names = set(virsorter_hits['name'])
    else:
        virsorter_hits = None
        virsorter_names = None

    # split sequences into separate fastas, recording every scaffold's length
    # on the way so the input does not have to be parsed a second time
    mkdir(output_dir)
    contig_dir = path.join(output_dir, 'vMAGs')
    mkdir(contig_dir)
    contig_locs = list()
    scaffold_length_dict = dict()
    for seq in read_sequence(input_fasta, format='fasta'):
        seq_id = seq.metadata['id']
        scaffold_length_dict[seq_id] = len(seq)
        if len(seq) < min_contig_size:
            continue
        if '=' in seq_id or ';' in seq_id:
            raise ValueError('FASTA headers must not have = or ; before the first space (%s). To run DRAM-v you '
                             'must rerun VIRSorter with = and ; removed from the headers or run DRAM-v.py '
                             'remove_bad_characters and then rerun DRAM-v' % seq_id)
        if virsorter_names is not None:
            if get_virsorter_affi_contigs_name(seq_id) not in virsorter_names:
                raise ValueError("No virsorter calls found in %s for scaffold %s from input fasta" %
                                 (virsorter_affi_contigs, seq_id))
        contig_loc = path.join(contig_dir, '%s.fasta' % seq_id)
        write_sequence((i for i in [seq]), format='fasta', into=contig_loc)
        contig_locs.append(contig_loc)

    # annotate vMAGs
    rename_bins = False
    annotations = annotate_fastas(contig_locs, output_dir, db_locs_anno, db_handler, min_contig_size,
                                  prodigal_mode, trans_table, bit_score_threshold, rbh_bit_score_threshold,
                                  custom_db_name, custom_fasta_loc, skip_trnascan, rename_bins, keep_tmp_dir,
                                  start_time, threads, verbose)
    print('%s: Annotations complete, processing annotations' % str(datetime.now() - start_time))

    # setting up scoring viral genes
    amg_database_frame = pd.read_csv(db_locs['amg_database'], sep='\t')
    genome_summary_form = pd.read_csv(db_locs['genome_summary_form'], sep='\t', index_col=0)
    genome_summary_form = genome_summary_form.loc[genome_summary_form.potential_amg]

    # add auxiliary score
    if virsorter_hits is not None:
        gene_virsorter_category_dict = dict()
        gene_auxiliary_score_dict = dict()
        for scaffold, dram_frame in annotations.groupby('scaffold'):
            virsorter_scaffold_name = get_virsorter_affi_contigs_name(scaffold)
            virsorter_frame = virsorter_hits.loc[virsorter_hits.name == virsorter_scaffold_name]
            gene_order = get_gene_order(dram_frame, virsorter_frame)
            gene_virsorter_category_dict.update({
                dram_gene: virsorter_category
                for dram_gene, _, virsorter_category in gene_order
                if dram_gene is not None
            })
            gene_auxiliary_score_dict.update(calculate_auxiliary_scores(gene_order))
        annotations['virsorter_category'] = pd.Series(gene_virsorter_category_dict)
        annotations['auxiliary_score'] = pd.Series(gene_auxiliary_score_dict)

    # get metabolic flags
    metabolic_genes = set(genome_summary_form.index)
    if 'pfam_hits' in annotations:
        annotations['is_transposon'] = [is_transposon(i) for i in annotations['pfam_hits']]
    else:
        annotations['is_transposon'] = False

    amgs = get_amg_ids(amg_database_frame)
    verified_amgs = get_amg_ids(amg_database_frame.loc[amg_database_frame.verified])
    annotations['amg_flags'] = pd.Series(get_metabolic_flags(annotations, metabolic_genes, amgs, verified_amgs,
                                                             scaffold_length_dict))

    # downgrade B flag auxiliary scores: a gene flagged 'B' never keeps a
    # score better (lower) than 4
    if virsorter_hits is not None:
        annotations['auxiliary_score'] = pd.Series({
            gene: (4 if 'B' in row['amg_flags'] and row['auxiliary_score'] < 4 else row['auxiliary_score'])
            for gene, row in annotations.iterrows()
        })

    # write annotations
    annotations.to_csv(path.join(output_dir, 'annotations.tsv'), sep='\t')

    print("%s: Completed annotations" % str(datetime.now() - start_time))
def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_contig_size=2500,
                  prodigal_mode='meta', trans_table='11', bit_score_threshold=60,
                  rbh_bit_score_threshold=350, custom_db_name=(), custom_fasta_loc=(), use_uniref=False,
                  low_mem_mode=False, skip_trnascan=False, keep_tmp_dir=True, threads=10, verbose=True):
    """Annotate viral contigs and write ``annotations.tsv`` to ``output_dir``.

    Each sequence in ``input_fasta`` of at least ``min_contig_size`` is
    written to its own FASTA file under ``<output_dir>/vMAGs`` and annotated
    with ``annotate_fastas``. The result is passed through
    ``add_dramv_scores_and_flags`` before being written out.

    Args:
        input_fasta: FASTA file of viral contigs.
        virsorter_affi_contigs: Optional VirSorter affi-contigs file; when
            given, every retained contig must have a call in it.
        output_dir: Directory to create and write results to.
        min_contig_size: Sequences shorter than this are skipped.
        prodigal_mode: One of ``'train'``, ``'meta'`` or ``'single'``.
        trans_table, bit_score_threshold, rbh_bit_score_threshold,
        custom_db_name, custom_fasta_loc, skip_trnascan, keep_tmp_dir,
        threads, verbose: Forwarded to ``annotate_fastas``.
        use_uniref, low_mem_mode: Forwarded to ``filter_db_locs`` to choose
            which databases are searched.

    Raises:
        ValueError: If ``prodigal_mode`` is invalid, a retained FASTA id
            contains ``=`` or ``;``, or a retained contig has no VirSorter
            call.
    """
    # set up
    start_time = datetime.now()
    print('%s: Viral annotation started' % str(datetime.now()))

    # check inputs
    prodigal_modes = ['train', 'meta', 'single']
    if prodigal_mode not in prodigal_modes:
        raise ValueError('Prodigal mode must be one of %s.' % ', '.join(prodigal_modes))
    elif prodigal_mode == 'single':
        warnings.warn('When running prodigal in single mode your bins must have long contigs (average length >3 Kbp), '
                      'be long enough (total length > 500 Kbp) and have very low contamination in order for prodigal '
                      'training to work well.')

    # get database locations
    db_locs = get_database_locs()
    db_handler = DatabaseHandler(db_locs['description_db'])
    db_locs_anno = filter_db_locs(db_locs, low_mem_mode, use_uniref, VMAG_DBS_TO_ANNOTATE)

    if virsorter_affi_contigs is not None:
        virsorter_hits = get_virsorter_hits(virsorter_affi_contigs)
        # built once so the per-contig membership test below is not a linear scan
        virsorter_names = set(virsorter_hits['name'])
    else:
        virsorter_hits = None
        virsorter_names = None

    # split sequences into separate fastas
    mkdir(output_dir)
    contig_dir = path.join(output_dir, 'vMAGs')
    mkdir(contig_dir)
    contig_locs = list()
    for seq in read_sequence(input_fasta, format='fasta'):
        if len(seq) < min_contig_size:
            continue
        seq_id = seq.metadata['id']
        if '=' in seq_id or ';' in seq_id:
            raise ValueError('FASTA headers must not have = or ; before the first space (%s). To run DRAM-v you '
                             'must rerun VIRSorter with = and ; removed from the headers or run DRAM-v.py '
                             'remove_bad_characters and then rerun DRAM-v' % seq_id)
        if virsorter_names is not None:
            if get_virsorter_affi_contigs_name(seq_id) not in virsorter_names:
                raise ValueError("No virsorter calls found in %s for scaffold %s from input fasta" %
                                 (virsorter_affi_contigs, seq_id))
        contig_loc = path.join(contig_dir, '%s.fasta' % seq_id)
        write_sequence((i for i in [seq]), format='fasta', into=contig_loc)
        contig_locs.append(contig_loc)

    # annotate vMAGs
    rename_bins = False
    annotations = annotate_fastas(contig_locs, output_dir, db_locs_anno, db_handler, min_contig_size,
                                  prodigal_mode, trans_table, bit_score_threshold, rbh_bit_score_threshold,
                                  custom_db_name, custom_fasta_loc, skip_trnascan, rename_bins, keep_tmp_dir,
                                  start_time, threads, verbose)
    print('%s: Annotations complete, assigning auxiliary scores and flags' % str(datetime.now() - start_time))

    annotations = add_dramv_scores_and_flags(annotations, db_locs, virsorter_hits, input_fasta)

    # write annotations
    annotations.to_csv(path.join(output_dir, 'annotations.tsv'), sep='\t')

    print("%s: Completed annotations" % str(datetime.now() - start_time))
def test_get_database_locs():
    """The default configuration loads as a plain dict with a description_db entry."""
    loaded_locs = get_database_locs()
    assert type(loaded_locs) is dict
    assert 'description_db' in loaded_locs
def summarize_genomes(input_file, trna_path=None, rrna_path=None, output_dir='.', groupby_column='fasta',
                      custom_distillate=None, distillate_gene_names=False):
    """Distill genome annotations into stats, a metabolism summary and product heatmaps.

    Writes ``genome_stats.tsv``, ``metabolism_summary.xlsx`` and
    ``product.tsv`` into ``output_dir``, plus ``product.html`` or, when
    there are more than ``GENOMES_PER_LIQUOR`` genomes, one
    ``product_<i>.html`` per chunk of genomes.

    Args:
        input_file: Path to the tab-separated annotations file; its first
            column is used as the index.
        trna_path: Optional tab-separated tRNA table.
        rrna_path: Optional tab-separated rRNA table.
        output_dir: Directory to create and write results to.
        groupby_column: Annotation column that identifies a genome.
        custom_distillate: Optional tab-separated file whose rows are
            appended to the genome summary form.
        distillate_gene_names: If true, build the metabolism summary with
            ``fill_genome_summary_frame_gene_names`` instead of
            ``make_genome_summary``.

    Raises:
        ValueError: If a required form or database location is not
            configured.
    """
    start_time = datetime.now()

    # read in data
    annotations = pd.read_csv(input_file, sep='\t', index_col=0)
    if 'bin_taxonomy' in annotations:
        annotations = annotations.sort_values('bin_taxonomy')

    trna_frame = None if trna_path is None else pd.read_csv(trna_path, sep='\t')
    rrna_frame = None if rrna_path is None else pd.read_csv(rrna_path, sep='\t')

    # get db_locs and check that everything needed is configured; an unset
    # location can be present with a None value, so test the value itself
    db_locs = get_database_locs()
    if db_locs.get('genome_summary_form') is None:
        raise ValueError('Genome summary form location must be set in order to summarize genomes')
    if db_locs.get('module_step_form') is None:
        raise ValueError('Module step form location must be set in order to summarize genomes')
    if db_locs.get('function_heatmap_form') is None:
        raise ValueError('Functional heat map form location must be set in order to summarize genomes')
    if db_locs.get('etc_module_database') is None:
        raise ValueError('ETC module database location must be set in order to summarize genomes')

    # read in dbs
    genome_summary_form = pd.read_csv(db_locs['genome_summary_form'], sep='\t')
    if custom_distillate is not None:
        genome_summary_form = pd.concat([genome_summary_form, pd.read_csv(custom_distillate, sep='\t')])
    genome_summary_form = genome_summary_form.drop('potential_amg', axis=1)
    module_steps_form = pd.read_csv(db_locs['module_step_form'], sep='\t')
    function_heatmap_form = pd.read_csv(db_locs['function_heatmap_form'], sep='\t')
    etc_module_df = pd.read_csv(db_locs['etc_module_database'], sep='\t')
    print('%s: Retrieved database locations and descriptions' % (str(datetime.now() - start_time)))

    # make output folder
    mkdir(output_dir)

    # make genome stats
    genome_stats = make_genome_stats(annotations, rrna_frame, trna_frame, groupby_column=groupby_column)
    genome_stats.to_csv(path.join(output_dir, 'genome_stats.tsv'), sep='\t', index=False)
    print('%s: Calculated genome statistics' % (str(datetime.now() - start_time)))

    # make genome metabolism summary
    genome_summary = path.join(output_dir, 'metabolism_summary.xlsx')
    if distillate_gene_names:
        summarized_genomes = fill_genome_summary_frame_gene_names(annotations, genome_summary_form,
                                                                  groupby_column)
    else:
        summarized_genomes = make_genome_summary(annotations, genome_summary_form, trna_frame, rrna_frame,
                                                 groupby_column)
    write_summarized_genomes_to_xlsx(summarized_genomes, genome_summary)
    print('%s: Generated genome metabolism summary' % (str(datetime.now() - start_time)))

    # make liquor
    if 'bin_taxonomy' in annotations:
        genome_order = get_ordered_uniques(annotations.sort_values('bin_taxonomy')[groupby_column])
        # if gtdb format then get phylum and most specific
        if all(i[:3] == 'd__' and len(i.split(';')) == 7 for i in annotations['bin_taxonomy']):
            taxa_str_parser = get_phylum_and_most_specific
        # else just throw in what is there
        else:
            taxa_str_parser = lambda x: x
        labels = make_strings_no_repeats({
            row[groupby_column]: taxa_str_parser(row['bin_taxonomy'])
            for _, row in annotations.iterrows()
        })
    else:
        genome_order = get_ordered_uniques(annotations.sort_values(groupby_column)[groupby_column])
        labels = None

    # make module coverage frame
    module_nets = {
        module: build_module_net(module_df)
        for module, module_df in module_steps_form.groupby('module')
        if module in HEATMAP_MODULES
    }

    if len(genome_order) > GENOMES_PER_LIQUOR:
        module_coverage_dfs = list()
        etc_coverage_dfs = list()
        function_dfs = list()
        # generates slice start and slice end to grab from genomes and labels from 0 to end of genome order
        pairwise_iter = pairwise(list(range(0, len(genome_order), GENOMES_PER_LIQUOR)) + [len(genome_order)])
        for i, (start, end) in enumerate(pairwise_iter):
            genomes = genome_order[start:end]
            annotations_subset = annotations.loc[annotations[groupby_column].isin(genomes)]
            # group by the caller's column, matching the subset selection above
            dfs = fill_liquor_dfs(annotations_subset, module_nets, etc_module_df, function_heatmap_form,
                                  groupby_column=groupby_column)
            module_coverage_df_subset, etc_coverage_df_subset, function_df_subset = dfs
            module_coverage_dfs.append(module_coverage_df_subset)
            etc_coverage_dfs.append(etc_coverage_df_subset)
            function_dfs.append(function_df_subset)
            liquor = make_liquor_heatmap(module_coverage_df_subset, etc_coverage_df_subset, function_df_subset,
                                         genomes, labels)
            liquor.save(path.join(output_dir, 'product_%s.html' % i))
        liquor_df = make_liquor_df(pd.concat(module_coverage_dfs), pd.concat(etc_coverage_dfs),
                                   pd.concat(function_dfs))
        liquor_df.to_csv(path.join(output_dir, 'product.tsv'), sep='\t')
    else:
        module_coverage_df, etc_coverage_df, function_df = fill_liquor_dfs(
            annotations, module_nets, etc_module_df, function_heatmap_form, groupby_column=groupby_column)
        liquor_df = make_liquor_df(module_coverage_df, etc_coverage_df, function_df)
        liquor_df.to_csv(path.join(output_dir, 'product.tsv'), sep='\t')
        liquor = make_liquor_heatmap(module_coverage_df, etc_coverage_df, function_df, genome_order, labels)
        liquor.save(path.join(output_dir, 'product.html'))
    print('%s: Generated product heatmap and table' % (str(datetime.now() - start_time)))
    print("%s: Completed distillation" % str(datetime.now() - start_time))
def set_database_paths(kegg_db_loc=None, kofam_hmm_loc=None, kofam_ko_list_loc=None, uniref_db_loc=None,
                       pfam_db_loc=None, pfam_hmm_dat=None, dbcan_db_loc=None, dbcan_fam_activities=None,
                       viral_db_loc=None, peptidase_db_loc=None, vogdb_db_loc=None, vog_annotations=None,
                       description_db_loc=None, genome_summary_form_loc=None, module_step_form_loc=None,
                       etc_module_database_loc=None, function_heatmap_form_loc=None, amg_database_loc=None,
                       start_time=None, config_loc=None, use_current_locs=True, update_description_db=False):
    """Record database and form locations in the CONFIG file.

    Every ``*_loc`` argument is passed through
    ``check_exists_and_add_to_location_dict`` under its database name, and
    the resulting mapping is written as JSON to ``config_loc``.

    Args:
        kegg_db_loc ... amg_database_loc: Location of the corresponding
            database or form; ``None`` means "not supplied".
        start_time: Reference time for the elapsed-time log prefix; defaults
            to the time of the call.
        config_loc: CONFIG file to write; defaults to ``get_config_loc()``.
        use_current_locs: If true, start from ``get_database_locs()`` (read
            with its default config, not ``config_loc``); otherwise start
            from an empty mapping.
        update_description_db: If true, rebuild the description database via
            ``populate_description_db`` before recording its location.

    Raises:
        ValueError: Presumably raised by
            ``check_exists_and_add_to_location_dict`` for a nonexistent
            location (the tests expect this); also raised by
            ``populate_description_db`` when no description database
            location is available.
    """
    if start_time is None:
        start_time = datetime.now()
    print('%s: Setting database paths' % str(datetime.now() - start_time))

    db_dict = get_database_locs() if use_current_locs else {}

    # (supplied location, name in CONFIG), in the order they are recorded
    supplied_locations = (
        (kegg_db_loc, 'kegg'),
        (kofam_hmm_loc, 'kofam'),
        (kofam_ko_list_loc, 'kofam_ko_list'),
        (uniref_db_loc, 'uniref'),
        (pfam_db_loc, 'pfam'),
        (pfam_hmm_dat, 'pfam_hmm_dat'),
        (dbcan_db_loc, 'dbcan'),
        (dbcan_fam_activities, 'dbcan_fam_activities'),
        (viral_db_loc, 'viral'),
        (peptidase_db_loc, 'peptidase'),
        (vogdb_db_loc, 'vogdb'),
        (vog_annotations, 'vog_annotations'),
        (genome_summary_form_loc, 'genome_summary_form'),
        (module_step_form_loc, 'module_step_form'),
        (etc_module_database_loc, 'etc_module_database'),
        (function_heatmap_form_loc, 'function_heatmap_form'),
        (amg_database_loc, 'amg_database'),
    )
    for location, db_name in supplied_locations:
        db_dict = check_exists_and_add_to_location_dict(location, db_name, db_dict)
    print('%s: Database locations added to CONFIG' % str(datetime.now() - start_time))

    if update_description_db:
        if description_db_loc is None:
            # .get: with use_current_locs=False there may be no entry at all;
            # populate_description_db then reports the missing location
            description_db_loc = db_dict.get('description_db')
        populate_description_db(description_db_loc, db_dict, start_time)
        print('%s: Database descriptions updated' % str(datetime.now() - start_time))
    # recorded last because the description database may only exist after
    # populate_description_db has run
    db_dict = check_exists_and_add_to_location_dict(description_db_loc, 'description_db', db_dict)

    # change data paths
    if config_loc is None:
        config_loc = get_config_loc()
    with open(config_loc, 'w') as f:
        f.write(json.dumps(db_dict))
    print('%s: Database locations set' % str(datetime.now() - start_time))