def make_etc_coverage_df(etc_module_df, annotations, groupby_column='fasta'):
    """Build a table of ETC module coverage for every annotation group.

    For each row of ``etc_module_df`` the module ``definition`` is stripped
    of optional subunits (terms of the form ``-K`` followed by five digits),
    converted to a network with ``make_module_network`` and every node
    without outgoing edges is connected to a synthetic ``'end'`` node. The
    coverage of that network is then evaluated with ``get_module_coverage``
    against the identifiers of each group of ``annotations``.

    Args:
        etc_module_df: Frame with the columns ``module_id``, ``module_name``,
            ``complex`` and ``definition``.
        annotations: Annotation frame, grouped by ``groupby_column``.
        groupby_column: Column of ``annotations`` identifying a genome.

    Returns:
        A ``pd.DataFrame`` with one row per (module, group) pair and the
        columns given by ``ETC_COVERAGE_COLUMNS``. Each row holds, in order:
        module id, module name, complex label (without the ``'Complex '``
        prefix), group, path length, coverage count, coverage percent,
        comma-joined sorted covered genes, comma-joined sorted missing genes
        and the combined ``'Complex <label>: <module name>'`` string.
    """
    # The identifiers of a group do not depend on the module, so extract them
    # once per group instead of once per (module, group) pair.
    # NOTE(review): assumes get_module_coverage does not mutate the id set it
    # is given, since the same set is now reused for every module.
    ids_per_group = [
        (group, set(get_ids_from_annotation(frame).keys()))
        for group, frame in annotations.groupby(groupby_column)
    ]

    etc_coverage_df_rows = []
    for _, module_row in etc_module_df.iterrows():
        # remove optional subunits
        definition = re.sub(r'-K\d\d\d\d\d', '', module_row['definition'])
        module_net, _unused = make_module_network(definition)

        # add end node; collect the sinks first because adding edges changes
        # the graph that is being inspected
        no_out = [
            node for node in module_net.nodes()
            if module_net.out_degree(node) == 0
        ]
        for node in no_out:
            module_net.add_edge(node, 'end')

        complex_label = module_row['complex'].replace('Complex ', '')
        complex_module_name = 'Complex %s: %s' % (
            complex_label, module_row['module_name'])

        # go through each genome and check pathway coverage
        for group, grouped_ids in ids_per_group:
            (path_len, path_coverage_count, path_coverage_percent,
             genes, missing_genes) = get_module_coverage(module_net, grouped_ids)
            etc_coverage_df_rows.append([
                module_row['module_id'],
                module_row['module_name'],
                complex_label,
                group,
                path_len,
                path_coverage_count,
                path_coverage_percent,
                ','.join(sorted(genes)),
                ','.join(sorted(missing_genes)),
                complex_module_name,
            ])
    return pd.DataFrame(etc_coverage_df_rows, columns=ETC_COVERAGE_COLUMNS)
def make_functional_df(annotations, function_heatmap_form, groupby_column='fasta'):
    """Build a long-form table of function presence per genome.

    ``function_heatmap_form`` is cleaned first (object columns are stripped
    of surrounding whitespace, missing values become ``''``) and then grouped
    by ``function_name``. Every row of a group lists comma-separated
    identifiers in ``function_ids``; a function counts as present in a genome
    only if *every* row of its group has at least one identifier among the
    genome's annotation identifiers.

    Args:
        annotations: Annotation frame, grouped by ``groupby_column``.
        function_heatmap_form: Frame with the columns ``category``,
            ``subcategory``, ``function_name``, ``function_ids``,
            ``long_function_name`` and ``gene_symbol``.
        groupby_column: Column of ``annotations`` identifying a genome.

    Returns:
        A ``pd.DataFrame`` with one row per (function, genome) pair. Its
        columns are those of ``function_heatmap_form`` followed by
        ``genome``, ``present`` and ``category_function_name``. The
        ``function_ids`` position holds the identifiers actually found in the
        genome, sorted and joined with ``', '`` so the output is stable
        between runs.
    """
    # clean up function heatmap form
    function_heatmap_form = function_heatmap_form.apply(
        lambda x: x.str.strip() if x.dtype == "object" else x)
    function_heatmap_form = function_heatmap_form.fillna('')

    # build dict of ids per genome
    genome_to_id_dict = {
        genome: set(get_ids_from_annotation(frame).keys())
        for genome, frame in annotations.groupby(groupby_column, sort=False)
    }

    # build long form data frame
    rows = []
    for _, frame in function_heatmap_form.groupby('function_name', sort=False):
        # Everything in this section depends only on the function, not on the
        # genome, so it is computed once per function group.
        required_id_sets = [
            {function_id.strip() for function_id in function_ids.strip().split(',')}
            for function_ids in frame['function_ids']
        ]
        first_row = frame.iloc[0]
        long_function_names = '; '.join(get_ordered_uniques(frame.long_function_name))
        gene_symbols = '; '.join(get_ordered_uniques(frame.gene_symbol))
        category_function_name = '%s: %s' % (first_row.category,
                                             first_row.function_name)

        for bin_name, id_set in genome_to_id_dict.items():
            hits_per_row = [id_set & required for required in required_id_sets]
            functions_present = set().union(*hits_per_row)
            # present only if every row of the function has at least one hit
            function_in_bin = np.all([len(hits) > 0 for hits in hits_per_row])
            rows.append([
                first_row.category,
                first_row.subcategory,
                first_row.function_name,
                # sorted: iteration order of a set of strings is not stable
                ', '.join(sorted(functions_present)),
                long_function_names,
                gene_symbols,
                bin_name,
                function_in_bin,
                category_function_name,
            ])
    return pd.DataFrame(
        rows,
        columns=list(function_heatmap_form.columns)
        + ['genome', 'present', 'category_function_name'])
def fill_genome_summary_frame(annotations, genome_summary_frame, groupby_column):
    """Add one identifier-count column per genome to ``genome_summary_frame``.

    Each entry of the ``gene_id`` column is a comma-separated list of
    identifiers. For every group of ``annotations`` a new column named after
    the group is added; its value in a row is the sum of the counts that
    ``get_ids_from_annotation`` reports for that row's identifiers.
    Identifiers missing from the genome contribute nothing.

    Args:
        annotations: Annotation frame, grouped by ``groupby_column``.
        genome_summary_frame: Frame with a ``gene_id`` column. It is modified
            in place.
        groupby_column: Column of ``annotations`` identifying a genome.

    Returns:
        The same ``genome_summary_frame`` object, with the added columns.
    """
    # The identifier sets depend only on the summary frame, so parse them
    # once rather than once per genome.
    genome_summary_id_sets = [
        {gene_id.strip() for gene_id in gene_ids.split(',')}
        for gene_ids in genome_summary_frame['gene_id']
    ]
    for genome, frame in annotations.groupby(groupby_column, sort=False):
        id_dict = get_ids_from_annotation(frame)
        genome_summary_frame[genome] = [
            sum(id_dict[gene_id] for gene_id in id_set if gene_id in id_dict)
            for id_set in genome_summary_id_sets
        ]
    return genome_summary_frame
def add_custom_ms(annotations, distillate_form):
    """Return the ``amg_flags`` of every annotation row, adding 'M' if needed.

    A row that already carries an ``'M'`` keeps its flags untouched.
    Otherwise the row's identifiers are looked up with
    ``get_ids_from_annotation`` and ``'M'`` is appended when any of them
    appears in the index of ``distillate_form``.

    Args:
        annotations: Annotation frame with an ``amg_flags`` column.
        distillate_form: Frame whose index holds the metabolic identifiers.

    Returns:
        A list of flag strings, one per row of ``annotations``, in row order.
    """
    metabolic_genes = set(distillate_form.index)

    def updated_flags(row):
        current = row['amg_flags']
        if 'M' in current:
            return current
        # get_ids_from_annotation works on a frame, so wrap the single row
        single_row_frame = pd.DataFrame(row).T
        row_ids = set(get_ids_from_annotation(single_row_frame).keys())
        if metabolic_genes.isdisjoint(row_ids):
            return current
        return current + 'M'

    return [updated_flags(row) for _, row in annotations.iterrows()]
def get_metabolic_flags(annotations, metabolic_genes, amgs, verified_amgs,
                        scaffold_length_dict, length_from_end=5000):
    """Compute the flag string of every gene in ``annotations``.

    Flags are appended in this order:

    * ``V`` - ``vogdb_categories`` contains ``Xr`` or ``Xs``
    * ``M`` - an identifier is in ``metabolic_genes`` (or in ``amgs``)
    * ``K`` - an identifier is in ``amgs``
    * ``E`` - an identifier is in ``verified_amgs``
    * ``A`` - an identifier is in ``CELL_ENTRY_CAZYS``
    * ``P`` - an identifier is in ``VIRAL_PEPTIDASES_MEROPS``
    * ``T`` - any gene on the same scaffold has ``is_transposon`` set
    * ``F`` - the gene starts within ``length_from_end`` of the scaffold
      start or ends within ``length_from_end`` of the scaffold end
    * ``B`` - the gene is part of a run of three consecutive ``M`` genes on
      its scaffold (all three genes of the run get the flag)

    Args:
        annotations: Annotation frame indexed by gene, with the columns
            ``scaffold``, ``vogdb_categories``, ``is_transposon``,
            ``start_position`` and ``end_position``.
        metabolic_genes: Iterable of metabolic identifiers.
        amgs: Iterable of reported AMG identifiers.
        verified_amgs: Iterable of experimentally verified AMG identifiers.
        scaffold_length_dict: Mapping from scaffold name to its length.
        length_from_end: Distance from either scaffold end that triggers
            the ``F`` flag.

    Returns:
        A dict mapping each gene (index label) to its flag string.
    """
    # Build the lookup sets once instead of once per gene.
    metabolic_genes = set(metabolic_genes)
    amgs = set(amgs)
    verified_amgs = set(verified_amgs)

    flag_dict = {}
    for scaffold, scaffold_annotations in annotations.groupby('scaffold'):
        # NOTE: a scaffold-level 'J' flag (fraction of 'Xh' entries in
        # vogdb_categories >= 0.18) used to be sketched here; it is disabled.

        # if there is a transposon in the contig; same for every gene on it
        has_transposon = scaffold_annotations['is_transposon'].any()

        for gene, row in scaffold_annotations.iterrows():
            flags = ''
            gene_annotations = set(
                get_ids_from_annotation(pd.DataFrame(row).transpose()).keys())
            # is viral
            if not pd.isna(row['vogdb_categories']):
                if {'Xr', 'Xs'} & set(row['vogdb_categories'].split(';')):
                    flags += 'V'
            # is metabolic
            if metabolic_genes & gene_annotations:
                flags += 'M'
            # is this a reported AMG; a reported AMG is always metabolic
            if gene_annotations & amgs:
                if 'M' not in flags:
                    flags += 'M'
                flags += 'K'
            # is this an experimentally verified AMG
            if gene_annotations & verified_amgs:
                flags += 'E'
            # is this gene a normal viral cell host entry gene
            if gene_annotations & CELL_ENTRY_CAZYS:
                flags += 'A'
            # is gene a normal virus peptidase
            if gene_annotations & VIRAL_PEPTIDASES_MEROPS:
                flags += 'P'
            if has_transposon:
                flags += 'T'
            # within length_from_end of either end of the contig; the length
            # lookup stays lazy so it is skipped when the first test matches
            if (int(row['start_position']) < length_from_end) or \
                    (int(row['end_position']) >
                     (scaffold_length_dict[row['scaffold']] - length_from_end)):
                flags += 'F'
            flag_dict[gene] = flags

        # get 3 metabolic genes in a row flag: every gene of such a run is
        # flagged, not only the middle one
        genes = list(scaffold_annotations.index)
        is_metabolic = ['M' in flag_dict[gene] for gene in genes]
        for i in range(1, len(genes) - 1):
            if is_metabolic[i - 1] and is_metabolic[i] and is_metabolic[i + 1]:
                for run_gene in genes[i - 1:i + 2]:
                    if 'B' not in flag_dict[run_gene]:
                        flag_dict[run_gene] += 'B'
    return flag_dict