def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Merge the bacterial and archaeal tables and plot their motif matrices.

    The objects stored at ``args.pf_input_bac`` and ``args.pf_input_arc`` are
    loaded, tagged with a ``Type`` column ("Bacteria" / "Archaea") and
    concatenated. ``GENOME_TYPE`` is reduced to the upper-cased second
    dash-separated token, with "D2" folded into "D". The "RBS_MAT" column is
    then visualized for all rows, and "PROMOTER_MAT" only for bacterial rows
    whose genome type is "C".

    :param env: environment object, forwarded to ``visualize_matrix_column``
    :param args: parsed command-line arguments (``pf_input_bac``, ``pf_input_arc``)
    """
    import matplotlib

    def _short_genome_type(raw):
        # Keep only the token after the first dash, upper-cased.
        return raw.strip().split("-")[1].upper()

    labelled_frames = []
    for pf_input, domain in ((args.pf_input_bac, "Bacteria"),
                             (args.pf_input_arc, "Archaea")):
        frame = load_obj(pf_input)  # type: pd.DataFrame
        frame["Type"] = domain
        labelled_frames.append(frame)

    df = pd.concat(labelled_frames, sort=False)
    # Debugging hint: subsample here (e.g. df.sample(100)) for a quick run.

    df["GENOME_TYPE"] = df["GENOME_TYPE"].apply(_short_genome_type)
    is_d2 = df["GENOME_TYPE"] == "D2"
    df.loc[is_d2, "GENOME_TYPE"] = "D"
    df = df.reset_index()

    # Font/LaTeX settings for the figures ("pgf.texsystem" is left at its default).
    plot_settings = {
        'font.family': 'serif',
        'text.usetex': False,
        'pgf.rcfonts': False,
    }
    matplotlib.rcParams.update(plot_settings)

    visualize_matrix_column(env, df, "RBS_MAT")

    bacteria_group_c = (df["Type"] == "Bacteria") & (df["GENOME_TYPE"] == "C")
    visualize_matrix_column(env, df[bacteria_group_c], "PROMOTER_MAT")
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Plot the relative entropy of group-A RBS motifs against genome GC.

    Loads the data frame stored at ``args.pf_data``, keeps the rows whose
    ``GENOME_TYPE`` is "group-a", and adds an ``RE`` column holding
    ``relative_entropy`` of each row's RBS motif model against its
    non-coding model. ``GC`` versus ``RE`` is then drawn as a seaborn joint
    plot and as a bivariate KDE plot.

    :param env: environment object (unused here)
    :param args: parsed command-line arguments (``pf_data``)
    """
    df = load_obj(args.pf_data)  # type: pd.DataFrame
    df.reset_index(inplace=True)
    df = df[df["GENOME_TYPE"] == "group-a"].copy()

    def _row_relative_entropy(row):
        # Compare the row's RBS motif with its non-coding background model.
        return relative_entropy(
            MotifModel(row["RBS_MAT"], None),
            GMS2Noncoding(row["NON_MAT"]),
        )

    df["RE"] = df[["RBS_MAT", "NON_MAT"]].apply(_row_relative_entropy, axis=1)

    # Pass x/y/data by keyword: positionally, the data frame and column names
    # bind to the wrong parameters on older seaborn releases and raise a
    # TypeError on newer ones (where x and y are keyword-only).
    # NOTE: the keyword form of kdeplot requires seaborn >= 0.11.
    sns.jointplot(x="GC", y="RE", data=df)
    sns.kdeplot(x="GC", y="RE", data=df)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the loaded MGM motif models on a test set and write the table out.

    The models are loaded from ``args.pf_mgm_models`` and the test table is
    read as CSV from ``args.pf_test``. ``run_mgm_models_on_test_data`` is
    called on them, after which the test table is written to
    ``args.pf_output`` without its index.

    :param env: environment object, forwarded to the runner
    :param args: parsed command-line arguments (``pf_mgm_models``, ``pf_test``,
        ``species_type``, ``pf_output``)
    """
    pf_output = args.pf_output

    models = load_obj(
        args.pf_mgm_models
    )  # type: Dict[str, Dict[str, Dict[str, MGMMotifModelAllGC]]]
    test_table = pd.read_csv(args.pf_test)  # type: pd.DataFrame
    # Debugging hint: restrict to test_table.head(500).copy() for a quick run.

    # NOTE(review): the runner's return value is ignored, so it presumably
    # updates the table in place (or writes pf_output itself) - confirm.
    run_mgm_models_on_test_data(
        env, models, test_table, args.species_type, pf_output
    )

    test_table.to_csv(pf_output, index=False)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Write a taxonomy tree annotated with the number of targets per node.

    When ``args.pf_load_state`` is None the tree is built from scratch:
    targets are counted per GCF id from ``args.pf_sbsp_output``, mapped to
    taxonomy ids through the assembly summary, and pushed onto the tree
    loaded from ``args.pf_taxonomy_tree`` (bottom-up). The annotated tree is
    optionally saved to ``args.pf_save_state``. Otherwise a previously saved
    tree is loaded from ``args.pf_load_state``.

    The tree is finally rendered to a string and written to ``args.pf_output``.

    :param env: environment object (unused here)
    :param args: parsed command-line arguments
    """
    if args.pf_load_state is None:
        gcfid_to_number_of_targets = count_targets_per_gcfid(
            args.pf_sbsp_output)

        df_assembly_summary = read_assembly_summary_into_dataframe(
            args.pf_assembly_summary)
        gcfid_to_assembly_info = get_assembly_info_per_gcfid(
            df_assembly_summary)

        # Several GCF ids can share one taxonomy id, so accumulate the counts
        # instead of letting the last assembly overwrite the earlier ones.
        taxid_to_number_of_targets = {}
        for gcfid, number_of_targets in gcfid_to_number_of_targets.items():
            if gcfid not in gcfid_to_assembly_info:
                # No assembly record for this genome: it cannot be placed.
                continue
            taxid = int(gcfid_to_assembly_info[gcfid]["taxid"])
            taxid_to_number_of_targets[taxid] = (
                taxid_to_number_of_targets.get(taxid, 0) + number_of_targets
            )

        tree = TaxonomyTree.load(args.pf_taxonomy_tree)
        tree.update_tree_attributes(
            set_number_of_targets_per_taxid,
            {"taxid_to_number_of_targets": taxid_to_number_of_targets},
            direction="bottom-up")

        if args.pf_save_state is not None:
            save_obj(tree, args.pf_save_state)
    else:
        tree = load_obj(args.pf_load_state)

    tree_string = tree.to_string(check_if_should_print=should_print,
                                 attribute_name="number_of_targets",
                                 attribute_format="{:,}",
                                 tag_name=args.tag,
                                 max_depth=args.max_depth)

    write_string_to_file(tree_string, args.pf_output)
def load(pf_load):
    # type: (str) -> TaxonomyTree
    """Return the taxonomy tree stored at ``pf_load``.

    Thin wrapper around ``load_obj``.

    :param pf_load: path to the stored tree
    :return: whatever ``load_obj`` yields for that path
    """
    # NOTE(review): if load_obj unpickles, only use it on trusted files - confirm.
    stored_tree = load_obj(pf_load)
    return stored_tree
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Draw a grid of information-content motif logos, one per GC bin.

    Loads the data frame stored at ``args.pf_data``, keeps the genome groups
    listed in ``args.group`` (and, for the "PROMOTER" motif type, only rows
    with GC >= 40), and builds one model per GC value in 20, 22, ..., 68
    via ``get_models_by_gc``. Each model that is not None is converted from
    a probability matrix to an information matrix and drawn as a logo in its
    own subplot, titled with the GC value. The figure is saved to the next
    free file name under ``env["pd-work"]`` and then shown.

    :param env: environment mapping; ``env["pd-work"]`` is the output directory
    :param args: parsed command-line arguments (``pf_data``, ``group``,
        ``motif_type``)
    :raises ValueError: if ``get_models_by_gc`` returns no models
    """
    df_bac = load_obj(args.pf_data).reset_index()  # type: pd.DataFrame
    df_bac = df_bac[df_bac["GENOME_TYPE"].isin(args.group)]

    min_gc = 20
    max_gc = 70

    if args.motif_type == "PROMOTER":
        df_bac = df_bac[df_bac["GC"] >= 40].copy()

    gc_values = np.arange(min_gc, max_gc, 2)
    models = get_models_by_gc(df_bac, gc_values, motif_type=args.motif_type)

    num_plots = len(models)
    if num_plots == 0:
        # Previously surfaced as an opaque ZeroDivisionError in the grid math.
        raise ValueError("No motif models available to plot")

    # Near-square grid with at least num_plots cells.
    num_rows = int(math.sqrt(num_plots))
    num_cols = int(math.ceil(num_plots / float(num_rows)))

    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column; otherwise axes[r][c] fails for small grids.
    _, axes = plt.subplots(num_rows, num_cols, sharex="all", sharey="all",
                           figsize=(12, 10), squeeze=False)

    # Models fill the grid row by row; a missing model leaves its cell empty.
    for model_index, model in enumerate(models):
        if model is None:
            continue

        ax = axes[model_index // num_cols][model_index % num_cols]

        # model[0] is the probability matrix, model[1] its background.
        newmod = lm.transform_matrix(model[0],
                                     to_type="information",
                                     from_type="probability",
                                     background=model[1])

        lm.Logo(newmod, ax=ax)
        ax.set_ylim(0, 2)
        ax.set_title(int(gc_values[model_index]))

    plt.tight_layout()
    plt.savefig(next_name(env["pd-work"]))
    plt.show()