def run():
    params = read_run_params()
    run = params["current_run"]
    out_home = params["container"] + "output/"
    out_dir = out_home + run + "/"

    ##### generate genome-wide variants
    if params["source"] == "alignment_all":
        df = run_alignment_analysis(params, "alignment_all")
        variant_screener.genomewide_mutations(df, out_dir, run)
    elif params["source"] == "alignment_current":
        df = run_alignment_analysis(params, "alignment_current")
        variant_screener.genomewide_mutations(df, out_dir, run)
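# Hypothetical example of the run-parameter structure consumed above. The file read by
# read_run_params() is not shown here; the code only tells us it needs at least the keys
# "current_run", "container", and "source". All values below are illustrative assumptions.
example_params = {
    "current_run": "run_42",              # name of the current sequencing run
    "container": "/data/mcov_pipeline/",  # base path; "output/" is appended to it
    "source": "alignment_current",        # or "alignment_all"
}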
def run():
    params = read_run_params()
    run = params["current_run"]
    out_home = params["container"] + "output/"
    out_dir = out_home + run + "/"

    ##### prepare for variant calling
    df_muttable = pd.read_excel(out_dir + "1_genomic_mutations_" + run + ".xlsx")

    if params["source"] == "alignment_current":
        # merge the current run with the accumulated table from previous runs
        df_muttable_prev = pd.read_excel(
            out_home + "previous_genomic_table/1_genomic_mutations_all.xlsx")
        df_muttable = pd.concat([df_muttable, df_muttable_prev], ignore_index=True)
        df_muttable.drop_duplicates("strain", inplace=True)

    df_muttable.to_excel(out_dir + "1_genomic_mutations_all.xlsx", index=False)
    df_muttable.to_excel(
        out_home + "previous_genomic_table/1_genomic_mutations_all_" + run + ".xlsx",
        index=False)

    # list-like columns come back from Excel as strings; convert them to Python objects
    df_muttable.nucleotide_change = df_muttable.nucleotide_change.apply(ast.literal_eval)
    df_muttable.nt_aa_pair = df_muttable.nt_aa_pair.apply(ast.literal_eval)

    ##### generate per-variant summary with pangolin
    db_pangolin = (params["container"] +
                   "pangolin_analysis/pangolin_lineage_assignment_for_pipeline_" +
                   run + ".xlsx")
    db = out_dir + "database/5_strains_run_group_analysis_table_with_samplesheet.csv"
    variant_screener.variant_screener_per_variant_summary_with_pangolin(
        df_muttable, db, db_pangolin, out_dir, run)

    print("check...")
    print(df_muttable.head())

    ##### generate snp table
    log_msg("generating snp table for s protein..")
    df_muttable.columns = [
        "strain", "total_mutations", "nucleotide_change", "nt_aa_pair", "deletions"
    ]
    mutation.generate_snp_table(df_muttable, db, out_dir, run)
    variant_screener.variant_screener_summary_pangolin(df_muttable, out_dir, run)
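# Why ast.literal_eval is applied above: list-valued columns such as nucleotide_change are
# written to Excel as their string representation, e.g. "['C241T', 'A23403G']", and must be
# parsed back into Python lists before downstream use. The cell value below is illustrative.
import ast

cell = "['C241T', 'A23403G']"       # how a list-valued cell is stored in the .xlsx file
mutations = ast.literal_eval(cell)  # -> ['C241T', 'A23403G']
assert isinstance(mutations, list)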
def get_combine_df():
    params = read_run_params()
    all_strains = []
    nta_dir = params["container"] + "input_alignment/"
    nta_files = params["nta_group_v1"]

    for nta_group in sorted(nta_files):
        log_msg("run group is - " + nta_group)
        align_file = nta_files[nta_group].split(",")[0]
        adj = int(nta_files[nta_group].split(",")[1])
        if "reference" in nta_group:
            all_strains.extend(
                genomicSequenceFromAlignment(nta_dir + align_file, reference=True))
        else:
            all_strains.extend(
                genomicSequenceFromAlignment(nta_dir + align_file, adjustment=adj))

    df = pd.DataFrame(all_strains)
    df = filterMainAlignment(df, "cds")
    return df
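# Assumed shape of params["nta_group_v1"], inferred from the split(",") parsing above:
# each group maps to "<alignment file>,<coordinate adjustment>", and groups whose name
# contains "reference" are treated as the reference alignment. Names and values below
# are hypothetical.
example_nta_group_v1 = {
    "reference_group": "reference_alignment.fasta,0",
    "group_01": "run_group_01_alignment.fasta,54",
}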
import sys
from datetime import timedelta

import pandas as pd
import numpy as np

import gen_utils.gen_covid as covid
from gen_utils.gen_io import read_run_params, log_msg

params = read_run_params()
run = params["current_run"]
out_dir = params["container"] + "output/" + run + "/"

df = pd.read_csv(out_dir + "4_mcov_strain_variant_map_covid_pangolin_db_input_" + run + ".csv")
df = df[df.quality == "HQ"]

start_date = sys.argv[1]
end_date = sys.argv[2]

dfs = []
for voi in covid.covid_variants.pangolin:
    print(voi)

    ### take unique patients carrying the variant of interest
    keep_mrns_variant = np.unique(df[df.variant == voi]["MRN"])
    df_mrns = df[df.MRN.isin(keep_mrns_variant)]
    ### important step -- drop records of other variants for these patients
    df_mrns = df_mrns[df_mrns.variant == voi]
    df_mrns.sort_values("COLLECTION_DT", inplace=True)
    df_mrns.drop_duplicates("MRN", keep="first", inplace=True)

    ### patients with samples that are not the variant of interest
    keep_mrns_not_variant = np.unique(df[df.variant != voi]["MRN"])
    df_mrns_not_variant = df[df.MRN.isin(keep_mrns_not_variant)]
    df_mrns_not_variant = df_mrns_not_variant[df_mrns_not_variant.variant != voi]
    df_mrns_not_variant.sort_values("COLLECTION_DT", inplace=True)
    df_mrns_not_variant.drop_duplicates("MRN", keep="first", inplace=True)
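# start_date and end_date are read from the command line above, but their use is not shown
# in this fragment. A minimal sketch of one way a collection-date window could be applied,
# assuming ISO-formatted dates and that COLLECTION_DT parses with pd.to_datetime; this is
# an illustration, not the original script's downstream logic.
collection_dt = pd.to_datetime(df_mrns["COLLECTION_DT"])
window_mask = (collection_dt >= pd.to_datetime(start_date)) & \
              (collection_dt <= pd.to_datetime(end_date))
df_mrns_window = df_mrns[window_mask]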
def getcon():
    params = read_run_params()
    SQLALCHEMY_DATABASE_URI = "mysql+pymysql://" + params["dbcred"]
    sqlEngine = sqlalchemy.create_engine(SQLALCHEMY_DATABASE_URI, echo=False)
    dbConnection = sqlEngine.connect()
    return dbConnection
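# Hypothetical usage of the connection helper above: querying a table into a DataFrame.
# pandas.read_sql accepts a SQLAlchemy connection directly; the table name below is an
# assumed example, not one defined in the original code.
import pandas as pd

con = getcon()
df_samples = pd.read_sql("SELECT * FROM sample_sheet", con)  # "sample_sheet" is illustrative
con.close()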