예제 #1
0
def genomewide_mutations_outside(df,out_dir,tag,out_tag):
    """Export the genome-wide mutation table for "outside" samples.

    Rows of ``df`` are kept when the first four characters of their ``id``
    value equal ``out_tag`` or the literal ``'MN90'``.  The selection is
    passed through ``mutation.genomeWideMutationTable`` and then
    ``mutation.add_aa_mutations_column``; the result is written, without
    its index, to ``<out_dir>1_genomic_mutations_ocov_<tag>.xlsx``.

    Args:
        df: DataFrame with a string-valued ``id`` column.
        out_dir: Output directory prefix; concatenated as-is with the file
            name, so it must already end with a path separator.
        tag: Run tag embedded in the output file name and forwarded to
            ``mutation.add_aa_mutations_column``.
        out_tag: Four-character ``id`` prefix identifying outside samples.

    Returns:
        None.  The only result is the Excel file on disk.
    """
    log_msg("generating genome-wide mutations..for outside samples")

    # Select on the four-character id prefix.
    id_prefix = df['id'].str[0:4]
    is_outside = id_prefix.isin([out_tag, 'MN90'])
    outside_samples = df[is_outside]

    mutation_table = mutation.add_aa_mutations_column(
        mutation.genomeWideMutationTable(outside_samples), out_dir, tag)

    xlsx_path = out_dir + "1_genomic_mutations_ocov_" + tag + ".xlsx"
    mutation_table.to_excel(xlsx_path, index=False)
예제 #2
0
def variant_screener_summary_pangolin(df_muttable,out_dir,tag):
    """Screen the mutation table against every known variant and export it.

    For each key of ``covid.get_covid_variants()`` two columns are added to
    ``df_muttable`` (in place): one named after the variant and one named
    ``<variant>(SNVs:<n>)``, where ``n`` is the length of that variant's
    entry.  Both columns come from the pair returned by
    ``get_variants_match_mutations``.  The augmented table is then written,
    without its index, to ``<out_dir>2_variants_screen_summary_<tag>.xlsx``.

    Args:
        df_muttable: Mutation table; modified in place.
        out_dir: Output directory prefix (must end with a path separator).
        tag: Run tag embedded in the output file name.

    Returns:
        None.
    """
    log_msg("screening variants..")

    variants = covid.get_covid_variants()

    for name in variants:
        signature = variants[name]
        snv_total = str(len(signature))
        # The helper returns a pair; its first item goes into the plain
        # variant column, its second into the "(SNVs:n)" column.
        screened = get_variants_match_mutations(df_muttable, signature)
        df_muttable[name], df_muttable[name + "(SNVs:" + snv_total + ")"] = screened

    xlsx_path = out_dir + "2_variants_screen_summary_" + tag + ".xlsx"
    df_muttable.to_excel(xlsx_path, index=False)
예제 #3
0
def genomewide_mutations(df,out_dir,tag,deletion=True):
    """Build the genome-wide mutation table and export it to Excel.

    The table is produced by ``mutation.genomeWideMutationTable`` and
    extended by ``mutation.add_aa_mutations_column``.  When ``deletion`` is
    truthy it is additionally passed through
    ``mutation.add_aa_deletions_column``.  The result is written, without
    its index, to ``<out_dir>1_genomic_mutations_<tag>.xlsx``.

    Args:
        df: Input DataFrame handed to ``mutation.genomeWideMutationTable``.
        out_dir: Output directory prefix (must end with a path separator).
        tag: Run tag embedded in the output file name and forwarded to the
            ``mutation`` helpers.
        deletion: Whether to add the deletions column as well.

    Returns:
        None.
    """
    log_msg("generating genome-wide mutations..")

    mutation_table = mutation.add_aa_mutations_column(
        mutation.genomeWideMutationTable(df), out_dir, tag)

    if deletion:
        log_msg("generating genome-wide Deletions..")
        mutation_table = mutation.add_aa_deletions_column(
            mutation_table, out_dir, tag)

    xlsx_path = out_dir + "1_genomic_mutations_" + tag + ".xlsx"
    mutation_table.to_excel(xlsx_path, index=False)
예제 #4
0
def generate_df_from_multiple_alignments(alignment_files):
    """Combine several alignment files into one filtered DataFrame.

    Each file is loaded with ``analyzeEntireGenomewithReference`` and
    filtered with ``filterMainAlignment``; the per-file frames are stacked
    with a fresh 0..n-1 index, and the stacked frame is filtered once more
    before being returned.  The running shape and the head of the final
    frame are printed as progress output.

    Args:
        alignment_files: Iterable of alignment file paths.

    Returns:
        The combined DataFrame after the final ``filterMainAlignment`` pass.
        With no input files this is ``filterMainAlignment`` applied to an
        empty DataFrame.
    """
    df = pd.DataFrame()
    for alignment_file in alignment_files:
        df_af = filterMainAlignment(
            analyzeEntireGenomewithReference(alignment_file))
        # DataFrame.append was removed in pandas 2.0; pd.concat with
        # ignore_index=True is the documented drop-in replacement.
        df = pd.concat([df, df_af], ignore_index=True)
        print(df.shape)

    df = filterMainAlignment(df)

    log_msg("completed processing all the version 2 alignment files..")
    print(df.head())

    return df
예제 #5
0
def run():
    """Run variant calling for the current run from its mutation table.

    Steps, in order:

    1. Read ``1_genomic_mutations_<run>.xlsx`` from the run's output
       directory.
    2. If ``params["source"] == "alignment_current"``, stack the previous
       cumulative table underneath it, keep the first row per ``strain``,
       and write the merged table both to the run directory and to the
       ``previous_genomic_table`` directory.
    3. Parse the ``nucleotide_change`` and ``nt_aa_pair`` columns from
       their string form with ``ast.literal_eval``.
    4. Produce the per-variant pangolin summary, the SNP table and the
       variant screening summary via the project helpers.

    Returns:
        None.  All results are files written by this function or by the
        helpers it calls.
    """
    params = read_run_params()
    # Named run_id rather than run so the local does not shadow this function.
    run_id = params["current_run"]
    out_home = params["container"] + "output/"
    out_dir = out_home + run_id + "/"

    ##### prepare for variant calling
    df_muttable = pd.read_excel(out_dir + "1_genomic_mutations_" + run_id +
                                ".xlsx")

    if params["source"] == "alignment_current":
        df_muttable_prev = pd.read_excel(
            out_home + "/previous_genomic_table/1_genomic_mutations_all.xlsx")
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # documented equivalent.  Current-run rows come first, so
        # drop_duplicates (keep="first") prefers them over previous rows.
        df_muttable = pd.concat([df_muttable, df_muttable_prev],
                                ignore_index=True)
        df_muttable.drop_duplicates("strain", inplace=True)
        df_muttable.to_excel(out_dir + "1_genomic_mutations_all.xlsx",
                             index=False)
        df_muttable.to_excel(
            out_home + "/previous_genomic_table/1_genomic_mutations_all_" +
            run_id + ".xlsx",
            index=False)

    # The list-like columns are read back from Excel as strings, so they
    # are parsed into Python objects again.
    df_muttable["nucleotide_change"] = df_muttable["nucleotide_change"].apply(
        ast.literal_eval)
    df_muttable["nt_aa_pair"] = df_muttable["nt_aa_pair"].apply(
        ast.literal_eval)

    # #### generate per variant summary with pangolin
    db_pangolin = (params["container"] +
                   "pangolin_analysis/pangolin_lineage_assignment_for_pipeline_" +
                   run_id + ".xlsx")
    db = out_dir + "/database/5_strains_run_group_analysis_table_with_samplesheet.csv"

    variant_screener.variant_screener_per_variant_summary_with_pangolin(
        df_muttable, db, db_pangolin, out_dir, run_id)

    print("check...")
    print(df_muttable.head())
    # generate_snp_table
    log_msg("generating snp table for s protein..")
    # Reset the column names to the canonical five.  NOTE(review): this
    # presumably undoes an in-place rename done by the helper above —
    # confirm.  pandas raises if the table does not have exactly five
    # columns.
    df_muttable.columns = [
        "strain", "total_mutations", "nucleotide_change", "nt_aa_pair",
        "deletions"
    ]
    mutation.generate_snp_table(df_muttable, db, out_dir, run_id)

    variant_screener.variant_screener_summary_pangolin(df_muttable, out_dir,
                                                       run_id)
예제 #6
0
def get_combine_df():
    """Collect the strains of all version-1 alignment groups in one frame.

    ``params["nta_group_v1"]`` maps a group name to a
    ``"<alignment file>,<adjustment>"`` string.  Groups are processed in
    sorted name order.  A group whose name contains ``"reference"`` is
    loaded with ``reference=True``; every other group is loaded with its
    integer adjustment.  The collected records become a DataFrame that is
    filtered with ``filterMainAlignment(df, "cds")``.

    Returns:
        The filtered DataFrame.
    """
    params = read_run_params()
    alignment_dir = params["container"]+"input_alignment/"
    groups = params["nta_group_v1"]

    strains = []
    for group_name in sorted(groups):
        log_msg("run group is - "+ group_name)

        fields = groups[group_name].split(",")
        file_name = fields[0]
        # Parsed for every group, including reference groups, so a
        # malformed entry fails regardless of the group type.
        adj = int(fields[1])

        if "reference" in group_name:
            records = genomicSequenceFromAlignment(alignment_dir+file_name,reference=True)
        else:
            records = genomicSequenceFromAlignment(alignment_dir+file_name,adjustment=adj)
        strains.extend(records)

    combined = filterMainAlignment(pd.DataFrame(strains),"cds")

    return combined
예제 #7
0
def run_alignment_analysis(params, mode):
    """Load alignment data for the requested mode as one DataFrame.

    Args:
        params: Run parameters.  ``params["container"]`` is always read;
            ``"nta_group_v2"`` is read for ``"alignment_all"`` and
            ``"alignment_current"`` for the mode of the same name.
        mode: ``"alignment_all"`` combines the version-1 data from
            ``run_version_1_analysis()`` with every version-2 alignment
            listed in ``params["nta_group_v2"]``.  ``"alignment_current"``
            loads only the single alignment named by
            ``params["alignment_current"]``.

    Returns:
        A DataFrame whose column ``0`` has been renamed to ``"id"``.  The
        version-2 data is restricted to column 0 plus columns 266..29674.
        ``None`` is returned for any other ``mode``.
    """
    # Function-scope import keeps this block self-contained.
    import pandas as pd

    nta_dir = params["container"] + "input_alignment/"

    # Column 0 plus columns 266..29674 (inclusive); shared by both modes.
    sel_indx = [0, *range(266, 29675)]

    if mode == "alignment_all":
        log_msg("processing version 1 data..")
        #### get data from start to run 10
        df_start_10 = run_version_1_analysis()

        log_msg("processing version 2 data..")
        #### get data for run 11 onwards
        nta_files = params["nta_group_v2"]

        alignment_files = []
        log_msg(
            "processing newer version alignments (after run11 novaseq - one codex pipeline) --"
        )
        for nta_group in sorted(nta_files):
            print(nta_group, nta_files[nta_group])
            alignment_files.append(nta_dir + nta_files[nta_group])

        df_11_18 = variant_screener.generate_df_from_multiple_alignments(
            alignment_files)

        # Label-based slice from row label 1 onward: removes the reference
        # row, which sits at label 0.
        df_11_18 = df_11_18.loc[1:, sel_indx]

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # documented equivalent.
        df_combined = pd.concat([df_start_10, df_11_18], ignore_index=True)
        df_combined = df_combined.rename(columns={0: "id"})
        print("finalized data")
        print(df_combined.head())

        return df_combined

    if mode == "alignment_current":
        alignment_file = [nta_dir + params["alignment_current"]]

        df = variant_screener.generate_df_from_multiple_alignments(
            alignment_file)

        # Non-inplace rename: renaming a column selection in place would
        # operate on a slice of the loaded frame.
        df = df[sel_indx].rename(columns={0: "id"})
        print("finalized version 2 data")
        print(df.head())

        return df

    # Unknown mode: the historical behaviour is to return None.
    return None
예제 #8
0
def variant_screener_per_variant_summary_with_pangolin(df,db,db_pangolin,out_dir,tag):
    """Write the per-lineage workbook and the strain/variant map.

    Two files are produced:

    * ``<out_dir>3_variant_screening_per_variant_summary_pangolin_<tag>.xlsx``
      with one sheet per lineage in ``covid.covid_variants.pangolin`` and a
      final ``Summary`` sheet holding, per run, the number of strains, the
      first and last collection date, and the count of each lineage.
    * ``<out_dir>4_mcov_strain_variant_map_covid_pangolin_db_input_<tag>.csv``
      with every row of the sample database joined to its pangolin lineage
      and an ``Is_First_For_Patient`` flag.

    Args:
        df: Mutation table with a ``strain`` column.  NOTE: that column is
            renamed to ``MCoVNumber`` on the caller's object (in place).
        db: Path of the sample database CSV.  Must provide ``MCoVNumber``,
            ``COLLECTION_DT``, ``run_id_seq``, ``quality``, ``MRN`` and the
            other columns exported on the per-lineage sheets.
        db_pangolin: Path of the pangolin assignment workbook; must provide
            ``taxon`` and ``lineage``.
        out_dir: Output directory prefix (must end with a path separator).
        tag: Run tag embedded in both output file names.

    Returns:
        None.
    """
    log_msg("generating per variant information..")

    df_db = pd.read_csv(db)
    # In-place on purpose: the caller's frame has always been renamed here.
    df.rename(columns={"strain":"MCoVNumber"},inplace=True)
    dfjoin = pd.merge(df,df_db,on="MCoVNumber",how="left",indicator=True)
    dfjoin = dfjoin.rename(columns={'_merge':'variant_db_Match'})

    # Keep only strains present in the database.  .copy() gives an
    # independent frame so the column assignment below is unambiguous.
    dfjoin = dfjoin[dfjoin.variant_db_Match=="both"].copy()
    dfjoin["COLLECTION_DT"] = pd.to_datetime(dfjoin["COLLECTION_DT"])

    ### add pangolin
    df_pangolin = pd.read_excel(db_pangolin)
    dfjoin = pd.merge(dfjoin,df_pangolin,right_on="taxon",left_on="MCoVNumber",how="left")
    print(dfjoin.head())

    sheet_columns = ['MCoVNumber', 'ORDER_ID', 'MRN','COLLECTION_DT',
        'ORDERING_CLINIC_ID','ORDERING_CLINIC_NAME',
        'FACILITY','ADMISSION_DT','DISCHARGE_DT','HIS_PATIENT_TYPE',
        'ZIP','lineage','run_id_seq']

    # The context manager closes the workbook even if a step below raises.
    with pd.ExcelWriter(out_dir+"3_variant_screening_per_variant_summary_pangolin_"+tag+".xlsx", engine='xlsxwriter',
                        datetime_format='yyyy-mm-dd hh:mm:ss', date_format='yyyy-mm-dd') as writer:

        vois = covid.covid_variants.pangolin

        # lineage -> {run_id_seq: number of strains}
        summary_variants = {}
        for variant in vois:
            df_variant = dfjoin.loc[dfjoin["lineage"]==variant, sheet_columns]
            df_variant = df_variant.sort_values("run_id_seq")

            df_variant.to_excel(writer, sheet_name=variant,index=False)
            write_excel_table(writer,variant)

            summary_variants[variant] = df_variant.groupby("run_id_seq").size().to_dict()

        ###save mcov variants file for covid database
        df_mcov_variant = df_pangolin[["taxon","lineage"]].rename(
            columns={"taxon":"strain","lineage":"variant"})
        dfjoin_mcov_variant = pd.merge(df_db,df_mcov_variant,left_on="MCoVNumber",right_on="strain",how="left")

        # Earliest high-quality strain per (variant, patient).
        # NOTE(review): COLLECTION_DT here comes straight from the CSV and
        # is not converted to datetime in this function — confirm that its
        # stored form sorts chronologically.
        hq_by_date = dfjoin_mcov_variant[dfjoin_mcov_variant.quality=="HQ"].sort_values(by='COLLECTION_DT', ascending=True)
        # A set makes the membership test below O(1) per row.
        first_ids = set(hq_by_date.drop_duplicates(['variant', 'MRN'])['MCoVNumber'].values)
        dfjoin_mcov_variant['Is_First_For_Patient'] = [1 if x in first_ids else 0 for x in dfjoin_mcov_variant["MCoVNumber"]]
        dfjoin_mcov_variant.to_csv(out_dir+"4_mcov_strain_variant_map_covid_pangolin_db_input_"+tag+".csv",index=False)

        sel_runs = sorted([x for x in dfjoin.run_id_seq.unique() if "low_quality" not in x])

        variant_names = sorted(summary_variants)
        summary_column = ["Run","Total","Start-Date","End-Date"] + variant_names

        summary_rows = []
        for runid in sel_runs:
            df_runid = dfjoin[dfjoin.run_id_seq==runid]
            run_row = [
                runid,
                df_runid.shape[0],
                df_runid.COLLECTION_DT.min().date(),
                df_runid.COLLECTION_DT.max().date(),
            ]
            # A lineage not seen in this run gets an empty cell.
            run_row.extend(summary_variants[variant].get(runid, "") for variant in variant_names)
            summary_rows.append(run_row)

        df_summary = pd.DataFrame(summary_rows, columns=summary_column)
        df_summary.to_excel(writer, sheet_name="Summary",index=False)
        write_excel_table(writer,"Summary")