'--memory', type=str, dest="memory", required=False, default=8, help='MAx memory to use') #argcomplete.autocomplete(parser) arguments = parser.parse_args() return arguments args = get_arguments() sample_list_F = file_to_list(args.sample_list) print("\n%d samples will be analysed: %s" % (len(sample_list_F), ",".join(sample_list_F))) ###################################################################### #####################START PIPELINE################################### ###################################################################### output = os.path.abspath(args.output) group_name = args.input.split("/")[-1].split(".")[0] out_vcf_dir = os.path.join(args.output, "VCF") check_create_dir(out_vcf_dir) output_vcf_file = os.path.abspath(args.input) base_input = os.path.basename(args.input)
def main():
    """
    Run the snippy-based variant pipeline for every selected sample of a group.

    The function is kept as a single entry point so that uncaught errors can
    be captured and logged in one place, see:
    https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python

    Steps, in order:
      1. Parse arguments and configure logging (DEBUG to a timestamped file
         under <output>/Logs, INFO to the console).
      2. Per sample: raw fastqc, snippy calling, indel extraction, vcf merge,
         conversion to an ivar-like tsv, mash screen, bamstats and coverage.
         Each step is skipped when its output file already exists.
      3. Per group: coverage/overall reports, low-quality sample removal,
         optional snippy-core, annotation (snpEff, user bed/vcf, user aa,
         user fasta/blast, html report) and the final SNP comparison.

    Takes no parameters (everything is read from the command line through
    get_arguments()) and returns None; results are written below args.output.
    """
    args = get_arguments()

    ######################################################################
    #####################START PIPELINE###################################
    ######################################################################
    output = os.path.abspath(args.output)
    # abspath() never leaves a trailing separator, so basename is the group
    group_name = os.path.basename(output)
    reference = os.path.abspath(args.reference)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    # Console output is left unformatted on purpose (colour codes only)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.info("\n\n" + BLUE + BOLD +
                "STARTING PIPELINE IN GROUP: " + group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    if args.sample_list is None:
        logger.info("\n" + "No samples to filter")
        sample_list_F = [extract_sample(r1_file, r2_file)
                         for r1_file, r2_file in zip(r1, r2)]
    else:
        logger.info("samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d samples will be analysed: %s" %
                (len(sample_list_F), ",".join(sample_list_F)))
    logger.info("\n%d NEW samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))

    #DECLARE FOLDERS CREATED IN PIPELINE ################
    #AND KEY FILES ######################################
    #####################################################
    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_variant_dir = os.path.join(output, "Variants")
    out_core_dir = os.path.join(output, "Core")

    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(
        out_stats_dir, "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(
        out_stats_dir, "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")

    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(
        out_annot_dir, "user_aa")  # subfolder
    out_annot_blast_dir = os.path.join(out_annot_dir, "blast")  # subfolder

    out_species_dir = os.path.join(output, "Species")

    # Loop-invariant progress totals, and a plain int counter for new samples
    sample_total = str(len(sample_list_F))
    new_sample_total = str(len(new_samples))
    new_sample_number = 0

    for r1_file, r2_file in zip(r1, r2):
        # Extract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample
        if sample not in sample_list_F:
            continue

        # VARIANT SAMPLE DIR
        sample_variant_dir = os.path.join(out_variant_dir, sample)

        sample_number = str(sample_list_F.index(sample) + 1)

        if sample in new_samples:
            new_sample_number += 1
            logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                        " (" + sample_number + "/" + sample_total + ")" +
                        " (" + str(new_sample_number) + "/" +
                        new_sample_total + ")" + END_FORMATTING)
        else:
            logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                        " (" + sample_number + "/" + sample_total + ")" +
                        END_FORMATTING)

        output_final_vcf = os.path.join(
            sample_variant_dir, 'snps.all.ivar.tsv')

        # The final variant table marks a sample as already processed, so
        # the whole calling block is skipped when it is present.
        if not os.path.isfile(output_final_vcf):
            ##############START PIPELINE#####################
            #################################################

            # INPUT ARGUMENTS
            ################
            args.output = os.path.abspath(args.output)
            check_create_dir(args.output)

            # QUALITY CHECK in RAW with fastqc
            ######################################################
            check_create_dir(out_qc_dir)

            # Drops the last two dot-separated suffixes of the read file name
            out_qc_raw_name_r1 = (".").join(r1_file.split(
                '/')[-1].split('.')[0:-2]) + '_fastqc.html'
            out_qc_raw_name_r2 = (".").join(r2_file.split(
                '/')[-1].split('.')[0:-2]) + '_fastqc.html'
            output_qc_raw_file_r1 = os.path.join(
                out_qc_pre_dir, out_qc_raw_name_r1)
            output_qc_raw_file_r2 = os.path.join(
                out_qc_pre_dir, out_qc_raw_name_r2)

            if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(output_qc_raw_file_r2):
                logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                            " EXIST\nOmmiting QC for sample " + sample + END_FORMATTING)
            else:
                logger.info(
                    GREEN + "Checking quality in sample " + sample + END_FORMATTING)
                logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                fastqc_quality(r1_file, r2_file,
                               out_qc_pre_dir, args.threads)

            # TODO: Human filter

            # VARIANT CALLING WITH SNIPPY
            ###################################################
            output_vcf_sub = os.path.join(
                sample_variant_dir, "snps.subs.vcf")
            output_vcf = os.path.join(sample_variant_dir, "snps.vcf")

            if os.path.isfile(output_vcf_sub) and os.path.isfile(output_vcf):
                logger.info(YELLOW + DIM + output_vcf +
                            " EXIST\nOmmiting Variant calling in " + sample + END_FORMATTING)
            else:
                logger.info(
                    GREEN + "Calling variants with snippy " + sample + END_FORMATTING)
                run_snippy(r1_file, r2_file, reference, out_variant_dir, sample,
                           threads=args.threads, minqual=10, minfrac=0.1, mincov=1)
                # Rename the generic snps.bam/.bai to sample-specific names;
                # only valid right after a fresh snippy run.
                old_bam = os.path.join(sample_variant_dir, "snps.bam")
                old_bai = os.path.join(sample_variant_dir, "snps.bam.bai")
                new_bam = os.path.join(sample_variant_dir, sample + ".bam")
                new_bai = os.path.join(
                    sample_variant_dir, sample + ".bam.bai")
                os.rename(old_bam, new_bam)
                os.rename(old_bai, new_bai)

            #VARIANT FORMAT COMBINATION (REMOVE COMPLEX) ########
            #####################################################
            out_variant_indel_sample = os.path.join(
                sample_variant_dir, "snps.indel.vcf")
            out_variant_all_sample = os.path.join(
                sample_variant_dir, "snps.all.vcf")

            if os.path.isfile(out_variant_indel_sample):
                logger.info(YELLOW + DIM + out_variant_indel_sample +
                            " EXIST\nOmmiting indel filtering in sample " + sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Filtering INDELS in " +
                            sample + END_FORMATTING)
                extract_indels(output_vcf)

            if os.path.isfile(out_variant_all_sample):
                logger.info(YELLOW + DIM + out_variant_all_sample +
                            " EXIST\nOmmiting vcf combination in sample " + sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Combining vcf in " +
                            sample + END_FORMATTING)
                merge_vcf(output_vcf_sub, out_variant_indel_sample)

            #VARIANT FORMAT ADAPTATION TO IVAR ##################
            #####################################################
            out_variant_tsv_file = os.path.join(
                sample_variant_dir, 'snps.all.ivar.tsv')

            if os.path.isfile(out_variant_tsv_file):
                logger.info(YELLOW + DIM + out_variant_tsv_file +
                            " EXIST\nOmmiting format adaptation for sample " + sample + END_FORMATTING)
            else:
                logger.info(
                    GREEN + "Adapting variants format in sample " + sample + END_FORMATTING)
                prior = datetime.datetime.now()
                vcf_to_ivar_tsv(out_variant_all_sample, out_variant_tsv_file)
                after = datetime.datetime.now()
                print(("Done with function in: %s" % (after - prior)))

        # NOTE(review): the species and stats steps below are assumed to run
        # for every selected sample (each has its own "already exists" check),
        # not only when the variant table was missing - confirm.

        # SPECIES DETERMINATION
        ###################################################
        check_create_dir(out_species_dir)

        output_species = os.path.join(
            out_species_dir, sample + ".screen.tab")

        if os.path.isfile(output_species):
            logger.info(YELLOW + DIM + output_species +
                        " EXIST\nOmmiting Species determinatin in " + sample + END_FORMATTING)
        else:
            logger.info(
                GREEN + "Determining species in " + sample + END_FORMATTING)
            mash_screen(r1_file, out_species_dir, r2_file=r2_file, winner=True,
                        threads=args.threads, mash_database=args.mash_database)

        ########################CREATE STATS AND QUALITY FILTERS###############
        #######################################################################

        #CREATE Bamstats#######################################
        #######################################################
        check_create_dir(out_stats_dir)
        check_create_dir(out_stats_bamstats_dir)
        out_bamstats_name = sample + ".bamstats"
        out_bamstats_file = os.path.join(
            out_stats_bamstats_dir, out_bamstats_name)
        bam_sample_file = os.path.join(sample_variant_dir, sample + ".bam")

        if os.path.isfile(out_bamstats_file):
            logger.info(YELLOW + DIM + out_bamstats_file +
                        " EXIST\nOmmiting Bamstats for sample " + sample + END_FORMATTING)
        else:
            logger.info(GREEN + "Creating bamstats in sample " +
                        sample + END_FORMATTING)
            create_bamstat(
                bam_sample_file, out_stats_bamstats_dir, sample, threads=args.threads)

        #CREATE Coverage#######################################
        #######################################################
        check_create_dir(out_stats_coverage_dir)
        out_coverage_name = sample + ".cov"
        out_coverage_file = os.path.join(
            out_stats_coverage_dir, out_coverage_name)

        if os.path.isfile(out_coverage_file):
            # Message previously said "Bamstats" (copy-paste slip)
            logger.info(YELLOW + DIM + out_coverage_file +
                        " EXIST\nOmmiting Coverage for sample " + sample + END_FORMATTING)
        else:
            logger.info(GREEN + "Creating coverage in sample " +
                        sample + END_FORMATTING)
            create_coverage(bam_sample_file, out_stats_coverage_dir, sample)

    # coverage OUTPUT SUMMARY
    ######################################################
    prior_recal = datetime.datetime.now()
    logger.info(GREEN + "Creating summary report for coverage result in group " +
                group_name + END_FORMATTING)
    obtain_group_cov_stats(out_stats_dir, group_name)
    after_recal = datetime.datetime.now()
    logger.info("Done with report for coverage: %s" %
                (after_recal - prior_recal))

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overal summary report in group " +
                group_name + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED
    ######################################################
    logger.info(GREEN + "Removing low quality samples in group " +
                group_name + END_FORMATTING)
    uncovered_samples = remove_low_quality(
        output, min_coverage=args.coverage20, min_hq_snp=args.min_snp, type_remove='Uncovered')

    # "> 0", not "> 1": a single uncovered sample must be reported too
    if len(uncovered_samples) > 0:
        logger.info(GREEN + "Uncovered samples: " +
                    (",").join(uncovered_samples) + END_FORMATTING)
    else:
        logger.info(GREEN + "NO uncovered samples found" + END_FORMATTING)

    # RUN SNIPPY CORE
    ######################################################
    if args.core:
        check_create_dir(out_core_dir)
        logger.info(GREEN + "Running snippy-core " +
                    group_name + END_FORMATTING)
        run_snippy_core(out_variant_dir, out_core_dir, reference)

        logger.info(GREEN + "Adapting core-snp to compare format " +
                    group_name + END_FORMATTING)
        core_vcf_file = os.path.join(out_core_dir, "core.vcf")
        core_vcf_file_adapted = os.path.join(
            out_core_dir, "core.vcf.adapted.tsv")
        core_vcf_file_removed = os.path.join(
            out_core_dir, "core.vcf.adapted.final.tsv")

        core_vcf_df_adapted = import_VCF4_core_to_compare(core_vcf_file)
        core_vcf_df_adapted.to_csv(
            core_vcf_file_adapted, sep="\t", index=False)

        logger.info(GREEN + "Obtaining clustered positions " +
                    group_name + END_FORMATTING)
        close_positions_list = extract_close_snps(
            core_vcf_df_adapted, snps_in_10=1)

        logger.info(GREEN + "Obtaining uncovered positions " +
                    group_name + END_FORMATTING)
        uncovered_list = identify_uncovered(
            out_stats_coverage_dir, min_coverage=10, nocall_fr=0.5)

        logger.debug('Clustered positions in core SNP:\n{}'.format(
            (",".join([str(x) for x in close_positions_list]))))
        logger.debug('Uncovered positions in all samples:\n{}'.format(
            (",".join([str(x) for x in uncovered_list]))))

        to_remove_list = close_positions_list + uncovered_list
        remove_df = remove_position_from_compare(
            core_vcf_df_adapted, to_remove_list)
        remove_df.to_csv(core_vcf_file_removed, sep="\t", index=False)

        ddtb_compare(core_vcf_file_removed, distance=10)

    #ANNOTATION WITH SNPEFF AND USER INPUT ##############
    #####################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)

    # SNPEFF
    # Explicit comparison with False kept: only the literal False default
    # disables the step.
    if args.snpeff_database != False:
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.vcf':
                    sample = os.path.basename(root)
                    filename = os.path.join(root, name)
                    chrom_filename = os.path.join(
                        root, 'snps.all.chromosome.vcf')
                    out_annot_file = os.path.join(
                        out_annot_snpeff_dir, sample + ".annot")
                    if os.path.isfile(out_annot_file):
                        logger.info(YELLOW + DIM + out_annot_file +
                                    " EXIST\nOmmiting snpEff Annotation for sample " + sample + END_FORMATTING)
                    else:
                        logger.info(
                            GREEN + "Annotating sample with snpEff: " + sample + END_FORMATTING)
                        rename_reference_snpeff(filename, chrom_filename)
                        annotate_snpeff(chrom_filename, out_annot_file,
                                        database=args.snpeff_database)
    else:
        logger.info(YELLOW + DIM + " No SnpEff database suplied, skipping annotation in group " +
                    group_name + END_FORMATTING)

    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(
            YELLOW + BOLD + "Ommiting User Annotation, no BED or VCF files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.ivar.tsv':
                    sample = os.path.basename(root)
                    logger.info(
                        'User bed/vcf annotation in sample {}'.format(sample))
                    filename = os.path.join(root, name)
                    out_annot_file = os.path.join(
                        out_annot_user_dir, sample + ".tsv")
                    user_annotation(
                        filename, out_annot_file, vcf_files=args.annot_vcf, bed_files=args.annot_bed)

    # USER AA DEFINED
    if not args.annot_aa:
        logger.info(
            YELLOW + BOLD + "Ommiting User aa Annotation, no AA files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            # Only files directly in the snpeff folder, not in subfolders
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User aa annotation in sample {}'.format(sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        if os.path.isfile(out_annot_aa_file):
                            # An existing table is used as both input and
                            # output of the annotation call.
                            user_annotation_aa(
                                out_annot_aa_file, out_annot_aa_file, aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(
                                filename, out_annot_aa_file, aa_files=args.annot_aa)

    # USER FASTA ANNOTATION
    if not args.annot_fasta:
        logger.info(
            YELLOW + BOLD + "Ommiting User FASTA Annotation, no FASTA files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_blast_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name.endswith('.consensus.subs.fa'):
                    filename = os.path.join(root, name)
                    sample = os.path.basename(root)
                    logger.info(
                        'User FASTA annotation in sample {}'.format(sample))
                    for db in args.annot_fasta:
                        # NOTE(review): threads is hard-coded to 8 here rather
                        # than taken from args.threads - confirm intent.
                        make_blast(filename, db, sample, out_annot_blast_dir,
                                   db_type="nucl", query_type="nucl", evalue=0.0001, threads=8)

    # USER AA TO HTML
    if not args.annot_aa:
        logger.info(
            YELLOW + BOLD + "Ommiting User aa Annotation to HTML, no AA files supplied" + END_FORMATTING)
    else:
        annotated_samples = []
        logger.info('Adapting annotation to html in {}'.format(group_name))
        for root, _, files in os.walk(out_annot_user_aa_dir):
            if root == out_annot_user_aa_dir:
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        annotated_samples.append(sample)
                        filename = os.path.join(root, name)
                        annotation_to_html(filename, sample)
        annotated_samples = [str(x) for x in annotated_samples]
        # Fill the ALLSAMPLES placeholder of the html template
        report_samples_html_all = report_samples_html.replace(
            'ALLSAMPLES', ('","').join(annotated_samples))
        with open(os.path.join(out_annot_user_aa_dir, '00_all_samples.html'), 'w+') as f:
            f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + \
        ".revised_intermediate.tsv"
    compare_snp_matrix_recal_mpileup = full_path_compare + \
        ".revised_intermediate_vcf.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"

    # Create intermediate
    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_dir, out_stats_coverage_dir, min_freq_discard=0.1, min_alt_dp=10, only_snp=False)

    # Remove SNPs from BED file (PE/PPE)
    if args.remove_bed:
        recalibrated_snp_matrix_intermediate = remove_bed_positions(
            recalibrated_snp_matrix_intermediate, args.remove_bed)

    # Written unconditionally: the recalibration step below receives this path
    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    # Recalibrate intermediate with VCF
    prior_recal = datetime.datetime.now()
    recalibrated_snp_matrix_mpileup = recalibrate_ddbb_vcf_intermediate(
        compare_snp_matrix_recal_intermediate, out_variant_dir, min_cov_low_freq=10)
    recalibrated_snp_matrix_mpileup.to_csv(
        compare_snp_matrix_recal_mpileup, sep="\t", index=False)
    after_recal = datetime.datetime.now()
    logger.debug("Done with recalibration vcf: %s" %
                 (after_recal - prior_recal))

    # Remove SNPs located within INDELs
    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_mpileup)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)

    # Extract all positions marked as complex
    complex_variants = extract_complex_list(out_variant_dir)
    logger.debug('Complex positions in all samples:\n{}'.format(
        (",".join([str(x) for x in complex_variants]))))

    # Clean all faulty positions and samples => Final table
    recalibrated_revised_INDEL_df = revised_df(
        compare_snp_matrix_INDEL_intermediate_df,
        path_compare,
        complex_pos=complex_variants,
        min_freq_include=0.8,
        min_threshold_discard_uncov_sample=args.min_threshold_discard_uncov_sample,
        min_threshold_discard_uncov_pos=args.min_threshold_discard_uncov_pos,
        min_threshold_discard_htz_sample=args.min_threshold_discard_htz_sample,
        min_threshold_discard_htz_pos=args.min_threshold_discard_htz_pos,
        min_threshold_discard_all_pos=args.min_threshold_discard_all_pos,
        min_threshold_discard_all_sample=args.min_threshold_discard_all_sample,
        remove_faulty=True,
        drop_samples=True,
        drop_positions=True,
        windows_size_discard=args.window)
    recalibrated_revised_INDEL_df.to_csv(
        compare_snp_matrix_recal, sep="\t", index=False)

    # Matrix to pairwise and mwk
    ddtb_compare(compare_snp_matrix_recal, distance=5)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    logger.info("\n\n" + MAGENTA + BOLD +
                "#####END OF PIPELINE AUTOSNIPPY ANALYSIS#####" + END_FORMATTING + "\n")
def main(): """ Create main function to capture code errors: https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python """ # ARGUMENTS def get_arguments(): parser = argparse.ArgumentParser( prog='covidma.py', description= 'Pipeline to call variants (SNVs) with any non model organism. Specialised in SARS-CoV-2' ) input_group = parser.add_argument_group('Input', 'Input parameters') input_group.add_argument( '-i', '--input', dest="input_dir", metavar="input_directory", type=str, required=True, help='REQUIRED.Input directory containing all fast[aq] files') input_group.add_argument('-r', '--reference', metavar="reference", type=str, required=True, help='REQUIRED. File to map against') input_group.add_argument( '-a', '--annotation', metavar="annotation", type=str, required=True, help='REQUIRED. gff3 file to annotate variants') input_group.add_argument('-s', '--sample', metavar="sample", type=str, required=False, help='Sample to identify further files') input_group.add_argument( '-L', '--sample_list', type=str, required=False, help='Sample names to analyse only in the file supplied') input_group.add_argument( '-p', '--primers', type=str, default= '/home/laura/DATABASES/Anotacion/COVID/primers/nCoV-2019.bed', required=False, help='Bed file including primers to trim') quality_group = parser.add_argument_group( 'Quality parameters', 'parameters for diferent triming conditions') quality_group.add_argument( '-c', '--coverage20', type=int, default=90, required=False, help= 'Minimum percentage of coverage at 20x to clasify as uncovered (Default 90)' ) quality_group.add_argument('-n', '--min_snp', type=int, required=False, default=1, help='SNP number to pass quality threshold') output_group = parser.add_argument_group( 'Output', 'Required parameter to output results') output_group.add_argument( '-o', '--output', type=str, required=True, help='REQUIRED. 
Output directory to extract all results') output_group.add_argument( '-C', '--noclean', required=False, action='store_false', help='Clean unwanted files for standard execution') params_group = parser.add_argument_group( 'Parameters', 'parameters for diferent stringent conditions') params_group.add_argument('-T', '--threads', type=str, dest="threads", required=False, default=16, help='Threads to use') params_group.add_argument('-M', '--memory', type=str, dest="memory", required=False, default=32, help='Max memory to use') annot_group = parser.add_argument_group( 'Annotation', 'parameters for variant annotation') annot_group.add_argument('-B', '--annot_bed', type=str, default=[], required=False, action='append', help='bed file to annotate') annot_group.add_argument('-V', '--annot_vcf', type=str, default=[], required=False, action='append', help='vcf file to annotate') annot_group.add_argument('-A', '--annot_aa', type=str, default=[], required=False, action='append', help='aminoacid file to annotate') annot_group.add_argument('-R', '--remove_bed', type=str, default=False, required=False, help='BED file with positions to remove') annot_group.add_argument( '--mash_database', type=str, required=False, default=False, help='MASH ncbi annotation containing all species database') annot_group.add_argument('--snpeff_database', type=str, required=False, default='NC_045512.2', help='snpEFF annotation database') compare_group = parser.add_argument_group( 'Compare', 'parameters for compare_snp') compare_group.add_argument('-S', '--only_snp', required=False, action='store_true', help='Use INDELS while comparing') arguments = parser.parse_args() return arguments args = get_arguments() ###################################################################### #####################START PIPELINE################################### ###################################################################### output = os.path.abspath(args.output) group_name = output.split("/")[-1] reference = 
os.path.abspath(args.reference) annotation = os.path.abspath(args.annotation) # LOGGING # Create log file with date and time right_now = str(datetime.datetime.now()) right_now_full = "_".join(right_now.split(" ")) log_filename = group_name + "_" + right_now_full + ".log" log_folder = os.path.join(output, 'Logs') check_create_dir(log_folder) log_full_path = os.path.join(log_folder, log_filename) logger = logging.getLogger() logger.setLevel(logging.DEBUG) formatter = logging.Formatter('%(asctime)s:%(message)s') file_handler = logging.FileHandler(log_full_path) file_handler.setLevel(logging.DEBUG) file_handler.setFormatter(formatter) stream_handler = logging.StreamHandler() stream_handler.setLevel(logging.INFO) # stream_handler.setFormatter(formatter) logger.addHandler(stream_handler) logger.addHandler(file_handler) logger.info("\n\n" + BLUE + BOLD + "STARTING PIPELINE IN GROUP: " + group_name + END_FORMATTING) today = str(datetime.date.today()) logger.info("ARGUMENTS:") logger.info(str(args)) # Obtain all R1 and R2 from folder r1, r2 = extract_read_list(args.input_dir) # Check if there are samples to filter out sample_list_F = [] if args.sample_list == None: logger.info("\n" + "No samples to filter") for r1_file, r2_file in zip(r1, r2): sample = extract_sample(r1_file, r2_file) sample_list_F.append(sample) else: logger.info("samples will be filtered") sample_list_F = file_to_list(args.sample_list) new_samples = check_reanalysis(args.output, sample_list_F) logger.info("\n%d samples will be analysed: %s" % (len(new_samples), ",".join(new_samples))) #PREPARE REFERENCE FOR MAPPING + FAI + DICT ######### ##################################################### # picard_dictionary(args) samtools_faidx(args) #DECLARE FOLDERS CREATED IN PIPELINE ################ #AND KEY FILES ###################################### ##################################################### # Annotation related parameters # script_dir = os.path.dirname(os.path.realpath(__file__)) # Output related 
out_qc_dir = os.path.join(output, "Quality") out_qc_pre_dir = os.path.join(out_qc_dir, "raw") # subfolder out_qc_post_dir = os.path.join(out_qc_dir, "processed") # subfolder out_trim_dir = os.path.join(output, "Trimmed") out_map_dir = os.path.join(output, "Bam") out_variant_dir = os.path.join(output, "Variants") out_variant_ivar_dir = os.path.join(out_variant_dir, "ivar_raw") # subfolder out_filtered_ivar_dir = os.path.join(out_variant_dir, "ivar_filtered") # subfolder out_consensus_dir = os.path.join(output, "Consensus") out_consensus_ivar_dir = os.path.join(out_consensus_dir, "ivar") # subfolder out_stats_dir = os.path.join(output, "Stats") out_stats_bamstats_dir = os.path.join(out_stats_dir, "Bamstats") # subfolder out_stats_coverage_dir = os.path.join(out_stats_dir, "Coverage") # subfolder out_compare_dir = os.path.join(output, "Compare") out_annot_dir = os.path.join(output, "Annotation") out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff") # subfolder out_annot_pangolin_dir = os.path.join(out_annot_dir, "pangolin") # subfolder out_annot_user_dir = os.path.join(out_annot_dir, "user") # subfolder out_annot_user_aa_dir = os.path.join(out_annot_dir, "user_aa") # subfolder new_sample_number = 0 for r1_file, r2_file in zip(r1, r2): # EXtract sample name sample = extract_sample(r1_file, r2_file) args.sample = sample if sample in sample_list_F: sample_number = str(sample_list_F.index(sample) + 1) sample_total = str(len(sample_list_F)) out_markdup_trimmed_name = sample + ".rg.markdup.trimmed.sorted.bam" output_markdup_trimmed_file = os.path.join( out_map_dir, out_markdup_trimmed_name) if sample in new_samples: new_sample_number = str(int(new_sample_number) + 1) new_sample_total = str(len(new_samples)) logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample + " (" + sample_number + "/" + sample_total + ")" + " (" + new_sample_number + "/" + new_sample_total + ")" + END_FORMATTING) else: logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample + " (" + 
sample_number + "/" + sample_total + ")" + END_FORMATTING) if not os.path.isfile(output_markdup_trimmed_file): args.r1_file = r1_file args.r2_file = r2_file ##############START PIPELINE##################### ################################################# # INPUT ARGUMENTS ################ check_file_exists(r1_file) check_file_exists(r2_file) args.output = os.path.abspath(args.output) check_create_dir(args.output) # QUALITY CHECK in RAW with fastqc ###################################################### check_create_dir(out_qc_dir) out_qc_raw_name_r1 = (".").join( r1_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html' out_qc_raw_name_r2 = (".").join( r2_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html' output_qc_raw_file_r1 = os.path.join(out_qc_pre_dir, out_qc_raw_name_r1) output_qc_raw_file_r2 = os.path.join(out_qc_pre_dir, out_qc_raw_name_r2) if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile( output_qc_raw_file_r2): logger.info(YELLOW + DIM + output_qc_raw_file_r1 + " EXIST\nOmmiting QC for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Checking quality in sample " + sample + END_FORMATTING) logger.info("R1: " + r1_file + "\nR2: " + r2_file) fastqc_quality(r1_file, r2_file, out_qc_pre_dir, args.threads) """ TODO: Human filter """ # QUALITY TRIMMING AND ADAPTER REMOVAL WITH fastp ################################################### out_trim_name_r1 = sample + ".trimmed_R1.fastq.gz" out_trim_name_r2 = sample + ".trimmed_R2.fastq.gz" output_trimming_file_r1 = os.path.join(out_trim_dir, out_trim_name_r1) output_trimming_file_r2 = os.path.join(out_trim_dir, out_trim_name_r2) if os.path.isfile(output_trimming_file_r1) and os.path.isfile( output_trimming_file_r2): logger.info(YELLOW + DIM + output_trimming_file_r1 + " EXIST\nOmmiting Trimming for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Trimming sample " + sample + END_FORMATTING) fastp_trimming(r1_file, r2_file, sample, out_trim_dir, threads=args.threads, 
min_qual=20, window_size=10, min_len=35) # QUALITY CHECK in TRIMMED with fastqc ###################################################### check_create_dir(out_qc_dir) out_qc_pos_r1 = sample + ".trimmed_R1_fastqc.html" out_qc_pos_r2 = sample + ".trimmed_R2_fastqc.html" output_qc_precessed_file_r1 = os.path.join( out_qc_post_dir, out_qc_pos_r1) output_qc_precessed_file_r2 = os.path.join( out_qc_post_dir, out_qc_pos_r2) if os.path.isfile( output_qc_precessed_file_r1) and os.path.isfile( output_qc_precessed_file_r2): logger.info(YELLOW + DIM + output_qc_raw_file_r1 + " EXIST\nOmmiting QC for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Checking quality in processed sample " + sample + END_FORMATTING) logger.info("R1: " + output_trimming_file_r1 + "\nR2: " + output_trimming_file_r2) fastqc_quality(output_trimming_file_r1, output_trimming_file_r2, out_qc_post_dir, args.threads) # MAPPING WITH BWA - SAM TO SORTED BAM - ADD HEADER SG ##################################################### out_map_name = sample + ".rg.sorted.bam" output_map_file = os.path.join(out_map_dir, out_map_name) if os.path.isfile(output_map_file): logger.info(YELLOW + DIM + output_map_file + " EXIST\nOmmiting Mapping for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Mapping sample " + sample + END_FORMATTING) logger.info("R1: " + output_trimming_file_r1 + "\nR2: " + output_trimming_file_r2 + "\nReference: " + reference) bwa_mapping(output_trimming_file_r1, output_trimming_file_r2, reference, sample, out_map_dir, threads=args.threads) sam_to_index_bam(sample, out_map_dir, output_trimming_file_r1, threads=args.threads) #MARK DUPLICATES WITH PICARDTOOLS ################### ##################################################### out_markdup_name = sample + ".rg.markdup.sorted.bam" output_markdup_file = os.path.join(out_map_dir, out_markdup_name) if os.path.isfile(output_markdup_file): logger.info(YELLOW + DIM + output_markdup_file + " EXIST\nOmmiting Duplucate Mark for sample 
" + sample + END_FORMATTING) else: logger.info(GREEN + "Marking Dupes in sample " + sample + END_FORMATTING) logger.info("Input Bam: " + output_map_file) picard_markdup(output_map_file) #TRIM PRIMERS WITH ivar trim ######################## ##################################################### if os.path.isfile(output_markdup_trimmed_file): logger.info(YELLOW + DIM + output_markdup_trimmed_file + " EXIST\nOmmiting Duplucate Mark for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Trimming primers in sample " + sample + END_FORMATTING) logger.info("Input Bam: " + output_markdup_file) ivar_trim(output_markdup_file, args.primers, sample, min_length=30, min_quality=20, sliding_window_width=4) else: logger.info( YELLOW + DIM + output_markdup_trimmed_file + " EXIST\nOmmiting BAM mapping and BAM manipulation in sample " + sample + END_FORMATTING) ########################END OF MAPPING AND BAM MANIPULATION##################################################################### ################################################################################################################################ #VARIANT CALLING WTIH ivar variants################## ##################################################### check_create_dir(out_variant_dir) out_ivar_variant_name = sample + ".tsv" out_ivar_variant_file = os.path.join(out_variant_ivar_dir, out_ivar_variant_name) if os.path.isfile(out_ivar_variant_file): logger.info(YELLOW + DIM + out_ivar_variant_file + " EXIST\nOmmiting Variant call for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Calling variants with ivar in sample " + sample + END_FORMATTING) ivar_variants(reference, output_markdup_trimmed_file, out_variant_dir, sample, annotation, min_quality=15, min_frequency_threshold=0.01, min_depth=1) #VARIANT FILTERING ################################## ##################################################### check_create_dir(out_filtered_ivar_dir) out_ivar_filtered_file = 
os.path.join(out_filtered_ivar_dir, out_ivar_variant_name) if os.path.isfile(out_ivar_filtered_file): logger.info(YELLOW + DIM + out_ivar_filtered_file + " EXIST\nOmmiting Variant filtering for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Filtering variants in sample " + sample + END_FORMATTING) filter_tsv_variants(out_ivar_variant_file, out_filtered_ivar_dir, min_frequency=0.7, min_total_depth=10, min_alt_dp=4, is_pass=True, only_snp=False) #CREATE CONSENSUS with ivar consensus################## ####################################################### check_create_dir(out_consensus_dir) check_create_dir(out_consensus_ivar_dir) out_ivar_consensus_name = sample + ".fa" out_ivar_consensus_file = os.path.join(out_consensus_ivar_dir, out_ivar_consensus_name) if os.path.isfile(out_ivar_consensus_file): logger.info(YELLOW + DIM + out_ivar_consensus_file + " EXIST\nOmmiting Consensus for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Creating consensus with ivar in sample " + sample + END_FORMATTING) ivar_consensus(output_markdup_trimmed_file, out_consensus_ivar_dir, sample, min_quality=20, min_frequency_threshold=0.8, min_depth=20, uncovered_character='N') logger.info(GREEN + "Replacing consensus header in " + sample + END_FORMATTING) replace_consensus_header(out_ivar_consensus_file) ########################CREATE STATS AND QUALITY FILTERS######################################################################## ################################################################################################################################ #CREATE Bamstats####################################### ####################################################### check_create_dir(out_stats_dir) check_create_dir(out_stats_bamstats_dir) out_bamstats_name = sample + ".bamstats" out_bamstats_file = os.path.join(out_stats_bamstats_dir, out_bamstats_name) if os.path.isfile(out_bamstats_file): logger.info(YELLOW + DIM + out_bamstats_file + " EXIST\nOmmiting 
Bamstats for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Creating bamstats in sample " + sample + END_FORMATTING) create_bamstat(output_markdup_trimmed_file, out_stats_bamstats_dir, sample, threads=args.threads) #CREATE Bamstats####################################### ####################################################### check_create_dir(out_stats_coverage_dir) out_coverage_name = sample + ".cov" out_coverage_file = os.path.join(out_stats_coverage_dir, out_coverage_name) if os.path.isfile(out_coverage_file): logger.info(YELLOW + DIM + out_coverage_file + " EXIST\nOmmiting Bamstats for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Creating coverage in sample " + sample + END_FORMATTING) create_coverage(output_markdup_trimmed_file, out_stats_coverage_dir, sample) # fastqc OUTPUT FORMAT FOR COMPARISON ###################################################### logger.info(GREEN + "Creating summary report for quality result " + END_FORMATTING) # format_html_image(out_qc_dir) # coverage OUTPUT SUMMARY ###################################################### logger.info(GREEN + "Creating summary report for coverage result " + END_FORMATTING) obtain_group_cov_stats(out_stats_coverage_dir, group_name) # READS and VARIANTS OUTPUT SUMMARY ###################################################### logger.info(GREEN + "Creating overal summary report " + END_FORMATTING) obtain_overal_stats(output, group_name) # REMOVE UNCOVERED ############################################################################################################################## logger.info(GREEN + "Removing low quality samples" + END_FORMATTING) # remove_low_quality(output, min_percentage_20x=args.coverage20, # min_hq_snp=args.min_snp, type_remove='Uncovered') #ANNOTATION WITH SNPEFF, USER INOUT AND PANGOLIN #### ##################################################### logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " + group_name + END_FORMATTING + "\n") 
check_create_dir(out_annot_dir) check_create_dir(out_annot_snpeff_dir) check_create_dir(out_annot_pangolin_dir) # SNPEFF if args.snpeff_database != False: # CHANGE FOR RAW/FILTERED ANNOTATION for root, _, files in os.walk(out_filtered_ivar_dir): if root == out_filtered_ivar_dir: # CHANGE FOR RAW/FILTERED ANNOTATION for name in files: if name.endswith('.tsv'): sample = name.split('.')[0] filename = os.path.join(root, name) out_annot_file = os.path.join(out_annot_snpeff_dir, sample + ".annot") if os.path.isfile(out_annot_file): logger.info( YELLOW + DIM + out_annot_file + " EXIST\nOmmiting snpEff Annotation for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Annotating sample with snpEff: " + sample + END_FORMATTING) output_vcf = os.path.join(out_annot_snpeff_dir, sample + '.vcf') annotate_snpeff(filename, output_vcf, out_annot_file, database=args.snpeff_database) # USER DEFINED if not args.annot_bed and not args.annot_vcf: logger.info(YELLOW + BOLD + "Ommiting User Annotation, no BED or VCF files supplied" + END_FORMATTING) else: check_create_dir(out_annot_user_dir) # CHANGE FOR RAW/FILTERED ANNOTATION for root, _, files in os.walk(out_variant_ivar_dir): if root == out_variant_ivar_dir: # CHANGE FOR RAW/FILTERED ANNOTATION for name in files: if name.endswith('.tsv'): sample = name.split('.')[0] logger.info( 'User bed/vcf annotation in sample {}'.format( sample)) filename = os.path.join(root, name) out_annot_file = os.path.join(out_annot_user_dir, sample + ".tsv") user_annotation(filename, out_annot_file, vcf_files=args.annot_vcf, bed_files=args.annot_bed) # USER AA DEFINED if not args.annot_aa: logger.info(YELLOW + BOLD + "Ommiting User aa Annotation, no AA files supplied" + END_FORMATTING) else: check_create_dir(out_annot_user_aa_dir) for root, _, files in os.walk(out_annot_snpeff_dir): if root == out_annot_snpeff_dir: for name in files: if name.endswith('.annot'): sample = name.split('.')[0] logger.info( 'User aa annotation in sample 
{}'.format(sample)) filename = os.path.join(root, name) out_annot_aa_file = os.path.join( out_annot_user_aa_dir, sample + ".tsv") if os.path.isfile(out_annot_aa_file): user_annotation_aa(out_annot_aa_file, out_annot_aa_file, aa_files=args.annot_aa) else: user_annotation_aa(filename, out_annot_aa_file, aa_files=args.annot_aa) # PANGOLIN with concurrent.futures.ThreadPoolExecutor( max_workers=args.threads) as executor: futures_pangolin = [] for root, _, files in os.walk(out_consensus_ivar_dir): if root == out_consensus_ivar_dir: for name in files: if name.endswith('.fa'): sample = name.split('.')[0] filename = os.path.join(root, name) out_pangolin_filename = sample + ".lineage.csv" out_pangolin_file = os.path.join( out_annot_pangolin_dir, out_pangolin_filename) if os.path.isfile(out_pangolin_file): logger.info( YELLOW + DIM + out_pangolin_file + " EXIST\nOmmiting Lineage for sample " + sample + END_FORMATTING) else: logger.info(GREEN + "Obtaining Lineage in sample " + sample + END_FORMATTING) future = executor.submit(annotate_pangolin, filename, out_annot_pangolin_dir, out_pangolin_filename, threads=args.threads, max_ambig=0.6) futures_pangolin.append(future) for future in concurrent.futures.as_completed( futures_pangolin): logger.info(future.result()) # annotate_pangolin(filename, out_annot_pangolin_dir, # out_pangolin_filename, threads=args.threads, max_ambig=0.6) # USER AA TO HTML annotated_samples = [] logger.info('Adapting annotation to html in {}'.format(group_name)) for root, _, files in os.walk(out_annot_user_aa_dir): if root == out_annot_user_aa_dir: for name in files: if name.endswith('.tsv'): sample = name.split('.')[0] annotated_samples.append(sample) filename = os.path.join(root, name) annotation_to_html(filename, sample) annotated_samples = [str(x) for x in annotated_samples] report_samples_html_all = report_samples_html.replace( 'ALLSAMPLES', ('","').join(annotated_samples)) # NEW with open(os.path.join(out_annot_user_aa_dir, '00_all_samples.html'), 
'w+') as f: f.write(report_samples_html_all) # SNP COMPARISON using tsv variant files ###################################################### logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " + group_name + END_FORMATTING + "\n") check_create_dir(out_compare_dir) folder_compare = today + "_" + group_name path_compare = os.path.join(out_compare_dir, folder_compare) check_create_dir(path_compare) full_path_compare = os.path.join(path_compare, group_name) # ddtb_add(out_filtered_ivar_dir, full_path_compare) compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv" compare_snp_matrix_INDEL = full_path_compare + ".revised_INDEL.final.tsv" compare_snp_matrix_recal_intermediate = full_path_compare + ".revised_intermediate.tsv" compare_snp_matrix_INDEL_intermediate = full_path_compare + \ ".revised_INDEL_intermediate.tsv" recalibrated_snp_matrix_intermediate = ddbb_create_intermediate( out_variant_ivar_dir, out_stats_coverage_dir, min_freq_discard=0.1, min_alt_dp=4, only_snp=args.only_snp) recalibrated_snp_matrix_intermediate.to_csv( compare_snp_matrix_recal_intermediate, sep="\t", index=False) compare_snp_matrix_INDEL_intermediate_df = remove_position_range( recalibrated_snp_matrix_intermediate) compare_snp_matrix_INDEL_intermediate_df.to_csv( compare_snp_matrix_INDEL_intermediate, sep="\t", index=False) recalibrated_revised_df = revised_df(recalibrated_snp_matrix_intermediate, path_compare, min_freq_include=0.7, min_threshold_discard_sample=0.07, min_threshold_discard_position=0.4, remove_faulty=True, drop_samples=True, drop_positions=True) recalibrated_revised_df.to_csv(compare_snp_matrix_recal, sep="\t", index=False) recalibrated_revised_INDEL_df = revised_df( compare_snp_matrix_INDEL_intermediate_df, path_compare, min_freq_include=0.7, min_threshold_discard_sample=0.07, min_threshold_discard_position=0.4, remove_faulty=True, drop_samples=True, drop_positions=True) recalibrated_revised_INDEL_df.to_csv(compare_snp_matrix_INDEL, sep="\t", 
index=False) ddtb_compare(compare_snp_matrix_recal, distance=0) ddtb_compare(compare_snp_matrix_INDEL, distance=0, indel=True) logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " + group_name + END_FORMATTING + "\n") #####################CONSENSUS WITH REFINED CALL###### ###################################################### logger.info(GREEN + "Creating refined consensus" + END_FORMATTING) create_consensus(reference, compare_snp_matrix_recal, out_stats_coverage_dir, out_consensus_dir) logger.info("\n\n" + MAGENTA + BOLD + "#####END OF PIPELINE COVID MULTI ANALYSIS#####" + END_FORMATTING + "\n")