def bbduk_trimming(args):
    """Run bbduk.sh quality/adapter trimming on one read pair.

    The command line is assembled from ``args`` and handed to
    ``execute_subprocess``.  Trimming options (trimq, minlen, k, ...) are
    hard-coded.

    TODO : handle params

    Args:
        args: namespace exposing ``r1_file``, ``r2_file``, ``memory`` and
            ``threads``; it is also forwarded to ``obtain_output_dir``.
    """
    fastq_r1 = os.path.abspath(args.r1_file)
    fastq_r2 = os.path.abspath(args.r2_file)

    trimmed_dir = obtain_output_dir(args, "Trimmed")
    sample = extract_sample(fastq_r1, fastq_r2)

    # All three output files share the "<dir>/<sample>" prefix.
    out_prefix = "{}/{}".format(trimmed_dir, sample)
    adapters = get_bbduk_adapters()

    check_create_dir(trimmed_dir)

    # bbduk.sh
    bbduk_cmd = ["bbduk.sh", "-Xmx{}g".format(str(args.memory))]
    bbduk_cmd += ["in1=" + fastq_r1, "in2=" + fastq_r2]
    bbduk_cmd += [
        "out1=" + out_prefix + "_R1.clean.fastq.gz",
        "out2=" + out_prefix + "_R2.clean.fastq.gz",
    ]
    bbduk_cmd.append("ref=" + adapters)
    bbduk_cmd += [
        "trimq=15", "qtrim=rl", "minlen=40",
        "ktrim=r", "k=21", "mink=11",
        "hammingdistance=2",
    ]
    bbduk_cmd.append("threads=" + str(args.threads))
    bbduk_cmd += ["tpe", "tbo"]
    bbduk_cmd.append("stats=" + out_prefix + "_trim.stats")

    execute_subprocess(bbduk_cmd)
def bwa_mapping(args):
    """Index the reference with bwa and map one read pair with ``bwa mem``.

    The alignment is written to ``<sample>.sam`` inside the directory
    returned by ``obtain_output_dir(args, "Bam")``, using bwa's own ``-o``
    option rather than redirecting stdout.  Background on redirecting
    subprocess output:
    https://stackoverflow.com/questions/4965159/how-to-redirect-output-with-subprocess-in-python

    Args:
        args: namespace exposing ``r1_file``, ``r2_file``, ``reference`` and
            ``threads``; it is also forwarded to ``obtain_output_dir``.
    """
    fastq_r1 = os.path.abspath(args.r1_file)
    fastq_r2 = os.path.abspath(args.r2_file)
    ref_fasta = os.path.abspath(args.reference)

    sample = extract_sample(fastq_r1, fastq_r2)
    bam_dir = obtain_output_dir(args, "Bam")
    sam_path = os.path.join(bam_dir, sample + ".sam")

    check_create_dir(bam_dir)

    # The index is (re)built on every call before mapping.
    execute_subprocess(["bwa", "index", ref_fasta])

    mem_cmd = ["bwa", "mem"]
    mem_cmd.extend(["-t", str(args.threads)])
    mem_cmd.extend(["-o", sam_path])
    mem_cmd.extend([ref_fasta, fastq_r1, fastq_r2])
    execute_subprocess(mem_cmd)
def bowtie2_mapping(args):
    """Build a bowtie2 index for the reference and map one read pair.

    The alignment is written to ``<sample>.sam`` inside the directory
    returned by ``obtain_output_dir(args, "Bam")``.

    Args:
        args: namespace exposing ``r1_file``, ``r2_file``, ``reference``,
            ``threads`` and ``extensive_mapping``; it is also forwarded to
            ``obtain_output_dir``.  When ``extensive_mapping`` is truthy
            bowtie2 is run with ``-a``.
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    output_file = os.path.join(output_dir, sample + ".sam")

    check_create_dir(output_dir)

    # bowtie2 index (the reference path doubles as the index prefix)
    cmd_index = ["bowtie2-build", reference, reference]
    execute_subprocess(cmd_index)

    # bowtie map
    cmd_map = [
        "bowtie2", "-1", r1, "-2", r2, "-S", output_file, "-q",
        "--very-sensitive-local", "-p", str(args.threads), "-x", reference
    ]
    # Only add the flag when requested: an empty-string placeholder would be
    # passed to bowtie2 as a real (empty) positional argument.
    if args.extensive_mapping:
        cmd_map.append("-a")

    execute_subprocess(cmd_map)
def mash_screen(r1_file, out_dir, r2_file=False, winner=True, threads=16, mash_database="/home/laura/DATABASES/Mash/bacteria_mash.msh"):
    """Screen reads against a Mash database and save the result table.

    Runs ``mash screen [-w] -p <threads> <database> <r1> [<r2>]`` and writes
    its stdout to ``<out_dir>/<sample>.screen.tab``.

    https://mash.readthedocs.io/en/latest/index.html
    MASH refseq database:
    https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh
    Example:
    mash screen -w -p 4 ../refseq.genomes.k21s1000.msh 4_R1.fastq.gz 4_R2.fastq.gz > 4.winner.screen.tab
    Output columns: identity, shared-hashes, median-multiplicity, p-value,
    query-ID, query-comment

    Args:
        r1_file: path to the R1 reads.
        out_dir: directory that receives the ``.screen.tab`` file.
        r2_file: optional path to the R2 reads; when given both files are
            screened.
        winner: when truthy, pass ``-w`` to mash.
        threads: value passed to ``-p``.
        mash_database: path to the ``.msh`` sketch database.

    Exits the interpreter (``sys.exit``) when the database is missing or the
    program cannot be started.  A non-zero exit status from mash is logged
    and the incomplete output file is removed.
    """
    if not os.path.isfile(mash_database):
        logger.info(RED + BOLD + "Mash database can't be found\n" + END_FORMATTING +
                    "You can download it typing:\n wget https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh")
        sys.exit(1)

    r1_file = os.path.abspath(r1_file)

    sample = extract_sample(r1_file, r2_file)

    check_create_dir(out_dir)
    species_output_name = sample + ".screen.tab"
    species_output_file = os.path.join(out_dir, species_output_name)

    cmd = ["mash", "screen", "-p", str(threads), mash_database, r1_file]

    if winner:
        cmd.insert(2, "-w")
    # Use both r1 and r2 instead of just r1 (faster)
    if r2_file:
        r2_file = os.path.abspath(r2_file)
        cmd.append(r2_file)

    prog = cmd[0]
    param = cmd[1:]

    try:
        with open(species_output_file, "w") as outfile:
            # calculate mash distance and save it in output file
            command = subprocess.run(cmd, stdout=outfile, stderr=subprocess.PIPE, universal_newlines=True)
    except OSError as e:
        # Do not leave an empty table behind: callers that skip the step when
        # the file exists would otherwise never retry it.
        if os.path.isfile(species_output_file) and os.path.getsize(species_output_file) == 0:
            os.remove(species_output_file)
        sys.exit(RED + BOLD + "failed to execute program '%s': %s" % (prog, str(e)) + END_FORMATTING)

    if command.returncode == 0:
        logger.info(GREEN + "Program %s successfully executed" % prog + END_FORMATTING)
    else:
        # Report through the logger (like the success path) so the failure
        # also reaches the log file.
        logger.error(RED + BOLD + "Command %s FAILED\n" % prog + END_FORMATTING +
                     BOLD + "WITH PARAMETERS: " + END_FORMATTING + " ".join(param) + "\n" +
                     BOLD + "EXIT-CODE: %d\n" % command.returncode +
                     "ERROR:\n" + END_FORMATTING + command.stderr)
        # The table is incomplete; drop it so a re-run regenerates it.
        if os.path.isfile(species_output_file):
            os.remove(species_output_file)
def _find_gvcf_files(directory, skip=None):
    """Return every path ending in ``.g.vcf`` under ``directory``, except ``skip``."""
    found = []
    for root, _, files in os.walk(directory):
        for name in files:
            filename = os.path.join(root, name)
            if filename.endswith(".g.vcf") and filename != skip:
                found.append(filename)
    return found


def combine_gvcf(args, recalibrate=False, all_gvcf=False):
    """Merge per-sample gVCF files into one cohort gVCF with GATK CombineGVCFs.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/org_broadinstitute_hellbender_tools_walkers_CombineGVCFs.php
    #combined multi-sample gVCF:
    gatk CombineGVCFs -R reference.fasta --variant sample1.g.vcf.gz --variant sample2.g.vcf.gz -O cohort.g.vcf.gz

    Every ``*.g.vcf`` found under the GVCF directory is passed as a
    ``--variant``; the result is written to ``<group>.cohort.g.vcf`` in that
    same directory, where ``<group>`` is the last component of ``args.output``.

    Args:
        args: namespace exposing ``output``, ``reference`` and ``memory``; it
            is also forwarded to ``obtain_output_dir``.
        recalibrate: use the ``GVCF_recal`` folder instead of ``GVCF``.
        all_gvcf: optional extra directory whose ``*.g.vcf`` files are added
            to the cohort.  A falsy value disables it.
    """
    output = os.path.abspath(args.output)
    input_reference = os.path.abspath(args.reference)

    group_name = os.path.basename(output)

    if recalibrate:
        gvcf_input_dir = obtain_output_dir(args, "GVCF_recal")
    else:
        gvcf_input_dir = obtain_output_dir(args, "GVCF")

    gvcf_output_file = group_name + ".cohort.g.vcf"
    gvcf_output_full = os.path.join(gvcf_input_dir, gvcf_output_file)

    check_create_dir(gvcf_input_dir)

    memory_param = "-Xmx" + str(args.memory) + "g"

    cmd = [
        "gatk", "CombineGVCFs", "--java-options", memory_param, "--reference",
        input_reference, "--output", gvcf_output_full
    ]

    # The cohort file lives in the directory being scanned and also ends in
    # ".g.vcf": skip it so a previous run's output is not fed back as input.
    for filename in _find_gvcf_files(gvcf_input_dir, skip=gvcf_output_full):
        cmd.append("--variant")
        cmd.append(filename)

    if all_gvcf:
        if os.path.isdir(all_gvcf):
            all_gvcf = os.path.abspath(all_gvcf)
            print("Using gvcf from enricment folder:" + all_gvcf)
            for filename in _find_gvcf_files(all_gvcf, skip=gvcf_output_full):
                cmd.append("--variant")
                cmd.append(filename)
        else:
            print("GVCF enrichment folder does not exist")

    execute_subprocess(cmd)
def call_variants(args, recalibrate=False, group=True):
    """Genotype a gVCF into a raw VCF with GATK GenotypeGVCFs.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_GenotypeGVCFs.php
    #Call variants:
    gatk --java-options "-Xmx4g" GenotypeGVCFs -R Homo_sapiens_assembly38.fasta -V input.g.vcf.gz -O output.vcf.gz

    Args:
        args: namespace exposing ``output``, ``reference``, ``sample`` and
            ``memory``; it is also forwarded to ``obtain_output_dir``.  A
            falsy ``args.sample`` is replaced in place by ``"nosample"``.
        recalibrate: read from ``GVCF_recal`` / write to ``VCF_recal``
            instead of ``GVCF`` / ``VCF``.
        group: genotype the cohort file (``<group>.cohort.g.vcf``) rather
            than the per-sample one (``<sample>.g.vcf``).
    """
    out_root = os.path.abspath(args.output)
    ref_fasta = os.path.abspath(args.reference)

    if not args.sample:
        args.sample = "nosample"

    # Input/output folders depend only on the recalibration round.
    folder_suffix = "_recal" if recalibrate else ""
    gvcf_dir = obtain_output_dir(args, "GVCF" + folder_suffix)
    vcf_dir = obtain_output_dir(args, "VCF" + folder_suffix)

    # File stem: "<group>.cohort" for the cohort, otherwise the sample name.
    if group:
        stem = out_root.rsplit("/", 1)[-1] + ".cohort"
    else:
        stem = args.sample

    gvcf_path = os.path.join(gvcf_dir, stem + ".g.vcf")
    vcf_path = os.path.join(vcf_dir, stem + ".raw.vcf")

    check_create_dir(gvcf_dir)
    check_create_dir(vcf_dir)

    genotype_cmd = ["gatk", "GenotypeGVCFs"]
    genotype_cmd += ["--java-options", "-Xmx" + str(args.memory) + "g"]
    genotype_cmd += ["--reference", ref_fasta]
    genotype_cmd += ["--variant", gvcf_path]
    genotype_cmd += ["--output", vcf_path]

    execute_subprocess(genotype_cmd)
def sam_to_index_bam(args):
    """Convert a sample's SAM to BAM, then hand the BAM to ``add_SG``.

    Works on ``<sample>.sam`` in the directory returned by
    ``obtain_output_dir(args, "Bam")``:

    1. ``samtools view -Sb`` writes ``<sample>.bam``.
    2. The SAM file is removed.
    3. ``add_SG`` is called with the BAM and ``<sample>.rg.sorted.bam`` as
       its output path.
    4. The intermediate ``<sample>.bam`` is removed.

    Args:
        args: namespace exposing ``r1_file``, ``r2_file`` and ``threads``; it
            is also forwarded to ``obtain_output_dir`` and ``add_SG``.
    """
    fastq_r1 = os.path.abspath(args.r1_file)
    fastq_r2 = os.path.abspath(args.r2_file)

    sample = extract_sample(fastq_r1, fastq_r2)
    bam_dir = obtain_output_dir(args, "Bam")
    sam_path = os.path.join(bam_dir, sample + ".sam")

    # File stem = SAM basename without its last extension.
    stem = os.path.basename(sam_path).rpartition(".")[0]
    bam_path = os.path.join(bam_dir, stem + ".bam")
    rg_sorted_path = os.path.join(bam_dir, stem + ".rg.sorted.bam")

    check_create_dir(bam_dir)

    view_cmd = ["samtools", "view", "-Sb", sam_path]
    view_cmd += ["-o", bam_path]
    view_cmd += ["--threads", str(args.threads)]
    execute_subprocess(view_cmd)

    check_remove_file(sam_path)
    add_SG(args, bam_path, rg_sorted_path)
    check_remove_file(bam_path)
def picard_markdup(args):
    """Mark duplicates with Picard, then sort and index the result.

    java -jar picard.jar MarkDuplicates \
      I=input.bam O=marked_duplicates.bam M=marked_dup_metrics.txt

    Starting from ``args.input_bam`` (``<dir>/<sample>.<anything>``) this
    writes ``<dir>/<sample>.rg.markdup.bam``, sorts it into
    ``<dir>/<sample>.rg.markdup.sorted.bam``, indexes the sorted file and
    finally removes both the input BAM and the unsorted markdup BAM.
    Duplication metrics go to ``<sample>.markdup.metrics.txt`` in the
    directory returned by ``obtain_output_dir(args, "Stats")``.

    Args:
        args: namespace exposing ``input_bam``; it is also forwarded to
            ``obtain_output_dir``.

    Raises:
        subprocess.CalledProcessError: if ``samtools index`` fails.
    """
    picard_jar = get_picard_path()

    input_bam = os.path.abspath(args.input_bam)
    in_param = "I=" + input_bam

    # Take the sample name from the file name only: splitting the whole
    # absolute path on "." would cut at a dot inside a directory name.
    bam_dir, bam_basename = os.path.split(input_bam)
    file_name = bam_basename.split(".")[0]
    path_file_name = os.path.join(bam_dir, file_name)

    output_markdup = path_file_name + ".rg.markdup.bam"
    output_markdup_sorted = path_file_name + ".rg.markdup.sorted.bam"
    out_param = "O=" + output_markdup

    stat_output_dir = obtain_output_dir(args, "Stats")
    stat_output_file = file_name + ".markdup.metrics.txt"
    stat_output_full = os.path.join(stat_output_dir, stat_output_file)
    stats_param = "M=" + stat_output_full

    check_create_dir(stat_output_dir)

    cmd_markdup = [
        "java", "-jar", picard_jar, "MarkDuplicates", in_param, out_param,
        stats_param
    ]
    execute_subprocess(cmd_markdup)

    # samtools sort
    cmd_sort = [
        "samtools", "sort", output_markdup, "-o", output_markdup_sorted
    ]
    execute_subprocess(cmd_sort)

    # samtools index
    subprocess.run(["samtools", "index", output_markdup_sorted],
                   stdout=subprocess.PIPE,
                   stderr=subprocess.PIPE,
                   check=True)

    check_remove_file(input_bam)
    check_remove_file(output_markdup)
def fastp_trimming(r1, r2, sample, output_dir, threads=6, min_qual=20, window_size=10, min_len=35):
    """Trim a read pair with fastp and keep its HTML/JSON reports.

    Trimmed reads are written to ``<output_dir>/<sample>.trimmed_R{1,2}.fastq.gz``;
    reports go to the ``html`` and ``json`` sub-folders of ``output_dir``.

    Args:
        r1: path to the R1 reads.
        r2: path to the R2 reads.
        sample: sample name used as file-name prefix.
        output_dir: destination directory (created if needed via
            ``check_create_dir``).
        threads: value for ``--thread``.
        min_qual: value for ``--cut_mean_quality``.
        window_size: value for ``--cut_window_size``.
        min_len: value for ``--length_required``.
    """
    check_create_dir(output_dir)
    trimmed_r1 = os.path.join(output_dir, sample + ".trimmed_R1.fastq.gz")
    trimmed_r2 = os.path.join(output_dir, sample + ".trimmed_R2.fastq.gz")

    # One sub-folder per report format.
    html_dir = os.path.join(output_dir, 'html')
    json_dir = os.path.join(output_dir, 'json')
    check_create_dir(html_dir)
    check_create_dir(json_dir)

    html_report = os.path.join(html_dir, sample + '_fastp.html')
    json_report = os.path.join(json_dir, sample + '_fastp.json')

    fastp_cmd = ['fastp']
    fastp_cmd += ['--in1', r1, '--in2', r2]
    fastp_cmd += ['--out1', trimmed_r1, '--out2', trimmed_r2]
    fastp_cmd += ['--detect_adapter_for_pe', '--cut_tail']
    fastp_cmd += ['--cut_window_size', str(window_size)]
    fastp_cmd += ['--cut_mean_quality', str(min_qual)]
    fastp_cmd += ['--length_required', str(min_len)]
    fastp_cmd += ['--json', json_report, '--html', html_report]
    fastp_cmd += ['--thread', str(threads)]

    execute_subprocess(fastp_cmd)
def ivar_variants(reference, input_bam, output_variant, sample, annotation, min_quality=20, min_frequency_threshold=0.8, min_depth=20):
    """Call variants by piping ``samtools mpileup`` into ``ivar variants``.

    Usage: samtools mpileup -aa -A -d 0 -B -Q 0 --reference [<reference-fasta] <input.bam> | ivar variants -p <prefix> [-q <min-quality>] [-t <min-frequency-threshold>] [-m <minimum depth>] [-r <reference-fasta>] [-g GFF file]
    Note : samtools mpileup output must be piped into ivar variants

    ivar input options:
        -q  Minimum quality score threshold to count base (ivar default: 20)
        -t  Minimum frequency threshold (0 - 1) to call variants (ivar default: 0.03)
        -m  Minimum read depth to call variants (ivar default: 0)
        -r  Reference file used for alignment. This is used to translate the
            nucleotide sequences and identify intra host single nucleotide
            variants
        -g  A GFF file in the GFF3 format can be supplied to specify
            coordinates of open reading frames (ORFs). In absence of GFF
            file, amino acid translation will not be done.
    ivar output options:
        -p  (Required) Prefix for the output tsv variant file

    Args:
        reference: reference FASTA path (used for mpileup and ``-r``).
        input_bam: BAM file to pile up.
        output_variant: parent directory; results go to its ``ivar_raw``
            sub-folder, with ``sample`` as the file prefix.
        sample: sample name.
        annotation: GFF3 path passed to ``-g``.
        min_quality: value for ``-q``.
        min_frequency_threshold: value for ``-t``.
        min_depth: value for ``-m``.
    """
    # Function-scope import keeps this block self-contained.
    import shlex

    ivar_folder = os.path.join(output_variant, 'ivar_raw')
    check_create_dir(ivar_folder)
    prefix = ivar_folder + '/' + sample

    raw_params = {
        'reference': reference,
        'input_bam': input_bam,
        'prefix': prefix,
        'min_quality': min_quality,
        'min_frequency_threshold': min_frequency_threshold,
        'min_depth': min_depth,
        'annotation': annotation
    }
    # The pipeline runs through a shell, so every value is quoted: paths with
    # spaces or shell metacharacters would otherwise break (or hijack) it.
    quoted_params = {key: shlex.quote(str(value))
                     for key, value in raw_params.items()}

    cmd = ("samtools mpileup -aa -A -d 0 -B -Q 0 --reference {reference} {input_bam} | "
           "ivar variants -p {prefix} -q {min_quality} -t {min_frequency_threshold} "
           "-m {min_depth} -r {reference} -g {annotation}").format(**quoted_params)

    execute_subprocess(cmd, isShell=True)
def replace_reference(input_vcf, output, ref_old=False, ref_new="Chromosome"):
    """Copy a VCF, replacing the reference (chromosome) name in every line.

    Each occurrence of ``ref_old`` followed by a tab is replaced by
    ``ref_new`` followed by a tab, so only tab-terminated fields are touched.

    190909 - Function now uses chromosome name in file and replaces it with
    the term provided (default "Chromosome").

    Args:
        input_vcf: path of the VCF to read.
        output: path of the VCF to write; its directory is created through
            ``check_create_dir``.
        ref_old: name to replace.  When falsy it is obtained from the input
            file with ``extract_reference_vcf``.
        ref_new: replacement name.
    """
    input_file = os.path.abspath(input_vcf)
    output_file = os.path.abspath(output)
    # Derive the directory from the absolute path: for a bare file name
    # os.path.dirname(output) would be the empty string.
    output_dir = os.path.dirname(output_file)

    check_create_dir(output_dir)

    if not ref_old:
        ref_old = extract_reference_vcf(input_file)

    # Loop-invariant search/replacement strings.
    old_field = ref_old + "\t"
    new_field = ref_new + "\t"

    with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
        for line in fi:
            fo.write(line.replace(old_field, new_field))
def main():
    """Run the whole pipeline for every sample found in the input directory.

    Steps, in order:

    1. Set up file + console logging under ``<output>/Logs``.
    2. List the read pairs in ``args.input_dir`` and select the samples to
       analyse (all of them, or those listed in ``args.sample_list``).
    3. Per sample: fastqc, snippy variant calling, indel extraction and VCF
       merge, conversion to ivar TSV, mash species screening, bamstats and
       coverage.  Every step is skipped when its output file already exists.
    4. Per group: coverage and overall reports, low-quality sample removal,
       optional snippy-core, annotation (snpEff / user BED-VCF / user AA /
       BLAST / HTML) and the SNP comparison matrices.

    Wrapping the pipeline in a function lets uncaught errors be captured:
    https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python
    """

    args = get_arguments()

    ######################################################################
    #####################START PIPELINE###################################
    ######################################################################
    output = os.path.abspath(args.output)
    group_name = output.split("/")[-1]
    reference = os.path.abspath(args.reference)
    #annotation = os.path.abspath(args.annotation)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    # Everything (DEBUG and up) goes to the file, INFO and up to the console.
    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    # stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.info("\n\n" + BLUE + BOLD + "STARTING PIPELINE IN GROUP: " +
                group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    sample_list_F = []
    if args.sample_list is None:
        logger.info("\n" + "No samples to filter")
        for r1_file, r2_file in zip(r1, r2):
            sample = extract_sample(r1_file, r2_file)
            sample_list_F.append(sample)
    else:
        logger.info("samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d samples will be analysed: %s" %
                (len(sample_list_F), ",".join(sample_list_F)))
    logger.info("\n%d NEW samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))

    #DECLARE FOLDERS CREATED IN PIPELINE ################
    #AND KEY FILES ######################################
    #####################################################
    # Annotation related parameters
    #script_dir = os.path.dirname(os.path.realpath(__file__))

    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_variant_dir = os.path.join(output, "Variants")
    out_core_dir = os.path.join(output, "Core")

    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(
        out_stats_dir, "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(
        out_stats_dir, "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")

    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(out_annot_dir, "user_aa")  # subfolder
    out_annot_blast_dir = os.path.join(out_annot_dir, "blast")  # subfolder

    out_species_dir = os.path.join(output, "Species")

    new_sample_number = 0
    for r1_file, r2_file in zip(r1, r2):
        # Extract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample
        if sample in sample_list_F:
            # VARIANT SAMPLE DIR
            sample_variant_dir = os.path.join(out_variant_dir, sample)

            sample_number = str(sample_list_F.index(sample) + 1)
            sample_total = str(len(sample_list_F))
            if sample in new_samples:
                new_sample_number += 1
                new_sample_total = str(len(new_samples))
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            " (" + str(new_sample_number) + "/" +
                            new_sample_total + ")" + END_FORMATTING)
            else:
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            END_FORMATTING)

            output_final_vcf = os.path.join(
                sample_variant_dir, 'snps.all.ivar.tsv')

            # The ivar TSV is the last product of the variant chain; when it
            # already exists the whole chain is skipped for this sample.
            if not os.path.isfile(output_final_vcf):
                ##############START PIPELINE#####################
                #################################################

                # INPUT ARGUMENTS
                ################
                # check_file_exists(r1_file)
                # check_file_exists(r2_file)

                args.output = os.path.abspath(args.output)
                check_create_dir(args.output)

                # QUALITY CHECK in RAW with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_raw_name_r1 = (".").join(r1_file.split(
                    '/')[-1].split('.')[0:-2]) + '_fastqc.html'
                out_qc_raw_name_r2 = (".").join(r2_file.split(
                    '/')[-1].split('.')[0:-2]) + '_fastqc.html'
                output_qc_raw_file_r1 = os.path.join(
                    out_qc_pre_dir, out_qc_raw_name_r1)
                output_qc_raw_file_r2 = os.path.join(
                    out_qc_pre_dir, out_qc_raw_name_r2)

                if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(output_qc_raw_file_r2):
                    logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                                " EXIST\nOmmiting QC for sample " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Checking quality in sample " + sample + END_FORMATTING)
                    logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                    fastqc_quality(r1_file, r2_file,
                                   out_qc_pre_dir, args.threads)

                # TODO: Human filter

                # VARIANT CALLING WITH SNIPPY
                ###################################################
                output_vcf_sub = os.path.join(
                    sample_variant_dir, "snps.subs.vcf")
                output_vcf = os.path.join(sample_variant_dir, "snps.vcf")

                if os.path.isfile(output_vcf_sub) and os.path.isfile(output_vcf):
                    logger.info(YELLOW + DIM + output_vcf +
                                " EXIST\nOmmiting Variant calling in " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Calling variants with snippy " + sample + END_FORMATTING)
                    run_snippy(r1_file, r2_file, reference, out_variant_dir, sample,
                               threads=args.threads, minqual=10, minfrac=0.1, mincov=1)
                    # Rename snippy's generic BAM/BAI after the sample.
                    old_bam = os.path.join(sample_variant_dir, "snps.bam")
                    old_bai = os.path.join(sample_variant_dir, "snps.bam.bai")
                    new_bam = os.path.join(sample_variant_dir, sample + ".bam")
                    new_bai = os.path.join(
                        sample_variant_dir, sample + ".bam.bai")
                    os.rename(old_bam, new_bam)
                    os.rename(old_bai, new_bai)

                #VARIANT FORMAT COMBINATION (REMOVE COMPLEX) ########
                #####################################################
                out_variant_indel_sample = os.path.join(
                    sample_variant_dir, "snps.indel.vcf")
                out_variant_all_sample = os.path.join(
                    sample_variant_dir, "snps.all.vcf")

                if os.path.isfile(out_variant_indel_sample):
                    logger.info(YELLOW + DIM + out_variant_indel_sample +
                                " EXIST\nOmmiting indel filtering in sample " + sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Filtering INDELS in " +
                                sample + END_FORMATTING)
                    extract_indels(output_vcf)

                if os.path.isfile(out_variant_all_sample):
                    logger.info(YELLOW + DIM + out_variant_all_sample +
                                " EXIST\nOmmiting vcf combination in sample " + sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Combining vcf in " +
                                sample + END_FORMATTING)
                    merge_vcf(output_vcf_sub, out_variant_indel_sample)

                #VARIANT FORMAT ADAPTATION TO IVAR ##################
                #####################################################
                out_variant_tsv_file = os.path.join(
                    sample_variant_dir, 'snps.all.ivar.tsv')

                if os.path.isfile(out_variant_tsv_file):
                    logger.info(YELLOW + DIM + out_variant_tsv_file +
                                " EXIST\nOmmiting format adaptation for sample " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Adapting variants format in sample " + sample + END_FORMATTING)
                    prior = datetime.datetime.now()
                    vcf_to_ivar_tsv(out_variant_all_sample,
                                    out_variant_tsv_file)
                    after = datetime.datetime.now()
                    print(("Done with function in: %s" % (after - prior)))

            # SPECIES DETERMINATION
            ###################################################
            check_create_dir(out_species_dir)

            output_species = os.path.join(
                out_species_dir, sample + ".screen.tab")

            if os.path.isfile(output_species):
                logger.info(YELLOW + DIM + output_species +
                            " EXIST\nOmmiting Species determinatin in " + sample + END_FORMATTING)
            else:
                logger.info(
                    GREEN + "Determining species in " + sample + END_FORMATTING)
                mash_screen(r1_file, out_species_dir, r2_file=r2_file, winner=True,
                            threads=args.threads, mash_database=args.mash_database)

            ########################CREATE STATS AND QUALITY FILTERS###
            ###########################################################
            #CREATE Bamstats#######################################
            #######################################################
            check_create_dir(out_stats_dir)
            check_create_dir(out_stats_bamstats_dir)
            out_bamstats_name = sample + ".bamstats"
            out_bamstats_file = os.path.join(
                out_stats_bamstats_dir, out_bamstats_name)
            bam_sample_file = os.path.join(sample_variant_dir, sample + ".bam")

            if os.path.isfile(out_bamstats_file):
                logger.info(YELLOW + DIM + out_bamstats_file +
                            " EXIST\nOmmiting Bamstats for sample " + sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating bamstats in sample " +
                            sample + END_FORMATTING)
                create_bamstat(
                    bam_sample_file, out_stats_bamstats_dir, sample, threads=args.threads)

            #CREATE Coverage#######################################
            #######################################################
            check_create_dir(out_stats_coverage_dir)
            out_coverage_name = sample + ".cov"
            out_coverage_file = os.path.join(
                out_stats_coverage_dir, out_coverage_name)

            if os.path.isfile(out_coverage_file):
                logger.info(YELLOW + DIM + out_coverage_file +
                            " EXIST\nOmmiting Bamstats for sample " + sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating coverage in sample " +
                            sample + END_FORMATTING)
                create_coverage(bam_sample_file,
                                out_stats_coverage_dir, sample)

    # coverage OUTPUT SUMMARY
    ######################################################
    prior_recal = datetime.datetime.now()
    logger.info(GREEN + "Creating summary report for coverage result in group " +
                group_name + END_FORMATTING)
    obtain_group_cov_stats(out_stats_dir, group_name)
    after_recal = datetime.datetime.now()
    logger.info("Done with report for coverage: %s" %
                (after_recal - prior_recal))

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overal summary report in group " +
                group_name + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED
    ######################################################
    logger.info(GREEN + "Removing low quality samples in group " +
                group_name + END_FORMATTING)
    uncovered_samples = remove_low_quality(
        output, min_coverage=args.coverage20, min_hq_snp=args.min_snp, type_remove='Uncovered')

    # Report as soon as there is at least one uncovered sample (a "> 1" test
    # would announce "NO uncovered samples" when exactly one was found).
    if len(uncovered_samples) > 0:
        logger.info(GREEN + "Uncovered samples: " +
                    (",").join(uncovered_samples) + END_FORMATTING)
    else:
        logger.info(GREEN + "NO uncovered samples found" + END_FORMATTING)

    # RUN SNIPPY CORE
    ######################################################
    if args.core:
        check_create_dir(out_core_dir)
        logger.info(GREEN + "Running snippy-core " +
                    group_name + END_FORMATTING)
        run_snippy_core(out_variant_dir, out_core_dir, reference)

        logger.info(GREEN + "Adapting core-snp to compare format " +
                    group_name + END_FORMATTING)
        core_vcf_file = os.path.join(out_core_dir, "core.vcf")
        core_vcf_file_adapted = os.path.join(
            out_core_dir, "core.vcf.adapted.tsv")
        core_vcf_file_removed = os.path.join(
            out_core_dir, "core.vcf.adapted.final.tsv")

        core_vcf_df_adapted = import_VCF4_core_to_compare(core_vcf_file)
        core_vcf_df_adapted.to_csv(
            core_vcf_file_adapted, sep="\t", index=False)

        logger.info(GREEN + "Obtaining clustered positions " +
                    group_name + END_FORMATTING)
        close_positions_list = extract_close_snps(
            core_vcf_df_adapted, snps_in_10=1)
        logger.info(GREEN + "Obtaining uncovered positions " +
                    group_name + END_FORMATTING)
        uncovered_list = identify_uncovered(
            out_stats_coverage_dir, min_coverage=10, nocall_fr=0.5)

        logger.debug('Clustered positions in core SNP:\n{}'.format(
            (",".join([str(x) for x in close_positions_list]))))
        logger.debug('Uncovered positions in all samples:\n{}'.format(
            (",".join([str(x) for x in uncovered_list]))))

        to_remove_list = close_positions_list + uncovered_list

        remove_df = remove_position_from_compare(
            core_vcf_df_adapted, to_remove_list)
        remove_df.to_csv(core_vcf_file_removed, sep="\t", index=False)

        ddtb_compare(core_vcf_file_removed, distance=10)

    #ANNOTATION WITH SNPEFF AND USER INPUT ##############
    #####################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)

    # SNPEFF
    if args.snpeff_database != False:
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.vcf':
                    # The sample name is the folder holding the VCF.
                    sample = root.split('/')[-1]
                    filename = os.path.join(root, name)
                    chrom_filename = os.path.join(
                        root, 'snps.all.chromosome.vcf')
                    out_annot_file = os.path.join(
                        out_annot_snpeff_dir, sample + ".annot")
                    if os.path.isfile(out_annot_file):
                        logger.info(YELLOW + DIM + out_annot_file +
                                    " EXIST\nOmmiting snpEff Annotation for sample " + sample + END_FORMATTING)
                    else:
                        logger.info(
                            GREEN + "Annotating sample with snpEff: " + sample + END_FORMATTING)
                        rename_reference_snpeff(filename, chrom_filename)
                        annotate_snpeff(chrom_filename, out_annot_file,
                                        database=args.snpeff_database)
    else:
        logger.info(YELLOW + DIM + " No SnpEff database suplied, skipping annotation in group " +
                    group_name + END_FORMATTING)

    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(
            YELLOW + BOLD + "Ommiting User Annotation, no BED or VCF files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.ivar.tsv':
                    sample = root.split('/')[-1]
                    logger.info(
                        'User bed/vcf annotation in sample {}'.format(sample))
                    filename = os.path.join(root, name)
                    out_annot_file = os.path.join(
                        out_annot_user_dir, sample + ".tsv")
                    user_annotation(
                        filename, out_annot_file, vcf_files=args.annot_vcf, bed_files=args.annot_bed)

    # USER AA DEFINED
    if not args.annot_aa:
        logger.info(
            YELLOW + BOLD + "Ommiting User aa Annotation, no AA files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            # Only files directly inside the snpeff folder are considered.
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User aa annotation in sample {}'.format(sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        # An existing annotation is re-annotated in place;
                        # otherwise the snpEff output is the starting point.
                        if os.path.isfile(out_annot_aa_file):
                            user_annotation_aa(
                                out_annot_aa_file, out_annot_aa_file, aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(
                                filename, out_annot_aa_file, aa_files=args.annot_aa)

    # USER FASTA ANNOTATION
    if not args.annot_fasta:
        logger.info(
            YELLOW + BOLD + "Ommiting User FASTA Annotation, no FASTA files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_blast_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name.endswith('.consensus.subs.fa'):
                    filename = os.path.join(root, name)
                    sample = root.split('/')[-1]
                    logger.info(
                        'User FASTA annotation in sample {}'.format(sample))
                    # out_annot_aa_file = os.path.join(
                    #    out_annot_user_aa_dir, sample + ".tsv")
                    for db in args.annot_fasta:
                        make_blast(filename, db, sample, out_annot_blast_dir,
                                   db_type="nucl", query_type="nucl", evalue=0.0001, threads=8)

    # USER AA TO HTML
    if not args.annot_aa:
        logger.info(
            YELLOW + BOLD + "Ommiting User aa Annotation to HTML, no AA files supplied" + END_FORMATTING)
    else:
        annotated_samples = []
        logger.info('Adapting annotation to html in {}'.format(group_name))
        for root, _, files in os.walk(out_annot_user_aa_dir):
            if root == out_annot_user_aa_dir:
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        annotated_samples.append(sample)
                        filename = os.path.join(root, name)
                        annotation_to_html(filename, sample)
        annotated_samples = [str(x) for x in annotated_samples]
        report_samples_html_all = report_samples_html.replace(
            'ALLSAMPLES', ('","').join(annotated_samples))  # NEW
        with open(os.path.join(out_annot_user_aa_dir, '00_all_samples.html'), 'w+') as f:
            f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + ".revised_intermediate.tsv"
    compare_snp_matrix_recal_mpileup = full_path_compare + \
        ".revised_intermediate_vcf.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"

    # Create intermediate
    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_dir, out_stats_coverage_dir, min_freq_discard=0.1, min_alt_dp=10, only_snp=False)
    # recalibrated_snp_matrix_intermediate.to_csv(
    #     compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    # Remove SNPs from BED file (PE/PPE)
    if args.remove_bed:
        recalibrated_snp_matrix_intermediate = remove_bed_positions(
            recalibrated_snp_matrix_intermediate, args.remove_bed)

    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    # Recalibrate intermediate with VCF
    prior_recal = datetime.datetime.now()
    recalibrated_snp_matrix_mpileup = recalibrate_ddbb_vcf_intermediate(
        compare_snp_matrix_recal_intermediate, out_variant_dir, min_cov_low_freq=10)
    recalibrated_snp_matrix_mpileup.to_csv(
        compare_snp_matrix_recal_mpileup, sep="\t", index=False)
    after_recal = datetime.datetime.now()
    logger.debug("Done with recalibration vcf: %s" %
                 (after_recal - prior_recal))

    # Remove SNPs located within INDELs
    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_mpileup)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)

    # Extract all positions marked as complex
    complex_variants = extract_complex_list(out_variant_dir)
    logger.debug('Complex positions in all samples:\n{}'.format(
        (",".join([str(x) for x in complex_variants]))))

    # Clean all faulty positions and samples => Final table
    recalibrated_revised_INDEL_df = revised_df(compare_snp_matrix_INDEL_intermediate_df,
                                               path_compare,
                                               complex_pos=complex_variants,
                                               min_freq_include=0.8,
                                               min_threshold_discard_uncov_sample=args.min_threshold_discard_uncov_sample,
                                               min_threshold_discard_uncov_pos=args.min_threshold_discard_uncov_pos,
                                               min_threshold_discard_htz_sample=args.min_threshold_discard_htz_sample,
                                               min_threshold_discard_htz_pos=args.min_threshold_discard_htz_pos,
                                               min_threshold_discard_all_pos=args.min_threshold_discard_all_pos,
                                               min_threshold_discard_all_sample=args.min_threshold_discard_all_sample,
                                               remove_faulty=True,
                                               drop_samples=True,
                                               drop_positions=True,
                                               windows_size_discard=args.window)
    recalibrated_revised_INDEL_df.to_csv(
        compare_snp_matrix_recal, sep="\t", index=False)

    # Matrix to pairwise and mwk
    ddtb_compare(compare_snp_matrix_recal, distance=5)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    logger.info("\n\n" + MAGENTA + BOLD +
                "#####END OF PIPELINE AUTOSNIPPY ANALYSIS#####" + END_FORMATTING + "\n")
# NOTE(review): this run of statements reads `r1_file`, `r2_file`, `sample`
# and `out_trim_dir` from an enclosing scope that is not visible here --
# presumably the body of a per-sample loop; confirm against the full file.

# Store the current read pair on the shared args namespace.
args.r1_file = r1_file
args.r2_file = r2_file

print("\n" + WHITE_BG + "STARTING SAMPLE: " + sample + END_FORMATTING)

##############START PIPELINE#####################
#################################################

#INPUT ARGUMENTS
################
check_file_exists(args.r1_file)
check_file_exists(args.r2_file)

# From here on args.output holds an absolute path.
args.output = os.path.abspath(args.output)
check_create_dir(args.output)

#QUALITY CHECK
##############
""" TODO: Quality check """

#QUALITY TRIMMING AND ADAPTER REMOVAL WITH bbduk.sh
###################################################
# Expected names of the trimmed read files inside out_trim_dir.
out_trim_name_r1 = sample + "_R1.clean.fastq.gz"
out_trim_name_r2 = sample + "_R2.clean.fastq.gz"
output_trimming_file_r1 = os.path.join(out_trim_dir, out_trim_name_r1)
output_trimming_file_r2 = os.path.join(out_trim_dir, out_trim_name_r2)
def haplotype_caller(args, recalibrate=False, ploidy=2, bamout=False, forceactive=False, intervals=False): #base_quality=13, """ #No excuses https://software.broadinstitute.org/gatk/documentation/article?id=11081 """ #input_bam = os.path.abspath(args.input_bam) input_reference = os.path.abspath(args.reference) bam_output_dir = obtain_output_dir(args, "Bam") #file_name = path_file_name.split("/")[-1] #sample_name file_name = args.sample #path_file_name = os.path.join(output_dir, gvcf_output_file) if recalibrate: input_bam_to_call_name = file_name + ".rg.markdup.sorted.bam" gvcf_output_dir = obtain_output_dir(args, "GVCF_recal") gvcf_output_file = file_name + ".g.vcf" else: input_bam_to_call_name = file_name + ".bqsr.bam" gvcf_output_dir = obtain_output_dir(args, "GVCF") gvcf_output_file = file_name + ".g.vcf" check_create_dir(gvcf_output_dir) input_bam_to_call = os.path.join(bam_output_dir, input_bam_to_call_name) gvcf_output_full = os.path.join(gvcf_output_dir, gvcf_output_file) memory_param = "-Xmx" + str(args.memory) + "g" hc_args = [ "gatk", "HaplotypeCaller", "--java-options", memory_param, "--reference", input_reference, "--input", input_bam_to_call, "--output", gvcf_output_full, "--emit-ref-confidence", "GVCF", "--annotation-group", "AS_StandardAnnotation", "--sample-ploidy", str(ploidy) ] #"--min-base-quality-score", str(base_quality), #Create bam index #cmd_index = ["samtools", "index", input_bam_to_call] #execute_subprocess(cmd_index) if bamout: bamout_output_dir = obtain_output_dir(args, "Bamout") bamout_output_file = file_name + ".p" + str(ploidy) + ".out.bam" bamout_output_full = os.path.join(bamout_output_dir, bamout_output_file) check_create_dir(bamout_output_dir) bamout_params = ["--bam-output", bamout_output_full] hc_args.extend(bamout_params) if forceactive: force_params = ["--force-active", "--disable-optimizations"] hc_args.extend(force_params) execute_subprocess(hc_args) """
def main():
    """
    Run the complete multi-sample pipeline from the command line.

    Create main function to capture code errors:
    https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python

    Stages, in the order they are executed:
      1. Parse arguments and configure logging (file handler at DEBUG in
         ``<output>/Logs``, stream handler at INFO).
      2. For every R1/R2 pair in the input directory whose sample is
         selected: fastqc, fastp trimming, fastqc on trimmed reads, bwa
         mapping, duplicate marking, ivar primer trimming, ivar variant
         calling, variant filtering, ivar consensus, bamstats and coverage.
         Every step is skipped when its output file already exists.
      3. Group summaries (coverage and overall stats).
      4. Annotation: snpEff, user bed/vcf, user amino acid files, pangolin
         lineages and the html report.
      5. SNP comparison matrices and the refined consensus.

    Takes no parameters (everything is read from ``sys.argv`` through
    argparse) and returns None.
    """

    # ARGUMENTS

    def get_arguments():
        """Build the argument parser and return the parsed namespace."""
        parser = argparse.ArgumentParser(
            prog='covidma.py',
            description=
            'Pipeline to call variants (SNVs) with any non model organism. Specialised in SARS-CoV-2'
        )

        input_group = parser.add_argument_group('Input', 'Input parameters')

        input_group.add_argument(
            '-i',
            '--input',
            dest="input_dir",
            metavar="input_directory",
            type=str,
            required=True,
            help='REQUIRED.Input directory containing all fast[aq] files')
        input_group.add_argument('-r',
                                 '--reference',
                                 metavar="reference",
                                 type=str,
                                 required=True,
                                 help='REQUIRED. File to map against')
        input_group.add_argument(
            '-a',
            '--annotation',
            metavar="annotation",
            type=str,
            required=True,
            help='REQUIRED. gff3 file to annotate variants')
        input_group.add_argument('-s',
                                 '--sample',
                                 metavar="sample",
                                 type=str,
                                 required=False,
                                 help='Sample to identify further files')
        input_group.add_argument(
            '-L',
            '--sample_list',
            type=str,
            required=False,
            help='Sample names to analyse only in the file supplied')
        input_group.add_argument(
            '-p',
            '--primers',
            type=str,
            default=
            '/home/laura/DATABASES/Anotacion/COVID/primers/nCoV-2019.bed',
            required=False,
            help='Bed file including primers to trim')

        quality_group = parser.add_argument_group(
            'Quality parameters', 'parameters for diferent triming conditions')

        quality_group.add_argument(
            '-c',
            '--coverage20',
            type=int,
            default=90,
            required=False,
            help=
            'Minimum percentage of coverage at 20x to clasify as uncovered (Default 90)'
        )
        quality_group.add_argument('-n',
                                   '--min_snp',
                                   type=int,
                                   required=False,
                                   default=1,
                                   help='SNP number to pass quality threshold')

        output_group = parser.add_argument_group(
            'Output', 'Required parameter to output results')

        output_group.add_argument(
            '-o',
            '--output',
            type=str,
            required=True,
            help='REQUIRED. Output directory to extract all results')
        output_group.add_argument(
            '-C',
            '--noclean',
            required=False,
            action='store_false',
            help='Clean unwanted files for standard execution')

        params_group = parser.add_argument_group(
            'Parameters', 'parameters for diferent stringent conditions')

        # type=int (not str): the value is handed to ThreadPoolExecutor as
        # max_workers, which rejects a string with a TypeError.
        params_group.add_argument('-T',
                                  '--threads',
                                  type=int,
                                  dest="threads",
                                  required=False,
                                  default=16,
                                  help='Threads to use')
        params_group.add_argument('-M',
                                  '--memory',
                                  type=str,
                                  dest="memory",
                                  required=False,
                                  default=32,
                                  help='Max memory to use')

        annot_group = parser.add_argument_group(
            'Annotation', 'parameters for variant annotation')

        annot_group.add_argument('-B',
                                 '--annot_bed',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='bed file to annotate')
        annot_group.add_argument('-V',
                                 '--annot_vcf',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='vcf file to annotate')
        annot_group.add_argument('-A',
                                 '--annot_aa',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='aminoacid file to annotate')
        annot_group.add_argument('-R',
                                 '--remove_bed',
                                 type=str,
                                 default=False,
                                 required=False,
                                 help='BED file with positions to remove')
        annot_group.add_argument(
            '--mash_database',
            type=str,
            required=False,
            default=False,
            help='MASH ncbi annotation containing all species database')
        annot_group.add_argument('--snpeff_database',
                                 type=str,
                                 required=False,
                                 default='NC_045512.2',
                                 help='snpEFF annotation database')

        compare_group = parser.add_argument_group(
            'Compare', 'parameters for compare_snp')

        compare_group.add_argument('-S',
                                   '--only_snp',
                                   required=False,
                                   action='store_true',
                                   help='Use INDELS while comparing')

        arguments = parser.parse_args()

        return arguments

    args = get_arguments()

    ######################################################################
    #####################START PIPELINE###################################
    ######################################################################
    output = os.path.abspath(args.output)
    # The group is named after the last component of the output path
    group_name = output.split("/")[-1]
    reference = os.path.abspath(args.reference)
    annotation = os.path.abspath(args.annotation)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    # Everything (DEBUG and above) goes to the log file with a timestamp
    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    # Only INFO and above is echoed to the console, without timestamp
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    # stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.info("\n\n" + BLUE + BOLD + "STARTING PIPELINE IN GROUP: " +
                group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    sample_list_F = []
    if args.sample_list is None:
        logger.info("\n" + "No samples to filter")
        for r1_file, r2_file in zip(r1, r2):
            sample = extract_sample(r1_file, r2_file)
            sample_list_F.append(sample)
    else:
        logger.info("samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))

    #PREPARE REFERENCE FOR MAPPING + FAI + DICT #########
    #####################################################

    # picard_dictionary(args)
    samtools_faidx(args)

    #DECLARE FOLDERS CREATED IN PIPELINE ################
    #AND KEY FILES ######################################
    #####################################################
    # Annotation related parameters
    # script_dir = os.path.dirname(os.path.realpath(__file__))

    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_qc_post_dir = os.path.join(out_qc_dir, "processed")  # subfolder
    out_trim_dir = os.path.join(output, "Trimmed")
    out_map_dir = os.path.join(output, "Bam")
    out_variant_dir = os.path.join(output, "Variants")
    out_variant_ivar_dir = os.path.join(out_variant_dir,
                                        "ivar_raw")  # subfolder
    out_filtered_ivar_dir = os.path.join(out_variant_dir,
                                         "ivar_filtered")  # subfolder
    out_consensus_dir = os.path.join(output, "Consensus")
    out_consensus_ivar_dir = os.path.join(out_consensus_dir,
                                          "ivar")  # subfolder

    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(out_stats_dir,
                                          "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(out_stats_dir,
                                          "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")

    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_pangolin_dir = os.path.join(out_annot_dir,
                                          "pangolin")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(out_annot_dir,
                                         "user_aa")  # subfolder

    new_sample_number = 0

    for r1_file, r2_file in zip(r1, r2):
        # EXtract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample
        if sample in sample_list_F:

            sample_number = str(sample_list_F.index(sample) + 1)
            sample_total = str(len(sample_list_F))

            # Final bam of the mapping stage; its presence means the whole
            # mapping/bam manipulation stage can be skipped for this sample
            out_markdup_trimmed_name = sample + ".rg.markdup.trimmed.sorted.bam"
            output_markdup_trimmed_file = os.path.join(
                out_map_dir, out_markdup_trimmed_name)

            if sample in new_samples:
                new_sample_number = str(int(new_sample_number) + 1)
                new_sample_total = str(len(new_samples))
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            " (" + new_sample_number + "/" +
                            new_sample_total + ")" + END_FORMATTING)
            else:
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            END_FORMATTING)

            if not os.path.isfile(output_markdup_trimmed_file):

                args.r1_file = r1_file
                args.r2_file = r2_file

                ##############START PIPELINE#####################
                #################################################

                # INPUT ARGUMENTS
                ################
                check_file_exists(r1_file)
                check_file_exists(r2_file)

                args.output = os.path.abspath(args.output)
                check_create_dir(args.output)

                # QUALITY CHECK in RAW with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                # Report name: read file name without its last two extensions
                out_qc_raw_name_r1 = (".").join(
                    r1_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                out_qc_raw_name_r2 = (".").join(
                    r2_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                output_qc_raw_file_r1 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r1)
                output_qc_raw_file_r2 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r2)

                if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(
                        output_qc_raw_file_r2):
                    logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                                " EXIST\nOmmiting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN + "Checking quality in sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                    fastqc_quality(r1_file, r2_file, out_qc_pre_dir,
                                   args.threads)
                """
                TODO: Human filter
                """

                # QUALITY TRIMMING AND ADAPTER REMOVAL WITH fastp
                ###################################################
                out_trim_name_r1 = sample + ".trimmed_R1.fastq.gz"
                out_trim_name_r2 = sample + ".trimmed_R2.fastq.gz"
                output_trimming_file_r1 = os.path.join(out_trim_dir,
                                                       out_trim_name_r1)
                output_trimming_file_r2 = os.path.join(out_trim_dir,
                                                       out_trim_name_r2)

                if os.path.isfile(output_trimming_file_r1) and os.path.isfile(
                        output_trimming_file_r2):
                    logger.info(YELLOW + DIM + output_trimming_file_r1 +
                                " EXIST\nOmmiting Trimming for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming sample " + sample +
                                END_FORMATTING)
                    fastp_trimming(r1_file,
                                   r2_file,
                                   sample,
                                   out_trim_dir,
                                   threads=args.threads,
                                   min_qual=20,
                                   window_size=10,
                                   min_len=35)

                # QUALITY CHECK in TRIMMED with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_pos_r1 = sample + ".trimmed_R1_fastqc.html"
                out_qc_pos_r2 = sample + ".trimmed_R2_fastqc.html"
                output_qc_precessed_file_r1 = os.path.join(
                    out_qc_post_dir, out_qc_pos_r1)
                output_qc_precessed_file_r2 = os.path.join(
                    out_qc_post_dir, out_qc_pos_r2)

                if os.path.isfile(
                        output_qc_precessed_file_r1) and os.path.isfile(
                            output_qc_precessed_file_r2):
                    # Report the processed-reads QC file that was actually
                    # found (the raw QC path was logged here by mistake)
                    logger.info(YELLOW + DIM + output_qc_precessed_file_r1 +
                                " EXIST\nOmmiting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN +
                                "Checking quality in processed sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2)
                    fastqc_quality(output_trimming_file_r1,
                                   output_trimming_file_r2, out_qc_post_dir,
                                   args.threads)

                # MAPPING WITH BWA - SAM TO SORTED BAM - ADD HEADER SG
                #####################################################
                out_map_name = sample + ".rg.sorted.bam"
                output_map_file = os.path.join(out_map_dir, out_map_name)

                if os.path.isfile(output_map_file):
                    logger.info(YELLOW + DIM + output_map_file +
                                " EXIST\nOmmiting Mapping for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Mapping sample " + sample +
                                END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2 + "\nReference: " +
                                reference)
                    bwa_mapping(output_trimming_file_r1,
                                output_trimming_file_r2,
                                reference,
                                sample,
                                out_map_dir,
                                threads=args.threads)
                    sam_to_index_bam(sample,
                                     out_map_dir,
                                     output_trimming_file_r1,
                                     threads=args.threads)

                #MARK DUPLICATES WITH PICARDTOOLS ###################
                #####################################################
                out_markdup_name = sample + ".rg.markdup.sorted.bam"
                output_markdup_file = os.path.join(out_map_dir,
                                                   out_markdup_name)

                if os.path.isfile(output_markdup_file):
                    logger.info(
                        YELLOW + DIM + output_markdup_file +
                        " EXIST\nOmmiting Duplucate Mark for sample " +
                        sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Marking Dupes in sample " + sample +
                                END_FORMATTING)
                    logger.info("Input Bam: " + output_map_file)
                    picard_markdup(output_map_file)

                #TRIM PRIMERS WITH ivar trim ########################
                #####################################################
                if os.path.isfile(output_markdup_trimmed_file):
                    logger.info(
                        YELLOW + DIM + output_markdup_trimmed_file +
                        " EXIST\nOmmiting Duplucate Mark for sample " +
                        sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming primers in sample " +
                                sample + END_FORMATTING)
                    logger.info("Input Bam: " + output_markdup_file)
                    ivar_trim(output_markdup_file,
                              args.primers,
                              sample,
                              min_length=30,
                              min_quality=20,
                              sliding_window_width=4)
            else:
                logger.info(
                    YELLOW + DIM + output_markdup_trimmed_file +
                    " EXIST\nOmmiting BAM mapping and BAM manipulation in sample "
                    + sample + END_FORMATTING)

            ########################END OF MAPPING AND BAM MANIPULATION#####################################################################
            ################################################################################################################################

            #VARIANT CALLING WTIH ivar variants##################
            #####################################################
            check_create_dir(out_variant_dir)
            out_ivar_variant_name = sample + ".tsv"
            out_ivar_variant_file = os.path.join(out_variant_ivar_dir,
                                                 out_ivar_variant_name)

            if os.path.isfile(out_ivar_variant_file):
                logger.info(YELLOW + DIM + out_ivar_variant_file +
                            " EXIST\nOmmiting Variant call for  sample " +
                            sample + END_FORMATTING) if False else logger.info(
                                YELLOW + DIM + out_ivar_variant_file +
                                " EXIST\nOmmiting Variant call for sample " +
                                sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Calling variants with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_variants(reference,
                              output_markdup_trimmed_file,
                              out_variant_dir,
                              sample,
                              annotation,
                              min_quality=15,
                              min_frequency_threshold=0.01,
                              min_depth=1)

            #VARIANT FILTERING ##################################
            #####################################################
            check_create_dir(out_filtered_ivar_dir)
            out_ivar_filtered_file = os.path.join(out_filtered_ivar_dir,
                                                  out_ivar_variant_name)

            if os.path.isfile(out_ivar_filtered_file):
                logger.info(YELLOW + DIM + out_ivar_filtered_file +
                            " EXIST\nOmmiting Variant filtering for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Filtering variants in sample " + sample +
                            END_FORMATTING)
                filter_tsv_variants(out_ivar_variant_file,
                                    out_filtered_ivar_dir,
                                    min_frequency=0.7,
                                    min_total_depth=10,
                                    min_alt_dp=4,
                                    is_pass=True,
                                    only_snp=False)

            #CREATE CONSENSUS with ivar consensus##################
            #######################################################
            check_create_dir(out_consensus_dir)
            check_create_dir(out_consensus_ivar_dir)
            out_ivar_consensus_name = sample + ".fa"
            out_ivar_consensus_file = os.path.join(out_consensus_ivar_dir,
                                                   out_ivar_consensus_name)

            if os.path.isfile(out_ivar_consensus_file):
                logger.info(YELLOW + DIM + out_ivar_consensus_file +
                            " EXIST\nOmmiting Consensus for sample " + sample +
                            END_FORMATTING)
            else:
                logger.info(GREEN +
                            "Creating consensus with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_consensus(output_markdup_trimmed_file,
                               out_consensus_ivar_dir,
                               sample,
                               min_quality=20,
                               min_frequency_threshold=0.8,
                               min_depth=20,
                               uncovered_character='N')
                logger.info(GREEN + "Replacing consensus header in " + sample +
                            END_FORMATTING)
                replace_consensus_header(out_ivar_consensus_file)

            ########################CREATE STATS AND QUALITY FILTERS########################################################################
            ################################################################################################################################

            #CREATE Bamstats#######################################
            #######################################################
            check_create_dir(out_stats_dir)
            check_create_dir(out_stats_bamstats_dir)
            out_bamstats_name = sample + ".bamstats"
            out_bamstats_file = os.path.join(out_stats_bamstats_dir,
                                             out_bamstats_name)

            if os.path.isfile(out_bamstats_file):
                logger.info(YELLOW + DIM + out_bamstats_file +
                            " EXIST\nOmmiting Bamstats for sample " + sample +
                            END_FORMATTING)
            else:
                logger.info(GREEN + "Creating bamstats in sample " + sample +
                            END_FORMATTING)
                create_bamstat(output_markdup_trimmed_file,
                               out_stats_bamstats_dir,
                               sample,
                               threads=args.threads)

            #CREATE Bamstats#######################################
            #######################################################
            check_create_dir(out_stats_coverage_dir)
            out_coverage_name = sample + ".cov"
            out_coverage_file = os.path.join(out_stats_coverage_dir,
                                             out_coverage_name)

            if os.path.isfile(out_coverage_file):
                logger.info(YELLOW + DIM + out_coverage_file +
                            " EXIST\nOmmiting Bamstats for sample " + sample +
                            END_FORMATTING)
            else:
                logger.info(GREEN + "Creating coverage in sample " + sample +
                            END_FORMATTING)
                create_coverage(output_markdup_trimmed_file,
                                out_stats_coverage_dir, sample)

    # fastqc OUTPUT FORMAT FOR COMPARISON
    ######################################################
    logger.info(GREEN + "Creating summary report for quality result " +
                END_FORMATTING)
    # format_html_image(out_qc_dir)

    # coverage OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating summary report for coverage result " +
                END_FORMATTING)
    obtain_group_cov_stats(out_stats_coverage_dir, group_name)

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overal summary report " + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED
    ##############################################################################################################################
    logger.info(GREEN + "Removing low quality samples" + END_FORMATTING)
    # remove_low_quality(output, min_percentage_20x=args.coverage20,
    #                    min_hq_snp=args.min_snp, type_remove='Uncovered')

    #ANNOTATION WITH SNPEFF, USER INOUT AND PANGOLIN ####
    #####################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)
    check_create_dir(out_annot_pangolin_dir)

    # SNPEFF
    if args.snpeff_database != False:
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_filtered_ivar_dir):
            # Only files directly inside the folder, not in subfolders
            if root == out_filtered_ivar_dir:  # CHANGE FOR RAW/FILTERED ANNOTATION
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_snpeff_dir,
                                                      sample + ".annot")
                        if os.path.isfile(out_annot_file):
                            logger.info(
                                YELLOW + DIM + out_annot_file +
                                " EXIST\nOmmiting snpEff Annotation for sample "
                                + sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Annotating sample with snpEff: " +
                                        sample + END_FORMATTING)
                            output_vcf = os.path.join(out_annot_snpeff_dir,
                                                      sample + '.vcf')
                            annotate_snpeff(filename,
                                            output_vcf,
                                            out_annot_file,
                                            database=args.snpeff_database)

    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(YELLOW + BOLD +
                    "Ommiting User Annotation, no BED or VCF files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_variant_ivar_dir):
            if root == out_variant_ivar_dir:  # CHANGE FOR RAW/FILTERED ANNOTATION
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User bed/vcf annotation in sample {}'.format(
                                sample))
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_user_dir,
                                                      sample + ".tsv")
                        user_annotation(filename,
                                        out_annot_file,
                                        vcf_files=args.annot_vcf,
                                        bed_files=args.annot_bed)

    # USER AA DEFINED
    if not args.annot_aa:
        logger.info(YELLOW + BOLD +
                    "Ommiting User aa Annotation, no AA files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User aa annotation in sample {}'.format(sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        # An existing result is used as its own input,
                        # otherwise the snpEff annotation is the input
                        if os.path.isfile(out_annot_aa_file):
                            user_annotation_aa(out_annot_aa_file,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(filename,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)

    # PANGOLIN
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=args.threads) as executor:
        futures_pangolin = []

        for root, _, files in os.walk(out_consensus_ivar_dir):
            if root == out_consensus_ivar_dir:
                for name in files:
                    if name.endswith('.fa'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_pangolin_filename = sample + ".lineage.csv"
                        out_pangolin_file = os.path.join(
                            out_annot_pangolin_dir, out_pangolin_filename)
                        if os.path.isfile(out_pangolin_file):
                            logger.info(
                                YELLOW + DIM + out_pangolin_file +
                                " EXIST\nOmmiting Lineage for  sample " +
                                sample + END_FORMATTING) if False else logger.info(
                                    YELLOW + DIM + out_pangolin_file +
                                    " EXIST\nOmmiting Lineage for sample " +
                                    sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Obtaining Lineage in sample " +
                                        sample + END_FORMATTING)
                            future = executor.submit(annotate_pangolin,
                                                     filename,
                                                     out_annot_pangolin_dir,
                                                     out_pangolin_filename,
                                                     threads=args.threads,
                                                     max_ambig=0.6)
                            futures_pangolin.append(future)

        # Collect every submitted job once, after all of them were queued
        for future in concurrent.futures.as_completed(futures_pangolin):
            logger.info(future.result())
            # annotate_pangolin(filename, out_annot_pangolin_dir,
            #                   out_pangolin_filename, threads=args.threads, max_ambig=0.6)

    # USER AA TO HTML
    annotated_samples = []
    logger.info('Adapting annotation to html in {}'.format(group_name))
    for root, _, files in os.walk(out_annot_user_aa_dir):
        if root == out_annot_user_aa_dir:
            for name in files:
                if name.endswith('.tsv'):
                    sample = name.split('.')[0]
                    annotated_samples.append(sample)
                    filename = os.path.join(root, name)
                    annotation_to_html(filename, sample)
    annotated_samples = [str(x) for x in annotated_samples]
    report_samples_html_all = report_samples_html.replace(
        'ALLSAMPLES', ('","').join(annotated_samples))  # NEW
    # The folder is only created above when --annot_aa is supplied; make
    # sure it exists so writing the report cannot fail on a missing folder
    check_create_dir(out_annot_user_aa_dir)
    with open(os.path.join(out_annot_user_aa_dir, '00_all_samples.html'),
              'w+') as f:
        f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    # ddtb_add(out_filtered_ivar_dir, full_path_compare)
    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_INDEL = full_path_compare + ".revised_INDEL.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + ".revised_intermediate.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"

    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_ivar_dir,
        out_stats_coverage_dir,
        min_freq_discard=0.1,
        min_alt_dp=4,
        only_snp=args.only_snp)
    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_intermediate)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)

    recalibrated_revised_df = revised_df(recalibrated_snp_matrix_intermediate,
                                         path_compare,
                                         min_freq_include=0.7,
                                         min_threshold_discard_sample=0.07,
                                         min_threshold_discard_position=0.4,
                                         remove_faulty=True,
                                         drop_samples=True,
                                         drop_positions=True)
    recalibrated_revised_df.to_csv(compare_snp_matrix_recal,
                                   sep="\t",
                                   index=False)

    recalibrated_revised_INDEL_df = revised_df(
        compare_snp_matrix_INDEL_intermediate_df,
        path_compare,
        min_freq_include=0.7,
        min_threshold_discard_sample=0.07,
        min_threshold_discard_position=0.4,
        remove_faulty=True,
        drop_samples=True,
        drop_positions=True)
    recalibrated_revised_INDEL_df.to_csv(compare_snp_matrix_INDEL,
                                         sep="\t",
                                         index=False)

    ddtb_compare(compare_snp_matrix_recal, distance=0)
    ddtb_compare(compare_snp_matrix_INDEL, distance=0, indel=True)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    #####################CONSENSUS WITH REFINED CALL######
    ######################################################
    logger.info(GREEN + "Creating refined consensus" + END_FORMATTING)
    create_consensus(reference, compare_snp_matrix_recal,
                     out_stats_coverage_dir, out_consensus_dir)

    logger.info("\n\n" + MAGENTA + BOLD +
                "#####END OF PIPELINE COVID MULTI ANALYSIS#####" +
                END_FORMATTING + "\n")
args.r1_file = r1_file args.r2_file = r2_file print("\n" + WHITE_BG + "STARTING SAMPLE: " + sample + " (" + sample_number + "/" + sample_total + ")" + END_FORMATTING) ##############START PIPELINE##################### ################################################# #INPUT ARGUMENTS ################ check_file_exists(args.r1_file) check_file_exists(args.r2_file) args.output = os.path.abspath(args.output) check_create_dir(args.output) #QUALITY CHECK ############## """ TODO: Quality check TODO: Human filter """ #QUALITY TRIMMING AND ADAPTER REMOVAL WITH bbduk.sh ################################################### out_trim_name_r1 = sample + "_R1.clean.fastq.gz" out_trim_name_r2 = sample + "_R2.clean.fastq.gz" output_trimming_file_r1 = os.path.join(out_trim_dir, out_trim_name_r1) output_trimming_file_r2 = os.path.join(out_trim_dir, out_trim_name_r2)
def fastqc_quality(r1, r2, output_dir, threads=8):
    """
    Run ``fastqc`` on the two read files ``r1`` and ``r2``.

    ``output_dir`` is passed through ``check_create_dir`` first and then
    given to fastqc as ``-o``; ``threads`` is given as ``--threads``.
    """
    check_create_dir(output_dir)

    fastqc_command = ['fastqc', r1, r2]
    fastqc_command += ['-o', output_dir]
    fastqc_command += ['--threads', str(threads)]

    execute_subprocess(fastqc_command)
###################################################################### #####################START PIPELINE################################### ###################################################################### #Annotation related script_dir = os.path.dirname(os.path.realpath(__file__)) annotation_dir = os.path.join(script_dir, "annotation/genes") if args.bed_remove == "TB": bed_polymorphism = os.path.join(annotation_dir, "MTB_repeats_annot.bed") output = os.path.abspath(args.output) #input_dir = os.path.abspath(args.input) group_name = output.split("/")[-1] out_gvcf_dir = os.path.join(args.output, "GVCF") out_vcf_dir = os.path.join(args.output, "VCF") check_create_dir(out_vcf_dir) gvcf_input_dir = os.path.abspath(args.input) print("\n\n" + BLUE + BOLD + "STARTING COHORT GVCF TO SPLIT SAMPLE VCF IN GROUP: " + group_name + END_FORMATTING) #CALL VARIANTS 2/2 FOR HARD FILTERING AND RECALIBRATION ####################################################### out_gvcf_name = group_name + ".cohort.g.vcf" output_gvcf_file = os.path.join(out_gvcf_dir, out_gvcf_name) if os.path.isfile(output_gvcf_file): print(YELLOW + DIM + output_gvcf_file + " EXIST\nOmmiting GVCF Combination for group " + group_name + END_FORMATTING) else:
def vcf_consensus_filter(vcf_file,
                         distance=1,
                         AF=0.75,
                         QD=15,
                         window_10=3,
                         dp_limit=8,
                         dp_AF=10,
                         AF_dp=0.80,
                         highly_hetz=False,
                         non_genotyped=False,
                         poorly_covered=False,
                         bed_to_filter=False,
                         var_type="SNP"):
    """
    Apply custom filter to individual vcf based on:
    AF
    snp distance --> Replaced by window_10
    QD
    Window_10, 20 and 30
    gatk asigned genotype for diploid calls
    Highly heterozygous positions
    Poorly covered positions

    The annotated table is written as ``<name>.raw.tab`` into a ``Table``
    folder located two levels above the vcf file, and the positions that
    fail any filter are handed to ``filter_vcf_list`` to produce
    ``<name>.<var_type>.final.vcf``.

    vcf_file: path of the vcf to filter.
    distance: positions with a neighbouring SNP at this distance or less
        (left or right) are discarded.
    AF, QD, dp_limit: thresholds compared against the AF, QD and dp columns.
    window_10: positions whose window_10 value is above it are discarded.
    dp_AF, AF_dp: positions with dp below dp_AF AND AF below AF_dp are
        discarded.
    highly_hetz, non_genotyped, poorly_covered, bed_to_filter: when not
        False each one is passed to ``annotate_bed_s``.
    var_type: "SNP", "INDEL" or "ALL"; any other value prints a message and
        exits with status 1.
    """
    # Variant type that must be REMOVED to keep the requested one.
    # Checked first so a wrong value fails before anything is written.
    type_to_discard = {"SNP": "INDEL", "INDEL": "SNP", "ALL": "*"}
    if var_type not in type_to_discard:
        print("Wrong variant type to filter, use SNP/INDEL/ALL")
        sys.exit(1)
    var_to_filter = type_to_discard[var_type]

    df_vcf = import_VCF42_to_pandas(vcf_file)

    vcf_path = os.path.abspath(vcf_file)
    # Two levels up from the vcf file
    output_dir = ("/").join(vcf_path.split("/")[:-2])
    vcf_name = vcf_path.split("/")[-1]

    tab_name = (".").join(vcf_name.split(".")[:-1])
    extend_raw = ".raw.tab"
    extend_final = "." + var_type + ".final.vcf"

    table_outputt_dir = os.path.join(output_dir, "Table")
    check_create_dir(table_outputt_dir)

    #Add polymorphic regions info (Phage, Transposon or PE/PPE regions for TB)
    # NOTE: "== False" / "!= False" are kept on purpose, these arguments
    # use False as the "not supplied" marker.
    if bed_to_filter == False:
        df_vcf['is_polymorphic'] = False
    else:
        annotate_bed_s(df_vcf, bed_to_filter)

    if highly_hetz != False:
        annotate_bed_s(df_vcf, highly_hetz)

    if non_genotyped != False:
        annotate_bed_s(df_vcf, non_genotyped)

    if poorly_covered != False:
        annotate_bed_s(df_vcf, poorly_covered)

    # The filter below reads these four flag columns. Nothing in this
    # function creates them when the matching argument is left as False,
    # so default any that is absent to False (= do not discard) instead
    # of failing with AttributeError. Existing columns are left untouched.
    # NOTE(review): assumes import_VCF42_to_pandas does not already add
    # them; if it does this loop is a no-op.
    for flag_column in ("is_polymorphic", "highly_hetz", "non_genotyped",
                        "poorly_covered"):
        if flag_column not in df_vcf.columns:
            df_vcf[flag_column] = False

    #Add info of nearby positions
    add_snp_distance(df_vcf)
    add_indel_distance(df_vcf)

    #Add info of clustered positions in sliding window
    add_window_distance(df_vcf, window_size=10)
    add_window_distance(df_vcf, window_size=20)
    add_window_distance(df_vcf, window_size=30)

    #output all raw info into a file in 'Table' folder
    new_out_file = tab_name + extend_raw
    output_raw_tab = os.path.join(table_outputt_dir, new_out_file)
    df_vcf.to_csv(output_raw_tab, sep='\t', index=False)

    #Apply all filters and extract positions as table to filer the final vcf
    # A position is discarded as soon as ANY of the conditions holds
    discard_mask = ((df_vcf.AF < AF) |
                    (df_vcf.snp_left_distance <= distance) |
                    (df_vcf.snp_right_distance <= distance) |
                    (df_vcf.window_10 > window_10) |
                    (df_vcf.AF <= 0.0) |
                    (df_vcf.QD <= QD) |
                    (df_vcf.dp == 0) |
                    (df_vcf.len_AD > 2) |
                    (df_vcf.ALT_AD < 2) |
                    (df_vcf.ALT == '*') |
                    (df_vcf.TYPE == var_to_filter) |
                    (df_vcf.dp < dp_limit) |
                    (df_vcf.FILTER != "PASS") |
                    ((df_vcf.gt0 == 0) & (df_vcf.window_10 > 1)) |
                    ((df_vcf.gt0 == 0) & (df_vcf.window_20 >= 2)) |
                    ((df_vcf.gt0 == 0) & (df_vcf.window_30 >= 3)) |
                    ((df_vcf.dp < dp_AF) & (df_vcf.AF < AF_dp)) |
                    (df_vcf.highly_hetz == True) |
                    (df_vcf.poorly_covered == True) |
                    (df_vcf.non_genotyped == True) |
                    (df_vcf.is_polymorphic == True))
    list_positions_to_filter = df_vcf['POS'][discard_mask].tolist()

    final_vcf_name = tab_name + extend_final
    filter_vcf_list(vcf_path, list_positions_to_filter, final_vcf_name)