def test_read_simulation_and_get_reads(genome, window_l=2000, npairs=50000, read_length=150, median_insert_size=250, median_insert_size_sd=50, threads=4, replace=False):
    """Simulate paired-end reads for a genome and return the two read files.

    Everything is written under "<genome>_simulating_reads". The simulation
    is skipped when both expected read files already exist and are not empty.

    Args:
        genome: path to the genome fasta.
        window_l: window size handed to 'bedtools makewindows -w'.
        npairs: forwarded to fun.simulate_readPairs_per_window.
        read_length: forwarded to fun.simulate_readPairs_per_window.
        median_insert_size: forwarded to fun.simulate_readPairs_per_window.
        median_insert_size_sd: forwarded to fun.simulate_readPairs_per_window.
        threads: forwarded to fun.simulate_readPairs_per_window.
        replace: if True, the output folders are deleted before starting.

    Returns:
        A tuple (reads1, reads2) with the paths of the expected read files.
    """

    # define the outdirs
    outdir = "%s_simulating_reads" % genome
    outdir_reads = "%s/getting_reads" % outdir

    # remove the outdirs if replace is True
    if replace is True:
        fun.delete_folder(outdir)
        fun.delete_folder(outdir_reads)

    # make folders
    fun.make_folder(outdir)
    fun.make_folder(outdir_reads)

    # define the expected reads
    reads1 = "%s/all_reads1.correct.fq.gz" % outdir_reads
    reads2 = "%s/all_reads2.correct.fq.gz" % outdir_reads

    if any([fun.file_is_empty(f) for f in [reads1, reads2]]):

        # index the genome
        fun.run_cmd("%s faidx %s" % (fun.samtools, genome))

        # get the windows df
        windows_bed = "%s/windows_file.bed" % outdir
        fun.run_cmd("%s makewindows -g %s.fai -w %i > %s" % (fun.bedtools, genome, window_l, windows_bed))

        # the bed file has no header line; header=None is the supported way to say so
        # (negative header values are rejected by current pandas versions)
        df_windows = fun.pd.read_csv(windows_bed, sep="\t", header=None, names=["chromosome", "start", "end"])

        # give each window a coverage drawn without repetition from 10000 evenly
        # spaced values in [0.5, 2]
        # NOTE(review): presumably fun.random is the stdlib random module, in which
        # case this fails for more than 10000 windows - confirm
        df_windows["predicted_relative_coverage"] = fun.random.sample(
            list(fun.np.linspace(0.5, 2, 10000)), len(df_windows))

        # simulate reads. replace stays False because the outdirs were already
        # deleted above when replace is True
        fun.simulate_readPairs_per_window(df_windows, genome, npairs, outdir_reads, read_length, median_insert_size, median_insert_size_sd, replace=False, threads=threads)

    print("read simulation works well")
    return reads1, reads2
def test_parameter_optimisation_perSVade(sorted_bam, reference_genome, outdir, threads=4, replace=False):
    """Run perSVade with its parameter optimisation options, writing into outdir.

    The command hard-codes '-mchr mito_C_glabrata_CBS138', so it is expected
    to work for C. glabrata. The replace argument is accepted but not used.
    """

    # template of the perSVade call and the values that fill it
    cmd_template = "%s -r %s -thr %i -o %s -sbam %s --nvars 5 --simulation_ploidies haploid,diploid_hetero --range_filtering_benchmark theoretically_meaningful -mchr mito_C_glabrata_CBS138"
    cmd_values = (fun.perSVade_py, reference_genome, threads, outdir, sorted_bam)

    # run it
    fun.run_cmd(cmd_template % cmd_values)

    print("parameter optimisation worked successfully")
def test_conda_env_generation(outdir, replace=False):
    """Check that the conda environment EnvName can be exported and re-created.

    The environment is exported to a .yml file, re-created as "<EnvName>_test",
    activated to import sv_functions, and then removed again. On success the
    file "<outdir>/correct_env.txt" is written; while that file exists and is
    not empty the check is skipped, unless replace is True.

    Args:
        outdir: folder where the .yml file and the marker file are written.
        replace: if True, run the check even if the marker file exists.
    """

    # define the file that indicates that the environment is correct
    correct_env_file = "%s/correct_env.txt" % outdir

    # define a test_env_name
    test_env_name = "%s_test" % EnvName

    if fun.file_is_empty(correct_env_file) or replace is True:

        # remove previous env. This is best-effort: a failure is taken to mean that
        # the env does not exist. Exception (instead of a bare except) keeps
        # KeyboardInterrupt and SystemExit from being swallowed
        print("removing previous env")
        try:
            fun.run_cmd("conda remove -y -n %s --all" % test_env_name)
        except Exception:
            print("%s does not exist" % test_env_name)

        # export file
        print("creating %s yml" % EnvName)
        yml_file = "%s/%s.yml" % (outdir, test_env_name)
        fun.run_cmd("conda env export --no-builds --from-history -n %s --file %s" % (EnvName, yml_file))

        # create environment
        print("re-generating as %s" % test_env_name)
        fun.run_cmd("conda env create --file %s --name %s" % (yml_file, test_env_name))

        # test that the activation works
        print("activating %s" % test_env_name)
        cmd_activation = "source %s/etc/profile.d/conda.sh && conda activate %s && python -c 'import sys; sys.path.insert(0, \"%s\"); import sv_functions as fun'" % (
            AnacondaDir, test_env_name, fun.get_fullpath(scripts_dir))
        fun.run_cmd(cmd_activation)

        # remove file
        print("removing envs")
        fun.remove_file(yml_file)

        # remove env
        fun.run_cmd("conda remove -y -n %s --all" % test_env_name)

        # create file stating that the env is correct (closed explicitly so that
        # the content is flushed to disk)
        with open(correct_env_file, "w") as marker:
            marker.write("env is correct")

    print("%s can be correctly regenerated" % EnvName)
def test_smallVarCall_CNV_running(sorted_bam, outdir, ref_genome, gff, threads=4, mitochondrial_chromosome="mito_C_glabrata_CBS138", replace=False):
    """Run the small variant calling and CNV pipeline on a sorted bam.

    The pipeline (varcall_cnv_pipeline) writes into
    "<outdir>/varcall_pooledSeq_<pooled_seq>" and is skipped when its
    "variant_annotation_ploidy2.tab" file already exists and is not empty.

    Args:
        sorted_bam: sorted bam to call variants on (should have some mutations).
        outdir: folder where the results are written.
        ref_genome: reference genome passed with -r.
        gff: annotation file passed with -gff.
        threads: threads used for the repeats table and passed with -thr.
        mitochondrial_chromosome: value passed with -mchr.
        replace: if True, outdir is deleted and everything is re-run.
    """

    # if replace is True, remove the outdir
    if replace is True:
        fun.delete_folder(outdir)

    # make the outdir
    fun.make_folder(outdir)

    # get the repeats table (second element of what get_repeat_maskerDF returns),
    # honouring the requested number of threads
    repeats_table = fun.get_repeat_maskerDF(ref_genome, threads=threads, replace=False)[1]

    for pooled_seq in [False]:  # this may be also [False, True] to test pooled seq

        outdir_varCall = "%s/varcall_pooledSeq_%s" % (outdir, str(pooled_seq))

        # define the final file
        final_file = "%s/variant_annotation_ploidy2.tab" % outdir_varCall

        if fun.file_is_empty(final_file) or replace is True:
            print("running on pooled_seq=%s. This may take a bit because a lot of variants will be considered" % pooled_seq)

            # define the cmd
            cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr %s -mcode 3 -gcode 1 --repeats_table %s --remove_smallVarsCNV_nonEssentialFiles -gff %s -thr %i" % (
                varcall_cnv_pipeline, ref_genome, outdir_varCall, sorted_bam, mitochondrial_chromosome, repeats_table, gff, threads)

            # add pooled seq
            if pooled_seq is True:
                cmd += " --pooled_sequencing"

            fun.run_cmd(cmd)

    print("small variant calling and CNV of genes works")
def test_processing_varcalling(smallVars_input_outdir, reference_genome, outdir, sorted_bam, replace=False, threads=4):
    """Check that the processing of vcfs in varcall_cnv_pipeline works.

    smallVars_input_outdir is a folder where all the variant calling has
    already been performed. It is copied to "<outdir>/smallVars_input_outdir"
    and the pipeline is run on the copy with --skip_cnv_analysis. sorted_bam
    is just a fake sorted bam, so that the pipeline run is not repeated.

    Args:
        smallVars_input_outdir: folder with the finished variant calling.
        reference_genome: reference genome; "<reference_genome>.repeats.tab"
            is passed as the repeats table.
        outdir: folder where the copy is created.
        sorted_bam: bam passed with -sbam.
        replace: if True, the copy and the processing are redone.
        threads: value passed with -thr.
    """

    # get full paths
    outdir = fun.get_fullpath(outdir)
    smallVars_input_outdir = fun.get_fullpath(smallVars_input_outdir)
    reference_genome = fun.get_fullpath(reference_genome)

    # cp the files under outdir
    fun.make_folder(outdir)
    target_smallVars_input_outdir = "%s/smallVars_input_outdir" % outdir
    target_smallVars_input_outdir_tmp = "%s.tmp" % target_smallVars_input_outdir

    if not os.path.isdir(target_smallVars_input_outdir) or replace is True:

        # a leftover tmp folder would make 'cp -r' nest the copy inside it
        # NOTE(review): assumes fun.delete_folder does nothing if the folder is missing - confirm
        fun.delete_folder(target_smallVars_input_outdir_tmp)
        fun.run_cmd("cp -r %s %s " % (smallVars_input_outdir, target_smallVars_input_outdir_tmp))

        # os.rename cannot overwrite a non-empty folder, so drop the previous copy
        # (only possible when replace is True) just before renaming
        fun.delete_folder(target_smallVars_input_outdir)
        os.rename(target_smallVars_input_outdir_tmp, target_smallVars_input_outdir)

    # final file
    final_file = "%s/variants_atLeast3PASS_ploidy2.vcf" % target_smallVars_input_outdir

    if fun.file_is_empty(final_file) or replace is True:

        cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr no_mitochondria -mcode 3 -gcode 1 --repeats_table %s.repeats.tab --remove_smallVarsCNV_nonEssentialFiles -thr %i --skip_cnv_analysis" % (
            varcall_cnv_pipeline, reference_genome, target_smallVars_input_outdir, sorted_bam, reference_genome, threads)

        fun.run_cmd(cmd)

    print("you can run successfully the variant processing")
def test_greasy():
    """Check whether greasy and sbatch can be found, warning if they cannot.

    This never raises for a failed check: if any of the commands fails, a
    warning explaining the consequences for '--job_array_mode greasy' is
    printed instead.
    """

    try:

        fun.run_cmd("module load greasy")
        print("greasy module can be loaded")

        fun.run_cmd("module load greasy && which greasy")
        print("greasy is in the path")

        fun.run_cmd("which sbatch")
        print("sbatch is in the path")

        print("greasy can be used for running parallel perSVade jobs")

    # Exception (instead of a bare except) so that KeyboardInterrupt and SystemExit
    # are not turned into this warning
    except Exception:

        print(
            "\n\n---\nWARNING: greasy is not installed properly in your system. This means that setting '--job_array_mode greasy' will fail. You can set '--job_array_mode local' and run jobs sequentially and not in parallel. '--job_array_mode greasy' will only work on machines that use SLURM for managing jobs and greasy installed (and callable with a command like 'module load greasy && greasy <jobs_file>', and '<jobs_file>' is a file where each line corresponds to a command to be executed in a sepparate SLURM job) \n---\n\n"
        )
import sv_functions as fun

print("loading python packages worked successfully")

# folders where the outputs of the tests are written
testing_outputs_dir = "%s/testing_outputs"%test_dir
test_output_perSVade = "%s/perSVade_output"%testing_outputs_dir
outdir_small_variantCalling = "%s/smallVars_CNV_output"%test_output_perSVade

# create the output folders. testing_outputs_dir is not deleted first, so files
# that are already there are kept
for _folder_to_make in (testing_outputs_dir, test_output_perSVade):
    fun.make_folder(_folder_to_make)

# working copies (under testing_outputs_dir) of the reference genome, the gff
# and the mutated genome
ref_genome = "%s/reduced_genome.fasta"%testing_outputs_dir
gff = "%s/reduced_annotations.gff"%testing_outputs_dir
mut_genome = "%s/mutated_genome.fasta"%testing_outputs_dir

# copy each input only if its working copy is missing or empty
for _input_file, _working_copy in ((test_ref_genome, ref_genome), (test_gff, gff), (test_mutated_genome, mut_genome)):
    if fun.file_is_empty(_working_copy):
        fun.run_cmd("cp %s %s"%(_input_file, _working_copy))

# an example C. albicans variant calling output
Calbicans_varCall_outdir = "%s/varcalling_output_Calbicans_SRR2088862"%testing_inputs_dir

# the C. albicans genome: its location in the inputs and in the outputs folder
inputs_Calbicans_genome = "%s/Candida_albicans.fasta"%testing_inputs_dir
Calbicans_genome = "%s/Candida_albicans.fasta"%testing_outputs_dir
# define the goldenSet_dir if spName=="Candida_glabrata": goldenSet_dir = goldenSet_dir_Cglabrata else: goldenSet_dir = "auto" # get the golden set running if taxID in taxIDs_with_noON_overalpping: continue cmd = "%s --ref %s --threads %i -o %s --target_taxID %s --n_close_samples 3 --nruns_per_sample 3 -f1 skip -f2 skip --mitochondrial_chromosome %s --gff %s --goldenSet_dir %s --skip_SVcalling"%(perSVade_py, genome, threads, outdir_perSVade, taxID, mitochondrial_chromosome, gff, goldenSet_dir) # add options depending on the machine if run_in_cluster is True: cmd += " --job_array_mode greasy --queue_jobs bsc_ls --max_ncores_queue 144 --time_read_obtention 48:00:00 --time_perSVade_running 48:00:00" else: cmd += " --job_array_mode local" if StopAfterPrefecth_of_reads is True: cmd += " --StopAfterPrefecth_of_reads" fun.run_cmd(cmd) #if taxID=="5476": adkjhdakg # stop after C. albicans # an example CMD to debug de generation of merged vcfs """ cd ~/samba/CandidaMine_data_generation/v1/data/Candida_albicans_5476/varCall_output/SRR6669901/ ~/samba/scripts/perSVade/perSVade_repository/scripts/varcall_cnv_pipeline.py -r ~/samba/CandidaMine_data_generation/v1/data/Candida_albicans_5476/genome.fasta -thr 4 -o smallVars_CNV_output -p 2 -sbam aligned_reads.bam.sorted -c 12 -mchr Ca22chrM_C_albicans_SC5314 -mcode 4 -gcode 12 -gff ~/samba/CandidaMine_data_generation/v1/data/Candida_albicans_5476/annotations.gff --get_merged_vcf """
# NOTE(review): this raise looks like the tail of a gff validity check whose
# condition lies above the visible chunk - keep it nested under that check
raise ValueError("%s is not a valid gff" % opt.gff)

# minimum allele frequency for small variants: either inferred from the ploidy
# or a user-provided value between 0 and 1
ploidy_to_minAF = {1: 0.9, 2: 0.25, 3: 0.15, 4: 0.1}

if opt.minAF_smallVars == "infer":
    minAF_smallVars = ploidy_to_minAF[opt.ploidy]

elif 0 <= opt.minAF_smallVars <= 1:
    minAF_smallVars = opt.minAF_smallVars

else:
    raise ValueError("The value provided in --minAF_smallVars is incorrect")

print("running small vars and CNV pipeline into %s" % opt.outdir)

# print which bedtools is found, as a check of the environment
fun.run_cmd(
    "echo 'This is a check of the environment in which the pipeline is running'; which bedtools"
)

# correct the gff file, so that it doesn't have lines starting with #, and also
# add biotype (important for ensembl VEP)
if opt.gff is not None:

    correct_gff = "%s_corrected.gff" % (opt.gff)
    correct_gff_tmp = "%s_corrected_tmp.gff" % (opt.gff)

    if fun.file_is_empty(correct_gff) or opt.replace is True:

        print("correcting gff")

        # write to a tmp file and rename it once grep has finished
        correct_gff_cmd = "grep -v '^#' %s > %s" % (opt.gff, correct_gff_tmp)
        fun.run_cmd(correct_gff_cmd)
        os.rename(correct_gff_tmp, correct_gff)

    # the gff with biotype is derived from the corrected gff
    gff_with_biotype = "%s_with_biotype.gff" % correct_gff
# files of the sorted, bgzip-compressed and tabix-indexed version of the gff
gff_clean = "%s_clean.gff"%opt.gff
gff_clean_compressed = "%s_clean.gz"%opt.gff
gff_clean_compressed_tbi = "%s.tbi"%gff_clean_compressed

# the index is the last file generated, so it marks that everything was done
if fun.file_is_empty(gff_clean_compressed_tbi):

    # remove whatever is left from previous attempts
    for _previous_file in (gff_clean, gff_clean_compressed, gff_clean_compressed_tbi):
        fun.remove_file(_previous_file)

    print("compressing gff")

    # sort, drop the lines starting with # and the 'chromosome' features
    fun.run_cmd("%s sort -i %s | egrep -v '^#' | egrep -v $'\tchromosome\t' > %s"%(bedtools, opt.gff, gff_clean))

    # compress
    fun.run_cmd("%s -c %s > %s"%(bgzip, gff_clean, gff_clean_compressed))

    # index with tabix
    fun.run_cmd("%s %s"%(tabix, gff_clean_compressed))

# raw output of vep, and the tmp file that vep writes into
outfile_vep_raw = "%s.raw.tbl"%opt.outfile
outfile_vep_raw_tmp = "%s.tmp"%outfile_vep_raw

if fun.file_is_empty(outfile_vep_raw):

    # start from a clean tmp file
    fun.remove_file(outfile_vep_raw_tmp)

    print("running vep")
    fun.run_cmd('%s --input_file %s --format "vcf" --output_file %s --fasta %s --gff %s -v --force_overwrite --tab --fields "Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,Extra"'%(vep, opt.input_vcf, outfile_vep_raw_tmp, opt.ref, gff_clean_compressed))