示例#1
0
def test_read_simulation_and_get_reads(genome,
                                       window_l=2000,
                                       npairs=50000,
                                       read_length=150,
                                       median_insert_size=250,
                                       median_insert_size_sd=50,
                                       threads=4,
                                       replace=False):
    """Simulate read pairs for a genome under <genome>_simulating_reads.

    The genome is indexed with ``samtools faidx``, split into windows of
    ``window_l`` with ``bedtools makewindows``, each window is given a random
    relative coverage between 0.5 and 2, and
    ``fun.simulate_readPairs_per_window`` is called on the resulting table.
    The simulation is only run when ``fun.file_is_empty`` reports at least one
    of the two expected read files as empty.

    Args:
        genome: path to the genome fasta.
        window_l: window size passed to ``bedtools makewindows -w``.
        npairs: forwarded to ``fun.simulate_readPairs_per_window``.
        read_length: forwarded to ``fun.simulate_readPairs_per_window``.
        median_insert_size: forwarded to ``fun.simulate_readPairs_per_window``.
        median_insert_size_sd: forwarded to
            ``fun.simulate_readPairs_per_window``.
        threads: number of threads forwarded to the simulation.
        replace: if True, the output folders are deleted before starting.

    Returns:
        Tuple ``(reads1, reads2)`` with the paths of the expected read files.
    """

    # define the outdirs
    outdir = "%s_simulating_reads" % genome
    outdir_reads = "%s/getting_reads" % outdir

    # remove the outdirs if replace is True
    if replace is True:
        fun.delete_folder(outdir)
        fun.delete_folder(outdir_reads)

    # make folders
    fun.make_folder(outdir)
    fun.make_folder(outdir_reads)

    # define the expected reads
    reads1 = "%s/all_reads1.correct.fq.gz" % outdir_reads
    reads2 = "%s/all_reads2.correct.fq.gz" % outdir_reads

    if any(fun.file_is_empty(f) for f in (reads1, reads2)):

        # index the genome
        fun.run_cmd("%s faidx %s" % (fun.samtools, genome))

        # get the windows df
        windows_bed = "%s/windows_file.bed" % outdir
        fun.run_cmd("%s makewindows -g %s.fai -w %i > %s" %
                    (fun.bedtools, genome, window_l, windows_bed))

        # the bed file has no header line; header=None is the supported way
        # to say so (header=-1 is rejected by current pandas versions)
        df_windows = fun.pd.read_csv(windows_bed,
                                     sep="\t",
                                     header=None,
                                     names=["chromosome", "start", "end"])

        # draw one distinct relative coverage per window. random.sample needs
        # a population at least as large as the sample, so the grid grows
        # beyond 10000 values when there are more windows than that
        n_windows = len(df_windows)
        coverage_grid = list(
            fun.np.linspace(0.5, 2, max(10000, n_windows)))
        df_windows["predicted_relative_coverage"] = fun.random.sample(
            coverage_grid, n_windows)

        # simulate reads. replace stays False here because the output folders
        # were already wiped above when replace was requested
        fun.simulate_readPairs_per_window(df_windows,
                                          genome,
                                          npairs,
                                          outdir_reads,
                                          read_length,
                                          median_insert_size,
                                          median_insert_size_sd,
                                          replace=False,
                                          threads=threads)

    print("read simulation works well")
    return reads1, reads2
示例#2
0
def test_parameter_optimisation_perSVade(sorted_bam,
                                         reference_genome,
                                         outdir,
                                         threads=4,
                                         replace=False):
    """Run perSVade with its parameter optimisation options, writing into outdir.

    The command uses ``mito_C_glabrata_CBS138`` as mitochondrial chromosome,
    so it is meant to be run on C. glabrata data. ``replace`` is accepted but
    not used by this function.
    """

    # command line template; filled in below with the perSVade script and inputs
    cmd_template = "%s -r %s -thr %i -o %s -sbam %s --nvars 5 --simulation_ploidies haploid,diploid_hetero --range_filtering_benchmark theoretically_meaningful -mchr mito_C_glabrata_CBS138"
    cmd_fields = (fun.perSVade_py, reference_genome, threads, outdir,
                  sorted_bam)

    fun.run_cmd(cmd_template % cmd_fields)

    print("parameter optimisation worked successfully")
示例#3
0
def test_conda_env_generation(outdir, replace=False):
    """Check that the conda environment EnvName can be exported and rebuilt.

    The environment is exported to a .yml file under ``outdir``, recreated
    under the name ``<EnvName>_test``, activated to import ``sv_functions``,
    and then removed again. On success ``<outdir>/correct_env.txt`` is written
    so that later calls skip the check.

    Args:
        outdir: folder where the .yml file and the marker file are written.
        replace: if True, the check is repeated even if the marker file exists.
    """

    # define the file that indicates that the environment is correct
    correct_env_file = "%s/correct_env.txt" % outdir

    # define a test_env_name
    test_env_name = "%s_test" % EnvName

    if fun.file_is_empty(correct_env_file) or replace is True:

        # remove previous env. This is best-effort: a failure here is taken
        # to mean that the env does not exist. Only Exception is caught so
        # that Ctrl-C / SystemExit still stop the test.
        print("removing previous env")
        try:
            fun.run_cmd("conda remove -y -n %s --all" % test_env_name)
        except Exception:
            print("%s does not exist" % test_env_name)

        # export file
        print("creating %s yml" % EnvName)
        yml_file = "%s/%s.yml" % (outdir, test_env_name)
        fun.run_cmd(
            "conda env export --no-builds --from-history -n %s --file %s" %
            (EnvName, yml_file))

        # create environment
        print("re-generating as %s" % test_env_name)
        fun.run_cmd("conda env create --file %s --name %s" %
                    (yml_file, test_env_name))

        # test that the activation works
        print("activating %s" % test_env_name)
        cmd_activation = "source %s/etc/profile.d/conda.sh && conda activate %s && python -c 'import sys; sys.path.insert(0, \"%s\"); import sv_functions as fun'" % (
            AnacondaDir, test_env_name, fun.get_fullpath(scripts_dir))
        fun.run_cmd(cmd_activation)

        # remove file
        print("removing envs")
        fun.remove_file(yml_file)

        # remove env
        fun.run_cmd("conda remove -y -n %s --all" % test_env_name)

        # create file stating that the env is correct; the context manager
        # guarantees the handle is flushed and closed
        with open(correct_env_file, "w") as marker:
            marker.write("env is correct")

    print("%s can be correctly regenerated" % EnvName)
示例#4
0
def test_smallVarCall_CNV_running(
        sorted_bam,
        outdir,
        ref_genome,
        gff,
        threads=4,
        mitochondrial_chromosome="mito_C_glabrata_CBS138",
        replace=False):
    """Run the small variant calling and CNV pipeline on a sorted bam.

    Args:
        sorted_bam: sorted bam passed to the pipeline with ``-sbam``
            (it should contain some mutations).
        outdir: folder under which ``varcall_pooledSeq_<bool>`` is written.
        ref_genome: reference genome passed with ``-r``; it is also the input
            of ``fun.get_repeat_maskerDF``.
        gff: annotation file passed with ``-gff``.
        threads: number of threads used for the repeats table and the pipeline.
        mitochondrial_chromosome: value passed with ``-mchr``.
        replace: if True, ``outdir`` is deleted and the pipeline is re-run.
    """

    # if replace is True, remove the outdir
    if replace is True:
        fun.delete_folder(outdir)

    # make the outdir
    fun.make_folder(outdir)

    # get the repeats table (second element of the returned value), honouring
    # the requested number of threads instead of a fixed 4
    repeats_table = fun.get_repeat_maskerDF(ref_genome,
                                            threads=threads,
                                            replace=False)[1]

    # this may be also [False, True] to test pooled seq
    for pooled_seq in [False]:

        outdir_varCall = "%s/varcall_pooledSeq_%s" % (outdir, str(pooled_seq))

        # define the final file
        final_file = "%s/variant_annotation_ploidy2.tab" % outdir_varCall

        if fun.file_is_empty(final_file) or replace is True:
            print(
                "running on pooled_seq=%s. This may take a bit because a lot of variants will be considered"
                % pooled_seq)

            # define the cmd
            cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr %s -mcode 3 -gcode 1 --repeats_table %s --remove_smallVarsCNV_nonEssentialFiles -gff %s -thr %i" % (
                varcall_cnv_pipeline, ref_genome, outdir_varCall, sorted_bam,
                mitochondrial_chromosome, repeats_table, gff, threads)

            # add pooled seq
            if pooled_seq is True:
                cmd += " --pooled_sequencing"

            fun.run_cmd(cmd)

    print("small variant calling and CNV of genes works")
示例#5
0
def test_processing_varcalling(smallVars_input_outdir,
                               reference_genome,
                               outdir,
                               sorted_bam,
                               replace=False,
                               threads=4):
    """Check that the vcf processing of varcall_cnv_pipeline works.

    A folder where the variant calling was already performed is copied to
    ``<outdir>/smallVars_input_outdir`` and the pipeline is run on that copy
    with ``--skip_cnv_analysis``.

    Args:
        smallVars_input_outdir: folder with the already-run variant calling.
        reference_genome: reference genome passed with ``-r``;
            ``<reference_genome>.repeats.tab`` is passed as the repeats table.
        outdir: folder where the copy is placed.
        sorted_bam: a placeholder sorted bam passed with ``-sbam``, so that
            the pipeline does not have to be repeated.
        replace: if True, the input folder is copied again and the pipeline
            is re-run.
        threads: number of threads passed with ``-thr``.
    """

    # get full paths
    outdir = fun.get_fullpath(outdir)
    smallVars_input_outdir = fun.get_fullpath(smallVars_input_outdir)
    reference_genome = fun.get_fullpath(reference_genome)

    # cp the files under outdir
    fun.make_folder(outdir)

    target_smallVars_input_outdir = "%s/smallVars_input_outdir" % outdir
    target_smallVars_input_outdir_tmp = "%s.tmp" % target_smallVars_input_outdir
    if not os.path.isdir(target_smallVars_input_outdir) or replace is True:

        # a leftover tmp folder from an interrupted run would make 'cp -r'
        # nest the source inside it, so start from a clean tmp location
        fun.delete_folder(target_smallVars_input_outdir_tmp)

        fun.run_cmd(
            "cp -r %s %s " %
            (smallVars_input_outdir, target_smallVars_input_outdir_tmp))

        # os.rename cannot overwrite a non-empty directory, which is the case
        # when replace is True and a previous copy exists; drop it only after
        # the new copy has been completed
        fun.delete_folder(target_smallVars_input_outdir)
        os.rename(target_smallVars_input_outdir_tmp,
                  target_smallVars_input_outdir)

    # final file
    final_file = "%s/variants_atLeast3PASS_ploidy2.vcf" % target_smallVars_input_outdir

    if fun.file_is_empty(final_file) or replace is True:

        cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr no_mitochondria -mcode 3 -gcode 1 --repeats_table %s.repeats.tab --remove_smallVarsCNV_nonEssentialFiles -thr %i --skip_cnv_analysis" % (
            varcall_cnv_pipeline, reference_genome,
            target_smallVars_input_outdir, sorted_bam, reference_genome,
            threads)

        fun.run_cmd(cmd)

    print("you can run successfully the variant processing")
示例#6
0
def test_greasy():
    """Check whether greasy and sbatch can be found, warning if they cannot.

    Runs ``module load greasy``, ``which greasy`` and ``which sbatch`` through
    ``fun.run_cmd``. Any failure is reported as a printed warning instead of
    being raised, so this check never aborts the calling script.
    """

    try:

        fun.run_cmd("module load greasy")
        print("greasy module can be loaded")

        fun.run_cmd("module load greasy && which greasy")
        print("greasy is in the path")

        fun.run_cmd("which sbatch")
        print("sbatch is in the path")

        print("greasy can be used for running parallel perSVade jobs")

    # only Exception is caught: a bare except would also turn Ctrl-C and
    # SystemExit into this warning
    except Exception:

        print(
            "\n\n---\nWARNING: greasy is not installed properly in your system. This means that setting '--job_array_mode greasy' will fail. You can set '--job_array_mode local' and run jobs sequentially and not in parallel. '--job_array_mode greasy' will only work on machines that use SLURM for managing jobs and greasy installed (and callable with a command like 'module load greasy && greasy <jobs_file>', and '<jobs_file>' is a file where each line corresponds to a command to be executed in a sepparate SLURM job) \n---\n\n"
        )
import sv_functions as fun
print("loading python packages worked successfully")

# define the testing outputs dir and the output folders nested under it
testing_outputs_dir = "%s/testing_outputs"%test_dir
test_output_perSVade = "%s/perSVade_output"%testing_outputs_dir
outdir_small_variantCalling = "%s/smallVars_CNV_output"%test_output_perSVade

# create the outdirs (deleting them first is left disabled)
#fun.delete_folder(testing_outputs_dir)
fun.make_folder(testing_outputs_dir)
fun.make_folder(test_output_perSVade)

# redefine the reference genome location: work on a copy under the testing
# outputs dir, copied only if fun.file_is_empty reports the copy as empty
ref_genome = "%s/reduced_genome.fasta"%testing_outputs_dir
if fun.file_is_empty(ref_genome): fun.run_cmd("cp %s %s"%(test_ref_genome, ref_genome))

# redefine the gff (same copy-if-empty approach)
gff = "%s/reduced_annotations.gff"%testing_outputs_dir
if fun.file_is_empty(gff): fun.run_cmd("cp %s %s"%(test_gff, gff))

# redefine the mutated genome location (same copy-if-empty approach)
mut_genome = "%s/mutated_genome.fasta"%testing_outputs_dir
if fun.file_is_empty(mut_genome): fun.run_cmd("cp %s %s"%(test_mutated_genome, mut_genome))

# define an example C. albicans varCall output (located in the testing inputs dir)
Calbicans_varCall_outdir = "%s/varcalling_output_Calbicans_SRR2088862"%testing_inputs_dir

# define the C. albicans genome: its path in the inputs dir and in the outputs dir
inputs_Calbicans_genome = "%s/Candida_albicans.fasta"%testing_inputs_dir
Calbicans_genome = "%s/Candida_albicans.fasta"%testing_outputs_dir
        # define the goldenSet_dir
        if spName=="Candida_glabrata": goldenSet_dir = goldenSet_dir_Cglabrata
        else: goldenSet_dir = "auto"

        # get the golden set running 
        if taxID in taxIDs_with_noON_overalpping: continue
        cmd = "%s --ref %s --threads %i -o %s --target_taxID %s --n_close_samples 3 --nruns_per_sample 3 -f1 skip -f2 skip --mitochondrial_chromosome %s --gff %s --goldenSet_dir %s --skip_SVcalling"%(perSVade_py, genome, threads, outdir_perSVade, taxID, mitochondrial_chromosome, gff, goldenSet_dir)

    # add options depending on the machine
    if run_in_cluster is True: cmd += " --job_array_mode greasy --queue_jobs bsc_ls --max_ncores_queue 144 --time_read_obtention 48:00:00 --time_perSVade_running 48:00:00"

    else: cmd += " --job_array_mode local"

    if StopAfterPrefecth_of_reads is True: cmd += " --StopAfterPrefecth_of_reads"

    fun.run_cmd(cmd)

    #if taxID=="5476": adkjhdakg # stop after C. albicans



# an example CMD to debug the generation of merged vcfs
"""

cd ~/samba/CandidaMine_data_generation/v1/data/Candida_albicans_5476/varCall_output/SRR6669901/

~/samba/scripts/perSVade/perSVade_repository/scripts/varcall_cnv_pipeline.py -r ~/samba/CandidaMine_data_generation/v1/data/Candida_albicans_5476/genome.fasta -thr 4 -o smallVars_CNV_output -p 2 -sbam aligned_reads.bam.sorted -c 12 -mchr Ca22chrM_C_albicans_SC5314 -mcode 4 -gcode 12 -gff ~/samba/CandidaMine_data_generation/v1/data/Candida_albicans_5476/annotations.gff --get_merged_vcf


"""
示例#9
0
    raise ValueError("%s is not a valid gff" % opt.gff)

# define the minimum AF (allele frequency) for small variants.
# With "infer" the value is looked up from the ploidy in the table below;
# otherwise the provided value is used as-is if it lies in [0, 1].
ploidy_to_minAF = {1: 0.9, 2: 0.25, 3: 0.15, 4: 0.1}
if opt.minAF_smallVars == "infer":
    # NOTE(review): a ploidy outside 1-4 raises KeyError here
    minAF_smallVars = ploidy_to_minAF[opt.ploidy]
elif opt.minAF_smallVars <= 1 and opt.minAF_smallVars >= 0:
    # NOTE(review): assumes a numeric value at this point; confirm how the
    # option is parsed
    minAF_smallVars = opt.minAF_smallVars
else:
    raise ValueError("The value provided in --minAF_smallVars is incorrect")

print("running small vars and CNV pipeline into %s" % opt.outdir)

# check that the environment is correct (prints the path of bedtools)
fun.run_cmd(
    "echo 'This is a check of the environment in which the pipeline is running'; which bedtools"
)

# correct the gff file, so that it doesn't have lines starting with # and also add biotype (important for ensembl VEP)
if not opt.gff is None:
    correct_gff = "%s_corrected.gff" % (opt.gff)
    correct_gff_tmp = "%s_corrected_tmp.gff" % (opt.gff)

    if fun.file_is_empty(correct_gff) or opt.replace is True:
        print("correcting gff")
        # write to a tmp file and rename it afterwards, so the final name
        # only appears once the grep command has been run
        correct_gff_cmd = "grep -v '^#' %s > %s" % (opt.gff, correct_gff_tmp)
        fun.run_cmd(correct_gff_cmd)
        os.rename(correct_gff_tmp, correct_gff)

    # modify gff to add biotype
    gff_with_biotype = "%s_with_biotype.gff" % correct_gff
示例#10
0
# get the gff tabixed and sorted
gff_clean = "%s_clean.gff"%opt.gff
gff_clean_compressed = "%s_clean.gz"%opt.gff
gff_clean_compressed_tbi = "%s.tbi"%gff_clean_compressed

# the tabix index is the last file generated below, so it is the one checked
# to decide whether the whole cleaning has to be (re)done
if fun.file_is_empty(gff_clean_compressed_tbi):

    # remove previous files
    fun.remove_file(gff_clean)
    fun.remove_file(gff_clean_compressed)
    fun.remove_file(gff_clean_compressed_tbi)

    print("compressing gff")

    # eliminate strange lines,chromosomes and compress: sort with bedtools,
    # drop lines starting with '#' and lines containing a tab-delimited
    # 'chromosome' field, then bgzip the result
    fun.run_cmd("%s sort -i %s | egrep -v '^#' | egrep -v $'\tchromosome\t' > %s"%(bedtools, opt.gff, gff_clean))
    fun.run_cmd("%s -c %s > %s"%(bgzip, gff_clean, gff_clean_compressed))

    # index with tabix
    fun.run_cmd("%s %s"%(tabix, gff_clean_compressed))

# define the outfile of vep raw (vep writes to the .tmp path)
outfile_vep_raw = "%s.raw.tbl"%opt.outfile
outfile_vep_raw_tmp = "%s.tmp"%outfile_vep_raw

if fun.file_is_empty(outfile_vep_raw):
    # drop any tmp output left by a previous run
    fun.remove_file(outfile_vep_raw_tmp)

    print("running vep")

    # run vep on the input vcf with the cleaned, compressed gff; the output is
    # a tab-delimited table restricted to the listed fields
    fun.run_cmd('%s --input_file %s --format "vcf" --output_file %s --fasta %s --gff %s -v --force_overwrite --tab --fields "Uploaded_variation,Location,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,Extra"'%(vep, opt.input_vcf, outfile_vep_raw_tmp, opt.ref, gff_clean_compressed))