Пример #1
0
    def variant_call(self,
                     alignment,
                     reference_file,
                     stand_emit_conf=40,
                     stand_call_conf=100,
                     GATK_dir="",
                     num_of_threads=5,
                     output_mode="EMIT_VARIANTS_ONLY",
                     discovery_mode="BOTH",
                     output_file="GATK_raw.vcf",
                     default_base_qualities=None):
        """Run GATK UnifiedGenotyper on an alignment through a shell command.

        Manual:
        http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_gatk_tools_walkers_genotyper_UnifiedGenotyper.html

        Args:
            alignment: value passed to ``-I``.
            reference_file: value passed to ``-R``.
            stand_emit_conf: integer passed to ``-stand_emit_conf``.
            stand_call_conf: integer passed to ``-stand_call_conf``.
            GATK_dir: directory holding ``GenomeAnalysisTK.jar``; it is run
                through ``check_path`` (defined elsewhere) and the result is
                used as a prefix of the jar name.
            num_of_threads: integer passed to ``-nt``.
            output_mode: value passed to ``--output_mode``; one of
                EMIT_VARIANTS_ONLY, EMIT_ALL_CONFIDENT_SITES, EMIT_ALL_SITES.
            discovery_mode: value passed to ``-glm``; one of SNP, INDEL,
                GENERALPLOIDYSNP, GENERALPLOIDYINDEL, BOTH.
            output_file: value passed to ``-o``.
            default_base_qualities: when not ``None``, passed as
                ``--defaultBaseQualities``. ``0`` is forwarded like any other
                integer; only ``None`` omits the option.

        Returns:
            None. The exit status of the shell command is not checked.
        """
        # Compare against None explicitly: a plain truthiness test would drop
        # an explicit quality of 0, which is a legitimate value to pass on.
        if default_base_qualities is None:
            default_qualities = ""
        else:
            default_qualities = "--defaultBaseQualities %i" % default_base_qualities

        gatk_dir = check_path(GATK_dir)
        os.system(
            " java -jar %sGenomeAnalysisTK.jar -nt %i -l INFO -R %s -T UnifiedGenotyper -I %s -stand_call_conf %i -stand_emit_conf %i -o %s --output_mode %s -glm %s %s"
            % (gatk_dir, num_of_threads, reference_file, alignment,
               stand_call_conf, stand_emit_conf, output_file, output_mode,
               discovery_mode, default_qualities))
Пример #2
0
def RepeatModeler_search(query_file, db_name, output_file="run.out",
                         num_of_threads=5, RepeatModeler_dir=""):
    """Build a RepeatModeler database from *query_file*, then run RepeatModeler.

    Two shell commands are issued one after the other, both with the
    ``ncbi`` engine: ``BuildDatabase`` creates the database *db_name* from
    *query_file*, and ``RepeatModeler`` runs on that database with
    *num_of_threads* passed to ``-pa`` and its standard output redirected
    to *output_file*.

    *RepeatModeler_dir* goes through ``check_path`` (defined elsewhere) and
    the result is used as a prefix of both executable names. Exit statuses
    are not checked; nothing is returned.
    """
    print("\nRepeatModeler search...\n")
    tool_prefix = check_path(RepeatModeler_dir)

    # Step 1: create the sequence database.
    build_db_command = tool_prefix + "BuildDatabase -engine ncbi  -name %s %s" % (
        db_name, query_file)
    os.system(build_db_command)

    # Step 2: model repeats on the database just built.
    model_command = tool_prefix + "RepeatModeler -engine ncbi -pa %i -database %s > %s" % (
        num_of_threads, db_name, output_file)
    os.system(model_command)
Пример #3
0
def TRF_search(query_file, match=2, mismatch=7, delta=7, PM=80,
               PI=10, minscore=50, max_period=500, flanked=False, TRF_dir=""):
    """Run Tandem Repeats Finder (``trf``) on *query_file* via the shell.

    Command-line layout of the tool (all weights, penalties and scores are
    positive)::

        trf File Match Mismatch Delta PM PI Minscore MaxPeriod [options]

        File      = sequences input file
        Match     = matching weight
        Mismatch  = mismatching penalty
        Delta     = indel penalty
        PM        = match probability (whole number)
        PI        = indel probability (whole number)
        Minscore  = minimum alignment score to report
        MaxPeriod = maximum period size to report
        [options] = one or more of:
            -m  masked sequence file
            -f  flanking sequence
            -d  data file
            -h  suppress HTML output

    Recommended invocation: ``trf yoursequence.txt 2 7 7 80 10 50 500 -f -d -m``.

    ``-d -m`` are always passed; ``-f`` is added only when *flanked* is
    true. *TRF_dir* goes through ``check_path`` (defined elsewhere) and the
    result prefixes the executable name. The exit status is not checked.
    """
    print("\nTRF search...\n")

    flank_option = "-f" if flanked else ""
    executable_prefix = check_path(TRF_dir)

    # Positional numeric arguments, in the order trf expects them.
    scoring_args = (match, mismatch, delta, PM, PI, minscore, max_period)
    command = executable_prefix + "trf %s %i %i %i %i %i %i %i %s -d -m" % (
        (query_file,) + scoring_args + (flank_option,))
    os.system(command)
Пример #4
0
def add_header2bam(input_bam,
                   output_bam,
                   RGID,
                   RGLB,
                   RGPL,
                   RGSM,
                   RGPU,
                   PICARD_dir=""):
    """Run Picard ``AddOrReplaceReadGroups.jar`` on *input_bam* via the shell.

    The five ``RG*`` arguments are passed to the Picard options of the same
    name. The command also requests ``SORT_ORDER=coordinate`` and
    ``CREATE_INDEX=True`` and writes to *output_bam*.

    *PICARD_dir* goes through ``check_path`` (defined elsewhere) and the
    result prefixes the jar name. The exit status is not checked.
    """
    jar_prefix = check_path(PICARD_dir)
    command_template = "java -XX:MaxDirectMemorySize=4G -jar %sAddOrReplaceReadGroups.jar I= %s O= %s SORT_ORDER=coordinate RGID=%s RGLB=%s  RGPL=%s RGSM=%s RGPU=%s CREATE_INDEX=True"
    read_group_fields = (RGID, RGLB, RGPL, RGSM, RGPU)
    os.system(command_template % ((jar_prefix, input_bam, output_bam) + read_group_fields))
Пример #5
0
def RepeatMasker_search(query_file, species, custom_lib_path=None, RepeatMasker_dir="",
                        num_of_threads=5, search_type="-s"):
    """Run RepeatMasker on *query_file* through a shell command.

    Args:
        query_file: sequence file given to RepeatMasker as its last argument.
        species: value passed to ``-species``; see the list of possible
            species in repeatmasker.help coming with RepeatMasker.
        custom_lib_path: optional path to a custom repeat library; when
            given it is passed as ``-lib <path>``.
        RepeatMasker_dir: directory holding the ``RepeatMasker`` executable;
            it is run through ``check_path`` (defined elsewhere) and the
            result is used as a prefix of the executable name.
        num_of_threads: integer passed to ``-pa``.
        search_type: speed/sensitivity switch: "-s" (sensitive),
            "" (default), "-q" (fast), "-qq" (very fast).

    Returns:
        None. The exit status of the shell command is not checked.
    """
    repmask_dir = check_path(RepeatMasker_dir)

    # Fix: the option string used to be assigned to a misspelled variable,
    # so ``custom_lib`` stayed empty and ``-lib`` never reached RepeatMasker.
    custom_lib = ""
    if custom_lib_path:
        custom_lib = "-lib %s" % custom_lib_path
    # NOTE(review): -species is always passed as well; confirm that the
    # installed RepeatMasker version accepts -lib together with -species.

    # Additional options always passed below:
    # -xm    creates an additional output file in cross_match format (for parsing)
    # -ace   creates an additional output file in ACeDB format
    # -gff   creates an additional Gene Feature Finding format
    # -excln The percentages displayed in the .tbl file are calculated using a
    #        total sequence length excluding runs of 25 Ns or more.
    print("\nRepeatMasker search...\n")
    os.system(repmask_dir + "RepeatMasker -excln -xm -ace -gff %s -pa %i -species %s %s %s"
              % (custom_lib, num_of_threads, species, search_type, query_file))
Пример #6
0
    for filename in dir_list:
        if ("_R1" in filename) and (sample_name in filename):
            left_reads = filename
        if ("_R2" in filename) and (sample_name in filename):
            right_reads = filename
    return left_reads, right_reads

# --- Reference genome -------------------------------------------------------
# Base name and directory of the reference; the FASTA path, the .dict path and
# the extension-less path below are all derived from these two values.
reference_name = "LAN210_v0.10m"
reference_dir = "/home/mahajrod/genetics/desaminases/data/LAN210_v0.10m/"
reference = "%s%s.fasta" % (reference_dir, reference_name)
reference_dict = "%s%s.dict" % (reference_dir, reference_name)
# Reference path without extension; passed to get_chromosomes_bed below.
reference_index = "%s%s" % (reference_dir, reference_name)

# --- Run configuration ------------------------------------------------------
workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/all"

gatk_dir = check_path("/home/mahajrod/Repositories/genetic/NGS_tools/GenomeAnalysisTK-3.2-0/")
platform = "illumina"
read_subdir = "trimmed/spades/corrected/"

# Sample list to process: exactly one samples_file assignment should be active;
# the commented-out lines are the alternative lists.
#samples_file = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/samples_nonwt.t"
#samples_file = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/samples_wt.t"
#samples_file = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/samples_nonHAP.t"
samples_file = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/samples_HAP.t"

alignment_dir = "alignment_LAN210_v0.10m"

# --- Reference preparation --------------------------------------------------
# Switch into the reference directory before generating auxiliary files;
# presumably so the relative output names used below (chromosomes.bed, mt.bed)
# end up next to the reference - confirm against get_chromosomes_bed.
os.chdir(reference_dir)
# Create the sequence dictionary for the reference with Picard.
make_fasta_dict(reference, reference_dict, PICARD_dir="/home/mahajrod/Repositories/genetic/NGS_tools/picard-tools-1.115/picard-tools-1.115/")
# get_chromosomes_bed is defined elsewhere; judging by its arguments it writes
# BED files for the chromosomes and for the "mt" region - TODO confirm.
get_chromosomes_bed(reference, reference_index, mitochondrial_region_name="mt",
                        chrom_out_file="chromosomes.bed", mito_out_file="mt.bed", reference_filetype="fasta")
# Run `samtools faidx` on the reference FASTA.
os.system("samtools faidx %s" % reference)
Пример #7
0
def snp_call_GATK(alignment,
                  sample_name,
                  reference_file,
                  known_sites_vcf,
                  stand_emit_conf=40,
                  stand_call_conf=100,
                  QD=2.0,
                  FS=60.0,
                  MQ=40.0,
                  HaplotypeScore=13.0,
                  MappingQualityRankSum=-12.5,
                  ReadPosRankSum=-8.0,
                  GATK_dir="",
                  num_of_threads=5,
                  skip_base_recalibration=False):
    """Run a GATK SNP-calling pipeline on one alignment via shell commands.

    Steps performed by the visible part of this function, in order:

    1. Unless *skip_base_recalibration* is true: two BaseRecalibrator passes
       (the second one with ``-BQSR`` pointing at the first table) followed
       by PrintReads, which writes ``<alignment>_recal_reads.bam``. That
       file is then used as the input of the following steps; otherwise
       *alignment* itself is used.
    2. UnifiedGenotyper with ``--output_mode EMIT_VARIANTS_ONLY``, writing
       ``<sample_name>_GATK_raw.vcf``.
    3. SelectVariants twice, writing ``<sample_name>_GATK_raw_no_indel.vcf``
       (``-selectType SNP``) and ``<sample_name>_GATK_raw_only_indel.vcf``
       (``-selectType INDEL``).
    4. VariantFiltration on the SNP file, writing
       ``<sample_name>_GATK_filtered_snps.vcf``.

    Args:
        alignment: alignment file passed to ``-I``.
        sample_name: prefix of every output file name produced here.
        reference_file: reference passed to ``-R`` in every step.
        known_sites_vcf: VCF passed to ``-knownSites`` of BaseRecalibrator.
        stand_emit_conf, stand_call_conf: integers passed to the
            UnifiedGenotyper options of the same name.
        QD, FS, MQ, HaplotypeScore, MappingQualityRankSum, ReadPosRankSum:
            thresholds substituted (with ``%f``) into the VariantFiltration
            filter expression.
        GATK_dir: directory holding ``GenomeAnalysisTK.jar``; it is run
            through ``check_path`` (defined elsewhere) and the result is
            used as a prefix of the jar name.
        num_of_threads: integer passed to ``-nct`` (BaseRecalibrator,
            PrintReads) and ``-nt`` (UnifiedGenotyper).
        skip_base_recalibration: when true, step 1 is skipped.

    None of the shell exit statuses is checked.

    NOTE(review): the remainder of this function lies beyond the visible
    part of the file; this description covers only the steps listed above.
    """
    #default filter expression
    #"QD < 2.0 || FS > 60.0 || MQ < 40.0 || HaplotypeScore > 13.0 || MappingQualityRankSum < -12.5 || ReadPosRankSum < -8.0"
    gatk_dir = check_path(GATK_dir)
    # Input of the calling step; replaced by the recalibrated BAM below
    # unless recalibration is skipped.
    intermediate_alignment = alignment
    if not skip_base_recalibration:
        intermediate_alignment = alignment + "_recal_reads.bam"
        #Analyze patterns of covariation in the sequence dataset
        os.system(
            "java -jar %sGenomeAnalysisTK.jar -nct %i  -T BaseRecalibrator -R %s -I %s -knownSites %s -o %s_recal_data.table"
            % (gatk_dir, num_of_threads, reference_file, alignment,
               known_sites_vcf, sample_name))
        #Do a second pass to analyze covariation remaining after recalibration
        os.system(
            "java -jar %sGenomeAnalysisTK.jar -nct %i  -T BaseRecalibrator -R %s -I %s -knownSites %s  -BQSR %s_recal_data.table -o %s_post_recal_data.table"
            % (gatk_dir, num_of_threads, reference_file, alignment,
               known_sites_vcf, sample_name, sample_name))

        #Generate before/after plots
        #os.system("java -jar %sGenomeAnalysisTK.jar -T AnalyzeCovariates -R %s -before %s_recal_data.table -after %s_post_recal_data.table -plots %s_recalibration_plots.pdf"
        #          % (gatk_dir, reference_file, sample_name, sample_name, sample_name))

        #Apply the recalibration to your sequence data
        os.system(
            "java -jar %sGenomeAnalysisTK.jar -nct %i -T PrintReads -R %s -I %s -BQSR %s_recal_data.table -o %s"
            % (gatk_dir, num_of_threads, reference_file, alignment,
               sample_name, intermediate_alignment))
    print("\nSNP call...\n")
    #SNP call
    os.system(
        " java -jar %sGenomeAnalysisTK.jar -nt %i -l INFO -R %s -T UnifiedGenotyper -I %s -stand_call_conf %i -stand_emit_conf %i  -o %s_GATK_raw.vcf --output_mode EMIT_VARIANTS_ONLY"
        % (gatk_dir, num_of_threads, reference_file, intermediate_alignment,
           stand_call_conf, stand_emit_conf, sample_name))
    #extract SNP
    os.system(
        "java -jar %sGenomeAnalysisTK.jar -T SelectVariants -R %s -V %s_GATK_raw.vcf -selectType SNP -o %s_GATK_raw_no_indel.vcf"
        % (gatk_dir, reference_file, sample_name, sample_name))
    #extract indels
    os.system(
        "java -jar %sGenomeAnalysisTK.jar -T SelectVariants -R %s -V %s_GATK_raw.vcf -selectType INDEL -o %s_GATK_raw_only_indel.vcf"
        % (gatk_dir, reference_file, sample_name, sample_name))

    #filtering
    print("\nFiltering SNP...\n")
    # Records matching the expression are tagged with the filter name given
    # in the command; the thresholds come from the keyword arguments.
    os.system(
        "java -jar %sGenomeAnalysisTK.jar -T VariantFiltration -R %s -V %s_GATK_raw_no_indel.vcf --filterExpression 'QD < %f || FS > %f || MQ < %f || HaplotypeScore > %f || MappingQualityRankSum < %f || ReadPosRankSum < %f' --filterName 'ambigious_snp' -o %s_GATK_filtered_snps.vcf "
        % (gatk_dir, reference_file, sample_name, QD, FS, MQ, HaplotypeScore,
           MappingQualityRankSum, ReadPosRankSum, sample_name))
    #os.system("vcftools --vcf %s_GATK_filtered_snps.vcf --remove-filtered-all --out %s_GATK_best_snps.vcf --recode --recode-INFO-all"
    #          % (sample_name, sample_name ))
    """
Пример #8
0
def windowmasker_search(windowmasker_dir):
    """Placeholder for a WindowMasker wrapper; not implemented yet.

    For now it only passes *windowmasker_dir* through ``check_path``
    (defined elsewhere) and returns ``None``.
    """
    # TODO: write this function
    # The resolved directory is not used yet.
    winmask_prefix = check_path(windowmasker_dir)
    return None
Пример #9
0
def rmout2gff3(rmoutfile, outfile, RepeatMaskerUtils_dir=""):
    """Run ``rmOutToGFF3.pl`` on *rmoutfile*, redirecting its output to *outfile*.

    *RepeatMaskerUtils_dir* goes through ``check_path`` (defined elsewhere)
    and the result prefixes the script name. The exit status of the shell
    command is not checked.
    """
    utils_prefix = check_path(RepeatMaskerUtils_dir)
    script_call = "rmOutToGFF3.pl %s > %s" % (rmoutfile, outfile)
    os.system(utils_prefix + script_call)
Пример #10
0
def extract_repbase(species, output_file="RepBase.fasta", RepeatMaskerUtils_dir=""):
    """Run ``queryRepeatDatabase.pl -species <species>``, redirecting output to *output_file*.

    *RepeatMaskerUtils_dir* goes through ``check_path`` (defined elsewhere)
    and the result prefixes the script name. The exit status of the shell
    command is not checked.
    """
    print("\nExtracting RepBase for %s\n" % species)
    utils_prefix = check_path(RepeatMaskerUtils_dir)
    query_call = "queryRepeatDatabase.pl -species %s > %s" % (species, output_file)
    os.system(utils_prefix + query_call)
Пример #11
0
def make_fasta_dict(fasta_file, dict_name, PICARD_dir=""):
    """Run Picard ``CreateSequenceDictionary.jar`` with ``R=`` *fasta_file* and ``O=`` *dict_name*.

    *PICARD_dir* goes through ``check_path`` (defined elsewhere) and the
    result prefixes the jar name. The exit status of the shell command is
    not checked.
    """
    jar_prefix = check_path(PICARD_dir)
    command = "java -jar %sCreateSequenceDictionary.jar R= %s O= %s" % (
        jar_prefix, fasta_file, dict_name)
    os.system(command)