Exemplo n.º 1
0
import os
import argparse
from RouToolPa.Tools.Filter import FaCut

#from RouToolPa.Tools.Filter import FastQC

from RouToolPa.Routines import FileRoutines

# Command-line interface of the script.
parser = argparse.ArgumentParser()

# Directory holding the samples; the value is made absolute and then passed
# through FileRoutines.check_path before being stored in args.samples_dir.
parser.add_argument("-d",
                    "--sample_directory",
                    action="store",
                    dest="samples_dir",
                    required=True,
                    type=lambda s: FileRoutines.check_path(os.path.abspath(s)),
                    help="Directory with samples")
# Optional subset of sample subdirectories, stored as the raw comma-separated
# string in args.samples. Every fragment of the help text ends with a space
# so that implicit string concatenation does not glue sentences together.
parser.add_argument(
    "-s",
    "--samples",
    action="store",
    dest="samples",
    help="Comma-separated list of subdirectories (one per sample) to handle. "
    "If not set, all subdirectories will be considered as containing samples. "
    "Each sample directory should contain one (in case of SE reads) or two "
    "(in case of PE reads) files. "
    "Filenames should contain '_1.fq' or '_1.fastq' for forward (left) reads, "
    "'_2.fq' or '_2.fastq' for reverse (right) reads and '.fq' or '.fastq' for SE reads."
)
parser.add_argument(
    "-o",
    "--output_dir",
Exemplo n.º 2
0
                    dest="indel_ReadPosRankSum",
                    type=float,
                    default=-20.0,
                    help="Indel ReadPosRankSum threshold. Default -   -20.0")
#parser.add_argument("--indel_InbreedingCoeff", action="store", dest="indel_InbreedingCoeff", type=float, default=-0.8,
#                    help="Indel InbreedingCoeff threshold. Default -   -0.8")
# Indel FS threshold option: parsed as float, 200.0 when not given.
parser.add_argument(
    "--indel_FS",
    action="store",
    dest="indel_FS",
    type=float,
    default=200.0,
    help="Indel FS threshold. Default - 200.0",
)

args = parser.parse_args()

# The GATK wrapper reads the jar location from its jar_path attribute.
VariantFiltration.jar_path = FileRoutines.check_path(args.gatk_dir)

VariantFiltration.filter_bad_variants(
    args.reference,
    args.input_vcf,
    args.output_prefix,
    snp_filter_name=args.snp_filter_name,
    snp_QD=args.snp_QD,
    snp_FS=args.snp_FS,
    snp_MQ=args.snp_MQ,
    #snp_HaplotypeScore=args.snp_HaplotypeScore,
    snp_MappingQualityRankSum=args.snp_MappingQualityRankSum,
    snp_ReadPosRankSum=args.snp_ReadPosRankSum,
    indel_filter_name=args.indel_filter_name,
    indel_QD=args.indel_QD,
    indel_ReadPosRankSum=args.indel_ReadPosRankSum,
Exemplo n.º 3
0
# Optional ID lists: each one is only filled when the corresponding file
# was supplied on the command line.
white_list = IdList()
if args.black_list_file:
    black_list.read(args.black_list_file)
if args.white_list_file:
    white_list.read(args.white_list_file)
# Two tab-separated tables receive the same header line: the main output
# (args.cafe_file) and "filtered_families.cafe".
out_fd = open(args.cafe_file, "w")
# NOTE(review): the directory and the file name are joined without a
# separator, so args.filtered_family_dir presumably already ends with a
# path separator - confirm against the argument definition.
filtered_fd = open("%sfiltered_families.cafe" % args.filtered_family_dir, "w")
out_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list)))
filtered_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list)))
# One "<species>.fam" output handle per species, kept in insertion order.
species_filtered_fd_list = OrderedDict()
fam_count_dict = TwoLvlDict()
species_family_dict = TwoLvlDict()
# NOTE(review): this loop iterates args.species_set while the headers above
# are built from species_list - presumably the same species; confirm.
for species in args.species_set:
    # Read "<input><species><suffix>": tab-separated file whose values are
    # comma-separated lists.
    species_family_dict[species] = SynDict()
    species_family_dict[species].read(
        "%s%s%s" % (FileRoutines.check_path(args.input), species, args.suffix),
        split_values=True,
        values_separator=",",
        separator="\t")
    #print species_family_dict[species]
    # Store the result of count_synonyms() per species (presumably the number
    # of entries per family - confirm against SynDict).
    fam_count_dict[species] = species_family_dict[species].count_synonyms()
    #print fam_count_dict[species]
    species_filtered_fd_list[species] = open(
        "%s%s.fam" % (args.filtered_family_dir, species), "w")

for family in fam_count_dict.sl_keys():
    genes_number_list = []
    number_of_species = 0
    for species in species_list:
        genes_number_list.append(fam_count_dict[species][family] if family in
                                 fam_count_dict[species] else 0)
Exemplo n.º 4
0
                    action="store",
                    dest="max_memory_per_thread",
                    default="1G",
                    help="Maximum memory per thread. Default - 1G")
args = parser.parse_args()

# -p/--prepare_bam requires both an output prefix and a temp directory.
if args.prepare_bam and ((not args.prepared_bam_prefix) or
                         (not args.temp_dir)):
    raise ValueError(
        "Options -e/--prepared_bam_prefix and -m/--temp_dir must be set if -p/--prepare_bam option is used"
    )

# The thread count is set as an attribute on the SamtoolsV1 wrapper.
SamtoolsV1.threads = args.threads

# NOTE(review): this branch also runs when only args.mix_ends is set, while
# the validation above covers args.prepare_bam only; confirm that
# args.temp_dir and args.prepared_bam_prefix are always usable in that case.
if args.prepare_bam or args.mix_ends:
    FileRoutines.safe_mkdir(FileRoutines.check_path(args.temp_dir))
    prepared_pe_bam_file = "%s.bam" % args.prepared_bam_prefix
    # A separate BAM for unpaired reads is requested only with args.mix_ends;
    # otherwise None is passed on.
    prepared_unpaired_bam_file = (
        "%s.unpaired.bam" %
        args.prepared_bam_prefix) if args.mix_ends else None
    # The bare string below holds a disabled variant of the call; as an
    # expression statement it has no effect at run time.
    """
    SamtoolsV1.prepare_bam_for_read_extraction(args.input, args.prepared_bam, temp_file_prefix=args.temp_dir,
                                               max_memory_per_thread=args.max_memory_per_thread)
    """
    SamtoolsV1.prepare_bam_for_read_extraction(
        args.input,
        prepared_pe_bam_file,
        temp_file_prefix=args.temp_dir,
        max_memory_per_thread=args.max_memory_per_thread,
        bam_file_to_write_unpaired_reads=prepared_unpaired_bam_file)
if args.paired:
Exemplo n.º 5
0
def snp_call_GATK(alignment,
                  sample_name,
                  reference_file,
                  known_sites_vcf,
                  stand_emit_conf=40,
                  stand_call_conf=100,
                  QD=2.0,
                  FS=60.0,
                  MQ=40.0,
                  HaplotypeScore=13.0,
                  MappingQualityRankSum=-12.5,
                  ReadPosRankSum=-8.0,
                  GATK_dir="",
                  num_of_threads=5,
                  skip_base_recalibration=False):
    """Call and filter variants for one sample by running GATK shell commands.

    Steps, each started with os.system: optional base quality score
    recalibration (BaseRecalibrator twice, then PrintReads), variant calling
    with UnifiedGenotyper, splitting the raw calls into SNPs and indels with
    SelectVariants, and marking SNPs that fail the thresholds with
    VariantFiltration (filter name 'ambigious_snp').

    Args:
        alignment: Alignment file passed as -I. With recalibration enabled
            the recalibrated reads are written to
            ``alignment + "_recal_reads.bam"`` and that file is used for
            calling instead.
        sample_name: Prefix of the recalibration tables and VCF files that
            are written.
        reference_file: Reference passed as -R to every command.
        known_sites_vcf: Passed as -knownSites to BaseRecalibrator; not used
            when skip_base_recalibration is true.
        stand_emit_conf: Value for -stand_emit_conf (formatted with %i).
        stand_call_conf: Value for -stand_call_conf (formatted with %i).
        QD, FS, MQ, HaplotypeScore, MappingQualityRankSum, ReadPosRankSum:
            Thresholds substituted into the SNP filter expression.
        GATK_dir: Passed through FileRoutines.check_path and prepended
            directly to "GenomeAnalysisTK.jar".
        num_of_threads: Value for -nct (BaseRecalibrator, PrintReads) and
            -nt (UnifiedGenotyper).
        skip_base_recalibration: When true, the recalibration commands are
            skipped and ``alignment`` is genotyped as is.

    NOTE(review): commands are assembled by string interpolation and run
    through the shell; the os.system return values are not checked, so a
    failed step does not stop the following ones.
    """
    # Default filter expression:
    #"QD < 2.0 || FS > 60.0 || MQ < 40.0 || HaplotypeScore > 13.0 || MappingQualityRankSum < -12.5 || ReadPosRankSum < -8.0"
    gatk_dir = FileRoutines.check_path(GATK_dir)
    # File that is genotyped: the input itself unless recalibration below
    # replaces it.
    intermediate_alignment = alignment
    if not skip_base_recalibration:
        intermediate_alignment = alignment + "_recal_reads.bam"
        # Analyze patterns of covariation in the sequence dataset
        os.system(
            "java -jar %sGenomeAnalysisTK.jar -nct %i  -T BaseRecalibrator -R %s -I %s -knownSites %s -o %s_recal_data.table"
            % (gatk_dir, num_of_threads, reference_file, alignment,
               known_sites_vcf, sample_name))
        # Do a second pass to analyze covariation remaining after recalibration
        os.system(
            "java -jar %sGenomeAnalysisTK.jar -nct %i  -T BaseRecalibrator -R %s -I %s -knownSites %s  -BQSR %s_recal_data.table -o %s_post_recal_data.table"
            % (gatk_dir, num_of_threads, reference_file, alignment,
               known_sites_vcf, sample_name, sample_name))

        # Generate before/after plots (disabled)
        #os.system("java -jar %sGenomeAnalysisTK.jar -T AnalyzeCovariates -R %s -before %s_recal_data.table -after %s_post_recal_data.table -plots %s_recalibration_plots.pdf"
        #          % (gatk_dir, reference_file, sample_name, sample_name, sample_name))

        # Apply the recalibration to the sequence data
        os.system(
            "java -jar %sGenomeAnalysisTK.jar -nct %i -T PrintReads -R %s -I %s -BQSR %s_recal_data.table -o %s"
            % (gatk_dir, num_of_threads, reference_file, alignment,
               sample_name, intermediate_alignment))
    print("\nSNP call...\n")
    # Call variants; only variant sites are emitted (EMIT_VARIANTS_ONLY)
    os.system(
        " java -jar %sGenomeAnalysisTK.jar -nt %i -l INFO -R %s -T UnifiedGenotyper -I %s -stand_call_conf %i -stand_emit_conf %i  -o %s_GATK_raw.vcf --output_mode EMIT_VARIANTS_ONLY"
        % (gatk_dir, num_of_threads, reference_file, intermediate_alignment,
           stand_call_conf, stand_emit_conf, sample_name))
    # Extract SNPs from the raw calls
    os.system(
        "java -jar %sGenomeAnalysisTK.jar -T SelectVariants -R %s -V %s_GATK_raw.vcf -selectType SNP -o %s_GATK_raw_no_indel.vcf"
        % (gatk_dir, reference_file, sample_name, sample_name))
    # Extract indels from the raw calls
    os.system(
        "java -jar %sGenomeAnalysisTK.jar -T SelectVariants -R %s -V %s_GATK_raw.vcf -selectType INDEL -o %s_GATK_raw_only_indel.vcf"
        % (gatk_dir, reference_file, sample_name, sample_name))

    # Filtering: SNPs matching the expression are tagged 'ambigious_snp'
    print("\nFiltering SNP...\n")
    os.system(
        "java -jar %sGenomeAnalysisTK.jar -T VariantFiltration -R %s -V %s_GATK_raw_no_indel.vcf --filterExpression 'QD < %f || FS > %f || MQ < %f || HaplotypeScore > %f || MappingQualityRankSum < %f || ReadPosRankSum < %f' --filterName 'ambigious_snp' -o %s_GATK_filtered_snps.vcf "
        % (gatk_dir, reference_file, sample_name, QD, FS, MQ, HaplotypeScore,
           MappingQualityRankSum, ReadPosRankSum, sample_name))
    #os.system("vcftools --vcf %s_GATK_filtered_snps.vcf --remove-filtered-all --out %s_GATK_best_snps.vcf --recode --recode-INFO-all"
    #          % (sample_name, sample_name ))
    """
Exemplo n.º 6
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Tools.GATK import SelectVariants
from RouToolPa.Routines import FileRoutines


# Command-line options: input and output VCF, reference FASTA and the
# directory holding the GATK jar.
parser = argparse.ArgumentParser()

parser.add_argument(
    "-i",
    "--input_vcf",
    action="store",
    dest="input_vcf",
    required=True,
    help="Input vcf file",
)
parser.add_argument(
    "-o",
    "--output_vcf",
    action="store",
    dest="output_vcf",
    required=True,
    help="Output vcf file",
)
parser.add_argument(
    "-r",
    "--reference",
    action="store",
    dest="reference",
    required=True,
    help="Fasta with reference genome",
)
parser.add_argument(
    "-g",
    "--gatk_directory",
    action="store",
    dest="gatk_dir",
    default="",
    help="Directory with GATK jar",
)
args = parser.parse_args()

# The wrapper reads the jar location from its jar_path attribute, so it is
# set before the tool is invoked.
SelectVariants.jar_path = FileRoutines.check_path(args.gatk_dir)
SelectVariants.remove_entries_with_filters(
    args.reference,
    args.input_vcf,
    args.output_vcf,
)