示例#1
0
def prepare_readgroup(forward_read, logger):
    """Build a quoted @RG read-group string from a FASTQ file's first line.

    The read-group ID is the second ':'-separated field of the first line of
    ``forward_read`` (surrounding whitespace stripped); the sample name is
    the basename of ``forward_read``.

    Args:
        forward_read: Path to the forward read file. Names ending in
            ``.gz`` are read through gzip; ``.fastq`` and ``.fq`` are read
            as plain text.
        logger: Logger object passed through to ``keep_logging``.

    Returns:
        The read-group string wrapped in double quotes, with fields joined
        by a literal backslash followed by ``t``; ``None`` if the file
        extension is not one of the three handled.

    Raises:
        IndexError: If the first line contains no ':'.
    """
    keep_logging('Preparing ReadGroup Info', 'Preparing ReadGroup Info',
                 logger, 'info')
    samplename = os.path.basename(forward_read)

    if forward_read.endswith(".gz"):
        opener, mode = gzip.open, 'rb'
    elif forward_read.endswith((".fastq", ".fq")):
        opener, mode = open, 'r'
    else:
        return None

    # Close the handle explicitly; the previous version leaked it.
    handle = opener(forward_read, mode)
    try:
        first_line = handle.readline()
    finally:
        handle.close()

    # Binary gzip reads yield bytes on Python 3; on Python 2 this is a no-op.
    if not isinstance(first_line, str):
        first_line = first_line.decode('utf-8', 'replace')

    # Strip so a trailing newline can never end up inside the ID field.
    id_name = re.split(r":", first_line)[1].strip()
    return ("\"" + "@RG" + "\\tID:" + id_name + "\\tSM:" + samplename +
            "\\tLB:1\\tPL:Illumina" + "\"")
示例#2
0
def prepare_indel_gatk(out_finalbam, out_path, analysis, reference, logger,
                       Config):
    """Call variants with ``gatkhaplotypecaller`` and extract the indels.

    Args:
        out_finalbam: BAM file forwarded to ``gatkhaplotypecaller``.
        out_path: Output directory forwarded to ``gatkhaplotypecaller``.
        analysis: Analysis name forwarded to ``gatkhaplotypecaller``.
        reference: Config section name holding ``ref_path`` and ``ref_name``.
        logger: Logger object passed to ``keep_logging`` and ``call``.
        Config: Config object read through ``ConfigSectionMap``.

    Returns:
        Path of the indel VCF, i.e. the raw VCF path plus ``_indel.vcf``.

    Raises:
        SystemExit: With status 1 if the raw VCF does not exist.
    """
    # Local import keeps this block self-contained.
    import sys

    ref_section = ConfigSectionMap(reference, Config)
    reference_filename = (ref_section['ref_path'] + "/" +
                          ref_section['ref_name'])
    final_raw_vcf = gatkhaplotypecaller(out_finalbam, out_path, reference,
                                        analysis, logger, Config)
    if not os.path.isfile(final_raw_vcf):
        keep_logging('Error in GATK Haplotype Variant Calling step. Exiting.',
                     'Error in GATK Haplotype Variant Calling step. Exiting.',
                     logger, 'exception')
        # A bare exit() ends with status 0, which hides the failure.
        sys.exit(1)

    print("GATK Haplotype caller: Extracting indels from raw vcf files")
    indel_file_name = final_raw_vcf + "_indel.vcf"
    gatk_section = ConfigSectionMap("gatk", Config)
    base_cmd = (ConfigSectionMap("bin_path", Config)['binbase'] + "/" +
                gatk_section['gatk_bin'] + "/" + gatk_section['base_cmd'])
    cmd = "java -jar %s -T SelectVariants -R %s -V %s -selectType INDEL -o %s" % (
        base_cmd, reference_filename, final_raw_vcf, indel_file_name)
    # Log before running so the command is recorded even if call() raises.
    keep_logging('Running Command: [%s]' % cmd,
                 'Running Command: [%s]' % cmd, logger, 'info')
    call(cmd, logger)
    return indel_file_name
示例#3
0
def samtools(out_finalbam, out_path, reference_filename, analysis, logger,
             Config):
    """Run ``samtools mpileup`` piped into ``bcftools call`` and return the VCF.

    Args:
        out_finalbam: BAM file given to mpileup.
        out_path: Directory that receives ``<analysis>_aln_mpileup_raw.vcf``.
        reference_filename: Config section name holding ``ref_path`` and
            ``ref_name`` (despite the parameter name it is a section key).
        analysis: Prefix of the output file name.
        logger: Logger object passed to ``keep_logging`` and ``call``.
        Config: Config object read through ``ConfigSectionMap``.

    Returns:
        Path of the raw VCF file.

    Raises:
        SystemExit: With status 1 if ``call`` raises ``CalledProcessError``.
    """
    binbase = ConfigSectionMap("bin_path", Config)['binbase']
    samtools_cfg = ConfigSectionMap("samtools", Config)
    samtools_exe = (binbase + "/" + samtools_cfg['samtools_bin'] + "/" +
                    samtools_cfg['base_cmd'])
    mpileup_parameters = samtools_cfg['mpileup_parameters']

    ref_cfg = ConfigSectionMap(reference_filename, Config)
    reference = ref_cfg['ref_path'] + "/" + ref_cfg['ref_name']

    bcftools_cfg = ConfigSectionMap("bcftools", Config)
    # NOTE(review): no "/" between bcftools_bin and base_cmd, unlike the
    # samtools path above -- confirm the config value ends with a slash.
    bcftools_exe = (binbase + "/" + bcftools_cfg['bcftools_bin'] +
                    bcftools_cfg['base_cmd'])

    final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf" % (out_path, analysis)
    cmd = "%s mpileup %s %s %s | %s call -O v -v -c -o %s" % (
        samtools_exe, mpileup_parameters, reference, out_finalbam,
        bcftools_exe, final_raw_vcf)
    keep_logging("COMMAND: " + cmd, cmd, logger, 'debug')
    try:
        call(cmd, logger)
    except sp.CalledProcessError:
        failure = 'Error in Samtools Variant Calling step. Exiting.'
        keep_logging(failure, failure, logger, 'exception')
        sys.exit(1)
    return final_raw_vcf
示例#4
0
def vcfstats(final_raw_vcf, out_path, analysis, logger, Config):
    """Compress and index a VCF, then write ``vcf-stats`` output for it.

    Args:
        final_raw_vcf: Path of the VCF to process.
        out_path: Directory that receives the outputs.
        analysis: Prefix of the output file names.
        logger: Logger object passed to ``keep_logging`` and ``call``.
        Config: Config object read through ``ConfigSectionMap``.

    Returns:
        Path of the stats file, ``<out_path>/<analysis>_vcf_stats``.

    Raises:
        SystemExit: With status 1 if any command raises
            ``CalledProcessError``.
    """
    binbase = ConfigSectionMap("bin_path", Config)['binbase']
    vcftools_cfg = ConfigSectionMap("vcftools", Config)
    tabix_dir = binbase + "/" + vcftools_cfg['tabix_bin']
    # Not used by any command below; the lookup is kept so that a missing
    # 'vcftools_perl_bin' key still fails here as before.
    vcftools_perl_dir = binbase + "/" + vcftools_cfg['vcftools_perl_bin']
    vcftools_dir = binbase + "/" + vcftools_cfg['vcftools_bin']

    vcf_stats_file = "%s/%s_vcf_stats" % (out_path, analysis)

    # NOTE(review): bgzip writes to a fixed "<analysis>_aln_mpileup_raw.vcf_
    # 5bp_indel_removed.vcf.gz" name while tabix and vcf-stats read
    # "<final_raw_vcf>.gz" -- these only coincide for one specific input
    # path; confirm against the caller.
    pipeline = [
        "%s/bgzip -f -c %s > %s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf.gz" % (
            tabix_dir, final_raw_vcf, out_path, analysis),
        "%s/tabix -f %s.gz" % (tabix_dir, final_raw_vcf),
        "%s/vcf-stats %s.gz > %s" % (vcftools_dir, final_raw_vcf,
                                     vcf_stats_file),
    ]
    for step in pipeline:
        keep_logging(step, step, logger, 'debug')

    try:
        for step in pipeline:
            call(step, logger)
    except sp.CalledProcessError:
        failure = 'Error in vcftools vcf stats step. Exiting.'
        keep_logging(failure, failure, logger, 'exception')
        sys.exit(1)

    summary = 'VCF Stats file: {}'.format(vcf_stats_file)
    keep_logging(summary, summary, logger, 'debug')
    return vcf_stats_file
示例#5
0
def mlst(filenames_array, Config, logger, output_folder, type, samples, mlst_directory, cluster, scheduler, mlstdb):
    """Build an ``ariba run`` command for each forward read file.

    For paired-end input, the reverse read file name and the sample prefix
    are derived from the first known forward-read marker found in each file
    name. When ``cluster == "cluster"`` the command is handed to
    ``generate_cluster_jobs``; in any other mode the command is only built,
    not executed, by this function.

    Args:
        filenames_array: Iterable of forward read file paths.
        Config: Config object forwarded to ``generate_cluster_jobs``.
        logger: Logger object passed to ``keep_logging``.
        output_folder: Directory used as the ariba output / job prefix.
        type: ``"PE"`` or ``"SE"``; single-end input is only logged.
        samples: Unused here.
        mlst_directory: Unused here.
        cluster: Execution mode; only ``"cluster"`` triggers job creation.
        scheduler: Forwarded to ``generate_cluster_jobs``.
        mlstdb: Database path placed in the ariba command.

    Returns:
        None.
    """
    if type == "SE":
        keep_logging('Ariba requires PE files', 'Ariba requires PE files', logger, 'debug')
        return
    if type != "PE":
        return

    # (forward marker, reverse marker, also apply the broader "_S.*_" strip).
    # Order matters: the first marker contained in the file name wins.
    # Markers are matched literally; the old regex search let "." match any
    # character, which could select a branch whose replace() then did nothing.
    pair_markers = (
        ("R1_001_final.fastq.gz", "R2_001_final.fastq.gz", False),
        ("_R1.fastq.gz", "_R2.fastq.gz", False),
        ("R1.fastq.gz", "R2.fastq.gz", True),
        ("1_combine.fastq.gz", "2_combine.fastq.gz", False),
        ("1_sequence.fastq.gz", "2_sequence.fastq.gz", False),
        ("_forward.fastq.gz", "_reverse.fastq.gz", False),
        ("R1_001.fastq.gz", "R2_001.fastq.gz", False),
        ("_1.fastq.gz", "_2.fastq.gz", False),
        (".1.fastq.gz", ".2.fastq.gz", False),
    )

    for forward_file in filenames_array:
        for forward_marker, reverse_marker, broad_strip in pair_markers:
            if forward_marker in forward_file:
                break
        else:
            # Previously an unmatched file reused the names computed for the
            # preceding file (or raised on the first one); skip it instead.
            skipped = 'Skipping %s: no known forward-read suffix' % forward_file
            keep_logging(skipped, skipped, logger, 'debug')
            continue

        second_part = forward_file.replace(forward_marker, reverse_marker)
        # Every branch now derives first_part from the current file; the
        # ".1.fastq.gz" branch used to read it before assigning it.
        first_part = forward_file.split(forward_marker)[0].replace('_L001', '')
        if broad_strip:
            first_part = re.sub("_S.*_", "", first_part)
        first_part = re.sub("_S[0-9].*_", "", first_part)

        mlst_cmd = "ariba run --verbose --force %s %s %s %s/%s --tmp_dir /tmp/" % (mlstdb, forward_file, second_part, output_folder, os.path.basename(first_part))
        if cluster == "cluster":
            job_prefix = "%s/%s" % (output_folder, os.path.basename(first_part))
            generate_cluster_jobs(mlst_cmd, job_prefix, scheduler, Config, logger)
示例#6
0
def prepare_readgroup(forward_read, aligner, logger):
    """Build the read-group argument for an aligner from a FASTQ header.

    The read-group ID is taken from the first line of ``forward_read``: the
    second ':'-separated field if the line contains ':', otherwise the
    second '/'-separated field if it contains '/', otherwise ``"1"``.
    Trailing whitespace is removed from the ID. All supported extensions
    (``.gz``, ``.fastq``, ``.fq``) share this parsing and honour ``aligner``.

    Args:
        forward_read: Path to the forward read file.
        aligner: ``"bowtie"`` selects the ``--rg-id ... --rg ...`` form; any
            other value (including ``"bwa"``) selects the quoted @RG form.
        logger: Logger object passed through to ``keep_logging``.

    Returns:
        The read-group string, or ``None`` if the file extension is not one
        of the three handled.
    """
    keep_logging('Preparing ReadGroup Info', 'Preparing ReadGroup Info',
                 logger, 'info')
    samplename = os.path.basename(forward_read)

    if forward_read.endswith(".gz"):
        opener, mode = gzip.open, 'rb'
    elif forward_read.endswith((".fastq", ".fq")):
        opener, mode = open, 'r'
    else:
        return None

    # Close the handle explicitly; the previous version leaked it.
    handle = opener(forward_read, mode)
    try:
        first_line = handle.readline()
    finally:
        handle.close()

    # Binary gzip reads yield bytes on Python 3; on Python 2 this is a no-op.
    if not isinstance(first_line, str):
        first_line = first_line.decode('utf-8', 'replace')

    if ":" in first_line:
        id_name = first_line.split(":")[1].rstrip()
    elif "/" in first_line:
        id_name = first_line.split("/")[1].rstrip()
    else:
        id_name = "1"

    # Format from id_name directly. The old code indexed split_field[1]
    # after split_field had been overwritten with the @RG string in the "/"
    # and fallback branches, which produced the single character "@".
    if aligner == "bowtie":
        return "--rg-id %s --rg SM:%s --rg LB:1 --rg PL:Illumina" % (
            id_name, samplename)
    return ("\"" + "@RG" + "\\tID:" + id_name + "\\tSM:" + samplename +
            "\\tLB:1\\tPL:Illumina" + "\"")
示例#7
0
def multiqc(analysis_folder, filename, Config, logger,
            Multiqc_reports_directory):
    """Run MultiQC over ``analysis_folder``.

    Args:
        analysis_folder: Folder passed to MultiQC as its input.
        filename: Value for MultiQC's ``--filename`` option.
        Config: Config object; ``[multiqc] base_cmd`` is the executable.
        logger: Logger object passed to ``keep_logging`` and ``call``.
        Multiqc_reports_directory: Value for MultiQC's ``--outdir`` option.
    """
    keep_logging('', "Running MultiQC on %s" % analysis_folder, logger,
                 'debug')
    multiqc_exe = ConfigSectionMap("multiqc", Config)['base_cmd']
    run_multiqc_cmd = "%s %s --force --filename %s --outdir %s" % (
        multiqc_exe, analysis_folder, filename, Multiqc_reports_directory)
    keep_logging('', run_multiqc_cmd, logger, 'debug')
    call(run_multiqc_cmd, logger)
示例#8
0
def alignment_stats(out_sorted_bam, out_path, analysis, logger, Config):
    """Run ``flagstat`` on a sorted BAM and log where the stats were written.

    Returns:
        Whatever ``flagstat`` returns for the given arguments.
    """
    stats_path = flagstat(out_sorted_bam, out_path, analysis, logger, Config)
    note = 'The Alignments Stats file from Samtools: {}'.format(stats_path)
    keep_logging(note, note, logger, 'debug')
    return stats_path
def index_bam(out_sort_bam, out_path, logger, Config):
    """Index a BAM file with ``samtools index``.

    Args:
        out_sort_bam: BAM file to index.
        out_path: Unused here.
        logger: Logger object passed to ``keep_logging`` and ``call``.
        Config: Config object read through ``ConfigSectionMap``.

    Raises:
        SystemExit: With status 1 if ``call`` raises ``CalledProcessError``.
    """
    samtools_cfg = ConfigSectionMap("samtools", Config)
    samtools_exe = "/".join([
        ConfigSectionMap("bin_path", Config)['binbase'],
        samtools_cfg['samtools_bin'],
        samtools_cfg['base_cmd'],
    ])
    cmd = "%s index %s" % (samtools_exe, out_sort_bam)
    keep_logging(cmd, cmd, logger, 'info')
    try:
        call(cmd, logger)
    except sp.CalledProcessError:
        failure = 'Error in Samtools Indexing step. Exiting.'
        keep_logging(failure, failure, logger, 'exception')
        sys.exit(1)
示例#10
0
def flagstat(out_sorted_bam, out_path, analysis, logger, Config, command_list):
    """Queue a ``samtools flagstat`` command instead of running it.

    Args:
        out_sorted_bam: BAM file to report on.
        out_path: Directory for ``<analysis>_alignment_stats``.
        analysis: Prefix of the stats file name.
        logger: Logger object passed to ``keep_logging``.
        Config: Config object; ``[samtools] base_cmd`` is the executable.
        command_list: List that receives the command string (mutated).

    Returns:
        The same ``command_list`` object, with the command appended.
    """
    base_cmd = ConfigSectionMap("samtools", Config)['base_cmd']
    cmd = "%s flagstat %s > %s/%s_alignment_stats" % (base_cmd, out_sorted_bam, out_path, analysis)
    keep_logging('', cmd, logger, 'debug')
    # Nothing is executed here, so CalledProcessError cannot occur; the
    # former try/except around append() was unreachable and has been dropped.
    command_list.append(cmd)
    return command_list
示例#11
0
def index_bam(out_sort_bam, out_path, logger, Config, command_list, files_to_delete):
    """Queue a ``samtools index`` command instead of running it.

    Args:
        out_sort_bam: BAM file to index.
        out_path: Unused here.
        logger: Logger object passed to ``keep_logging``.
        Config: Config object; ``[samtools] base_cmd`` is the executable.
        command_list: List that receives the command string (mutated).
        files_to_delete: Unused here.

    Returns:
        The same ``command_list`` object, with the command appended.
    """
    base_cmd = ConfigSectionMap("samtools", Config)['base_cmd']
    cmd = "%s index %s" % (base_cmd, out_sort_bam)
    keep_logging('', cmd, logger, 'debug')
    # Nothing is executed here, so CalledProcessError cannot occur; the
    # former try/except around append() was unreachable and has been dropped.
    command_list.append(cmd)
    return command_list
def flagstat(out_sorted_bam, out_path, analysis, logger, Config):
    """Run ``samtools flagstat`` and write its output to a stats file.

    Args:
        out_sorted_bam: BAM file to report on.
        out_path: Directory that receives the stats file.
        analysis: Prefix of the stats file name.
        logger: Logger object passed to ``keep_logging`` and ``call``.
        Config: Config object read through ``ConfigSectionMap``.

    Returns:
        Path of the stats file, ``<out_path>/<analysis>_alignment_stats``.

    Raises:
        SystemExit: With status 1 if ``call`` raises ``CalledProcessError``.
    """
    samtools_cfg = ConfigSectionMap("samtools", Config)
    samtools_exe = "/".join([
        ConfigSectionMap("bin_path", Config)['binbase'],
        samtools_cfg['samtools_bin'],
        samtools_cfg['base_cmd'],
    ])
    alignment_stats_file = "%s/%s_alignment_stats" % (out_path, analysis)
    cmd = "%s flagstat %s > %s" % (samtools_exe, out_sorted_bam,
                                   alignment_stats_file)
    keep_logging(cmd, cmd, logger, 'debug')
    try:
        call(cmd, logger)
    except sp.CalledProcessError:
        failure = 'Error in Samtools Alignment Stats step. Exiting.'
        keep_logging(failure, failure, logger, 'exception')
        sys.exit(1)
    return alignment_stats_file
示例#13
0
def kraken_contamination(filenames_array, Config, logger, output_folder, type,
                         samples, kraken_directory, cluster, downsample,
                         scheduler, genome_size, dryrun):
    """Run or submit Kraken, kraken-report and Krona for each read file.

    For every entry of ``filenames_array`` the reads are passed through
    ``downsample_reads``; if the resulting forward read name ends in ``.gz``
    a Kraken command (prefixed by the downsampling command) and a
    kraken-report command are built. With ``cluster == "cluster"`` they are
    written to a job via ``generate_cluster_jobs`` and submitted with
    ``sbatch`` (or only printed when ``dryrun`` is true); with
    ``cluster == "local"`` they are executed through ``call``.

    Args:
        filenames_array: Iterable of forward read file paths.
        Config: Config object; ``[kraken] db_path`` is the database.
        logger: Logger object passed to the helpers.
        output_folder, type, samples, downsample: Unused here.
        kraken_directory: Directory prefix for all Kraken outputs.
        cluster: ``"cluster"`` or ``"local"``; other values build the
            commands but do nothing with them.
        scheduler: Forwarded to ``generate_cluster_jobs``.
        genome_size: Forwarded to ``downsample_reads``.
        dryrun: If true, cluster jobs are announced but not submitted.
    """
    for forward_reads in filenames_array:
        # NOTE(review): str.replace matches '_R1.*fastq.gz' literally, so the
        # prefix normally keeps the full basename -- confirm whether a regex
        # substitution was intended before changing output names.
        file_prefix = kraken_directory + "/" + os.path.basename(
            forward_reads.replace('_R1.*fastq.gz', ''))
        reverse_reads = forward_reads.replace('_R1_', '_R2_')

        read1, read2, seqtk_downsample = downsample_reads(
            forward_reads, reverse_reads, genome_size, logger)
        using_msg = "Using downsampled reads for Kraken - %s" % read1
        keep_logging(using_msg, using_msg, logger, 'info')

        # Only gzip-compressed reads are processed; anything else is skipped.
        if not read1.endswith('.gz'):
            continue

        # Kraken reads the file of the same name under /tmp.
        kraken_input = "/tmp/%s" % os.path.basename(read1)
        db_path = ConfigSectionMap("kraken", Config)['db_path']
        kraken_cmd = seqtk_downsample + '\n' + (
            "kraken --quick --fastq-input --gzip-compressed --unclassified-out %s_unclassified.txt --db %s --output %s_kraken %s" % (
                file_prefix, db_path, file_prefix, kraken_input))
        keep_logging("", kraken_cmd, logger, 'info')

        kraken_out = file_prefix + "_kraken"
        report_cmd = "kraken-report --db %s %s > %s_report.txt" % (
            db_path, kraken_out, kraken_out)

        if cluster == "cluster":
            krona_cmd = krona_visualization(file_prefix, Config, logger,
                                            kraken_directory, cluster)
            # NOTE(review): the report command appears both before and after
            # the Krona command, exactly as in the original job script.
            job_body = (kraken_cmd + "\n" + report_cmd + "\n" + krona_cmd +
                        "\n" + report_cmd)
            job_filename = generate_cluster_jobs(job_body, file_prefix,
                                                 scheduler, Config, logger)
            if dryrun:
                print("Submitting job - %s\n" % job_filename)
            else:
                # NOTE(review): sbatch is used regardless of `scheduler`.
                os.system("sbatch %s" % job_filename)
        elif cluster == "local":
            call(kraken_cmd, logger)
            call(report_cmd, logger)
            krona_cmd = krona_visualization(file_prefix, Config, logger,
                                            kraken_directory, cluster)
            call(krona_cmd, logger)
示例#14
0
def variant_calling(out_finalbam, out_path, index, analysis, logger, Config):
    """Run the variant caller named in ``[pipeline] variant_caller``.

    Args:
        out_finalbam: BAM file forwarded to the caller.
        out_path: Output directory forwarded to the caller.
        index: Forwarded to the caller as its third argument.
        analysis: Analysis name forwarded to the caller.
        logger: Logger object passed to the caller and ``keep_logging``.
        Config: Config object read through ``ConfigSectionMap``.

    Returns:
        Path of the raw VCF returned by the caller.

    Raises:
        SystemExit: With status 1 if the returned path is not a file.
    """
    # Local import keeps this block self-contained.
    import sys

    # SECURITY NOTE(review): eval() executes whatever expression the config
    # holds. Left as-is to keep accepting every value it accepts today;
    # consider an explicit name-to-function mapping if the config file can
    # be edited by untrusted users.
    variant_caller = eval(
        ConfigSectionMap("pipeline", Config)['variant_caller'])
    final_raw_vcf = variant_caller(out_finalbam, out_path, index, analysis,
                                   logger, Config)
    if not os.path.isfile(final_raw_vcf):
        keep_logging('Error in Samtools Variant Calling step. Exiting.',
                     'Error in Samtools Variant Calling step. Exiting.',
                     logger, 'exception')
        # A bare exit() ends with status 0, which hides the failure.
        sys.exit(1)
    return final_raw_vcf
示例#15
0
def assembly(forward_paired, reverse_paired, forward_unpaired, reverse_unpaired, assembler, out_path, logger, Config, do_assembly):
    """Dispatch to the requested assembler.

    Args:
        forward_paired, reverse_paired, forward_unpaired, reverse_unpaired:
            Read files forwarded to ``spades_assembly``.
        assembler: ``"spades"`` runs SPAdes; ``"velvet"`` is a placeholder
            that only prints an empty line; a falsy value aborts.
        out_path, logger, Config, do_assembly: Forwarded to
            ``spades_assembly``.

    Returns:
        ``(contigs, scaffolds, plasmid_contigs, plasmid_scaffolds)`` for
        SPAdes; ``None`` for velvet or any other assembler name.

    Raises:
        SystemExit: With status 1 if ``assembler`` is falsy.
    """
    # Local import keeps this block self-contained.
    import sys

    if not assembler:
        keep_logging('Please provide the assembler argument for this step.', 'Please provide the assembler argument for this step.', logger, 'exception')
        # A bare exit() ends with status 0, which hides the failure.
        sys.exit(1)

    if assembler == "velvet":
        # Velvet support has not been implemented yet.
        print("")
    elif assembler == "spades":
        (contigs, scaffolds, plasmid_contigs, plasmid_scaffolds) = spades_assembly(forward_paired, reverse_paired, forward_unpaired, reverse_unpaired, out_path, logger, Config, do_assembly)
        return contigs, scaffolds, plasmid_contigs, plasmid_scaffolds
def markduplicates(out_sorted_bam, out_path, analysis, files_to_delete, logger,
                   Config):
    """Remove PCR duplicates from a sorted BAM with Picard MarkDuplicates.

    Args:
        out_sorted_bam: Input BAM file.
        out_path: Directory that receives the outputs.
        analysis: Prefix of the output file names.
        files_to_delete: Unused here.
        logger: Logger object passed to ``keep_logging`` and ``call``.
        Config: Config object read through ``ConfigSectionMap``.

    Returns:
        Path of the de-duplicated BAM, ``<out_path>/<analysis>_aln_marked.bam``.

    Raises:
        SystemExit: With status 1 if the command fails or the output BAM is
            missing afterwards.
    """
    picard_cfg = ConfigSectionMap("picard", Config)
    base_cmd = (ConfigSectionMap("bin_path", Config)['binbase'] + "/" +
                picard_cfg['picard_bin'] + "/" + picard_cfg['base_cmd'])
    keep_logging('Removing PCR duplicates using PICARD',
                 'Removing PCR duplicates using PICARD', logger, 'info')
    cmd = "java -jar %s MarkDuplicates REMOVE_DUPLICATES=true INPUT=%s OUTPUT=%s/%s_aln_marked.bam METRICS_FILE=%s/%s_markduplicates_metrics CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT" % (
        base_cmd, out_sorted_bam, out_path, analysis, out_path, analysis)
    keep_logging(cmd, cmd, logger, 'debug')
    try:
        call(cmd, logger)
    except sp.CalledProcessError:
        keep_logging('Error in Picard Duplicates Removal step. Exiting.',
                     'Error in Picard Duplicates Removal step. Exiting.',
                     logger, 'exception')
        sys.exit(1)

    out_marked_bam = "%s/%s_aln_marked.bam" % (out_path, analysis)
    if not os.path.isfile(out_marked_bam):
        keep_logging('Problem in Picard MarkDuplicate Step',
                     'Problem in Picard MarkDuplicate Step', logger,
                     'exception')
        # Match the handler above; a bare exit() would end with status 0.
        sys.exit(1)
    return out_marked_bam
示例#17
0
def sort_bam(out_bam, out_path, analysis, logger, Config):
    """Sort a BAM file with ``samtools sort``.

    Args:
        out_bam: BAM file to sort.
        out_path: Directory that receives the sorted BAM.
        analysis: Prefix of the output file name.
        logger: Logger object passed to ``keep_logging`` and ``call``.
        Config: Config object read through ``ConfigSectionMap``.

    Returns:
        Path of the sorted BAM, ``<out_path>/<analysis>_aln_sort.bam``.

    Raises:
        SystemExit: With status 1 if the command fails or the sorted BAM is
            missing afterwards.
    """
    samtools_cfg = ConfigSectionMap("samtools", Config)
    base_cmd = (ConfigSectionMap("bin_path", Config)['binbase'] + "/" +
                samtools_cfg['samtools_bin'] + "/" + samtools_cfg['base_cmd'])
    # The output is given as a prefix; the ".bam" suffix is expected to be
    # added by samtools (see the path checked below).
    cmd = "%s sort %s %s/%s_aln_sort" % (base_cmd, out_bam, out_path, analysis)
    keep_logging('Sorting BAM file', 'Sorting BAM file', logger, 'info')
    keep_logging("COMMAND: " + cmd, cmd, logger, 'debug')
    try:
        call(cmd, logger)
    except sp.CalledProcessError:
        keep_logging('Error in BAM Sorting step. Exiting.',
                     'Error in BAM sorting step. Exiting.', logger,
                     'exception')
        sys.exit(1)

    # Named differently from the function so the local no longer shadows it.
    sorted_bam_path = "%s/%s_aln_sort.bam" % (out_path, analysis)
    if not os.path.isfile(sorted_bam_path):
        print("\n################## Problem in BAM sorting ##################\n")
        keep_logging('Error in BAM Sorting step. Exiting.',
                     'Error in BAM Sorting step. Exiting.', logger,
                     'exception')
        # Match the handler above; a bare exit() would end with status 0.
        sys.exit(1)
    return sorted_bam_path
示例#18
0
def samtobam(out_sam, out_path, analysis, files_to_delete, logger, Config):
    """Convert a SAM file to BAM with ``samtools view -Sb``.

    Args:
        out_sam: SAM file to convert.
        out_path: Directory that receives the BAM.
        analysis: Prefix of the output file name.
        files_to_delete: Unused here.
        logger: Logger object passed to ``keep_logging`` and ``call``.
        Config: Config object read through ``ConfigSectionMap``.

    Returns:
        Path of the BAM file, ``<out_path>/<analysis>_aln.bam``.

    Raises:
        SystemExit: With status 1 if the command fails or the BAM is missing
            afterwards.
    """
    samtools_cfg = ConfigSectionMap("samtools", Config)
    base_cmd = (ConfigSectionMap("bin_path", Config)['binbase'] + "/" +
                samtools_cfg['samtools_bin'] + "/" + samtools_cfg['base_cmd'])
    out_bam = "%s/%s_aln.bam" % (out_path, analysis)
    cmd = "%s view -Sb %s > %s" % (base_cmd, out_sam, out_bam)
    keep_logging('SAM to BAM Conversion', 'SAM to BAM Conversion', logger,
                 'info')
    keep_logging("COMMAND: " + cmd, cmd, logger, 'debug')
    try:
        call(cmd, logger)
    except sp.CalledProcessError:
        keep_logging('Error in SAM-to-BAM Conversion step. Exiting.',
                     'Error in SAM-to-BAM Conversion step. Exiting.', logger,
                     'exception')
        sys.exit(1)

    if not os.path.isfile(out_bam):
        keep_logging('Error in SAM-to-BAM Conversion step. Exiting.',
                     'Error in SAM-to-BAM Conversion step. Exiting.', logger,
                     'exception')
        # Match the handler above; a bare exit() would end with status 0.
        sys.exit(1)
    return out_bam
示例#19
0
文件: bwa.py 项目: alipirani88/QC-d
def align_bwa(base_cmd,forward_clean, reverse_clean, out_path, reference, split_field, analysis, files_to_delete, logger, Config, type, command_list):
    """Queue a ``bwa mem`` alignment command instead of running it.

    Args:
        base_cmd: bwa executable.
        forward_clean: Forward read file.
        reverse_clean: Reverse read file; only used when ``type == "PE"``.
        out_path: Directory for ``<analysis>_aln.sam``.
        reference: Reference passed to ``bwa mem``.
        split_field: Read-group string passed to ``-R``.
        analysis: Prefix of the SAM file name.
        files_to_delete: List that receives the SAM path (mutated).
        logger: Logger object passed to ``keep_logging``.
        Config: Unused here.
        type: ``"PE"`` for paired-end; anything else is treated as
            single-end.
        command_list: List that receives the command string (mutated).

    Returns:
        ``(command_list, files_to_delete, out_sam)``.
    """
    out_sam = "%s/%s_aln.sam" % (out_path, analysis)
    if type == "PE":
        cmd = "%s mem -M -R %s -t 8 %s %s %s > %s" % (base_cmd, split_field, reference, forward_clean, reverse_clean, out_sam)
    else:
        cmd = "%s mem -M -R %s -t 8 %s %s > %s" % (base_cmd, split_field, reference, forward_clean, out_sam)
    keep_logging('', cmd, logger, 'debug')
    # Nothing is executed here, so CalledProcessError cannot occur; the
    # former try/except around append() was unreachable and has been dropped.
    command_list.append(cmd)
    files_to_delete.append(out_sam)
    return command_list, files_to_delete, out_sam
示例#20
0
def filter2_variants(final_raw_vcf, out_path, analysis, ref_index, logger,
                     Config, Avg_dp):
    """Apply the second GATK filter and build consensus FASTA files.

    Runs ``gatk_filter2``, then ``remove_proximate_snps`` on its result, and
    finally ``gatk_vcf2fasta_filter2`` and ``vcftools_vcf2fasta_filter2`` on
    both VCFs. Every resulting path is logged at debug level.

    Args:
        final_raw_vcf: Raw VCF passed to ``gatk_filter2``.
        out_path: Output directory forwarded to every helper.
        analysis: Analysis name forwarded to every helper.
        ref_index: Config section name holding ``ref_path`` and ``ref_name``.
        logger: Logger object forwarded to every helper.
        Config: Config object read through ``ConfigSectionMap``.
        Avg_dp: Forwarded to ``gatk_filter2``.

    Returns:
        None.
    """
    ref_cfg = ConfigSectionMap(ref_index, Config)
    reference = ref_cfg['ref_path'] + "/" + ref_cfg['ref_name']
    common = (out_path, analysis, reference, logger, Config)

    filtered_vcf = gatk_filter2(final_raw_vcf, out_path, analysis, reference,
                                logger, Config, Avg_dp)
    no_proximate_vcf = remove_proximate_snps(filtered_vcf, *common)
    note = 'The vcf file with no proximate snp: {}'.format(no_proximate_vcf)
    keep_logging(note, note, logger, 'debug')

    gatk_fasta = gatk_vcf2fasta_filter2(filtered_vcf, *common)
    gatk_fasta_no_proximate = gatk_vcf2fasta_filter2(no_proximate_vcf,
                                                     *common)
    vcftools_fasta = vcftools_vcf2fasta_filter2(filtered_vcf, *common)
    vcftools_fasta_no_proximate = vcftools_vcf2fasta_filter2(
        no_proximate_vcf, *common)

    summaries = (
        ('The final Consensus Fasta file: {}', gatk_fasta),
        ('The final Consensus Fasta file with no proximate: {}',
         gatk_fasta_no_proximate),
        ('The final Consensus Fasta file from VCF-consensus: {}',
         vcftools_fasta),
        ('The final Consensus Fasta file from VCF-consensus with no proximate: {}',
         vcftools_fasta_no_proximate),
    )
    for template, path in summaries:
        note = template.format(path)
        keep_logging(note, note, logger, 'debug')
示例#21
0
def filter_variants(final_raw_vcf, out_path, analysis, ref_index, logger,
                    Config, Avg_dp):
    """Apply the GATK filter and remove proximate SNPs from the result.

    Args:
        final_raw_vcf: Raw VCF passed to ``gatk_filter``.
        out_path: Output directory forwarded to the helpers.
        analysis: Analysis name forwarded to the helpers.
        ref_index: Config section name holding ``ref_path`` and ``ref_name``.
        logger: Logger object forwarded to the helpers.
        Config: Config object read through ``ConfigSectionMap``.
        Avg_dp: Forwarded to ``gatk_filter``.

    Returns:
        None; the resulting path is only logged at debug level.
    """
    ref_cfg = ConfigSectionMap(ref_index, Config)
    reference = ref_cfg['ref_path'] + "/" + ref_cfg['ref_name']

    filtered_vcf = gatk_filter(final_raw_vcf, out_path, analysis, reference,
                               logger, Config, Avg_dp)
    no_proximate_vcf = remove_proximate_snps(filtered_vcf, out_path, analysis,
                                             reference, logger, Config)
    note = 'The vcf file with no proximate snp: {}'.format(no_proximate_vcf)
    keep_logging(note, note, logger, 'debug')
示例#22
0
def only_snp_raw_vcf(final_raw_vcf, out_path, analysis, reference):
    """Build the vcftools command that drops indels and return its output path.

    Relies on module-level ``Config`` and ``logger``; neither is a parameter.

    Args:
        final_raw_vcf: VCF passed to ``vcftools --vcf``.
        out_path: Directory of the output file.
        analysis: Prefix of the output file name.
        reference: Unused here.

    Returns:
        ``<out_path>/<analysis>_raw_onlysnp.vcf.recode.vcf``.
    """
    binbase = ConfigSectionMap("bin_path", Config)['binbase']
    vcftools_cfg = ConfigSectionMap("vcftools", Config)
    # These two are not used below; the lookups are kept so that a missing
    # config key still fails here as before.
    tabix_dir = binbase + "/" + vcftools_cfg['tabix_bin']
    vcftools_perl_dir = binbase + "/" + vcftools_cfg['vcftools_perl_bin']
    vcftools_dir = binbase + "/" + vcftools_cfg['vcftools_bin']

    out_prefix = "%s/%s_raw_onlysnp.vcf" % (out_path, analysis)
    onlysnp_raw_cmd = "%s/vcftools --vcf %s --remove-indels --recode --recode-INFO-all --out %s" % (
        vcftools_dir, final_raw_vcf, out_prefix)
    # NOTE(review): the command is logged but never executed in this
    # function -- confirm whether a call() is missing or it runs elsewhere.
    keep_logging(onlysnp_raw_cmd, onlysnp_raw_cmd, logger, 'debug')
    return out_prefix + ".recode.vcf"
示例#23
0
def prepare_bam(out_sam, out_path, analysis, files_to_delete, logger, Config):
    """Convert SAM to BAM, sort, remove duplicates, re-sort and index.

    Args:
        out_sam: SAM file to start from.
        out_path: Output directory forwarded to every step.
        analysis: Analysis name forwarded to every step.
        files_to_delete: List that receives the first sorted BAM path
            (mutated); also forwarded to ``samtobam`` and ``markduplicates``.
        logger: Logger object forwarded to every step.
        Config: Config object forwarded to every step.

    Returns:
        Path of the final sorted BAM returned by the second ``sort_bam``.

    Raises:
        SystemExit: With status 1 if the final sorted BAM does not exist.
    """
    # Local import keeps this block self-contained.
    import sys

    out_bam = samtobam(out_sam, out_path, analysis, files_to_delete, logger,
                       Config)
    out_sort_bam = sort_bam(out_bam, out_path, analysis, logger, Config)
    files_to_delete.append(out_sort_bam)
    out_marked_bam = markduplicates(out_sort_bam, out_path, analysis,
                                    files_to_delete, logger, Config)
    # NOTE(review): if sort_bam derives its output name only from out_path
    # and analysis, this second sort writes to the path queued in
    # files_to_delete just above -- confirm the cleanup does not remove the
    # file returned here.
    out_sort_bam = sort_bam(out_marked_bam, out_path, analysis, logger, Config)
    index_bam(out_sort_bam, out_path, logger, Config)
    if not os.path.isfile(out_sort_bam):
        keep_logging('Error in SAM/BAM conversion, sort, index. Exiting.',
                     'Error in SAM/BAM conversion, sort, index. Exiting.',
                     logger, 'exception')
        # A bare exit() ends with status 0, which hides the failure.
        sys.exit(1)
    return out_sort_bam
示例#24
0
def gatkhaplotypecaller(out_finalbam, out_path, reference, analysis, logger,
                        Config):
    """Run GATK with the configured haplotype parameters and return the VCF.

    Args:
        out_finalbam: BAM file passed to GATK with ``-I``.
        out_path: Directory that receives the VCF.
        reference: Config section name holding ``ref_path`` and ``ref_name``.
        analysis: Prefix of the output file name.
        logger: Logger object passed to ``keep_logging`` and ``call``.
        Config: Config object read through ``ConfigSectionMap``.

    Returns:
        ``<out_path>/<analysis>_aln_mpileup_raw.vcf``.
    """
    gatk_cfg = ConfigSectionMap("gatk", Config)
    gatk_jar = "/".join([
        ConfigSectionMap("bin_path", Config)['binbase'],
        gatk_cfg['gatk_bin'],
        gatk_cfg['base_cmd'],
    ])
    ref_cfg = ConfigSectionMap(reference, Config)
    reference_filename = ref_cfg['ref_path'] + "/" + ref_cfg['ref_name']

    final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf" % (out_path, analysis)
    cmd = "java -jar %s %s -R %s -I %s -o %s" % (
        gatk_jar, gatk_cfg['haplotype_parameters'], reference_filename,
        out_finalbam, final_raw_vcf)
    banner = 'Running Command: [%s]' % cmd
    keep_logging(banner, banner, logger, 'info')
    call(cmd, logger)
    return final_raw_vcf
示例#25
0
def gatk_filter1(final_raw_vcf, out_path, analysis, reference):
    """Apply the "filter1" GATK VariantFiltration step and keep flagged records.

    Runs VariantFiltration with the configured
    ``gatk_filter1_parameter_expression`` (filter name ``PASS_filter1``), then
    greps header lines and ``PASS_filter1`` records into the final VCF.

    NOTE(review): ``Config`` and ``logger`` are not parameters of this
    function; they are read from the enclosing module scope -- confirm both
    are defined there.

    Args:
        final_raw_vcf: Input VCF passed to ``--variant``.
        out_path: Directory in which both output VCFs are written.
        analysis: Prefix used for the output file names.
        reference: Value passed to GATK with ``-R``.

    Returns:
        ``<out_path>/<analysis>_filter1_final.vcf``.
    """
    gatk_section = ConfigSectionMap("gatk", Config)
    base_cmd = (ConfigSectionMap("bin_path", Config)['binbase'] + "/" +
                gatk_section['gatk_bin'] + "/" + gatk_section['base_cmd'])
    # The original looked this key up via ConfigSectionMap("gatk") without the
    # Config argument that every other call in this function supplies.
    gatk_filter1_parameter_expression = gatk_section[
        'gatk_filter1_parameter_expression']

    flagged_vcf = "%s/%s_filter1_gatk.vcf" % (out_path, analysis)
    gatk_filter1_final_vcf = "%s/%s_filter1_final.vcf" % (out_path, analysis)

    gatk_filter1_command = "java -jar %s -T VariantFiltration -R %s -o %s --variant %s --filterExpression \"%s\" --filterName PASS_filter1" % (
        base_cmd, reference, flagged_vcf, final_raw_vcf,
        gatk_filter1_parameter_expression)
    keep_logging('Running Command: [%s]' % gatk_filter1_command,
                 'Running Command: [%s]' % gatk_filter1_command, logger,
                 'info')
    os.system(gatk_filter1_command)

    # Keep VCF header lines ('#') plus every record tagged PASS_filter1.
    filter_flag_command = "grep '#\|PASS_filter1' %s > %s" % (
        flagged_vcf, gatk_filter1_final_vcf)
    os.system(filter_flag_command)
    return gatk_filter1_final_vcf
示例#26
0
def only_snp_filter1_vcf(gatk_filter1_final_vcf, out_path, analysis,
                         reference):
    """Build and log the vcftools command that drops indels from the filter1 VCF.

    NOTE(review): the command is only logged -- nothing in this function
    executes it, so the returned path is not created here. Confirm whether a
    ``call``/``os.system`` invocation was lost.
    NOTE(review): ``Config`` and ``logger`` are not parameters; they are read
    from the enclosing module scope.

    Args:
        gatk_filter1_final_vcf: Input VCF for ``--vcf``.
        out_path: Directory used for the ``--out`` prefix.
        analysis: Prefix used for the output file name.
        reference: Unused.

    Returns:
        ``<out_path>/<analysis>_filter1_onlysnp.vcf.recode.vcf``.
    """
    bin_root = ConfigSectionMap("bin_path", Config)['binbase']
    vcftools_section = ConfigSectionMap("vcftools", Config)

    # The tabix and perl locations are not used below; the lookups are kept so
    # a missing config key still fails at this point.
    tabix_dir = bin_root + "/" + vcftools_section['tabix_bin']
    vcftools_perl_dir = bin_root + "/" + vcftools_section['vcftools_perl_bin']
    vcftools_dir = bin_root + "/" + vcftools_section['vcftools_bin']

    out_prefix = "%s/%s_filter1_onlysnp.vcf" % (out_path, analysis)
    onlysnp_cmd = "%s/vcftools --vcf %s --remove-indels --recode --recode-INFO-all --out %s" % (
        vcftools_dir, gatk_filter1_final_vcf, out_prefix)

    running_message = 'Running Command: [%s]' % onlysnp_cmd
    keep_logging(running_message, running_message, logger, 'info')
    keep_logging(onlysnp_cmd, onlysnp_cmd, logger, 'debug')

    return out_prefix + ".recode.vcf"
示例#27
0
def vcftools_vcf2fasta_filter2(only_snp_filter2_vcf, out_path, analysis,
                               reference, logger, Config):
    """Create a consensus FASTA from the filter2 SNP-only VCF.

    The VCF is bgzip-compressed and tabix-indexed, ``vcf-consensus`` is run
    against ``reference``, the same command is saved to
    ``<vcf>.sh`` and executed with bash, and finally every FASTA header in the
    consensus file is replaced by ``>analysis``.

    Args:
        only_snp_filter2_vcf: Path of the VCF; also the prefix for the
            generated ``.gz``, ``.sh`` and ``_filter2_consensus.fa`` files.
        out_path: Unused.
        analysis: Name written into the FASTA header.
        reference: Reference FASTA that is piped into ``vcf-consensus``.
        logger: Logger passed to ``keep_logging`` and ``call``.
        Config: Configuration object handed to ``ConfigSectionMap``.

    Returns:
        None.

    Raises:
        SystemExit: With status 1 when ``call`` raises
            ``sp.CalledProcessError``.
    """
    bin_root = ConfigSectionMap("bin_path", Config)['binbase']
    vcftools_section = ConfigSectionMap("vcftools", Config)
    base_tabix = bin_root + "/" + vcftools_section['tabix_bin']
    base_vcftools_bin = bin_root + "/" + vcftools_section['vcftools_bin']

    bgzip_cmd = "%s/bgzip -f %s" % (base_tabix, only_snp_filter2_vcf)
    os.system(bgzip_cmd)
    tabix_cmd = "%s/tabix %s.gz" % (base_tabix, only_snp_filter2_vcf)
    os.system(tabix_cmd)

    consensus_fasta = "%s_filter2_consensus.fa" % only_snp_filter2_vcf
    vcftools_vcf2fasta_filter2_cmd = "cat %s | %s/vcf-consensus %s.gz > %s" % (
        reference, base_vcftools_bin, only_snp_filter2_vcf, consensus_fasta)
    keep_logging(vcftools_vcf2fasta_filter2_cmd,
                 vcftools_vcf2fasta_filter2_cmd, logger, 'debug')
    try:
        call(vcftools_vcf2fasta_filter2_cmd, logger)
    except sp.CalledProcessError:
        keep_logging('Error in vcftools vcf 2 fasta step. Exiting.',
                     'Error in vcftools vcf 2 fasta step. Exiting.', logger,
                     'exception')
        sys.exit(1)

    bash_script_file = "%s.sh" % (only_snp_filter2_vcf)
    # Close (and therefore flush) the script before bash reads it; the handle
    # was previously left open, so bash could see an empty file.
    with open(bash_script_file, 'w') as script:
        script.write(vcftools_vcf2fasta_filter2_cmd)
    bash_command = "bash %s" % bash_script_file
    os.system(bash_command)

    # The darwin branch passes an explicit empty backup suffix to `sed -i`.
    if _platform == "darwin":
        change_header_cmd = "sed -i '' 's/>.*/>%s/g' %s" % (
            analysis, consensus_fasta)
    else:
        change_header_cmd = "sed -i 's/>.*/>%s/g' %s" % (
            analysis, consensus_fasta)
    os.system(change_header_cmd)
示例#28
0
def align_bwa(base_cmd, forward_clean, reverse_clean, out_path, reference,
              split_field, analysis, files_to_delete, logger, Config, type):
    """Align reads with ``bwa mem`` and return the path of the SAM output.

    Args:
        base_cmd: BWA executable (command prefix placed before ``mem``).
        forward_clean: Forward (or single-end) reads file.
        reverse_clean: Reverse reads file; only used when ``type == "PE"``.
        out_path: Directory in which ``<analysis>_aln.sam`` is written.
        reference: Reference passed to ``bwa mem``.
        split_field: Read-group string passed with ``-R``.
        analysis: Prefix used for the output file name.
        files_to_delete: List mutated in place; the SAM path is appended.
        logger: Logger passed to ``keep_logging`` and ``call``.
        Config: Unused.
        type: ``"PE"`` for paired-end; any other value is treated as
            single-end.

    Returns:
        ``<out_path>/<analysis>_aln.sam``.

    Raises:
        SystemExit: With status 1 when ``call`` raises
            ``sp.CalledProcessError`` or when the SAM file was not created.
    """
    out_sam = "%s/%s_aln.sam" % (out_path, analysis)
    if type == "PE":
        reads = "%s %s" % (forward_clean, reverse_clean)
    else:
        reads = "%s" % forward_clean
    cmd = "%s mem -M -R %s -t 8 %s %s > %s" % (
        base_cmd, split_field, reference, reads, out_sam)

    keep_logging(cmd, cmd, logger, 'debug')
    try:
        call(cmd, logger)
    except sp.CalledProcessError:
        keep_logging('Error in Alignment step. Exiting.',
                     'Error in Alignment step. Exiting.', logger, 'exception')
        sys.exit(1)

    files_to_delete.append(out_sam)
    if os.path.isfile(out_sam):
        return out_sam

    keep_logging('Problem in BWA alignment. SAM file was not generated.',
                 'Problem in BWA alignment. SAM file was not generated',
                 logger, 'exception')
    # Exit non-zero like the failure path above; a bare exit() reports
    # success (status 0) to the caller.
    sys.exit(1)
示例#29
0
def align_bowtie(base_cmd, forward_clean, reverse_clean, forward_unpaired,
                 reverse_unpaired, out_path, reference, split_field, analysis,
                 files_to_delete, logger, Config, type, parameters):
    """Align reads with the Bowtie command line and return the SAM output path.

    Args:
        base_cmd: Bowtie executable (command prefix).
        forward_clean: Forward (or single-end) reads file.
        reverse_clean: Reverse reads file; only used when ``type == "PE"``.
        forward_unpaired: Unused.
        reverse_unpaired: Unused.
        out_path: Directory in which the SAM file is written.
        reference: Index passed with ``-x``.
        split_field: Extra arguments appended at the end of the command.
        analysis: Prefix used for the output file name.
        files_to_delete: List mutated in place; the SAM path is appended.
        logger: Logger passed to ``keep_logging`` and ``call``.
        Config: Unused.
        type: ``"PE"`` for paired-end; any other value is treated as
            single-end.
        parameters: Extra aligner parameters inserted before ``split_field``.

    Returns:
        ``<out_path>/<analysis>_PE_aln.sam`` for paired-end input, otherwise
        ``<out_path>/<analysis>_SE_aln.sam``.

    Raises:
        SystemExit: With status 1 when ``call`` raises
            ``sp.CalledProcessError`` or when the SAM file was not created.
    """
    if type == "PE":
        out_sam = "%s/%s_PE_aln.sam" % (out_path, analysis)
        cmd = "%s -x %s -1 %s -2 %s -S %s -t -p 8 %s %s" % (
            base_cmd, reference, forward_clean, reverse_clean, out_sam,
            parameters, split_field)
    else:
        out_sam = "%s/%s_SE_aln.sam" % (out_path, analysis)
        cmd = "%s -x %s -U %s -S %s -t -p 8 %s %s" % (
            base_cmd, reference, forward_clean, out_sam, parameters,
            split_field)

    keep_logging("COMMAND: " + cmd, cmd, logger, 'debug')
    try:
        call(cmd, logger)
    except sp.CalledProcessError:
        keep_logging('Error in Alignment step. Exiting.',
                     'Error in Alignment step. Exiting.', logger, 'exception')
        sys.exit(1)

    files_to_delete.append(out_sam)
    if os.path.isfile(out_sam):
        return out_sam

    # The message previously named BWA, which is the wrong aligner here.
    keep_logging('Problem in Bowtie alignment. SAM file was not generated.',
                 'Problem in Bowtie alignment. SAM file was not generated',
                 logger, 'exception')
    # Exit non-zero like the failure path above; a bare exit() reports
    # success (status 0) to the caller.
    sys.exit(1)
示例#30
0
def kraken_report(filenames_array, Config, logger, output_folder, type, samples, kraken_directory, cluster, scheduler):
    """Summarise the per-sample Kraken reports into ``Kraken_report_final.csv``.

    A CSV header is written first. A small shell pipeline then takes, from
    every ``*_report.txt`` in ``kraken_directory``, the last line (after a
    numeric sort on column 1) among those matching the word ``S``, pastes
    those lines next to ``samples``, appends columns 1, 2, 3 and 7 to the CSV
    and strips whitespace from it.

    Only ``samples``, ``kraken_directory`` and ``logger`` are used; the other
    parameters are accepted but ignored. An earlier per-file ``kraken-report``
    dispatch that lived here as commented-out code has been dropped.

    Args:
        samples: File passed as the first argument to ``paste``.
        kraken_directory: Directory holding the reports and the outputs.
        logger: Logger passed to ``keep_logging``.

    Returns:
        None.
    """
    header_cmd = "echo \"Sample,Percentage of reads for Species,# of reads for Species, Species\" > %s/Kraken_report_final.csv" % (kraken_directory)
    os.system(header_cmd)

    report_template = (
        "for i in %s/*_report.txt; do grep -w \'S\' $i | sort -k1n | tail -n1; done > %s/Kraken_report_temp.txt\n"
        "paste %s %s/Kraken_report_temp.txt > %s/Kraken_report_combined.txt\n"
        "awk -F\'\\t\' \'BEGIN{OFS=\",\"};{print $1, $2, $3, $7}\' %s/Kraken_report_combined.txt >> %s/Kraken_report_final.csv\n"
        "sed -i \'s/\\s//g\' %s/Kraken_report_final.csv"
    )
    # Placeholder order: two directory slots, the samples file, then five more
    # directory slots.
    template_args = (kraken_directory, kraken_directory, samples) + (kraken_directory,) * 5
    prepare_report = report_template % template_args
    subprocess.call(prepare_report, shell=True)

    keep_logging('', prepare_report, logger, 'debug')
    report_location = "\nKraken Report - %s/Kraken_report_final.csv" % kraken_directory
    keep_logging('', report_location, logger, 'debug')
示例#31
0
def gatk_filter2(final_raw_vcf, out_path, analysis, reference, logger, Config):
    """Apply the "filter2" GATK VariantFiltration step and keep flagged records.

    Runs VariantFiltration with the configured
    ``gatk_filter2_parameter_expression`` (filter name ``PASS_filter2``), then
    greps header lines and ``PASS_filter2`` records into the final VCF.

    Args:
        final_raw_vcf: Input VCF passed to ``--variant``.
        out_path: Directory in which both output VCFs are written.
        analysis: Prefix used for the output file names.
        reference: Value passed to GATK with ``-R``.
        logger: Logger passed to ``keep_logging`` and ``call``.
        Config: Configuration object handed to ``ConfigSectionMap``.

    Returns:
        ``<out_path>/<analysis>_filter2_final.vcf``.

    Raises:
        SystemExit: With status 1 when ``call`` raises
            ``sp.CalledProcessError``.
    """
    gatk_section = ConfigSectionMap("gatk", Config)
    gatk_jar = (ConfigSectionMap("bin_path", Config)['binbase'] + "/" +
                gatk_section['gatk_bin'] + "/" + gatk_section['base_cmd'])
    filter_expression = gatk_section['gatk_filter2_parameter_expression']

    flagged_vcf = "%s/%s_filter2_gatk.vcf" % (out_path, analysis)
    gatk_filter2_final_vcf = "%s/%s_filter2_final.vcf" % (out_path, analysis)

    variant_filtration_cmd = "java -jar %s -T VariantFiltration -R %s -o %s --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % (
        gatk_jar, reference, flagged_vcf, final_raw_vcf, filter_expression)
    # Keep VCF header lines ('#') plus every record tagged PASS_filter2.
    keep_flagged_cmd = "grep '#\|PASS_filter2' %s > %s" % (
        flagged_vcf, gatk_filter2_final_vcf)

    steps = (variant_filtration_cmd, keep_flagged_cmd)
    # Both commands are logged before either one is run.
    for step in steps:
        keep_logging("COMMAND: " + step, step, logger, 'debug')
    try:
        for step in steps:
            call(step, logger)
    except sp.CalledProcessError:
        keep_logging('Error in GATK filter step. Exiting.',
                     'Error in GATK filter step. Exiting.', logger,
                     'exception')
        sys.exit(1)
    return gatk_filter2_final_vcf
示例#32
0
def ariba_MLST(forward_paired, reverse_paired, output_folder, prefix, logger, Config):
    """Run ``ariba run`` against the configured MLST database.

    Results go to ``<output_folder>/<prefix>_MLST``.

    Args:
        forward_paired: Forward reads file.
        reverse_paired: Reverse reads file.
        output_folder: Parent directory for the Ariba output directory.
        prefix: Sample prefix used to name the output directory.
        logger: Logger passed to ``keep_logging`` and ``call``.
        Config: Configuration object handed to ``ConfigSectionMap``.

    Returns:
        None.

    Raises:
        SystemExit: With status 1 when ``call`` raises
            ``sp.CalledProcessError``.
    """
    mlst_out_dir = output_folder + "/" + prefix + "_MLST"
    ariba_section = ConfigSectionMap("ariba", Config)
    mlst_db = ariba_section['ariba_mlst_db']
    run_cmd = "%s/%s run --force --verbose %s %s %s %s" % (
        ariba_section['ariba_bin'], ariba_section['base_cmd'], mlst_db,
        forward_paired, reverse_paired, mlst_out_dir)

    db_message = "Using Ariba DB path mentioned in config file: %s" % mlst_db
    keep_logging(db_message, db_message, logger, 'info')
    try:
        keep_logging(run_cmd, run_cmd, logger, 'debug')
        call(run_cmd, logger)
    except sp.CalledProcessError:
        failure_message = 'Error in running Ariba MLST. Exiting.'
        keep_logging(failure_message, failure_message, logger, 'exception')
        sys.exit(1)
示例#33
0
def abacas(reference_genome_path, final_l500_contig, out_path, first_part, logger, Config):
    """Reorder contigs with ABACAS and merge them into one FASTA record.

    After ABACAS runs, the ordered multi-FASTA and the ``contigsInbin`` file
    are concatenated, every header line is replaced by a fixed linker
    sequence, all whitespace is removed, and a single ``>first_part`` header
    is prepended.

    Args:
        reference_genome_path: Reference passed to ABACAS with ``-r``.
        final_l500_contig: Contig FASTA passed to ABACAS with ``-q``.
        out_path: Directory for all intermediate and final files.
        first_part: Sample prefix; used for file names and the FASTA header.
        logger: Logger passed to ``keep_logging`` and ``call``.
        Config: Configuration object handed to ``ConfigSectionMap``.

    Returns:
        ``<out_path>/<first_part>_contigs_ordered.fasta``.

    Raises:
        SystemExit: With status 1 when ``call`` raises
            ``sp.CalledProcessError``.
    """
    keep_logging('Contig Reordering using ABACAS', 'Contig Reordering using ABACAS', logger, 'info')

    abacas_section = ConfigSectionMap("abacas", Config)
    abacas_cmd = "perl %s/%s/%s -r %s -q %s %s -o %s/%s_contigs_ordered" % (
        ConfigSectionMap("bin_path", Config)['binbase'],
        abacas_section['abacas_bin'], abacas_section['base_cmd'],
        reference_genome_path, final_l500_contig,
        abacas_section['abacas_parameters'], out_path, first_part)

    fasta_header = ">%s" % first_part
    header_cmd = "echo \"%s\" > %s/fasta_header" % (fasta_header, out_path)
    abacas_ordered_multifasta = "%s/%s_contigs_ordered.MULTIFASTA.fa" % (out_path, first_part)
    abacas_ordered_contigsInbin = "%s/%s_contigs_ordered.contigsInbin.fas" % (out_path, first_part)
    join_all_contigs = "cat %s %s > %s/all_contigs.fasta" % (abacas_ordered_multifasta, abacas_ordered_contigsInbin, out_path)
    # Every per-contig header line becomes this linker, so the contigs end up
    # joined into one sequence.
    add_linker = "sed -i 's/>.*/NNNNNCATTCCATTCATTAATTAATTAATGAATGAATGNNNNN/g' %s/all_contigs.fasta" % out_path
    remove_spaces = "tr -d '[:space:]' < %s/all_contigs.fasta > %s/all_contigs.fasta_changed.fasta" % (out_path, out_path)
    join_files = "cat %s/fasta_header %s/all_contigs.fasta_changed.fasta > %s/%s_contigs_ordered.fasta" % (out_path, out_path, out_path, first_part)

    def log_and_run(shell_cmd):
        # Record the command at debug level, then execute it.
        keep_logging(shell_cmd, shell_cmd, logger, 'debug')
        call(shell_cmd, logger)

    try:
        log_and_run(abacas_cmd)
        print(header_cmd)
        # The original logged abacas_cmd a second time at this point instead
        # of the header command that is actually about to run.
        log_and_run(header_cmd)
        for shell_cmd in (join_all_contigs, add_linker, remove_spaces, join_files):
            log_and_run(shell_cmd)
    except sp.CalledProcessError:
        keep_logging('Error in reordering Contigs using Abacas. Exiting.', 'Error in reordering Contigs using Abacas. Exiting.', logger, 'exception')
        sys.exit(1)

    final_ordered_contigs = "%s/%s_contigs_ordered.fasta" % (out_path, first_part)
    return final_ordered_contigs
示例#34
0
def clean_reads(input1, input2, out_path, crop, logger, Config):
    """Trim reads with Trimmomatic 0.36 in paired-end or single-end mode.

    Paired-end mode is used unless ``input2`` is the string ``"None"``. When
    ``crop`` is falsy the command carries ``-phred33`` and a trailing
    ``HEADCROP`` step; otherwise a ``CROP:<crop>`` step is inserted before
    ``ILLUMINACLIP`` and neither ``-phred33`` nor ``HEADCROP`` is used.

    NOTE(review): the jar path is built from ``binbase`` + ``trimmomatic_bin``
    with no "/" separators, while the adapter path does insert them -- this
    relies on how the config values are written; confirm.
    NOTE(review): confirm that dropping ``-phred33``/``HEADCROP`` in the crop
    variant is intentional.

    Args:
        input1: Forward (or single-end) reads file.
        input2: Reverse reads file, or the string ``"None"`` for single-end.
        out_path: Prefix prepended to the configured output file names.
        crop: Crop length as a string, or a falsy value for no cropping.
        logger: Logger passed to ``keep_logging`` and ``call``.
        Config: Configuration object handed to ``ConfigSectionMap``.

    Returns:
        Tuple ``(forward_paired, reverse_paired, forward_unpaired,
        reverse_unpaired)``. In single-end mode the last three entries are the
        string ``"None"``.

    Raises:
        SystemExit: With status 1 when ``call`` raises
            ``sp.CalledProcessError``.
    """
    trimmomatic = ConfigSectionMap("Trimmomatic", Config)
    binbase = ConfigSectionMap("bin_path", Config)['binbase']

    forward_paired = out_path + trimmomatic['f_p']
    if input2 != "None":
        reverse_paired = out_path + trimmomatic['r_p']
        forward_unpaired = out_path + trimmomatic['f_up']
        reverse_unpaired = out_path + trimmomatic['r_up']
        mode = "PE"
        raw_inputs = [input1, input2]
        adapter_key = 'adaptor_filepath'
        clean_filenames = " ".join((forward_paired, forward_unpaired,
                                    reverse_paired, reverse_unpaired))
    else:
        # Single-end: only the forward output exists; the rest are reported
        # back as the string "None".
        reverse_paired = reverse_unpaired = forward_unpaired = "None"
        mode = "SE"
        raw_inputs = [input1]
        adapter_key = 'adaptor_filepath_se'
        clean_filenames = forward_paired

    colon = trimmomatic['colon']
    adapter_file = binbase + "/" + trimmomatic['trimmomatic_bin'] + "/" + trimmomatic[adapter_key]
    illumina_string = ('ILLUMINACLIP:' + adapter_file + colon +
                       trimmomatic['seed_mismatches'] + colon +
                       trimmomatic['palindrome_clipthreshold'] + colon +
                       trimmomatic['simple_clipthreshold'])
    sliding_string = ('SLIDINGWINDOW:' + trimmomatic['window_size'] + colon +
                      trimmomatic['window_size_quality'])
    minlen_string = 'MINLEN:' + trimmomatic['minlength']
    headcrop_string = 'HEADCROP:' + trimmomatic['headcrop_length']

    jar_invocation = ("java -jar " + binbase + trimmomatic['trimmomatic_bin'] +
                      "trimmomatic-0.36.jar")
    if not crop:
        tokens = ([jar_invocation, mode, "-phred33"] + raw_inputs +
                  [clean_filenames, illumina_string, sliding_string,
                   minlen_string, headcrop_string])
    else:
        crop_string = 'CROP:' + crop
        tokens = ([jar_invocation, mode] + raw_inputs +
                  [clean_filenames, crop_string, illumina_string,
                   sliding_string, minlen_string])
    cmdstring = " ".join(tokens)

    keep_logging(cmdstring, cmdstring, logger, 'debug')
    try:
        call(cmdstring, logger)
        print("")
    except sp.CalledProcessError:
        keep_logging('Error in Trimmomatic Pre-processing step. Exiting.', 'Error in Trimmomatic Pre-processing step. Exiting.', logger, 'exception')
        sys.exit(1)
    return forward_paired, reverse_paired, forward_unpaired, reverse_unpaired
示例#35
0
def _spades_result_paths(out_path):
    """Return the (contigs, scaffolds, plasmid_contigs, plasmid_scaffolds) paths below out_path.

    out_path is concatenated as-is, so it is expected to end with a path separator.
    """
    contigs = out_path + "spades_results" + "/contigs.fasta"
    scaffolds = out_path + "spades_results" + "/scaffolds.fasta"
    plasmid_contigs = out_path + "spades_plasmid_results" + "/contigs.fasta"
    # NOTE(review): this points at contigs.fasta, not scaffolds.fasta. It is kept
    # unchanged because callers may rely on it, but it looks like a copy-paste
    # slip -- confirm and switch to "/scaffolds.fasta" if so.
    plasmid_scaffolds = out_path + "spades_plasmid_results" + "/contigs.fasta"
    return contigs, scaffolds, plasmid_contigs, plasmid_scaffolds


def _spades_base_command(Config):
    """Return the SPAdes command prefix (binbase + spades_bin + base_cmd) read from Config."""
    return (ConfigSectionMap("bin_path", Config)['binbase']
            + ConfigSectionMap("spades", Config)['spades_bin']
            + ConfigSectionMap("spades", Config)['base_cmd'])


def _run_spades_with_os_system(cmdstring, plasmid_cmdstring, contigs, scaffolds, out_path):
    """Run both assembly commands through os.system and copy contigs/scaffolds into out_path."""
    print("Running: %s \n" % cmdstring)
    print("Running: %s \n" % plasmid_cmdstring)
    os.system(cmdstring)
    os.system(plasmid_cmdstring)
    print("Spades assembly results can be found in " + out_path + "spades_results")
    print("plasmid Spades assembly results can be found in " + out_path + "spades_plasmid_results")
    # Copy final contigs/scaffolds file to output directory
    os.system("cp %s %s %s" % (contigs, scaffolds, out_path))
    print("\n################## End: SPADES ASSEMBLY ##################\n")


def _run_logged_spades(cmdstring, contigs, scaffolds, out_path, logger):
    """Run the whole-genome assembly via call(), copy its outputs, and exit(1) on failure."""
    try:
        keep_logging(cmdstring, cmdstring, logger, 'debug')
        call(cmdstring, logger)
        # Copy final contigs/scaffolds file to output directory
        os.system("cp %s %s %s" % (contigs, scaffolds, out_path))
        print("")
        found_msg = 'Spades assembly results can be found in {}spades_results'.format(out_path)
        keep_logging(found_msg, found_msg, logger, 'info')
    except sp.CalledProcessError:
        error_msg = 'Error in Spades Assembly step. Exiting. Please check spades.log file in spades_results folder'
        keep_logging(error_msg, error_msg, logger, 'exception')
        sys.exit(1)


def _run_logged_plasmid_spades(plasmid_cmdstring, out_path, logger):
    """Run the plasmid assembly via call() and exit(1) on failure."""
    try:
        keep_logging(plasmid_cmdstring, plasmid_cmdstring, logger, 'debug')
        call(plasmid_cmdstring, logger)
        print("")
        found_msg = 'Spades plasmid assembly results can be found in {}spades_plasmid_results'.format(out_path)
        keep_logging(found_msg, found_msg, logger, 'info')
    except sp.CalledProcessError:
        error_msg = 'Error in Spades Plasmid Assembly step. Exiting. Please check spades.log file in spades_results folder'
        keep_logging(error_msg, error_msg, logger, 'exception')
        sys.exit(1)


def spades_assembly(forward_paired, reverse_paired, forward_unpaired, reverse_unpaired, out_path, logger, Config, do_assembly):
    """Assemble reads with SPAdes and plasmid SPAdes and return the result file paths.

    The input mode is chosen from the (paired, unpaired) flags returned by
    check_cleanreads():

    * ("0", "0"): forward_paired / reverse_paired are passed as a paired-end library.
    * ("1", "0"): only paired reads are used; when both reverse_paired and
      reverse_unpaired are the string "None", forward_paired is passed as a
      single-read library (--s1) instead.
    * ("0", "1"): assembly from unpaired reads only is not implemented; nothing is run.
    * otherwise: paired and unpaired reads are all used, and do_assembly selects
      which assemblies run.

    Args:
        forward_paired, reverse_paired: paths of the paired read files
            (reverse_paired may be the string "None").
        forward_unpaired, reverse_unpaired: paths of the unpaired read files
            (may be the string "None").
        out_path: output directory prefix; it is concatenated directly with
            "spades_results", so it should end with a path separator.
        logger: logger handed to keep_logging() and call().
        Config: parsed configuration handed to ConfigSectionMap().
        do_assembly: "both", "wga" or "plasmid". Only consulted when both paired
            and unpaired reads are available; any other value runs nothing.

    Returns:
        Tuple (contigs, scaffolds, plasmid_contigs, plasmid_scaffolds) of output
        paths. The paths are returned whether or not the corresponding assembly
        was actually run.

    Exits the process with status 1 when call() raises CalledProcessError.
    """
    # check if the clean reads from Trimmomatic exists in the output folder.
    # Set the paired and unpaired string constants based on their availability
    (paired, unpaired) = check_cleanreads(forward_paired, reverse_paired, forward_unpaired, reverse_unpaired)

    # The result locations depend only on out_path, so compute them once up front.
    # This guarantees every branch returns the same fully-populated 4-tuple.
    contigs, scaffolds, plasmid_contigs, plasmid_scaffolds = _spades_result_paths(out_path)

    if paired == "0" and unpaired == "0":
        # Clean Paired and unpaired reads doesn't exist. Take raw Input PE files for assembly
        message = "No clean Paired and unpaired reads. Considering forward_paired and reverse_paired as raw Fastq files for assembly.\n"
        print(message)
        spades_base = _spades_base_command(Config)
        read_args = " --pe1-1 " + forward_paired + " --pe1-2 " + reverse_paired
        cmdstring = spades_base + read_args + " -o " + out_path + "spades_results " + ConfigSectionMap("spades", Config)['spades_parameters']
        plasmid_cmdstring = spades_base + read_args + " -o " + out_path + "spades_plasmid_results " + ConfigSectionMap("spades", Config)['plasmid_spades_parameters']
        _run_spades_with_os_system(cmdstring, plasmid_cmdstring, contigs, scaffolds, out_path)

    elif paired == "1" and unpaired == "0":
        # Only clean Paired PE files exists. Take these files for assembly input.
        message = "Taking only paired reads for assembly.\n"
        print(message)
        if reverse_paired == "None" and reverse_unpaired == "None":
            # Single-end data: the only read file is passed as a single-read library.
            read_args = " --s1 " + forward_paired
        else:
            read_args = " --pe1-1 " + forward_paired + " --pe1-2 " + reverse_paired
        spades_base = _spades_base_command(Config)
        cmdstring = spades_base + read_args + " -o " + out_path + "spades_results/ " + ConfigSectionMap("spades", Config)['spades_parameters']
        plasmid_cmdstring = spades_base + read_args + " -o " + out_path + "spades_plasmid_results/ " + ConfigSectionMap("spades", Config)['plasmid_spades_parameters']
        _run_spades_with_os_system(cmdstring, plasmid_cmdstring, contigs, scaffolds, out_path)

    elif paired == "0" and unpaired == "1":
        # Only clean unpaired PE files exists. This mode was never implemented:
        # the previous code passed a placeholder sentence to the shell and then
        # failed on undefined plasmid paths. Report it instead of running junk.
        print("Spades assembly from only unpaired reads is not implemented. Skipping assembly.\n")
        print("\n################## End: SPADES ASSEMBLY ##################\n")

    else:
        # Clean paired and unpaired files exists. Take all these files as input.
        spades_base = _spades_base_command(Config)
        read_args = " --pe1-1 " + forward_paired + " --pe1-2 " + reverse_paired + " --pe1-s " + forward_unpaired + " --pe1-s " + reverse_unpaired
        cmdstring = spades_base + " " + ConfigSectionMap("spades", Config)['spades_parameters'] + read_args + " -o " + out_path + "spades_results"
        plasmid_cmdstring = spades_base + " " + ConfigSectionMap("spades", Config)['plasmid_spades_parameters'] + read_args + " -o " + out_path + "spades_plasmid_results"

        if do_assembly == "both":
            keep_logging('Running Spades and plasmid Spades assembly', 'Running Spades and plasmid Spades assembly', logger, 'debug')
            _run_logged_spades(cmdstring, contigs, scaffolds, out_path, logger)
            _run_logged_plasmid_spades(plasmid_cmdstring, out_path, logger)
        elif do_assembly == "wga":
            keep_logging('Running Spades assembly', 'Running Spades assembly', logger, 'debug')
            _run_logged_spades(cmdstring, contigs, scaffolds, out_path, logger)
        elif do_assembly == "plasmid":
            keep_logging('Running plasmid Spades assembly', 'Running plasmid Spades assembly', logger, 'debug')
            _run_logged_plasmid_spades(plasmid_cmdstring, out_path, logger)
        else:
            # Previously an unknown mode fell through to the return statement
            # with every result name unbound (NameError); make it visible instead.
            skipped_msg = 'Unrecognized do_assembly value: {}. No assembly was run.'.format(do_assembly)
            keep_logging(skipped_msg, skipped_msg, logger, 'info')

    return contigs, scaffolds, plasmid_contigs, plasmid_scaffolds