예제 #1
0
def gen_report(vcf):
    """Write a tab-delimited prioritized-impact report for an annotated indel VCF.

    The report goes to '<prefix>.indels.vep.prioritized_impact.report.xls' in the current
    directory and progress is logged to 'LOGS/<prefix>.indels.vep_priority.report.log',
    where <prefix> is the part of the VCF base name before the first '.'.

    :param vcf: path to the VCF; its header must define an ANN INFO field whose description
        lists the '|'-separated annotation sub-field names.
    :return: 0 once every record has been handed to output_highest_impact.
    """
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.indels.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    vcf_in = VariantFile(vcf)

    # the context manager closes the report even when a record fails to parse
    with open(parts[0] + '.indels.vep.prioritized_impact.report.xls', 'w') as out:
        # annotation sub-fields of interest; each value becomes that field's index within an ANN entry
        desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0,
                   'Amino_acids': 0, 'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0,
                   'VARIANT_CLASS': 0}

        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
        # enumerate works on both Python 2 and 3 (the previous xrange was Python-2 only)
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i
        out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
                  '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n')
        for record in vcf_in.fetch():
            # only the first ALT allele is reported
            (chrom, pos, ref, alt) = (record.contig, str(record.pos), record.ref, record.alts[0])
            # NOTE(review): alt_ct is read from MINCOV and non_alt_ct from ALTCOV - confirm this mapping
            alt_ct = str(record.info['MINCOV'])
            non_alt_ct = str(record.info['ALTCOV'])
            vaf = str(record.info['COVRATIO'])
            # here ANN is treated as one comma-separated string of '|'-delimited entries
            ann_list = [_.split('|') for _ in record.info['ANN'].split(',')]
            output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf, ann_list, desired, out)

    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
예제 #2
0
def novosort_merge_pe(config_file, sample_list):
    """Submit a novosort merge job, or create a symlink, for every sample in a list.

    For each sample the working directory is changed to
    '<project_dir><project>/<align>/<sample>/BAMS/'. When list_bam reports more than one BAM
    an sbatch job is submitted; otherwise the single BAM is symlinked to
    '<sample>.merged.transcriptome.bam'.

    :param config_file: configuration file handed to parse_config.
    :param sample_list: path to a text file with one sample name per line.
    :return: 0 after all samples have been processed.
    """
    (novosort, java_tool, picard_tool, project, project_dir, align, threads, ram, novo_merge_rmdup_slurm) \
        = parse_config(config_file)

    # the context manager guarantees the sample list is closed, even if a submission raises
    with open(sample_list, 'r') as fh:
        for sample in fh:
            sample = sample.rstrip('\n')
            if not sample:
                # a blank line carries no sample name
                continue
            loc = '../LOGS/' + sample + '.novosort_merge.log'
            job_loc = sample + '.novosort_merge.log'
            (bam_list, n) = list_bam(project, align, sample)
            bam_string = " ".join(bam_list)
            cur_dir = project_dir + project + '/' + align + '/' + sample + '/BAMS/'
            # NOTE: chdir is cumulative, so a relative project_dir only resolves for the first sample
            os.chdir(cur_dir)
            out_bam = sample + '.merged.transcriptome.bam'
            if n > 1:
                batch = 'sbatch -c ' + threads + ' --mem ' + ram + 'G -o ' + job_loc + ' --export=novosort="' \
                        + novosort + '",threads="' + threads + '",ram="' + ram + 'G",out_bam="' + out_bam \
                        + '",bam_string="' + bam_string + '",loc="' + loc + '"' + ' ' + novo_merge_rmdup_slurm
                log(loc, date_time() + 'Submitting merge bam job for sample ' + batch + "\n")
                subprocess.call(batch, shell=True)
            else:
                link_bam = 'ln -s ' + bam_list[0] + ' ' + sample + '.merged.transcriptome.bam;'
                log(loc, date_time() + 'Creating symlink for merged final bam since only one exists\n'
                    + link_bam + '\n')
                subprocess.call(link_bam, shell=True)

    sys.stderr.write(date_time() + 'Merged file request submitted and processed, check logs.\n')
    return 0
예제 #3
0
def cutadapter(sample, end1, end2, config_file):
    """Run cutadapt on a read pair, writing trimmed files under their base names.

    The inputs are read from the paths given; the outputs are written to the current
    directory using only the base names of those paths. The command and the tool's
    stdout/stderr are appended to '<sample>.cutadapt.log', placed in LOGS/ when that
    directory exists and in the current directory otherwise.

    :param sample: sample name used for the log file name.
    :param end1: path to the first read file.
    :param end2: path to the second read file.
    :param config_file: configuration file handed to parse_config.
    :return: always 0; the exit status of cutadapt is not inspected.
    """
    # designed to be run in a subdirectory, keep original file names
    in_r1, in_r2 = end1, end2
    out_r1 = os.path.basename(in_r1)
    out_r2 = os.path.basename(in_r2)
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = 'LOGS/' if os.path.isdir('LOGS') else './'
    loc = log_dir + sample + '.cutadapt.log'
    (cutadapt_tool, threads, minlen, r1adapt, r2adapt, r1trim, r2trim, qual,
     mqual) = parse_config(config_file)
    # with four or more threads configured, cutadapt gets half of them
    n_threads = int(threads)
    cut_th = str(n_threads // 2) if n_threads >= 4 else threads

    head = cutadapt_tool + ' -j ' + cut_th + ' -m ' + minlen + ' --quality-base=' + qual + ' -q ' + mqual
    # the adapter options are dropped only when both adapters are empty
    adapters = ' -a ' + r1adapt + ' -A ' + r2adapt
    if r1adapt == '' and r2adapt == '':
        adapters = ''
    tail = ' -u ' + r1trim + ' -U ' + r2trim + ' -o ' + out_r1 + ' -p ' + out_r2 + ' ' + in_r1 + ' ' + in_r2 \
           + ' >> ' + loc + ' 2>> ' + loc
    cutadapt_cmd = head + adapters + tail
    log(loc, date_time() + cutadapt_cmd + "\n")
    call(cutadapt_cmd, shell=True)
    return 0
예제 #4
0
def fastqc(fastqc_tool, sample, end1, end2, t):
    """Launch FastQC on a read pair in the background and abort if it fails quickly.

    The process is started without waiting for completion. After 20 seconds its status is
    polled: if it has already exited with any non-zero status the error is logged and the
    program exits with status 1. A process that is still running, or that finished
    successfully, lets the function return.

    :param fastqc_tool: FastQC executable (or command prefix).
    :param sample: sample name used for the log file name.
    :param end1: first read file.
    :param end2: second read file.
    :param t: thread count, as a string.
    :return: 0 (FastQC may still be running at that point).
    """
    from time import sleep

    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = 'LOGS/' if os.path.isdir('LOGS') else './'
    loc = log_dir + sample + '.fastqc.log'
    fastqc_cmd = fastqc_tool + ' --extract -t ' + t + ' -o QC/ ' + end1 + ' ' + end2
    log(loc, date_time() + fastqc_cmd + "\n")
    f = Popen(fastqc_cmd,
              shell=True,
              stdin=None,
              stdout=None,
              stderr=None,
              close_fds=True)
    # check after 20 seconds whether the process is still good - shouldn't take too long to ascertain
    # whether phred score didn't fit
    sleep(20)

    status = f.poll()
    # None means still running; any other non-zero value (not just 1) is an early failure
    if status is not None and status != 0:
        log(
            loc,
            date_time() +
            'fastqc returned an error.  Check your inputs and try again!\n')
        exit(1)
    return 0
예제 #5
0
 def organize_dirs(self):
     """Move the BAM, QC and LOG working directories' contents one level up.

     Creates '../<bam_dir>', '../<qc_dir>' and '../<log_dir>' when missing, moves the
     contents of the local directories into them, repoints self.loc at the relocated
     log file and removes the now-empty local directories. The exit statuses of the
     shell commands are not inspected.

     :return: 0 when every step ran without raising, 1 if an exception occurred.
     """
     try:
         # check for existing BAM, QC and LOG dirs one level up
         for label, dir_name in (('BAM', self.bam_dir), ('QC', self.qc_dir), ('LOGS', self.log_dir)):
             if not os.path.isdir('../' + dir_name):
                 mk_dir = 'mkdir ../' + dir_name
                 log(self.loc, date_time() + 'Making ' + label + ' directory ' + mk_dir + '\n')
                 call(mk_dir, shell=True)
         reloc_files = 'mv ' + self.bam_dir + '* ../' + self.bam_dir + '; mv ' + self.log_dir + '* ../' \
                       + self.log_dir + '; mv ' + self.qc_dir + '* ../' + self.qc_dir
         log(self.loc, date_time() + 'Relocating files ' + reloc_files + '\n')
         call(reloc_files, shell=True)
         # need to reassign log file location since it's being moved!
         self.loc = '../' + self.loc
         rm_old = 'rmdir ' + ' '.join((self.bam_dir, self.log_dir, self.qc_dir))
         log(self.loc, date_time() + 'Clearing out working dirs ' + rm_old + '\n')
         call(rm_old, shell=True)
         return 0
     except Exception:
         # Exception (not a bare except) so Ctrl-C and SystemExit are not turned into a return code
         return 1
예제 #6
0
def cutadapter(sample, end1, end2, config_file):
    """Trim a read pair with cutadapt, keeping the original file names for the outputs.

    Inputs are read from the supplied paths and outputs are written to the current
    directory under the inputs' base names. The command line and cutadapt's stdout and
    stderr are appended to '<sample>.cutadapt.log' (in LOGS/ if present, else './').

    :param sample: sample name used for the log file name.
    :param end1: path to the first read file.
    :param end2: path to the second read file.
    :param config_file: configuration file handed to parse_config.
    :return: always 0; cutadapt's exit status is not checked.
    """
    # designed to be run in a subdirectory, keep original file names
    in_r1, in_r2 = end1, end2
    out_r1 = os.path.basename(in_r1)
    out_r2 = os.path.basename(in_r2)
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = 'LOGS/' if os.path.isdir('LOGS') else './'
    loc = log_dir + sample + '.cutadapt.log'
    (cutadapt_tool, threads, minlen, r1adapt, r2adapt, r1trim, r2trim, qual, mqual) = parse_config(config_file)
    # four or more configured threads are halved for cutadapt
    n_threads = int(threads)
    cut_th = str(n_threads // 2) if n_threads >= 4 else threads

    head = cutadapt_tool + ' -j ' + cut_th + ' -m ' + minlen + ' --quality-base=' + qual + ' -q ' + mqual
    # adapter options are omitted only when both adapters are empty strings
    adapters = ' -a ' + r1adapt + ' -A ' + r2adapt
    if r1adapt == '' and r2adapt == '':
        adapters = ''
    tail = ' -u ' + r1trim + ' -U ' + r2trim + ' -o ' + out_r1 + ' -p ' + out_r2 + ' ' + in_r1 + ' ' + in_r2 \
           + ' >> ' + loc + ' 2>> ' + loc
    cutadapt_cmd = head + adapters + tail
    log(loc, date_time() + cutadapt_cmd + "\n")
    call(cutadapt_cmd, shell=True)
    return 0
예제 #7
0
def align_stats(sample):
    """Convert a sample's alignment summary and insert-size metrics into a one-row table.

    Reads '<sample>/align_summary.txt' and the eighth line of
    '<sample>_subset.insert_metrics.hist', and writes a header plus one tab-separated
    row to '<sample>.align.txt'. Fields are extracted positionally, line by line, so the
    input layout must match what the patterns below expect.

    :param sample: sample name; used for the input paths, output file and log file.
    :return: 0 on success.
    :raises AttributeError: if a summary line does not match its expected pattern.
    :raises StopIteration: if either input has fewer lines than expected.
    """
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = 'LOGS/' if os.path.isdir('LOGS') else './'
    loc = log_dir + sample + '.aln.log'
    log(loc, date_time() + "Converting to table summary format\n")
    header = ('Sample\tMean insert size estimate(10k reads)\tStd dev read insert size estimate(10 k reads)'
              '\tStarting left reads\t% mapped\tmultimapped(mm)\tgt 20 mm'
              '\tStarting right reads\t% mapped\t% mm\tgt 20 mm'
              '\tOverall map rate\tAligned pairs\t% mm\t% discordant\t% condordant\n')
    # context managers close all three files; previously `fo.close` was never actually called
    with open(sample + '/' + 'align_summary.txt', 'r') as fh, open(sample + '.align.txt', 'w') as fo:
        fo.write(header + sample + '\t')
        with open(sample + '_subset.insert_metrics.hist') as fi:
            # the values of interest sit on the eighth line
            for _ in range(7):
                next(fi)
            stat = next(fi).split('\t')
        fo.write('\t'.join([str(int(float(stat[4]))), str(int(float(stat[5])))]))

        def write_fields(pattern, line, end=''):
            # write every captured group of `pattern` in `line`, each preceded by a tab
            m = re.search(pattern, line)
            fo.write('\t' + '\t'.join(m.groups()) + end)

        count_re = r'(\d+)\n$'
        pct_input_re = r'\(\s*(\S+) of input\)\n'
        multi_re = r'\(\s*(\S+)\).*\((\d+) have >20\)\n'

        # left reads: skip one line, then count, % mapped, multimapped
        next(fh)
        write_fields(count_re, next(fh))
        write_fields(pct_input_re, next(fh))
        write_fields(multi_re, next(fh))

        # right reads: same layout
        next(fh)
        write_fields(count_re, next(fh))
        write_fields(pct_input_re, next(fh))
        write_fields(multi_re, next(fh))
        # overall rate: first non-space token at the very start of the line
        write_fields(r'\s*(^\S+)', next(fh))
        next(fh)

        # aligned pairs section
        write_fields(count_re, next(fh))
        write_fields(r'\(\s*(\S+)\) have', next(fh))
        write_fields(r'\(\s*(\S+)\) are', next(fh))
        write_fields(r'^\s*(\S+)', next(fh), end='\n')
    return 0
예제 #8
0
def gen_report(vcf, ref_flag):
    """Write a tab-delimited prioritized-impact report for an annotated indel VCF.

    The report goes to '<sample>.indels.vep.prioritized_impact.report.xls' in the current
    directory and progress is logged to 'LOGS/<sample>.indels.vep_priority.report.log',
    where <sample> is the part of the VCF base name before the first '.'.

    :param vcf: path to the VCF; its header must define an ANN INFO field whose description
        lists the '|'-separated annotation sub-field names.
    :param ref_flag: 'n' to pass through unchanged; any other value is replaced by the
        result of create_index(ref_flag) before being handed to output_highest_impact.
    :return: 0 once every record has been handed to output_highest_impact.
    """
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    sample = parts[0]
    loc = 'LOGS/' + sample + '.indels.vep_priority.report.log'
    log(loc,
        date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    vcf_in = VariantFile(vcf)
    out_fn = sample + '.indels.vep.prioritized_impact.report.xls'
    # the context manager closes the report even when a record fails to parse
    with open(out_fn, 'w') as out:
        # annotation sub-fields of interest; each value becomes that field's index within an ANN entry
        desired = {
            'Consequence': 0,
            'IMPACT': 0,
            'SYMBOL': 0,
            'Feature': 0,
            'Protein_position': 0,
            'Amino_acids': 0,
            'Codons': 0,
            'Existing_variation': 0,
            'ExAC_MAF': 0,
            'BIOTYPE': 0,
            'VARIANT_CLASS': 0
        }

        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace(
            'Consequence annotations from Ensembl VEP. Format: ', '')
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i
        out.write(
            'chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
            '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n'
        )
        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)

        for record in vcf_in.fetch():
            # only the first ALT allele is reported
            (chrom, pos, ref, alt) = (record.contig, str(record.pos), record.ref, record.alts[0])
            # NOTE(review): alt_ct is read from MINCOV and non_alt_ct from ALTCOV - confirm this mapping
            alt_ct = str(record.info['MINCOV'])
            non_alt_ct = str(record.info['ALTCOV'])
            vaf = str(record.info['COVRATIO'])
            # here ANN is iterated directly, one '|'-delimited entry per element
            ann_list = [_.split('|') for _ in record.info['ANN']]
            output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf,
                                  ann_list, desired, out, ref_flag)

    log(
        loc,
        date_time() + 'Creating prioritized report for ' + vcf +
        ' complete!\n')
    return 0
예제 #9
0
def picard_rmdup(java_tool, picard_tool, picard_tmp, sample, log_dir, ram):
    """Run Picard MarkDuplicates (duplicates removed) on '<sample>.srt.bam'.

    Writes '<sample>.rmdup.srt.bam' and '<sample>.rmdup.srt.metrics'. The command is
    logged to, and its output redirected into, '<log_dir><sample>.picard.rmdup.pe.log'.
    Nothing is returned and the exit status is not inspected.

    :param java_tool: java executable.
    :param picard_tool: path to the Picard jar.
    :param picard_tmp: value passed as TMP_DIR.
    :param sample: sample name / file prefix.
    :param log_dir: log directory prefix (concatenated as-is, so include any trailing '/').
    :param ram: Java heap size in gigabytes, as a string.
    """
    rmdup_log = log_dir + sample + ".picard.rmdup.pe.log"
    cmd_pieces = [
        java_tool, " -Xmx", ram, "g -jar ", picard_tool,
        " MarkDuplicates CREATE_INDEX=true TMP_DIR=", picard_tmp,
        " REMOVE_DUPLICATES=true ASSUME_SORTED=true MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500 INPUT=", sample,
        ".srt.bam OUTPUT=", sample,
        ".rmdup.srt.bam METRICS_FILE=", sample,
        ".rmdup.srt.metrics VALIDATION_STRINGENCY=LENIENT > ", rmdup_log, " 2>&1",
    ]
    picard_rmdup_cmd = "".join(cmd_pieces)
    log(rmdup_log, date_time() + picard_rmdup_cmd + "\n")
    call(picard_rmdup_cmd, shell=True)
예제 #10
0
def align_stats(sample):
    """Convert a sample's alignment summary and insert-size metrics into a one-row table.

    Reads '<sample>/align_summary.txt' and the eighth line of
    '<sample>_subset.insert_metrics.hist', and writes a header plus one tab-separated
    row to '<sample>.align.txt'. Fields are extracted positionally, line by line, so the
    input layout must match what the patterns below expect.

    :param sample: sample name; used for the input paths, output file and log file.
    :return: 0 on success.
    :raises AttributeError: if a summary line does not match its expected pattern.
    :raises StopIteration: if either input has fewer lines than expected.
    """
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = 'LOGS/' if os.path.isdir('LOGS') else './'
    loc = log_dir + sample + '.aln.log'
    log(loc, date_time() + "Converting to table summary format\n")
    header = ('Sample\tMean insert size estimate(10k reads)\tStd dev read insert size estimate(10 k reads)'
              '\tStarting left reads\t% mapped\tmultimapped(mm)\tgt 20 mm'
              '\tStarting right reads\t% mapped\t% mm\tgt 20 mm'
              '\tOverall map rate\tAligned pairs\t% mm\t% discordant\t% condordant\n')
    # context managers close all three files; previously `fo.close` was never actually called
    with open(sample + '/' + 'align_summary.txt', 'r') as fh, open(sample + '.align.txt', 'w') as fo:
        fo.write(header + sample + '\t')
        with open(sample + '_subset.insert_metrics.hist') as fi:
            # the values of interest sit on the eighth line
            for _ in range(7):
                next(fi)
            stat = next(fi).split('\t')
        fo.write('\t'.join([str(int(float(stat[4]))), str(int(float(stat[5])))]))

        def write_fields(pattern, line, end=''):
            # write every captured group of `pattern` in `line`, each preceded by a tab
            m = re.search(pattern, line)
            fo.write('\t' + '\t'.join(m.groups()) + end)

        count_re = r'(\d+)\n$'
        pct_input_re = r'\(\s*(\S+) of input\)\n'
        multi_re = r'\(\s*(\S+)\).*\((\d+) have >20\)\n'

        # left reads: skip one line, then count, % mapped, multimapped
        next(fh)
        write_fields(count_re, next(fh))
        write_fields(pct_input_re, next(fh))
        write_fields(multi_re, next(fh))

        # right reads: same layout
        next(fh)
        write_fields(count_re, next(fh))
        write_fields(pct_input_re, next(fh))
        write_fields(multi_re, next(fh))
        # overall rate: first non-space token at the very start of the line
        write_fields(r'\s*(^\S+)', next(fh))
        next(fh)

        # aligned pairs section
        write_fields(count_re, next(fh))
        write_fields(r'\(\s*(\S+)\) have', next(fh))
        write_fields(r'\(\s*(\S+)\) are', next(fh))
        write_fields(r'^\s*(\S+)', next(fh), end='\n')
    return 0
예제 #11
0
def novosort_merge_pe(config_file, sample_list):
    """Submit a merge/rmdup job, or create symlinks, for every sample in a list.

    For each sample the working directory is changed to
    '<project_dir><project>/<align>/<sample>/BAM/'. With more than one BAM an sbatch job
    is submitted: the novosort script when rmdup is 'Y', otherwise the novosort + Picard
    script. With a single BAM, the BAM and its index are symlinked to
    '<sample>.merged.final.bam' and '<sample>.merged.final.bam.bai'.

    :param config_file: configuration file handed to parse_config.
    :param sample_list: path to a text file with one sample name per line.
    :return: 0 after all samples have been processed.
    """
    (novosort, java_tool, picard_tool, project, project_dir, align, threads, ram, rmdup, novo_merge_rmdup_slurm,
     novo_picard_merge_rmdup_slurm) = parse_config(config_file)

    # the context manager guarantees the sample list is closed, even if a submission raises
    with open(sample_list, 'r') as fh:
        for sample in fh:
            sample = sample.rstrip('\n')
            if not sample:
                # a blank line carries no sample name
                continue
            loc = sample + '.novosort_merge.log'
            (bam_list, bai_list, n) = list_bam(project, align, sample)
            bam_string = " ".join(bam_list)
            cur_dir = project_dir + project + '/' + align + '/' + sample + '/BAM/'
            # NOTE: chdir is cumulative, so a relative project_dir only resolves for the first sample
            os.chdir(cur_dir)
            out_bam = sample + '.merged.final.bam'
            if n > 1:
                if rmdup == 'Y':
                    job_loc = sample + '.novosort_merge.log'
                    job_name = sample + '_novosort_merge'

                    batch = 'sbatch -c ' + threads + ' -J ' + job_name + ' --mem ' + ram + 'G -o ' + job_loc \
                            + ' --export=novosort="' + novosort + '",threads="' + threads + '",ram="' + ram \
                            + 'G",out_bam="' + out_bam + '",bam_string="' + bam_string + '",loc="' + loc + '"' \
                            + ' ' + novo_merge_rmdup_slurm
                    log(loc, date_time() + 'Submitting merge bam job for sample ' + batch + "\n")
                    subprocess.call(batch, shell=True)
                else:
                    # run legacy pipe for removing dups using picard
                    picard_tmp = 'picard_tmp'
                    job_loc = sample + '.novosort_merge.picard_rmdup.log'
                    job_name = sample + '_novosort_merge.picard_rmdup'

                    # setting max records in ram to half of ram
                    recs = str(int((int(ram) / 2) * (1000000000 / 200)))
                    in_bam = sample + '.merged.bam'
                    in_bai = sample + '.merged.bam.bai'

                    mets = sample + '.rmdup.srt.metrics'
                    batch = 'sbatch -c ' + threads + ' --mem ' + ram + 'G -o ' + job_loc + ' -J ' + job_name \
                            + ' --export=novosort="' + novosort + '",threads="' + threads + '",ram="' + ram \
                            + 'G",in_bam="' + in_bam + '",bam_string="' + bam_string + '",loc="' + job_loc \
                            + '",java_tool="' + java_tool + '",picard_tool="' + picard_tool + '",tmp="' \
                            + picard_tmp + '",recs="' + recs + '",out_bam="' + out_bam + '",mets="' + mets \
                            + '",in_bai="' + in_bai + '" ' + novo_picard_merge_rmdup_slurm
                    sys.stderr.write(date_time() + 'Merging with novosort and rmdup with picard for legacy reasons!\n'
                                     + batch + '\n')
                    subprocess.call(batch, shell=True)
            else:
                link_bam = 'ln -s ' + bam_list[0] + ' ' + sample + '.merged.final.bam; ln -s ' + bai_list[0] \
                           + ' ' + sample + '.merged.final.bam.bai'
                log(loc, date_time() + 'Creating symlink for merged final bam since only one exists\n'
                    + link_bam + '\n')
                subprocess.call(link_bam, shell=True)

    sys.stderr.write(date_time() + 'Merged file request submitted and processed, check logs.\n')
    return 0
예제 #12
0
def picard_rmdup(java_tool, picard_tool, picard_tmp, sample, log_dir, ram):
    """Remove duplicates from '<sample>.srt.bam' with Picard MarkDuplicates.

    Produces '<sample>.rmdup.srt.bam' and '<sample>.rmdup.srt.metrics'. The command line
    is logged to '<log_dir><sample>.picard.rmdup.pe.log', which also receives the tool's
    stdout and stderr. Nothing is returned and the exit status is not inspected.

    :param java_tool: java executable.
    :param picard_tool: path to the Picard jar.
    :param picard_tmp: value passed as TMP_DIR.
    :param sample: sample name / file prefix.
    :param log_dir: log directory prefix (concatenated as-is, so include any trailing '/').
    :param ram: Java heap size in gigabytes, as a string.
    """
    rmdup_log = log_dir + sample + ".picard.rmdup.pe.log"
    cmd_pieces = [
        java_tool, " -Xmx", ram, "g -jar ", picard_tool,
        " MarkDuplicates CREATE_INDEX=true TMP_DIR=", picard_tmp,
        " REMOVE_DUPLICATES=true ASSUME_SORTED=true MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500 INPUT=", sample,
        ".srt.bam OUTPUT=", sample,
        ".rmdup.srt.bam METRICS_FILE=", sample,
        ".rmdup.srt.metrics VALIDATION_STRINGENCY=LENIENT > ", rmdup_log, " 2>&1",
    ]
    picard_rmdup_cmd = "".join(cmd_pieces)
    log(rmdup_log, date_time() + picard_rmdup_cmd + "\n")
    call(picard_rmdup_cmd, shell=True)
예제 #13
0
def bwa_mem_pe(bwa_tool, RGRP, bwa_ref, end1, end2, samtools_tool, samtools_ref, sample, log_dir, threads):
    """Align a read pair with 'bwa mem' and pipe the result through samtools into '<sample>.bam'.

    The command is logged to '<log_dir><sample>.bwa.pe.log', which also receives the
    pipeline's stderr. If the shell command exits non-zero (or cannot be started) the
    failure is logged and the program exits with status 1.

    :param bwa_tool: bwa executable.
    :param RGRP: read-group string passed to 'bwa mem -R'.
    :param bwa_ref: bwa reference index prefix.
    :param end1: first read file.
    :param end2: second read file.
    :param samtools_tool: samtools executable.
    :param samtools_ref: reference passed to 'samtools view -T'.
    :param sample: sample name / output prefix.
    :param log_dir: log directory prefix (concatenated as-is).
    :param threads: thread count, as a string.
    :return: 0 on success.
    """
    loc = log_dir + sample + ".bwa.pe.log"
    bwa_cmd = "(" + bwa_tool + " mem -t " + threads + " -R \"" + RGRP + "\" -v 2 " + bwa_ref + " " + end1 + " " \
              + end2 + " | " + samtools_tool + " view -bT " + samtools_ref + " - > " + sample + ".bam) > " + loc \
              + " 2>&1"
    log(loc, date_time() + bwa_cmd + "\n")
    try:
        # all output is redirected by the shell, so there is nothing to capture
        # NOTE(review): without pipefail the status is that of samtools, not bwa - confirm this is intended
        subprocess.check_call(bwa_cmd, shell=True)
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C is not swallowed; record why we are exiting
        log(loc, date_time() + 'bwa mem failed for sample ' + sample + ': ' + str(err) + '\n')
        exit(1)
    return 0
예제 #14
0
def bwt2_pe(bwt_tool, bwt_ref, end1, end2, samtools_tool, samtools_ref, sample, t, log_dir):
    """Align a read pair with bowtie2 and pipe the result through samtools into '<sample>.bam'.

    The command is logged to '<log_dir><sample>.bwt.pe.log', which also receives the
    pipeline's stderr.

    :param bwt_tool: bowtie2 executable.
    :param bwt_ref: bowtie2 index prefix (passed to -x).
    :param end1: first read file.
    :param end2: second read file.
    :param samtools_tool: samtools executable.
    :param samtools_ref: reference passed to 'samtools view -T'.
    :param sample: sample name / output prefix.
    :param t: thread count, as a string.
    :param log_dir: log directory prefix (concatenated as-is).
    :return: 0 when the shell command exits with status 0, otherwise 1.
    """
    loc = log_dir + sample + ".bwt.pe.log"
    bwt_cmd = "(" + bwt_tool + " --fr -p " + t + " -I 0 -X 500 -x " + bwt_ref + " -1 " + end1 + " -2 " + end2 + " | " \
              + samtools_tool + " view -bT " + samtools_ref + " - > " + sample + ".bam) > " + loc + " 2>&1"
    log(loc, date_time() + bwt_cmd + "\n")
    try:
        status = call(bwt_cmd, shell=True)
    except Exception:
        return 1
    # call() reports failure through its return value rather than by raising,
    # so the status has to be checked explicitly
    if status != 0:
        return 1
    return 0
예제 #15
0
def watch_mem(proc_obj, sample, loc):
    """Log system memory usage every 30 seconds while a process is running.

    :param proc_obj: object with a poll() method that returns None while the process runs.
    :param sample: sample name included in the log messages.
    :param loc: log file location handed to log().
    :return: 1 as soon as system memory usage reaches 99% (the process is left running);
        otherwise whatever proc_obj.poll() reports once the process has finished.
    """
    import time

    running = proc_obj.poll() is None
    while running:
        used_pct = psutil.virtual_memory().percent
        usage_msg = 'Current memory usage at ' + str(used_pct) + '% processing sample ' + sample + '\n'
        log(loc, date_time() + usage_msg)
        if used_pct >= 99:
            log(loc, date_time() + 'Memory exceeded while running VEP.')
            return 1
        time.sleep(30)
        running = proc_obj.poll() is None

    return proc_obj.poll()
예제 #16
0
def parseFASTQC(FASTQC, loc):
    """Return the text after the first '-' on the line that follows eight skipped lines.

    NOTE(review): presumably this is the upper bound of a FastQC sequence-length range -
    confirm against the report format.

    :param FASTQC: path to the file to parse.
    :param loc: log file location used when parsing fails.
    :return: the second '-'-separated piece of that line, with trailing tabs removed.

    On any failure to open or parse the file, the problem is logged and the program
    exits with status 1.
    """
    try:
        # the context manager closes the handle even when parsing fails
        with open(FASTQC, 'r') as fh:
            skip_lines(fh, 8)
            len_range = next(fh)
        info = len_range.rstrip('\n').split('-')
        return info[1].rstrip('\t')
    except Exception:
        # newline-terminated like the other parsers' messages, so log entries do not run together
        log(loc, date_time() + 'Unable to open/process file ' + FASTQC + '\n')
        exit(1)
예제 #17
0
def wg_mode(scalpel, tumor_bam, normal_bam, fasta, cpus, pair, config_file):
    """Run scalpel in somatic mode restricted to the exome BED named in the config.

    :param scalpel: scalpel executable.
    :param tumor_bam: tumor BAM path.
    :param normal_bam: normal BAM path.
    :param fasta: reference FASTA path.
    :param cpus: process count, as a string.
    :param pair: identifier used for the log file name and echoed back in the result.
    :param config_file: JSON config; config['refs']['exome'] supplies the BED file.
    :return: (0, pair) when scalpel exits with status 0, otherwise (1, pair).
    """
    # json.load on a managed handle so the config file is closed promptly
    with open(config_file, 'r') as cfg:
        config_data = json.load(cfg)
    exome = config_data['refs']['exome']
    # NOTE(review): `pair` appears twice in the log name - confirm this is intended
    loc = 'LOGS/' + pair + '_' + pair + '.genome_as_exome.scalpel.log'
    cmd = scalpel + ' --somatic --logs --numprocs ' + cpus + ' --tumor ' + tumor_bam + ' --normal ' \
        + normal_bam + ' --window 600 --two-pass --bed ' + exome + ' --ref ' + fasta + ' 2> ' + loc
    log(loc, date_time() + cmd + '\n')
    check = call(cmd, shell=True)
    if check != 0:
        return 1, pair
    return 0, pair
예제 #18
0
def picard_sort_pe(java_tool, picard_tool, picard_tmp, sample, log_dir):
    """Coordinate-sort '<sample>.bam' into '<sample>.srt.bam' with Picard SortSam.

    The command is logged to, and its output redirected into,
    '<log_dir><sample>.picard.sort.pe.log'. If the command exits non-zero (or cannot be
    started) the failure is logged and the program exits with status 1.

    :param java_tool: java executable.
    :param picard_tool: path to the Picard jar.
    :param picard_tmp: value passed as TMP_DIR.
    :param sample: sample name / file prefix.
    :param log_dir: log directory prefix (concatenated as-is).
    :return: 0 on success.
    """
    loc = log_dir + sample + ".picard.sort.pe.log"
    picard_sort_pe_cmd = java_tool + " -Xmx8g -jar " + picard_tool + " SortSam CREATE_INDEX=true TMP_DIR=" \
                         + picard_tmp + " INPUT=" + sample + ".bam OUTPUT=" + sample + ".srt.bam SORT_ORDER=" \
                         "coordinate VALIDATION_STRINGENCY=LENIENT > " + loc + " 2>&1"
    log(loc, date_time() + picard_sort_pe_cmd + "\n")
    try:
        # output is redirected by the shell, so there is nothing to capture
        subprocess.check_call(picard_sort_pe_cmd, shell=True)
    except Exception:
        # Exception (not a bare except) so Ctrl-C is not reported as a Picard failure
        log(loc, 'Picard sort failed for sample ' + sample + '.  Check for borg!\n')
        exit(1)
    return 0
예제 #19
0
def watch_mem(proc_obj, source, sample, loc):
    """Log system memory usage every 30 seconds while a process is running.

    :param proc_obj: object with a poll() method that returns None while the process runs.
    :param source: source label included in the log messages.
    :param sample: sample name included in the log messages.
    :param loc: log file location handed to log().
    :return: 1 as soon as system memory usage reaches 99% (the process is left running);
        otherwise whatever proc_obj.poll() reports once the process has finished.
    """
    import time

    running = proc_obj.poll() is None
    while running:
        used_pct = psutil.virtual_memory().percent
        usage_msg = 'Current memory usage at ' + str(used_pct) + '% processing sample ' + sample \
                    + ' from source ' + source + '\n'
        log(loc, date_time() + usage_msg)
        if used_pct >= 99:
            log(loc, date_time() + 'Memory exceeded while running VEP.')
            return 1
        time.sleep(30)
        running = proc_obj.poll() is None

    return proc_obj.poll()
예제 #20
0
def parseINS(INS, loc):
    """Return columns 0, 1, 4 and 5 of the tab-separated line that follows seven skipped lines.

    NOTE(review): presumably a Picard insert-size metrics file - confirm which metrics
    these columns hold.

    :param INS: path to the file to parse.
    :param loc: log file location used when parsing fails.
    :return: tuple of four strings (fields 0, 1, 4, 5 of that line).

    On any failure to open or parse the file, the problem is logged and the program
    exits with status 1.
    """
    try:
        # the context manager closes the handle even when parsing fails
        with open(INS, 'r') as fh:
            skip_lines(fh, 7)
            line = next(fh)
        stats = line.rstrip('\n').split('\t')
        return stats[0], stats[1], stats[4], stats[5]
    except Exception:
        log(loc, date_time() + 'Unable to open/process file ' + INS + '\n')
        exit(1)
예제 #21
0
def bwa_mem_pe(bwa_tool, RGRP, bwa_ref, end1, end2, samtools_tool,
               samtools_ref, sample, log_dir, threads):
    """Align a read pair with 'bwa mem' and pipe the result through samtools into '<sample>.bam'.

    The command is logged to '<log_dir><sample>.bwa.pe.log', which also receives the
    pipeline's stderr. If the shell command exits non-zero (or cannot be started) the
    failure is logged and the program exits with status 1.

    :param bwa_tool: bwa executable.
    :param RGRP: read-group string passed to 'bwa mem -R'.
    :param bwa_ref: bwa reference index prefix.
    :param end1: first read file.
    :param end2: second read file.
    :param samtools_tool: samtools executable.
    :param samtools_ref: reference passed to 'samtools view -T'.
    :param sample: sample name / output prefix.
    :param log_dir: log directory prefix (concatenated as-is).
    :param threads: thread count, as a string.
    :return: 0 on success.
    """
    loc = log_dir + sample + ".bwa.pe.log"
    bwa_cmd = "(" + bwa_tool + " mem -t " + threads + " -R \"" + RGRP + "\" -v 2 " + bwa_ref + " " + end1 + " " \
              + end2 + " | " + samtools_tool + " view -bT " + samtools_ref + " - > " + sample + ".bam) > " + loc \
              + " 2>&1"
    log(loc, date_time() + bwa_cmd + "\n")
    try:
        # all output is redirected by the shell, so there is nothing to capture or decode
        # NOTE(review): without pipefail the status is that of samtools, not bwa - confirm this is intended
        subprocess.check_call(bwa_cmd, shell=True)
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C is not swallowed; record why we are exiting
        log(loc, date_time() + 'bwa mem failed for sample ' + sample + ': ' + str(err) + '\n')
        exit(1)
    return 0
예제 #22
0
def fastqc(fastqc_tool, sample, end1, end2, t):
    """Run FastQC on a read pair, writing results to QC/, and wait for it to finish.

    The command line is logged to '<sample>.fastqc.log' (in LOGS/ if that directory
    exists, else the current directory).

    :param fastqc_tool: FastQC executable (or command prefix).
    :param sample: sample name used for the log file name.
    :param end1: first read file.
    :param end2: second read file.
    :param t: thread count, as a string.
    :return: always 0; FastQC's exit status is not inspected.
    """
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = 'LOGS/' if os.path.isdir('LOGS') else './'
    loc = ''.join([log_dir, sample, '.fastqc.log'])
    fastqc_cmd = ' '.join([fastqc_tool, '-t', t, '-o', 'QC/', end1, end2])
    log(loc, date_time() + fastqc_cmd + "\n")
    # blocking call; the result is deliberately discarded
    call(fastqc_cmd, shell=True)
    return 0
예제 #23
0
def picard_insert_size(java_tool, picard_tool, sample, log_dir, ram):
    """Run Picard CollectInsertSizeMetrics on '<sample>.rmdup.srt.bam'.

    Writes '<sample>.insert_metrics.pdf' and '<sample>.insert_metrics.hist'; the command
    and its output are appended to '<log_dir><sample>.picard.insert_size.log'.

    :param java_tool: java executable.
    :param picard_tool: path to the Picard jar.
    :param sample: sample name / file prefix.
    :param log_dir: log directory prefix (concatenated as-is).
    :param ram: Java heap size in gigabytes, as a string.
    :return: 0 when Picard exits with status 0; 1 (after logging) otherwise.
    """
    loc = log_dir + sample + ".picard.insert_size.log"
    picard_insert_size_cmd = java_tool + " -Xmx" + ram + "g -jar " + picard_tool + " CollectInsertSizeMetrics I=" \
                             + sample + ".rmdup.srt.bam H=" + sample + ".insert_metrics.pdf O=" \
                             + sample + ".insert_metrics.hist  >> " + loc + " 2>&1"
    log(loc, date_time() + picard_insert_size_cmd + "\n")
    try:
        # call() signals failure through its return value, not an exception
        failed = call(picard_insert_size_cmd, shell=True) != 0
    except Exception:
        failed = True
    if failed:
        log(loc, date_time() + 'Picard failed using java ' + java_tool + '\n')
        return 1
    return 0
예제 #24
0
def picard_insert_size(java_tool, picard_tool, sample, log_dir, ram):
    """Collect insert-size metrics for '<sample>.rmdup.srt.bam' with Picard.

    Writes '<sample>.insert_metrics.pdf' and '<sample>.insert_metrics.hist'; the command
    and its output are appended to '<log_dir><sample>.picard.insert_size.log'.

    :param java_tool: java executable.
    :param picard_tool: path to the Picard jar.
    :param sample: sample name / file prefix.
    :param log_dir: log directory prefix (concatenated as-is).
    :param ram: Java heap size in gigabytes, as a string.
    :return: 0 when Picard exits with status 0; 1 (after logging) otherwise.
    """
    loc = log_dir + sample + ".picard.insert_size.log"
    picard_insert_size_cmd = java_tool + " -Xmx" + ram + "g -jar " + picard_tool + " CollectInsertSizeMetrics I=" \
                             + sample + ".rmdup.srt.bam H=" + sample + ".insert_metrics.pdf O=" \
                             + sample + ".insert_metrics.hist  >> " + loc + " 2>&1"
    log(loc, date_time() + picard_insert_size_cmd + "\n")
    try:
        # call() signals failure through its return value, not an exception
        failed = call(picard_insert_size_cmd, shell=True) != 0
    except Exception:
        failed = True
    if failed:
        log(loc, date_time() + 'Picard failed using java ' + java_tool + '\n')
        return 1
    return 0
예제 #25
0
def fastqc(fastqc_tool, sample, end1, end2, t):
    """Run FastQC on a read pair (output directory QC/) and wait for completion.

    The command line is logged to '<sample>.fastqc.log', placed in LOGS/ when that
    directory exists and in the current directory otherwise.

    :param fastqc_tool: FastQC executable (or command prefix).
    :param sample: sample name used for the log file name.
    :param end1: first read file.
    :param end2: second read file.
    :param t: thread count, as a string.
    :return: always 0; FastQC's exit status is not inspected.
    """
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = 'LOGS/' if os.path.isdir('LOGS') else './'
    loc = ''.join([log_dir, sample, '.fastqc.log'])
    fastqc_cmd = ' '.join([fastqc_tool, '-t', t, '-o', 'QC/', end1, end2])
    log(loc, date_time() + fastqc_cmd + "\n")
    # blocking call; the result is deliberately discarded
    call(fastqc_cmd, shell=True)
    return 0
예제 #26
0
def parseSTAR(STAR, loc):
    """Extract ten statistics from fixed line positions of a STAR log file.

    Nine values are taken, via processSTAR, from individual lines; the tenth is the sum of
    three consecutive percentage lines, rounded to two decimals and suffixed with '%'.
    The trailing comments give the names the values are known by in this code; their exact
    meaning depends on the STAR log layout (NOTE(review): confirm against a real log).

    :param STAR: path to the log file to parse.
    :param loc: log file location used when parsing fails.
    :return: list of ten strings/values in file order.

    On any failure to open or parse the file, the problem is logged and the program
    exits with status 1.
    """
    try:
        stats = []
        # the context manager closes the handle even when parsing fails
        with open(STAR, 'r') as fh:
            skip_lines(fh, 5)
            stats.append(processSTAR(next(fh)))  # num_rds
            skip_lines(fh, 3)
            stats.append(processSTAR(next(fh)))  # uniq
            next(fh)
            stats.append(processSTAR(next(fh)))  # sjt
            skip_lines(fh, 4)
            stats.append(processSTAR(next(fh)))  # nsj
            stats.append(processSTAR(next(fh)))  # mm
            stats.append(processSTAR(next(fh)))  # delrate
            next(fh)
            stats.append(processSTAR(next(fh)))  # ins
            skip_lines(fh, 3)
            stats.append(processSTAR(next(fh)))  # mml
            next(fh)
            stats.append(processSTAR(next(fh)))  # mmml
            next(fh)
            # three consecutive unmapped-read percentages are combined into one total
            unmapped = [processSTAR(next(fh)) for _ in range(3)]
        unmap_tot = round(sum(float(pct.rstrip('%')) for pct in unmapped), 2)
        stats.append(str(unmap_tot) + '%')
        return stats
    except Exception:
        log(loc, date_time() + 'Unable to open/process file ' + STAR + '\n')
        exit(1)
예제 #27
0
def scalpel_indel(tumor_id, normal_id, log_dir, config_file):
    """Call somatic indels for a tumor/normal pair with scalpel and tidy up the output.

    In capture mode (config value wg == 'n') scalpel is run directly against the
    configured BED file; otherwise the work is delegated to wg_mode. Afterwards the
    contents of 'outdir/main' are moved into the current directory and, when the
    dustmask flag is 'Y', filter_indel is applied. Any failing step writes a message to
    stderr and exits the program with status 1.

    :param tumor_id: tumor sample identifier.
    :param normal_id: normal sample identifier.
    :param log_dir: log directory prefix (concatenated as-is).
    :param config_file: configuration file handed to parse_config.
    :return: 0 on success.
    """
    (scalpel, bedtools, bed, fasta, cpus, dustmask_flag, dustmask_bed, wg, project_dir, project, align) \
        = parse_config(config_file)

    sample_pair = tumor_id + '_' + normal_id
    loc = log_dir + sample_pair + '.scalpel.log'
    bam_root = project_dir + project + '/' + align + '/'
    tumor_bam = bam_root + tumor_id + '/BAM/' + tumor_id + '.merged.final.bam'
    normal_bam = bam_root + normal_id + '/BAM/' + normal_id + '.merged.final.bam'

    if wg != 'n':
        # whole-genome mode: the first element of wg_mode's result is the status
        wg_result = wg_mode(scalpel, tumor_bam, normal_bam, fasta, cpus, sample_pair, config_file)
        if wg_result[0] != 0:
            sys.stderr.write('Scalpel failed for ' + normal_id + ' at ' + tumor_id + '\n')
            exit(1)
    else:
        scalpel_cmd = scalpel + ' --somatic --logs --numprocs ' + cpus + ' --tumor ' + tumor_bam \
                      + ' --normal ' + normal_bam + ' --bed ' + bed + ' --ref ' + fasta + ' 2>> ' + loc
        sys.stderr.write(date_time() + 'Starting indel calls for ' + sample_pair + '\n')
        start_msg = 'Starting indel calls for ' + sample_pair + ' in capture mode with command:\n' \
                    + scalpel_cmd + '\n'
        log(loc, date_time() + start_msg)
        if call(scalpel_cmd, shell=True) != 0:
            fail_msg = 'Indel calling failed for pair ' + sample_pair + ' with command:\n' + scalpel_cmd + '\n'
            sys.stderr.write(date_time() + fail_msg)
            exit(1)

    log(loc, date_time() + 'Indel calling complete for pair ' + sample_pair + ' moving output files\n')
    mv_cmd = 'mv outdir/main/* .; rmdir outdir/main;'
    log(loc, date_time() + mv_cmd + '\n')
    call(mv_cmd, shell=True)
    sys.stderr.write(date_time() + 'Completed indel calls for ' + sample_pair + '\n')

    if dustmask_flag == 'Y':
        log(loc, date_time() + 'Filter dustmask flag given\n')
        if filter_indel(bedtools, dustmask_bed, sample_pair, loc) != 0:
            sys.stderr.write(date_time() + 'Dustmask failed for ' + sample_pair + '\n')
            exit(1)
        log(loc, date_time() + 'Dustmask complete for ' + sample_pair + '\n')

    sys.stderr.write(date_time() + 'Indel call completed\n')
    return 0
예제 #28
0
def parseCUTADAPT(CUTADAPT, loc):
    """Extract summary statistics from fixed positions of a cutadapt log.

    Parsing starts at the first line containing 'Total read'. That line and the four
    after it are passed through process_parens; then the total base count, the two
    per-read trimmed percentages (relative to that total) and a final process_parens
    value are read from the positions that follow.

    :param CUTADAPT: path to the cutadapt log to parse.
    :param loc: log file location used when parsing fails.
    :return: list of nine strings/values in file order.

    On any failure to open or parse the file (including a missing 'Total read' line),
    the problem is logged and the program exits with status 1.
    """
    try:
        stats = []
        # the context manager closes the handle even when parsing fails
        with open(CUTADAPT, 'r') as fh:
            # advance to the start of the summary section
            cur = next(fh)
            while not re.search('Total read', cur):
                cur = next(fh)
            # total read pairs, then r1a pct, r2a pct, too short, pairs passing
            stats.append(process_parens(cur))
            for _ in range(4):
                stats.append(process_parens(next(fh)))
            next(fh)
            # total bp
            info = next(fh).split()
            tot_bp = int(info[-2].replace(',', ''))
            stats.append(str(tot_bp))
            next(fh)
            next(fh)
            next(fh)
            # trimmed bases per read, expressed as a percentage of total bp (r1 then r2)
            for _ in range(2):
                info = next(fh).split()
                trimmed_pct = round(float(info[-2].replace(',', '')) / tot_bp * 100, 2)
                stats.append(str(trimmed_pct) + '%')
            # total written
            stats.append(process_parens(next(fh)))
        return stats
    except Exception:
        log(loc, date_time() + 'Unable to open/process file ' + CUTADAPT + '\n')
        exit(1)
예제 #29
0
def picard_sort_pe(java_tool, picard_tool, picard_tmp, sample, log_dir):
    """Run Picard SortSam on <sample>.bam, writing <sample>.srt.bam.

    Args:
        java_tool: java executable used to launch Picard.
        picard_tool: path to the Picard jar.
        picard_tmp: value passed as TMP_DIR.
        sample: BAM file prefix (also used in the log file name).
        log_dir: prefix prepended to the log file name.

    Returns:
        0 on success. On failure the error is logged and the process exits
        with status 1.
    """
    log_file = log_dir + sample + ".picard.sort.pe.log"
    picard_sort_pe_cmd = java_tool + " -Xmx8g -jar " + picard_tool + " SortSam CREATE_INDEX=true TMP_DIR=" \
        + picard_tmp + " INPUT=" + sample + ".bam OUTPUT=" + sample + ".srt.bam SORT_ORDER=" \
        "coordinate VALIDATION_STRINGENCY=LENIENT > " + log_file + " 2>&1"
    log(log_file, date_time() + picard_sort_pe_cmd + "\n")
    try:
        # Output is redirected by the shell, so the captured value is unused;
        # check_output is kept only for its non-zero-exit exception.
        subprocess.check_output(picard_sort_pe_cmd, shell=True)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        log(log_file, 'Picard sort failed for sample ' + sample + '.  Check for borg!\n')
        exit(1)
    return 0
예제 #30
0
def platypus_germline(config_file, sample, log_dir, cflag):
    """Call germline variants for one sample with Platypus.

    Args:
        config_file: passed to parse_config() together with cflag.
        sample: sample name; used for the BAM path, output VCF and log name.
        log_dir: prefix prepended to the log file name.
        cflag: 'y' runs Platypus without region/VAF options; any other value
            adds --filterDuplicates=0, --minVarFreq and --regions, and
            indexes the BAM first when no index file is found.

    Returns:
        The exit status of the Platypus shell command (0 on success), or 1
        if the command could not be launched at all.
    """
    loc = log_dir + sample + ".platypus.log"
    if cflag == 'y':
        (platypus, fasta, threads, project_dir, project, align) = parse_config(config_file, cflag)
        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " -o " + sample + ".germline_calls.vcf --logFileName=" \
                       + loc + " >> " + loc + " 2>&1"
    else:
        (platypus, fasta, threads, region_file, minVAF, samtools, project_dir, project, align) \
            = parse_config(config_file, cflag)

        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        # accept either <name>.bam.bai or <name>.bai as an existing index
        if not (os.path.isfile(bam + '.bai') or os.path.isfile(bam[:-1] + 'i')):
            log(loc, date_time() + bam + ' not indexed.  Indexing\n')
            cmd = samtools + ' index ' + bam
            log(loc, date_time() + cmd + '\n')
            subprocess.call(cmd, shell=True)
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " --filterDuplicates=0 -o " + sample \
                       + ".germline_calls.vcf --minVarFreq=" + minVAF + " --regions=" + region_file \
                       + " --logFileName=" + loc + " >> " + loc + " 2>&1"
    log(loc, date_time() + platypus_cmd + "\n")
    try:
        status = subprocess.call(platypus_cmd, shell=True)
    except Exception:
        log(loc, 'platypus germline variant calling failed for sample ' + sample + '\n')
        return 1
    if status != 0:
        # Previously the exit status was dropped and 0 was always returned,
        # so callers could never see a failed run.
        log(loc, 'platypus germline variant calling failed for sample ' + sample + '\n')
    return status
예제 #31
0
def picard_insert_size(java_tool, picard_tool, sample, log_dir):
    """Run Picard CollectInsertSizeMetrics and read two values back.

    Args:
        java_tool: java executable used to launch Picard.
        picard_tool: path to the Picard jar.
        sample: prefix of the <sample>.srt.bam input and of the output files.
        log_dir: prefix prepended to the log file name.

    Returns:
        A 2-tuple of strings: tab-separated columns 4 and 5 (0-based) of the
        eighth line of <sample>.insert_metrics.hist.
        NOTE(review): presumably the insert size mean and spread - confirm
        against the Picard metrics header.
    """
    picard_insert_size_cmd = java_tool + " -Xmx2g -jar " + picard_tool + " CollectInsertSizeMetrics I=" + sample \
                             + ".srt.bam H=" + sample + ".insert_metrics.pdf O=" + sample + ".insert_metrics.hist  > " \
                             + log_dir + sample + ".picard.insert_size.log 2>&1"
    log(log_dir + sample + ".picard.insert_size.log", date_time() + picard_insert_size_cmd + "\n")
    call(picard_insert_size_cmd, shell=True)
    # Context manager closes the file even if it has fewer than eight lines.
    with open(sample + ".insert_metrics.hist", 'r') as fh:
        # skip the seven lines that precede the metrics row
        for _ in range(7):
            next(fh)
        stats = next(fh)
    stat = stats.split('\t')

    return stat[4], stat[5]
예제 #32
0
def picard_mark_dups(config_file, sample, log_dir, suffix):
    """Run Picard MarkDuplicates on <sample><suffix>.

    Creates a scratch directory 'picard_tmp', writes <sample>.dup_marked.bam
    and <sample>.output.metrics, then removes the scratch directory.

    Returns:
        0 if the shell command exited with status 0, otherwise 1.
    """
    root = os.path.basename(sample)
    loc = log_dir + root + ".picard.mark_dup.log"
    (java_tool, picard_tool, mem) = parse_config(config_file)
    picard_tmp = 'picard_tmp'
    # assembled piecewise; the pieces concatenate to one shell command line
    pieces = [
        'mkdir ', picard_tmp, ';', java_tool, " -Djava.io.tmpdir=", picard_tmp, " -Xmx", mem, "g -jar ",
        picard_tool, " MarkDuplicates I=", sample, suffix, " O=", sample,
        ".dup_marked.bam  CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=", sample,
        ".output.metrics > ", loc, " 2>&1; rm -rf ", picard_tmp,
    ]
    picard_mark_dups_cmd = ''.join(pieces)
    log(loc, date_time() + picard_mark_dups_cmd + "\n")
    status = call(picard_mark_dups_cmd, shell=True)
    return 0 if status == 0 else 1
예제 #33
0
def picard_mark_dups(config_file, sample, log_dir, suffix):
    """Mark duplicates in <sample><suffix> with Picard MarkDuplicates.

    The command makes a 'picard_tmp' scratch directory, produces
    <sample>.dup_marked.bam plus <sample>.output.metrics, and deletes the
    scratch directory afterwards.

    Returns:
        0 when the shell command exits with status 0, 1 for anything else.
    """
    root = os.path.basename(sample)
    loc = log_dir + root + ".picard.mark_dup.log"
    (java_tool, picard_tool, mem) = parse_config(config_file)
    picard_tmp = 'picard_tmp'
    # build the command in stages: scratch dir, JVM launch, Picard arguments
    picard_mark_dups_cmd = 'mkdir ' + picard_tmp + ';'
    picard_mark_dups_cmd += java_tool + " -Djava.io.tmpdir=" + picard_tmp + " -Xmx" + mem + "g -jar " + picard_tool
    picard_mark_dups_cmd += " MarkDuplicates I=" + sample + suffix + " O=" + sample
    picard_mark_dups_cmd += ".dup_marked.bam  CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=" + sample
    picard_mark_dups_cmd += ".output.metrics > " + loc + " 2>&1; rm -rf " + picard_tmp
    log(loc, date_time() + picard_mark_dups_cmd + "\n")
    if call(picard_mark_dups_cmd, shell=True) == 0:
        return 0
    return 1
예제 #34
0
def fastqc(fastqc_tool, sample, end1, end2, t):
    """Run FastQC on a pair of read files, writing results under QC/.

    Logging goes to LOGS/ when that directory exists, otherwise to the
    current directory.

    Returns:
        0 on success. A non-zero FastQC exit status is logged and the
        process exits with status 1.
    """
    log_dir = 'LOGS/' if os.path.isdir('LOGS') else './'
    loc = log_dir + sample + '.fastqc.log'
    fastqc_cmd = ''.join([fastqc_tool, ' --extract -t ', t, ' -o QC/ ', end1, ' ', end2, ' 2>> ', loc])
    log(loc, date_time() + fastqc_cmd + "\n")
    status = call(fastqc_cmd, shell=True)
    if status == 0:
        return 0
    log(loc, date_time() + 'FastQC Failed for sample ' + sample + '\n')
    exit(1)
    return 0
예제 #35
0
def fastqc(fastqc_tool, sample, end1, end2, t):
    """Launch FastQC for two read files and stop the run if it fails.

    The log file lives in LOGS/ if that directory is present, else in './'.
    FastQC stderr is appended to the same log file.

    Returns:
        0 when FastQC exits cleanly; otherwise the failure is logged and the
        process exits with status 1.
    """
    log_dir = './'
    if os.path.isdir('LOGS'):
        log_dir = 'LOGS/'
    loc = log_dir + sample + '.fastqc.log'
    tool_part = fastqc_tool + ' --extract -t ' + t + ' -o QC/ '
    input_part = end1 + ' ' + end2
    fastqc_cmd = tool_part + input_part + ' 2>> ' + loc
    log(loc, date_time() + fastqc_cmd + "\n")
    failed = call(fastqc_cmd, shell=True) != 0
    if failed:
        log(loc, date_time() + 'FastQC Failed for sample ' + sample + '\n')
        exit(1)
    return 0
예제 #36
0
def parsePICARD(PICARD, loc):
    """Read one header/value row pair from a Picard metrics file.

    After skip_lines(fh, 6), the next line supplies tab-separated keys and
    the line after it supplies the tab-separated values.

    Args:
        PICARD: path of the metrics file.
        loc: log file path handed to log() on failure.

    Returns:
        dict mapping each key to the value in the same column.

    Any failure (missing file, short file, fewer values than keys) is logged
    and terminates the process with exit status 1.
    """
    try:
        # Context manager closes the handle even when parsing fails.
        with open(PICARD, 'r') as fh:
            skip_lines(fh, 6)
            keys = next(fh).rstrip('\n').split('\t')
            vals = next(fh).rstrip('\n').split('\t')
        # Index lookup (not zip) so a short value row is still an error.
        return {key: vals[i] for i, key in enumerate(keys)}
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        log(loc, date_time() + 'Unable to open/process file ' + PICARD + '\n')
        exit(1)
예제 #37
0
def filter_wrap(mmu_filter, star_tool, genome_ref, end1, end2, sample, log_dir,
                threads, novosort, mem):
    """Align reads with STAR, name-sort, filter, and replace the fastq files.

    One shell pipeline runs STAR, pipes its unsorted BAM through novosort
    (name sort), tees the result to <sample>.mmu.nsrt.bam and feeds it to the
    mmu_filter script, whose stdout and stderr are gzipped into
    <sample>_1.filtered.fq.gz and <sample>_2.filtered.fq.gz. Those files are
    then moved over end1 and end2.

    Args:
        mmu_filter: path of the python filter script.
        star_tool: STAR executable.
        genome_ref: value for --genomeDir.
        end1, end2: fastq files to align; overwritten with filtered reads.
        sample: sample name; must contain at least five '_'-separated fields
            because fields 0 and 4 go into the read group line.
        log_dir: prefix prepended to the log file name.
        threads: total thread budget, split between STAR and novosort.
        novosort: novosort executable.
        mem: memory budget in GB; novosort gets mem - 40 when mem > 42,
            otherwise 2.

    Returns:
        0 on success; the process exits with status 1 if the pipeline cannot
        be launched or the rename step fails.
    """
    meta = sample.split('_')
    RGRP = "ID:" + sample + "\tLB:" + meta[0] + "\tPU:" + meta[
        4] + "\tSM:" + meta[0] + "\tPL:illumina"
    loc = log_dir + sample + ".mmu.star.pe.log"
    subprocess.call('mkdir TMP', shell=True)
    # split threads for star and novosort as well as memory
    threads = int(threads)
    nmem = 2
    ncpu = 2
    if threads < 10:
        # novosort keeps its default 2; never hand STAR fewer than 1 thread
        # (threads <= 2 used to produce --runThreadN 0 or a negative value)
        sthreads = max(1, threads - 2)
    elif threads == 10:
        sthreads = 6
        ncpu = 4
    else:
        # even: half each; odd: STAR gets the extra thread
        ncpu = threads // 2
        sthreads = threads - ncpu
    mem = int(mem)
    if mem > 42:
        nmem = mem - 40
    star_cmd = "(" + star_tool + " --runMode alignReads --outSAMattrRGline " + RGRP + " --outFileNamePrefix " \
            + sample + ".mmu_filt. --runThreadN " + str(sthreads) + " --genomeDir " + genome_ref\
            + " --readFilesIn " + end1 + " " + end2 + " --readFilesCommand zcat --outSAMtype BAM Unsorted --outStd " \
            "BAM_Unsorted --outFilterType BySJout --outFilterMultimapNmax 20 --alignSJoverhangMin 8 " \
            "--alignSJDBoverhangMin 1 --outFilterMismatchNmax 0" + " --alignIntronMin 20 --alignIntronMax 1000000 " \
            "--alignMatesGapMax 1000000 --outSAMunmapped Within 2>> " + loc + "  | " + novosort + " - -n -c " \
            + str(ncpu) + " -m " + str(nmem) + "G -t TMP 2>> " + loc + " | tee " + sample + ".mmu.nsrt.bam | python " \
            + mmu_filter + " -s " + sample + " -n 0 -t RNA | gzip -4 -c - > " + sample \
            + "_1.filtered.fq.gz;) 2>&1 | gzip -4 -c - > " + sample + "_2.filtered.fq.gz"

    log(loc, date_time() + star_cmd + '\n')
    try:
        # NOTE: the pipeline's exit status is not inspected here; only a
        # failure to launch the shell reaches the except branch.
        subprocess.call(star_cmd, shell=True)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        log(
            loc,
            date_time() +
            'Star alignment and filter against against mouse genome failed\n')
        exit(1)
    log(loc, date_time() + 'Filtering completed, replacing fastq file\n')
    rn_fq = 'mv ' + sample + '_1.filtered.fq.gz ' + end1 + '; mv ' + sample + '_2.filtered.fq.gz ' + end2 \
            + ';rm -rf TMP'
    check = subprocess.call(rn_fq, shell=True)
    if check != 0:
        log(loc, date_time() + 'File rename failed\n' + rn_fq + '\n')
        exit(1)
    return 0
예제 #38
0
def filter_wrap(mmu_filter, bwa_tool, RGRP, bwa_ref, end1, end2, samtools_tool, samtools_ref, sample, log_dir, threads):
    """Align reads with bwa mem, filter them, and replace the fastq files.

    One shell pipeline runs bwa mem, converts to BAM with samtools view,
    tees the BAM to <sample>.mmu.bam and feeds it to the mmu_filter script,
    whose stdout and stderr are gzipped into <sample>_1.filtered.fq.gz and
    <sample>_2.filtered.fq.gz. Those files are then moved over end1/end2.

    Args:
        mmu_filter: path of the python filter script.
        bwa_tool: bwa executable.
        RGRP: read group string passed to bwa -R.
        bwa_ref: reference passed to bwa mem.
        end1, end2: fastq files to align; overwritten with filtered reads.
        samtools_tool: samtools executable.
        samtools_ref: reference passed to samtools view -T.
        sample: sample name used for intermediate and log file names.
        log_dir: prefix prepended to the log file name.
        threads: thread count as a string (concatenated into the command).

    Returns:
        0 on success; the process exits with status 1 if the pipeline or the
        rename step fails.
    """
    loc = log_dir + sample + ".mmu.bwa.pe.log"
    bwa_cmd = "(" + bwa_tool + " mem -O 60 -L 0 -E 10 -t " + threads + " -R \"" + RGRP + "\" -v 2 " + bwa_ref + " "\
              + end1 + " " + end2 + " 2>> " + loc + " | " + samtools_tool + " view -bT " \
              + samtools_ref + " - 2>> " + loc + " | tee " + sample + ".mmu.bam | python " \
              + mmu_filter + " -s " + sample + " -n 0 -t DNA | gzip -4 -c - > " \
              + sample + "_1.filtered.fq.gz;) 2>&1 | gzip -4 -c - > " + sample + "_2.filtered.fq.gz"

    log(loc, date_time() + bwa_cmd + "\n")
    try:
        # stdout is redirected inside the command; check_output is used only
        # for its non-zero-exit exception.
        subprocess.check_output(bwa_cmd, shell=True)
        log(loc, date_time() + 'Filtering completed, replacing fastq file\n')
        rn_fq = 'mv ' + sample + '_1.filtered.fq.gz ' + end1 + '; mv ' + sample + '_2.filtered.fq.gz ' + end2
        # A failed rename used to go unnoticed, leaving the unfiltered fastq
        # in place. The status reflects the last mv in the ';' chain.
        if subprocess.call(rn_fq, shell=True) != 0:
            log(loc, date_time() + 'File rename failed\n' + rn_fq + '\n')
            exit(1)
    except Exception:
        # Exception (not a bare except) so the exit(1) above and Ctrl-C
        # propagate instead of being reported as a filtering failure.
        sys.stderr.write('Filtering failed.\n')
        exit(1)
    return 0
예제 #39
0
def annot_vcf_vep_pipe(config_file, sample_pairs, ref_mnt, in_suffix, out_suffix, source):
    """Annotate each listed sample's VCF with VEP, then build its report.

    Args:
        config_file: passed to parse_config().
        sample_pairs: tab-separated file; the first column of each line is
            the sample name.
        ref_mnt: directory prefix prepended to the fasta, VEP cache and
            interval paths from the config.
        in_suffix: appended to the sample name to form the input VCF name.
        out_suffix: appended to the sample name to form the output VCF name.
        source: 'scalpel' runs pass_filter() first and annotates
            <sample>.somatic_indel.PASS.vcf; 'mutect' selects
            gen_snv_report(); anything else selects gen_indel_report().

    Returns:
        0 when every sample succeeds. A failed annotation or report is
        logged and the process exits with status 1.
    """
    (vep_tool, vep_cache, fasta, report, dbsnp, vcache, threads, intvl, dustmask_flag) = parse_config(config_file)
    fasta = ref_mnt + '/' + fasta
    vep_cache = ref_mnt + '/' + vep_cache
    intvl = ref_mnt + '/' + intvl
    # scale back on the forking a bit
    if int(threads) > 2:
        # floor division: plain '/' yields a float on Python 3 and would
        # render as e.g. '--fork 1.0'
        threads = str(int(threads) // 2 - 1)
    # the with-block closes the sample file, which was previously left open
    with open(sample_pairs, 'r') as samp_fh:
        for line in samp_fh:
            info = line.rstrip('\n').split('\t')
            sample = info[0]
            mk_log_dir = 'mkdir LOGS'
            subprocess.call(mk_log_dir, shell=True)
            loc = 'LOGS/' + sample + '.vep_anno.log'
            in_vcf = sample + in_suffix
            out_vcf = sample + out_suffix
            if source == 'scalpel':
                pass_filter(sample, in_suffix, dustmask_flag)
                in_vcf = sample + '.somatic_indel.PASS.vcf'
            run_vep = 'perl ' + vep_tool + ' --cache -i ' + in_vcf + ' --vcf -o ' + out_vcf \
                    + ' --symbol --vcf_info_field' \
                    ' ANN --canonical --variant_class --no_whole_genome --offline --maf_exac --no_whole_genome ' \
                    '--fork ' + threads + ' --fasta ' + fasta + ' --dir_cache ' + vep_cache + ' --cache_version ' \
                    + vcache + ' 2>> ' + loc + ' >> ' + loc
            log(loc, date_time() + 'Annotating sample ' + sample + in_suffix + '\n')
            check = subprocess.call(run_vep, shell=True)
            if check != 0:
                log(loc, date_time() + 'VEP annotation for ' + sample + in_suffix + ' failed\n')
                exit(1)
            else:
                log(loc, date_time() + 'VEP annotation ' + sample + in_suffix + ' successful!\n')
            if source == 'mutect':
                check = gen_snv_report(out_vcf, sample + '.out.keep', intvl)
            else:
                check = gen_indel_report(out_vcf)
            if check != 0:
                log(loc, date_time() + 'Report generation for ' + out_vcf + ' failed\n')
                exit(1)
    return 0
예제 #40
0
def filter_wrap(mmu_filter, bwa_tool, RGRP, bwa_ref, end1, end2, samtools_tool,
                samtools_ref, sample, log_dir, threads):
    """Run the bwa mem -> samtools -> mmu_filter pipeline and swap in the output.

    The shell pipeline aligns end1/end2 with bwa mem, converts the stream to
    BAM with samtools view, tees it to <sample>.mmu.bam and pipes it to the
    mmu_filter script. The script's stdout is gzipped to
    <sample>_1.filtered.fq.gz and its stderr to <sample>_2.filtered.fq.gz;
    both are then moved over end1 and end2.

    Args:
        mmu_filter: path of the python filter script.
        bwa_tool: bwa executable.
        RGRP: read group string passed to bwa -R.
        bwa_ref: reference passed to bwa mem.
        end1, end2: fastq inputs, replaced by the filtered reads.
        samtools_tool: samtools executable.
        samtools_ref: reference passed to samtools view -T.
        sample: sample name used for intermediate and log file names.
        log_dir: prefix prepended to the log file name.
        threads: thread count as a string (concatenated into the command).

    Returns:
        0 on success; exits the process with status 1 when the pipeline or
        the rename step fails.
    """
    loc = log_dir + sample + ".mmu.bwa.pe.log"
    bwa_cmd = "(" + bwa_tool + " mem -O 60 -L 0 -E 10 -t " + threads + " -R \"" + RGRP + "\" -v 2 " + bwa_ref + " "\
              + end1 + " " + end2 + " 2>> " + loc + " | " + samtools_tool + " view -bT " \
              + samtools_ref + " - 2>> " + loc + " | tee " + sample + ".mmu.bam | python " \
              + mmu_filter + " -s " + sample + " -n 0 -t DNA | gzip -4 -c - > " \
              + sample + "_1.filtered.fq.gz;) 2>&1 | gzip -4 -c - > " + sample + "_2.filtered.fq.gz"

    log(loc, date_time() + bwa_cmd + "\n")
    try:
        # The command redirects its own stdout, so nothing useful is
        # captured; check_output is here for its non-zero-exit exception.
        subprocess.check_output(bwa_cmd, shell=True)
        log(loc, date_time() + 'Filtering completed, replacing fastq file\n')
        rn_fq = 'mv ' + sample + '_1.filtered.fq.gz ' + end1 + '; mv ' + sample + '_2.filtered.fq.gz ' + end2
        rename_status = subprocess.call(rn_fq, shell=True)
        if rename_status != 0:
            # Previously ignored: a failed mv left the unfiltered fastq in
            # place. The status is that of the last mv in the ';' chain.
            log(loc, date_time() + 'File rename failed\n' + rn_fq + '\n')
            exit(1)
    except Exception:
        # Exception rather than a bare except, so the exit(1) above and
        # Ctrl-C are not misreported as a filtering failure.
        sys.stderr.write('Filtering failed.\n')
        exit(1)
    return 0
예제 #41
0
def scalpel_indel(pairs, log_dir, config_file, ref_mnt):
    """Run somatic indel calling with scalpel for every tumor/normal pair.

    Args:
        pairs: tab-separated file; each line holds the pair name, the tumor
            sample and the normal sample.
        log_dir: prefix prepended to each pair's log file name.
        config_file: passed to parse_config() and get_merged_bams().
        ref_mnt: directory prefix prepended to the bed, fasta and dustmask
            bed paths from the config.

    Returns:
        0 after all pairs are processed. A failed dustmask filter exits the
        process with status 1.
    """
    (scalpel, bedtools, bed, fasta, cpus, dustmask_flag, dustmask_bed) = parse_config(config_file)
    bed = ref_mnt + '/' + bed
    fasta = ref_mnt + '/' + fasta
    dustmask_bed = ref_mnt + '/' + dustmask_bed
    # use get_merged_bams api
    sample_list = 'sample_list.txt'
    if not os.path.isfile(sample_list):
        create_sample_list(pairs)
        # trailing newline added so this notice no longer runs into the next
        # message written to stderr
        sys.stderr.write(date_time() + 'Sample pairs list not created - creating one since this is being run likely '
                                       'outside of pipeline\n')
        get_merged_bams(config_file, sample_list)
    # the with-block closes the pairs file even when exit(1) is hit below
    with open(pairs, 'r') as fh:
        for line in fh:
            cur = line.rstrip('\n').split('\t')
            loc = log_dir + cur[0] + '.scalpel.log'
            tumor_bam = cur[1] + '.merged.final.bam'
            normal_bam = cur[2] + '.merged.final.bam'
            scalpel_cmd = scalpel + ' --somatic --logs --numprocs ' + cpus + ' --tumor ' + tumor_bam + ' --normal ' \
                          + normal_bam + ' --bed ' + bed + ' --ref ' + fasta + ' 2>> ' + loc
            sys.stderr.write(date_time() + 'Starting indel calls for ' + cur[0] + '\n')
            log(loc, date_time() + 'Starting indel calls for ' + cur[0] + ' with command:\n' + scalpel_cmd + '\n')
            check = call(scalpel_cmd, shell=True)
            if check != 0:
                # reported but not fatal: processing continues with whatever
                # output scalpel left behind
                sys.stderr.write(date_time() + 'Indel calling failed for pair ' + cur[0] + ' with command:\n' +
                                 scalpel_cmd + '\n')
            log(loc, date_time() + 'Indel calling complete for pair ' + cur[0] + ' moving output files\n')
            mv_cmd = 'mkdir ' + cur[0] + '; mv outdir/main/* ' + cur[0] + '; rm -rf outdir/main;'
            log(loc, date_time() + mv_cmd + '\n')
            call(mv_cmd, shell=True)
            sys.stderr.write(date_time() + 'Completed indel calls for ' + cur[0] + '\n')
            if dustmask_flag == 'Y':
                log(loc, date_time() + 'Filter dustmask flag given\n')
                check = filter_indel(bedtools, dustmask_bed, cur[0])
                if check != 0:
                    sys.stderr.write(date_time() + 'Dustmask failed for ' + cur[0] + '\n')
                    exit(1)
                else:
                    log(loc, date_time() + 'Dustmask complete for ' + cur[0] + '\n')
    sys.stderr.write(date_time() + 'Indel call completed\n')
    return 0
예제 #42
0
def express_quant(sample, config_file, x, s):
    """Quantify a sample's transcriptome BAM with eXpress and rename outputs.

    Reads BAMS/<sample>.merged.transcriptome.bam, then renames results.xprs
    and params.xprs to sample-specific names. x and s are passed to the
    -m and -s options respectively.

    Returns:
        Sum of the exit statuses of the eXpress command and the rename
        command (0 only if both succeeded).
    """
    loc = sample + '.express.log'
    if os.path.isdir('LOGS'):
        loc = 'LOGS/' + loc
    (stranded, strand, express, transcriptome) = parse_config(config_file)
    bam = 'BAMS/' + sample + '.merged.transcriptome.bam'
    # common head of the command; the strand flag is only added when stranded
    express_cmd = express + ' ' + transcriptome + ' ' + bam + ' --no-update-check'
    if stranded != 'N':
        express_cmd = express_cmd + ' --' + strand
    express_cmd = express_cmd + ' -m ' + x + ' -s ' + s + ' --logtostderr 2>> ' + loc
    log(loc, date_time() + express_cmd + '\n')
    quant_status = subprocess.call(express_cmd, shell=True)

    rename_express_out = ''.join(['mv results.xprs ', sample, '.express_quantification.txt; mv params.xprs ',
                                  sample, '.params.xprs'])
    rename_status = subprocess.call(rename_express_out, shell=True)
    log(loc, date_time() + 'Completed qc.  Renaming files\n')
    return quant_status + rename_status
예제 #43
0
def qc_bam(sample, config_file):
    """Run Picard CollectRnaSeqMetrics on <sample>.Aligned.sortedByCoord.out.bam.

    The JVM heap is set to 75% (rounded) of the configured ram value. The
    configured strand setting must be one of 'N', 'fr-stranded' or
    'rf-stranded'. The command's exit status is not checked.

    Returns:
        Always 0.
    """
    log_name = sample + '.bam_qc.log'
    loc = 'LOGS/' + log_name if os.path.isdir('LOGS') else log_name
    (java, ram, picard, refFlat, intervals, strand, threads) = parse_config(config_file)
    # recalc ram to be a bit lower
    reduced = int(round(int(ram) * 0.75))
    ram = str(reduced)

    # config strand value -> Picard STRAND argument
    st_dict = {
        'N': 'NONE',
        'fr-stranded': 'FIRST_READ_TRANSCRIPTION_STRAND',
        'rf-stranded': 'SECOND_READ_TRANSCRIPTION_STRAND',
    }

    jvm_part = java + ' -Xmx' + ram + 'g -XX:+UseConcMarkSweepGC -XX:ParallelGCThreads=' + threads \
        + ' -XX:MaxGCPauseMillis=10000 -jar ' + picard
    metrics_part = ' CollectRnaSeqMetrics REF_FLAT=' + refFlat + ' STRAND=' + st_dict[strand] \
        + ' CHART=' + sample + '.pos_v_cov.pdf I=' + sample + '.Aligned.sortedByCoord.out.bam O=' + sample \
        + '.picard_RNAseq_qc.txt RIBOSOMAL_INTERVALS=' + intervals
    picard_cmd = jvm_part + metrics_part + ' VALIDATION_STRINGENCY=SILENT 2>> ' + loc + ' >> ' + loc
    log(loc, date_time() + picard_cmd + '\n')
    subprocess.call(picard_cmd, shell=True)
    return 0
예제 #44
0
def express_quant(sample, config_file, x, s):
    """Run eXpress on BAMS/<sample>.merged.transcriptome.bam.

    x is passed as -m and s as -s. When the configured 'stranded' value is
    not 'N', the configured strand name is added as a '--<strand>' flag.
    Afterwards results.xprs and params.xprs are renamed with the sample name.

    Returns:
        The eXpress exit status plus the rename exit status.
    """
    loc = sample + '.express.log'
    if os.path.isdir('LOGS'):
        loc = 'LOGS/' + loc
    (stranded, strand, express, transcriptome) = parse_config(config_file)
    bam = 'BAMS/' + sample + '.merged.transcriptome.bam'
    head = express + ' ' + transcriptome + ' ' + bam + ' --no-update-check'
    tail = ' -m ' + x + ' -s ' + s + ' --logtostderr 2>> ' + loc
    if stranded == 'N':
        express_cmd = head + tail
    else:
        express_cmd = head + ' --' + strand + tail
    log(loc, date_time() + express_cmd + '\n')
    check = subprocess.call(express_cmd, shell=True)

    rename_express_out = 'mv results.xprs ' + sample + '.express_quantification.txt'
    rename_express_out += '; mv params.xprs ' + sample + '.params.xprs'
    check = check + subprocess.call(rename_express_out, shell=True)
    log(loc, date_time() + 'Completed qc.  Renaming files\n')
    return check
예제 #45
0
def novosort_sort_pe(novosort, sample, log_dir, t, mem, stype):
    """Sort <sample>.bam with novosort, by coordinate or by read name.

    Args:
        novosort: novosort executable.
        sample: BAM path prefix; its basename is used for the log file name.
        log_dir: prefix prepended to the log file name.
        t: thread count as a string.
        mem: RAM in GB as a string.
        stype: 'name' writes <sample>.nsrt.bam sorted with -n; any other
            value writes <sample>.srt.bam with --index.

    Returns:
        The exit status of the novosort shell command. If the command cannot
        be launched, the error is logged and the process exits with status 1.
    """
    samp_root = os.path.basename(sample)
    temp = 'novosort_tmp'
    log_file = log_dir + samp_root + ".novosort.sort.pe.log"
    # only the output name and sort flag differ between the two modes
    if stype == 'name':
        out_opts = ".nsrt.bam -n  "
    else:
        out_opts = ".srt.bam --index  "
    novosort_sort_pe_cmd = 'mkdir ' + temp + ';' + novosort + " --threads " + t + " --ram " + mem \
                           + "G --tmpdir  " + temp + " --output " + sample + out_opts + sample + ".bam > " \
                           + log_file + " 2>&1"
    log(log_file, date_time() + novosort_sort_pe_cmd + "\n")
    f = 0
    try:
        f = subprocess.call(novosort_sort_pe_cmd, shell=True)
        rm_tmp = 'rm -rf novosort_tmp'
        subprocess.call(rm_tmp, shell=True)
    except Exception:
        # Failure goes to the same log file as the command. It used to be
        # written to log_dir + sample, which is a different (and possibly
        # nonexistent) path whenever sample contains a directory.
        log(log_file, 'novosort sort failed for sample ' + sample + '\n')
        exit(1)
    return f
예제 #46
0
def filter_wrap(mmu_filter, star_tool, genome_ref, end1, end2, sample, log_dir, threads, novosort, mem):
    """Run the STAR -> novosort -> mmu_filter pipeline and swap in its output.

    STAR aligns end1/end2 and streams unsorted BAM to novosort (name sort).
    The sorted stream is teed to <sample>.mmu.nsrt.bam and piped to the
    mmu_filter script; the script's stdout is gzipped to
    <sample>_1.filtered.fq.gz and its stderr to <sample>_2.filtered.fq.gz.
    Both files are then moved over end1 and end2 and TMP is removed.

    Args:
        mmu_filter: path of the python filter script.
        star_tool: STAR executable.
        genome_ref: value for --genomeDir.
        end1, end2: fastq inputs, replaced by the filtered reads.
        sample: sample name; fields 0 and 4 of its '_'-split feed the read
            group line, so it needs at least five fields.
        log_dir: prefix prepended to the log file name.
        threads: total thread budget shared by STAR and novosort.
        novosort: novosort executable.
        mem: memory budget in GB; novosort gets mem - 40 if mem > 42, else 2.

    Returns:
        0 on success; exits the process with status 1 if the pipeline cannot
        be launched or the rename step fails.
    """
    meta = sample.split('_')
    RGRP = "ID:" + sample + "\tLB:" + meta[0] + "\tPU:" + meta[4] + "\tSM:" + meta[0] + "\tPL:illumina"
    loc = log_dir + sample + ".mmu.star.pe.log"
    mk_srt_tmp = 'mkdir TMP'
    subprocess.call(mk_srt_tmp, shell=True)
    # split threads for star and novosort as well as memory
    nmem = 2
    ncpu = 2
    threads = int(threads)
    if threads > 10:
        # even budget: half each; odd budget: STAR takes the extra thread
        ncpu = threads // 2
        sthreads = threads - ncpu
    elif threads == 10:
        sthreads = 6
        ncpu = 4
    else:
        # Clamp to 1: a budget of 1 or 2 threads used to yield
        # --runThreadN -1 or 0.
        sthreads = max(1, threads - 2)
    mem = int(mem)
    if mem > 42:
        nmem = mem - 40
    star_cmd = "(" + star_tool + " --runMode alignReads --outSAMattrRGline " + RGRP + " --outFileNamePrefix " \
            + sample + ".mmu_filt. --runThreadN " + str(sthreads) + " --genomeDir " + genome_ref\
            + " --readFilesIn " + end1 + " " + end2 + " --readFilesCommand zcat --outSAMtype BAM Unsorted --outStd " \
            "BAM_Unsorted --outFilterType BySJout --outFilterMultimapNmax 20 --alignSJoverhangMin 8 " \
            "--alignSJDBoverhangMin 1 --outFilterMismatchNmax 0" + " --alignIntronMin 20 --alignIntronMax 1000000 " \
            "--alignMatesGapMax 1000000 --outSAMunmapped Within 2>> " + loc + "  | " + novosort + " - -n -c " \
            + str(ncpu) + " -m " + str(nmem) + "G -t TMP 2>> " + loc + " | tee " + sample + ".mmu.nsrt.bam | python " \
            + mmu_filter + " -s " + sample + " -n 0 -t RNA | gzip -4 -c - > " + sample \
            + "_1.filtered.fq.gz;) 2>&1 | gzip -4 -c - > " + sample + "_2.filtered.fq.gz"

    log(loc, date_time() + star_cmd + '\n')
    try:
        # NOTE: the pipeline's exit status is not inspected; only a failure
        # to launch the shell reaches the except branch.
        subprocess.call(star_cmd, shell=True)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        log(loc, date_time() + 'Star alignment and filter against against mouse genome failed\n')
        exit(1)
    log(loc, date_time() + 'Filtering completed, replacing fastq file\n')
    rn_fq = 'mv ' + sample + '_1.filtered.fq.gz ' + end1 + '; mv ' + sample + '_2.filtered.fq.gz ' + end2 \
            + ';rm -rf TMP'
    check = subprocess.call(rn_fq, shell=True)
    if check != 0:
        log(loc, date_time() + 'File rename failed\n' + rn_fq + '\n')
        exit(1)
    return 0
예제 #47
0
파일: star.py 프로젝트: WhiteLab/RNAseq
def star(STAR, genome, end1, end2, sample, log_dir, th, sf):
    """Align a read pair with STAR in two-pass mode.

    The read group line is built from the sample name, whose '_'-split must
    have at least five fields (fields 0 and 4 are used). When sf is 'N' the
    XS attribute set is requested. After the run, files matching *Log* are
    moved into log_dir.

    Returns:
        0 if the shell command exited with status 0, otherwise 1.
    """
    loc = log_dir + sample + ".star.log"
    meta = sample.split('_')
    rg_fields = ["ID:" + sample, "LB:" + meta[0], "PU:" + meta[4], "SM:" + meta[0], "PL:illumina"]
    RGRP = "\t".join(rg_fields)
    # The backslash continuations below sit INSIDE a string literal, so the
    # leading whitespace of each continued line is part of the command text.
    star_cmd = STAR + " --runMode alignReads --twopassMode Basic --outFileNamePrefix " + sample + ". --runThreadN " \
               + th + " --genomeDir " + genome + " --readFilesIn " + end1 + " " + end2 + " --readFilesCommand zcat \
               --quantMode TranscriptomeSAM GeneCounts --outSAMtype BAM SortedByCoordinate --outFilterType BySJout \
               --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterMismatchNmax 8 \
               --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --quantTranscriptomeBan " \
               "Singleend --outSAMattrRGline " + RGRP
    # add XS tag if input is not stranded
    xs_opts = ' --outSAMattributes NH HI AS nM XS' if sf == 'N' else ''
    redirect = ' 2>> ' + loc + ' >> ' + loc + '; mv *Log* ' + log_dir
    star_cmd = star_cmd + xs_opts + redirect

    log(loc, date_time() + star_cmd + "\n")
    status = call(star_cmd, shell=True)
    return 0 if status == 0 else 1
예제 #48
0
def novosort_sort_pe(novosort, sample, log_dir, t, mem, stype):
    """Coordinate- or name-sort <sample>.bam using novosort.

    Args:
        novosort: novosort executable.
        sample: BAM path prefix; only its basename goes into the log name.
        log_dir: prefix prepended to the log file name.
        t: thread count as a string.
        mem: RAM in GB as a string.
        stype: 'name' produces <sample>.nsrt.bam via -n; anything else
            produces <sample>.srt.bam via --index.

    Returns:
        Exit status of the novosort shell command. A failure to launch it is
        logged and ends the process with status 1.
    """
    samp_root = os.path.basename(sample)
    temp = 'novosort_tmp'
    log_file = log_dir + samp_root + ".novosort.sort.pe.log"
    # the two modes share everything except output suffix and sort flag
    out_opts = ".nsrt.bam -n  " if stype == 'name' else ".srt.bam --index  "
    novosort_sort_pe_cmd = 'mkdir ' + temp + ';' + novosort + " --threads " + t + " --ram " + mem \
                           + "G --tmpdir  " + temp + " --output " + sample + out_opts + sample + ".bam > " \
                           + log_file + " 2>&1"
    log(log_file,
        date_time() + novosort_sort_pe_cmd + "\n")
    f = 0
    try:
        f = subprocess.call(novosort_sort_pe_cmd, shell=True)
        rm_tmp = 'rm -rf novosort_tmp'
        subprocess.call(rm_tmp, shell=True)
    except Exception:
        # Log to the same file as the command line. The old code built the
        # path from the full sample path rather than its basename, so the
        # message landed in a different (possibly invalid) location.
        log(log_file,
            'novosort sort failed for sample ' + sample + '\n')
        exit(1)
    return f
예제 #49
0
def star(STAR, genome, end1, end2, sample, log_dir, th, sf):
    """Run a two-pass STAR alignment for one read pair.

    Fields 0 and 4 of the '_'-split sample name are placed in the read group
    line, so the name needs at least five fields. sf == 'N' appends the
    attribute list that includes XS. The command ends by moving *Log* files
    into log_dir.

    Returns:
        0 when the shell command exits with status 0, 1 otherwise.
    """
    loc = log_dir + sample + ".star.log"
    meta = sample.split('_')
    library = meta[0]
    unit = meta[4]
    RGRP = "ID:" + sample + "\tLB:" + library + "\tPU:" + unit + "\tSM:" + library + "\tPL:illumina"
    # NOTE: the continuation backslashes are inside the string literal, so
    # the indentation of the continued lines is embedded in the command.
    star_cmd = STAR + " --runMode alignReads --twopassMode Basic --outFileNamePrefix " + sample + ". --runThreadN " \
               + th + " --genomeDir " + genome + " --readFilesIn " + end1 + " " + end2 + " --readFilesCommand zcat \
               --quantMode TranscriptomeSAM GeneCounts --outSAMtype BAM SortedByCoordinate --outFilterType BySJout \
               --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterMismatchNmax 8 \
               --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --quantTranscriptomeBan " \
               "Singleend --outSAMattrRGline " + RGRP
    if sf == 'N':
        # add XS tag if input is not stranded
        star_cmd = star_cmd + ' --outSAMattributes NH HI AS nM XS'
    star_cmd = star_cmd + ' 2>> ' + loc + ' >> ' + loc + '; mv *Log* ' + log_dir

    log(loc, date_time() + star_cmd + "\n")
    if call(star_cmd, shell=True) == 0:
        return 0
    return 1
예제 #50
0
def gen_report(vcf, out, c, ref_flag):
    """Write a prioritized-impact report for a VEP-annotated VCF.

    The report is written to
    <first dot-field of vcf basename>.subsitutions.vep.prioritized_impact.report.xls
    in the current directory, one record at a time via
    output_highest_impact().

    Args:
        vcf: path of the annotated VCF; its 'ANN' INFO header supplies the
            annotation field order.
        out: passed to create_mutect_ind() to build the mutect lookup.
        c: 'n' for no target filtering; otherwise passed to create_target(),
            and records that mark_target() labels 'OFF' are skipped.
        ref_flag: 'n' to pass through unchanged; otherwise replaced by
            create_index(ref_flag) before being handed on.

    Returns:
        0 on completion.
    """
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.subsitutions.vep.priority_report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    mut_dict = create_mutect_ind(out)
    log(loc, date_time() + 'Created index for added mutect info\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')
    vcf_in = VariantFile(vcf)

    # A separate name for the handle (the 'out' parameter is no longer
    # rebound) and a with-block so the report is closed on any error.
    with open(parts[0] + '.subsitutions.vep.prioritized_impact.report.xls', 'w') as report:
        # annotation field name -> position within each '|'-separated entry
        desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0,
                   'Amino_acids': 0, 'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0}

        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.lstrip('"')
        desc_string = desc_string.rstrip('"')
        desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i
        report.write('chr\tpos\tcontext\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
                     'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tgnomAD_AF\tgene\ttx_id\teffect\timpact\t'
                     'biotype\tcodon_change\tamino_acid_change\ton/off-target\n')
        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)
        for record in vcf_in.fetch():
            (chrom, pos, ref, alt) = record.contig, str(record.pos), record.ref, record.alts[0]
            ann_list = [_.split('|') for _ in record.info['ANN']]
            tflag = 'NA'
            if c != 'n':
                tflag = mark_target(chrom, pos, on_dict)
                # only outputting ON TARGET hits
                if tflag == 'OFF':
                    continue
            output_highest_impact(chrom, pos, ref, alt, ann_list, mut_dict, desired, tflag, report, ref_flag)

    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
예제 #51
0
def platypus_germline(config_file, sample, log_dir, cflag):
    """Run Platypus germline variant calling on a sample's merged BAM.

    Args:
        config_file: passed to parse_config() together with cflag.
        sample: sample name; determines the BAM path, output VCF and log.
        log_dir: prefix prepended to the log file name.
        cflag: 'y' calls variants without region or VAF restrictions; any
            other value adds --filterDuplicates=0, --minVarFreq and
            --regions, indexing the BAM first if no index file exists.

    Returns:
        Exit status of the Platypus shell command (0 on success), or 1 when
        the command could not be launched.
    """
    loc = log_dir + sample + ".platypus.log"
    if cflag == 'y':
        (platypus, fasta, threads, project_dir, project,
         align) = parse_config(config_file, cflag)
        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " -o " + sample + ".germline_calls.vcf --logFileName=" \
                       + loc + " >> " + loc + " 2>&1"
    else:
        (platypus, fasta, threads, region_file, minVAF, samtools, project_dir, project, align) \
            = parse_config(config_file, cflag)

        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        # an index may be named either <name>.bam.bai or <name>.bai
        has_index = os.path.isfile(bam + '.bai') or os.path.isfile(bam[:-1] + 'i')
        if not has_index:
            log(loc, date_time() + bam + ' not indexed.  Indexing\n')
            cmd = samtools + ' index ' + bam
            log(loc, date_time() + cmd + '\n')
            subprocess.call(cmd, shell=True)
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " --filterDuplicates=0 -o " + sample \
                       + ".germline_calls.vcf --minVarFreq=" + minVAF + " --regions=" + region_file \
                       + " --logFileName=" + loc + " >> " + loc + " 2>&1"
    log(loc, date_time() + platypus_cmd + "\n")
    failure_msg = 'platypus germline variant calling failed for sample ' + sample + '\n'
    try:
        f = subprocess.call(platypus_cmd, shell=True)
    except Exception:
        log(loc, failure_msg)
        return 1
    if f != 0:
        # The exit status used to be thrown away (the function returned 0
        # regardless), hiding failed runs from callers.
        log(loc, failure_msg)
    return f
예제 #52
0
def novosort_sort_pe(novosort, sample, log_dir, threads, ram, rmdup):
    """Sort ``<sample>.bam`` with novosort, writing an indexed BAM.

    The shell command creates a scratch directory ``novosort_tmp`` in the
    current working directory, which is removed again after the sort.

    :param novosort: path/command of the novosort executable.
    :param sample: sample identifier; input is ``<sample>.bam``.
    :param log_dir: directory prefix for the log file (concatenated as-is).
    :param threads: thread count, as a string.
    :param ram: memory limit in gigabytes, as a string (``G`` is appended).
    :param rmdup: ``'Y'`` adds ``--rd --kt`` and writes
        ``<sample>.rmdup.srt.bam``; any other value writes
        ``<sample>.srt.bam``.
    :return: exit status of the novosort shell command.
    :raises SystemExit: with code 1 if a command could not be launched.
    """
    # Output is appended (>>) rather than truncated (>): log() is handed this
    # same path just before the command runs, and a plain '>' would wipe the
    # recorded command line as soon as the shell opened the file.
    if rmdup == 'Y':
        logfile = sample + ".novosort.rmdup.sort.pe.log"
        novosort_sort_cmd = 'mkdir novosort_tmp;' + novosort + " -c " + threads + " -m " + ram \
                            + "G --tmpdir novosort_tmp --rd --kt -o " + sample + ".rmdup.srt.bam --index  " \
                            + sample + ".bam >> " + log_dir + logfile + " 2>&1"
    else:
        logfile = sample + ".novosort.sort.pe.log"
        novosort_sort_cmd = 'mkdir novosort_tmp;' + novosort + " --threads " + threads + " --ram " \
                            + ram + "G --tmpdir novosort_tmp -o " + sample + ".srt.bam --index  " \
                            + sample + ".bam >> " + log_dir + logfile + " 2>&1"
    log_path = log_dir + logfile
    log(log_path, date_time() + novosort_sort_cmd + "\n")
    try:
        status = subprocess.call(novosort_sort_cmd, shell=True)
        subprocess.call('rm -rf novosort_tmp', shell=True)
    except (OSError, ValueError):
        # Only launch failures are handled here; KeyboardInterrupt and
        # SystemExit are deliberately left to propagate untouched.
        log(log_path, 'novosort sort failed for sample ' + sample + '\n')
        raise SystemExit(1)
    return status
예제 #53
0
def annot_vcf_vep_pipe(config_file, sample_pair, in_suffix, out_suffix,
                       in_mutect, source):
    """Annotate a sample pair's VCF with VEP, then build its report.

    The input VCF is ``<project_dir><project>/<analysis>/<sample_pair>/OUTPUT/
    <sample_pair><in_suffix>``; the annotated VCF is written to
    ``<sample_pair><out_suffix>`` in the current working directory.  Progress
    is logged to ``LOGS/<sample_pair>.vep91_anno.log``.

    VEP is first run with a buffer size of 5000 under ``watch_mem``.  If that
    run reports a non-zero status, partial output is removed and VEP is run
    once more with half the buffer size.

    :param config_file: path handed to ``parse_config``.
    :param sample_pair: identifier used for directory, file and log names.
    :param in_suffix: suffix appended to *sample_pair* to name the input VCF.
    :param out_suffix: suffix appended to *sample_pair* to name the output VCF.
    :param in_mutect: suffix of the file under the OUTPUT directory that is
        passed as second argument to the SNV report generator (only used when
        *source* is ``'mutect'``).
    :param source: ``'scalpel'`` pre-filters the input with ``pass_filter``
        and annotates ``<sample_pair>.somatic_indel.PASS.vcf`` instead;
        ``'mutect'`` selects the SNV report; any other value (including
        ``'scalpel'``) selects the indel report.
    :return: 0 on success, or when no scalpel calls pass filtering.
    :raises SystemExit: with code 1 if the VEP retry fails or report
        generation returns non-zero.
    """
    (vep_tool, vep_cache, fasta, report, dbsnp, vcache, plugin_dir, threads,
     intvl, dustmask_flag, wg_flag, tx_index, project_dir, project,
     analysis) = parse_config(config_file)

    # Scale back on the forking a bit: hold one thread in reserve.
    if int(threads) > 2:
        threads = str(int(threads) - 1)

    loc = 'LOGS/' + sample_pair + '.vep91_anno.log'
    ana_dir = project_dir + project + '/' + analysis + '/' + sample_pair + '/OUTPUT'
    in_vcf = ana_dir + '/' + sample_pair + in_suffix
    out_vcf = sample_pair + out_suffix

    if source == 'scalpel':
        z_check = pass_filter(ana_dir, sample_pair, in_suffix, dustmask_flag)
        if z_check == 0:
            log(
                loc,
                date_time() +
                '0 variant calls PASS scalpel\'s filters, skipping annotation!\n'
            )
            return 0
        in_vcf = sample_pair + '.somatic_indel.PASS.vcf'

    buffer_size = '5000'
    run_cmd = run_vep(vep_tool, in_vcf, out_vcf, buffer_size, threads, fasta,
                      vep_cache, vcache, loc, plugin_dir)
    log(
        loc,
        date_time() + 'Annotating sample ' + sample_pair + in_suffix + ' ' +
        run_cmd + '\n')
    # Start VEP in its own session so the whole process group can be
    # signalled if the run has to be abandoned, giving a cleaner restart.
    check = subprocess.Popen(run_cmd,
                             stdout=subprocess.PIPE,
                             shell=True,
                             preexec_fn=os.setsid)
    check_run = watch_mem(check, source, sample_pair, loc)
    if check_run != 0:
        # Floor division keeps the value an integer string ('2500') on both
        # Python 2 and Python 3; true division would produce '2500.0'.
        buffer_size = str(int(buffer_size) // 2)
        clean_up = 'rm ' + out_vcf + '*'
        log(
            loc,
            date_time() + 'VEP failed. Status of run was ' + str(check_run) +
            ' Trying smaller buffer size of ' + buffer_size + '\n' + clean_up +
            '\n')
        try:
            os.killpg(os.getpgid(check.pid), signal.SIGINT)
        except OSError:
            # The process group may already be gone if VEP exited on its
            # own; there is nothing left to interrupt, so carry on.
            pass

        subprocess.call(clean_up, shell=True)
        run_cmd = run_vep(vep_tool, in_vcf, out_vcf, buffer_size, threads,
                          fasta, vep_cache, vcache, loc, plugin_dir)
        log(
            loc,
            date_time() + 'Annotating sample ' + sample_pair + in_suffix +
            '\n')
        retry_status = subprocess.call(run_cmd, shell=True)
        if retry_status != 0:
            log(loc,
                date_time() + 'VEP failed for sample ' + sample_pair + '\n')
            raise SystemExit(1)
    else:
        log(
            loc,
            date_time() + 'VEP annotation of ' + sample_pair + in_suffix +
            ' successful!\n')

    # The report modules depend on the configured VEP cache version, so they
    # are imported only once that version is known.
    if vep_cache == '84':
        from annotation.deprecated.vep_substitution_report import gen_report as gen_snv_report
        from annotation.deprecated.vep_indel_report import gen_report as gen_indel_report
    else:
        from annotation.VEP91_substitution_report import gen_report as gen_snv_report
        from annotation.VEP91_indel_report import gen_report as gen_indel_report

    if source == 'mutect':
        if wg_flag == 'y':
            intvl = 'n'
        report_status = gen_snv_report(out_vcf,
                                       ana_dir + '/' + sample_pair + in_mutect,
                                       intvl, tx_index, vcache)
    else:
        report_status = gen_indel_report(out_vcf, tx_index, vcache)
    if report_status != 0:
        log(loc,
            date_time() + 'Report generation for ' + out_vcf + ' failed\n')
        raise SystemExit(1)
    return 0