def fastq(ifile, ofile1, ofile2):
    """
    Convert a BAM file into a pair of FASTQ files through ``pysam.fastq``.

    :param ifile: input bam file
    :param ofile1: first file of fastq pair
    :param ofile2: second file of fastq pair
    """
    message = 'generating fastq file for ' + str(ifile)
    logging.info(message)

    # Argument order mirrors the samtools command line: outputs, '-n', input.
    samtools_args = ['-1', ofile1, '-2', ofile2, '-n', ifile]
    pysam.fastq(*samtools_args)
예제 #2
0
def simple_fastq(tmp_path, simple_bam):
    """Dump ``simple_bam`` as FASTQ text into ``tmp_path / 'simple.fq'``.

    The text returned by ``pysam.fastq('-t', '-N', '-O', ...)`` has every tab
    character replaced by an underscore before it is written.

    :param tmp_path: directory object supporting ``/`` (e.g. ``pathlib.Path``)
    :param simple_bam: path of the BAM file to convert
    :return: path of the written FASTQ file, as a string
    """
    import pysam

    target = tmp_path / 'simple.fq'
    raw_fastq = pysam.fastq('-t', '-N', '-O', simple_bam)
    target.write_text(raw_fastq.replace('\t', '_'))
    return str(target)
예제 #3
0
elif (len(sys.argv) == 4):
    bam_in = sys.argv[1]
    bed_path = sys.argv[2]
    out_prefix = sys.argv[3]
else:
    sys.stderr.write(
        "Usage: bam2fq_for_realign.py miniBAM U1_BED [OUT_PREFIX]")
    sys.exit(1)

# MAPQ threshold; not used in the part of the script visible here.
mq_cut = 100  # reads < mq_cut will be realigned

# Temporary BAM written next to the input: "<input without extension>.tmp.bam".
bam_tmp = os.path.splitext(bam_in)[0] + '.tmp.bam'
# Remove duplicates (1024)
# Per the samtools flag options: -F 1024 drops reads flagged as duplicates,
# -f 1 keeps only paired reads, -u writes uncompressed BAM to bam_tmp.
pysam.view('-o', bam_tmp, '-uF', '1024', '-f', '1', bam_in, catch_stdout=False)
# Convert miniBAM to FASTQ
# The FASTQ text returned by pysam is split into individual lines.
fq = pysam.fastq('-O', bam_tmp).split('\n')
# Convert FASTQ to dict
# Maps each header line (stored exactly as emitted) to (sequence, quality).
fq_dict = {}
for i in range(len(fq)):
    # Records are consumed in groups of four lines: header, sequence,
    # separator (index 2, ignored) and quality.
    if i % 4 == 0:
        read_name = fq[i]
    elif i % 4 == 1:
        dna = fq[i]
    elif i % 4 == 3:
        qual = fq[i]
        # Record one read into dict
        # A repeated header overwrites the earlier entry; a trailing
        # incomplete group (e.g. the empty string after the final newline)
        # never reaches this branch and is therefore not stored.
        fq_dict[read_name] = (dna, qual)
read_names = list(fq_dict.keys())
# Convert miniBAM to BED
# Wrap the BED file and the filtered BAM as BedTool objects
# (presumably pybedtools.BedTool -- confirm against the file's imports).
U1U11 = BedTool(bed_path)
bam = BedTool(bam_tmp)
예제 #4
0
def reconst_a(args, params, filenames, refseqid):
    """Build a sample-specific sequence for one virus reference.

    Steps, in order:

    1. Extract the alignments on ``refseqid`` from
       ``filenames.mapped_to_virus_bam`` into ``filenames.tmp_bam``.
    2. Write the reference sequence to ``filenames.tmp_fa`` (header
       ``>refseqid sample_name``), index it and call ``mask_low_depth``
       (presumably this produces ``filenames.tmp_masked_fa``, which the
       bcftools steps read -- confirm against ``mask_low_depth``).
    3. Picard ``CreateSequenceDictionary`` and ``AddOrReplaceReadGroups``,
       then GATK ``HaplotypeCaller``.
    4. ``bcftools norm`` / ``index`` / ``consensus``; the consensus is written
       to ``filenames.hhv6a_gatk_naive``.
    5. Remove intermediate files (more of them when ``args.keep`` is False).
    6. If ``args.denovo`` is True, export read pairs to FASTQ and assemble
       them with ``metaspades.py`` into ``filenames.hhv6a_metaspades_o``.

    :param args: parsed options; reads ``p``, ``alignmentin``, ``b``, ``c``,
        ``fq1``, ``vref``, ``picard``, ``keep`` and ``denovo``
    :param params: reads ``metaspades_kmer`` and ``metaspades_memory``
    :param filenames: object holding all input/output/temporary paths
    :param refseqid: name of the virus reference sequence to reconstruct
    :return: None. The process exits with status 1 if an external command
        fails or any exception is raised.
    """
    log.logger.debug('started.')

    def run_step(tool, cmd):
        """Run ``cmd`` in a shell, log its stderr, exit(1) on non-zero status."""
        # NOTE(review): paths are interpolated into a shell string unquoted;
        # paths containing spaces or shell metacharacters will break this.
        log.logger.debug(tool + ' command = `' + cmd + '`')
        out = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE)
        # errors='replace' so that a stray non-UTF-8 byte in a tool's stderr
        # cannot abort an otherwise successful run.
        log.logger.debug('\n' + '\n'.join(
            [l.decode(errors='replace') for l in out.stderr.splitlines()]))
        if out.returncode != 0:
            # Name the tool that actually failed.
            log.logger.error('Error occurred during %s running.' % tool)
            exit(1)

    try:
        # Use every requested thread up to two, otherwise leave one spare.
        thread_n = args.p if args.p <= 2 else args.p - 1
        if args.alignmentin is True:
            sample_name = os.path.basename(
                args.b if args.b is not None else args.c)
        else:
            sample_name = os.path.basename(args.fq1)

        pysam.view(filenames.mapped_to_virus_bam,
                   '-h',
                   '-o',
                   filenames.tmp_bam,
                   refseqid,
                   catch_stdout=False)
        _, seq = utils.retrieve_only_one_virus_fasta(args.vref, refseqid)
        with open(filenames.tmp_fa, 'w') as outfile:
            outfile.write('>%s %s\n%s\n' % (refseqid, sample_name, seq))
        pysam.faidx(filenames.tmp_fa)

        # mask low depth regions
        mask_low_depth(args, params, filenames, filenames.tmp_fa, refseqid)

        # A stale dictionary from an earlier run is deleted before picard
        # is asked to create a new one.
        if os.path.exists(filenames.tmp_fa_dict):
            os.remove(filenames.tmp_fa_dict)
        run_step(
            'picard',
            'java -jar %s CreateSequenceDictionary R=%s O=%s' %
            (args.picard, filenames.tmp_fa, filenames.tmp_fa_dict))
        run_step(
            'picard',
            'java -jar %s AddOrReplaceReadGroups I=%s O=%s RGLB=lib1 RGPL=ILLUMINA RGPU=unit1 RGSM=20'
            % (args.picard, filenames.tmp_bam, filenames.tmp_rg_bam))

        pysam.index(filenames.tmp_rg_bam)
        run_step(
            'gatk',
            'gatk --java-options "-Xmx4g" HaplotypeCaller -R %s -I %s -O %s' %
            (filenames.tmp_fa, filenames.tmp_rg_bam, filenames.hhv6a_vcf_gz))
        run_step(
            'bcftools',
            'bcftools norm -c x -f %s %s -Oz -o %s' %
            (filenames.tmp_masked_fa, filenames.hhv6a_vcf_gz,
             filenames.hhv6a_norm_vcf_gz))
        run_step('bcftools',
                 'bcftools index %s' % filenames.hhv6a_norm_vcf_gz)
        run_step(
            'bcftools',
            'bcftools consensus -f %s -o %s %s' %
            (filenames.tmp_masked_fa, filenames.hhv6a_gatk_naive,
             filenames.hhv6a_norm_vcf_gz))

        # remove unnecessary files
        for path in (filenames.tmp_rg_bam,
                     filenames.tmp_rg_bam + '.bai',
                     filenames.tmp_fa,
                     filenames.tmp_fa + '.fai',
                     filenames.tmp_masked_fa,
                     filenames.tmp_masked_fa + '.fai',
                     filenames.tmp_fa_dict):
            os.remove(path)
        if args.keep is False:
            os.remove(filenames.hhv6a_vcf_gz + '.tbi')
            os.remove(filenames.hhv6a_norm_vcf_gz)
            os.remove(filenames.hhv6a_norm_vcf_gz + '.csi')

        if args.denovo is True:
            # Name-sort first, then export the pairs to FASTQ.
            pysam.sort('-@', '%d' % thread_n, '-n', '-O', 'BAM', '-o',
                       filenames.tmp_sorted_bam, filenames.tmp_bam)
            pysam.fastq('-@', '%d' % thread_n, '-N', '-f', '1', '-F', '3852',
                        '-0', '/dev/null', '-1', filenames.tmp_bam_fq1, '-2',
                        filenames.tmp_bam_fq2, '-s', '/dev/null',
                        filenames.tmp_sorted_bam)
            run_step(
                'metaspades',
                'metaspades.py -1 %s -2 %s -k %s -t %d -m %d -o %s' %
                (filenames.tmp_bam_fq1, filenames.tmp_bam_fq2,
                 params.metaspades_kmer, thread_n, params.metaspades_memory,
                 filenames.hhv6a_metaspades_o))
            # remove unnecessary files
            os.remove(filenames.tmp_sorted_bam)
            os.remove(filenames.tmp_bam_fq1)
            os.remove(filenames.tmp_bam_fq2)
        # remove unnecessary files
        os.remove(filenames.tmp_bam)
    except Exception:
        # `Exception` rather than a bare `except:` so the SystemExit raised by
        # run_step() is not caught here and logged again with a traceback.
        log.logger.error('\n' + traceback.format_exc())
        exit(1)
def map_to_dr(args, params, filenames, hhv6_refid):
    """Re-map the reads aligned to ``hhv6_refid`` against the DR index.

    Steps, in order:

    1. Extract the alignments on ``hhv6_refid`` from
       ``filenames.mapped_to_virus_bam``, name-sort them and export them to
       ``filenames.unmapped_merged_1`` / ``filenames.unmapped_merged_2``.
    2. Align with hisat2 against ``filenames.hhv6_dr_index`` (single-end when
       both ``args.fastqin`` and ``args.single`` are True, paired otherwise).
    3. Sort, then run Picard ``MarkDuplicates`` to produce
       ``filenames.mapped_to_dr_bam`` and ``filenames.markdup_metrix_dr``.
    4. Set the module-level flag ``read_mapped`` from the metrics file.
    5. Write a bedgraph of the coverage with ``bamCoverage``.

    :param args: parsed options; reads ``p``, ``fastqin``, ``single``,
        ``keep`` and ``picard``
    :param params: reads ``hisat2_mismatch_penalties`` and ``bedgraph_bin``
    :param filenames: object holding all input/output/temporary paths
    :param hhv6_refid: reference name whose alignments are re-mapped
    :return: None. The process exits with status 1 if an external command
        fails or any exception is raised.
    """
    log.logger.debug('started.')

    def run_step(cmd, command_msg, failure_msg):
        """Run ``cmd`` in a shell, log its stderr, exit(1) on non-zero status."""
        # NOTE(review): paths are interpolated into a shell string unquoted;
        # paths containing spaces or shell metacharacters will break this.
        log.logger.debug(command_msg)
        out = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE)
        log.logger.debug('\n' + '\n'.join(
            [l.decode(errors='replace') for l in out.stderr.splitlines()]))
        if out.returncode != 0:
            # No traceback is logged here: no exception is active at this
            # point, so format_exc() would only produce "NoneType: None".
            log.logger.error(failure_msg)
            exit(1)

    try:
        # Use every requested thread up to two, otherwise leave one spare.
        thread_n = args.p if args.p <= 2 else args.p - 1

        pysam.view('-bh', '-o', filenames.tmp_bam,
                   filenames.mapped_to_virus_bam, hhv6_refid,
                   catch_stdout=False)
        pysam.sort('-n', filenames.tmp_bam, '-o', filenames.tmp_sorted_bam)
        pysam.fastq('-N', '-0', '/dev/null',
                    '-1', filenames.unmapped_merged_1,
                    '-2', filenames.unmapped_merged_2,
                    '-s', '/dev/null', filenames.tmp_sorted_bam)

        if args.fastqin is True and args.single is True:
            cmd = 'hisat2 --mp %s -t -x %s -p %d -U %s --no-spliced-alignment | samtools view -Sbh -o %s -' % (
                params.hisat2_mismatch_penalties, filenames.hhv6_dr_index,
                thread_n, filenames.unmapped_merged_1,
                filenames.mapped_unsorted_bam)
        else:
            cmd = 'hisat2 --mp %s -t -x %s -p %d -1 %s -2 %s --no-spliced-alignment | samtools view -Sbh -o %s -' % (
                params.hisat2_mismatch_penalties, filenames.hhv6_dr_index,
                thread_n, filenames.unmapped_merged_1,
                filenames.unmapped_merged_2, filenames.mapped_unsorted_bam)
        run_step(cmd, 'mapping command = `' + cmd + '`',
                 'Error occurred during mapping.')
        if args.keep is not True:
            os.remove(filenames.unmapped_merged_1)
            os.remove(filenames.unmapped_merged_2)

        # sort
        pysam.sort('-@', str(thread_n), '-o', filenames.mapped_sorted,
                   filenames.mapped_unsorted_bam)
        if args.keep is not True:
            os.remove(filenames.mapped_unsorted_bam)

        # mark duplicate
        cmd = 'java -Xms896m -Xmx5376m -jar %s MarkDuplicates CREATE_INDEX=true I=%s O=%s M=%s' % (
            args.picard, filenames.mapped_sorted, filenames.mapped_to_dr_bam,
            filenames.markdup_metrix_dr)
        run_step(cmd, 'picard command = `' + cmd + '`',
                 'Error occurred during picard running.')

        # remove unnecessary files
        os.remove(filenames.tmp_sorted_bam)
        if args.keep is False:
            os.remove(filenames.mapped_sorted)

        # check mapped = 0
        global read_mapped
        read_mapped = True
        with open(filenames.markdup_metrix_dr) as infile:
            for line in infile:
                if 'Unknown Library' in line:
                    # 'Unknown Library' splits into two tokens, so index 2 is
                    # the first numeric column of the metrics row.
                    # NOTE(review): confirm this is the intended column.
                    ls = line.split()
                    if int(ls[2]) == 0:
                        read_mapped = False
                    break

        # convert to bedgraph
        cmd = 'bamCoverage --outFileFormat bedgraph -p %d --binSize %d -b %s -o %s' % (
            thread_n, params.bedgraph_bin, filenames.mapped_to_dr_bam,
            filenames.bedgraph_dr)
        run_step(cmd, 'bamCoverage command = "' + cmd + '"',
                 'Error occurred during bamCoverage.')
    except Exception:
        # `Exception` rather than a bare `except:` so the SystemExit raised by
        # run_step() is not caught here and logged again with a traceback.
        log.logger.error('\n' + traceback.format_exc())
        exit(1)
예제 #6
0
def retrieve_unmapped_reads(args, params, filenames):
    """Export candidate read pairs from the input BAM/CRAM to paired FASTQ.

    Three selection modes, chosen by ``args.use_mate_mapped`` and
    ``args.all_discordant``:

    * both False (default): pairs with both mates unmapped (``-f 12``) are
      exported directly;
    * ``all_discordant`` True: every paired read passing ``-F 3842`` is
      name-sorted and exported;
    * otherwise: both-unmapped pairs are exported, and when
      ``use_mate_mapped`` is True the pairs with exactly one unmapped mate
      are exported as well; the resulting FASTQ files are concatenated.

    Finally, pairs in which either mate is shorter than
    ``params.min_seq_len`` are dropped and the result is written to
    ``filenames.unmapped_merged_1`` / ``filenames.unmapped_merged_2``.

    :param args: parsed options; reads ``p``, ``b``, ``c``, ``fa``, ``keep``,
        ``use_mate_mapped`` and ``all_discordant``
    :param params: reads ``min_seq_len``; also passed to
        ``utils.gzip_or_del``
    :param filenames: object holding all input/output/temporary paths
    :return: None. The process exits with status 1 if any exception is raised.
    """
    log.logger.debug('started.')

    def view_to_discordant_bam(required_flag, thread_n):
        """Filter the input alignment file into ``filenames.discordant_bam``."""
        view_args = ['-@', '%d' % thread_n, '-f', required_flag,
                     '-F', '3842', '-b', '-o', filenames.discordant_bam]
        if args.b is not None:
            view_args.append(args.b)
        elif args.c is not None:
            # CRAM input needs the reference it was compressed against.
            view_args.extend(['--reference', args.fa, args.c])
        else:
            # Neither input given: the original code ran no view at all.
            return
        pysam.view(*view_args, catch_stdout=False)

    def concatenate(sources, destination):
        """Append every existing source to ``destination``, then gzip/delete it."""
        with open(destination, 'w') as outfile:
            for f in sources:
                if os.path.exists(f):
                    with open(f) as infile:
                        outfile.writelines(infile)
                    utils.gzip_or_del(args, params, f)

    try:
        # Use every requested thread up to two, otherwise leave one spare.
        thread_n = args.p if args.p <= 2 else args.p - 1
        threads = '%d' % thread_n

        # retrieve discordant reads, default
        if args.use_mate_mapped is False and args.all_discordant is False:
            view_to_discordant_bam('12', thread_n)
            pysam.fastq('-@', threads, '-N', '-0', '/dev/null', '-1',
                        filenames.unmapped_merged_pre1, '-2',
                        filenames.unmapped_merged_pre2, '-s', '/dev/null',
                        filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
        # retrieve discordant reads, non-default
        else:
            view_to_discordant_bam('1', thread_n)
            pysam.sort('-@', threads, '-n', '-O', 'BAM', '-o',
                       filenames.discordant_sort_bam, filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
            if args.all_discordant is True:
                pysam.fastq('-@', threads, '-N', '-0', '/dev/null',
                            '-1', filenames.unmapped_merged_pre1, '-2',
                            filenames.unmapped_merged_pre2, '-s', '/dev/null',
                            filenames.discordant_sort_bam)
            else:
                # Pairs with both mates unmapped.
                pysam.fastq('-@', threads, '-f', '12', '-F', '3328',
                            '-N', '-0', '/dev/null', '-1',
                            filenames.unmapped_1, '-2', filenames.unmapped_2,
                            '-s', '/dev/null', filenames.discordant_sort_bam)
                if args.use_mate_mapped is True:
                    # Mapped reads whose mate is unmapped (-f 8, -F ...|4).
                    pysam.view('-@', threads, '-f', '8', '-F', '3332',
                               '-b', '-o', filenames.unmapped_bam_3,
                               filenames.discordant_sort_bam,
                               catch_stdout=False)
                    # Unmapped reads whose mate is mapped (-f 4, -F ...|8).
                    pysam.view('-@', threads, '-f', '4', '-F', '3336',
                               '-b', '-o', filenames.unmapped_bam_4,
                               filenames.discordant_sort_bam,
                               catch_stdout=False)
                    pysam.merge('-@', threads, '-f',
                                filenames.unmapped_bam_34,
                                filenames.unmapped_bam_3,
                                filenames.unmapped_bam_4)
                    pysam.sort('-@', threads, '-n', '-O', 'BAM', '-o',
                               filenames.unmapped_sorted_34,
                               filenames.unmapped_bam_34)
                    pysam.fastq('-@', threads, '-N', '-0', '/dev/null',
                                '-1', filenames.unmapped_3, '-2',
                                filenames.unmapped_4, '-s', '/dev/null',
                                filenames.unmapped_sorted_34)
                # concatenate fastq
                concatenate([filenames.unmapped_1, filenames.unmapped_3],
                            filenames.unmapped_merged_pre1)
                concatenate([filenames.unmapped_2, filenames.unmapped_4],
                            filenames.unmapped_merged_pre2)

        # remove short reads
        min_seq_len = params.min_seq_len
        # `with` guarantees all four handles are closed even if filtering
        # raises part-way through.
        with open(filenames.unmapped_merged_pre1) as infile1, \
                open(filenames.unmapped_merged_pre2) as infile2, \
                open(filenames.unmapped_merged_1, 'w') as outfile1, \
                open(filenames.unmapped_merged_2, 'w') as outfile2:
            record1, record2 = [], []
            for line1, line2 in zip(infile1, infile2):
                record1.append(line1)
                record2.append(line2)
                if len(record1) == 4:
                    # Index 1 of a four-line FASTQ record is the sequence.
                    if (len(record1[1].strip()) >= min_seq_len
                            and len(record2[1].strip()) >= min_seq_len):
                        outfile1.write(''.join(record1))
                        outfile2.write(''.join(record2))
                    record1, record2 = [], []
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre1)
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre2)

        if args.keep is False:
            if os.path.exists(filenames.discordant_sort_bam):
                os.remove(filenames.discordant_sort_bam)
            if args.use_mate_mapped is True:
                # These files are not created when all_discordant is also
                # True, so only remove the ones that exist; an unconditional
                # remove turned a successful run into exit(1).
                for path in (filenames.unmapped_bam_3,
                             filenames.unmapped_bam_4,
                             filenames.unmapped_bam_34,
                             filenames.unmapped_sorted_34):
                    if os.path.exists(path):
                        os.remove(path)

    except Exception:
        # `Exception` rather than a bare `except:` so SystemExit and
        # KeyboardInterrupt are not swallowed and re-logged here.
        log.logger.error('\n' + traceback.format_exc())
        exit(1)
예제 #7
0
# Exported notebook cells (the "# In[n]:" markers are the cell numbers).
# `md5`, `location`, `run_ccs` and `run_cmd` are defined outside this excerpt.
md5(location)
    


# In[348]:


# Run the CCS step on a subreads BAM; the second argument is the output BAM.
# NOTE(review): input and output paths are hard-coded to one machine.
ccs_result=run_ccs('/data/yangxiaoxia/bam_sequel/m54152_170704_111850.subreads.bam','/home/kechanglin/gen3ccs_new3.ccs.bam')
# Bare expression: shows the result when run as a notebook cell.
ccs_result


# In[354]:


# Convert the CCS BAM to FASTQ via pysam; no output option is passed, so the
# value bound here is whatever pysam.fastq returns (presumably the FASTQ text).
fastq_file=pysam.fastq('/home/kechanglin/gen3ccs_new3.ccs.bam')


# In[358]:


# Same conversion through the samtools command line, redirected to a file.
# `aa` receives the value returned by os.system (the command's exit status).
aa=os.system('samtools fastq '+'/home/kechanglin/gen3ccs_new3.ccs.bam '+'> '+'/home/kechanglin/data/newfq.fastq')


# In[390]:


# Run fastp on the FASTQ written above (-i input, -o output).
b=run_cmd('fastp -i /home/kechanglin/data/newfq.fastq -o /home/kechanglin/data/fastp_newfq.fq')


# In[393]: