Example #1
def assemble_amplicons(contigs_fa=None,
                       ref_fa=None,
                       ref_gtf=None,
                       outdir='.',
                       sample_id='sampleXX',
                       padding=50,
                       min_contig_len=200,
                       keep_tmp=False,
                       quiet=False,
                       logfile=None,
                       debug=False):
    """ Pipeline step to assemble contigs using reference and amplicon regions

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        sample_id (str): Name to append to scaffold sequence
        padding (int): Bases to include outside reference annotation
        min_contig_len (int): Minimum contig length for tiling path
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_assembly (str): Path to assembled amplicons (FASTA)
        out_summary (str): Path to assembly summary
        out_padded (str): Path to padded output file

    """
    # Check dependencies
    sysutils.check_dependency('nucmer')
    sysutils.check_dependency('delta-filter')
    sysutils.check_dependency('show-tiling')

    # Outputs
    out_assembly = os.path.join(outdir, 'amplicon_assembly.fna')
    out_summary = os.path.join(outdir, 'amplicon_summary.txt')
    out_padded = os.path.join(outdir, 'amplicon_padded.out')
    if os.path.exists(out_padded): os.unlink(out_padded)

    # Temporary directory
    tempdir = sysutils.create_tempdir('assemble_amplicons', None, quiet,
                                      logfile)

    # Create fasta file with sequence IDs only (remove description)
    tmp_contigs_fa = sequtils.clean_seqnames_file(contigs_fa, tempdir)

    # Load reference sequence(s)
    refseqs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # For each amplicon, extract the sequence from the reference and scaffold using nucmer
    amplicon_alignments = []
    amps = [
        gl for gl in gtfparse.gtf_parser(ref_gtf) if gl.feature == 'amplicon'
    ]

    for gl in amps:
        msg = 'Amplicon ref|%s|reg|%s\n' % (gl.chrom, gl.attrs['name'])
        sysutils.log_message(msg, quiet, logfile)
        # Extract reference amplicon
        amp_s = max(0, (gl.start - 1) - padding)
        amp_e = min(len(refseqs[gl.chrom]), gl.end + padding)
        ampseq = refseqs[gl.chrom].seq[amp_s:amp_e]
        amplicon_fa = os.path.join(tempdir, 'subject.fa')
        with open(amplicon_fa, 'w') as outh:
            print('>ref|%s|reg|%s' % (gl.chrom, gl.attrs['name']), file=outh)
            print(sequtils.wrap(str(ampseq)), file=outh)

        # Align with nucmer
        fil, til = alignutils.align_nucmer(tmp_contigs_fa,
                                           amplicon_fa,
                                           tempdir,
                                           min_contig_len=min_contig_len,
                                           quiet=quiet,
                                           logfile=logfile,
                                           debug=debug)

        # Skip everything else if debugging
        if debug: continue

        # Parse tiling and show alignments
        trows = [alignutils.TilingRow(l) for l in open(til, 'r')]
        if not trows:
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], None))
        else:
            # Initialize alignment
            amp_seq = SeqIO.read(amplicon_fa, 'fasta')
            combined = alignutils.EmptyReferenceAlignment(
                str(amp_seq.seq).lower())
            for tr in trows:
                out = alignutils.show_aligns(tr.ref, tr.qry, fil)
                for nucaln in alignutils.parse_show_aligns(out):
                    combined = combined.merge_alignments(nucaln)
                    with open(out_padded, 'a') as outh:
                        print('%s\n%s\n%s' %
                              (tr, combined.raln(), combined.qaln()),
                              file=outh)
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], combined))

        # Cleanup
        for f in [fil, til, amplicon_fa]:
            if os.path.isfile(f):
                os.unlink(f)

    # Write to output files
    with open(out_assembly, 'w') as outseq, open(out_summary, 'w') as outsum:
        for ref_id, reg, combined in amplicon_alignments:
            amp_id = sequtils.make_seq_id(sid=sample_id, ref=ref_id, reg=reg)
            if combined is None:
                msg1 = '%s\tFAIL\t%d' % (amp_id, 0)
                msg2 = u'%s\tFAIL\t%d\t%s\n' % (amp_id, 0, u"👎🏼")
                if logfile is not None:
                    print(u'%s\tFAIL\t%d\t%s' % (amp_id, 0, u"👎🏼"),
                          file=logfile)
            else:
                scaf, s, e = combined.scaffold2()
                msg1 = '%s\tPASS\t%d' % (amp_id, len(scaf))
                msg2 = u'%s\tPASS\t%d\t%s\n' % (amp_id, len(scaf), u"👍🏼")
                print('>%s' % (amp_id), file=outseq)
                print('%s' % sequtils.wrap(scaf), file=outseq)

            print(msg1, file=outsum)
            sysutils.log_message(msg2, quiet, logfile)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_amplicons', quiet, logfile)

    return out_assembly, out_summary, out_padded
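
# A hypothetical usage sketch, not part of the original module: the paths below
# are placeholders and assume the helper imports used above are available. The
# call scaffolds the de novo contigs against each amplicon annotated in the GTF
# and returns the three output paths written by the function.
if __name__ == '__main__':
    asm_fa, asm_summary, asm_padded = assemble_amplicons(
        contigs_fa='contigs.fa',
        ref_fa='reference.fasta',
        ref_gtf='reference_amplicons.gtf',
        outdir='amplicon_out',
        sample_id='sample01',
        padding=50,
    )
    print(asm_fa, asm_summary, asm_padded)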
Example #2
def align_reads(
    fq1=None,
    fq2=None,
    fqU=None,
    ref_fa=None,
    outdir='.',
    bt2_preset='sensitive-local',
    sample_id='sampleXX',
    no_realign=False,
    remove_duplicates=False,
    encoding=None,
    ncpu=1,
    xmx=sysutils.get_java_heap_size(),
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to align reads

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        bt2_preset (str): Bowtie2 preset to use for alignment
        sample_id (str): Read group ID
        no_realign (bool): Do not realign indels
        remove_duplicates (bool): Remove duplicates from final alignment
        encoding (str): Quality score encoding
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_aligned (str): Path to aligned BAM file
        out_bt2 (str): Path to bowtie2 report

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    if encoding is None:
        if input_reads == 'single':
            encoding = helpers.guess_encoding(fqU)
        else:
            encoding = helpers.guess_encoding(fq1)

    # Check dependencies
    sysutils.check_dependency('bowtie2')
    sysutils.check_dependency('samtools')
    sysutils.check_dependency('picard')

    # Identify correct command for GATK
    GATK_BIN = sysutils.determine_dependency_path(['gatk', 'gatk3'])

    # Set JVM heap argument (for GATK)
    JAVA_HEAP = '_JAVA_OPTIONS="-Xmx%dg"' % xmx

    # Outputs
    out_aligned = os.path.join(outdir, 'aligned.bam')
    out_bt2 = os.path.join(outdir, 'aligned.bt2.out')

    # Temporary directory
    tempdir = sysutils.create_tempdir('align_reads', None, quiet, logfile)

    # Copy and index initial reference
    curref = os.path.join(tempdir, 'initial.fasta')
    cmd1 = ['cp', ref_fa, curref]
    cmd2 = ['samtools', 'faidx', curref]
    cmd3 = [
        'picard', 'CreateSequenceDictionary',
        'R=%s' % curref,
        'O=%s' % os.path.join(tempdir, 'initial.dict')
    ]
    cmd4 = ['bowtie2-build', curref, os.path.join(tempdir, 'initial')]
    sysutils.command_runner([cmd1, cmd2, cmd3, cmd4], 'align_reads:index',
                            quiet, logfile, debug)

    # Align with bowtie2
    cmd5 = [
        'bowtie2',
        '-p',
        '%d' % ncpu,
        '--phred33' if encoding == "Phred+33" else '--phred64',
        '--no-unal',
        '--rg-id',
        sample_id,
        '--rg',
        'SM:%s' % sample_id,
        '--rg',
        'LB:1',
        '--rg',
        'PU:1',
        '--rg',
        'PL:illumina',
        '--%s' % bt2_preset,
        '-x',
        '%s' % os.path.join(tempdir, 'initial'),
    ]
    if input_reads in [
            'paired',
            'both',
    ]:
        cmd5 += [
            '-1',
            fq1,
            '-2',
            fq2,
        ]
    if input_reads in [
            'single',
            'both',
    ]:
        cmd5 += [
            '-U',
            fqU,
        ]
    cmd5 += [
        '-S',
        os.path.join(tempdir, 'aligned.bt2.sam'),
    ]
    cmd5 += [
        '2>',
        out_bt2,
    ]

    try:
        sysutils.command_runner([
            cmd5,
        ], 'align_reads:bowtie2', quiet, logfile, debug)
    except PipelineStepError as e:
        if os.path.exists(out_bt2):
            with open(out_bt2, 'r') as fh:
                print('[--- bowtie2 stderr ---]\n%s' % fh.read(),
                      file=sys.stderr)
        raise

    cmd6 = [
        'samtools',
        'view',
        '-u',
        os.path.join(tempdir, 'aligned.bt2.sam'),
        '|',
        'samtools',
        'sort',
        '>',
        os.path.join(tempdir, 'sorted.bam'),
    ]
    cmd7 = [
        'samtools',
        'index',
        os.path.join(tempdir, 'sorted.bam'),
    ]
    sysutils.command_runner([
        cmd6,
        cmd7,
    ], 'align_reads:samsort', quiet, logfile, debug)

    cur_bam = os.path.join(tempdir, 'sorted.bam')

    if remove_duplicates:
        sysutils.log_message('[--- Removing duplicates ---]', quiet, logfile)
    else:
        sysutils.log_message('[--- Marking duplicates ---]', quiet, logfile)

    # MarkDuplicates
    cmd8 = [
        'picard',
        'MarkDuplicates',
        'CREATE_INDEX=true',
        'USE_JDK_DEFLATER=true',
        'USE_JDK_INFLATER=true',
        'M=%s' % os.path.join(tempdir, 'rmdup.metrics.txt'),
        'I=%s' % cur_bam,
        'O=%s' % os.path.join(tempdir, 'rmdup.bam'),
    ]
    if remove_duplicates:
        cmd8 += [
            'REMOVE_DUPLICATES=true',
        ]
    sysutils.command_runner([
        cmd8,
    ], 'align_reads:markdups', quiet, logfile, debug)
    cur_bam = os.path.join(tempdir, 'rmdup.bam')

    if no_realign:
        print('[--- Skipping realignment ---]', file=sys.stderr)
    else:
        # RealignerTargetCreator
        cmd9 = [
            JAVA_HEAP,
            GATK_BIN,
            '-T',
            'RealignerTargetCreator',
            '-I',
            cur_bam,
            '-R',
            curref,
            '-o',
            os.path.join(tempdir, 'tmp.intervals'),
        ]
        # IndelRealigner
        cmd10 = [
            JAVA_HEAP, GATK_BIN, '-T', 'IndelRealigner', '--use_jdk_deflater',
            '--use_jdk_inflater', '-maxReads', '1000000', '-dt', 'NONE', '-I',
            cur_bam, '-R', curref, '-targetIntervals',
            os.path.join(tempdir, 'tmp.intervals'), '-o',
            os.path.join(tempdir, 'realign.bam')
        ]
        sysutils.command_runner([
            cmd9,
            cmd10,
        ], 'align_reads:realign', quiet, logfile, debug)
        cur_bam = os.path.join(tempdir, 'realign.bam')

    # Check that cur_bam was created
    if not os.path.exists(cur_bam):
        msg = "BAM does not exist: %s" % cur_bam
        raise sysutils.PipelineStepError(msg)

    cmd11a = [
        'rm',
        '-f',
        out_aligned,
    ]
    cmd11b = [
        'mv',
        cur_bam,
        out_aligned,
    ]
    cmd11c = [
        'samtools',
        'index',
        out_aligned,
    ]
    sysutils.command_runner([
        cmd11a,
        cmd11b,
        cmd11c,
    ], 'align_reads:copy', quiet, logfile, debug)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'align_reads', quiet, logfile)

    return out_aligned, out_bt2
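
# A hypothetical usage sketch, not part of the original module: placeholder
# paths for a paired-end sample aligned against ref.fasta with four threads.
if __name__ == '__main__':
    bam, bt2_report = align_reads(
        fq1='sample_1.fastq',
        fq2='sample_2.fastq',
        ref_fa='ref.fasta',
        outdir='aln_out',
        bt2_preset='sensitive-local',
        sample_id='sample01',
        ncpu=4,
    )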
Example #3
def vcf_to_consensus(
    vcf=None,
    outdir='.',
    sampidx=0,
    min_dp=5,
    major=0.5,
    minor=0.2,
    keep_tmp=False,
    quiet=False,
    logfile=None,
):
    """ Pipeline step to create consensus sequence from VCF

    Args:
        vcf (str): Path to variant calls (VCF)
        outdir (str): Path to output directory
        sampidx (int): Index for sample if multi-sample VCF
        min_dp (int): Minimum depth to call site
        major (float): Allele fraction to make unambiguous call
        minor (float): Allele fraction to make ambiguous call
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file

    Returns:
        out_fasta (str): Path to consensus sequence (FASTA)

    """
    # Check inputs
    if vcf is None:
        raise sysutils.PipelineStepError('VCF file is required')

    # Outputs
    out_fasta = os.path.join(outdir, 'consensus.fna')

    sysutils.log_message('[--- vcf_to_consensus ---]\n', quiet, logfile)
    sysutils.log_message('VCF:          %s\n' % vcf, quiet, logfile)

    # Parse VCF
    chroms = []
    samples = []

    if os.path.splitext(vcf)[1] == '.gz':
        lines = (l.decode('utf-8').strip('\n') for l in gzip.open(vcf, 'rb'))
    else:
        lines = (l.strip('\n') for l in open(vcf, 'r'))

    # Parse headers
    for l in lines:
        if l.startswith('##'):
            m = re.match('##contig=<ID=(\S+),length=(\d+)>', l)
            if m:
                chroms.append((m.group(1), int(m.group(2))))
        else:
            assert l.startswith('#')
            cols = l.strip('#').split('\t')
            samples = cols[9:]
            break

    if len(samples) <= sampidx:
        msg = 'Sample index %d does not exist. Samples: %s' % (sampidx,
                                                               str(samples))
        raise sysutils.PipelineStepError(msg)

    chrom_ordered = [_[0] for _ in chroms]
    chroms = dict(chroms)
    newseqs = dict((c, ['.'] * chroms[c]) for c in list(chroms.keys()))
    imputed = dict((c, [''] * chroms[c]) for c in list(chroms.keys()))
    for l in lines:
        chrom, start, stop, RA, AA, info, svals = parse_vcf_sample(l, sampidx)
        gt = call_gt(RA, AA, svals, min_dp, major, minor)

        if gt is None:
            imputed[chrom][start - 1] = RA[0].lower()
        else:
            if len(gt) == 1:
                newseqs[chrom][start - 1] = gt[0]
                imputed[chrom][start - 1] = gt[0]
            else:
                if all(len(_) == 1 for _ in gt):
                    newseqs[chrom][start - 1] = sequtils.get_ambig(gt)
                    imputed[chrom][start - 1] = sequtils.get_ambig(gt)
                else:
                    newseqs[chrom][start - 1] = ''.join(gt[0])
                    imputed[chrom][start - 1] = ''.join(gt[0])
    # newseqs = imputed
    sysutils.log_message('Output FASTA: %s\n' % out_fasta, quiet, logfile)
    with open(out_fasta, 'w') as outh:
        for chrom in chrom_ordered:
            new_seqid = sequtils.update_seq_id(chrom, samples[sampidx])
            new_seq = ''.join(newseqs[chrom]).replace('.', 'n')
            m = re.match('^(?P<pre>n*)(?P<seq>[^n].+[^n])?(?P<suf>n*)$',
                         new_seq)
            if m.group('seq') is None:
                msg = u'%s\tFAIL\t%d\t%s\n' % (new_seqid, 0, u"👎🏼")
                # Don't output sequence if not present
            else:
                msg = u'%s\tPASS\t%d\t%s\n' % (new_seqid, len(
                    m.group('seq')), u"👍🏼")
                print('>%s SM:%s' % (new_seqid, samples[sampidx]), file=outh)
                print(sequtils.wrap(new_seq), file=outh)

            sysutils.log_message(msg, quiet, logfile)

    return out_fasta
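
# A standalone sketch (not from the original source) of the padding check used
# above: the named groups capture runs of leading/trailing 'n' so the reported
# PASS length counts only the called portion of the consensus. The sequence
# below is made up.
import re

_example_seq = 'nnnACGTRYacgtnn'
_m = re.match(r'^(?P<pre>n*)(?P<seq>[^n].+[^n])?(?P<suf>n*)$', _example_seq)
print(len(_m.group('seq')))  # 10 called bases between the flanking 'n' runs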
Example #4
def ph_parser(
    haplotypes_fa=None,
    outdir='.',
    prefix=None,
    keep_gaps=False,
    quiet=False,
    logfile=None,
):
    """

    Args:
        haplotypes_fa (str): Path to haplotype file from PredictHaplo (fasta-ish)
        outdir (str): Path to output directory
        prefix (str): Prefix to add to sequence names
        keep_gaps (bool): Do not remove gaps from alignment
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file

    Returns:

    """
    summary_txt = open(os.path.join(outdir, 'ph_summary.txt'), 'w')
    newseq_fa = open(os.path.join(outdir, 'ph_haplotypes.fna'), 'w')

    num_hap = 0
    freq = []
    fasta = []
    newseq = None
    ph = os.path.basename(haplotypes_fa).split(".")[0]
    for l in open(haplotypes_fa, 'r'):
        l = l.strip('\n')
        if l.startswith('>'):
            num_hap += 1
            if newseq is not None:
                fasta.append(newseq)
            newseq = [ph, l.strip(">"), None, ""]
        elif l.startswith(';'):
            parts = l.strip(';').split(':')
            if parts[0] == 'Freq':
                freq.append(float(parts[1]))
                newseq[2] = float(parts[1])
            else:
                pass
        else:
            newseq[3] += l.strip('\n')

    fasta.append(newseq)

    if len(fasta) == num_hap:
        sysutils.log_message("Number of haplotypes is correct.\n", quiet,
                             logfile)
        freq_sqrd = [x**2 for x in freq]
        freq_sqrd_sum = sum(freq_sqrd)

        hap_div = ((old_div(7000, (7000 - 1))) * (1 - freq_sqrd_sum))

        print("PH_num_hap %s" % num_hap, file=summary_txt)
        print("PH_hap_diversity %s" % hap_div, file=summary_txt)

        seqlen = len(fasta[0][-1])
        equal_len = True
        for seq in fasta:
            sl = len(seq[-1])
            if sl != seqlen:
                sysutils.log_message(
                    "Sequence lengths differ between haplotypes.\n",
                    quiet, logfile)
                equal_len = False
            else:
                pass
        if equal_len:
            print("PH_seq_len %s" % seqlen, file=summary_txt)

        for sub_list in fasta:
            if prefix is None:
                print('>sid|%s_%s|reg|%s| Freq=%s' %
                      (sub_list[0], sub_list[1], sub_list[0].split("_")[-1],
                       sub_list[2]),
                      file=newseq_fa)
            else:
                print('>sid|%s_%s_%s|reg|%s| Freq=%s' %
                      (prefix, sub_list[0], sub_list[1],
                       sub_list[0].split("_")[-1], sub_list[2]),
                      file=newseq_fa)
            if keep_gaps:
                print("%s" % (sub_list[-1]), file=newseq_fa)
            else:
                print("%s" % (sub_list[-1].replace('-', "")), file=newseq_fa)

        sysutils.log_message(
            "Summary and FASTA file completed for %s.\n" % haplotypes_fa,
            quiet, logfile)

    summary_txt.close()
    newseq_fa.close()
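
# A minimal numeric sketch (not from the original source) of the diversity
# statistic computed above, using made-up haplotype frequencies. old_div floors
# to an integer for integer inputs, so the 7000/(7000 - 1) correction factor
# evaluates to 1 and the value reduces to 1 - sum(freq ** 2).
_freqs = [0.5, 0.3, 0.2]
_correction = 7000 // (7000 - 1)  # old_div(7000, 6999) == 1
_hap_div = _correction * (1 - sum(f ** 2 for f in _freqs))
print(round(_hap_div, 2))  # 0.62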
Example #5
def sample_reads(
    fq1=None,
    fq2=None,
    fqU=None,
    outdir='.',
    nreads=None,
    frac=None,
    seed=None,
    quiet=False,
    logfile=None,
    debug=False,
):
    """

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        outdir (str): Path to output directory
        nreads (int): Number of reads to sample
        frac (float): Fraction of reads to sample
        seed (int): Seed for random number generator
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out1 (str): Path to sampled fastq file with read 1
        out2 (str): Path to sampled fastq file with read 2
        outU (str): Path to sampled fastq file with unpaired reads
    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Check dependencies
    sysutils.check_dependency('seqtk')

    # Set seed
    seed = seed if seed is not None else random.randrange(1, 1000)
    sysutils.log_message('[--- sample_reads ---] Random seed = %d\n' % seed,
                         quiet, logfile)

    # Set nreads/frac
    if frac is not None:
        if frac <= 0 or frac > 1:
            raise sysutils.PipelineStepError('--frac must be > 0 and <= 1.')
        frac_arg = '%f' % frac
    else:
        frac_arg = '%d' % nreads

    cmds = None
    if input_reads == 'single':
        out1 = out2 = None
        outU = os.path.join(outdir, 'sample_U.fastq')
        cmds = [
            [
                'seqtk',
                'sample',
                '-s%d' % seed,
                fqU,
                frac_arg,
                '>',
                outU,
            ],
        ]
    elif input_reads == 'paired':
        out1 = os.path.join(outdir, 'sample_1.fastq')
        out2 = os.path.join(outdir, 'sample_2.fastq')
        outU = None
        cmds = [
            [
                'seqtk',
                'sample',
                '-s%d' % seed,
                fq1,
                frac_arg,
                '>',
                out1,
            ],
            [
                'seqtk',
                'sample',
                '-s%d' % seed,
                fq2,
                frac_arg,
                '>',
                out2,
            ],
        ]
    elif input_reads == 'both':
        out1 = os.path.join(outdir, 'sample_1.fastq')
        out2 = os.path.join(outdir, 'sample_2.fastq')
        outU = os.path.join(outdir, 'sample_U.fastq')
        cmds = [
            [
                'seqtk',
                'sample',
                '-s%d' % seed,
                fq1,
                frac_arg,
                '>',
                out1,
            ],
            [
                'seqtk',
                'sample',
                '-s%d' % seed,
                fq2,
                frac_arg,
                '>',
                out2,
            ],
            [
                'seqtk',
                'sample',
                '-s%d' % seed,
                fqU,
                frac_arg,
                '>',
                outU,
            ],
        ]

    sysutils.command_runner(cmds, 'sample_reads', quiet, logfile, debug)
    return out1, out2, outU
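
# A hedged sketch (not from the original source) of the shell command the
# paired-end branch assembles; seqtk's -s option fixes the subsampling seed,
# and reusing the same seed for both mates keeps read 1 and read 2 in sync.
_seed, _frac_arg = 1234, '10000'
_cmd = ['seqtk', 'sample', '-s%d' % _seed, 'reads_1.fastq', _frac_arg,
        '>', 'sample_1.fastq']
print(' '.join(_cmd))  # seqtk sample -s1234 reads_1.fastq 10000 > sample_1.fastq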
Example #6
def assemble_to_ref(qry_fa,
                    ref_fa,
                    outdir,
                    pad_fh=None,
                    quiet=False,
                    logfile=None,
                    debug=False):
    """

    Args:
        qry_fa:
        ref_fa:
        outdir:
        pad_fh:
        quiet:
        logfile:
        debug:

    Returns:

    """
    # Align query to reference
    fil, til = align_nucmer(qry_fa,
                            ref_fa,
                            outdir,
                            quiet=quiet,
                            logfile=logfile,
                            debug=debug)
    if debug:
        return None

    # Parse tiling rows
    tr_byref = defaultdict(list)
    for l in open(til, 'r'):
        tr = TilingRow(l)
        tr_byref[tr.ref].append(tr)

    # Load reference(s)
    refs = sorted(tr_byref.keys())
    ref_dict = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}
    sysutils.log_message('\nReferences: %s\n' % ', '.join(refs), quiet,
                         logfile)

    scaffolds = {}
    for ref in refs:
        if pad_fh is not None:
            empty = EmptyReferenceAlignment(str(ref_dict[ref].seq).lower())
            print('%s%s' % (ref.ljust(40), empty.rseq().upper()), file=pad_fh)
        scaffolds[ref] = EmptyReferenceAlignment(
            str(ref_dict[ref].seq).lower())
        # Rank hits so that worst hit is in index 0 (best at the end)
        ranked = sorted(tr_byref[ref], key=lambda x: x.pid)
        ranked.sort(key=lambda x: x.qry_alen)

        for tr in ranked:
            out = show_aligns(tr.ref, tr.qry, fil)
            out = out.decode()
            # May be multiple alignments
            flag = False
            aln_reports = []
            for l in out.strip('\n').split('\n'):
                if re.match('^--\s+BEGIN', l):
                    aln_reports.append(list())
                    flag = True
                if flag:
                    aln_reports[-1].append(l)
                if re.match('^--\s+END', l):
                    flag = False

            for aln_report in aln_reports:
                # if debug:
                #     print("*" * 80, file=sys.stderr)
                #     print('\n'.join(aln_report), file=sys.stderr)
                #     print("*" * 80, file=sys.stderr)
                nucaln = NucmerReferenceAlignment(aln_report)
                # print('%d-%d' % (nucaln.rstart, nucaln.rend))
                if pad_fh is not None:
                    pad = empty.merge_alignments(nucaln)
                    print('%s%s' % (tr.qry.ljust(40), pad.padded()),
                          file=pad_fh)
                scaffolds[ref] = scaffolds[ref].merge_alignments(nucaln)

    return scaffolds
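
# A standalone sketch (not real nucmer output) of the BEGIN/END block splitting
# used above: each show-aligns report between a "-- BEGIN" and "-- END" marker
# is collected into its own list.
import re

_out = ('-- BEGIN alignment 1\nref line\nqry line\n-- END alignment 1\n'
        '-- BEGIN alignment 2\nref line\nqry line\n-- END alignment 2\n')
_flag, _aln_reports = False, []
for _l in _out.strip('\n').split('\n'):
    if re.match(r'^--\s+BEGIN', _l):
        _aln_reports.append(list())
        _flag = True
    if _flag:
        _aln_reports[-1].append(_l)
    if re.match(r'^--\s+END', _l):
        _flag = False
print(len(_aln_reports))  # 2 alignment reports parsed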
Example #7
def cliquesnv(fq1=None,
              fq2=None,
              fqU=None,
              ref_fa=None,
              outdir='.',
              jardir='.',
              O22min=None,
              O22minfreq=None,
              printlog=None,
              single=False,
              merging=None,
              fasta_format='extended4',
              outputstart=None,
              outputend=None,
              keep_tmp=False,
              quiet=False,
              logfile=None,
              debug=False,
              ncpu=1):
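    """ Pipeline step to reconstruct haplotypes with CliqueSNV

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        jardir (str): Path to directory containing clique-snv.jar
        O22min (float): Threshold passed to CliqueSNV as -t
        O22minfreq (float): Frequency threshold passed to CliqueSNV as -tf
        printlog (bool): If not None, pass -log to CliqueSNV
        single (bool): Treat input as single-end (set automatically when only fqU is given)
        merging (str): Clique merging option passed to CliqueSNV as -cm
        fasta_format (str): Output format passed to CliqueSNV as -fdf
        outputstart (int): Start position passed to CliqueSNV as -os
        outputend (int): End position passed to CliqueSNV as -oe
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        ncpu (int): Number of threads passed to CliqueSNV
    """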

    # check if paired vs. single
    if fq1 is None and fq2 is None and fqU is not None:
        single = True

    # check dependencies and required arguments
    if fq1 is None and fq2 is None and fqU is None:
        raise MissingRequiredArgument("No fastq files given.")
    if not single and (fq1 is None or fq2 is None):
        raise MissingRequiredArgument("Either fq1 or fq2 missing.")
    if ref_fa is None:
        raise MissingRequiredArgument("Reference FASTA missing.")

    sysutils.check_dependency('samtools')
    sysutils.check_dependency('bwa')

    if (os.path.isfile(os.path.join(jardir, "clique-snv.jar"))):
        print("CliqueSNV JAR file found.")
    else:
        raise MissingRequiredArgument("No JAR file found.")

    # Temporary directory
    tempdir = sysutils.create_tempdir('clique_snv', None, quiet, logfile)

    # Load reference fasta
    refs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # Identify reconstruction regions
    regions = []
    for rname, s in refs.items():
        regions.append(('cs%02d' % (len(regions) + 1), rname, 1, len(s)))

    sysutils.log_message('[--- Haplotype Reconstruction Regions ---]\n', quiet,
                         logfile)
    for iv in regions:
        sysutils.log_message('%s -- %s:%d-%d\n' % iv, quiet, logfile)

    if not single:  # paired end
        # remove .1 and .2 from read names
        fq1_c = os.path.join(tempdir, "fq1_corrected.fastq")
        fq2_c = os.path.join(tempdir, "fq2_corrected.fastq")
        cmd01 = ["cat %s | sed 's/\.1 / /' > %s" % (fq1, fq1_c)]
        cmd02 = ["cat %s | sed 's/\.2 / /' > %s" % (fq2, fq2_c)]
        sysutils.command_runner([cmd01, cmd02], 'clique_snv:setup', quiet,
                                logfile, debug)

        # Create alignment for each REFERENCE in the reconstruction regions
        alnmap = {}
        for cs, rname, spos, epos in regions:
            if rname not in alnmap:
                # Create alignment
                tmp_ref_fa = os.path.join(tempdir, 'ref.%d.fa' % len(alnmap))
                tmp_sam = os.path.join(tempdir, 'aligned.%d.sam' % len(alnmap))
                SeqIO.write(refs[rname], tmp_ref_fa, 'fasta')
                cmd1 = [
                    'bwa',
                    'index',
                    tmp_ref_fa,
                ]
                cmd2 = [
                    'bwa',
                    'mem',
                    tmp_ref_fa,
                    fq1_c,
                    fq2_c,
                    '|',
                    'samtools',
                    'view',
                    '-h',
                    '-F',
                    '12',
                    '>',
                    tmp_sam,
                ]
                cmd3 = ['rm', '-f', '%s.*' % tmp_ref_fa]
                sysutils.command_runner([cmd1, cmd2, cmd3], 'clique_snv:setup',
                                        quiet, logfile, debug)
                alnmap[rname] = (tmp_ref_fa, tmp_sam)

    else:  #single read

        # Create alignment for each REFERENCE in the reconstruction regions
        alnmap = {}
        for cs, rname, spos, epos in regions:
            if rname not in alnmap:
                # Create alignment
                tmp_ref_fa = os.path.join(tempdir, 'ref.%d.fa' % len(alnmap))
                tmp_sam = os.path.join(tempdir, 'aligned.%d.sam' % len(alnmap))
                SeqIO.write(refs[rname], tmp_ref_fa, 'fasta')
                cmd1 = [
                    'bwa',
                    'index',
                    tmp_ref_fa,
                ]
                cmd2 = [
                    'bwa',
                    'mem',
                    tmp_ref_fa,
                    fqU,
                    '|',
                    'samtools',
                    'view',
                    '-h',
                    '-F',
                    '12',
                    '>',
                    tmp_sam,
                ]
                cmd3 = ['rm', '-f', '%s.*' % tmp_ref_fa]
                sysutils.command_runner([cmd1, cmd2, cmd3], 'clique_snv:setup',
                                        quiet, logfile, debug)
                alnmap[rname] = (tmp_ref_fa, tmp_sam)

    # Run CliqueSNV for each region
    cmd4 = ['mkdir -p %s' % os.path.join(outdir, 'clique_snv')]
    sysutils.command_runner([
        cmd4,
    ],
                            stage='cliquesnv',
                            quiet=quiet,
                            logfile=logfile,
                            debug=debug)
    i = 0  #index for filenames
    for cs, rname, spos, epos in regions:
        msg = "Reconstruction region %s:" % cs
        msg += " %s:%d-%d\n" % (rname, spos, epos)
        sysutils.log_message(msg, quiet, logfile)

        # rename the cliquesnv number (cs##) to include region (now: cs##_reg)
        cs = '%s_%s' % (cs, rname.split('|')[-2])

        samfile = os.path.join(tempdir, 'aligned.%d.sam' % i)
        method = 'snv-illumina'
        cmd5 = [
            'java -jar %s -m %s -in %s -threads %d -outDir %s -fdf %s' %
            (os.path.join(jardir, 'clique-snv.jar'), method, samfile, ncpu,
             tempdir, fasta_format)
        ]
        if O22min is not None:
            cmd5 += ['-t %f' % O22min]
        if O22minfreq is not None:
            cmd5 += ['-tf %f' % O22minfreq]
        if printlog is not None:
            cmd5 += ['-log']
        if merging is not None:
            cmd5 += ['-cm %s' % merging]
        if outputstart is not None:
            cmd5 += ['-os %d' % outputstart]
        if outputend is not None:
            cmd5 += ['-oe %d' % outputend]
        sysutils.command_runner([
            cmd5,
        ],
                                stage='clique_snv',
                                quiet=quiet,
                                logfile=logfile,
                                debug=debug)

        # copy output file and delete tempdir
        outname1 = 'aligned.%d.txt' % i
        outname2 = 'aligned.%d.fasta' % i

        os.makedirs(os.path.join(outdir, 'clique_snv/%s' % cs), exist_ok=True)
        if os.path.exists(os.path.join(tempdir, '%s' % outname1)):
            shutil.copy(
                os.path.join(tempdir, '%s' % outname1),
                os.path.join(outdir, 'clique_snv/%s/%s.txt' % (cs, cs)))
        if os.path.exists(os.path.join(tempdir, '%s' % outname2)):
            shutil.copy(
                os.path.join(tempdir, '%s' % outname2),
                os.path.join(outdir, 'clique_snv/%s/%s.fasta' % (cs, cs)))

        # parse output file
        with open(
                os.path.join(outdir,
                             'clique_snv/%s/%s_summary.txt' % (cs, cs)),
                'w') as sumfile, open(
                    os.path.join(outdir, 'clique_snv/%s/%s.txt' % (cs, cs)),
                    'r') as infile:
            l = infile.readlines()
            freqs = []
            haps = []
            tempnum = ''
            for line in l:
                if "SNV got" in line:
                    tempnum = line.split(' ')[2]
                if "frequency" in line:
                    freqs += [float(line.split(' ')[2][:-2])]
                if "haplotype=" in line:
                    haps += [line.split('=')[1][1:-2]]
            sumfile.write('CliqueSNV_num_hap\t%s\n' % tempnum)

            freq_sqrd = [x**2 for x in freqs]
            freq_sqrd_sum = sum(freq_sqrd)
            hap_div = ((old_div(7000, (7000 - 1))) * (1 - freq_sqrd_sum))
            sumfile.write('CliqueSNV_hap_diversity\t%s\n' % hap_div)
            sumfile.write('CliqueSNV_seq_len\t%s\n' % len(haps[0]))

        with open(os.path.join(outdir, 'clique_snv/%s/%s.fasta' % (cs, cs)),
                  'r') as fastafile:
            fastadata = fastafile.read().replace('aligned.%d' % i, rname)
            with open(
                    os.path.join(outdir, 'clique_snv/%s/%s.fasta' % (cs, cs)),
                    'w') as newfastafile:
                newfastafile.write(fastadata)

        i += 1

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'clique_snv', quiet, logfile)

    return
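
# A hypothetical usage sketch, not part of the original module: placeholder
# paths for a paired-end run, with clique-snv.jar expected under jardir.
if __name__ == '__main__':
    cliquesnv(fq1='sample_1.fastq',
              fq2='sample_2.fastq',
              ref_fa='refs.fasta',
              outdir='hp_out',
              jardir='/opt/cliquesnv',
              ncpu=4)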
Example #8
def progressive_refine_assembly(
        fq1=None, fq2=None, fqU=None, ref_fa=None, outdir='.',
        max_step=None, subsample=None, seed=None, sample_id='sampleXX',
        ncpu=1, xmx=sysutils.get_java_heap_size(),
        keep_tmp=False, quiet=False, logfile=None, debug=False,
    ):
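    """ Pipeline step to iteratively refine an assembly

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to initial reference fasta file
        outdir (str): Path to output directory
        max_step (int): Maximum number of refinement iterations
        subsample (int): Subsample reads in each iteration (None disables subsampling)
        seed (int): Seed for random number generator
        sample_id (str): Sample ID
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_refined (str): Path to refined assembly (FASTA)
        out_bt2 (str): Path to bowtie2 report for the final iteration
        out_summary (str): Path to refinement summary
    """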

    # Outputs
    out_refined = os.path.join(outdir, 'refined.fna')
    out_bt2 = os.path.join(outdir, 'refined_bt2.out')
    out_summary = os.path.join(outdir, 'refined_summary.out')

    #--- Initialize
    cur_asm = ref_fa
    cur_alnrate = None
    assemblies = [OrderedDict(), ]
    for s in SeqIO.parse(cur_asm, 'fasta'):
        assemblies[-1][s.id] = s
    
    # Message log for summary
    summary = [
        ['iteration', 'alnrate', 'diffs'] + ['diff:%s' % s for s in assemblies[0].keys()]
    ]

    # Seed random number generator
    random.seed(seed)
    
    for i in range(1, max_step+1):
        # Generate a refined assembly
        tmp_refined, tmp_bt2 = refine_assembly_step(
            fq1=fq1, fq2=fq2, fqU=fqU, ref_fa=cur_asm, outdir=outdir,
            iteration=i, subsample=subsample, sample_id=sample_id,
            ncpu=ncpu, xmx=xmx, keep_tmp=keep_tmp,
            quiet=True, logfile=logfile, debug=debug
        )

        # Check whether alignments are different
        diffs = OrderedDict()
        new_seqs = OrderedDict((s.id, s) for s in SeqIO.parse(tmp_refined, 'fasta'))
        for id1, seq1 in new_seqs.items():
            poss0 = [k for k in assemblies[-1].keys() if sequtils.seqid_match(id1, k)]
            if len(poss0) == 1:
                seq0 = assemblies[-1][poss0[0]]
            else:
                raise PipelineStepError("Could not match sequence %s" % id1)
            alns = pairwise2.align.globalms(seq1.seq, seq0.seq, 2, -1, -3, -1)
            d = min(sum(nc != cc for nc, cc in zip(t[0], t[1])) for t in alns)
            diffs[id1] = d

        total_diffs = sum(diffs.values())

        # Check new alignment rate
        with open(tmp_bt2, 'r') as fh:
            bt2str = fh.read()
            m = re.search('(\d+\.\d+)\% overall alignment rate', bt2str)
            if m is None:
                msg = "Alignment rate not found in bowtie2 output."
                msg += "Output file contents:\n%s\n" % bt2str
                msg += "Aborting."
                raise PipelineStepError(msg)
            else:
                new_alnrate = float(m.group(1))

        # Create messages for log
        row = [str(i), '%.02f' % new_alnrate, '%d' % total_diffs, ]
        for k0 in assemblies[0].keys():
            poss1 = [k for k in diffs.keys() if sequtils.seqid_match(k, k0)]
            if len(poss1) == 0:
                row.append('FAIL')
            elif len(poss1) == 1:
                row.append(str(diffs[poss1[0]]))
            else:
                raise PipelineStepError("Multiple matches for %s" % k0)
        ######row += list(map(str, diffs.values()))
        summary.append(row)

        # Create messages for console
        sysutils.log_message('\nRefinement result:\n', quiet, logfile)
        sysutils.log_message('\tDifferences:\n', quiet, logfile)
        for s,d in diffs.items():
            sysutils.log_message('\t\t%s\t%d\n' % (s,d), quiet, logfile)
        if total_diffs > 0:
            msg = '\t%d differences found with previous\n' % total_diffs
        else:
            msg = '\tNo differences with previous\n'
        sysutils.log_message(msg, quiet, logfile)

        if cur_alnrate is None:
            msg = '\tAlignment rate: %0.2f\n' % new_alnrate
        elif new_alnrate > cur_alnrate:
            msg = '\tAlignment rate has improved: '
            msg += '%.02f > %.02f\n' % (new_alnrate, cur_alnrate)
        else:
            msg = '\tAlignment rate has not improved: '
            msg += '%.02f <= %.02f\n' % (new_alnrate, cur_alnrate)
        sysutils.log_message(msg, quiet, logfile)

        # Decide whether to keep going
        keep_going = True
        if total_diffs == 0:
            keep_going = False
            sysutils.log_message('Stopping: no differences found\n', quiet, logfile)

        # We should also quit if the alignment rate does not improve.
        # However, subsampling reads can lead to changes in alignment rate
        # that can be ignored, so this check only applies when subsampling
        # is disabled.
        if subsample is None: # not subsampling
            if cur_alnrate is not None and new_alnrate <= cur_alnrate:
                keep_going = False
                msg = 'Stopping: alignment rate did not improve\n'
                sysutils.log_message(msg, quiet, logfile)
        
        cur_asm = tmp_refined
        cur_alnrate = new_alnrate
        assemblies.append(new_seqs)

        if not keep_going:
            break

    # Final outputs
    shutil.copy(cur_asm, out_refined)
    shutil.copy(tmp_bt2, out_bt2)

    with open(out_summary, 'w') as outh:
        print('\n'.join('\t'.join(r) for r in summary), file=outh)

    return out_refined, out_bt2, out_summary
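
# A standalone sketch (not from the original source) of the per-sequence
# difference count used above, with the same globalms scoring (match=2,
# mismatch=-1, gap open=-3, gap extend=-1) applied to two made-up sequences.
from Bio import pairwise2

_alns = pairwise2.align.globalms('ACGTACGT', 'ACGAACGT', 2, -1, -3, -1)
_d = min(sum(nc != cc for nc, cc in zip(t[0], t[1])) for t in _alns)
print(_d)  # 1 difference between the new and previous assembly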
Example #9
def predict_haplo(
    fq1=None,
    fq2=None,
    ref_fa=None,
    region_txt=None,
    outdir='.',
    min_readlength=36,
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to assemble haplotypes

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        ref_fa (str): Path to reference fasta file
        region_txt (str): Path to region file
        outdir (str): Path to output directory
        min_readlength (int): Minimum readlength passed to PredictHaplo
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        best_fa (list): (region, path) tuples giving the best haplotype FASTA for each region

    """
    # Check dependencies
    sysutils.check_dependency('PredictHaplo-Paired')
    sysutils.check_dependency('bwa')

    # Temporary directory
    tempdir = sysutils.create_tempdir('predict_haplo', None, quiet, logfile)

    # Load reference fasta
    refs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # Identify reconstruction regions
    regions = []
    if region_txt:
        sysutils.log_message('Found regions file.\n', quiet, logfile)
        for l in open(region_txt, 'r'):
            rname, spos, epos = sequtils.region_to_tuple(l.strip())
            if rname not in refs:
                raise PipelineStepError("ERROR: reference %s not valid" %
                                        rname)
            spos = 1 if spos is None else spos
            epos = len(refs[rname]) if epos is None else epos
            regions.append(('PH%02d' % (len(regions) + 1), rname, spos, epos))
    else:
        for rname, s in refs.items():
            regions.append(('PH%02d' % (len(regions) + 1), rname, 1, len(s)))

    sysutils.log_message('[--- Haplotype Reconstruction Regions ---]\n', quiet,
                         logfile)
    for iv in regions:
        sysutils.log_message('%s -- %s:%d-%d\n' % iv, quiet, logfile)

    # Create alignment for each REFERENCE in the reconstruction regions
    alnmap = {}
    for ph, rname, spos, epos in regions:
        if rname not in alnmap:
            # Create alignment
            tmp_ref_fa = os.path.join(tempdir, 'ref.%d.fa' % len(alnmap))
            tmp_sam = os.path.join(tempdir, 'aligned.%d.sam' % len(alnmap))
            SeqIO.write(refs[rname], tmp_ref_fa, 'fasta')
            cmd1 = [
                'bwa',
                'index',
                tmp_ref_fa,
            ]
            cmd2 = [
                'bwa',
                'mem',
                tmp_ref_fa,
                fq1,
                fq2,
                '|',
                'samtools',
                'view',
                '-h',
                '-F',
                '12',
                '>',
                tmp_sam,
            ]
            cmd3 = ['rm', '-f', '%s.*' % tmp_ref_fa]
            sysutils.command_runner([cmd1, cmd2, cmd3], 'predict_haplo:setup',
                                    quiet, logfile, debug)
            alnmap[rname] = (tmp_ref_fa, tmp_sam)

    best_fa = []
    # Run PredictHaplo for each REGION
    for ph, rname, spos, epos in regions:
        msg = "Reconstruction region %s:" % ph
        msg += " %s:%d-%d\n" % (rname, spos, epos)
        sysutils.log_message(msg, quiet, logfile)

        # Construct params specific for region
        reg_params = dict(DEFAULTS)
        reg_params['min_readlength'] = min_readlength
        reg_params['reconstruction_start'] = spos
        reg_params['reconstruction_stop'] = epos
        reg_params['prefix'] = '%s_out.' % ph

        # Lookup reference and alignment filename
        reg_params['ref_fasta'] = os.path.basename(alnmap[rname][0])
        reg_params['alignment'] = os.path.basename(alnmap[rname][1])

        # Create config file for region
        config_file = '%s.config' % ph
        with open(os.path.join(tempdir, config_file), 'w') as outh:
            tmpconfig = config_template % reg_params
            print(tmpconfig.replace('###', '%'), file=outh)
        try:
            # Run PredictHaplo
            cmd1 = [
                'cd',
                tempdir,
            ]
            cmd2 = [
                'PredictHaplo-Paired', config_file, '&>',
                '%s.log' % config_file
            ]
            sysutils.command_runner([
                cmd1,
                cmd2,
            ], 'predict_haplo:%s' % ph, quiet, logfile, debug)

            # Copy files
            dest = os.path.join(outdir, ph)
            if not os.path.exists(dest):
                os.makedirs(dest)
            shutil.copy(os.path.join(tempdir, '%s.config.log' % ph), dest)
            for f in glob(os.path.join(tempdir, '%s_out*global*.fas' % ph)):
                shutil.copy(f, dest)
            for f in glob(os.path.join(tempdir, '%s_out*global*.html' % ph)):
                shutil.copy(f, dest)
            bf, bh = rename_best(dest, ph)
            best_fa.append((ph, bf))
        except PipelineStepError as e:
            print(e, file=sys.stderr)
            if e.returncode == 139:
                print("PredictHaplo segfaulted", file=sys.stderr)
            best_fa.append((ph, None))

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'predict_haplo', quiet, logfile)

    return best_fa
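
# A hypothetical usage sketch, not part of the original module: placeholder
# paths; with no region file, every sequence in refs.fasta becomes its own
# reconstruction region and (region, FASTA path) pairs are returned.
if __name__ == '__main__':
    for region, fasta in predict_haplo(fq1='sample_1.fastq',
                                       fq2='sample_2.fastq',
                                       ref_fa='refs.fasta',
                                       outdir='ph_out',
                                       min_readlength=36):
        print(region, fasta)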