예제 #1
0
파일: assembly.py 프로젝트: andrewjpage/iva
    def _extend_contigs_with_bam(self, bam_in, out_prefix=None, output_all_useful_reads=False):
        '''Extend the contigs using the read pairs found in a BAM file.

        Records are read from bam_in two at a time and each pair is classified
        by mapping.get_pair_type(). For a read typed CAN_EXTEND_LEFT, the first
        `clipped` bases of the read (where `clipped` is the first value
        returned by mapping.soft_clipped()) are added to its contig as a left
        kmer. For a read typed CAN_EXTEND_RIGHT, the read bases from sam.qend
        onwards are added as a right kmer. Afterwards every contig in
        self.contigs is extended and its new length is recorded in
        self.contig_lengths.

        Args:
            bam_in: name of the BAM file, opened with pysam.Samfile in "rb" mode.
            out_prefix: if not None, reads whose type is being kept are written
                in FASTA format to out_prefix + '_1.fa' (when sam.is_read1 is
                true) or out_prefix + '_2.fa' (otherwise).
            output_all_useful_reads: if true, reads typed BOTH_UNMAPPED are
                written as well as the CAN_EXTEND_LEFT, CAN_EXTEND_RIGHT and
                KEEP reads.

        Returns:
            The total number of bases added, summed over all contigs.
        '''
        keep_read_types = {mapping.CAN_EXTEND_LEFT, mapping.CAN_EXTEND_RIGHT, mapping.KEEP}
        if output_all_useful_reads:
            keep_read_types.add(mapping.BOTH_UNMAPPED)

        fa_out1 = None
        fa_out2 = None
        sam_reader = None

        # Everything opened here is released in the finally clause, so a
        # failure part way through the BAM does not leak the reader or leave
        # the FASTA files open.
        try:
            if out_prefix is not None:
                fa_out1 = pyfastaq.utils.open_file_write(out_prefix + '_1.fa')
                fa_out2 = pyfastaq.utils.open_file_write(out_prefix + '_2.fa')

            sam_reader = pysam.Samfile(bam_in, "rb")

            # The first record of each pair is held in previous_sam until the
            # next record arrives. A trailing record with no partner is never
            # processed.
            # NOTE(review): this assumes mates are adjacent in the BAM - confirm.
            previous_sam = None

            for current_sam in sam_reader.fetch(until_eof=True):
                if previous_sam is None:
                    previous_sam = current_sam
                    continue

                previous_type, current_type = mapping.get_pair_type(
                    previous_sam,
                    current_sam,
                    self._get_ref_length_sam_pair(sam_reader, previous_sam, current_sam),
                    self.max_insert,
                    min_clip=self.min_clip,
                )

                for sam, sam_type in [(previous_sam, previous_type), (current_sam, current_type)]:
                    if sam_type == mapping.CAN_EXTEND_LEFT:
                        name = mapping.get_ref_name(sam, sam_reader)
                        clipped = mapping.soft_clipped(sam)[0]
                        self.contigs[name].add_left_kmer(common.decode(sam.seq[:clipped]))
                    elif sam_type == mapping.CAN_EXTEND_RIGHT:
                        name = mapping.get_ref_name(sam, sam_reader)
                        self.contigs[name].add_right_kmer(common.decode(sam.seq[sam.qend:]))

                    if out_prefix is not None and sam_type in keep_read_types:
                        if sam.is_read1:
                            print(mapping.sam_to_fasta(sam), file=fa_out1)
                        else:
                            print(mapping.sam_to_fasta(sam), file=fa_out2)

                previous_sam = None
        finally:
            if sam_reader is not None:
                sam_reader.close()
            for handle in (fa_out1, fa_out2):
                if handle is not None:
                    pyfastaq.utils.close(handle)

        total_bases_added = 0

        for ctg in self.contigs:
            left_length, right_length = self.contigs[ctg].extend(self.ext_min_cov, self.ext_min_ratio, self.ext_bases)
            if self.verbose:
                print('    extend contig ' +  ctg, 'new_length:' + str(len(self.contigs[ctg])), 'added_left:' + str(left_length), 'added_right:' + str(right_length), sep='\t')
            # Each entry records [new contig length, bases added left, bases added right]
            self.contig_lengths[ctg].append([len(self.contigs[ctg]), left_length, right_length])
            total_bases_added += left_length + right_length

        return total_bases_added
예제 #2
0
파일: assembly.py 프로젝트: xinggui007/iva
    def _extend_contigs_with_bam(self, bam_in, out_prefix=None, output_all_useful_reads=False):
        '''Extend the contigs using the read pairs found in a BAM file.

        Records are read from bam_in two at a time and each pair is classified
        by mapping.get_pair_type(). For a read typed CAN_EXTEND_LEFT, the first
        `clipped` bases of the read (where `clipped` is the first value
        returned by mapping.soft_clipped()) are added to its contig as a left
        kmer. For a read typed CAN_EXTEND_RIGHT, the read bases from sam.qend
        onwards are added as a right kmer. Afterwards every contig in
        self.contigs is extended and its new length is recorded in
        self.contig_lengths.

        Args:
            bam_in: name of the BAM file, opened with pysam.Samfile in "rb" mode.
            out_prefix: if not None, reads whose type is being kept are written
                in FASTA format to out_prefix + '_1.fa' (when sam.is_read1 is
                true) or out_prefix + '_2.fa' (otherwise).
            output_all_useful_reads: if true, reads typed BOTH_UNMAPPED are
                written as well as the CAN_EXTEND_LEFT, CAN_EXTEND_RIGHT and
                KEEP reads.

        Returns:
            The total number of bases added, summed over all contigs.
        '''
        keep_read_types = {mapping.CAN_EXTEND_LEFT, mapping.CAN_EXTEND_RIGHT, mapping.KEEP}
        if output_all_useful_reads:
            keep_read_types.add(mapping.BOTH_UNMAPPED)

        fa_out1 = None
        fa_out2 = None
        sam_reader = None

        # Everything opened here is released in the finally clause, so a
        # failure part way through the BAM does not leak the reader or leave
        # the FASTA files open.
        try:
            if out_prefix is not None:
                fa_out1 = pyfastaq.utils.open_file_write(out_prefix + '_1.fa')
                fa_out2 = pyfastaq.utils.open_file_write(out_prefix + '_2.fa')

            sam_reader = pysam.Samfile(bam_in, "rb")

            # The first record of each pair is held in previous_sam until the
            # next record arrives. A trailing record with no partner is never
            # processed.
            # NOTE(review): this assumes mates are adjacent in the BAM - confirm.
            previous_sam = None

            for current_sam in sam_reader.fetch(until_eof=True):
                if previous_sam is None:
                    previous_sam = current_sam
                    continue

                previous_type, current_type = mapping.get_pair_type(
                    previous_sam,
                    current_sam,
                    self._get_ref_length_sam_pair(sam_reader, previous_sam, current_sam),
                    self.max_insert,
                    min_clip=self.min_clip,
                )

                for sam, sam_type in [(previous_sam, previous_type), (current_sam, current_type)]:
                    if sam_type == mapping.CAN_EXTEND_LEFT:
                        name = mapping.get_ref_name(sam, sam_reader)
                        clipped = mapping.soft_clipped(sam)[0]
                        self.contigs[name].add_left_kmer(common.decode(sam.seq[:clipped]))
                    elif sam_type == mapping.CAN_EXTEND_RIGHT:
                        name = mapping.get_ref_name(sam, sam_reader)
                        self.contigs[name].add_right_kmer(common.decode(sam.seq[sam.qend:]))

                    if out_prefix is not None and sam_type in keep_read_types:
                        if sam.is_read1:
                            print(mapping.sam_to_fasta(sam), file=fa_out1)
                        else:
                            print(mapping.sam_to_fasta(sam), file=fa_out2)

                previous_sam = None
        finally:
            if sam_reader is not None:
                sam_reader.close()
            for handle in (fa_out1, fa_out2):
                if handle is not None:
                    pyfastaq.utils.close(handle)

        total_bases_added = 0

        for ctg in self.contigs:
            left_length, right_length = self.contigs[ctg].extend(self.ext_min_cov, self.ext_min_ratio, self.ext_bases)
            if self.verbose:
                print('    extend contig ' +  ctg, 'new_length:' + str(len(self.contigs[ctg])), 'added_left:' + str(left_length), 'added_right:' + str(right_length), sep='\t')
            # Each entry records [new contig length, bases added left, bases added right]
            self.contig_lengths[ctg].append([len(self.contigs[ctg]), left_length, right_length])
            total_bases_added += left_length + right_length

        return total_bases_added
예제 #3
0
파일: mapping.py 프로젝트: satta/iva
def find_incorrect_ref_bases(bam, ref_fasta):
    '''Find reference positions whose base differs from the read consensus.

    Runs "samtools mpileup" on bam, and for every reported position compares
    the reference base from ref_fasta with the consensus computed by
    consensus_base_both_strands() from the pileup characters.

    Args:
        bam: name of the BAM file; must exist.
        ref_fasta: name of the reference FASTA file; must exist.

    Returns:
        A dict mapping reference sequence name to a list of tuples
        (zero-based position, reference base, consensus base). Only
        sequences with at least one disagreeing position appear as keys.
    '''
    assert os.path.exists(bam)
    assert os.path.exists(ref_fasta)
    forward_keys = set(['A', 'C', 'G', 'T', 'N'])
    reverse_keys = set(['a', 'c', 'g', 't', 'n'])
    ref_seqs = {}
    bad_bases = {}
    pyfastaq.tasks.file_to_dict(ref_fasta, ref_seqs)
    # Columns 1, 2 and 5 of the mpileup output are kept: sequence name,
    # one-based position and the pileup string.
    mpileup_cmd = 'samtools mpileup ' + bam + ' | cut -f 1,2,5'
    mpileup_out = common.decode(subprocess.Popen(mpileup_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).communicate()[0]).split('\n')[:-1]

    for line in mpileup_out:
        # Sometimes mpileup has an empty bases column, so the line does not
        # split into exactly three fields. Skip those lines.
        try:
            refname, position, pileup = line.rstrip().split()
        except ValueError:
            continue

        assert refname in ref_seqs
        position = int(position) - 1  # mpileup positions are one-based
        pileup = strip_mpileup_coverage_string(pileup)
        counts = collections.Counter(pileup)
        consensus = consensus_base_both_strands(counts, forward_keys, reverse_keys, ratio=0.5)
        ref_base = ref_seqs[refname][position]

        # A consensus of None is not treated as a disagreement.
        if consensus not in [None, ref_base]:
            bad_bases.setdefault(refname, []).append((position, ref_base, consensus))

    return bad_bases
예제 #4
0
def get_version(prog, must_be_in_path=True):
    '''Return the version string reported by an external program.

    prog must be a key of prog_to_version_cmd, whose value is a pair
    (shell command, compiled regular expression). The command is run and
    the first line of its stdout, then stderr, that the expression matches
    supplies the version as group 1.

    Args:
        prog: name of the program to query.
        must_be_in_path: if true, raise Error when is_in_path(prog) is false;
            otherwise return 'UNKNOWN - not in path' in that situation.

    Returns:
        The version string, or a message starting with 'UNKNOWN' when the
        program is not in the path or no output line matched.

    Raises:
        Error: if the program is not in the path and must_be_in_path is true.
    '''
    assert prog in prog_to_version_cmd

    if not is_in_path(prog):
        if not must_be_in_path:
            return 'UNKNOWN - not in path'
        raise Error('Error getting version of ' + prog + ' - not found in path.')

    version_cmd, version_regex = prog_to_version_cmd[prog]
    process = subprocess.Popen(version_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout_data, stderr_data = process.communicate()

    # Search stdout first, then stderr. The final element of each split is
    # dropped, as it is the text after the last newline.
    output_lines = []
    for stream in (stdout_data, stderr_data):
        output_lines.extend(common.decode(stream).split('\n')[:-1])

    searches = (version_regex.search(line) for line in output_lines)
    first_hit = next((hit for hit in searches if hit), None)
    if first_hit is not None:
        return first_hit.group(1)

    return ''.join([
        'UNKNOWN ...\n    I tried running this to get the version: "',
        version_cmd,
        '"\n    and the output didn\'t match this regular expression: "',
        version_regex.pattern,
        '"',
    ])
예제 #5
0
def get_version(prog, must_be_in_path=True):
    '''Return the version string reported by an external program.

    prog must be a key of prog_to_version_cmd, whose value is a pair
    (shell command, compiled regular expression). The command is run and
    the first line of its stdout, then stderr, that the expression matches
    supplies the version as group 1.

    Args:
        prog: name of the program to query.
        must_be_in_path: if true, raise Error when is_in_path(prog) is false;
            otherwise return 'UNKNOWN - not in path' in that situation.

    Returns:
        The version string, or a message starting with 'UNKNOWN' when the
        program is not in the path or no output line matched.

    Raises:
        Error: if the program is not in the path and must_be_in_path is true.
    '''
    assert prog in prog_to_version_cmd

    if not is_in_path(prog):
        if not must_be_in_path:
            return 'UNKNOWN - not in path'
        raise Error('Error getting version of ' + prog + ' - not found in path.')

    version_cmd, version_regex = prog_to_version_cmd[prog]
    process = subprocess.Popen(version_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout_data, stderr_data = process.communicate()

    # Search stdout first, then stderr. The final element of each split is
    # dropped, as it is the text after the last newline.
    output_lines = []
    for stream in (stdout_data, stderr_data):
        output_lines.extend(common.decode(stream).split('\n')[:-1])

    searches = (version_regex.search(line) for line in output_lines)
    first_hit = next((hit for hit in searches if hit), None)
    if first_hit is not None:
        return first_hit.group(1)

    return ''.join([
        'UNKNOWN ...\n    I tried running this to get the version: "',
        version_cmd,
        '"\n    and the output didn\'t match this regular expression: "',
        version_regex.pattern,
        '"',
    ])
예제 #6
0
파일: kcount.py 프로젝트: Fridge004/iva
def _kmc_to_kmer_counts(infile, number, kmers_to_ignore=None, contigs_to_check=None, verbose=0, threads=1):
    '''Makes a dict of the most common kmers from the kmer counts output file of kmc.

    If _write_ref_seqs_to_be_checked() reports that there are no reference
    sequences to check against, the first `number` kmers are read straight
    from infile. Otherwise the kmers are mapped to the reference sequences
    with mapping.map_reads() and the first `number` records of the resulting
    BAM file that are not in kmers_to_ignore are used.

    Args:
        infile: name of the kmc counts file; each line is "kmer count".
        number: maximum number of kmers to return.
        kmers_to_ignore: optional collection of kmers to leave out; may be None.
        contigs_to_check: passed to _write_ref_seqs_to_be_checked().
        verbose: verbosity level; messages are printed when > 2 (and when
            >= 4 for skipped kmers).
        threads: passed to mapping.map_reads().

    Returns:
        A dict mapping kmer to count. It is empty if infile is empty.

    Raises:
        Error: if a line of infile, or a read name in the BAM file, cannot be
            parsed.
    '''
    counts = {}
    if os.path.getsize(infile) == 0:
        return counts

    tmpdir = tempfile.mkdtemp(prefix='tmp.common_kmers.', dir=os.getcwd())

    # The temporary directory is removed even when parsing or mapping fails.
    try:
        ref_seqs_file = os.path.join(tmpdir, 'ref.fa')
        counts_fasta_file = os.path.join(tmpdir, 'counts.fa')
        using_refs = _write_ref_seqs_to_be_checked(ref_seqs_file, kmers_to_ignore=kmers_to_ignore, contigs_to_check=contigs_to_check)

        if not using_refs:
            if verbose > 2:
                print('No existing kmers or contigs to check against. Using most common kmer for seed', flush=True)
            f = pyfastaq.utils.open_file_read(infile)
            try:
                for line in f:
                    if len(counts) >= number:
                        break
                    try:
                        kmer, count = line.rstrip().split()
                        count = int(count)
                    except ValueError:
                        raise Error('Error getting kmer info from this line:\n' + line)

                    counts[kmer] = count
            finally:
                pyfastaq.utils.close(f)
        else:
            if verbose > 2:
                print('Existing kmers or contigs to check against. Running mapping', flush=True)
            mapping_prefix = os.path.join(tmpdir, 'map')
            bam = mapping_prefix + '.bam'
            _counts_file_to_fasta(infile, counts_fasta_file)
            # NOTE(review): required_flag='0x4' presumably restricts the BAM to
            # unmapped kmers, i.e. those not matching the references - confirm
            # against mapping.map_reads().
            mapping.map_reads(counts_fasta_file, None, ref_seqs_file, mapping_prefix, minid=0.9, index_k=9, index_s=1, sort=False, verbose=verbose, required_flag='0x4', threads=threads)

            sam_reader = pysam.Samfile(bam, "rb")
            try:
                for sam in sam_reader.fetch(until_eof=True):
                    if len(counts) >= number:
                        break
                    try:
                        # NOTE(review): the count is kept as a string here,
                        # whereas the branch above stores an int - confirm
                        # what callers expect before changing it.
                        count = sam.qname.split('_')[1]
                    except IndexError:
                        raise Error('Error getting count from sequence name in bam:\n' + sam.qname)

                    nucleotides = common.decode(sam.seq)
                    # kmers_to_ignore defaults to None; this branch can still be
                    # reached in that case, so None means nothing is ignored.
                    if kmers_to_ignore is None or nucleotides not in kmers_to_ignore:
                        counts[nucleotides] = count
                    elif verbose >= 4:
                        print('Skipping seed already found:', nucleotides)
            finally:
                sam_reader.close()
    finally:
        shutil.rmtree(tmpdir)

    return counts
예제 #7
0
파일: kcount.py 프로젝트: satta/iva
def _kmc_to_kmer_counts(infile, number, kmers_to_ignore=None, contigs_to_check=None, verbose=0, threads=1):
    '''Makes a dict of the most common kmers from the kmer counts output file of kmc.

    If _write_ref_seqs_to_be_checked() reports that there are no reference
    sequences to check against, the first `number` kmers are read straight
    from infile. Otherwise the kmers are mapped to the reference sequences
    with mapping.map_reads() and the first `number` records of the resulting
    BAM file that are not in kmers_to_ignore are used.

    Args:
        infile: name of the kmc counts file; each line is "kmer count".
        number: maximum number of kmers to return.
        kmers_to_ignore: optional collection of kmers to leave out; may be None.
        contigs_to_check: passed to _write_ref_seqs_to_be_checked().
        verbose: verbosity level; messages are printed when > 2 (and when
            >= 4 for skipped kmers).
        threads: passed to mapping.map_reads().

    Returns:
        A dict mapping kmer to count. It is empty if infile is empty.

    Raises:
        Error: if a line of infile, or a read name in the BAM file, cannot be
            parsed.
    '''
    counts = {}
    if os.path.getsize(infile) == 0:
        return counts

    tmpdir = tempfile.mkdtemp(prefix='tmp.common_kmers.', dir=os.getcwd())

    # The temporary directory is removed even when parsing or mapping fails.
    try:
        ref_seqs_file = os.path.join(tmpdir, 'ref.fa')
        counts_fasta_file = os.path.join(tmpdir, 'counts.fa')
        using_refs = _write_ref_seqs_to_be_checked(ref_seqs_file, kmers_to_ignore=kmers_to_ignore, contigs_to_check=contigs_to_check)

        if not using_refs:
            if verbose > 2:
                print('No existing kmers or contigs to check against. Using most common kmer for seed', flush=True)
            f = pyfastaq.utils.open_file_read(infile)
            try:
                for line in f:
                    if len(counts) >= number:
                        break
                    try:
                        kmer, count = line.rstrip().split()
                        count = int(count)
                    except ValueError:
                        raise Error('Error getting kmer info from this line:\n' + line)

                    counts[kmer] = count
            finally:
                pyfastaq.utils.close(f)
        else:
            if verbose > 2:
                print('Existing kmers or contigs to check against. Running mapping', flush=True)
            mapping_prefix = os.path.join(tmpdir, 'map')
            bam = mapping_prefix + '.bam'
            _counts_file_to_fasta(infile, counts_fasta_file)
            # NOTE(review): required_flag='0x4' presumably restricts the BAM to
            # unmapped kmers, i.e. those not matching the references - confirm
            # against mapping.map_reads().
            mapping.map_reads(counts_fasta_file, None, ref_seqs_file, mapping_prefix, minid=0.9, index_k=9, index_s=1, sort=False, verbose=verbose, required_flag='0x4', threads=threads)

            sam_reader = pysam.Samfile(bam, "rb")
            try:
                for sam in sam_reader.fetch(until_eof=True):
                    if len(counts) >= number:
                        break
                    try:
                        # NOTE(review): the count is kept as a string here,
                        # whereas the branch above stores an int - confirm
                        # what callers expect before changing it.
                        count = sam.qname.split('_')[1]
                    except IndexError:
                        raise Error('Error getting count from sequence name in bam:\n' + sam.qname)

                    nucleotides = common.decode(sam.seq)
                    # kmers_to_ignore defaults to None; this branch can still be
                    # reached in that case, so None means nothing is ignored.
                    if kmers_to_ignore is None or nucleotides not in kmers_to_ignore:
                        counts[nucleotides] = count
                    elif verbose >= 4:
                        print('Skipping seed already found:', nucleotides)
            finally:
                sam_reader.close()
    finally:
        shutil.rmtree(tmpdir)

    return counts
예제 #8
0
파일: mapping.py 프로젝트: satta/iva
def sam_to_fasta(s):
    '''Convert one read of a pair from a SAM record into a pyfastaq Fasta object.

    The sequence is named after the read, with '/1' or '/2' appended
    according to whether s.is_read1 or s.is_read2 is set. If s.is_reverse is
    set, the sequence is reverse complemented.

    Args:
        s: SAM record providing qname, seq, is_read1, is_read2 and is_reverse.

    Returns:
        A pyfastaq.sequences.Fasta object.

    Raises:
        Error: if the record is flagged as neither first nor second of a pair.
    '''
    name = s.qname
    if s.is_read1:
        name += '/1'
    elif s.is_read2:
        name += '/2'
    else:
        # Build a single message string, so that the exception text reads as a
        # sentence instead of a tuple of arguments.
        raise Error('Read ' + name + ' must be first or second of pair according to flag. Cannot continue')

    seq = pyfastaq.sequences.Fasta(name, common.decode(s.seq))
    if s.is_reverse:
        seq.revcomp()

    return seq
예제 #9
0
파일: mapping.py 프로젝트: satta/iva
def get_bam_region_coverage(bam, seqname, seq_length, rev=False, verbose=0, both_strands=False):
    '''Return the per-base read depth of one sequence in an indexed BAM file.

    The depths come from "samtools mpileup -r seqname". Positions that
    mpileup does not report are given a depth of zero.

    Args:
        bam: name of the BAM file; it and its index (bam + '.bai') must exist.
        seqname: name of the sequence, used as the mpileup region.
        seq_length: length of the sequence, which is the length of the result.
        rev: when both_strands is false, chooses between passing
            '--rf 0x10' (true) and '--ff 0x10' (false) to mpileup.
        verbose: the mpileup command is printed when this is >= 2.
        both_strands: if true, no flag option is passed to mpileup and rev is
            not used.

    Returns:
        A list of seq_length ints, where element i is the depth at one-based
        position i + 1.
    '''
    assert os.path.exists(bam)
    assert os.path.exists(bam + '.bai')

    # mpileup only lists positions that have non-zero coverage, so start from
    # all zeros and fill in the positions that it does report.
    coverage = [0] * seq_length

    if both_strands:
        strand_option = ''
    else:
        strand_option = '--rf 0x10' if rev else '--ff 0x10'

    mpileup_cmd = ' '.join(['samtools mpileup -r', seqname, strand_option, bam, '| cut -f 2,4'])
    if verbose >= 2:
        print('    get_bam_region_coverage:', mpileup_cmd)

    process = subprocess.Popen(mpileup_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    stdout_text = common.decode(process.communicate()[0])

    # Each line holds columns 2 and 4 of the mpileup output: the one-based
    # position and the depth. The last element of the split is the text after
    # the final newline, so it is dropped.
    for line in stdout_text.split('\n')[:-1]:
        position, depth = map(int, line.rstrip().split())
        coverage[position - 1] = depth

    return coverage