import glob
import logging
import os

from tqdm import tqdm

import bioinf_utils  # project-local FASTA helpers


def reads_train_test_split(ref_root, test_size, ref_path):
    reference_len = len(bioinf_utils.read_fasta(ref_path))
    train_size = 1 - test_size

    files = glob.glob(os.path.join(ref_root, '*.ref'))
    train_path = os.path.join(ref_root, 'train.txt')
    test_path = os.path.join(ref_root, 'test.txt')

    with open(train_path, 'w') as trainf, open(test_path, 'w') as testf:
        for file_path in tqdm(files):
            basename = os.path.basename(file_path)
            name, ext = os.path.splitext(basename)

            with open(file_path, 'r') as fin:
                next(fin)  # skip header
                # line 2: abs_start_pos\tstart_position\tlength
                line = next(fin)
                start, rel_start, length = [int(x) for x in line.split()]
                end = start + length
                if end < reference_len * train_size:
                    # train data
                    trainf.write("%s\t%d\n" % (name, length))

                elif start > reference_len * train_size and end <= reference_len:
                    # starts after the split point and does not wrap around
                    # the reference end in case of a circular alignment
                    # test data
                    testf.write("%s\t%d\n" % (name, length))
                else:
                    logging.info('Skipping %s: overlaps the train/test split',
                                 name)
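
A minimal usage sketch (file names are hypothetical): with test_size=0.2, reads aligned entirely within the first 80% of the reference go to train.txt and reads starting after that point go to test.txt, both written under ref_root.

reads_train_test_split('refs/', test_size=0.2, ref_path='refs/ecoli.fasta')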
Example #2
import logging
import os
import shutil
import tempfile

import pysam
from tqdm import tqdm

import bioinf_utils as butil  # project-local helpers (alias assumed)


def extend_cigars_in_sam(sam_in, ref_path, fastx_path, sam_out=None):
    tmp_dir = None
    tmp_sam_out = sam_out
    inplace = sam_out is None

    if inplace:
        # inplace change using tmp file
        tmp_dir = tempfile.mkdtemp()
        tmp_sam_out = os.path.join(tmp_dir, 'tmp.sam')

    ref = butil.read_fasta(ref_path)
    reads = {}

    with pysam.FastxFile(fastx_path) as fh:
        for r in fh:
            reads[r.name] = r

    with pysam.AlignmentFile(sam_in, "r") as in_sam, \
            pysam.AlignmentFile(tmp_sam_out, "w", template=in_sam) as out_sam:

        # plain SAM has no index, so read sequentially until EOF
        for x in tqdm(in_sam.fetch(until_eof=True), unit='reads'):
            if x.query_name not in reads:
                logging.warning("read %s in sam not found in .fastx",
                                x.query_name)
                continue

            if x.is_unmapped:
                logging.warning("read %s is unmapped, copy to out sam as is",
                                x.query_name)
                out_sam.write(x)
                continue

            read_seq = reads[x.query_name].sequence
            # assumes a single-contig reference returned as one string
            ref_seq = ref[x.reference_start:x.reference_end]
            cigar_pairs = x.cigartuples

            if x.is_reverse:
                # SAM stores reads in reference orientation
                read_seq = butil.reverse_complement(read_seq)

            x.cigarstring = extend_cigar(read_seq, ref_seq, cigar_pairs)
            out_sam.write(x)

    if inplace:
        # clear tmp files
        shutil.move(tmp_sam_out, sam_in)
        shutil.rmtree(tmp_dir)
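
A hedged usage sketch (paths are hypothetical); omitting sam_out rewrites the input SAM in place through a temporary file.

extend_cigars_in_sam('aligned.sam', 'refs/ecoli.fasta', 'reads.fastq',
                     sam_out='aligned.extended.sam')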
Example #3
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

import bioinf_utils as butil  # project-local helpers (alias assumed)

# process_mpileup_line is assumed to be defined elsewhere in this module


def process_mpileup(name, alignments_path, reference_path, mpileup_path,
                    coverage_threshold, output_prefix):
    def _nlines(path):
        with open(path, 'r') as f:
            n_lines = sum(1 for _ in f)
        return n_lines

    n_lines = _nlines(mpileup_path)
    with open(mpileup_path, 'r') as fp:

        # counts: snp_count, insertion_count, deletion_count,
        # num_undercovered_bases, num_called_bases, num_correct_bases,
        # coverage_sum
        counts = np.zeros((7, ))

        fp_variant = None
        fp_vcf = None

        if output_prefix:
            os.makedirs(output_prefix, exist_ok=True)

            variant_file = os.path.join(
                output_prefix, 'cov_%d.variant.csv' % coverage_threshold)
            fp_variant = open(variant_file, 'w')

            vcf_file = os.path.join(output_prefix,
                                    'cov_%d.variant.vcf' % coverage_threshold)
            fp_vcf = open(vcf_file, 'w')

            fp_vcf.write('##fileformat=VCFv4.0\n')
            fp_vcf.write('##fileDate=20150409\n')
            fp_vcf.write('##source=none\n')
            fp_vcf.write('##reference=%s\n' % reference_path)
            fp_vcf.write(
                '##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth">\n'
            )
            fp_vcf.write(
                '##INFO=<ID=TYPE,Number=A,Type=String,Description="Type of each allele (snp, ins, del, mnp, complex)">\n'
            )
            fp_vcf.write(
                '##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">\n'
            )
            fp_vcf.write(
                '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position">\n'
            )
            fp_vcf.write(
                '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">\n'
            )
            fp_vcf.write(
                '##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">\n'
            )
            fp_vcf.write(
                '##INFO=<ID=CONSVAR,Number=0,Type=Flag,Description="Indicates that the variant is a consensus variant (as opposed to a low frequency variant).">\n'
            )
            fp_vcf.write(
                '##INFO=<ID=HRUN,Number=1,Type=Integer,Description="Homopolymer length to the right of report indel position">\n'
            )
            fp_vcf.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n')
            fp_vcf.flush()

        i = 0
        num_bases_to_skip = 0

        for line in tqdm(fp, total=n_lines, desc="processing_mpileup"):
            if num_bases_to_skip > 0:
                num_bases_to_skip -= 1
                continue

            num_bases_to_skip, new_counts = process_mpileup_line(
                line, coverage_threshold, fp_variant, fp_vcf)
            counts += new_counts

            i += num_bases_to_skip
            i += 1

        if fp_variant:
            fp_variant.close()

        if fp_vcf:
            fp_vcf.close()

        # transform the coverage sum into average coverage
        counts[-1] /= (i + 1)

        fields = [
            'alignments_file', 'mpileup_file', 'coverage_threshold',
            'snp_count', 'insertion_count', 'deletion_count',
            'num_undercovered_bases', 'num_called_bases', 'num_correct_bases',
            'average_coverage'
        ]
        values = ([alignments_path, mpileup_path, coverage_threshold] +
                  counts.tolist())
        report = pd.DataFrame([values], columns=fields, index=[name])
        report['num_called_bases'] = (report.num_correct_bases +
                                      report.snp_count +
                                      report.insertion_count)

        reference_len = len(butil.read_fasta(reference_path))
        for col in filter(lambda c: c.endswith('_count'), report.columns):
            new_col = col.replace('count', 'rate')
            report[new_col] = 100 * report[col] / report.num_called_bases

        report['correct_rate'] = (100 * report.num_correct_bases /
                                  report.num_called_bases)
        report['identity_percentage'] = (100 * report.num_correct_bases /
                                         reference_len)

        if output_prefix:
            summary_file = os.path.join(output_prefix,
                                        'cov_%d.sum.vcf' % coverage_threshold)
            report.to_csv(summary_file, sep=';', index=False)
        return report
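
A usage sketch (paths are hypothetical); the mpileup input can be produced beforehand with e.g. samtools mpileup -f refs/ecoli.fasta aligned.bam > aligned.mpileup.

report = process_mpileup('sample-1', 'aligned.bam', 'refs/ecoli.fasta',
                         'aligned.mpileup', coverage_threshold=10,
                         output_prefix='out')
print(report.T)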