示例#1
0
    def __init__(self, in_filenames, extractor, reads_interleaved, r1_length, r2_length):
        """Open the FASTQ file(s) holding feature reads and build the read iterator.

        When no usable input file is found, self.in_fastqs stays None and
        self.in_iter stays an empty iterator.

        Args:
          in_filenames (dict of str -> str): Map of paths to fastq files
          extractor (FeatureExtractor): for extracting feature barcodes; also
              supplies the relevant read types via get_read_types()
          reads_interleaved: when truthy, a single file is opened and the
              second slot of self.in_fastqs is None
          r1_length: passed through to get_feature_generator_fastq
          r2_length: passed through to get_feature_generator_fastq
        """

        self.in_fastqs = None
        self.in_iter = iter([])

        # Relevant read types
        read_types = extractor.get_read_types()

        if not in_filenames:
            return

        in_filenames = get_fastqs_from_feature_ref(in_filenames,
                                                   reads_interleaved,
                                                   read_types)
        if in_filenames == (None, None):
            return

        if reads_interleaved:
            # Both reads live in one file; use whichever slot is populated.
            filename = in_filenames[0] if in_filenames[0] else in_filenames[1]
            # Test the path itself. The previous guard indexed its first
            # character (filename[0]), which raises on an empty or None path.
            self.in_fastqs = (cr_io.open_maybe_gzip(filename, 'r') if filename else None,
                              None)
        else:
            self.in_fastqs = (cr_io.open_maybe_gzip(in_filenames[0], 'r') if in_filenames[0] else None,
                              cr_io.open_maybe_gzip(in_filenames[1], 'r') if in_filenames[1] else None)

        self.in_iter = get_feature_generator_fastq(files=self.in_fastqs,
                                                   extractor=extractor,
                                                   interleaved=reads_interleaved,
                                                   read_types=read_types,
                                                   r1_length=r1_length,
                                                   r2_length=r2_length)
示例#2
0
def main(args, outs):
    """Merge the input FASTQs by barcode into read and barcode output files.

    Sets outs.read1s (and outs.read2s for paired-end chemistries, otherwise
    None) to freshly made paths ending in h5_constants.LZ4_SUFFIX, then hands
    the opened output handles to merge_by_barcode together with args.fastqs.

    Args:
        args: stage arguments; reads args.chemistry_def and args.fastqs.
        outs: stage outputs; reads outs.chunk_barcodes and sets outs.read1s
            and outs.read2s.
    """
    outs.coerce_strings()

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    r1_fq_out = None
    r2_fq_out = None
    barcodes_out = None
    try:
        outs.read1s = martian.make_path('reads_1.fastq' + h5_constants.LZ4_SUFFIX)
        r1_fq_out = cr_io.open_maybe_gzip(outs.read1s, 'w')

        if paired_end:
            outs.read2s = martian.make_path('reads_2.fastq' +
                                            h5_constants.LZ4_SUFFIX)
            r2_fq_out = cr_io.open_maybe_gzip(outs.read2s, 'w')
        else:
            outs.read2s = None

        barcodes_out = cr_io.open_maybe_gzip(outs.chunk_barcodes, 'w')

        merge_by_barcode(args.fastqs, r1_fq_out, r2_fq_out, barcodes_out,
                         paired_end)
    finally:
        # Close whatever was opened, even if the merge failed part-way.
        for handle in (r1_fq_out, r2_fq_out, barcodes_out):
            if handle is not None:
                handle.close()
示例#3
0
def create_unaligned_bam(args, outs):
    """Write the reads of this chunk to a BAM file without aligning them.

    The header is assembled from the STAR reference's chrNameLength.txt (@SQ
    lines) and from args.read_groups (@RG lines). Every read from
    args.read_chunk, and from args.read2_chunk when it is set, is written
    with flag 4 (the SAM "segment unmapped" bit) and an RG tag equal to
    args.read_group. With two inputs, read1 and read2 records alternate.

    Args:
        args: stage arguments; reads reference_path, read_groups, read_group,
            read_chunk and read2_chunk.
        outs: stage outputs; the BAM is written to outs.genome_output.

    Side effects:
        Writes a header file named 'tmphdr' in the current working directory.
    """
    star_ref_path = cr_utils.get_reference_star_path(args.reference_path)

    header_buf = cStringIO.StringIO()

    header_buf.write('@HD\tVN:1.4\n')

    # SQ header lines
    with open(os.path.join(star_ref_path, 'chrNameLength.txt')) as f:
        for line in f:
            chr_name, chr_len = line.strip().split('\t')
            header_buf.write('@SQ\tSN:{}\tLN:{}\n'.format(chr_name, chr_len))

    # RG header lines; literal backslash-t sequences are turned into real tabs
    for packed_rg in args.read_groups:
        header_buf.write(
            re.sub('\\\\t', '\t', tk_bam.make_rg_header(packed_rg)) + '\n')

    # Get read group ID for this chunk of reads
    read_group = args.read_group

    # pysam doesn't support reading SAM from a StringIO object
    with open('tmphdr', 'w') as f:
        f.write(header_buf.getvalue())

    samfile = None
    outbam = None
    fastq_file1 = None
    fastq_file2 = None
    try:
        samfile = pysam.AlignmentFile('tmphdr', 'r', check_sq=False)
        outbam = pysam.AlignmentFile(outs.genome_output, 'wb', template=samfile)

        fastq_file1 = cr_io.open_maybe_gzip(args.read_chunk)
        fastq_file2 = cr_io.open_maybe_gzip(
            args.read2_chunk) if args.read2_chunk else None
        read1s = tk_fasta.read_generator_fastq(fastq_file1)
        read2s = tk_fasta.read_generator_fastq(fastq_file2) if fastq_file2 else []

        # A single record object is reused for every read.
        record = pysam.AlignedSegment()
        record.flag = 4

        def write_unaligned(read):
            """Fill the shared record from one (name, seq, qual) read and write it."""
            name, seq, qual = read
            # Only the part of the name before the first space is kept.
            record.query_name, record.query_sequence = name.split(' ')[0], seq
            record.query_qualities = tk_fasta.get_qvs(qual)
            record.set_tag('RG', read_group, 'Z')
            outbam.write(record)

        for read1, read2 in itertools.izip_longest(read1s, read2s):
            write_unaligned(read1)
            # read2 is None when there is no second-read input.
            if read2:
                write_unaligned(read2)
    finally:
        # Release every handle that was opened, even if writing failed.
        for handle in (samfile, fastq_file1, fastq_file2, outbam):
            if handle is not None:
                handle.close()
def load_barcode_translate_map(bc_whitelist):
    """
    Guide BC to Cell BC translate.

    If the barcode whitelist needs to translate, return the mapping dictionary,
    else, return None.

    The translate file is looked up as <bc_whitelist>.txt or
    <bc_whitelist>.txt.gz under cr_constants.BARCODE_WHITELIST_TRANSLATE_PATH.
    Lines starting with '#' and blank lines are skipped; on every other line
    the first whitespace-separated field maps to the second.

    Args:
        bc_whitelist (str or None): name of the barcode whitelist.

    Returns:
        dict of str -> str, or None when bc_whitelist is None or no translate
        file exists for it.

    Raises:
        IndexError: if a non-blank, non-comment line has fewer than two fields.
    """
    if bc_whitelist is None:
        return None

    file_path = None
    for extension in ('.txt', '.txt.gz'):
        candidate = os.path.join(cr_constants.BARCODE_WHITELIST_TRANSLATE_PATH,
                                 bc_whitelist + extension)
        if os.path.exists(candidate):
            file_path = candidate
            break

    if file_path is None:
        return None

    translate_map = {}
    # The context manager guarantees the file is closed once parsing is done.
    with cr_io.open_maybe_gzip(file_path, 'r') as translate_file:
        for line in translate_file:
            if line.startswith('#'):
                continue
            bcs = line.strip().split()
            if not bcs:
                # Blank line (e.g. a trailing newline): nothing to map.
                continue
            translate_map[bcs[0]] = bcs[1]
    return translate_map
def _compute_r1_length(fastqs, reads_interleaved):
    """Infer the length of R1 as the longest R1 sequence among the leading reads.

    The files in fastqs are scanned in order, and scanning stops once
    cr_constants.DETECT_CHEMISTRY_INITIAL_READS reads have been examined in
    total. Returns 0 when no read was examined.
    """
    longest = 0
    examined = 0

    for path in fastqs:
        with cr_io.open_maybe_gzip(path, 'r') as handle:
            records = tk_fasta.read_generator_fastq(handle, reads_interleaved)

            # Interleaved records have six fields, plain records three; the
            # R1 sequence is the second field in both layouts.
            if reads_interleaved:
                r1_seqs = (seq for _, seq, _, _, _, _ in records)
            else:
                r1_seqs = (seq for _, seq, _ in records)

            for r1_seq in r1_seqs:
                if examined == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
                    break
                if len(r1_seq) > longest:
                    longest = len(r1_seq)
                examined += 1

        # Enough reads seen; do not open the remaining files.
        if examined == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
            break

    return longest
def load_barcode_tsv(filename, as_set=False):
    """Load barcodes from a (possibly gzipped) file, one per line.

    Lines containing a '#' anywhere are skipped; the remaining lines are
    stripped of surrounding whitespace.

    Args:
        filename (str): path of the barcode file.
        as_set (bool): return a set instead of a list in file order.

    Returns:
        list or set of str: the barcodes.

    Raises:
        Exception: if the same barcode appears more than once.
    """
    # Use a context manager so the handle is closed as soon as it is read.
    with cr_io.open_maybe_gzip(filename, 'r') as barcode_file:
        barcodes = [x.strip() for x in barcode_file if '#' not in x]
    barcode_set = set(barcodes)
    if len(barcodes) != len(barcode_set):
        raise Exception('Duplicates found in barcode whitelist: %s' % filename)
    return barcode_set if as_set else barcodes
示例#7
0
def save_features_tsv(feature_ref, base_dir, compress):
    """Write a FeatureReference to base_dir as a tab-separated file.

    Each row holds the id, name and feature_type of one feature definition.
    The file is named features.tsv, with '.gz' appended when compress is
    truthy.
    """
    suffix = '.gz' if compress else ''
    out_features_fn = os.path.join(base_dir, 'features.tsv') + suffix

    with cr_io.open_maybe_gzip(out_features_fn, 'w') as out_file:
        for fdef in feature_ref.feature_defs:
            columns = (fdef.id, fdef.name, fdef.feature_type)
            out_file.write('\t'.join(columns) + '\n')
示例#8
0
    def __init__(self, in_filenames, read_def, reads_interleaved, r1_length, r2_length):
        """Open the FASTQ file matching read_def and build the read iterator.

        When no matching file is found, self.in_fastq stays None and
        self.in_iter stays an empty iterator.

        Args:
          in_filenames - Map of paths to fastq files
          read_def - ReadDef
          reads_interleaved - passed to the file lookup and the read generator
          r1_length - passed through to get_read_generator_fastq
          r2_length - passed through to get_read_generator_fastq
        """

        self.read_def = read_def
        self.in_fastq = None
        self.in_iter = iter([])

        # Nothing to open without an input map.
        if not in_filenames:
            return

        fastq_path = get_fastq_from_read_type(in_filenames,
                                              read_def,
                                              reads_interleaved)
        if not fastq_path:
            return

        self.in_fastq = cr_io.open_maybe_gzip(fastq_path, 'r')
        self.in_iter = get_read_generator_fastq(self.in_fastq,
                                                read_def=read_def,
                                                reads_interleaved=reads_interleaved,
                                                r1_length=r1_length,
                                                r2_length=r2_length)
示例#9
0
    def save_mex(self,
                 base_dir,
                 save_features_func,
                 metadata=None,
                 compress=True):
        """Save in Matrix Market Exchange format.

        Writes matrix.mtx and barcodes.tsv into base_dir (each with '.gz'
        appended when compress is truthy) and delegates the feature file to
        save_features_func. The matrix is converted with self.tocoo() first.

        Args:
          base_dir (str): Path to directory to write files in.
          save_features_func (func): Func that takes (FeatureReference, base_dir, compress) and writes
                                     a file describing the features.
          metadata (dict): Optional metadata to encode into the comments as JSON.
                           The caller's dict is not modified.
          compress (bool): Whether to append '.gz' to the output filenames.

        Raises:
          AssertionError: if the matrix is not an integer COO matrix.
        """
        self.tocoo()

        cr_io.makedirs(base_dir, allow_existing=True)

        out_matrix_fn = os.path.join(base_dir, 'matrix.mtx')
        out_barcodes_fn = os.path.join(base_dir, 'barcodes.tsv')
        if compress:
            out_matrix_fn += '.gz'
            out_barcodes_fn += '.gz'

        # This method only supports an integer matrix.
        assert self.m.dtype in ['uint32', 'int32', 'uint64', 'int64']
        assert type(self.m) == sp_sparse.coo.coo_matrix

        rows, cols = self.m.shape
        # Header fields in the file
        rep = 'coordinate'
        field = 'integer'
        symmetry = 'general'

        # Work on a copy so that adding format_version does not leak into
        # the dict the caller passed in.
        metadata = metadata.copy() if metadata else {}
        metadata['format_version'] = MATRIX_H5_VERSION

        metadata_str = json.dumps(metadata)
        comment = 'metadata_json: %s' % metadata_str

        with cr_io.open_maybe_gzip(out_matrix_fn, 'w') as stream:
            # write initial header line
            stream.write(
                np.compat.asbytes('%%MatrixMarket matrix {0} {1} {2}\n'.format(
                    rep, field, symmetry)))

            # write comments, one '%'-prefixed line each
            for line in comment.split('\n'):
                stream.write(np.compat.asbytes('%%%s\n' % (line)))

            # write shape spec
            stream.write(
                np.compat.asbytes('%i %i %i\n' % (rows, cols, self.m.nnz)))
            # write row, col, val in 1-based indexing
            for r, c, d in itertools.izip(self.m.row + 1, self.m.col + 1,
                                          self.m.data):
                stream.write(np.compat.asbytes(("%i %i %i\n" % (r, c, d))))

        # both GEX and ATAC provide an implementation of this in respective feature_ref.py
        save_features_func(self.feature_ref, base_dir, compress=compress)

        with cr_io.open_maybe_gzip(out_barcodes_fn, 'w') as f:
            for bc in self.bcs:
                f.write(bc + '\n')
示例#10
0
 def open_file(self, filename):
     """Open filename for writing via cr_io.open_maybe_gzip and return the handle."""
     handle = cr_io.open_maybe_gzip(filename, 'w')
     return handle
示例#11
0
def main(args, outs):
    """Attach corrected barcodes to reads and write them into per-bucket FASTQs.

    Reads args.read1s_chunk (and args.read2s_chunk when paired-end) in step
    with the barcode lines of args.bcs. Reads with an empty barcode, or with
    a sequence shorter than MIN_READ_LENGTH, are dropped. The rest get the
    barcode stored in their header and are appended to the bucket chosen by
    get_bucket_name. Each non-empty bucket is sorted with
    vdj_utils.fastq_barcode_sort_key and written to its own file.

    Args:
        args: stage arguments; reads chunks_per_gem_group, read1s_chunk,
            read2s_chunk and bcs. chunks_per_gem_group is replaced by a copy
            with int keys.
        outs: stage outputs; outs.buckets is set to a dict mapping the name
            of each non-empty bucket to its file path.
    """
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {int(k): v for k, v in args.chunks_per_gem_group.iteritems()}

    paired_end = args.read2s_chunk is not None

    buckets = {}
    bucket_filenames = {}

    r1_file = None
    r2_file = None
    bc_file = None
    try:
        # Lazy load R1
        r1_file = cr_io.open_maybe_gzip(args.read1s_chunk)
        read1s = tk_fasta.read_generator_fastq(r1_file)

        # Lazy load R2
        if paired_end:
            r2_file = cr_io.open_maybe_gzip(args.read2s_chunk)
            read2s = tk_fasta.read_generator_fastq(r2_file)
        else:
            read2s = []

        # Lazy load corrected BCs
        bc_file = cr_io.open_maybe_gzip(args.bcs)
        bcs = (line.strip() for line in bc_file)

        # Only the bucket name is needed here; the gem group is ignored.
        for _, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
            bucket_filenames[bucket_name] = martian.make_path("%s.fastq" % bucket_name)
            buckets[bucket_name] = []

        for read1, read2, barcode in itertools.izip_longest(read1s, read2s, bcs):
            # Exclude unbarcoded reads
            if barcode == '':
                continue

            # Exclude short reads
            if len(read1[1]) < MIN_READ_LENGTH or (read2 is not None and len(read2[1]) < MIN_READ_LENGTH):
                continue

            # Attach processed barcode to reads
            r1_hdr = cr_fastq.AugmentedFastqHeader(read1[0])
            r1_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
            r1_new_qname = r1_hdr.to_string()

            if paired_end:
                r2_hdr = cr_fastq.AugmentedFastqHeader(read2[0])
                r2_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
                r2_new_qname = r2_hdr.to_string()

            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            bucket_name = get_bucket_name(gem_group, barcode_seq, args.chunks_per_gem_group[gem_group])

            buckets[bucket_name].append((r1_new_qname, read1[1], read1[2]))
            if paired_end:
                buckets[bucket_name].append((r2_new_qname, read2[1], read2[2]))
    finally:
        # The inputs were previously never closed; release them here whether
        # or not bucketing succeeded.
        for handle in (r1_file, r2_file, bc_file):
            if handle is not None:
                handle.close()

    outs.buckets = {}

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        # Don't create empty bucket files.
        # This is common when the reads are ordered by gem group
        # And a chunk sees only a single gem group.
        if not bucket:
            continue

        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        filename = bucket_filenames[bucket_name]
        with cr_io.open_maybe_gzip(filename, 'w') as f:
            for read in bucket:
                tk_fasta.write_read_fastq(f, *read)

        outs.buckets[bucket_name] = filename
示例#12
0
def main(args, outs):
    """Correct the raw barcodes of a read chunk and write one barcode per read.

    For each of the first args.initial_reads read pairs, the raw barcode and
    its quality are taken from the read1 header. A barcode missing from the
    whitelist goes through cr_stats.correct_bc_error. Otherwise it is kept
    as-is unless it contains an 'N'. Accepted barcodes are counted, suffixed
    with the gem group and written to outs.corrected_bcs. Reads without an
    accepted barcode get an empty line.

    Args:
        args: stage arguments; reads barcode_whitelist, barcode_counts,
            gem_group, library_type, read1_chunk, read2_chunk,
            initial_reads and barcode_confidence_threshold.
        outs: stage outputs; outs.corrected_bcs gets h5_constants.LZ4_SUFFIX
            appended, and outs.corrected_barcode_counts and
            outs.chunked_reporter are written.
    """
    # Load barcode whitelist. Both names are always bound: previously
    # barcode_whitelist was undefined when no whitelist was configured, which
    # raised NameError at the load_barcode_dist call below.
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist = None
        barcode_whitelist_set = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    # NOTE(review): assumes load_barcode_dist accepts a None whitelist - confirm.
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    in_read1_fastq = None
    in_read2_fastq = []
    out_file = None
    try:
        in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
        in_read2_fastq = cr_io.open_maybe_gzip(
            args.read2_chunk) if args.read2_chunk else []

        outs.corrected_bcs += h5_constants.LZ4_SUFFIX
        out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')

        bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                             outs.corrected_barcode_counts)

        # Correct barcodes, add processed bc tag to fastq
        read_pair_iter = itertools.izip_longest(tk_fasta.read_generator_fastq(in_read1_fastq),
                                                tk_fasta.read_generator_fastq(in_read2_fastq))
        for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
            read1_header = cr_fastq.AugmentedFastqHeader(read1[0])

            raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
            bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

            processed_bc = None

            if raw_bc:
                if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                    processed_bc = cr_stats.correct_bc_error(
                        args.barcode_confidence_threshold, raw_bc, bc_qual,
                        barcode_dist)
                elif 'N' not in raw_bc:
                    # Barcode is on the whitelist, or there is no whitelist;
                    # barcodes containing N are rejected here.
                    processed_bc = raw_bc

                if processed_bc:
                    bc_counter.count(None, processed_bc, None)

                    # Add gem group to barcode sequence
                    processed_bc = cr_utils.format_barcode_seq(
                        processed_bc, gem_group=args.gem_group)

                reporter.vdj_barcode_cb(raw_bc, processed_bc)

            # One line per read; empty when no barcode was accepted.
            out_file.write('%s\n' %
                           (processed_bc if processed_bc is not None else ''))
    finally:
        # Release the file handles even if correction failed part-way.
        if in_read1_fastq is not None:
            in_read1_fastq.close()
        if in_read2_fastq:
            in_read2_fastq.close()
        if out_file is not None:
            out_file.close()

    bc_counter.close()

    reporter.save(outs.chunked_reporter)