Exemplo n.º 1
0
def add_uniques(session,
                sample,
                vdjs,
                realign_len=None,
                realign_mut=None,
                min_similarity=0,
                max_vties=50,
                trim_to=None,
                max_padding=None):
    """Validates, buckets and collapses VDJs, then adds them to a sample.

    Every VDJ is optionally re-aligned, checked against the similarity,
    V-tie and padding limits, and grouped by (V gene, J gene, CDR3 length).
    Within each group, identical sequences are merged first and then
    sequences that ``dnautils.equal`` considers equal are merged into the
    one with the most IDs.  VDJs that raise ``AlignmentException`` are
    recorded with ``add_as_noresult``; any other error is logged and the
    VDJ is skipped.

    :param Session session: The database session
    :param sample: The sample the sequences are added to
    :param vdjs: Iterable of VDJ objects to process
    :param realign_len: If not ``None``, each VDJ is re-aligned with
        ``align_to_germline(realign_len, realign_mut, trim_to)``
    :param realign_mut: Forwarded to ``align_to_germline``
    :param min_similarity: Minimum allowed ``v_match / v_length``
    :param int max_vties: Maximum allowed number of entries in ``v_gene``
    :param trim_to: Forwarded to ``align_to_germline``
    :param max_padding: If not ``None``, maximum allowed ``pad_length``

    """
    bucketed_seqs = OrderedDict()
    # Sort by first ID so that processing order is deterministic
    vdjs = sorted(vdjs, key=lambda v: v.ids[0])
    for vdj in funcs.periodic_commit(session, vdjs):
        try:
            if realign_len is not None:
                vdj.align_to_germline(realign_len, realign_mut, trim_to)

            v_identity = vdj.v_match / float(vdj.v_length)
            if v_identity < min_similarity:
                raise AlignmentException('V-identity too low {} < {}'.format(
                    v_identity, min_similarity))
            if len(vdj.v_gene) > max_vties:
                raise AlignmentException('Too many V-ties {} > {}'.format(
                    len(vdj.v_gene), max_vties))
            if max_padding is not None and vdj.pad_length > max_padding:
                raise AlignmentException('Too much padding {} (max {})'.format(
                    vdj.pad_length, max_padding))
            bucket_key = (funcs.format_ties(vdj.v_gene,
                                            vdj.v_germlines.prefix,
                                            strip_alleles=True),
                          funcs.format_ties(vdj.j_gene,
                                            vdj.j_germlines.prefix,
                                            strip_alleles=True), len(vdj.cdr3))
            bucket = bucketed_seqs.setdefault(bucket_key, {})

            # Exact duplicates only contribute their IDs
            if vdj.sequence in bucket:
                bucket[vdj.sequence].ids += vdj.ids
            else:
                bucket[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(session, vdj, sample, str(e))
        except Exception:
            # Deliberately broad so one bad sequence cannot abort the sample,
            # but KeyboardInterrupt / SystemExit still propagate.
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(vdj.ids[0], traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    for bucket, sequences in funcs.periodic_commit(session,
                                                   bucketed_seqs.items()):
        # Most-supported sequences first; they absorb the smaller ones
        sequences = sorted(sequences.values(),
                           key=lambda s: (len(s.ids), s.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            # Walk backwards so deleting by index does not shift the
            # positions that are still to be visited
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]

                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.ids += smaller.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()
Exemplo n.º 2
0
def add_uniques(session,
                sample,
                alignments,
                props,
                aligner,
                realign_len=None,
                realign_mut=None):
    """Validates, buckets and collapses alignments, then adds them.

    Each alignment is optionally re-aligned (and trimmed when
    ``props.trim_to`` is set), validated with ``props.validate`` and grouped
    by (V gene, J gene, CDR3 length).  Within each group, identical
    sequences are merged first and then sequences that ``dnautils.equal``
    considers equal are merged into the one with the most IDs.  Alignments
    that raise ``AlignmentException`` are recorded with ``add_as_noresult``;
    any other error is logged and the alignment is skipped.

    :param Session session: The database session
    :param sample: The sample the sequences are added to
    :param alignments: Iterable of alignment objects to process
    :param props: Object providing ``trim_to`` and ``validate(alignment)``
    :param aligner: Object providing ``align_to_germline``
    :param realign_len: If not ``None``, each alignment is re-aligned with
        ``aligner.align_to_germline(alignment, realign_len, realign_mut)``
    :param realign_mut: Forwarded to ``aligner.align_to_germline``

    """
    bucketed_seqs = OrderedDict()
    # Sort by first ID so that processing order is deterministic
    alignments = sorted(alignments, key=lambda v: v.sequence.ids[0])
    for alignment in funcs.periodic_commit(session, alignments):
        try:
            if realign_len is not None:
                aligner.align_to_germline(alignment, realign_len, realign_mut)
                if props.trim_to:
                    alignment.trim_to(props.trim_to)

            props.validate(alignment)
            bucket_key = (funcs.format_ties(alignment.v_gene),
                          funcs.format_ties(alignment.j_gene),
                          len(alignment.cdr3))
            bucket = bucketed_seqs.setdefault(bucket_key, {})

            # Exact duplicates only contribute their IDs
            seq_key = alignment.sequence.sequence
            if seq_key in bucket:
                bucket[seq_key].sequence.ids += alignment.sequence.ids
            else:
                bucket[seq_key] = alignment
        except AlignmentException as e:
            add_as_noresult(session, alignment.sequence, sample, str(e))
        except Exception:
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(alignment.sequence.ids[0],
                                           traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    # (.items() rather than .iteritems() so this also runs on Python 3)
    for bucket, sequences in funcs.periodic_commit(session,
                                                   bucketed_seqs.items()):
        # Most-supported sequences first; they absorb the smaller ones
        sequences = sorted(sequences.values(),
                           key=lambda s:
                           (len(s.sequence.ids), s.sequence.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            # Walk backwards so deleting by index does not shift the
            # positions that are still to be visited
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]

                if dnautils.equal(larger.sequence.sequence,
                                  smaller.sequence.sequence):
                    larger.sequence.ids += smaller.sequence.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()
Exemplo n.º 3
0
def generate_consensus(session, clone_ids):
    """Recomputes the consensus CDR3 and germline of the given clones.

    For every clone, the member sequences with a positive
    ``copy_number_in_subject`` are loaded and used to set ``cdr3_nt``,
    ``cdr3_aa`` and ``germline``.  Nothing is done for an empty ID list.

    :param Session session: The database session
    :param list clone_ids: The IDs of the clones to update

    """
    if len(clone_ids) == 0:
        return

    clones = session.query(Clone).filter(Clone.id.in_(clone_ids))
    for clone in funcs.periodic_commit(session, clones, interval=1000):
        members = (
            session.query(Sequence)
            .join(SequenceCollapse)
            .filter(Sequence.clone_id == clone.id,
                    SequenceCollapse.copy_number_in_subject > 0)
            .all()
        )

        member_cdr3s = [member.cdr3_nt for member in members]
        clone.cdr3_nt = funcs.consensus(member_cdr3s)
        clone.cdr3_aa = lookups.aas_from_nts(clone.cdr3_nt)
        clone.germline = generate_germline(session, members, clone)

    session.commit()
Exemplo n.º 4
0
def generate_consensus(session, clone_ids):
    """Updates each listed clone with a consensus CDR3 and a germline.

    Only sequences whose collapse record has a positive
    ``copy_number_in_subject`` take part in the consensus.  Returns
    immediately when no clone IDs are given.

    :param Session session: The database session
    :param list clone_ids: The IDs of the clones to update

    """
    if len(clone_ids) == 0:
        return

    clone_query = session.query(Clone).filter(Clone.id.in_(clone_ids))
    for clone in funcs.periodic_commit(session, clone_query, interval=1000):
        seq_query = session.query(Sequence).join(SequenceCollapse)
        seq_query = seq_query.filter(
            Sequence.clone_id == clone.id,
            SequenceCollapse.copy_number_in_subject > 0)
        clone_seqs = seq_query.all()

        clone.cdr3_nt = consensus([seq.cdr3_nt for seq in clone_seqs])
        clone.cdr3_aa = lookups.aas_from_nts(clone.cdr3_nt)

        clone.germline = generate_germline(session, clone_seqs, clone)

    session.commit()
Exemplo n.º 5
0
def generate_consensus(session, clone_ids):
    """Recomputes the consensus CDR3 and derived fields of the given clones.

    For every clone this sets ``cdr3_nt``, ``cdr3_aa``, ``germline``,
    ``overall_total_cnt`` (sum of the members' ``copy_number``) and
    ``functional``.  Only sequences with a positive
    ``copy_number_in_subject`` are considered.  Nothing is done when
    ``clone_ids`` is empty.

    :param Session session: The database session
    :param list clone_ids: The IDs of the clones to update

    """
    if not clone_ids:
        return

    clones = session.query(Clone).filter(Clone.id.in_(clone_ids))
    for clone in funcs.periodic_commit(session, clones, interval=1000):
        members = (
            session.query(Sequence)
            .join(SequenceCollapse)
            .filter(Sequence.clone_id == clone.id,
                    SequenceCollapse.copy_number_in_subject > 0)
            .all()
        )

        clone.cdr3_nt = funcs.consensus([m.cdr3_nt for m in members])
        clone.cdr3_aa = lookups.aas_from_nts(clone.cdr3_nt)
        clone.germline = generate_germline(session, members, clone)
        clone.overall_total_cnt = sum(m.copy_number for m in members)

        # Functional: CDR3 length is a multiple of three, no '*' in the
        # CDR3 amino acids, and lookups.has_stop() is false for the germline
        in_frame = clone.cdr3_num_nts % 3 == 0
        clone.functional = (in_frame
                            and '*' not in clone.cdr3_aa
                            and not lookups.has_stop(clone.germline))

    session.commit()
Exemplo n.º 6
0
def add_sequences_from_sample(session, sample, sequences, props):
    """Writes corrected alignments back to the database for one sample.

    Each entry of ``sequences`` is a mapping with the keys ``alignment``,
    ``r_type`` and ``pk``.  Entries whose alignment fails
    ``props.validate`` or that raise ``ValueError`` are skipped.  A
    ``'NoResult'`` entry is added as a new sequence and its ``NoResult`` row
    deleted; a ``'Sequence'`` entry has its existing row updated in place.

    :param Session session: The database session
    :param sample: The sample the sequences belong to
    :param sequences: The corrected entries to store
    :param props: Object providing ``validate(alignment)``

    """
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for record in periodic_commit(session, sequences):
        alignment = record['alignment']
        try:
            try:
                props.validate(alignment)
            except AlignmentException:
                continue

            r_type = record['r_type']
            if r_type == 'NoResult':
                add_sequences(session, [alignment], sample,
                              error_action='raise')
                stale = session.query(NoResult).filter(
                    NoResult.pk == record['pk'])
                stale.delete(synchronize_session=False)
            elif r_type == 'Sequence':
                fields = dict(
                    partial=alignment.partial,
                    probable_indel_or_misalign=alignment.has_possible_indel,
                    v_gene=format_ties(alignment.v_gene),
                    j_gene=format_ties(alignment.j_gene),
                    num_gaps=alignment.num_gaps,
                    seq_start=alignment.seq_start,
                    v_match=alignment.v_match,
                    v_length=alignment.v_length,
                    j_match=alignment.j_match,
                    j_length=alignment.j_length,
                    removed_prefix=alignment.sequence.removed_prefix_sequence,
                    removed_prefix_qual=(
                        alignment.sequence.removed_prefix_quality),
                    v_mutation_fraction=alignment.v_mutation_fraction,
                    pre_cdr3_length=alignment.pre_cdr3_length,
                    pre_cdr3_match=alignment.pre_cdr3_match,
                    post_cdr3_length=alignment.post_cdr3_length,
                    post_cdr3_match=alignment.post_cdr3_match,
                    in_frame=alignment.in_frame,
                    functional=alignment.functional,
                    stop=alignment.stop,
                    cdr3_nt=alignment.cdr3,
                    cdr3_num_nts=len(alignment.cdr3),
                    cdr3_aa=lookups.aas_from_nts(alignment.cdr3),
                    sequence=str(alignment.sequence.sequence),
                    quality=alignment.sequence.quality,
                    locally_aligned=alignment.locally_aligned,
                    _insertions=serialize_gaps(alignment.insertions),
                    _deletions=serialize_gaps(alignment.deletions),
                    germline=alignment.germline,
                )
                # Instantiating the model stores nothing; it is done only so
                # that the field values get validated before the UPDATE
                Sequence(**fields)

                existing = session.query(Sequence).filter(
                    Sequence.ai == record['pk'])
                existing.update(fields, synchronize_session=False)
        except ValueError:
            continue
Exemplo n.º 7
0
def add_results(uniques, sample, session):
    """Adds all unique alignments to a sample and stores its V averages.

    ``uniques`` is an iterable of iterables; every contained alignment is
    passed to ``add_sequences``.  Alignments that raise
    ``AlignmentException`` are recorded with ``add_noresults_for_vdj``.
    When at least one alignment was added, the sample's ``v_ties_len`` and
    ``v_ties_mutations`` are set to the mean V length and mean V mutation
    fraction of the added alignments.

    :param uniques: Groups of alignments to add
    :param sample: The sample the sequences are added to
    :param Session session: The database session

    """
    v_lengths = []
    v_mutations = []
    flattened = itertools.chain(*uniques)
    for unique in funcs.periodic_commit(session, flattened, interval=1000):
        try:
            add_sequences(session, [unique], sample)
            v_lengths.append(unique.v_length)
            v_mutations.append(unique.v_mutation_fraction)
        except AlignmentException as err:
            add_noresults_for_vdj(session, unique.sequence, sample, str(err))

    if v_lengths:
        sample.v_ties_len = sum(v_lengths) / len(v_lengths)
        sample.v_ties_mutations = sum(v_mutations) / len(v_mutations)
    session.commit()
Exemplo n.º 8
0
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    """Aligns the reads of one sample and stores the results.

    Steps: open a session, create the sample, run the initial V/J
    assignment, record failures as no-results, re-align the successes to
    V-ties using the average V length and mutation fraction, collapse the
    resulting buckets and finally log how many reads were identified.

    The session is always closed on exit, including when a step raises.

    :param db_config: Passed to ``config.init_db`` to open the session
    :param v_germlines: V germlines given to ``AnchorAligner``
    :param j_germlines: J germlines given to ``AnchorAligner``
    :param path: Input path passed to ``read_input``
    :param dict meta: Sample metadata; must contain ``sample_name``
    :param props: Forwarded to the V-tie and collapse workers
    :param nproc: Passed to ``concurrent.process_data`` for each stage

    """
    session = config.init_db(db_config)
    try:
        start = time.time()
        logger.info('Starting sample {}'.format(meta['sample_name']))
        sample = setup_sample(session, meta)

        aligner = AnchorAligner(v_germlines, j_germlines)

        # Initial VJ assignment
        alignments = concurrent.process_data(
            read_input,
            process_vdj,
            aggregate_vdj,
            nproc,
            process_args={'aligner': aligner},
            generate_args={'path': path},
        )
        logger.info('Adding noresults')
        for result in alignments['noresult']:
            add_noresults_for_vdj(session, result['vdj'], sample,
                                  result['reason'])

        alignments = alignments['success']
        if alignments:
            avg_len = (sum([v.v_length for v in alignments]) /
                       len(alignments))
            avg_mut = (sum([v.v_mutation_fraction
                            for v in alignments]) / len(alignments))
            sample.v_ties_mutations = avg_mut
            sample.v_ties_len = avg_len
            logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                        'Length={}'.format(len(alignments),
                                           round(avg_mut, 2),
                                           round(avg_len, 2)))
            session.commit()
            # Realign to V-ties
            v_ties = concurrent.process_data(
                alignments,
                process_vties,
                aggregate_vties,
                nproc,
                process_args={
                    'aligner': aligner,
                    'avg_len': avg_len,
                    'avg_mut': avg_mut,
                    'props': props
                },
            )
            logger.info('Adding noresults')

            for result in funcs.periodic_commit(session, v_ties['noresult'],
                                                100):
                add_noresults_for_vdj(session, result['alignment'].sequence,
                                      sample, result['reason'])

            logger.info('Collapsing {} buckets'.format(
                len(v_ties['success'])))
            session.commit()

            # TODO: Change this so we arent copying everything between
            # processes
            concurrent.process_data([list(v) for v in v_ties['success']],
                                    process_collapse,
                                    aggregate_collapse,
                                    nproc,
                                    aggregate_args={
                                        'db_config': db_config,
                                        'sample_id': sample.id,
                                        'props': props
                                    })
            # The collapse stage is handed db_config rather than this
            # session, so drop cached state before counting
            session.expire_all()
            session.commit()

            identified = int(
                session.query(func.sum(Sequence.copy_number)).filter(
                    Sequence.sample == sample).scalar() or 0)
            noresults = int(
                session.query(func.count(NoResult.pk)).filter(
                    NoResult.sample == sample).scalar() or 0)
            if identified + noresults:
                frac = int(100 * identified / (identified + noresults))
            else:
                frac = 0
            logger.info(
                'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
                    sample.name, round((time.time() - start) / 60., 1),
                    identified, identified + noresults, frac))
    finally:
        # Release the connection even if alignment or collapsing failed
        session.close()
Exemplo n.º 9
0
    def do_task(self, args):
        """Reads, de-duplicates and aligns the sequences of one sample.

        ``args`` must provide ``meta`` (sample metadata containing
        ``sample_name``) and ``path`` (a FASTA file if it ends in
        ``.fasta``, otherwise parsed as FASTQ).  Identical reads are
        collapsed, each unique read is aligned, failures are stored as
        no-results, and the successful alignments are handed to
        ``add_uniques`` together with their average V length and mutation
        fraction.
        """
        meta = args['meta']
        self.info('Starting sample {}'.format(meta['sample_name']))
        study, sample = self._setup_sample(meta)

        vdjs = {}
        file_format = 'fasta' if args['path'].endswith('.fasta') else 'fastq'
        parser = SeqIO.parse(args['path'], file_format)

        # Collapse identical sequences
        self.info('\tCollapsing identical sequences')
        for record in parser:
            try:
                raw_seq = str(record.seq)
                if raw_seq not in vdjs:
                    quality = funcs.ord_to_quality(
                        record.letter_annotations.get('phred_quality'))
                    vdjs[raw_seq] = VDJSequence(ids=[],
                                                sequence=raw_seq,
                                                quality=quality)
                vdjs[raw_seq].ids.append(record.description)
            except ValueError:
                continue

        alignments = {}
        aligner = AnchorAligner(self._v_germlines, self._j_germlines)
        self.info('\tAligning {} unique sequences'.format(len(vdjs)))
        # Attempt to align all unique sequences
        for raw_key in funcs.periodic_commit(self._session, sorted(vdjs)):
            # Remove the read from the pending dict as soon as it is taken
            vdj = vdjs.pop(raw_key)
            try:
                # If the aligned sequence was already seen, only its IDs are
                # appended; otherwise it becomes a new unique sequence.
                alignment = aligner.get_alignment(vdj)
                aligned_key = alignment.sequence.sequence
                if aligned_key not in alignments:
                    alignments[aligned_key] = alignment
                else:
                    alignments[aligned_key].sequence.ids.extend(
                        alignment.sequence.ids)
            except AlignmentException as e:
                add_as_noresult(self._session, vdj, sample, str(e))
            except Exception:
                self.error(
                    '\tUnexpected error processing sequence {}\n\t{}'.format(
                        vdj.ids[0], traceback.format_exc()))

        if alignments:
            total = float(len(alignments))
            avg_len = sum(a.v_length for a in alignments.values()) / total
            avg_mut = sum(
                a.v_mutation_fraction for a in alignments.values()) / total
            sample.v_ties_mutations = avg_mut
            sample.v_ties_len = avg_len

            self.info('\tRe-aligning {} sequences to V-ties, Mutations={}, '
                      'Length={}'.format(len(alignments), round(avg_mut, 2),
                                         round(avg_len, 2)))
            add_uniques(self._session, sample, alignments.values(),
                        self._props, aligner, avg_len, avg_mut)

        self._session.commit()
        self.info('Completed sample {}'.format(sample.name))
Exemplo n.º 10
0
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    """Runs the alignment pipeline for a single sample.

    Opens a database session, creates the sample, performs the initial V/J
    assignment, stores failed reads as no-results, re-aligns the remaining
    reads to V-ties, collapses the resulting buckets and logs a summary.

    The session is closed in a ``finally`` clause so that it is released
    even when one of the stages raises.

    :param db_config: Passed to ``config.init_db`` to open the session
    :param v_germlines: V germlines given to ``AnchorAligner``
    :param j_germlines: J germlines given to ``AnchorAligner``
    :param path: Input path passed to ``read_input``
    :param dict meta: Sample metadata; must contain ``sample_name``
    :param props: Forwarded to the V-tie and collapse workers
    :param nproc: Passed to ``concurrent.process_data`` for each stage

    """
    session = config.init_db(db_config)
    try:
        start = time.time()
        logger.info('Starting sample {}'.format(meta['sample_name']))
        sample = setup_sample(session, meta)

        aligner = AnchorAligner(v_germlines, j_germlines)

        # Initial VJ assignment
        alignments = concurrent.process_data(
            read_input,
            process_vdj,
            aggregate_vdj,
            nproc,
            process_args={'aligner': aligner},
            generate_args={'path': path},
        )
        logger.info('Adding noresults')
        for result in alignments['noresult']:
            add_noresults_for_vdj(session, result['vdj'], sample,
                                  result['reason'])

        alignments = alignments['success']
        if alignments:
            avg_len = (
                sum([v.v_length for v in alignments]) /
                len(alignments))
            avg_mut = (
                sum([v.v_mutation_fraction for v in alignments]) /
                len(alignments)
            )
            sample.v_ties_mutations = avg_mut
            sample.v_ties_len = avg_len
            logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                        'Length={}'.format(len(alignments),
                                           round(avg_mut, 2),
                                           round(avg_len, 2)))
            session.commit()
            # Realign to V-ties
            v_ties = concurrent.process_data(
                alignments,
                process_vties,
                aggregate_vties,
                nproc,
                process_args={'aligner': aligner, 'avg_len': avg_len,
                              'avg_mut': avg_mut, 'props': props},
            )
            logger.info('Adding noresults')

            for result in funcs.periodic_commit(session, v_ties['noresult'],
                                                100):
                add_noresults_for_vdj(session, result['alignment'].sequence,
                                      sample, result['reason'])

            logger.info('Collapsing {} buckets'.format(
                len(v_ties['success'])))
            session.commit()

            # TODO: Change this so we arent copying everything between
            # processes
            concurrent.process_data(
                [list(v) for v in v_ties['success']],
                process_collapse,
                aggregate_collapse,
                nproc,
                aggregate_args={'db_config': db_config,
                                'sample_id': sample.id,
                                'props': props}
            )
            # The collapse stage is handed db_config rather than this
            # session, so drop cached state before counting
            session.expire_all()
            session.commit()

            identified = int(session.query(
                func.sum(Sequence.copy_number)
            ).filter(
                Sequence.sample == sample
            ).scalar() or 0)
            noresults = int(session.query(
                func.count(NoResult.pk)
            ).filter(
                NoResult.sample == sample
            ).scalar() or 0)
            if identified + noresults:
                frac = int(100 * identified / (identified + noresults))
            else:
                frac = 0
            logger.info(
                'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
                    sample.name,
                    round((time.time() - start) / 60., 1),
                    identified,
                    identified + noresults,
                    frac
                )
            )
    finally:
        # Always release the connection, also on the error path
        session.close()
Exemplo n.º 11
0
    def do_task(self, args):
        """Reads, de-duplicates and aligns the sequences of one sample.

        ``args`` must provide ``meta`` (sample metadata), ``path`` (the
        directory) and ``fn`` (the file name; parsed as FASTA when it ends
        in ``.fasta`` and as FASTQ otherwise).  Identical reads are
        collapsed, each unique read is analyzed, failures are stored as
        no-results, and the analyzed sequences are handed to
        ``add_uniques`` together with their average V length and mutation
        fraction.
        """
        meta = args['meta']
        self.info('Starting sample {}'.format(meta.get('sample_name')))
        study, sample = self._setup_sample(meta)

        vdjs = {}
        parser = SeqIO.parse(
            os.path.join(args['path'], args['fn']),
            'fasta' if args['fn'].endswith('.fasta') else 'fastq')

        # Collapse identical sequences
        self.info('\tCollapsing identical sequences')
        for record in parser:
            seq = str(record.seq)
            if seq not in vdjs:
                vdjs[seq] = VDJSequence(
                    ids=[],
                    seq=seq,
                    v_germlines=self._v_germlines,
                    j_germlines=self._j_germlines,
                    quality=funcs.ord_to_quality(
                        record.letter_annotations.get('phred_quality')))
            vdjs[seq].ids.append(record.description)

        self.info('\tAligning {} unique sequences'.format(len(vdjs)))
        # Analyzed sequences are kept apart from the pending raw reads.
        # With a shared dict, the IDs of a successfully analyzed read could
        # be merged into a read that is still pending and end up reported
        # as a no-result if that pending read later fails.
        aligned = {}
        # Attempt to align all unique sequences
        for sequence in funcs.periodic_commit(self._session,
                                              sorted(vdjs.keys())):
            vdj = vdjs.pop(sequence)
            try:
                # If the analyzed sequence was already seen, only its IDs
                # are appended; otherwise it becomes a new unique sequence.
                vdj.analyze()
                if vdj.sequence in aligned:
                    aligned[vdj.sequence].ids += vdj.ids
                else:
                    aligned[vdj.sequence] = vdj
            except AlignmentException as e:
                add_as_noresult(self._session, vdj, sample, str(e))
            except Exception:
                # Deliberately broad so one bad read cannot abort the
                # sample, but KeyboardInterrupt / SystemExit still propagate
                self.error(
                    '\tUnexpected error processing sequence {}\n\t{}'.format(
                        vdj.ids[0], traceback.format_exc()))
        if aligned:
            total = float(len(aligned))
            avg_len = sum(v.v_length for v in aligned.values()) / total
            avg_mut = sum(v.mutation_fraction
                          for v in aligned.values()) / total
            sample.v_ties_mutations = avg_mut
            sample.v_ties_len = avg_len

            self.info('\tRe-aligning {} sequences to V-ties, Mutations={}, '
                      'Length={}'.format(len(aligned), round(avg_mut, 2),
                                         round(avg_len, 2)))
            add_uniques(self._session, sample, aligned.values(), avg_len,
                        avg_mut, self._min_similarity, self._max_vties,
                        self._trim_to, self._max_padding)

        self._session.commit()
        self.info('Completed sample {}'.format(sample.name))
Exemplo n.º 12
0
def add_sequences_from_sample(session, sample, sequences, props):
    """Stores corrected alignments for the sequences of one sample.

    Every entry of ``sequences`` is a mapping with the keys ``alignment``,
    ``r_type`` and ``pk``.  Entries that fail ``props.validate`` or raise
    ``ValueError`` are skipped.  For ``'NoResult'`` entries the alignment is
    added as a new sequence and the ``NoResult`` row is removed; for
    ``'Sequence'`` entries the existing row is updated with the new values.

    :param Session session: The database session
    :param sample: The sample the sequences belong to
    :param sequences: The corrected entries to store
    :param props: Object providing ``validate(alignment)``

    """
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for entry in periodic_commit(session, sequences):
        alignment = entry['alignment']
        try:
            try:
                props.validate(alignment)
            except AlignmentException:
                continue

            kind = entry['r_type']
            if kind == 'NoResult':
                add_sequences(session, [alignment], sample,
                              error_action='raise')
                session.query(NoResult).filter(
                    NoResult.pk == entry['pk']
                ).delete(synchronize_session=False)
                continue
            if kind != 'Sequence':
                continue

            fields = {
                'partial': alignment.partial,
                'probable_indel_or_misalign': alignment.has_possible_indel,
                'v_gene': format_ties(alignment.v_gene),
                'j_gene': format_ties(alignment.j_gene),
                'num_gaps': alignment.num_gaps,
                'seq_start': alignment.seq_start,
                'v_match': alignment.v_match,
                'v_length': alignment.v_length,
                'j_match': alignment.j_match,
                'j_length': alignment.j_length,
                'removed_prefix': alignment.sequence.removed_prefix_sequence,
                'removed_prefix_qual':
                    alignment.sequence.removed_prefix_quality,
                'v_mutation_fraction': alignment.v_mutation_fraction,
                'pre_cdr3_length': alignment.pre_cdr3_length,
                'pre_cdr3_match': alignment.pre_cdr3_match,
                'post_cdr3_length': alignment.post_cdr3_length,
                'post_cdr3_match': alignment.post_cdr3_match,
                'in_frame': alignment.in_frame,
                'functional': alignment.functional,
                'stop': alignment.stop,
                'cdr3_nt': alignment.cdr3,
                'cdr3_num_nts': len(alignment.cdr3),
                'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),
                'sequence': str(alignment.sequence.sequence),
                'quality': alignment.sequence.quality,
                'locally_aligned': alignment.locally_aligned,
                '_insertions': serialize_gaps(alignment.insertions),
                '_deletions': serialize_gaps(alignment.deletions),
                'germline': alignment.germline,
            }
            # Building the model object does not touch the DB; it only
            # validates the field values before the UPDATE is issued
            Sequence(**fields)

            target = session.query(Sequence).filter(
                Sequence.ai == entry['pk'])
            target.update(fields, synchronize_session=False)
        except ValueError:
            continue