from collections import OrderedDict
import traceback

# Project-internal names used below (funcs, lookups, dnautils, logger, the
# AlignmentException class, and the ORM models Sequence, NoResult and
# DuplicateSequence) are assumed to be imported elsewhere in this module.


def aggregate_vties(aggregate_queue):
    bucketed_seqs = {
        'success': {},
        'noresult': []
    }
    for result in aggregate_queue:
        if result['status'] == 'success':
            alignment = result['alignment']
            bucket_key = (
                funcs.format_ties(alignment.v_gene),
                funcs.format_ties(alignment.j_gene),
                len(alignment.cdr3)
            )
            bucket = bucketed_seqs['success'].setdefault(bucket_key, {})
            if alignment.sequence.sequence in bucket:
                # Identical sequence already in this bucket; merge its copies
                bucket[alignment.sequence.sequence].sequence.copy_number += (
                    alignment.sequence.copy_number
                )
            else:
                bucket[alignment.sequence.sequence] = alignment
        elif result['status'] == 'noresult':
            bucketed_seqs['noresult'].append(result)
        elif result['status'] == 'error':
            logger.error(
                'Unexpected error processing sequence {}'.format(
                    result['alignment'].sequence.seq_id))

    # Flatten each bucket to its unique alignments
    bucketed_seqs['success'] = [
        b.values() for b in bucketed_seqs['success'].values()
    ]
    return bucketed_seqs
def add_sequences_from_sample(session, sample, sequences, props):
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for sequence in periodic_commit(session, sequences):
        alignment = sequence['alignment']
        try:
            try:
                props.validate(alignment)
            except AlignmentException:
                continue
            if sequence['r_type'] == 'NoResult':
                add_sequences(session, [alignment], sample,
                              error_action='raise')
                session.query(NoResult).filter(
                    NoResult.pk == sequence['pk']
                ).delete(synchronize_session=False)
            elif sequence['r_type'] == 'Sequence':
                fields = {
                    'partial': alignment.partial,
                    'probable_indel_or_misalign':
                        alignment.has_possible_indel,
                    'v_gene': format_ties(alignment.v_gene),
                    'j_gene': format_ties(alignment.j_gene),
                    'num_gaps': alignment.num_gaps,
                    'seq_start': alignment.seq_start,
                    'v_match': alignment.v_match,
                    'v_length': alignment.v_length,
                    'j_match': alignment.j_match,
                    'j_length': alignment.j_length,
                    'removed_prefix':
                        alignment.sequence.removed_prefix_sequence,
                    'removed_prefix_qual':
                        alignment.sequence.removed_prefix_quality,
                    'v_mutation_fraction': alignment.v_mutation_fraction,
                    'pre_cdr3_length': alignment.pre_cdr3_length,
                    'pre_cdr3_match': alignment.pre_cdr3_match,
                    'post_cdr3_length': alignment.post_cdr3_length,
                    'post_cdr3_match': alignment.post_cdr3_match,
                    'in_frame': alignment.in_frame,
                    'functional': alignment.functional,
                    'stop': alignment.stop,
                    'cdr3_nt': alignment.cdr3,
                    'cdr3_num_nts': len(alignment.cdr3),
                    'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),
                    'sequence': str(alignment.sequence.sequence),
                    'quality': alignment.sequence.quality,
                    'locally_aligned': alignment.locally_aligned,
                    '_insertions': serialize_gaps(alignment.insertions),
                    '_deletions': serialize_gaps(alignment.deletions),
                    'germline': alignment.germline
                }
                # This line doesn't actually add anything to the DB; it's just
                # to validate the fields.
                Sequence(**fields)
                session.query(Sequence).filter(
                    Sequence.ai == sequence['pk']
                ).update(fields, synchronize_session=False)
        except ValueError:
            continue
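# The Sequence(**fields) call above is a validate-then-update pattern: the ORM
# constructor is used only to run field validation, and the actual write is a
# bulk UPDATE keyed on the primary key. A minimal, self-contained sketch of
# that idea follows, using a hypothetical Record class in place of Sequence;
# the field names and checks here are illustrative, not the project's.
class Record(object):
    def __init__(self, cdr3_num_nts=None, functional=None, **extra):
        # The constructor doubles as the validator: reject impossible values
        # before any UPDATE is issued.
        if cdr3_num_nts is not None and cdr3_num_nts < 0:
            raise ValueError('cdr3_num_nts must be non-negative')
        if functional not in (None, True, False):
            raise ValueError('functional must be a boolean')
        self.cdr3_num_nts = cdr3_num_nts
        self.functional = functional


def update_record(table, pk, fields):
    """Validate ``fields`` by constructing a throwaway Record, then apply
    them as a plain dict update keyed on ``pk`` (a stand-in for the
    session.query(...).update(...) call above)."""
    Record(**fields)  # raises ValueError on bad fields; instance is discarded
    table[pk].update(fields)


# Toy usage: ``fake_table`` plays the role of the Sequence table.
fake_table = {17: {'cdr3_num_nts': 0, 'functional': None}}
update_record(fake_table, 17, {'cdr3_num_nts': 45, 'functional': True})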
def add_uniques(session, sample, vdjs, realign_len=None, realign_mut=None,
                min_similarity=0, max_vties=50, trim_to=None,
                max_padding=None):
    bucketed_seqs = OrderedDict()
    vdjs = sorted(vdjs, key=lambda v: v.ids[0])
    for vdj in funcs.periodic_commit(session, vdjs):
        try:
            if realign_len is not None:
                vdj.align_to_germline(realign_len, realign_mut, trim_to)
            if vdj.v_match / float(vdj.v_length) < min_similarity:
                raise AlignmentException(
                    'V-identity too low {} < {}'.format(
                        vdj.v_match / float(vdj.v_length), min_similarity))
            if len(vdj.v_gene) > max_vties:
                raise AlignmentException('Too many V-ties {} > {}'.format(
                    len(vdj.v_gene), max_vties))
            if max_padding is not None and vdj.pad_length > max_padding:
                raise AlignmentException(
                    'Too much padding {} (max {})'.format(
                        vdj.pad_length, max_padding))

            bucket_key = (funcs.format_ties(vdj.v_gene,
                                            vdj.v_germlines.prefix,
                                            strip_alleles=True),
                          funcs.format_ties(vdj.j_gene,
                                            vdj.j_germlines.prefix,
                                            strip_alleles=True),
                          len(vdj.cdr3))
            if bucket_key not in bucketed_seqs:
                bucketed_seqs[bucket_key] = {}
            bucket = bucketed_seqs[bucket_key]

            if vdj.sequence in bucket:
                bucket[vdj.sequence].ids += vdj.ids
            else:
                bucket[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(session, vdj, sample, str(e))
        except Exception:
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(vdj.ids[0],
                                           traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    for bucket, sequences in funcs.periodic_commit(
            session, bucketed_seqs.iteritems()):
        sequences = sorted(sequences.values(),
                           key=lambda s: (len(s.ids), s.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]
                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.ids += smaller.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()
def aggregate_results(results, session, sample):
    alignments = {}
    success = [r for r in results if r['status'] == 'success']
    noresults = [r for r in results if r['status'] == 'noresult']
    logger.info('{} total sequences ({} alignments, {} noresults)'.format(
        len(results), len(success), len(noresults)))

    for result in success:
        alignment = result['alignment']
        key = (funcs.format_ties(alignment.v_gene),
               funcs.format_ties(alignment.j_gene),
               alignment.cdr3_num_nts,
               tuple(alignment.insertions),
               tuple(alignment.deletions))
        alignments.setdefault(key, []).append(alignment)

    copies = 0
    for result in noresults:
        orig_id = result['vdj'].seq_id
        copies += result['vdj'].copy_number
        for i in range(result['vdj'].copy_number):
            result['vdj'].seq_id = '{}_{}'.format(orig_id, i)
            add_noresults_for_vdj(session, result['vdj'], sample,
                                  result['reason'])
        if copies % 1000 == 0:
            session.commit()
    session.commit()
    return alignments
def add_as_sequence(session, alignment, sample, error_action='discard'):
    try:
        seq = Sequence(
            seq_id=alignment.sequence.ids[0],
            sample_id=sample.id,
            subject_id=sample.subject.id,
            partial=alignment.partial,
            probable_indel_or_misalign=alignment.has_possible_indel,
            v_gene=funcs.format_ties(alignment.v_gene),
            j_gene=funcs.format_ties(alignment.j_gene),
            num_gaps=alignment.num_gaps,
            seq_start=alignment.seq_start,
            v_match=alignment.v_match,
            v_length=alignment.v_length,
            j_match=alignment.j_match,
            j_length=alignment.j_length,
            removed_prefix=alignment.sequence.removed_prefix_sequence,
            removed_prefix_qual=alignment.sequence.removed_prefix_quality,
            v_mutation_fraction=alignment.v_mutation_fraction,
            pre_cdr3_length=alignment.pre_cdr3_length,
            pre_cdr3_match=alignment.pre_cdr3_match,
            post_cdr3_length=alignment.post_cdr3_length,
            post_cdr3_match=alignment.post_cdr3_match,
            in_frame=alignment.in_frame,
            functional=alignment.functional,
            stop=alignment.stop,
            copy_number=len(alignment.sequence.ids),
            cdr3_nt=alignment.cdr3,
            cdr3_num_nts=len(alignment.cdr3),
            cdr3_aa=lookups.aas_from_nts(alignment.cdr3),
            sequence=str(alignment.sequence.sequence),
            quality=alignment.sequence.quality,
            locally_aligned=alignment.locally_aligned,
            insertions=alignment.insertions,
            deletions=alignment.deletions,
            germline=alignment.germline)
        session.add(seq)
        session.flush()

        # Add duplicate sequences
        try:
            session.bulk_save_objects([
                DuplicateSequence(sample_id=sample.id,
                                  seq_id=seq_id,
                                  duplicate_seq_ai=seq.ai)
                for seq_id in alignment.sequence.ids[1:]
            ])
        except ValueError:
            pass

        return seq
    except ValueError as e:
        if error_action == 'discard':
            add_as_noresult(session, alignment.sequence, sample, str(e))
            return None
        elif error_action == 'raise':
            raise e
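# The add/flush/bulk_save_objects sequence above relies on flush() assigning
# the parent row's autoincrement key (seq.ai) before the duplicate rows are
# bulk-inserted. A minimal sketch of that SQLAlchemy pattern, assuming
# hypothetical toy models (Parent/Child) and an in-memory SQLite engine rather
# than the project's Sequence/DuplicateSequence models:
from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Parent(Base):
    __tablename__ = 'parent'
    id = Column(Integer, primary_key=True)  # plays the role of Sequence.ai
    name = Column(String)


class Child(Base):
    __tablename__ = 'child'
    id = Column(Integer, primary_key=True)
    parent_id = Column(Integer, ForeignKey('parent.id'))
    seq_id = Column(String)


def demo_flush_then_bulk_insert():
    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    with Session(engine) as session:
        parent = Parent(name='read-0')
        session.add(parent)
        session.flush()  # assigns parent.id without committing
        # bulk_save_objects skips per-object ORM bookkeeping, which is fine
        # for simple duplicate rows that only need the parent's key.
        session.bulk_save_objects([
            Child(parent_id=parent.id, seq_id='read-{}'.format(i))
            for i in range(1, 4)
        ])
        session.commit()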
def add_uniques(session, sample, alignments, props, aligner, realign_len=None,
                realign_mut=None):
    bucketed_seqs = OrderedDict()
    alignments = sorted(alignments, key=lambda v: v.sequence.ids[0])
    for alignment in funcs.periodic_commit(session, alignments):
        try:
            if realign_len is not None:
                aligner.align_to_germline(alignment, realign_len, realign_mut)
            if props.trim_to:
                alignment.trim_to(props.trim_to)
            props.validate(alignment)

            bucket_key = (funcs.format_ties(alignment.v_gene),
                          funcs.format_ties(alignment.j_gene),
                          len(alignment.cdr3))
            if bucket_key not in bucketed_seqs:
                bucketed_seqs[bucket_key] = {}
            bucket = bucketed_seqs[bucket_key]

            if alignment.sequence.sequence in bucket:
                bucket[alignment.sequence.sequence].sequence.ids += (
                    alignment.sequence.ids)
            else:
                bucket[alignment.sequence.sequence] = alignment
        except AlignmentException as e:
            add_as_noresult(session, alignment.sequence, sample, str(e))
        except Exception:
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(alignment.sequence.ids[0],
                                           traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    for bucket, sequences in funcs.periodic_commit(
            session, bucketed_seqs.items()):
        sequences = sorted(sequences.values(),
                           key=lambda s: (len(s.sequence.ids),
                                          s.sequence.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]
                if dnautils.equal(larger.sequence.sequence,
                                  smaller.sequence.sequence):
                    larger.sequence.ids += smaller.sequence.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()
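# The final loop in add_uniques greedily collapses sequences that differ only
# at N positions, folding lower-copy reads into the highest-copy one. A
# minimal, self-contained sketch of that collapse, assuming a stand-in
# equal_except_ns() in place of dnautils.equal():
def equal_except_ns(a, b):
    # Hypothetical stand-in: sequences match if every position is equal or
    # either side is an N (equal lengths assumed, as for padded alignments).
    return len(a) == len(b) and all(
        x == y or 'N' in (x, y) for x, y in zip(a, b))


def collapse_bucket(seq_to_ids):
    """Collapse a {sequence: [ids]} bucket, largest copy count first."""
    remaining = sorted(seq_to_ids.items(),
                       key=lambda kv: (len(kv[1]), kv[1][0]),
                       reverse=True)
    collapsed = []
    while remaining:
        seq, ids = remaining.pop(0)
        kept = []
        for other_seq, other_ids in remaining:
            if equal_except_ns(seq, other_seq):
                ids = ids + other_ids  # absorb the smaller entry's ids
            else:
                kept.append((other_seq, other_ids))
        remaining = kept
        collapsed.append((seq, ids))
    return collapsed


# Toy usage: the N-containing read is folded into the exact read.
collapse_bucket({'ACGT': ['r1', 'r2'], 'ACNT': ['r3'], 'TTTT': ['r4']})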
def add_as_sequence(session, vdj, sample):
    try:
        seq = Sequence(seq_id=vdj.ids[0],
                       sample_id=sample.id,
                       subject_id=sample.subject.id,
                       partial=vdj.partial,
                       probable_indel_or_misalign=vdj.has_possible_indel,
                       v_gene=funcs.format_ties(vdj.v_gene,
                                                vdj.v_germlines.prefix,
                                                strip_alleles=True),
                       j_gene=funcs.format_ties(vdj.j_gene,
                                                vdj.j_germlines.prefix,
                                                strip_alleles=True),
                       num_gaps=vdj.num_gaps,
                       pad_length=vdj.pad_length,
                       v_match=vdj.v_match,
                       v_length=vdj.v_length,
                       j_match=vdj.j_match,
                       j_length=vdj.j_length,
                       removed_prefix=vdj.removed_prefix,
                       removed_prefix_qual=vdj.removed_prefix_qual,
                       v_mutation_fraction=vdj.mutation_fraction,
                       pre_cdr3_length=vdj.pre_cdr3_length,
                       pre_cdr3_match=vdj.pre_cdr3_match,
                       post_cdr3_length=vdj.post_cdr3_length,
                       post_cdr3_match=vdj.post_cdr3_match,
                       in_frame=vdj.in_frame,
                       functional=vdj.functional,
                       stop=vdj.stop,
                       copy_number=len(vdj.ids),
                       cdr3_nt=vdj.cdr3,
                       cdr3_num_nts=len(vdj.cdr3),
                       cdr3_aa=lookups.aas_from_nts(vdj.cdr3),
                       sequence=str(vdj.sequence),
                       quality=vdj.quality,
                       germline=vdj.germline)
        session.add(seq)
        session.flush()

        # Add duplicate sequences
        try:
            session.bulk_save_objects([
                DuplicateSequence(sample_id=sample.id,
                                  seq_id=seq_id,
                                  duplicate_seq_ai=seq.ai)
                for seq_id in vdj.ids[1:]
            ])
        except ValueError:
            pass
    except ValueError as e:
        add_as_noresult(session, vdj, sample, str(e))
def get_seq_from_alignment(session, alignment, sample, strip_alleles=True):
    try:
        return [
            Sequence(
                seq_id=alignment.sequence.seq_id,
                sample_id=sample.id,
                subject_id=sample.subject.id,
                partial=alignment.partial,
                rev_comp=alignment.sequence.rev_comp,
                probable_indel_or_misalign=alignment.has_possible_indel,
                v_gene=funcs.format_ties(alignment.v_gene, strip_alleles),
                j_gene=funcs.format_ties(alignment.j_gene, strip_alleles),
                num_gaps=alignment.num_gaps,
                seq_start=alignment.seq_start,
                v_match=alignment.v_match,
                v_length=alignment.v_length,
                j_match=alignment.j_match,
                j_length=alignment.j_length,
                removed_prefix=alignment.sequence.removed_prefix_sequence,
                removed_prefix_qual=alignment.sequence.removed_prefix_quality,
                v_mutation_fraction=alignment.v_mutation_fraction,
                pre_cdr3_length=alignment.pre_cdr3_length,
                pre_cdr3_match=alignment.pre_cdr3_match,
                post_cdr3_length=alignment.post_cdr3_length,
                post_cdr3_match=alignment.post_cdr3_match,
                in_frame=alignment.in_frame,
                functional=alignment.functional,
                stop=alignment.stop,
                copy_number=alignment.sequence.copy_number,
                cdr3_nt=alignment.cdr3,
                cdr3_num_nts=len(alignment.cdr3),
                cdr3_aa=lookups.aas_from_nts(alignment.cdr3),
                sequence=str(alignment.sequence.sequence),
                quality=alignment.sequence.quality,
                locally_aligned=alignment.locally_aligned,
                insertions=alignment.insertions,
                deletions=alignment.deletions,
                germline=alignment.germline)
        ]
    except ValueError as e:
        try:
            return [
                get_noresult_from_vdj(session, alignment.sequence, sample,
                                      str(e))
            ]
        except ValueError:
            return []
def aggregate_results(results, session, sample):
    alignments = {}
    for result in results:
        if result['status'] == 'success':
            alignment = result['alignment']
            key = (
                funcs.format_ties(alignment.v_gene),
                funcs.format_ties(alignment.j_gene),
                alignment.cdr3_num_nts,
                tuple(alignment.insertions),
                tuple(alignment.deletions)
            )
            alignments.setdefault(key, []).append(alignment)
        elif result['status'] == 'noresult':
            add_noresults_for_vdj(session, result['vdj'], sample,
                                  result['reason'])
    session.commit()
    return alignments
def get_formatted_ties(genes):
    res = {}
    for ties, seq in genes.iteritems():
        res[format_ties(ties)] = seq
    return res
def get_formatted_ties(genes):
    res = {}
    for ties, seq in genes.items():
        res[format_ties(ties)] = seq
    return res