def collapse_identical(session, buckets):
    for i, bucket in enumerate(buckets):
        clones = session.query(Clone.id, Clone.cdr3_aa).filter(
            Clone.subject_id == bucket.subject_id,
            Clone.v_gene == bucket.v_gene,
            Clone.j_gene == bucket.j_gene,
            Clone.cdr3_num_nts == bucket.cdr3_num_nts,
            Clone._insertions == bucket._insertions,
            Clone._deletions == bucket._deletions,
        )
        if clones.count() < 2:
            continue
        logger.info('Reducing bucket {} / {} ({} clones)'.format(
            i, len(buckets), clones.count()))

        uniques = {}
        for c in clones:
            uniques.setdefault(c.cdr3_aa, []).append(c.id)
        uniques = [sorted(u) for u in uniques.values() if len(u) > 1]

        if len(uniques) > 1:
            logger.info('Collapsing {} duplicate CDR3s'.format(len(uniques)))
        for identical in uniques:
            rep_id = identical[0]
            session.query(Sequence).filter(
                Sequence.clone_id.in_(identical)).update(
                {'clone_id': rep_id}, synchronize_session=False)
            session.query(Clone).filter(Clone.id.in_(
                identical[1:])).delete(synchronize_session=False)
        session.commit()
def aggregate_results(results, session, sample):
    alignments = {}
    success = [r for r in results if r['status'] == 'success']
    noresults = [r for r in results if r['status'] == 'noresult']
    logger.info('{} total sequences ({} alignments, {} noresults)'.format(
        len(results), len(success), len(noresults)))

    for result in success:
        alignment = result['alignment']
        key = (funcs.format_ties(alignment.v_gene),
               funcs.format_ties(alignment.j_gene),
               alignment.cdr3_num_nts,
               tuple(alignment.insertions),
               tuple(alignment.deletions))
        alignments.setdefault(key, []).append(alignment)

    copies = 0
    for result in noresults:
        orig_id = result['vdj'].seq_id
        copies += result['vdj'].copy_number
        for i in range(result['vdj'].copy_number):
            result['vdj'].seq_id = '{}_{}'.format(orig_id, i)
            add_noresults_for_vdj(session, result['vdj'], sample,
                                  result['reason'])
        if copies % 1000 == 0:
            session.commit()
    session.commit()
    return alignments
def write_pooled_clones(session, out_format, sample_ids=None,
                        pool_on=('sample',), zipped=False, **kwargs):
    # Samples and subjects can't be combined with other features
    exclusives = set(pool_on).intersection(set(('sample', 'subject')))
    if len(pool_on) > 1 and exclusives:
        pool_on = (list(exclusives)[0],)
        logger.warning('You specified pooling on {feat} which '
                       'cannot be combined with other features.'
                       ' Using only {feat}.'.format(feat=pool_on[0]))

    logger.info('Writing clones pooled by {} in {} format'.format(
        ','.join(pool_on), out_format))
    sample_ids = sample_ids or [s.id for s in session.query(Sample)]
    aggregated = get_pooled_samples(session, sample_ids, pool_on)

    output_func = {
        'immunedb': get_immunedb_output,
        'vdjtools': get_vdjtools_output
    }[out_format]

    with ExportWriter(zipped=zipped) as fh:
        for (subject, feature_value), clones in aggregated.items():
            logger.info('Pooling subject {} for feature(s) {}'.format(
                subject, ','.join(feature_value)))
            fh.set_filename(get_filename(subject, pool_on, feature_value))
            fh.write(output_func(session, clones))
        return fh.get_zip_value()
def write_vdjtools(session, args):
    clone_features = get_clone_features(session)
    for sample in session.query(Sample):
        logger.info('Exporting VDJTools format for sample {}'.format(
            sample.name))
        write_tsv('{}.sample.txt'.format(sample.name), get_sample_vdjtools,
                  session, sample, args.min_clone_size, clone_features)
def write_selection(session, sample_ids=None, filter_type='both',
                    zipped=False, **kwargs):
    logger.info('Exporting selection pressure')
    with ExportWriter(zipped=zipped) as fh:
        fh.set_filename('selection_pressure.tsv')
        fh.write(get_selection(session, filter_type, sample_ids))
        return fh.get_zip_value()
def write_samples(session, sample_ids=None, for_update=False, zipped=False,
                  **kwargs):
    logger.info('Exporting samples')
    with ExportWriter(zipped=zipped) as fh:
        fh.set_filename('samples.tsv')
        fh.write(get_samples(session, for_update, sample_ids))
        return fh.get_zip_value()
def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    seqs = session.query(Sequence).filter(
        Sequence.locally_aligned.is_(True),
        Sequence.sample_id == sample.id
    ).order_by(Sequence.ai)

    for seq in seqs:
        potential_collapse = session.query(Sequence).filter(
            Sequence.sample_id == sample.id,
            Sequence.v_gene == seq.v_gene,
            Sequence.j_gene == seq.j_gene,
            Sequence.cdr3_num_nts == seq.cdr3_num_nts,
        ).order_by(desc(Sequence.copy_number), Sequence.ai)

        for other_seq in potential_collapse:
            if (other_seq.seq_id == seq.seq_id or
                    len(other_seq.sequence) != len(seq.sequence)):
                continue
            if dnautils.equal(other_seq.sequence, seq.sequence):
                other_seq.copy_number += seq.copy_number
                session.delete(seq)
                break
    session.commit()
def run_import(session, args, remaps=None):
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)

    study, new = funcs.get_or_create(session, Study, name=args.study_name)
    if new:
        logger.info('Created new study "{}"'.format(study.name))
        session.commit()

    sample, new = funcs.get_or_create(session, Sample, name=args.sample_name,
                                      study=study)
    if new:
        sample.date = args.date
        logger.info('Created new sample "{}"'.format(sample.name))
        for key in ('subset', 'tissue', 'disease', 'lab', 'experimenter',
                    'ig_class', 'v_primer', 'j_primer'):
            setattr(sample, key, vars(args).get(key, None))
        subject, new = funcs.get_or_create(session, Subject, study=study,
                                           identifier=args.subject)
        sample.subject = subject
        session.commit()
    else:
        logger.error('Sample "{}" already exists'.format(args.sample_name))
        return

    with open(args.input_file) as fh:
        read_file(session, fh, sample, v_germlines, j_germlines, args, remaps)
def run_clearcut(session, args):
    if args.clone_ids is not None:
        clones = session.query(Clone.id).filter(
            Clone.id.in_(args.clone_ids))
    else:
        if args.subject_ids is not None:
            clones = session.query(Clone.id).filter(
                Clone.subject_id.in_(args.subject_ids))
        else:
            clones = session.query(Clone.id)
        if not args.force:
            clones = clones.filter(Clone.tree.is_(None))
    clones = [c.id for c in clones]

    mod_log.make_mod('clone_tree', session=session, commit=True,
                     info=vars(args))

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue for clones')
    for clone_id in clones:
        tasks.add_task(clone_id)

    for _ in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(LineageWorker(
            session, get_newick, args.min_mut_copies, args.min_mut_samples,
            args.min_seq_copies, args.min_seq_samples, args.exclude_stops,
            args.full_seq, post_tree_hook=minimize_tree))
    tasks.start()
def create_sample(session, metadata):
    study, new = funcs.get_or_create(
        session, Study, name=metadata['study_name'])
    if new:
        logger.info('Created new study "{}"'.format(study.name))
        session.commit()

    sample, new = funcs.get_or_create(
        session, Sample, name=metadata['sample_name'], study=study)
    if new:
        logger.info('Created new sample "{}"'.format(sample.name))
        for key, value in metadata.items():
            if key not in REQUIRED_FIELDS:
                session.add(SampleMetadata(
                    sample=sample,
                    key=key,
                    value=value
                ))
        subject, new = funcs.get_or_create(
            session, Subject, study=study, identifier=metadata['subject'])
        sample.subject = subject
        session.commit()
    else:
        logger.error(
            'Sample "{}" already exists'.format(metadata['sample_name']))
        return
    return sample
def add_sequences_from_sample(session, sample, sequences, props):
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for sequence in periodic_commit(session, sequences):
        alignment = sequence['alignment']
        try:
            try:
                props.validate(alignment)
            except AlignmentException:
                continue
            if sequence['r_type'] == 'NoResult':
                add_sequences(session, [alignment], sample,
                              error_action='raise')
                session.query(NoResult).filter(
                    NoResult.pk == sequence['pk']).delete(
                    synchronize_session=False)
            elif sequence['r_type'] == 'Sequence':
                fields = {
                    'partial': alignment.partial,
                    'probable_indel_or_misalign':
                        alignment.has_possible_indel,
                    'v_gene': format_ties(alignment.v_gene),
                    'j_gene': format_ties(alignment.j_gene),
                    'num_gaps': alignment.num_gaps,
                    'seq_start': alignment.seq_start,
                    'v_match': alignment.v_match,
                    'v_length': alignment.v_length,
                    'j_match': alignment.j_match,
                    'j_length': alignment.j_length,
                    'removed_prefix':
                        alignment.sequence.removed_prefix_sequence,
                    'removed_prefix_qual':
                        alignment.sequence.removed_prefix_quality,
                    'v_mutation_fraction': alignment.v_mutation_fraction,
                    'pre_cdr3_length': alignment.pre_cdr3_length,
                    'pre_cdr3_match': alignment.pre_cdr3_match,
                    'post_cdr3_length': alignment.post_cdr3_length,
                    'post_cdr3_match': alignment.post_cdr3_match,
                    'in_frame': alignment.in_frame,
                    'functional': alignment.functional,
                    'stop': alignment.stop,
                    'cdr3_nt': alignment.cdr3,
                    'cdr3_num_nts': len(alignment.cdr3),
                    'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),
                    'sequence': str(alignment.sequence.sequence),
                    'quality': alignment.sequence.quality,
                    'locally_aligned': alignment.locally_aligned,
                    '_insertions': serialize_gaps(alignment.insertions),
                    '_deletions': serialize_gaps(alignment.deletions),
                    'germline': alignment.germline
                }
                # This line doesn't actually add anything to the DB; it's
                # just to validate the fields.
                Sequence(**fields)
                session.query(Sequence).filter(
                    Sequence.ai == sequence['pk']).update(
                    fields, synchronize_session=False)
        except ValueError:
            continue
def collapse_duplicates(bucket):
    use_heuristic = len(bucket) > 10000
    bucket = sorted(bucket, key=lambda s: s.sequence.copy_number,
                    reverse=True)

    if use_heuristic:
        uniques = {}
        for alignment in bucket:
            if alignment.cdr3 not in uniques:
                uniques[alignment.cdr3] = alignment
            else:
                uniques[alignment.cdr3].sequence.copy_number += (
                    alignment.sequence.copy_number)
        logger.info(
            'Bucket {v_gene} {j_gene} {cdr3_num_nts} had {cnt} '
            'sequences. Used heuristic to reduce to {new_cnt}.'.format(
                v_gene=[g.name for g in bucket[0].v_gene],
                j_gene=[g.name for g in bucket[0].j_gene],
                cdr3_num_nts=bucket[0].cdr3_num_nts,
                cnt=len(bucket),
                new_cnt=len(uniques)))
        bucket = sorted(uniques.values(),
                        key=lambda s: s.sequence.copy_number, reverse=True)

    return collapse_duplicate_alignments(bucket)
def run_selection_pressure(session, args):
    mod_log.make_mod('clone_pressure', session=session, commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        # Materialize the IDs into a list so they can be sorted and counted
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to calculate selection pressure for {} '
                'clones.'.format(len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(
            SelectionPressureWorker(session, args.baseline_path, args.temp,
                                    args.regen, args.thresholds))

    tasks.start()
def collapse_similar_cdr3s(session, buckets, difference_allowed):
    logger.info('Collapsing similar clones in {} buckets'.format(
        buckets.count()))
    for i, bucket in enumerate(buckets):
        clones = session.query(Clone.id, Clone.cdr3_aa, Clone.cdr3_nt).filter(
            Clone.subject_id == bucket.subject_id,
            Clone.cdr3_num_nts == bucket.cdr3_num_nts,
        ).order_by(Clone.overall_total_cnt.desc())
        if clones.count() < 2:
            continue
        logger.info('Reducing bucket {} / {} ({} clones)'.format(
            i, buckets.count(), clones.count()))

        reduced = {}
        for c in clones:
            for larger_cdr3_nt, others in reduced.items():
                if (dnautils.hamming(larger_cdr3_nt, c.cdr3_nt) <=
                        difference_allowed):
                    others.append(c.id)
                    break
            else:
                reduced[c.cdr3_nt] = [c.id]

        for collapse in reduced.values():
            rep_id, others = collapse[0], collapse[1:]
            session.query(Sequence).filter(
                Sequence.clone_id.in_(others)).update(
                {'clone_id': rep_id}, synchronize_session=False)
            session.query(Clone).filter(
                Clone.id.in_(others)).delete(synchronize_session=False)
        session.commit()
def run_fix_sequences(session, args):
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3)

    indexes = set()
    props = IdentificationProps(**args.__dict__)
    for sample in session.query(Sample):
        sequences = process_sample(session, sample, indexes, args.temp,
                                   v_germlines, j_germlines, args.nproc)
        add_sequences_from_sample(session, sample, sequences, props)
        remove_duplicates(session, sample)

    logger.info('Updating copy numbers')
    session.connection(mapper=Sequence).execute(text('''
        UPDATE sequences SET copy_number = 1 + (
            SELECT COUNT(*)
            FROM duplicate_sequences
            WHERE duplicate_seq_ai = ai
        )
    '''))
def process_data(input_data, process_func, aggregate_func, nproc,
                 generate_args={}, process_args={}, aggregate_args={}):
    if callable(input_data):
        start = time.time()
        input_data = input_data(**generate_args)
        logger.info('Generate time: {}'.format(time.time() - start))

    with mp.Manager() as manager:
        proxy_data = manager.list(input_data)
        pool = mp.Pool(processes=nproc)
        f = functools.partial(
            subcaller,
            functools.partial(process_func, **process_args),
            proxy_data
        )

        start = time.time()
        logger.info('Waiting on pool {}'.format(process_func.__name__))
        res = [r for r in pool.map(f, range(len(proxy_data)))
               if r is not None]
        pool.close()
        logger.info('Pool done: {}'.format(time.time() - start))

    start = time.time()
    logger.info('Waiting on aggregation {}'.format(aggregate_func.__name__))
    ret = aggregate_func(res, **aggregate_args)
    logger.info('Done aggregation: {}'.format(time.time() - start))
    return ret
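# A minimal usage sketch for process_data (not from the original source).
# It shows the calling convention: `input_data` is an iterable (or a callable
# built with `generate_args`), `process_func` is applied to each item in a
# worker process with `process_args`, and `aggregate_func` receives the list
# of non-None results along with `aggregate_args`.  The helper names
# `parse_record` and `summarize` below are hypothetical.
def parse_record(item, scale=1):
    # Runs in a worker process; returning None drops the item.
    return item * scale if item >= 0 else None


def summarize(results, label=''):
    # Runs in the parent process on the collected worker results.
    return {'label': label, 'total': sum(results)}


if __name__ == '__main__':
    summary = process_data(
        [3, -1, 4, 1, 5], parse_record, summarize, nproc=2,
        process_args={'scale': 10}, aggregate_args={'label': 'demo'})
    print(summary)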
def create(main_parser, args):
    if re.search(r'[^A-Za-z0-9_-]', args.db_name) is not None:
        main_parser.error('Database name must only contain letters, numbers, '
                          'dashes and underscores.')

    try:
        conn = _get_root_connection(args.db_host, args.admin_user,
                                    args.admin_pass)

        db_user = args.db_user or args.db_name
        if args.db_pass:
            db_pass = args.db_pass
        else:
            db_pass = ''.join(
                random.choice(string.ascii_uppercase +
                              string.ascii_lowercase + string.digits)
                for _ in range(10))

        with conn.cursor() as cursor:
            logger.info('Creating user "{}"'.format(db_user))
            existing_password = _create_user_if_not_exists(conn, '%', db_user,
                                                           db_pass)
            if existing_password is not None:
                if not args.db_pass:
                    logger.warning(
                        'User {} already exists. To generate the '
                        'configuration file, you must enter its '
                        'password.'.format(db_user)
                    )
                    db_pass = _get_user_pass(conn, args.db_host, db_user,
                                             existing_password)
                else:
                    db_pass = args.db_pass

            logger.info('Creating database "{}"'.format(args.db_name))
            cursor.execute('CREATE DATABASE {}'.format(args.db_name))
            cursor.execute(
                'GRANT ALL PRIVILEGES ON {}.* TO \'{}\'@\'%\''.format(
                    args.db_name, db_user))

        config_path = os.path.join(args.config_dir, '{}.json'.format(
            args.db_name))
        logger.info('Creating config at {}'.format(config_path))
        with open(config_path, 'w+') as fh:
            json.dump({
                'host': args.db_host,
                'database': args.db_name,
                'username': db_user,
                'password': db_pass
            }, fh, sort_keys=True, indent=4, separators=(',', ': '))

        logger.info('Initializing tables')
        config.init_db(config_path)
        logger.info('Success!')
        return True
    except Exception as e:
        logger.error(e)
        return False
def _get_root_connection(host, user, admin_pass=None):
    if admin_pass is None:
        try:
            return _connect(host, user)
        except Exception:
            logger.info('Failed connection with empty root password')
            admin_pass = getpass.getpass('MySQL password for ({}):'.format(
                user))
    return _connect(host, user, admin_pass)
def _get_user_pass(conn, host, user, existing_password):
    with conn.cursor() as cursor:
        while True:
            db_pass = getpass.getpass()
            cursor.execute('SELECT PASSWORD(%s) as password', db_pass)
            if cursor.fetchone()['password'] != existing_password:
                logger.error('Password does not match.')
            else:
                logger.info('Correct password')
                return db_pass
def write_genbank(session, args):
    args.inference = 'alignment:' + args.inference
    header = ('[organism={}] '
              '[moltype={}] '
              '[keywords=AIRR]').format(args.species, args.mol_type)
    for sample in session.query(Sample):
        logger.info('Exporting sample {}'.format(sample.name))
        _write_sample(session, sample.id, args.gene_db, args.inference,
                      header)
def preprocess_airr(reader):
    logger.info('Collapsing identical sequences')
    seen = {}
    for line in reader:
        if line['sequence_alignment'] in seen:
            seen[line['sequence_alignment']]['copy_number'] += 1
        else:
            line['copy_number'] = 1
            seen[line['sequence_alignment']] = line
    return sorted(seen.values(), key=lambda s: s['sequence_id'])
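# A minimal usage sketch for preprocess_airr (not from the original source).
# It expects an iterable of dict-like rows containing at least the
# 'sequence_alignment' and 'sequence_id' AIRR fields, such as a
# csv.DictReader over a tab-delimited rearrangement file.  The file name
# below is hypothetical.
import csv

with open('rearrangements.tsv') as fh:
    rows = preprocess_airr(csv.DictReader(fh, delimiter='\t'))
print('{} unique sequences after collapsing'.format(len(rows)))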
def run_collapse(session, args):
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))
    subject_ids = []

    subjects = (args.subject_ids or
                [e.id for e in session.query(Subject.id).all()])
    for subject in subjects:
        if session.query(Sample).filter(
                Sample.subject_id == subject,
                ~exists().where(
                    SequenceCollapse.sample_id == Sample.id
                )).first() is None:
            logger.info('Subject {} already collapsed. Skipping.'.format(
                subject))
        else:
            logger.info('Resetting collapse info for subject {}'.format(
                subject))
            samples = session.query(Sample).filter(
                Sample.subject_id == subject
            ).all()
            for sample in samples:
                session.query(SequenceCollapse).filter(
                    SequenceCollapse.sample_id == sample.id
                ).delete(synchronize_session=False)
            logger.info('Resetting clone info for subject {}'.format(subject))
            session.query(Clone).filter(Clone.subject_id == subject).delete()
            subject_ids.append(subject)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(subject_ids)))

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
def run_collapse(session, args):
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))
    subject_ids = []

    subjects = (args.subject_ids or
                [e.id for e in session.query(Subject.id)])
    for subject in subjects:
        if session.query(Sample).filter(
                Sample.subject_id == subject,
                ~exists().where(
                    SequenceCollapse.sample_id == Sample.id
                )).first() is None:
            logger.info('Subject {} already collapsed. Skipping.'.format(
                subject))
        else:
            logger.info('Resetting collapse info for subject {}'.format(
                subject))
            samples = session.query(Sample).filter(
                Sample.subject_id == subject
            )
            for sample in samples:
                session.query(SequenceCollapse).filter(
                    SequenceCollapse.sample_id == sample.id
                ).delete(synchronize_session=False)
                sample.sample_stats = []
            logger.info('Resetting clone info for subject {}'.format(subject))
            session.query(Clone).filter(Clone.subject_id == subject).delete()
            subject_ids.append(subject)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(subject_ids)))

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
def export_genbank(session, args):
    args.inference = 'alignment:' + args.inference
    header = ('[organism={}] '
              '[moltype={}] '
              '[keywords=AIRR]').format(args.species, args.mol_type)
    samples = args.sample_ids or [
        s.id for s in session.query(Sample.id).all()]
    for sample_id in samples:
        logger.info('Exporting sample {}'.format(sample_id))
        export_sample_genbank(session, sample_id, args.gene_db,
                              args.inference, header)
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        )
        for bucket in buckets:
            if not args.gene or bucket.v_gene.startswith(args.gene):
                tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'lineage': LineageClonalWorker,
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')
    push_clone_ids(session)
    session.commit()
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        # Materialize the IDs into a list; a lazy map object would be
        # exhausted by the first query that iterates it.
        subject_ids = [s.id for s in session.query(Subject.id).all()]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'tcells': TCellClonalWorker,
        'lineage': LineageClonalWorker,
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')
    push_clone_ids(session)
    session.commit()
def write_sequences(session, sample_ids=None, out_format='changeo',
                    clones_only=False, min_subject_copies=None, zipped=False,
                    **kwargs):
    samples = session.query(Sample)
    if sample_ids:
        samples = samples.filter(Sample.id.in_(sample_ids))

    with ExportWriter(zipped=zipped) as fh:
        for sample in samples:
            logger.info('Exporting sample {}'.format(sample.name))
            fh.set_filename('{}.{}.tsv'.format(sample.name, out_format))
            fh.write(
                get_sequences(session, sample, out_format, clones_only,
                              min_subject_copies)
            )
        return fh.get_zip_value()
def write_clone_overlap(session, sample_ids=None, pool_on=('sample',),
                        size_metric='copies', sim_func='cosine',
                        agg_func='median', zipped=False, **kwargs):
    samples = session.query(Sample)
    if sample_ids:
        samples = samples.filter(Sample.id.in_(sample_ids))
    sample_instances = {s.id: s for s in samples}

    with ExportWriter(zipped=zipped) as writer:
        for subject in set([s.subject for s in sample_instances.values()]):
            logger.info('Calculating overlap for {}'.format(
                subject.identifier))
            sub_samples = [
                s.id for s in sample_instances.values()
                if s.subject == subject
            ]
            sdf = get_sample_df(session, sub_samples, pool_on, size_metric,
                                getattr(distance, sim_func))
            if sdf.empty:
                logger.warning(
                    'Subject {} had no clones for calculation'.format(
                        subject.identifier))
                continue
            sdf = collapse_df_features(sdf, pool_on, sample_instances,
                                       getattr(np, agg_func))

            name = '{}.overlap'.format(subject.identifier)
            with writer.get_handle(name + '.tsv') as fh:
                sdf.to_csv(fh, sep='\t')

            title_fmt = 'Subject {}\npooled by={}, similarity metric={}'
            if 'sample' not in pool_on:
                title_fmt += ', aggregation function={}'
            fig, ax = plt.subplots(figsize=(20, 20))
            ax = sns.heatmap(sdf, annot=True, linewidths=.25, vmin=0, vmax=1)
            ax.set_title(title_fmt.format(
                subject.identifier, ' & '.join(pool_on), sim_func, agg_func))
            with writer.get_handle(name + '.pdf', 'wb+') as fh:
                plt.savefig(fh, bbox_inches='tight', format='pdf')

        return writer.get_zip_value()
def start_job(self, func, **kwargs):
    uid = str(uuid.uuid4())
    logger.info('Starting job with UUID {}'.format(uid))
    job_func = partial(self._job_wrap, func, uid)
    mp.Process(
        target=job_func,
        kwargs=kwargs
    ).start()
    self.files.extend([
        self.get_path(uid, '.log'),
        self.get_path(uid, '.zip'),
    ])
    return uid
def delete(main_parser, args):
    try:
        with open(args.db_config) as fh:
            db_config = json.load(fh)
        conn = _get_root_connection(db_config['host'], args.admin_user,
                                    args.admin_pass)
        with conn.cursor() as cursor:
            logger.info('Deleting database {}'.format(db_config['database']))
            cursor.execute('DROP DATABASE `{}`'.format(db_config['database']))
            if args.delete_user:
                logger.info('Deleting user {}'.format(db_config['username']))
                cursor.execute('DROP USER `{}`'.format(db_config['username']))
        return True
    except Exception as e:
        logger.error(e)
        return False
def write_sequences(session, args):
    for subject in session.query(Subject):
        logger.info('Exporting subject {}'.format(subject.identifier))
        seqs = session.query(Sequence).filter(
            Sequence.subject_id == subject.id
        )
        if args.clones_only:
            seqs = seqs.filter(~Sequence.clone_id.is_(None))
        if args.min_subject_copies is not None:
            seqs = seqs.filter(
                SequenceCollapse.copy_number_in_subject >=
                args.min_subject_copies
            )
        fn = '{}.{}.tsv'.format(subject.identifier, args.fmt)
        write_tsv(fn, get_sequences, session, seqs, args.fmt)
def run_subclones(session, subject_ids, args):
    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating subclone task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        ).filter(
            Clone.subject_id == subject_id
        ).group_by(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total subclone tasks'.format(tasks.num_tasks()))
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(SubcloneWorker(config.init_db(args.db_config)))
    tasks.start()
def aggregate_collapse(aggregate_queue, db_config, sample_id, props):
    seqs_to_add = []
    session = config.init_db(db_config, create=False)
    sample = session.query(Sample).filter(Sample.id == sample_id).one()

    for alignment in aggregate_queue:
        for seq in alignment:
            seqs_to_add.append(seq)
            if len(seqs_to_add) >= 1000:
                add_sequences(session, seqs_to_add, sample,
                              strip_alleles=not props.genotyping)
                seqs_to_add = []
                session.commit()

    if seqs_to_add:
        add_sequences(session, seqs_to_add, sample,
                      strip_alleles=not props.genotyping)
    logger.info('Finished aggregating sequences')
    session.commit()
    session.close()
def run_subclones(session, subject_ids, args):
    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating subclone task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        ).filter(
            Clone.subject_id == subject_id
        ).group_by(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total subclone tasks'.format(tasks.num_tasks()))
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(SubcloneWorker(config.init_db(args.db_config),
                                        args.similarity))
    tasks.start()
def read_input(path):
    vdjs = []
    parser = SeqIO.parse(path,
                         'fasta' if path.endswith('.fasta') else 'fastq')
    # Collapse identical sequences
    logger.info('Parsing input')
    for record in parser:
        try:
            vdjs.append(VDJSequence(
                seq_id=record.description,
                sequence=str(record.seq),
                quality=funcs.ord_to_quality(
                    record.letter_annotations.get('phred_quality')
                )
            ))
        except ValueError:
            continue
    logger.info('There are {} sequences'.format(len(vdjs)))
    return vdjs
def _queue_tasks(session, sample_id, force, tasks):
    logger.info('Creating task queue to generate stats for sample {}.'.format(
        sample_id))
    existing_seq = session.query(Sequence).filter(
        Sequence.sample_id == sample_id)
    existing_nores = session.query(NoResult).filter(
        NoResult.sample_id == sample_id)
    if existing_seq.first() is None and existing_nores.first() is None:
        logger.warning('\tSKIPPING since there are no sequences in the '
                       'sample')
        return

    existing = session.query(SampleStats.sample_id).filter(
        SampleStats.sample_id == sample_id).first() is not None
    if force and existing:
        logger.warning('\tFORCING regeneration of stats')
    elif not force and existing:
        logger.warning('\tSKIPPING stats since they already exist and the '
                       '--force flag was not specified.')
        return

    min_cdr3, max_cdr3 = _get_cdr3_bounds(session, sample_id)
    for include_outliers in [True, False]:
        for only_full_reads in [True, False]:
            tasks.add_task({
                'func': 'seq',
                'sample_id': sample_id,
                'min_cdr3': min_cdr3,
                'max_cdr3': max_cdr3,
                'include_outliers': include_outliers,
                'only_full_reads': only_full_reads
            })
            tasks.add_task({
                'func': 'clone',
                'sample_id': sample_id,
                'min_cdr3': min_cdr3,
                'max_cdr3': max_cdr3,
                'include_outliers': include_outliers,
                'only_full_reads': only_full_reads
            })
def parse_file(fh, sample, session, alignment_func, props, v_germlines,
               j_germlines, nproc, preprocess_func=None):
    start = time.time()
    reader = csv.DictReader(fh, delimiter='\t')
    if preprocess_func:
        reader = preprocess_func(reader)

    alignments = concurrent.process_data(
        reader,
        process_line,
        aggregate_results,
        nproc,
        process_args={
            'alignment_func': alignment_func,
            'props': props,
            'v_germlines': v_germlines,
            'j_germlines': j_germlines
        },
        aggregate_args={
            'session': session,
            'sample': sample
        }
    )

    concurrent.process_data(
        alignments.values(),
        collapse_duplicates,
        add_results,
        nproc,
        aggregate_args={
            'session': session,
            'sample': sample
        }
    )

    logger.info('Completed sample {} in {}m'.format(
        sample.name, round((time.time() - start) / 60., 1)))
def run_clone_stats(session, args):
    """Runs the clone statistics generation stage of the pipeline.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    mod_log.make_mod('clone_stats', session=session, commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    if args.regen:
        logger.info('Deleting old clone statistics for {} clones'.format(
            len(clones)))
        session.query(CloneStats).filter(
            CloneStats.clone_id.in_(clones)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to generate stats for {} clones.'.format(
        len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(CloneStatsWorker(session))

    tasks.start()
def setup_sample(session, meta):
    study, new = funcs.get_or_create(session, Study, name=meta['study_name'])
    if new:
        logger.info('Created new study "{}"'.format(study.name))
        session.commit()

    name = meta['sample_name']
    sample, new = funcs.get_or_create(session, Sample, name=name, study=study)
    if new:
        subject, new = funcs.get_or_create(
            session, Subject, study=study, identifier=meta['subject'])
        sample.subject = subject
        for key, value in meta.items():
            if key not in REQUIRED_FIELDS:
                session.add(SampleMetadata(
                    sample=sample,
                    key=key,
                    value=value
                ))
    session.commit()
    return sample
def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    all_seqs = session.query(Sequence).filter(
        Sequence.sample == sample
    ).order_by(
        Sequence.copy_number.desc()
    )

    buckets = {}
    for seq in all_seqs:
        key = (seq.v_gene, seq.j_gene, seq.cdr3_num_nts)
        buckets.setdefault(key, []).append(seq)

    for bucket in buckets.values():
        while len(bucket) > 0:
            larger = bucket.pop(0)
            for i in reversed(range(len(bucket))):
                smaller = bucket[i]
                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.copy_number += smaller.copy_number
                    session.delete(smaller)
                    del bucket[i]
    session.commit()
def update_metadata(session, args):
    SENTINEL = '__TEMP'  # Used to temporarily avoid duplicate name issues
    with open(args.new_metadata) as fh:
        reader = csv.DictReader(fh, delimiter='\t')
        new_meta = {row['name']: row for row in reader}

    # delete existing metadata
    sample_ids = {s.name: s.id for s in session.query(Sample).filter(
        Sample.name.in_(new_meta))}
    session.query(SampleMetadata).filter(
        SampleMetadata.sample_id.in_(sample_ids.values())
    ).delete(synchronize_session='fetch')

    ignore_fields = ['name', 'new_name', 'subject', 'file_name']
    for sample_name, row in new_meta.items():
        if sample_name not in sample_ids:
            logger.warning('No sample {} in database. Ignoring.'.format(
                sample_name))
            # Skip samples that do not exist in the database
            continue
        sample_id = sample_ids[sample_name]
        logger.info('Updating metadata for {}'.format(row['name']))
        session.add_all([
            SampleMetadata(sample_id=sample_id, key=k, value=v)
            for k, v in row.items()
            if k not in ignore_fields and v not in NA_VALUES
        ])
        if row['new_name'] != row['name']:
            logger.info(' Updating sample name to {}'.format(row['new_name']))
            session.query(Sample).filter(Sample.name == row['name']).update({
                Sample.name: row['new_name'] + SENTINEL
            })

    logger.info('Verifying uniqueness')
    for sample in session.query(Sample).filter(
            Sample.name.like('%' + SENTINEL)):
        sample.name = sample.name[:-len(SENTINEL)]

    if session.query(Clone.id).filter(~Clone.tree.is_(None)).count() > 0:
        logger.warning('This database has at least one clonal lineage '
                       'constructed. All lineages will need to be updated '
                       'to reflect the modified metadata.')
    session.commit()
def process_sample(session, sample, indexes, temp, v_germlines, j_germlines,
                   nproc):
    indels = session.query(
        Sequence.ai, Sequence.seq_id, Sequence.sample_id, Sequence.sequence
    ).filter(
        Sequence.sample_id == sample.id,
        Sequence.probable_indel_or_misalign == 1
    ).order_by(Sequence.seq_id)
    # Get the sequences that were not identifiable
    noresults = session.query(NoResult).filter(
        NoResult.sample_id == sample.id).order_by(NoResult.seq_id)

    if indels.count() == 0 and noresults.count() == 0:
        logger.info('Sample {} has no indels or noresults'.format(
            sample.id))
        return
    logger.info('Sample {} has {} indels and {} noresults'.format(
        sample.id, indels.count(), noresults.count()))

    mut_bucket = v_germlines.mut_bucket(sample.v_ties_mutations)
    len_bucket = v_germlines.length_bucket(sample.v_ties_len)
    bucket = '{}_{}'.format(str(mut_bucket).replace('.', ''), len_bucket)
    sample_v_germlines = get_formatted_ties(v_germlines.all_ties(
        sample.v_ties_len, sample.v_ties_mutations))
    sample_j_germlines = get_formatted_ties(j_germlines.all_ties(
        sample.v_ties_len, sample.v_ties_mutations))

    if bucket not in indexes:
        indexes.add(bucket)
        v_path = os.path.join(temp, 'v_genes_{}'.format(bucket))
        j_path = os.path.join(temp, 'j_genes_{}'.format(bucket))
        logger.info('Creating index for V-ties at {} length, {} '
                    'mutation'.format(len_bucket, mut_bucket))
        build_index(sample_v_germlines, v_path)
        build_index(sample_j_germlines, j_path)

    seq_path = os.path.join(temp, 'll_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        fh.write(get_fasta({
            'tp=Sequence|ai={}|sample_id={}|seq_id={}'.format(
                r.ai, r.sample_id, r.seq_id): r.sequence for r in indels}))
        fh.write(get_fasta({
            'tp=NoResult|pk={}|sample_id={}|seq_id={}'.format(
                r.pk, r.sample_id, r.seq_id): r.sequence for r in noresults}))

    alignments = {}
    logger.info('Running bowtie2 for V-gene sequences')
    for line in get_reader(align_reference(temp, 'v_genes_{}'.format(bucket),
                                           seq_path, nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        try:
            ref, seq, rem_seqs = create_seqs(
                ref_seq=sample_v_germlines[ref_gene].replace('-', ''),
                min_size=CDR3_OFFSET, **line)
        except KeyError as e:
            logger.warning('bowtie got invalid V: ' + str(e))
            continue
        if len(rem_seqs) == 0:
            continue

        ref, seq, seq_start = add_imgt_gaps(sample_v_germlines[ref_gene],
                                            ref, seq, line['ref_offset'])
        if len(ref) < CDR3_OFFSET:
            continue
        alignments[line['seq_id']] = {
            'v_germline': ref,
            'v_gene': line['reference'],
            'seq_start': seq_start,
            'v_sequence': seq,
            'v_rem_seq': rem_seqs[-1],
            'cdr3_start': len(ref)
        }

    seq_path = os.path.join(temp, 'll_j_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        seqs = {k: v['v_rem_seq'] for k, v in alignments.items()
                if len(v['v_rem_seq']) > 0}
        fh.write(get_fasta(seqs))

    tasks = []
    logger.info('Running bowtie2 for J-gene sequences')
    for line in get_reader(align_reference(temp, 'j_genes_{}'.format(bucket),
                                           seq_path, nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        ref, seq, rem_seqs = create_seqs(
            ref_seq=sample_j_germlines[ref_gene].replace('-', ''),
            min_size=j_germlines.upstream_of_cdr3, **line)
        alignments[line['seq_id']]['j_gene'] = line['reference']

        full_seq = (alignments[line['seq_id']]['v_sequence'] +
                    alignments[line['seq_id']]['v_rem_seq'])
        if len(rem_seqs) > 0:
            full_seq = full_seq[:-len(rem_seqs[-1])]

        cdr3_end = len(full_seq)
        if len(ref) < j_germlines.upstream_of_cdr3:
            continue
        for i in range(j_germlines.upstream_of_cdr3):
            if ref[-i] != '-':
                cdr3_end -= 1
        alignments[line['seq_id']]['cdr3_end'] = cdr3_end
        cdr3_length = cdr3_end - alignments[line['seq_id']]['cdr3_start']

        full_germ = (alignments[line['seq_id']]['v_germline'] +
                     (GAP_PLACEHOLDER * cdr3_length))
        j_length = len(full_seq) - len(full_germ)
        if j_length <= 0 or cdr3_length <= 0:
            continue
        full_germ += ref[-j_length:]

        r_type, pk, sample_id, seq_id = [
            v.split('=', 1)[1] for v in line['seq_id'].split('|', 3)]
        insertions = gap_positions(full_germ)
        deletions = gap_positions(full_seq)

        alignment = VDJAlignment(
            VDJSequence(seq_id, full_seq.replace(GAP_PLACEHOLDER, '-'))
        )
        alignment.germline = full_germ.replace(GAP_PLACEHOLDER, '-')
        if len(alignment.germline) != len(alignment.sequence.sequence):
            continue
        alignment.v_gene.add(GeneName(alignments[line['seq_id']]['v_gene']))
        alignment.j_gene.add(GeneName(alignments[line['seq_id']]['j_gene']))
        alignment.seq_offset = alignments[line['seq_id']]['seq_start']
        # TODO: This should really look for a streak like in anchoring
        alignment.germline_cdr3 = '-' * cdr3_length
        gaps_in_seq = alignment.sequence.sequence[
            alignment.seq_start:alignments[line['seq_id']]['cdr3_start']
        ].count('-')
        alignment.v_length = (
            alignments[line['seq_id']]['cdr3_start'] - alignment.seq_offset
        ) - gaps_in_seq
        alignment.j_length = j_length
        alignment.v_mutation_fraction = 1 - (alignment.v_match /
                                             alignment.v_length)
        alignment.cdr3_start = alignments[line['seq_id']]['cdr3_start']
        alignment.cdr3_num_nts = cdr3_length
        alignment.post_cdr3_length = j_length
        alignment.insertions = insertions
        alignment.deletions = deletions
        alignment.locally_aligned = True

        tasks.append({
            'r_type': r_type,
            'pk': int(pk),
            'sample_id': int(sample_id),
            'alignment': alignment
        })
    return tasks
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    session = config.init_db(db_config)
    start = time.time()
    logger.info('Starting sample {}'.format(meta['sample_name']))
    sample = setup_sample(session, meta)

    aligner = AnchorAligner(v_germlines, j_germlines)

    # Initial VJ assignment
    alignments = concurrent.process_data(
        read_input,
        process_vdj,
        aggregate_vdj,
        nproc,
        process_args={'aligner': aligner},
        generate_args={'path': path},
    )
    logger.info('Adding noresults')
    for result in alignments['noresult']:
        add_noresults_for_vdj(session, result['vdj'], sample,
                              result['reason'])

    alignments = alignments['success']
    if alignments:
        avg_len = (
            sum([v.v_length for v in alignments]) / len(alignments))
        avg_mut = (
            sum([v.v_mutation_fraction for v in alignments]) /
            len(alignments)
        )
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                    'Length={}'.format(len(alignments), round(avg_mut, 2),
                                       round(avg_len, 2)))
        session.commit()

        # Realign to V-ties
        v_ties = concurrent.process_data(
            alignments,
            process_vties,
            aggregate_vties,
            nproc,
            process_args={'aligner': aligner, 'avg_len': avg_len,
                          'avg_mut': avg_mut, 'props': props},
        )
        logger.info('Adding noresults')
        for result in funcs.periodic_commit(session, v_ties['noresult'], 100):
            add_noresults_for_vdj(session, result['alignment'].sequence,
                                  sample, result['reason'])

        logger.info('Collapsing {} buckets'.format(len(v_ties['success'])))
        session.commit()

        # TODO: Change this so we aren't copying everything between processes
        concurrent.process_data(
            [list(v) for v in v_ties['success']],
            process_collapse,
            aggregate_collapse,
            nproc,
            aggregate_args={'db_config': db_config, 'sample_id': sample.id,
                            'props': props}
        )

        session.expire_all()
        session.commit()

    identified = int(session.query(
        func.sum(Sequence.copy_number)
    ).filter(
        Sequence.sample == sample
    ).scalar() or 0)
    noresults = int(session.query(
        func.count(NoResult.pk)
    ).filter(
        NoResult.sample == sample
    ).scalar() or 0)
    if identified + noresults:
        frac = int(100 * identified / (identified + noresults))
    else:
        frac = 0
    logger.info(
        'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
            sample.name,
            round((time.time() - start) / 60., 1),
            identified,
            identified + noresults,
            frac
        )
    )
    session.close()
def combine_samples(session, args):
    groups = {}
    for meta in session.query(SampleMetadata).filter(
            SampleMetadata.key == args.combine_field):
        groups.setdefault(meta.value, set()).add(meta.sample_id)

    all_subjects = set()
    for group_id, samples in groups.items():
        group_subs = session.query(Sample.subject_id).filter(
            Sample.id.in_(samples)
        ).group_by(Sample.subject_id)
        group_subs = [s.subject_id for s in group_subs]
        all_subjects.update(set(group_subs))
        if len(group_subs) > 1:
            logger.error('Cannot combine samples across subjects '
                         '(group "{}" has {} subjects)'.format(
                             group_id, len(group_subs)))
            sys.exit(1)

    all_samples = [s.id for s in session.query(Sample.id).filter(
        Sample.subject_id.in_(all_subjects))]
    logger.info('Resetting information for {} subjects ({} samples)'.format(
        len(all_subjects), len(all_samples)))
    logger.info(' Resetting collapsing')
    session.query(SequenceCollapse).filter(
        SequenceCollapse.sample_id.in_(all_samples)
    ).delete(synchronize_session=False)
    logger.info(' Resetting clones')
    session.query(Clone).filter(
        Clone.subject_id.in_(all_subjects)
    ).delete(synchronize_session=False)
    logger.info(' Resetting sample statistics')
    session.query(SampleStats).filter(
        SampleStats.sample_id.in_(all_samples)
    ).delete(synchronize_session=False)

    for group_id, samples in groups.items():
        final_sample_id = min(samples)
        logger.info('Combining {} samples into new sample "{}" '
                    '(ID {})'.format(len(samples), group_id, final_sample_id))
        session.query(Sequence).filter(
            Sequence.sample_id.in_(samples)
        ).update({
            Sequence.sample_id: final_sample_id,
        }, synchronize_session=False)

        logger.info('Updating sample name and deleting empty samples')
        # collapse to one sample
        final_sample = session.query(Sample).get(final_sample_id)
        final_sample.name = group_id
        remove_duplicates(session, final_sample)

        logger.info('Moving noresults')
        session.query(NoResult).filter(
            NoResult.sample_id.in_(samples)
        ).update({
            'sample_id': final_sample_id
        }, synchronize_session=False)

        # delete the now-empty samples
        session.query(Sample).filter(
            Sample.id.in_(samples - set([final_sample_id]))
        ).delete(synchronize_session=False)
        session.commit()

    logger.info('Sequences successfully collapsed: please re-run '
                'immunedb_collapse and later pipeline steps.')