def create(main_parser, args):
    if re.search(r'[^A-Za-z0-9_-]', args.db_name) is not None:
        main_parser.error('Database name must only contain letters, numbers, '
                          'dashes and underscores.')

    try:
        conn = _get_root_connection(args.db_host, args.admin_user,
                                    args.admin_pass)

        db_user = args.db_user or args.db_name
        if args.db_pass:
            db_pass = args.db_pass
        else:
            db_pass = ''.join(
                random.choice(string.ascii_uppercase +
                              string.ascii_lowercase +
                              string.digits) for _ in range(10))

        with conn.cursor() as cursor:
            logger.info('Creating user "{}"'.format(db_user))
            existing_password = _create_user_if_not_exists(conn, '%', db_user,
                                                           db_pass)
            if existing_password is not None:
                if not args.db_pass:
                    logger.warning(
                        'User {} already exists. To generate the '
                        'configuration file, you must enter its '
                        'password.'.format(db_user)
                    )
                    db_pass = _get_user_pass(conn, args.db_host, db_user,
                                             existing_password)
                else:
                    db_pass = args.db_pass

            logger.info('Creating database "{}"'.format(args.db_name))
            cursor.execute('CREATE DATABASE {}'.format(args.db_name))
            cursor.execute(
                'GRANT ALL PRIVILEGES ON {}.* TO \'{}\'@\'%\''.format(
                    args.db_name, db_user))

        config_path = os.path.join(args.config_dir, '{}.json'.format(
            args.db_name))
        logger.info('Creating config at {}'.format(config_path))
        with open(config_path, 'w+') as fh:
            json.dump({
                'host': args.db_host,
                'database': args.db_name,
                'username': db_user,
                'password': db_pass
            }, fh, sort_keys=True, indent=4, separators=(',', ': '))

        logger.info('Initializing tables')
        config.init_db(config_path)
        logger.info('Success!')
        return True
    except Exception as e:
        logger.error(e)
        return False
def run_sample_stats(session, args):
    np.seterr(all='raise')
    mod_log.make_mod('sample_stats', session=session, commit=True,
                     info=vars(args))

    if args.sample_ids is None:
        # Wrap in list() so the IDs can be iterated more than once under
        # Python 3 (a bare map object would be exhausted by the first use).
        samples = list(map(lambda s: s.id, session.query(Sample.id).all()))
    else:
        samples = args.sample_ids

    if args.force:
        q = session.query(SampleStats).filter(
            SampleStats.sample_id.in_(samples))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for sample_id in samples:
        _queue_tasks(session, sample_id, args.force, tasks)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(SampleStatsWorker(session))

    tasks.start()
    session.commit()
def run_sample_stats(session, args):
    np.seterr(all='raise')
    mod_log.make_mod('sample_stats', session=session, commit=True,
                     info=vars(args))

    if args.sample_ids is None:
        samples = [s.id for s in session.query(Sample.id)]
    else:
        samples = args.sample_ids

    if args.force:
        q = session.query(SampleStats).filter(
            SampleStats.sample_id.in_(samples))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for sample_id in samples:
        _queue_tasks(session, sample_id, args.force, tasks)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(SampleStatsWorker(session))

    tasks.start()
    session.commit()
    session.close()
def run_clearcut(session, args):
    if args.clone_ids is not None:
        clones = session.query(Clone.id).filter(
            Clone.id.in_(args.clone_ids))
    else:
        if args.subject_ids is not None:
            clones = session.query(Clone.id).filter(
                Clone.subject_id.in_(args.subject_ids))
        else:
            clones = session.query(Clone.id)
        if not args.force:
            clones = clones.filter(Clone.tree.is_(None))
    clones = [c.id for c in clones]
    mod_log.make_mod('clone_tree', session=session, commit=True,
                     info=vars(args))

    tasks = concurrent.TaskQueue()

    logger.info('Creating task queue for clones')
    for clone_id in clones:
        tasks.add_task(clone_id)

    for _ in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(LineageWorker(
            session, get_newick, args.min_mut_copies, args.min_mut_samples,
            args.min_seq_copies, args.min_seq_samples, args.exclude_stops,
            args.full_seq, post_tree_hook=minimize_tree))

    tasks.start()
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            if not args.gene or bucket.v_gene.startswith(args.gene):
                tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'lineage': LineageClonalWorker,
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')

    push_clone_ids(session)
    session.commit()
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        # Wrap in list() so the IDs survive multiple uses under Python 3.
        subject_ids = list(map(lambda s: s.id,
                               session.query(Subject.id).all()))
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'tcells': TCellClonalWorker,
        'lineage': LineageClonalWorker,
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')

    push_clone_ids(session)
    session.commit()
def run_selection_pressure(session, args):
    mod_log.make_mod('clone_pressure', session=session, commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        # list() is required here so that clones.sort() and len(clones)
        # work under Python 3, where map() returns an iterator.
        clones = list(map(
            lambda c: c.id,
            session.query(Clone.id).filter(
                Clone.subject_id.in_(args.subject_ids)).all()))
    else:
        clones = list(map(lambda c: c.id, session.query(Clone.id).all()))
    clones.sort()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to calculate selection pressure for {} '
                'clones.'.format(len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(
            SelectionPressureWorker(session, args.baseline_path, args.temp,
                                    args.regen, args.thresholds))

    tasks.start()
def _wrapper(*args, **kwargs):
    session = config.init_db(db_config)
    try:
        return f(session, *args, **kwargs)
    except Exception:
        raise
    finally:
        session.close()
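The _wrapper above is only the inner closure of a decorator: both f and db_config are free variables supplied by enclosing scopes that are not shown. For context, here is a minimal sketch of what such an enclosing decorator factory could look like, assuming only that config.init_db(db_config) returns a session object with a close() method; the names with_session and decorator are illustrative and not taken from the source.

import functools

import immunedb.common.config as config


def with_session(db_config):
    """Give the wrapped function a fresh database session on every call."""
    def decorator(f):
        @functools.wraps(f)
        def _wrapper(*args, **kwargs):
            session = config.init_db(db_config)
            try:
                # The wrapped function receives the session as its first
                # positional argument, matching the fragment above.
                return f(session, *args, **kwargs)
            finally:
                # Close the session whether the call succeeded or raised.
                session.close()
        return _wrapper
    return decorator

A caller would then write a function as def export(session, path): ... and decorate it with @with_session('db_config.json'), invoking it simply as export(path).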
def run_collapse(session, args):
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))
    subject_ids = []

    subjects = (args.subject_ids or
                [e.id for e in session.query(Subject.id)])
    for subject in subjects:
        if session.query(Sample).filter(
                Sample.subject_id == subject,
                ~exists().where(
                    SequenceCollapse.sample_id == Sample.id
                )).first() is None:
            logger.info('Subject {} already collapsed. Skipping.'.format(
                subject))
        else:
            logger.info('Resetting collapse info for subject {}'.format(
                subject))
            samples = session.query(Sample).filter(
                Sample.subject_id == subject
            )
            for sample in samples:
                session.query(SequenceCollapse).filter(
                    SequenceCollapse.sample_id == sample.id
                ).delete(synchronize_session=False)
                sample.sample_stats = []
            logger.info('Resetting clone info for subject {}'.format(subject))
            session.query(Clone).filter(Clone.subject_id == subject).delete()
            subject_ids.append(subject)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(subject_ids)))

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
def run_collapse(session, args):
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))
    subject_ids = []
    for subject in (args.subject_ids or map(
            lambda e: e.id, session.query(Subject.id).all())):
        if session.query(Sample).filter(
                Sample.subject_id == subject,
                ~exists().where(
                    SequenceCollapse.sample_id == Sample.id
                )).first() is None:
            logger.info('Subject {} already collapsed. Skipping.'.format(
                subject))
        else:
            logger.info('Resetting collapse info for subject {}'.format(
                subject))
            samples = session.query(Sample).filter(
                Sample.subject_id == subject
            ).all()
            for sample in samples:
                session.query(SequenceCollapse).filter(
                    SequenceCollapse.sample_id == sample.id
                ).delete(synchronize_session=False)
            logger.info('Resetting clone info for subject {}'.format(subject))
            session.query(Clone).filter(Clone.subject_id == subject).delete()
            subject_ids.append(subject)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(subject_ids)))

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
def run_identify(session, args):
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))
    session.close()

    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    # If metadata is not specified, assume it is "metadata.tsv" in the
    # sample directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        return

    with open(meta_fn) as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.sample_dir)
        except MetadataException as ex:
            logger.error(ex)
            return

    # Create the tasks for each file
    for sample_name in sorted(metadata.keys()):
        tasks.add_task({
            'path': os.path.join(args.sample_dir,
                                 metadata[sample_name]['file_name']),
            'meta': metadata[sample_name]
        })

    props = IdentificationProps(**args.__dict__)
    lock = mp.Lock()
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 props, lock))

    tasks.start()
def run_subclones(session, subject_ids, args):
    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info(
            'Generating subclone task queue for subject {}'.format(
                subject_id))
        buckets = session.query(Clone.subject_id, Clone.v_gene, Clone.j_gene,
                                Clone.cdr3_num_nts).filter(
            Clone.subject_id == subject_id).group_by(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts)
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total subclone tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(SubcloneWorker(config.init_db(args.db_config)))
    tasks.start()
def aggregate_collapse(aggregate_queue, db_config, sample_id, props):
    seqs_to_add = []
    session = config.init_db(db_config, create=False)
    sample = session.query(Sample).filter(Sample.id == sample_id).one()
    for i, alignment in enumerate(aggregate_queue):
        for seq in alignment:
            seqs_to_add.append(seq)
            if len(seqs_to_add) >= 1000:
                add_sequences(session, seqs_to_add, sample,
                              strip_alleles=not props.genotyping)
                seqs_to_add = []
                session.commit()
    if seqs_to_add:
        add_sequences(session, seqs_to_add, sample,
                      strip_alleles=not props.genotyping)
    logger.info('Finished aggregating sequences')
    session.commit()
    session.close()
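aggregate_collapse adds sequences in batches of 1000 and commits after each batch, so neither the pending list nor the open transaction grows without bound. The same batching idea, stripped of the ImmuneDB specifics, can be expressed as a small generic helper; chunked is a hypothetical name introduced here purely for illustration.

from typing import Iterable, Iterator, List, TypeVar

T = TypeVar('T')


def chunked(items: Iterable[T], size: int = 1000) -> Iterator[List[T]]:
    """Yield lists of at most `size` items from any iterable."""
    batch: List[T] = []
    for item in items:
        batch.append(item)
        if len(batch) >= size:
            # Hand off a full batch and start collecting the next one.
            yield batch
            batch = []
    if batch:
        # Flush whatever is left over after the input is exhausted.
        yield batch

A caller following the pattern above would write "for batch in chunked(stream, 1000): insert(batch); session.commit()", committing once per batch instead of once per row.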
def run_subclones(session, subject_ids, args):
    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating subclone task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        ).filter(
            Clone.subject_id == subject_id
        ).group_by(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total subclone tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(SubcloneWorker(config.init_db(args.db_config),
                                        args.similarity))
    tasks.start()
def get_clones_from_remote_db():
    session = config.init_db({
        'host': '35.241.233.255',
        'database': 'influenza',
        'username': '******',
        'password': '',
    })
    rows_final = []
    counter = 10
    for clone in session.query(Clone):
        row_final = np.append(clone.id, clone.tree)
        rows_final.append(row_final)
        if counter > 0:
            counter = counter - 1
            print(clone.id)
        else:
            return np.asarray(rows_final)
def run_clone_stats(session, args):
    """Runs the clone statistics generation stage of the pipeline.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    mod_log.make_mod('clone_stats', session=session, commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        # list() is required so clones.sort() and len(clones) work under
        # Python 3, where map() returns an iterator.
        clones = list(map(
            lambda c: c.id,
            session.query(Clone.id).filter(
                Clone.subject_id.in_(args.subject_ids)).all()))
    else:
        clones = list(map(lambda c: c.id, session.query(Clone.id).all()))
    clones.sort()

    if args.regen:
        logger.info('Deleting old clone statistics for {} clones'.format(
            len(clones)))
        session.query(CloneStats).filter(
            CloneStats.clone_id.in_(clones)).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to generate stats for {} clones.'.format(
        len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(CloneStatsWorker(session))

    tasks.start()
def run_clone_stats(session, args):
    """Runs the clone statistics generation stage of the pipeline.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    mod_log.make_mod('clone_stats', session=session, commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    if args.regen:
        logger.info('Deleting old clone statistics for {} clones'.format(
            len(clones)))
        session.query(CloneStats).filter(
            CloneStats.clone_id.in_(clones)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to generate stats for {} clones.'.format(
        len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(CloneStatsWorker(session))

    tasks.start()
from argparse import Namespace  # noqa: E402

from immunedb.api.rest_service import run_rest_service  # noqa: E402
import immunedb.common.config as config  # noqa: E402

session = config.init_db('test_db.json', as_maker=True)
run_rest_service(
    session,
    Namespace(port=8891,
              debug=True,
              allow_shutdown=True,
              rollbar_token=None,
              rollbar_env=None,
              server='gunicorn'))
    return dict((k, v.dropna().to_dict()) for k, v in data.items())


if __name__ == '__main__':
    parser = config.get_base_arg_parser()
    parser.add_argument('subject')
    parser.add_argument('feature')
    parser.add_argument('magnitude', choices=['copies', 'instances'])
    parser.add_argument(
        '--limit-by', choices=['copies', 'instances'], default=None)
    parser.add_argument('--limit-val', type=int, default=None)
    args = parser.parse_args()
    session = config.init_db(args.db_config)

    subject_id = session.query(Subject.id).filter(
        Subject.identifier == args.subject).one().id
    features = {
        s.id: str(getattrd(s, args.feature))
        for s in session.query(Sample).filter(Sample.subject_id == subject_id)
    }

    if args.magnitude == 'instances' or args.limit_by == 'instances':
        instances = session.query(
            Sequence.clone_id,
            Sequence.sample_id,
            func.count(Sequence.seq_id).label('inst')
        ).filter(
            ~Sequence.clone_id.is_(None),
            Sequence.subject_id == subject_id
def setUp(self):
    self.session = config.init_db(CONFIG_PATH)
def run_identify(session, args):
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))
    session.close()

    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    sample_names = set([])
    fail = False
    for directory in args.sample_dirs:
        # If metadata is not specified, assume it is "metadata.json" in the
        # directory
        if args.metadata is None:
            meta_fn = os.path.join(directory, 'metadata.json')
        else:
            meta_fn = args.metadata

        # Verify the metadata file exists
        if not os.path.isfile(meta_fn):
            logger.error('Metadata file not found.')
            return

        with open(meta_fn) as fh:
            metadata = json.load(fh)

        # Create the tasks for each file
        for fn in sorted(metadata.keys()):
            if fn == 'all':
                continue
            meta = SampleMetadata(
                metadata[fn],
                metadata['all'] if 'all' in metadata else None)
            if session.query(Sample).filter(
                    Sample.name == meta.get('sample_name'),
                    exists().where(
                        Sequence.sample_id == Sample.id
                    )).first() is not None:
                log_f = logger.warning if args.warn_existing else logger.error
                log_f('Sample {} already exists. {}'.format(
                    meta.get('sample_name'),
                    'Skipping.' if args.warn_existing else
                    'Cannot continue.'))
                fail = True
            elif meta.get('sample_name') in sample_names:
                logger.error(
                    'Sample {} exists more than once in metadata.'.format(
                        meta.get('sample_name')))
                return
            else:
                tasks.add_task({'path': directory, 'fn': fn, 'meta': meta})
                sample_names.add(meta.get('sample_name'))

    if fail and not args.warn_existing:
        logger.error('Encountered errors. Not running any identification.'
                     ' To skip samples that are already in the database '
                     'use --warn-existing.')
        return

    lock = mp.Lock()
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 args.trim_to, args.max_padding,
                                 args.max_vties,
                                 args.min_similarity / float(100), lock))

    tasks.start()
def setUp(self):
    self.session = config.init_db(CONFIG_PATH, drop_all=True)
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if not args.skip_regen:
        logger.info('Deleting existing clones')
        q = session.query(Clone).filter(Clone.subject_id.in_(subject_ids))
        if args.gene:
            q = q.filter(Clone.v_gene.like(args.gene + '%'))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    all_buckets = []
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(subject_id))
        buckets = session.query(Sequence.subject_id, Sequence.v_gene,
                                Sequence.j_gene,
                                Sequence.cdr3_num_nts).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts)
        for bucket in buckets:
            if not args.gene or bucket.v_gene.startswith(args.gene):
                tasks.add_task(bucket)
        all_buckets.extend(buckets)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'cluster': ClusteringClonalWorker
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](config.init_db(args.db_config),
                                      **args.__dict__)
        tasks.add_worker(worker)
    tasks.start()
    session.commit()

    if args.reduce_difference:
        buckets = session.query(Clone.subject_id, Clone.cdr3_num_nts).filter(
            Clone.subject_id.in_(subject_ids)).group_by(
            Clone.subject_id, Clone.cdr3_num_nts)
        collapse_similar_cdr3s(session, buckets, args.reduce_difference)
    else:
        logger.info('Skipping reduce since --reduce-difference set to 0')

    push_clone_ids(session)
    session.commit()

    if not args.skip_subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')
def setUp(self):
    self.session_maker = config.init_db(CONFIG_PATH, drop_all=True,
                                        as_maker=True)
    self.session = self.session_maker()
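The setUp fragments above differ only in which keyword arguments they pass to config.init_db: drop_all=True to recreate the schema before each test, and as_maker=True to get a session factory rather than a single session. A minimal sketch of a complete test fixture built on the same pattern follows; the class name, the tearDown method, and the 'test_db.json' path are illustrative assumptions, not taken from the source.

import unittest

import immunedb.common.config as config

CONFIG_PATH = 'test_db.json'  # assumed config path, as used elsewhere above


class DatabaseTestCase(unittest.TestCase):
    def setUp(self):
        # drop_all=True presumably recreates the tables so each test starts
        # from an empty database; as_maker=True returns a session factory so
        # a test can open independent sessions when needed.
        self.session_maker = config.init_db(CONFIG_PATH, drop_all=True,
                                            as_maker=True)
        self.session = self.session_maker()

    def tearDown(self):
        # Always release the session opened in setUp.
        self.session.close()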
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    session = config.init_db(db_config)
    start = time.time()
    logger.info('Starting sample {}'.format(meta['sample_name']))
    sample = setup_sample(session, meta)

    aligner = AnchorAligner(v_germlines, j_germlines)

    # Initial VJ assignment
    alignments = concurrent.process_data(
        read_input,
        process_vdj,
        aggregate_vdj,
        nproc,
        process_args={'aligner': aligner},
        generate_args={'path': path},
    )
    logger.info('Adding noresults')
    for result in alignments['noresult']:
        add_noresults_for_vdj(session, result['vdj'], sample,
                              result['reason'])

    alignments = alignments['success']
    if alignments:
        avg_len = (
            sum([v.v_length for v in alignments]) / len(alignments))
        avg_mut = (
            sum([v.v_mutation_fraction for v in alignments]) / len(alignments)
        )
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                    'Length={}'.format(len(alignments), round(avg_mut, 2),
                                       round(avg_len, 2)))
        session.commit()

        # Realign to V-ties
        v_ties = concurrent.process_data(
            alignments,
            process_vties,
            aggregate_vties,
            nproc,
            process_args={'aligner': aligner, 'avg_len': avg_len,
                          'avg_mut': avg_mut, 'props': props},
        )
        logger.info('Adding noresults')
        for result in funcs.periodic_commit(session, v_ties['noresult'], 100):
            add_noresults_for_vdj(session, result['alignment'].sequence,
                                  sample, result['reason'])

        logger.info('Collapsing {} buckets'.format(len(v_ties['success'])))
        session.commit()

        # TODO: Change this so we aren't copying everything between processes
        concurrent.process_data(
            [list(v) for v in v_ties['success']],
            process_collapse,
            aggregate_collapse,
            nproc,
            aggregate_args={'db_config': db_config, 'sample_id': sample.id,
                            'props': props}
        )

        session.expire_all()
        session.commit()

    identified = int(session.query(
        func.sum(Sequence.copy_number)
    ).filter(
        Sequence.sample == sample
    ).scalar() or 0)
    noresults = int(session.query(
        func.count(NoResult.pk)
    ).filter(
        NoResult.sample == sample
    ).scalar() or 0)
    if identified + noresults:
        frac = int(100 * identified / (identified + noresults))
    else:
        frac = 0
    logger.info(
        'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
            sample.name,
            round((time.time() - start) / 60., 1),
            identified,
            identified + noresults,
            frac
        )
    )
    session.close()
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    session = config.init_db(db_config)
    start = time.time()
    logger.info('Starting sample {}'.format(meta['sample_name']))
    sample = setup_sample(session, meta)

    aligner = AnchorAligner(v_germlines, j_germlines)

    # Initial VJ assignment
    alignments = concurrent.process_data(
        read_input,
        process_vdj,
        aggregate_vdj,
        nproc,
        process_args={'aligner': aligner},
        generate_args={'path': path},
    )
    logger.info('Adding noresults')
    for result in alignments['noresult']:
        add_noresults_for_vdj(session, result['vdj'], sample,
                              result['reason'])

    alignments = alignments['success']
    if alignments:
        avg_len = (sum([v.v_length for v in alignments]) / len(alignments))
        avg_mut = (sum([v.v_mutation_fraction for v in alignments]) /
                   len(alignments))
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                    'Length={}'.format(len(alignments), round(avg_mut, 2),
                                       round(avg_len, 2)))
        session.commit()

        # Realign to V-ties
        v_ties = concurrent.process_data(
            alignments,
            process_vties,
            aggregate_vties,
            nproc,
            process_args={
                'aligner': aligner,
                'avg_len': avg_len,
                'avg_mut': avg_mut,
                'props': props
            },
        )
        logger.info('Adding noresults')
        for result in funcs.periodic_commit(session, v_ties['noresult'], 100):
            add_noresults_for_vdj(session, result['alignment'].sequence,
                                  sample, result['reason'])

        logger.info('Collapsing {} buckets'.format(len(v_ties['success'])))
        session.commit()

        # TODO: Change this so we aren't copying everything between processes
        concurrent.process_data([list(v) for v in v_ties['success']],
                                process_collapse,
                                aggregate_collapse,
                                nproc,
                                aggregate_args={
                                    'db_config': db_config,
                                    'sample_id': sample.id,
                                    'props': props
                                })

        session.expire_all()
        session.commit()

    identified = int(
        session.query(func.sum(Sequence.copy_number)).filter(
            Sequence.sample == sample).scalar() or 0)
    noresults = int(
        session.query(func.count(
            NoResult.pk)).filter(NoResult.sample == sample).scalar() or 0)
    if identified + noresults:
        frac = int(100 * identified / (identified + noresults))
    else:
        frac = 0
    logger.info(
        'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
            sample.name, round((time.time() - start) / 60., 1), identified,
            identified + noresults, frac))
    session.close()