def run_fix_sequences(session, args):
    """Re-identify every sample's sequences and rebuild copy numbers.

    For each sample in the database, re-run identification, re-add the
    resulting sequences, collapse duplicates, and finally recompute each
    sequence's copy_number from its duplicate rows with one bulk UPDATE.
    """
    vgerms = VGermlines(args.v_germlines)
    jgerms = JGermlines(args.j_germlines, args.upstream_of_cdr3)
    seen_indexes = set()
    id_props = IdentificationProps(**args.__dict__)

    for sample in session.query(Sample):
        # Re-run identification for this sample, then fold the results back in
        new_seqs = process_sample(session, sample, seen_indexes, args.temp,
                                  vgerms, jgerms, args.nproc)
        add_sequences_from_sample(session, sample, new_seqs, id_props)
        remove_duplicates(session, sample)

    logger.info('Updating copy numbers')
    # copy_number = 1 (the sequence itself) + number of collapsed duplicates
    session.connection(mapper=Sequence).execute(text('''
        UPDATE sequences SET copy_number = 1 + (
            SELECT COUNT(*) FROM duplicate_sequences
            WHERE duplicate_seq_ai = ai
        )
    '''))
def run_import(session, args):
    """Import pre-identified sequences for every sample in a metadata file.

    Reads the sample metadata (``args.metadata`` or
    ``<sample_dir>/metadata.tsv``), creates each sample, and feeds its
    input file to ``read_file``. Logs an error and returns early when the
    metadata file is missing or malformed.
    """
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len)

    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        return

    # BUGFIX: 'rU' mode was deprecated in 3.4 and removed in Python 3.11;
    # universal newlines are the default for text mode.
    with open(meta_fn, 'r') as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.warn_missing, args.sample_dir)
        except MetadataException as ex:
            # BUGFIX: Python 3 exceptions have no .message attribute
            # (removed with PEP 352); log the exception object itself.
            logger.error(ex)
            return

    props = IdentificationProps(**args.__dict__)
    for sample_name in sorted(metadata.keys()):
        sample = create_sample(session, metadata[sample_name])
        if sample:
            path = os.path.join(args.sample_dir,
                                metadata[sample_name]['file_name'])
            with open(path) as fh:
                read_file(session, args.format, fh, sample, v_germlines,
                          j_germlines, props)
def run_import(session, args, remaps=None):
    """Import identified sequences from a single input file.

    Creates the study/sample/subject records as needed, then hands the
    input file to ``read_file``. Aborts with an error if the sample name
    already exists.
    """
    vgerms = VGermlines(args.v_germlines)
    jgerms = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                        args.anchor_len, args.min_anchor_len)

    study, created = funcs.get_or_create(session, Study, name=args.study_name)
    if created:
        logger.info('Created new study "{}"'.format(study.name))
        session.commit()

    sample, created = funcs.get_or_create(session, Sample,
                                          name=args.sample_name, study=study)
    # Guard clause: refuse to clobber an existing sample.
    if not created:
        logger.error('Sample "{}" already exists'.format(args.sample_name))
        return

    sample.date = args.date
    logger.info('Created new sample "{}"'.format(sample.name))
    # Copy optional metadata attributes from the CLI arguments, if given.
    optional_fields = ('subset', 'tissue', 'disease', 'lab', 'experimenter',
                       'ig_class', 'v_primer', 'j_primer')
    arg_map = vars(args)
    for field in optional_fields:
        setattr(sample, field, arg_map.get(field, None))

    subject, _ = funcs.get_or_create(session, Subject, study=study,
                                     identifier=args.subject)
    sample.subject = subject
    session.commit()

    with open(args.input_file) as fh:
        read_file(session, fh, sample, vgerms, jgerms, args, remaps)
def run_fix_sequences(session, args):
    """Re-identify sequences, optionally restricted to specific samples.

    When ``args.sample_ids`` is given, only those samples are processed;
    otherwise every sample in the database is re-identified, re-added, and
    deduplicated.
    """
    vgerms = VGermlines(args.v_germlines)
    jgerms = JGermlines(args.j_germlines, args.upstream_of_cdr3)
    seen_indexes = set()
    id_props = IdentificationProps(**args.__dict__)

    query = session.query(Sample)
    if args.sample_ids:
        # Narrow to the explicitly requested samples.
        query = query.filter(Sample.id.in_(args.sample_ids))

    for sample in query:
        new_seqs = process_sample(session, sample, seen_indexes, args.temp,
                                  vgerms, jgerms, args.nproc)
        add_sequences_from_sample(session, sample, new_seqs, id_props)
        remove_duplicates(session, sample)
def run_identify(session, args):
    """Identify V/J germline genes for each sample via a worker pool.

    Parses the sample metadata file, queues one task per sample file, and
    spawns up to ``args.nproc`` IdentificationWorker processes (each with
    its own DB session) to process them. Logs an error and returns early
    when the metadata file is missing or malformed.
    """
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))
    session.close()
    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    # If metadata is not specified, assume it is "metadata." in the
    # directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        return

    # BUGFIX: 'rU' mode was deprecated in 3.4 and removed in Python 3.11;
    # universal newlines are the default for text mode.
    with open(meta_fn, 'r') as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.sample_dir)
        except MetadataException as ex:
            # BUGFIX: Python 3 exceptions have no .message attribute
            # (removed with PEP 352); log the exception object itself.
            logger.error(ex)
            return

    # Create the tasks for each file
    for sample_name in sorted(metadata.keys()):
        tasks.add_task({
            'path': os.path.join(args.sample_dir,
                                 metadata[sample_name]['file_name']),
            'meta': metadata[sample_name]
        })

    props = IdentificationProps(**args.__dict__)
    lock = mp.Lock()
    # One worker per CPU, but never more workers than tasks.
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 props, lock))

    tasks.start()
def run_identify(session, args):
    """Identify V/J germline genes for each sample, one sample at a time.

    Parses the sample metadata file and calls ``process_sample`` on each
    sample's input file. Exits the process with status -1 when the
    metadata file is missing or malformed.
    """
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))
    # Load the germlines from files.  Gene ties are disabled when
    # genotyping is requested.
    v_germlines = VGermlines(args.v_germlines,
                             ties=args.ties and not args.genotyping)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len,
                             ties=args.ties and not args.genotyping)

    # If metadata is not specified, assume it is "metadata." in the
    # directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        sys.exit(-1)

    # BUGFIX: 'rU' mode was deprecated in 3.4 and removed in Python 3.11;
    # universal newlines are the default for text mode.
    with open(meta_fn, 'r') as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.warn_missing, args.sample_dir)
        except MetadataException as ex:
            logger.error(ex)
            sys.exit(-1)

    session.close()

    # Create the tasks for each file
    props = IdentificationProps(**args.__dict__)
    for sample_name in sorted(metadata.keys()):
        process_sample(
            args.db_config, v_germlines, j_germlines,
            os.path.join(
                args.sample_dir, metadata[sample_name]['file_name']
            ),
            metadata[sample_name], props, args.nproc
        )