def processQueueClust(alive, data_queue, result_queue, clone_func, clone_args):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
      alive = a multiprocessing.Value boolean controlling whether processing
              continues; if False exit process
      data_queue = a multiprocessing.Queue holding data to process
      result_queue = a multiprocessing.Queue to hold processed results
      clone_func = the function to call for calculating pairwise distances
                   between sequences
      clone_args = a dictionary of arguments to pass to clone_func

    Returns:
      None
    """
    try:
        # Poll the queue until the None sentinel arrives; polling (instead of
        # a blocking get) keeps alive.value responsive to sibling failures
        while alive.value:
            if data_queue.empty():
                continue
            work = data_queue.get()
            if work is None:
                # Sentinel reached: normal shutdown
                break

            # Build the result container for this group of records
            recs = work.data
            out = DbResult(work.id, recs)

            # Compute one row of the distance matrix; None signals an error
            row = clone_func(recs, **clone_args) if work else None
            if row is not None:
                out.results = row
                out.valid = True

            result_queue.put(out)
        else:
            # Loop exited because a sibling process flipped alive to False
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        # Flag sibling processes to stop, then propagate the exception
        alive.value = False
        raise

    return None
def alignWithin(data, field_map, muscle_exec=default_muscle_exec):
    """
    Multiple aligns sequence fields within a row

    Arguments:
      data : DbData object with Receptor objects to process.
      field_map : a dictionary of {input sequence : output sequence} field
                  names to multiple align.
      muscle_exec : the MUSCLE executable.

    Returns:
      changeo.Multiprocessing.DbResult : object containing Receptor objects
      with multiple aligned sequence fields.
    """
    # Result object mirrors the input record
    out = DbResult(data.id, data.data)
    out.results = data.data
    out.valid = True

    # Guard: groups that failed indexing are marked invalid immediately
    if out.id is None:
        out.log = None
        out.valid = False
        return out

    rec = data.data
    fields = list(field_map.keys())

    # One SeqRecord per field, labeled by its field name
    inputs = [SeqRecord(rec.getSeq(f), id=f) for f in fields]
    alignment = runMuscle(inputs, aligner_exec=muscle_exec)

    if alignment is None:
        out.valid = False
    else:
        # Map each gapped sequence back to its source field
        gapped_by_field = {x.id: str(x.seq) for x in alignment}
        for f in fields:
            gapped = gapped_by_field[f]
            rec.annotations[field_map[f]] = gapped
            out.log[f] = gapped

    return out
def alignAcross(data, field_map, muscle_exec=default_muscle_exec):
    """
    Multiple aligns sequence fields column wise

    Arguments:
      data : DbData object with Receptor objects to process.
      field_map : a dictionary of {input sequence : output sequence} field
                  names to multiple align.
      muscle_exec : the MUSCLE executable.

    Returns:
      changeo.Multiprocessing.DbResult : object containing Receptor objects
      with multiple aligned sequence fields.
    """
    # Define return object
    result = DbResult(data.id, data.data)
    result.results = data.data
    result.valid = True

    # Fail invalid groups
    if result.id is None:
        result.log = None
        result.valid = False
        return result

    # Sanitize record ids once up front (spaces break alignment labels)
    # instead of recomputing the replacement for every field and record
    seq_fields = list(field_map.keys())
    sanitized = {r.sequence_id: r.sequence_id.replace(' ', '_') for r in data.data}

    for f in seq_fields:
        # Align the same field across all records in the group
        seq_list = [SeqRecord(r.getSeq(f), id=sanitized[r.sequence_id])
                    for r in data.data]
        seq_aln = runMuscle(seq_list, aligner_exec=muscle_exec)
        if seq_aln is not None:
            aln_map = {x.id: i for i, x in enumerate(seq_aln)}
            # Write each gapped sequence back onto its source record
            for r in result.results:
                seq = str(seq_aln[aln_map[sanitized[r.sequence_id]]].seq)
                r.annotations[field_map[f]] = seq
                result.log['%s-%s' % (f, r.sequence_id)] = seq
        else:
            result.valid = False

    return result
def alignWithin(data, seq_fields, muscle_exec=default_muscle_exec):
    """
    Multiple aligns sequence fields within a row

    Arguments:
      data : a DbData object with an IgRecords to process.
      seq_fields : the sequence fields to multiple align.
      muscle_exec : the MUSCLE executable.

    Returns:
      changeo.Multiprocessing.DbResult : object containing IgRecords with
      multiple aligned sequence fields.
    """
    # Result object mirrors the input record
    out = DbResult(data.id, data.data)
    out.results = data.data
    out.valid = True

    # Guard: groups that failed indexing are marked invalid immediately
    if out.id is None:
        out.log = None
        out.valid = False
        return out

    rec = data.data

    # One SeqRecord per field, labeled by its field name
    inputs = [SeqRecord(rec.getSeqField(f), id=f) for f in seq_fields]
    alignment = runMuscle(inputs, aligner_exec=muscle_exec)

    if alignment is None:
        out.valid = False
    else:
        # Map each gapped sequence back to its source field
        gapped_by_field = {x.id: str(x.seq) for x in alignment}
        for f in seq_fields:
            gapped = gapped_by_field[f]
            rec.annotations['%s_ALIGN' % f] = gapped
            out.log[f] = gapped

    return out
def alignBlocks(data, seq_fields, muscle_exec=default_muscle_exec):
    """
    Multiple aligns blocks of sequence fields together

    Arguments:
      data : a DbData object with IgRecords to process.
      seq_fields : the sequence fields to multiple align.
      muscle_exec : the MUSCLE executable.

    Returns:
      changeo.Multiprocessing.DbResult : object containing IgRecords with
      multiple aligned sequence fields.
    """
    # Define return object
    result = DbResult(data.id, data.data)
    result.results = data.data
    result.valid = True

    # Fail invalid groups
    if result.id is None:
        result.log = None
        result.valid = False
        return result

    # One SeqRecord per (field, record) pair, labeled '<record id>_<field>'
    seq_list = [SeqRecord(r.getSeqField(f), id='%s_%s' % (r.id, f))
                for f in seq_fields for r in data.data]
    seq_aln = runMuscle(seq_list, aligner_exec=muscle_exec)

    if seq_aln is not None:
        aln_map = {x.id: i for i, x in enumerate(seq_aln)}
        # Write each gapped sequence back onto its source record
        for r in result.results:
            for f in seq_fields:
                seq = str(seq_aln[aln_map['%s_%s' % (r.id, f)]].seq)
                r.annotations['%s_ALIGN' % f] = seq
                result.log['%s-%s' % (f, r.id)] = seq
    else:
        result.valid = False

    return result
def processQueue(alive, data_queue, result_queue, clone_func, clone_args):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
      alive = a multiprocessing.Value boolean controlling whether processing
              continues; if False exit process
      data_queue = a multiprocessing.Queue holding data to process
      result_queue = a multiprocessing.Queue to hold processed results
      clone_func = the function to call for clonal assignment
      clone_args = a dictionary of arguments to pass to clone_func

    Returns:
      None
    """
    try:
        # Poll the queue until the None sentinel arrives; polling (instead of
        # a blocking get) keeps alive.value responsive to sibling failures
        while alive.value:
            if data_queue.empty():
                continue
            work = data_queue.get()
            if work is None:
                # Sentinel reached: normal shutdown
                break

            # Build the result container for this group of records
            recs = work.data
            out = DbResult(work.id, recs)

            # Invalid group (failed indexing): forward the failed result
            if not work:
                result_queue.put(out)
                continue

            # Record V(D)J annotation summaries in the log
            out.log['ID'] = ','.join(str(x) for x in work.id)
            out.log['VALLELE'] = ','.join({(r.getVAllele() or '') for r in recs})
            out.log['DALLELE'] = ','.join({(r.getDAllele() or '') for r in recs})
            out.log['JALLELE'] = ','.join({(r.getJAllele() or '') for r in recs})
            out.log['JUNCLEN'] = ','.join({(str(len(r.junction)) or '0') for r in recs})
            out.log['SEQUENCES'] = len(recs)

            # Assign clones; a None return flags a preclone failure
            clones = clone_func(recs, **clone_args) if work else None
            if clones is None:
                out.log['CLONES'] = 0
            else:
                out.results = clones
                out.valid = True
                out.log['CLONES'] = len(clones)

            result_queue.put(out)
        else:
            # Loop exited because a sibling process flipped alive to False
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        # Flag sibling processes to stop, then propagate the exception
        alive.value = False
        raise

    return None
def processQueue(alive, data_queue, result_queue, max_missing=default_max_missing,
                 clone_func=distanceClones, clone_args=None):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing
              continues; if False exit process.
      data_queue : a multiprocessing.Queue holding data to process.
      result_queue : a multiprocessing.Queue to hold processed results.
      max_missing : maximum number of non-ACGT characters to allow in the
                    junction sequence.
      clone_func : the function to call for clonal assignment.
      clone_args : a dictionary of arguments to pass to clone_func, or None
                   for no extra arguments.

    Returns:
      None
    """
    # Avoid the mutable-default-argument pitfall: build a fresh dict per call
    if clone_args is None:
        clone_args = {}

    try:
        # Iterate over the data queue until the None sentinel is reached;
        # polling (rather than a blocking get) keeps alive.value responsive
        while alive.value:
            if data_queue.empty():
                continue
            data = data_queue.get()
            if data is None:
                break

            # Define result object for this group of records
            result = DbResult(data.id, data.data)

            # Invalid group (failed indexing): forward the failed result
            if not data:
                result_queue.put(result)
                continue

            # Filter records based on missing content in the target field
            seq_field = clone_args.get('seq_field', 'JUNCTION')
            filtered = filterMissing(data.data, field=seq_field, max_missing=max_missing)
            records = filtered['pass']
            result.failed = filtered['fail']

            # Add V(D)J annotation summaries to the log
            result.log['ID'] = ','.join([str(x) for x in data.id])
            result.log['VALLELE'] = ','.join(set([(r.getVAllele() or '') for r in data.data]))
            result.log['DALLELE'] = ','.join(set([(r.getDAllele() or '') for r in data.data]))
            result.log['JALLELE'] = ','.join(set([(r.getJAllele() or '') for r in data.data]))
            result.log['JUNCLEN'] = ','.join(set([(str(len(r.junction)) or '0') for r in data.data]))
            result.log['PASSCOUNT'] = len(records)
            result.log['FAILCOUNT'] = len(result.failed)

            # Assign clones; a None return flags a preclone failure
            clones = clone_func(records, **clone_args) if records else None
            if clones is not None:
                result.results = clones
                result.valid = True
                result.log['CLONES'] = len(clones)
            else:
                result.log['CLONES'] = 0

            result_queue.put(result)
        else:
            # Loop exited because alive was set False by a sibling process
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        # Signal sibling processes to stop, then propagate the exception
        alive.value = False
        raise

    return None
def filterMissing(data, seq_field=junction_attr, v_field=v_attr, j_field=j_attr,
                  max_missing=default_max_missing):
    """
    Splits a set of sequences into passed and failed groups based on the
    number of missing characters in the sequence

    Arguments:
      data (changeo.Multiprocessing.DbData): data object.
      seq_field (str): Receptor sequence field to filter on.
      v_field (str): Receptor field containing the V call.
      j_field (str): Receptor field containing the J call.
      max_missing (int): maximum number of missing characters (non-ACGT)
                         to permit before failing the record.

    Returns:
      changeo.Multiprocessing.DbResult : object containing filtered records.
    """
    # Validate a sequence string: non-empty with at most max_missing
    # non-ACGT characters
    def _pass(seq):
        return len(seq) > 0 and len(re.findall(r'[^ACGT]', seq)) <= max_missing

    # Define result object for iteration and get data records
    result = DbResult(data.id, data.data)

    # Invalid group (failed indexing): fail every record
    if not data:
        result.data_pass = []
        result.data_fail = data.data
        return result

    # Single-pass partition into passing and failing records
    result.data_pass = []
    result.data_fail = []
    for rec in data.data:
        bucket = result.data_pass if _pass(rec.getField(seq_field)) else result.data_fail
        bucket.append(rec)

    # Add V(D)J annotation summaries to the log
    result.log['ID'] = ','.join([str(x) for x in data.id])
    result.log['VCALL'] = ','.join(set([(r.getVAllele(field=v_field) or '') for r in data.data]))
    result.log['JCALL'] = ','.join(set([(r.getJAllele(field=j_field) or '') for r in data.data]))
    # str() of an int is never empty, so the previous "or '0'" fallback was dead
    result.log['JUNCLEN'] = ','.join(set([str(len(r.junction)) for r in data.data]))
    result.log['CLONED'] = len(result.data_pass)
    result.log['FILTERED'] = len(result.data_fail)

    return result