def retreive_and_validate_inputs(job, job_queue, logger):
    """Retrieves and validates the analysis group and reprocessor for a job."""
    struct = JSONMessage.unserialize(job.input_message)

    # get IDs
    analysis_group_id = struct.analysis_group_id
    reprocess_config_id = struct.reprocess_config_id
    analysis_group = Session.query(AnalysisGroup).get(analysis_group_id)
    reprocessor = Session.query(ReprocessConfig).get(reprocess_config_id)

    # check if analysis group exists in DB
    if not analysis_group:
        logger.error("Reprocess job: analysis group id %s not found [job %s]" % (analysis_group_id, job.id))
        job_queue.abort(job, JSONErrorMessage("Unknown analysis group id: %s" % analysis_group_id))

    # check if reprocessor exists in DB
    if not reprocessor:
        logger.error("Reprocess job: reprocessor id %s not found [job %s]" % (reprocess_config_id, job.id))
        job_queue.abort(job, JSONErrorMessage("Unknown reprocessor: %s" % reprocess_config_id))

    return analysis_group, reprocessor
def check_job_exists(job_list, analysis_group_id, reprocess_config_id):
    """ returns 1 if job exists, else 0 """
    for job in job_list:
        struct = JSONMessage.unserialize(job.input_message)
        if (int(struct.analysis_group_id) == int(analysis_group_id) and
                int(struct.reprocess_config_id) == int(reprocess_config_id)):
            return 1
    return 0
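# Usage sketch (not part of the original module): guard against enqueuing a
# duplicate reprocess job before adding a new one. JOB_ID_REPROCESS and
# ReprocessMessage are hypothetical, illustrative names only; only
# job_queue.remaining/add and check_job_exists come from the surrounding code.
def maybe_enqueue_reprocess(job_queue, analysis_group_id, reprocess_config_id):
    pending = job_queue.remaining(job_type=JOB_ID_REPROCESS)  # hypothetical job type constant
    if check_job_exists(pending, analysis_group_id, reprocess_config_id):
        # a matching job is already queued; skip the duplicate
        return False
    job_queue.add(JOB_ID_REPROCESS,
                  ReprocessMessage(analysis_group_id=analysis_group_id,          # hypothetical message class
                                   reprocess_config_id=reprocess_config_id))
    return True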
def process_taqman_job(job, job_queue, sequence_source, dg_calc):
    logger = logging.getLogger(LOGGER_NAME)
    struct = JSONMessage.unserialize(job.input_message)
    forward_primer_id = struct.forward_primer_id
    reverse_primer_id = struct.reverse_primer_id
    probe_ids = struct.probe_ids

    sequence_group = Session.query(SequenceGroup).get(struct.sequence_group_id)
    fp_sequence = Session.query(SequenceGroupComponent).get(forward_primer_id).sequence
    rp_sequence = Session.query(SequenceGroupComponent).get(reverse_primer_id).sequence
    probes = Session.query(SequenceGroupComponent).filter(SequenceGroupComponent.id.in_(probe_ids)).all()
    probe_seqs = [p.sequence for p in probes]

    sequences = sequence_source.sequences_for_primers(fp_sequence.sequence,
                                                      rp_sequence.sequence,
                                                      fwd_prefix_length=MAX_CACHE_PADDING,
                                                      rev_suffix_length=MAX_CACHE_PADDING)
    if sequences is None:
        logger.error("Amplicon TaqMan: could not get response from server [job %s]" % job.id)
        job_queue.abort(job, JSONErrorMessage('Could not get response from server.'))
        Session.commit()
        return

    amplicons = create_amplicons_from_pcr_sequences(sequence_group,
                                                    forward_primer=fp_sequence,
                                                    reverse_primer=rp_sequence,
                                                    probes=probe_seqs,
                                                    pcr_sequences=sequences)
    for amp in amplicons:
        populate_amplicon_dgs(amp, dg_calc)

    logger.info("TaqMan job completed [job %s]" % job.id)
    Session.commit()

    # TODO: add finish message
    # now add SNPs to cached sequences
    for amp in amplicons:
        for cseq in amp.cached_sequences:
            job_queue.add(JOB_ID_PROCESS_SNPS,
                          ProcessSNPMessage(cached_sequence_id=cseq.id),
                          parent_job=job.parent)

    # avoid condition where job completed -- clear child job first
    job_queue.finish(job, None)
def process_transcript_job(job, job_queue, sequence_source, dg_calc):
    logger = logging.getLogger(LOGGER_NAME)
    struct = JSONMessage.unserialize(job.input_message)
    forward_primer_id = struct.forward_primer_id
    reverse_primer_id = struct.reverse_primer_id
    probe_ids = struct.probe_ids

    sequence_group = Session.query(SequenceGroup).get(struct.sequence_group_id)
    fp_sequence = Session.query(SequenceGroupComponent).get(forward_primer_id).sequence
    rp_sequence = Session.query(SequenceGroupComponent).get(reverse_primer_id).sequence
    probes = Session.query(SequenceGroupComponent).filter(SequenceGroupComponent.id.in_(probe_ids)).all()
    probe_seqs = [p.sequence for p in probes]

    transcripts = sequence_source.transcript_sequences_for_primers(fp_sequence.sequence, rp_sequence.sequence)
    if transcripts is None:
        logger.error("GEX TaqMan: could not get response from server [job %s]" % job.id)
        job_queue.abort(job, JSONErrorMessage("Could not get response from server."))
        Session.commit()
        return

    db_transcripts = create_db_transcripts_from_pcr_transcripts(sequence_group,
                                                                forward_primer=fp_sequence,
                                                                reverse_primer=rp_sequence,
                                                                probes=probe_seqs,
                                                                pcr_gene_sequences=transcripts)
    sequence_group.transcripts = db_transcripts
    for trans in sequence_group.transcripts:
        populate_transcript_dgs(trans, dg_calc)

    logger.info("GEX TaqMan job completed [job %s]" % job.id)
    Session.commit()

    for trans in db_transcripts:
        job_queue.add(JOB_ID_PROCESS_GEX_SNPS,
                      ProcessGEXSNPMessage(trans.id),
                      parent_job=job.parent)

    job_queue.finish(job, None)
def process_location_job(job, job_queue, sequence_source, dg_calc):
    logger = logging.getLogger(LOGGER_NAME)
    struct = JSONMessage.unserialize(job.input_message)
    sequence_group = Session.query(SequenceGroup).get(struct.sequence_group_id)

    sequence = None
    try:
        sequence = sequence_source.sequence_around_loc(sequence_group.location_chromosome,
                                                       sequence_group.location_base,
                                                       sequence_group.amplicon_length,
                                                       prefix_length=MAX_CACHE_PADDING,
                                                       suffix_length=MAX_CACHE_PADDING)
    except Exception:
        logger.exception("Could not retrieve sequence for assay location [job %s]" % job.id)
        job_queue.abort(job, JSONErrorMessage('Could not retrieve the sequence for the specified amplicon location.'))
        Session.commit()
        return

    if sequence is None:
        logger.error("Amplicon Location: could not get response from server [job %s]" % job.id)
        job_queue.abort(job, JSONErrorMessage('Could not get response from server.'))
        Session.commit()
        return

    amplicons = create_amplicons_from_pcr_sequences(sequence_group, pcr_sequences=[sequence])
    for amp in amplicons:
        populate_amplicon_dgs(amp, dg_calc)

    logger.info("Location job completed [job %s]" % job.id)
    Session.commit()

    for amp in amplicons:
        for cseq in amp.cached_sequences:
            job_queue.add(JOB_ID_PROCESS_SNPS,
                          ProcessSNPMessage(cached_sequence_id=cseq.id),
                          parent_job=job.parent)

    job_queue.finish(job, None)
# Handler for a single JOB_ID_PROCESS_SNP_AMPLICON job (builds an amplicon around
# a SNP region); named distinctly from the queue-level process_snp_job defined below.
def process_snp_amplicon_job(job, job_queue, sequence_source, dg_calc):
    logger = logging.getLogger(LOGGER_NAME)
    struct = JSONMessage.unserialize(job.input_message)
    sequence_group = Session.query(SequenceGroup).get(struct.sequence_group_id)
    chromosome = struct.chromosome
    snp_start = struct.start
    snp_end = struct.end

    sequence = None
    try:
        sequence = sequence_source.sequence_around_region(chromosome, snp_start, snp_end,
                                                          sequence_group.amplicon_length,
                                                          prefix_length=MAX_CACHE_PADDING,
                                                          suffix_length=MAX_CACHE_PADDING)
    except Exception:
        logger.exception("Could not retrieve sequence around SNP location [job %s]" % job.id)
        job_queue.abort(job, JSONErrorMessage("Could not retrieve sequence around SNP location."))
        Session.commit()
        return

    if sequence is None:
        logger.error("Amplicon SNP: could not get response from server [job %s]" % job.id)
        job_queue.abort(job, JSONErrorMessage('Could not get response from server.'))
        Session.commit()
        return

    amplicons = create_amplicons_from_pcr_sequences(sequence_group, pcr_sequences=[sequence])
    logger.info("SNP assay job completed [job %s]" % job.id)
    Session.commit()

    for amp in amplicons:
        for cseq in amp.cached_sequences:
            job_queue.add(JOB_ID_PROCESS_SNPS,
                          ProcessSNPMessage(cached_sequence_id=cseq.id),
                          parent_job=job.parent)

    job_queue.finish(job, None)
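# Dispatch sketch (not defined in the original module): one plausible way the
# single-job handlers above could be driven from the queue, assuming job.type
# carries the job type constant (as the SNP processor below assumes).
def process_amplicon_jobs(job_queue, sequence_source, dg_calc):
    handlers = {JOB_ID_PROCESS_TAQMAN_AMPLICON: process_taqman_job,
                JOB_ID_PROCESS_GEX_TAQMAN_TRANSCRIPT: process_transcript_job,
                JOB_ID_PROCESS_LOCATION_AMPLICON: process_location_job,
                JOB_ID_PROCESS_SNP_AMPLICON: process_snp_amplicon_job}
    for job in job_queue.remaining(job_type=tuple(handlers.keys())):
        handlers[job.type](job, job_queue, sequence_source, dg_calc)
    Session.close()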
def process_assay_job(job_queue, tm_calc, dg_calc):
    logger = logging.getLogger(LOGGER_NAME)

    # mark finished jobs first
    in_progress = job_queue.in_progress(job_type=JOB_ID_PROCESS_ASSAY)
    for job in in_progress:
        job_queue.finish_tree(job, None)
        if job_queue.is_job_done(job):
            logger.info("Job finished [job %s]" % job.id)
            args = job_queue.get_job_input_params(job)
            assay = Session.query(SequenceGroup).get(args.sequence_group_id)
            if assay:
                assay.analyzed = True
                Session.commit()

    remaining = job_queue.remaining(job_type=JOB_ID_PROCESS_ASSAY)
    for job in remaining:
        struct = JSONMessage.unserialize(job.input_message)
        sg_id = struct.sequence_group_id
        sg = Session.query(SequenceGroup).get(sg_id)

        # TODO: I think the parent function should have cleared out the amplicons,
        # but I need to make that choice
        if not sg:
            logger.error("Unknown sequence group id: %s [job %s]" % (sg_id, job.id))
            job_queue.abort(job, JSONErrorMessage("Unknown sequence group id: %s" % sg_id))
            continue

        if sg.kit_type == SequenceGroup.TYPE_DESIGNED:
            probe_ids = [p.id for p in sg.probes]
            # TODO: transaction?
            if sg.type == SequenceGroup.ASSAY_TYPE_GEX:
                job_type = JOB_ID_PROCESS_GEX_TAQMAN_TRANSCRIPT
            else:
                job_type = JOB_ID_PROCESS_TAQMAN_AMPLICON

            for fp in sg.forward_primers:
                for rp in sg.reverse_primers:
                    job_queue.add(job_type,
                                  ProcessPrimerAmpliconMessage(sequence_group_id=sg.id,
                                                               forward_primer_id=fp.id,
                                                               reverse_primer_id=rp.id,
                                                               probe_ids=probe_ids),
                                  parent_job=job)

            # compute TM and delta G of the sequence components right here
            for fp in sg.forward_primers:
                fp.tm = tm_calc.tm_primer(fp.sequence.sequence)
                fp.dg = dg_calc.delta_g(fp.sequence.sequence)
            for rp in sg.reverse_primers:
                rp.tm = tm_calc.tm_primer(rp.sequence.sequence)
                rp.dg = dg_calc.delta_g(rp.sequence.sequence)
            for p in sg.probes:
                if p.quencher and p.quencher.upper() == 'MGB':
                    p.tm = tm_calc.tm_probe(p.sequence.sequence, mgb=True)
                else:
                    p.tm = tm_calc.tm_probe(p.sequence.sequence, mgb=False)
                p.dg = dg_calc.delta_g(p.sequence.sequence)
            Session.commit()
        elif sg.kit_type == SequenceGroup.TYPE_LOCATION:
            job_queue.add(JOB_ID_PROCESS_LOCATION_AMPLICON,
                          ProcessLocationAmpliconMessage(sequence_group_id=sg.id),
                          parent_job=job)
        elif sg.kit_type == SequenceGroup.TYPE_SNP:
            job_queue.add(JOB_ID_PROCESS_SNP_RSID,
                          ProcessSNPRSIDMessage(sequence_group_id=sg.id),
                          parent_job=job)

        # TODO: need to be in transaction?
        job_queue.progress(job)

    # Closing the session here is essential; otherwise the SQL connection pool
    # is exhausted when this runs in a threaded worker.
    Session.close()
def process_snp_job(job_queue, snp_source, snp_table): logger = logging.getLogger(LOGGER_NAME) in_progress = job_queue.in_progress(job_type=JOB_ID_PROCESS_SNP_RSID) for job in in_progress: job_queue.finish_tree(job, None) remaining = job_queue.remaining(job_type=(JOB_ID_PROCESS_SNPS, JOB_ID_PROCESS_SNP_RSID, JOB_ID_PROCESS_GEX_SNPS)) for job in remaining: if job.type == JOB_ID_PROCESS_SNPS: snps = [] struct = JSONMessage.unserialize(job.input_message) cached_sequence_id = struct.cached_sequence_id cached_seq = Session.query(AmpliconSequenceCache).get(cached_sequence_id) if not cached_seq: logger.error("SNP job: Unknown amplicon sequence id: %s [job %s]" % (cached_sequence_id, job.id)) job_queue.abort(job, JSONErrorMessage("Unknown amplicon sequence id: %s" % cached_sequence_id)) try: snps = snp_source.snps_in_range(cached_seq.chromosome, cached_seq.start_pos-cached_seq.seq_padding_pos5, cached_seq.end_pos+cached_seq.seq_padding_pos3) except Exception: # DB timeout: abort job. logger.exception("Error from SNP worker:") job_queue.abort(job, JSONErrorMessage("Unable to connect to SNP database.")) continue if snps: db_snps = snp_objects_from_extdb(snps, snp_table) if not cached_seq.snps: cached_seq.snps = [] for snp in db_snps: cached_seq.snps.append(snp) logger.info("SNP process job finished [job %s]" % job.id) Session.commit() job_queue.finish(job, None) elif job.type == JOB_ID_PROCESS_GEX_SNPS: snps = [] struct = JSONMessage.unserialize(job.input_message) transcript_id = struct.transcript_id transcript = Session.query(Transcript).get(transcript_id) if not transcript: logger.error("GEX SNP job: Unknown transcript id: %s [job %s]" % (transcript_id, job.id)) job_queue.abort(job, JSONErrorMessage("Unknown transcript id %s" % transcript_id)) try: print transcript.exon_regions snps = snp_source.snps_in_chrom_ranges(transcript.chromosome, transcript.exon_bounds) except Exception: # DB timeout logger.exception("Error from SNP worker:") job_queue.abort(job, JSONErrorMessage("Unable to connect to SNP database.")) continue if snps: # transcript? 
db_snps = snp_objects_from_extdb(snps, snp_table) if not transcript.snps: transcript.snps = [] for snp in db_snps: transcript.snps.append(snp) logger.info("GEX SNP process job finished [job %s]" % job.id) Session.commit() job_queue.finish(job, None) elif job.type == JOB_ID_PROCESS_SNP_RSID: struct = JSONMessage.unserialize(job.input_message) sequence_group_id = struct.sequence_group_id sequence_group = Session.query(SequenceGroup).get(sequence_group_id) if not sequence_group: logger.error("Process RSID unknown sequence id: %s [job %s]" % (sequence_group_id, job.id)) job_queue.abort(job, JSONErrorMessage("Unknown sequence id.")) snp_rsid = sequence_group.snp_rsid if not snp_rsid: logger.error("Process RSID empty RSID: %s [job %s]" % (snp_rsid, job.id)) job_queue.abort(job, JSONErrorMessage("Empty SNP rsid.")) try: snps = snp_source.snps_by_rsid(snp_rsid) if not snps: logger.info("Process RSID unknown RSID: %s [job %s]" % (snp_rsid, job.id)) job_queue.abort(job, JSONErrorMessage("Unknown SNP rsid.")) continue except Exception: logger.exception("Error from SNP worker:") job_queue.abort(job, JSONErrorMessage("Unable to connect to SNP database.")) continue locations = [] for snp in snps: chromosome = snp['chrom'][3:] if snp['refUCSC'] == '-': # deletion: start = snp['chromStart'] else: start = snp['chromStart']+1 end = snp['chromEnd'] message = ProcessSNPAmpliconMessage(sequence_group_id, chromosome, start, end) job_queue.add(JOB_ID_PROCESS_SNP_AMPLICON, message, parent_job=job) # TODO: need to be in transaction? job_queue.progress(job) Session.close()
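# Worker-loop sketch (not part of the original module): one plausible way a
# daemon could drive the two queue processors above. The poll interval and the
# assumption that the calculators and sources are constructed elsewhere are
# illustrative only.
import time

def run_worker(job_queue, tm_calc, dg_calc, snp_source, snp_table, poll_seconds=30):
    while True:
        process_assay_job(job_queue, tm_calc, dg_calc)
        process_snp_job(job_queue, snp_source, snp_table)
        time.sleep(poll_seconds)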