Example #1
def retrieve_and_validate_inputs(job, job_queue, logger):
    """
    Retrieves and validates the analysis group and reprocessor from a job.
    """
    struct  = JSONMessage.unserialize(job.input_message)

    ## get IDs
    analysis_group_id     = struct.analysis_group_id
    reprocess_config_id   = struct.reprocess_config_id

    analysis_group        = Session.query(AnalysisGroup).get(analysis_group_id)
    reprocessor           = Session.query(ReprocessConfig).get(reprocess_config_id)

    # check that the analysis group exists in the DB
    if not analysis_group:
        logger.error("Reprocess job: Analysis group id: %s not found [job %s]" % (analysis_group_id, job.id))
        job_queue.abort(job, JSONErrorMessage("Unknown analysis group id: %s" % analysis_group_id))
        return None, None

    # check that the reprocessor exists in the DB
    if not reprocessor:
        logger.error("Reprocess job: reprocessor id: %s not found [job %s]" % (reprocess_config_id, job.id))
        job_queue.abort(job, JSONErrorMessage("Unknown reprocessor: %s" % reprocess_config_id))
        return None, None

    return analysis_group, reprocessor
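
A hypothetical call site for the helper above. Since job_queue.abort() evidently does not raise, the caller has to check for the None pair itself; the surrounding handler function is an assumption, not part of the original code:

def handle_reprocess_job(job, job_queue, logger):
    # Hypothetical caller, for illustration only.
    analysis_group, reprocessor = retrieve_and_validate_inputs(job, job_queue, logger)
    if analysis_group is None or reprocessor is None:
        return  # the job was already aborted inside the helper
    # ... continue with the validated analysis_group and reprocessor ...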
Example #2
def check_job_exists(job_list, analysis_group_id, reprocess_config_id):
    """
    Returns 1 if a job for the given analysis group and reprocess config
    already exists in job_list, else 0.
    """
    for job in job_list:
        struct = JSONMessage.unserialize(job.input_message)
        if (int(struct.analysis_group_id) == int(analysis_group_id) and
                int(struct.reprocess_config_id) == int(reprocess_config_id)):
            return 1
    return 0
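
The check is presumably used to keep duplicate reprocess jobs out of the queue. A minimal sketch of that guard, assuming a JOB_ID_REPROCESS job type and a keyword-style JSONMessage constructor (both modeled on the other examples on this page, not taken from the original code):

def enqueue_reprocess_job(job_queue, analysis_group_id, reprocess_config_id):
    # Hypothetical de-duplication guard; JOB_ID_REPROCESS and the keyword
    # arguments to JSONMessage are assumptions.
    pending = job_queue.remaining(job_type=JOB_ID_REPROCESS)
    if not check_job_exists(pending, analysis_group_id, reprocess_config_id):
        job_queue.add(JOB_ID_REPROCESS,
                      JSONMessage(analysis_group_id=analysis_group_id,
                                  reprocess_config_id=reprocess_config_id))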
Example #3
def process_taqman_job(job, job_queue, sequence_source, dg_calc):
    logger = logging.getLogger(LOGGER_NAME)

    struct            = JSONMessage.unserialize(job.input_message)
    forward_primer_id = struct.forward_primer_id
    reverse_primer_id = struct.reverse_primer_id
    probe_ids         = struct.probe_ids

    sequence_group    = Session.query(SequenceGroup).get(struct.sequence_group_id)
    fp_sequence       = Session.query(SequenceGroupComponent).get(forward_primer_id).sequence
    rp_sequence       = Session.query(SequenceGroupComponent).get(reverse_primer_id).sequence
    probes            = Session.query(SequenceGroupComponent).filter(SequenceGroupComponent.id.in_(probe_ids)).all()
    probe_seqs        = [p.sequence for p in probes]
    
    sequences = sequence_source.sequences_for_primers(fp_sequence.sequence, rp_sequence.sequence,
                                                      fwd_prefix_length=MAX_CACHE_PADDING,
                                                      rev_suffix_length=MAX_CACHE_PADDING)
    
    if sequences is None:
        logger.error("Amplicon TaqMan: Could not get response from server [job %s]" % job.id)
        job_queue.abort(job, JSONErrorMessage('Could not get response from server.'))
        Session.commit()
        return
    
    amplicons = create_amplicons_from_pcr_sequences(sequence_group,
                                                    forward_primer=fp_sequence,
                                                    reverse_primer=rp_sequence,
                                                    probes=probe_seqs,
                                                    pcr_sequences=sequences)
    
    for amp in amplicons:
        populate_amplicon_dgs(amp, dg_calc)
    
    logger.info("Taqman job completed [job %s]" % job.id)
    Session.commit()
    
    # TODO: add finish message
    
    # now add SNPs to cached sequences
    for amp in amplicons:
        for cseq in amp.cached_sequences:
            job_queue.add(JOB_ID_PROCESS_SNPS, ProcessSNPMessage(cached_sequence_id=cseq.id),
                          parent_job=job.parent)
    
    # finish this job only after its child SNP jobs are queued, so the
    # job tree cannot look complete before the children exist
    job_queue.finish(job, None)
Example #4
def process_transcript_job(job, job_queue, sequence_source, dg_calc):
    logger = logging.getLogger(LOGGER_NAME)

    struct = JSONMessage.unserialize(job.input_message)
    forward_primer_id = struct.forward_primer_id
    reverse_primer_id = struct.reverse_primer_id
    probe_ids = struct.probe_ids

    sequence_group = Session.query(SequenceGroup).get(struct.sequence_group_id)
    fp_sequence    = Session.query(SequenceGroupComponent).get(forward_primer_id).sequence
    rp_sequence    = Session.query(SequenceGroupComponent).get(reverse_primer_id).sequence
    probes         = Session.query(SequenceGroupComponent).filter(SequenceGroupComponent.id.in_(probe_ids)).all()
    probe_seqs     = [p.sequence for p in probes]

    transcripts = sequence_source.transcript_sequences_for_primers(fp_sequence.sequence,
                                                                   rp_sequence.sequence)

    if transcripts is None:
        logger.error("GEX TaqMan: Could not get response from server [job %s]" % job.id)
        job_queue.abort(job, JSONErrorMessage("Could not get response from server."))
        Session.commit()
        return

    db_transcripts = create_db_transcripts_from_pcr_transcripts(sequence_group,
                                                                forward_primer=fp_sequence,
                                                                reverse_primer=rp_sequence,
                                                                probes=probe_seqs,
                                                                pcr_gene_sequences=transcripts)

    sequence_group.transcripts = db_transcripts
    for trans in sequence_group.transcripts:
        populate_transcript_dgs(trans, dg_calc)

    logger.info("GEX TaqMan job completed [job %s]" % job.id)
    Session.commit()

    for trans in db_transcripts:
        job_queue.add(JOB_ID_PROCESS_GEX_SNPS, ProcessGEXSNPMessage(trans.id),
                      parent_job=job.parent)

    job_queue.finish(job, None)
Example #5
def process_location_job(job, job_queue, sequence_source, dg_calc):
    logger = logging.getLogger(LOGGER_NAME)

    struct         = JSONMessage.unserialize(job.input_message)
    
    sequence_group = Session.query(SequenceGroup).get(struct.sequence_group_id)
    sequence = None
    try:
        sequence = sequence_source.sequence_around_loc(sequence_group.location_chromosome,
                                                        sequence_group.location_base,
                                                        sequence_group.amplicon_length,
                                                        prefix_length=MAX_CACHE_PADDING,
                                                        suffix_length=MAX_CACHE_PADDING)
    except Exception:
        logger.exception("Could not retrieve sequence for assay location [job %s]: " % job.id)
        job_queue.abort(job, JSONErrorMessage('Could not retrieve the sequence for the specified amplicon location.'))
        Session.commit()
        return
    
    if sequence is None:
        logger.error("Amplicon Location: could not get response from server [job %s]: " % job.id)
        job_queue.abort(job, JSONErrorMessage('Could not get response from server.'))
        Session.commit()
        return
    
    amplicons = create_amplicons_from_pcr_sequences(sequence_group,
                                                    pcr_sequences=[sequence])
    
    for amp in amplicons:
        populate_amplicon_dgs(amp, dg_calc)
    
    logger.info("Location job completed [job %s]" % job.id)
    Session.commit()

    for amp in amplicons:
        for cseq in amp.cached_sequences:
            job_queue.add(JOB_ID_PROCESS_SNPS, ProcessSNPMessage(cached_sequence_id=cseq.id),
                          parent_job=job.parent)
    
    job_queue.finish(job, None)
Example #6
def process_snp_job(job, job_queue, sequence_source, dg_calc):
    logger = logging.getLogger(LOGGER_NAME)

    struct         = JSONMessage.unserialize(job.input_message)
    sequence_group = Session.query(SequenceGroup).get(struct.sequence_group_id)
    chromosome     = struct.chromosome
    snp_start      = struct.start
    snp_end        = struct.end

    sequence = None
    try:
        sequence   = sequence_source.sequence_around_region(chromosome, snp_start, snp_end,
                                                            sequence_group.amplicon_length,
                                                            prefix_length=MAX_CACHE_PADDING,
                                                            suffix_length=MAX_CACHE_PADDING)
    except Exception:
        logger.exception("Could not retrieve sequence around SNP location for [job %s]" % job.id)
        job_queue.abort(job, JSONErrorMessage("Could not retrieve sequence around SNP location."))
        Session.commit()
        return
    
    if sequence is None:
        logger.error("Amplicon SNP: could not get response from server [job %s]" % job.id)
        job_queue.abort(job, JSONErrorMessage('Could not get response from server.'))
        Session.commit()
        return
    
    amplicons      = create_amplicons_from_pcr_sequences(sequence_group,
                                                         pcr_sequences=[sequence])
    
    logger.info("SNP assay job completed [job %s]" % job.id)
    Session.commit()

    for amp in amplicons:
        for cseq in amp.cached_sequences:
            job_queue.add(JOB_ID_PROCESS_SNPS, ProcessSNPMessage(cached_sequence_id=cseq.id),
                          parent_job=job.parent)
    
    job_queue.finish(job, None)
Example #7
def process_assay_job(job_queue, tm_calc, dg_calc):
    logger = logging.getLogger(LOGGER_NAME)

    # mark finished first
    in_progress = job_queue.in_progress(job_type=JOB_ID_PROCESS_ASSAY)
    for job in in_progress:
        job_queue.finish_tree(job, None)
        if job_queue.is_job_done(job):
            logger.info("Job finished [job %s]" % job.id)
            args = job_queue.get_job_input_params(job)
            assay = Session.query(SequenceGroup).get(args.sequence_group_id)
            if assay:
                assay.analyzed = True
                Session.commit()
    
    remaining = job_queue.remaining(job_type=JOB_ID_PROCESS_ASSAY)
    for job in remaining:
        struct = JSONMessage.unserialize(job.input_message)
        sg_id = struct.sequence_group_id
        
        sg = Session.query(SequenceGroup).get(sg_id)
        # TODO: I think the parent function should have cleared out the amplicons,
        # but I need to make that choice
        if not sg:
            logger.error("Unknown sequence group id: %s [job %s]" % (sg_id, job.id))
            job_queue.abort(job, JSONErrorMessage("Unknown sequence group id: %s" % sg_id))
            continue  # sg is None; the kit_type checks below would raise

        if sg.kit_type == SequenceGroup.TYPE_DESIGNED:
            probe_ids = [p.id for p in sg.probes]
            # TODO: transaction?
            if sg.type == SequenceGroup.ASSAY_TYPE_GEX:
                job_type = JOB_ID_PROCESS_GEX_TAQMAN_TRANSCRIPT
            else:
                job_type = JOB_ID_PROCESS_TAQMAN_AMPLICON

            for fp in sg.forward_primers:
                for rp in sg.reverse_primers:
                    job_queue.add(job_type,
                                  ProcessPrimerAmpliconMessage(sequence_group_id=sg.id,
                                                               forward_primer_id=fp.id,
                                                               reverse_primer_id=rp.id,
                                                               probe_ids=probe_ids),
                                  parent_job=job)
            
            # TM, DG of sequence components right here
            for fp in sg.forward_primers:
                fp.tm = tm_calc.tm_primer(fp.sequence.sequence)
                fp.dg = dg_calc.delta_g(fp.sequence.sequence)
            for rp in sg.reverse_primers:
                rp.tm = tm_calc.tm_primer(rp.sequence.sequence)
                rp.dg = dg_calc.delta_g(rp.sequence.sequence)
            for p in sg.probes:
                # MGB quenchers use the MGB melting-temperature model
                is_mgb = bool(p.quencher and p.quencher.upper() == 'MGB')
                p.tm = tm_calc.tm_probe(p.sequence.sequence, mgb=is_mgb)
                p.dg = dg_calc.delta_g(p.sequence.sequence)
            Session.commit()

        elif sg.kit_type == SequenceGroup.TYPE_LOCATION:
            job_queue.add(JOB_ID_PROCESS_LOCATION_AMPLICON,
                          ProcessLocationAmpliconMessage(sequence_group_id=sg.id),
                          parent_job=job)
        
        elif sg.kit_type == SequenceGroup.TYPE_SNP:
            job_queue.add(JOB_ID_PROCESS_SNP_RSID,
                          ProcessSNPRSIDMessage(sequence_group_id=sg.id),
                          parent_job=job)
        
        # TODO: need to be in transaction?
        job_queue.progress(job)
    
    # Closing the session here is essential; otherwise the threaded job
    # workers eventually exhaust the SQL connection pool.
    Session.close()
Example #8
def process_snp_job(job_queue, snp_source, snp_table):
    logger = logging.getLogger(LOGGER_NAME)

    in_progress = job_queue.in_progress(job_type=JOB_ID_PROCESS_SNP_RSID)
    for job in in_progress:
        job_queue.finish_tree(job, None)
    
    remaining = job_queue.remaining(job_type=(JOB_ID_PROCESS_SNPS, JOB_ID_PROCESS_SNP_RSID, JOB_ID_PROCESS_GEX_SNPS))
    for job in remaining:
        if job.type == JOB_ID_PROCESS_SNPS:
            snps               = []
            struct             = JSONMessage.unserialize(job.input_message)
            cached_sequence_id = struct.cached_sequence_id
            cached_seq         = Session.query(AmpliconSequenceCache).get(cached_sequence_id)
            if not cached_seq:
                logger.error("SNP job: Unknown amplicon sequence id: %s [job %s]" % (cached_sequence_id, job.id))
                job_queue.abort(job, JSONErrorMessage("Unknown amplicon sequence id: %s" % cached_sequence_id))
                continue  # cached_seq is None; the range query below would raise

            try:
                snps = snp_source.snps_in_range(cached_seq.chromosome,
                                                cached_seq.start_pos-cached_seq.seq_padding_pos5,
                                                cached_seq.end_pos+cached_seq.seq_padding_pos3)
            except Exception:
                # DB timeout: abort job.
                logger.exception("Error from SNP worker:")
                job_queue.abort(job, JSONErrorMessage("Unable to connect to SNP database."))
                continue
            
            if snps:
                db_snps = snp_objects_from_extdb(snps, snp_table)
                if not cached_seq.snps:
                    cached_seq.snps = []
                for snp in db_snps:
                    cached_seq.snps.append(snp)
            
            logger.info("SNP process job finished [job %s]" % job.id)
            Session.commit()
            job_queue.finish(job, None)
        
        elif job.type == JOB_ID_PROCESS_GEX_SNPS:
            snps = []
            struct = JSONMessage.unserialize(job.input_message)
            transcript_id = struct.transcript_id
            transcript = Session.query(Transcript).get(transcript_id)
            if not transcript:
                logger.error("GEX SNP job: Unknown transcript id: %s [job %s]" % (transcript_id, job.id))
                job_queue.abort(job, JSONErrorMessage("Unknown transcript id: %s" % transcript_id))
                continue  # transcript is None; the exon lookup below would raise
            try:
                logger.debug("Transcript exon regions: %s" % transcript.exon_regions)
                snps = snp_source.snps_in_chrom_ranges(transcript.chromosome,
                                                       transcript.exon_bounds)
            except Exception:
                # DB timeout
                logger.exception("Error from SNP worker:")
                job_queue.abort(job, JSONErrorMessage("Unable to connect to SNP database."))
                continue

            if snps:
                # transcript?
                db_snps = snp_objects_from_extdb(snps, snp_table)
                if not transcript.snps:
                    transcript.snps = []
                for snp in db_snps:
                    transcript.snps.append(snp)

            logger.info("GEX SNP process job finished [job %s]" % job.id)
            Session.commit()
            job_queue.finish(job, None)
        
        elif job.type == JOB_ID_PROCESS_SNP_RSID:
            struct = JSONMessage.unserialize(job.input_message)
            sequence_group_id = struct.sequence_group_id
            sequence_group    = Session.query(SequenceGroup).get(sequence_group_id)
            if not sequence_group:
                logger.error("Process RSID: unknown sequence group id: %s [job %s]" % (sequence_group_id, job.id))
                job_queue.abort(job, JSONErrorMessage("Unknown sequence group id: %s" % sequence_group_id))
                continue  # sequence_group is None; the rsid lookup below would raise

            snp_rsid = sequence_group.snp_rsid
            if not snp_rsid:
                logger.error("Process RSID: empty RSID [job %s]" % job.id)
                job_queue.abort(job, JSONErrorMessage("Empty SNP rsid."))
                continue

            try:
                snps = snp_source.snps_by_rsid(snp_rsid)
                if not snps:
                    logger.info("Process RSID unknown RSID: %s [job %s]" % (snp_rsid, job.id))
                    job_queue.abort(job, JSONErrorMessage("Unknown SNP rsid."))
                    continue
            except Exception:
                logger.exception("Error from SNP worker:")
                job_queue.abort(job, JSONErrorMessage("Unable to connect to SNP database."))
                continue
            
            for snp in snps:
                chromosome = snp['chrom'][3:]  # strip the leading 'chr'
                if snp['refUCSC'] == '-':  # deletion
                    start = snp['chromStart']
                else:
                    start = snp['chromStart'] + 1
                end = snp['chromEnd']
                message = ProcessSNPAmpliconMessage(sequence_group_id, chromosome, start, end)
                job_queue.add(JOB_ID_PROCESS_SNP_AMPLICON, message, parent_job=job)
            
            # TODO: need to be in transaction?
            job_queue.progress(job)
    
    Session.close()
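
Examples #7 and #8 are written to be polled rather than called once: each pass drains the in-progress and remaining queues and then closes the session. A minimal sketch of a driver loop, assuming the job_queue, tm_calc, dg_calc, snp_source, and snp_table objects are constructed elsewhere; the 30-second interval is an arbitrary choice, not from the original code:

import time

def run_worker(job_queue, tm_calc, dg_calc, snp_source, snp_table, interval=30):
    # Hypothetical polling driver for the two dispatchers above.
    while True:
        process_assay_job(job_queue, tm_calc, dg_calc)
        process_snp_job(job_queue, snp_source, snp_table)
        time.sleep(interval)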