Пример #1
0
 def hmmsearch_genes(self,
                     hmms,
                     genome,
                     table='Standard',
                     decorate=False,
                     **kwargs):
     '''
     Search the translated gene/CDS features of a genome with one or more HMMs.
     Before translation every gene feature receives two qualifiers:
     'feature_id' (its index in genome.features) and 'gene_id' (its ordinal
     in the list returned by get_indexes_of_all_genes).
     @param hmms: a single hmm given as str, or an iterable of them; each one
     is passed to self.hmmsearch_recs and, when decorating, to os.path.basename
     @param genome: object providing .features and .description (presumably a
     SeqRecord -- confirm against callers); it is modified in place
     @param table: translation table forwarded to Translator.translate_features
     @param decorate: if True, for every HSP of every matched hit a feature
     built by self.hsp2feature is appended to genome.features
     @param kwargs: forwarded unchanged to self.hmmsearch_recs
     @return: dict {feature_id: (hit, translated_record)}, or None when no
     genes were found, translation failed, any single search returned nothing,
     or no hit could be mapped back to a feature.
     '''
     gene_indexes = get_indexes_of_all_genes(genome)
     if not gene_indexes:
         return None
     #tag the genes so that hits can later be mapped back to genome.features
     for ordinal, feature_index in enumerate(gene_indexes):
         qualifiers = genome.features[feature_index].qualifiers
         qualifiers['feature_id'] = feature_index
         qualifiers['gene_id'] = ordinal
     with user_message('Translating _genes/CDS of %s' % genome.description,
                       '\n'):
         translation = Translator(self._abort_event).translate_features(
             genome, gene_indexes, table)
     if not translation:
         return None
     hmm_list = [hmms] if isinstance(hmms, str) else hmms
     results = {}
     for hmm in hmm_list:
         with user_message('Performing hmm search.'):
             search_results = self.hmmsearch_recs(hmm, translation, **kwargs)
         #one empty search discards everything collected so far
         if not search_results:
             return None
         with user_message('Parsing search results...'):
             #index all hits by id; a later hit with the same id wins
             hits_by_id = {hit.id: hit
                           for res in search_results
                           for hit in res.iterhits()}
             #map feature_id -> (hit, translated record) for the matched genes
             matched = {}
             for protein in translation:
                 if protein.id not in hits_by_id:
                     continue
                 feature_id = protein.features[0].qualifiers.get('feature_id')
                 if feature_id is not None:
                     matched[feature_id] = (hits_by_id[protein.id], protein)
             if matched:
                 results.update(matched)
         if not decorate:
             continue
         with user_message('Adding results as annotations...'):
             hmm_name = os.path.basename(hmm)
             for feature_id, (hit, _protein) in matched.items():
                 feature = genome.features[feature_id]
                 for hsp in hit:
                     #hsp coordinates are scaled by 3 and counted from the
                     #feature start on strand 1, from its end otherwise
                     if feature.strand == 1:
                         origin = feature.location.start
                         lower = origin + hsp.hit_start * 3
                         upper = origin + hsp.hit_end * 3
                     else:
                         origin = feature.location.end
                         lower = origin - hsp.hit_end * 3
                         upper = origin - hsp.hit_start * 3
                     hmm_location = FeatureLocation(lower, upper,
                                                    feature.strand)
                     genome.features.append(
                         self.hsp2feature(hmm_name, 'HMM_annotations',
                                          hmm_location, hsp))
     return results or None
Пример #2
0
 def g2g_blastp(self, reference, subjects, table='Standard',
                evalue=0.001, max_rlen=0, features_of_interest=None):
     '''
     Perform blastp of each coding sequence of the reference against each 
     subject, which is first translated gene-by-gene.
     As a side effect every gene feature of the reference and of the subjects 
     gets 'feature_id' (its index in record.features) and 'gene_id' (its 
     ordinal among the genes of that record) qualifiers.
     Parameters
     @param reference: SeqRecord object of the reference genome
     @param subjects: a list of SeqRecord objects of subject genomes
     @param table: translation table (see NCBI site for description); passed 
     as is to Translator.translate_features
     @param evalue: filter out blastp results with E-value greater than this
     @param max_rlen: filter out blastp results which are shorter than this 
     fraction of target gene length.
     NOTE: this parameter is not referenced anywhere in the method body.
     @param features_of_interest: list of dictionaries of the form 
     {qualifier_name : qualifier_value}
     to mark features denoting known clusters that should be analyzed one 
     against the other; None (the default) or an empty list skips this step
     @return: list of pairs (CDS, (blast_result1, blast_result2, ...)) 
     where CDS is a gene/CDS feature from the reference.features list 
     and blast_resultN is a list of results for the N-th  
     subject, containing following information:
     (hit_feature, align_length, percent_identity, evalue)
     where hit_feature is a SeqFeature object of the gene/CDS of the subject
     where top blast hit is located, align_length is the length of the hit,
     percent_identity is the ratio of number of identities and align_length [0; 1]
     and evalue is the E-value of the top hit.
     None is returned if the input is empty, no genes are annotated, 
     translation or blast yields nothing, or the work was aborted.
     '''
     if not reference or not subjects:
         print('No reference or subject sequences provided')
         return None
     #get list of features to query
     with user_message('Searching for gene/CDS features in provided sequences...'):
         #index 0 is always the reference, subjects follow
         all_records = [reference]+subjects
         num_records = len(all_records)
         features = self.parallelize_work(1, lambda ri, records: self._get_genes(records[ri]), 
                                          range(num_records), 
                                          all_records)
         if self.aborted():
             print('\nAborted')
             return None
         if not features or not features[0]:
             print ('\nReference sequence does not contain annotated _genes:\n%s %s'
                    % (reference.id, reference.description))
             return None
         #the reference has genes at this point, so fewer than two non-empty 
         #entries means that no subject has any
         if len([f for f in features if f]) < 2:
             print('\nSubject sequences do not contain annotated _genes')
             return None
         #add gene ids
         for ri, genes in enumerate(features):
             if not genes: continue
             r = all_records[ri]
             for gene_id, gi in enumerate(genes):
                 r.features[gi].qualifiers['feature_id'] = gi
                 r.features[gi].qualifiers['gene_id'] = gene_id
     #get features of interest if requested
     fois = None
     if features_of_interest:
         with user_message('Searching for features of interest...'):
             fois = []
             for foi in features_of_interest:
                 foi = self._get_fois(all_records, foi)
                 if foi and foi[0]: fois.append(foi)
                 if self.aborted():
                     print('\nAborted')
                     return None
     #translate features to proteins
     with Progress('Translating _genes found in the reference and subjects...', num_records) as prg:
         translator = Translator(self._abort_event)
         translations = [None]*num_records
         #fois is None when no features of interest were requested; 
         #iterating over None directly would raise TypeError
         foi_translations = [[None]*num_records for _f in (fois or [])]
         for i, (f, rec) in enumerate(zip(features, all_records)):
             if not f:
                 prg.step(i) 
                 continue
             translation = translator.translate_features(rec, f, table)
             if not translation: return None 
             if i > 0: 
                 #subjects: translated genes are concatenated into one record
                 translations[i] = cat_records(translation)
                 if fois:
                     for ifoi, foi in enumerate(fois):
                         #1-based start and end spanning all genes of this FOI 
                         #in the concatenated record; [0, 0] means 'not found'
                         foi_loc = [0, 0]
                         for foi_var in foi[i]: 
                             if not foi_var: continue
                             for gid in foi_var:
                                 l = translations[i].features[gid].location
                                 start = int(l.start)+1
                                 foi_loc[0] = min(start, foi_loc[0]) if foi_loc[0] > 0 else start
                                 foi_loc[1] = max(int(l.end), foi_loc[1])
                         if foi_loc[0] > 0: foi_translations[ifoi][i] = foi_loc 
             else: 
                 #reference: translated genes are kept as a separate records
                 translations[i] = translation
                 if fois: 
                     for ifoi, foi in enumerate(fois):
                         foi_translations[ifoi][i] = [[translation[gid] for gid in foi_var] for foi_var in foi[i]]
             prg.step(i)
     #blast features against subjects
     with user_message('Performing local blast of every translated gene in the reference against every translated subject...', '\n'):
         stranslations = translations[1:]
         blast_results = self._s2s_blast_batch(translations[0], stranslations, None, evalue, 
                                               command='blastp', task='blastp')
         if self.aborted():
             print('\nAborted')
             return None
         if not blast_results:
             print('\nBlast have not returned any results.')
             return None
     if fois: #redo blast for fois and replace the results
         with user_message('Rerunning blast for FOIs...', '\n'):
             for ifoi, foi in enumerate(foi_translations):
                 sfoi_locs = foi[1:]
                 for i, foi_var in enumerate(foi[0]):
                     foi_blast = self._s2s_blast_batch(foi_var, stranslations, sfoi_locs, evalue, 
                                                       command='blastp', task='blastp')
                     if self.aborted():
                         print('\nAborted')
                         return None
                     if not foi_blast: continue
                     #overwrite the genome-wide results of the FOI genes 
                     #with the non-empty results of the restricted search
                     for gi, gid in enumerate(fois[ifoi][0][i]):
                         if foi_blast[gi]:
                             blast_results[gid] = foi_blast[gi]
     #process blast results
     #one work item per (reference gene index, subject index) pair
     pairs = list(itertools.product(xrange(len(translations[0])), xrange(len(stranslations))))
     with ProgressCounter('Searching for _genes in subjects that overlap with top blast hits...', len(pairs)) as prg:
         work = self.Work()
         work.start_work(self._find_features_by_hsps, pairs,
                         None, stranslations, blast_results)
         @MultiprocessingBase.results_assembler
         def assembler(index, result, blast_results, pairs, prg):
             #store the result of a work item in place of the raw blast result
             qs = pairs[index]
             blast_results[qs[0]][qs[1]] = result
             prg.count()
         work.assemble(assembler, blast_results, pairs, prg)
         if not work.wait(): return None
     return zip((reference.features[f] for f in features[0]), blast_results)