def unique_contiguous_region_filter(sequence):
    '''It flags the snvs located in repeated or discontiguous regions.

    For every snv feature of the sequence that has no previous
    'uniq_contiguous' result for the current distance, the fragment that
    spans distance residues to each side of the snv start is blasted. The
    result stored in the snv is:

      - False when no alignment passes the match filters, the region is
        assumed to be unique.
      - True when the first filtered alignment has more than one match.
      - With exactly one match the fragment is realigned to look for
        introns: True if any intron is inferred, False otherwise.

    It returns the given sequence once its snvs have been processed, or
    None when the sequence is None.
    '''
    if sequence is None:
        return None

    for snv in sequence.get_features(kind='snv'):
        # Check if it is already done
        previous_result = _get_filter_result(snv, 'uniq_contiguous',
                                             threshold=distance)
        if previous_result is not None:
            continue

        # we make a blast with the sequence around the snv
        location = snv.location.start.position
        start = max(0, location - distance)
        end = location + distance
        seq_fragment = sequence[start:end]

        blast_fhand = blast_runner(seq_fragment)['blastn']
        # the blast output is closed even if the parsing or the realignment
        # fail, otherwise the handle would leak
        try:
            # now we parse the blast
            blast_result = blast_parser(blast_fhand)
            alignments = filter_alignments(blast_result,
                                           config=match_filters)
            # are there any similar sequences?
            # the next builtin works with python 2 and 3 iterators
            alignment = next(alignments, None)
            if alignment is None:
                # if there is no similar sequence we assume that is unique
                result = False
            elif len(alignment['matches']) > 1:
                # how many matches, it should be only one
                result = True
            else:
                # how many match parts have the first match?
                # we could do it with the blast result, but blast is not
                # very good aligning, so we realign with est2genome
                blast_fhand.seek(0)
                sim_seqs = similar_sequences_for_blast(blast_fhand)
                sim_seq = sim_seqs[0] if sim_seqs else None
                introns = infer_introns_for_cdna(
                                     sequence=seq_fragment,
                                     genomic_seqs_index=genomic_seqs_index,
                                     similar_sequence=sim_seq,
                                     genomic_db=genomic_db)
                result = bool(introns)
        finally:
            blast_fhand.close()

        _add_filter_result(snv, 'uniq_contiguous', result, distance)
    return sequence
def annotate_intron(sequence):
    '''It adds the inferred introns to the SeqFeatures.

    Every position returned by infer_introns_for_cdna is appended to the
    features of the sequence as a zero length 'intron' feature that keeps
    the genomic db in its qualifiers. It returns the annotated sequence, or
    None when no sequence is given. A KeyError raised while inferring the
    introns is raised again as a RuntimeError.
    '''
    if sequence is None:
        return

    try:
        intron_positions = infer_introns_for_cdna(
                                     sequence=sequence,
                                     genomic_db=genomic_db,
                                     genomic_seqs_index=genomic_seqs_index)
    except KeyError as key_error:
        # a leading u and the surrounding quotes are removed from the message
        msg = str(key_error).lstrip('u').strip("'")
        if 'not found' in msg:
            names = (genomic_seqs_fhand.name, genomic_db)
            msg = msg + ' in seq file %s, but present in blast db %s' % names
        raise RuntimeError(msg)

    for position in intron_positions:
        point = FeatureLocation(position, position)
        intron = SeqFeature(location=point, type='intron',
                            qualifiers={'genomic_db': genomic_db})
        sequence.features.append(intron)
    return sequence
def test_infer_introns_est2genome_method():
    'It checks the intron positions inferred with the est2genome method'
    # the cdna, split in chunks to keep the lines short
    cdna_chunks = (
        'GAAAAGATGTGATTGGTGAAATAAGTTTGCCTCAATTCTCTTGTGCCGAAGTTCCAAAGAAGC',
        'AGTTGGTGAATGAGCAGCCAGTACCCGAAAAATCGAGCAAAGATTTTGTGATGTATGTTGGAG',
        'GTCTAGCATGGGGGATGGACTGGTGTCCCCAAGCTCATGAAAATAGGGATGCTCCTATGAAAA',
        'GTGAGTTTGTCGCAATTGCTCCTCATCCTCCTGATTCATCATATCACAAGACTGATGCCTCAC',
        'TTACAGGCAGAGGTGTAATTCAGATATGGTGCCTGCCAGATCTCATTCAAAAAGATATAATTG',
        'TGAAAGAAGATTATTTTGCTCAGGTTAACAAAAAACCGTATAGAAATTTGACAAGAAGTGAAG',
        'CAGGTACGGGAGAAGTATCTGGACCTCAAAAACCAAGAGGAAGACCAAAAAAGAACCCTGGTA',
        'AAGCAGTCCAGGCAAAAGCATCTAGACCACAAAATCCAAGAGGAAGACCGAGAAAGAAGCCTG',
        'TTACTGAATCTTTAGGTGATAGAGATAGTGAAGACCACAGTTTACAACCTCTTGCTATAGAGT',
        'GGTCGCTGCAATCAACAGAACTTTCTGTAGATTTGTCTTGTGGAAATATGAATAAAGCCCAAG',
        'TAGATATTGCGCTGAGTCAAGAAAGATGTATTAATGCGGCAT',
    )
    cdna = SeqWithQuality(seq=Seq(''.join(cdna_chunks)))

    # the same file is used as the genomic db and as the indexed fasta
    genome_path = os.path.join(TEST_DATA_DIR, 'blast', 'tomato_genome2+')
    genome_index = SeqIO.index(genome_path, 'fasta')

    expected_introns = [478, 572, 613]
    found_introns = infer_introns_for_cdna(cdna, genome_path,
                                           genomic_seqs_index=genome_index)
    assert found_introns == expected_introns