def create_orf_annotator(parameters):
    '''It creates a function that annotates orfs.

    The parameters are handed over to the estscan runner. The returned
    function appends an 'orf' feature to the sequences it is given.
    '''
    run_estscan = create_runner(tool='estscan', parameters=parameters)

    def annotate_orf(sequence):
        '''It adds the orf to the SeqFeatures.

        It returns None when no sequence is given. Otherwise it returns the
        given sequence, with an orf feature appended when estscan reported
        one.
        '''
        if sequence is None:
            return

        estscan_out = run_estscan(sequence)
        dna_fhand = estscan_out['dna']
        prot_fhand = estscan_out['protein']

        # the dna fasta gives the description and the orf sequence, the
        # protein fasta only contributes its sequence
        dna_content = get_content_from_fasta(dna_fhand)
        description, orf_dna = dna_content[1:]
        orf_pep = get_content_from_fasta(prot_fhand)[-1]
        prot_fhand.close()
        dna_fhand.close()

        # If there is no description, there is no orf
        if description is None:
            return sequence

        # the second and third words of the description are the orf limits
        fields = description.split()
        start, end = int(fields[1]), int(fields[2])

        qualifiers = {'dna': Seq(orf_dna, generic_dna),
                      'pep': Seq(orf_pep, generic_protein),
                      'strand': 'forward' if start < end else 'reverse'}
        orf_feature = SeqFeature(location=FeatureLocation(start, end),
                                 type='orf', qualifiers=qualifiers)
        sequence.features.append(orf_feature)
        return sequence

    return annotate_orf
def __init__(self, subject=None, database=None, program='blastn',
             parameters=None, filters=None):
    '''It inits the class.

    Exactly one of subject or database should be given.

    subject    -- sequence(s) to align against; it is written to a fasta
                  file with _seq_to_fasta_fhand (the original documentation
                  states that it could be an fhand (fasta) or a string).
    database   -- blast database to align against.
    program    -- name of the tool requested from create_runner.
    parameters -- optional dict with extra parameters for the runner. It is
                  copied, the dict given by the caller is left untouched.
    filters    -- stored as it is in self._filters.

    It raises a ValueError if none or both of subject and database are
    given.
    '''
    if subject is None and database is None:
        raise ValueError('Either subject or database should be given')
    if subject is not None and database is not None:
        msg = 'subject and database can not be given at the same time'
        raise ValueError(msg)

    # We work on a copy, otherwise the keys added below would leak into
    # the dict owned by the caller
    if parameters is None:
        parameters = {}
    else:
        parameters = dict(parameters)

    self._filters = filters

    if subject is not None:
        # text output, parsed by the blast_text parser
        parameters['alig_format'] = 0
        self._parser = get_alignment_parser('blast_text')
        # the fhand is kept in the instance, the runner only gets its name
        self._subject_fhand = _seq_to_fasta_fhand(subject)
        parameters['subject'] = self._subject_fhand.name
    if database is not None:
        parameters['database'] = database
        parameters['alig_format'] = 5
        self._parser = get_alignment_parser('blast')

    self._program = program
    self._aligner = create_runner(tool=program, parameters=parameters)
def test_build_water_relations():
    '''It tests the function that builds the relations between two sequences
    using a markx10 format file'''
    subject_str = ''.join([
        'ATGGCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCTGCTCA',
        'AGCTAGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTTTTATGTA',
        'CTGTTTTNACTCGCANGACCAACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAG',
        'GGCNTGAAGGTGTGCCCACCACTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGA',
        'TATGAGTAACGAGCAATTGGGAAAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCT',
        'GCATTGAATTCGACATTCACAGTGGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATAC',
        'TTCGATGGACGCTACTGGACCATGTGGAAGCTGCCCATGTTTGGCTGCACCGAT',
    ])
    query_str = ''.join([
        'ATGGCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCTGCTCAA',
        'GCTAGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTNACTCGCAN',
        'GACCAACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAGGGCNTGAAGGTGTGCC',
        'CACCACTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGATATGAGTAACGAGCAA',
        'TTGGGAAAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCTGCATTGAATTCGACAT',
        'TCACAGTGGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATACTTCGATGGACGCTAC',
        'TGGACCATGTGGAAGCTGCCCATGTTTGGCTGCACCGAT',
    ])
    subject_seq = SeqWithQuality(seq=Seq(subject_str), name='subject')
    query_seq = SeqWithQuality(seq=Seq(query_str), name='query')

    # the fhand is kept in a local while water runs, the runner only gets
    # its name
    subject_fhand = temp_fasta_file(subject_seq)
    aligner = create_runner(tool='water',
                            parameters={'subject': subject_fhand.name})
    result_fhand = aligner(query_seq)['water']

    relations = build_relations_from_aligment(result_fhand,
                                              query_name=query_seq.name,
                                              subject_name=subject_seq.name)
    expected = {'query': [(0, 50), (51, 112), (113, 409)],
                'subject': [(0, 50), (52, 113), (129, 425)]}
    assert relations == expected
def create_aligner_filter(aligner_cmd, cmd_parameters, match_filters=None,
                          environment=None):
    '''A function factory that creates aligner filters.

    It returns a function that accepts a sequence and returns True or False
    depending on the outcome of the alignment.

    aligner_cmd    -- name of the aligner. It selects the runner and the
                      parser and it is also the key used to get the runner
                      output.
    cmd_parameters -- dict with the parameters for the runner (the original
                      documentation states that the keys are the ones
                      defined in ExonerateRunner and that only the target
                      fasta file is required).
    match_filters  -- configuration handed to filter_alignments.
    environment    -- environment handed to create_runner.
    '''
    parser = get_alignment_parser(aligner_cmd)
    run_align_for_seq = create_runner(tool=aligner_cmd,
                                      environment=environment,
                                      parameters=cmd_parameters)

    def _filter(sequence):
        '''Given a sequence it returns True if any alignment passes the
        match filters and False otherwise (also when the sequence is None)
        '''
        if sequence is None:
            return False
        source_result = run_align_for_seq(sequence)[aligner_cmd]
        results = parser(source_result)
        filtered_results = filter_alignments(results, config=match_filters)
        try:
            # only one sequence -> only one result
            # The next builtin is used instead of the .next() method because
            # the method does not exist in python 3 iterators
            next(filtered_results)
        except StopIteration:
            # there was no result for this sequence
            return False
        return True

    return _filter
def create_striper_by_quality_lucy(parameters=None):
    """It creates a function that removes bad quality regions using lucy.

    The function will take a sequence iterator and it will return a new
    sequence iterator with the processed sequences in it.

    parameters -- handed to the lucy runner as they are.
    """
    run_lucy_for_seqs = create_runner(tool="lucy", parameters=parameters)

    def strip_seq_by_quality_lucy(sequences):
        """It trims the bad quality regions from the given sequences.

        It uses the lucy external program. It is a generator that yields the
        result of _lucy_mapper for every given sequence, in the same order.

        The lucy sequence output is indexed by file name, so its fhand is
        kept open while the sequences are being yielded and it is closed
        when the generator finishes, fails or is discarded.
        """
        # pylint: disable-msg=W0612
        # The input is consumed twice: once by lucy and once by the mapper
        sequences, sequences_for_lucy = tee(sequences, 2)
        # now we run lucy
        seq_out_fhand = run_lucy_for_seqs(sequences_for_lucy)["sequence"][0]
        try:
            # index the lucy result
            result_index = SeqIO.index(seq_out_fhand.name, "fasta")
            # process each sequence
            for sequence in sequences:
                yield _lucy_mapper(sequence, result_index)
        finally:
            # Without the finally the fhand would be left open whenever the
            # caller stops iterating early or the mapper raises
            seq_out_fhand.close()

    return strip_seq_by_quality_lucy
def look_for_similar_sequences(sequence, database, blast_program,
                               filters=None):
    '''It returns the sequences of the database similar to the given one.

    The sequence is blasted against the database with blast_program and the
    blast output is handed to similar_sequences_for_blast along with the
    filters.
    '''
    run_blast = create_runner(tool=blast_program,
                              parameters={'database': database})
    # the runner output is keyed by the name of the program
    blast_output = run_blast(sequence)
    return similar_sequences_for_blast(blast_output[blast_program],
                                       filters=filters)
def test_create_mdust_runner():
    'We can create a runner class for mdust'
    mdust_runner = create_runner(tool='mdust')
    query = SeqWithQuality(
        Seq('AACTACGTAGCTATGCTGATGCTAGTCTAGAAAAAAAAAAAAAAAAAAAAAAAAAAA'))
    mdust_output = mdust_runner(query)['sequence'].read()
    # the output should contain this tab separated triplet
    assert "57\t31\t57" in mdust_output
def xtest_run_iprscan():
    'It tests the runner of iprscan'
    # NOTE(review): protfile is built but it is never used by this test
    protfile = os.path.join(TEST_DATA_DIR, 'prot.fasta')
    protein = ''.join([
        'MLVNRILKHGKKSLAYQIIYRAMKKIQQKTETNPLSVLRQAIRGVTPDIAVKARRVGGSTH',
        'QVPIEIGSTQGKALAIRWLLGASRKRPGRNMAFKLSSELVDAAKGSGDAIRKKEETHRMAEAN',
        'RAFAHFR*',
    ])
    iprscan_runner = create_runner(tool='iprscan',
                                   parameters={'format': 'html'})
    # The test only checks that the runner output has a 'result' key
    iprscan_runner(Seq(protein))['result']
def look_for_similar_sequences(sequence, database, blast_program,
                               filters=None):
    """It looks for the sequences in the database similar to the given one.

    First it blasts the sequence against the database using blast_program
    and then the blast result is handed, along with the filters, to
    similar_sequences_for_blast, whose result is returned.
    """
    blast_runner = create_runner(tool=blast_program,
                                 parameters=dict(database=database))
    # the runner result is a mapping keyed by the program name
    runner_result = blast_runner(sequence)
    blast_fhand = runner_result[blast_program]
    similar_seqs = similar_sequences_for_blast(blast_fhand, filters=filters)
    return similar_seqs
def _locate_codons_in_orf(sequence, orf, snv):
    '''It locates the snv in the orf coordinate system.

    The orf dna is aligned against the sequence with water and the alignment
    is used to translate positions between both molecules.

    It returns a tuple (position, codon_start, snv_in_orf):
        - snv inside the orf: ('orf', start of its codon, position in orf)
        - snv before the orf: ('utr5', None, None)
        - snv after the orf: ('in utr3', None, None)
        - otherwise: (None, None, None)
    '''
    query_name = sequence.name
    subject_name = 'subject'
    orf_dna = orf.qualifiers['dna']

    # water needs the subject in a fasta file
    subject_fhand = NamedTemporaryFile(suffix='.fasta')
    subject_fhand.write('>%s\n%s\n' % (subject_name, orf_dna))
    subject_fhand.flush()

    aligner = create_runner(tool='water',
                            parameters={'subject': subject_fhand.name})
    result_fhand = aligner(sequence)['water']
    relations = build_relations_from_aligment(result_fhand,
                                              query_name=query_name,
                                              subject_name=subject_name)
    coord = CoordSystem(relations=[relations])

    snv_pos = snv.location.start.position
    try:
        snv_in_orf = coord.transform(from_mol=query_name,
                                     to_mol=subject_name, position=snv_pos)
    except RuntimeError:
        # the snv position could not be transformed to the orf
        snv_in_orf = None

    # The orf limits are always translated to sequence coordinates, even
    # when the snv was located inside the orf
    last_orf_pos = len(orf_dna) - 1
    orf_start_limit_in_seq = coord.transform(from_mol=subject_name,
                                             to_mol=query_name, position=0)
    orf_end_limit_in_seq = coord.transform(from_mol=subject_name,
                                           to_mol=query_name,
                                           position=last_orf_pos)

    if snv_in_orf is not None:
        # the codon starts at the previous multiple of three
        codon_start = snv_in_orf - snv_in_orf % 3
        return ('orf', codon_start, snv_in_orf)

    # it can be utr3, utr5, or None
    # NOTE(review): the 'in utr3' label does not follow the 'utr5' naming,
    # it is kept as it is because the callers might depend on it
    if snv_pos < orf_start_limit_in_seq:
        position = 'utr5'
    elif snv_pos > orf_end_limit_in_seq:
        position = 'in utr3'
    else:
        position = None
    return (position, None, None)
def test_create_blast_runner():
    'We can create a runner class for blast'
    blast_dir = os.path.join(TEST_DATA_DIR, 'blast')
    blast_runner = create_runner(tool='blastn',
                                 parameters={'database': 'arabidopsis_genes+'},
                                 environment={'BLASTDB': blast_dir})
    query = SeqWithQuality(
        Seq('AACTACGTAGCTATGCTGATGCTAGTCTAGCTAGTCGTAGTCTGATCGTAGTCAGTT'))
    blast_output = blast_runner(query)['blastn'].read()
    # the first character of the output should be a '<'
    assert blast_output[0] == '<'
def test_create_lucy_runner():
    'We can create a runner class for lucy'
    vector_fpath = os.path.join(TEST_DATA_DIR, 'seq2.fasta')
    lucy_runner = create_runner(
        tool='lucy', parameters={'vector': (vector_fpath, vector_fpath)})

    bases = 'AACTACGTAGCTATGCTGATGCTAGTCTAGAAAAAAAAAAAAAAAAAAAAAAAAAAA'
    read = SeqWithQuality(Seq(bases), qual=[30] * len(bases))

    # lucy is fed with the same read twice
    out_fhands = lucy_runner([read, read])['sequence']
    # the first output file should be empty
    assert out_fhands[0].read() == ''
def __init__(self, subject, parameters=None, filters=None):
    '''It inits the class.

    subject    -- sequence(s) to align against. It is written to a fasta
                  file with _seq_to_fasta_fhand and that file is used as
                  the exonerate target.
    parameters -- optional dict with extra parameters for the runner. It is
                  copied, the dict given by the caller is left untouched.
    filters    -- stored as it is in self._filters.
    '''
    # We work on a copy, otherwise the 'target' key added below would leak
    # into the dict owned by the caller
    if parameters is None:
        parameters = {}
    else:
        parameters = dict(parameters)

    self._filters = filters
    self._parser = get_alignment_parser('exonerate')
    # the fhand is kept in the instance, the runner only gets its name
    self._subject_fhand = _seq_to_fasta_fhand(subject)
    parameters['target'] = self._subject_fhand.name
    self._aligner = create_runner(tool='exonerate', parameters=parameters)
def create_microsatellite_annotator():
    '''It creates a function that annotates the features found by sputnik.

    The returned function runs sputnik on a sequence and adds to the
    sequence every feature built from the sputnik output.
    '''
    run_sputnik = create_runner(tool='sputnik')

    def search_ssr(sequence):
        '''Do the actual search.

        It returns None when no sequence is given, otherwise it returns the
        given sequence with the new features appended.
        '''
        if sequence is None:
            return
        sputnik_fhand = run_sputnik(sequence)['sputnik']
        sequence.features.extend(_get_features_from_sputnik(sputnik_fhand))
        return sequence

    return search_ssr
def create_masker_for_polia():
    'It creates a masker function that will mask poly-A tracks'
    # fixed trimpoly configuration used by the masker
    trimpoly_params = dict(min_score='10', end='x', incremental_dist='20',
                           fixed_dist=None)
    run_trimpoly = create_runner(tool='trimpoly', parameters=trimpoly_params)

    def mask_polya(sequence):
        '''It adds a mask to the sequence where the poly-A is found.

        It uses trimpoly from the seqclean package. None is returned when no
        sequence is given, otherwise the given sequence is returned.
        '''
        if sequence is None:
            return None
        trimpoly_fhand = run_trimpoly(sequence)['sequence']
        polya_segments = _segments_from_trimpoly(trimpoly_fhand, sequence)
        # trim=False is handed over, this function is a masker
        _add_trim_segments(polya_segments, sequence, trim=False)
        return sequence

    return mask_polya
def strip_seq_by_quality_trimpoly(sequence):
    """It strips the sequence where low quality is found.

    It uses trimpoly from the seqclean package. This program does not work
    well with short sequences, so None is returned for the sequences shorter
    than 80 as well as when no sequence is given. Otherwise the given
    sequence is returned with the trim segments added.

    ntrim_above_percent is not defined here, it is taken from the enclosing
    scope.
    """
    if sequence is None or len(sequence) < 80:
        return None

    trimpoly_params = {"only_n_trim": None,
                       "ntrim_above_percent": ntrim_above_percent}
    run_trimpoly = create_runner(tool="trimpoly", parameters=trimpoly_params)
    trimpoly_fhand = run_trimpoly(sequence)["sequence"]
    _add_trim_segments(_segments_from_trimpoly(trimpoly_fhand, sequence),
                       sequence, vector=False)
    return sequence
def create_masker_for_polia():
    """It creates a masker function that will mask poly-A tracks.

    The masker is built on top of a trimpoly runner that is created once
    and shared by all the calls.
    """
    polya_runner = create_runner(
        tool="trimpoly",
        parameters={
            "min_score": "10",
            "end": "x",
            "incremental_dist": "20",
            "fixed_dist": None,
        },
    )

    def mask_polya(sequence):
        """It adds a mask to the sequence where the poly-A is found.

        It uses trimpoly from the seqclean package. The given sequence is
        returned, or None when there is no sequence.
        """
        if sequence is not None:
            runner_output = polya_runner(sequence)
            segments = _segments_from_trimpoly(runner_output["sequence"],
                                               sequence)
            _add_trim_segments(segments, sequence, trim=False)
        return sequence

    return mask_polya
def create_masker_for_low_complexity():
    'It creates a masker function for low complexity sections that uses mdust'
    run_mdust = create_runner(tool='mdust')

    def _segment_from_line(line):
        'It builds a segment from the last two columns of an mdust line'
        start, end = line.strip().split()[-2:]
        # one is subtracted from both limits
        return (int(start) - 1, int(end) - 1)

    def mask_low_complexity(sequence):
        '''It adds a mask to the sequence where low complexity is found.

        It uses mdust from the seqclean package. None is returned when no
        sequence is given, otherwise the given sequence is returned.
        '''
        if sequence is None:
            return None
        mdust_fhand = run_mdust(sequence)['sequence']
        segments = [_segment_from_line(line) for line in mdust_fhand]
        _add_trim_segments(segments, sequence, trim=False)
        return sequence

    return mask_low_complexity
def create_unique_contiguous_region_filter(distance, genomic_db,
                                           genomic_seqs_fpath):
    '''It returns a filter that flags the snvs located in a region that gives
    more than one match or more than one match_part.

    distance           -- number of residues taken at each side of the snv
                          to build the fragment that is blasted.
    genomic_db         -- blast database with the genomic sequences.
    genomic_seqs_fpath -- path to the file with the genomic sequences.

    It raises a ValueError if genomic_seqs_fpath or genomic_db are not
    given. They are checked before the blast runner and the sequence index
    are created.
    '''
    if not genomic_seqs_fpath:
        msg = 'No genomic sequence file defined for unique SNV filter'
        raise ValueError(msg)
    if not genomic_db:
        msg = 'No genomic blast database defined for unique SNV filter'
        raise ValueError(msg)

    parameters = {'database': genomic_db}
    blast_runner = create_runner(tool='blastn', parameters=parameters)
    blast_parser = get_alignment_parser('blast')
    match_filters = [{'kind': 'score_threshold',
                      'score_key': 'similarity',
                      'min_score': 90},
                     {'kind': 'min_length',
                      'min_num_residues': 20,
                      'length_in_query': True}]

    # The fhand is only required to guess the format, the index is built
    # from the path, so the fhand is closed as soon as possible
    genomic_seqs_fhand = open(genomic_seqs_fpath)
    try:
        genomic_seqs_format = guess_seq_file_format(genomic_seqs_fhand)
    finally:
        genomic_seqs_fhand.close()
    genomic_seqs_index = SeqIO.index(genomic_seqs_fpath, genomic_seqs_format)

    def unique_contiguous_region_filter(sequence):
        '''It filters out the snv in regions repeated in the genome or
        discontiguous.

        The result stored for each snv is True when its region has more than
        one match or when introns are inferred for it, and False otherwise.
        It returns the given sequence, or None when there is no sequence.
        '''
        if sequence is None:
            return None
        for snv in sequence.get_features(kind='snv'):
            # Check if it is already done
            previous_result = _get_filter_result(snv, 'uniq_contiguous',
                                                 threshold=distance)
            if previous_result is not None:
                continue

            # we make a blast with the sequence around the snv
            location = snv.location.start.position
            start = max(location - distance, 0)
            end = location + distance
            seq_fragment = sequence[start:end]
            blast_fhand = blast_runner(seq_fragment)['blastn']
            try:
                # now we parse the blast
                blast_result = blast_parser(blast_fhand)
                alignments = filter_alignments(blast_result,
                                               config=match_filters)
                # are there any similar sequences?
                try:
                    alignment = next(alignments)
                except StopIteration:
                    # if there is no similar sequence we assume that it is
                    # unique
                    result = False
                else:
                    # how many matches, it should be only one
                    if len(alignment['matches']) > 1:
                        result = True
                    else:
                        # how many match parts have the first match?
                        # we could do it with the blast result, but blast is
                        # not very good aligning, so we realign with
                        # est2genome
                        blast_fhand.seek(0)
                        sim_seqs = similar_sequences_for_blast(blast_fhand)
                        sim_seq = sim_seqs[0] if sim_seqs else None
                        introns = infer_introns_for_cdna(
                                        sequence=seq_fragment,
                                        genomic_seqs_index=genomic_seqs_index,
                                        similar_sequence=sim_seq,
                                        genomic_db=genomic_db)
                        result = bool(introns)
            finally:
                # the blast output is closed even if the analysis fails
                blast_fhand.close()
            _add_filter_result(snv, 'uniq_contiguous', result, distance)
        return sequence

    return unique_contiguous_region_filter