def create_orf_annotator(parameters):
    '''It creates a function that annotates the orfs of a sequence.

    parameters are handed as they are to the estscan runner.
    '''
    run_estscan = create_runner(tool='estscan', parameters=parameters)

    def annotate_orf(sequence):
        '''It adds the orf found by estscan to the features of the sequence.

        It returns None when no sequence is given. Otherwise it returns the
        given sequence, with an 'orf' feature appended when the dna output
        has a description.
        '''
        if sequence is None:
            return
        estscan_out = run_estscan(sequence)
        dna_fhand = estscan_out['dna']
        prot_fhand = estscan_out['protein']
        # Only the last items of the fasta content are used: the description
        # and the sequence for the dna, just the sequence for the protein
        description, orf_dna = get_content_from_fasta(dna_fhand)[1:]
        orf_pep = get_content_from_fasta(prot_fhand)[-1]
        prot_fhand.close()
        dna_fhand.close()

        # If there is no description, there is no orf
        if description is None:
            return sequence

        # The second and third fields of the description are the orf limits
        fields = description.split()
        orf_start, orf_end = int(fields[1]), int(fields[2])

        qualifiers = {'dna': Seq(orf_dna, generic_dna),
                      'pep': Seq(orf_pep, generic_protein)}
        qualifiers['strand'] = 'forward' if orf_start < orf_end else 'reverse'

        orf_location = FeatureLocation(orf_start, orf_end)
        sequence.features.append(SeqFeature(location=orf_location, type='orf',
                                            qualifiers=qualifiers))
        return sequence
    return annotate_orf
예제 #2
0
    def __init__(self, subject=None, database=None, program='blastn',
                 parameters=None, filters=None):
        '''It inits the class.

        Query should be a sequence and subject can be one or several.
        subject could be an fhand (fasta) or an string.

        Exactly one of subject or database should be given, otherwise a
        ValueError is raised. program is the tool name given to the runner.
        parameters is a dict with extra parameters for the runner, it is
        copied, so the dict given by the caller is not modified. filters are
        just stored in the instance.
        '''
        if subject is None and database is None:
            raise ValueError('Either subject or database should be given')
        if subject is not None and database is not None:
            msg = 'subject and database can not be given at the same time'
            raise ValueError(msg)

        # Some keys are added below, so we work with our own copy to avoid
        # changing the dict of the caller
        if parameters is None:
            parameters = {}
        else:
            parameters = dict(parameters)
        self._filters = filters

        if subject is not None:
            # The subject is written to a fasta file and the output is parsed
            # with the 'blast_text' parser
            parameters['alig_format'] = 0
            self._parser  = get_alignment_parser('blast_text')
            self._subject_fhand = _seq_to_fasta_fhand(subject)
            parameters['subject'] = self._subject_fhand.name
        if database is not None:
            # With a database the output is parsed with the 'blast' parser
            parameters['database'] = database
            parameters['alig_format'] = 5
            self._parser  = get_alignment_parser('blast')
        self._program = program
        self._aligner = create_runner(tool=program, parameters=parameters)
    def test_build_water_relations():
        '''It tests the function that builds the relations between two
        sequences from the alignment file written by water'''
        subject_dna = ''.join((
            'ATGGCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCTGCTCA',
            'AGCTAGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTTTTATGTA',
            'CTGTTTTNACTCGCANGACCAACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAG',
            'GGCNTGAAGGTGTGCCCACCACTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGA',
            'TATGAGTAACGAGCAATTGGGAAAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCT',
            'GCATTGAATTCGACATTCACAGTGGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATAC',
            'TTCGATGGACGCTACTGGACCATGTGGAAGCTGCCCATGTTTGGCTGCACCGAT',
        ))

        query_dna = ''.join((
            'ATGGCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCTGCTCAA',
            'GCTAGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTNACTCGCAN',
            'GACCAACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAGGGCNTGAAGGTGTGCC',
            'CACCACTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGATATGAGTAACGAGCAA',
            'TTGGGAAAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCTGCATTGAATTCGACAT',
            'TCACAGTGGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATACTTCGATGGACGCTAC',
            'TGGACCATGTGGAAGCTGCCCATGTTTGGCTGCACCGAT',
        ))

        subject_seq = SeqWithQuality(seq=Seq(subject_dna), name='subject')
        query_seq = SeqWithQuality(seq=Seq(query_dna), name='query')

        # water aligns the query against the subject stored in a fasta file
        subject_fhand = temp_fasta_file(subject_seq)
        run_water = create_runner(tool='water',
                                  parameters={'subject': subject_fhand.name})
        alignment_fhand = run_water(query_seq)['water']

        relations = build_relations_from_aligment(
                                                alignment_fhand,
                                                query_name=query_seq.name,
                                                subject_name=subject_seq.name)
        expected = {'query': [(0, 50), (51, 112), (113, 409)],
                    'subject': [(0, 50), (52, 113), (129, 425)]}
        assert relations == expected
예제 #4
0
def create_aligner_filter(aligner_cmd, cmd_parameters, match_filters=None,
                          environment=None):
    '''A function factory that creates aligner filters.

    aligner_cmd is the name of the tool to run, it is also used to choose the
    alignment parser and to pick the result of the runner.
    cmd_parameters and environment are passed to the runner.
    match_filters is the configuration used to filter the alignments.

    It returns a function that accepts a sequence and returns True when at
    least one alignment passes the filters and False otherwise.
    '''
    parse_alignment = get_alignment_parser(aligner_cmd)
    align_sequence = create_runner(tool=aligner_cmd, environment=environment,
                                   parameters=cmd_parameters)

    def _filter(sequence):
        'Given a sequence it returns True if any alignment passes the filters'
        if sequence is None:
            return False
        raw_result = align_sequence(sequence)[aligner_cmd]
        alignments = filter_alignments(parse_alignment(raw_result),
                                       config=match_filters)
        # There is only one sequence, so it is enough to ask for the first
        # filtered result
        try:
            alignments.next()
        except StopIteration:
            # Nothing passed the filters for this sequence
            return False
        else:
            return True
    return _filter
예제 #5
0
def create_striper_by_quality_lucy(parameters=None):
    """It creates a function that removes bad quality regions using lucy.

    parameters are passed to the lucy runner.
    The function will take a sequence iterator and it will return a new sequence
    iterator with the processed sequences in it."""
    run_lucy_for_seqs = create_runner(tool="lucy", parameters=parameters)

    def strip_seq_by_quality_lucy(sequences):
        """It trims the bad quality regions from the given sequences.

        It uses the lucy external program. It is a generator that yields, for
        every given sequence, the result of _lucy_mapper for that sequence and
        the index of the lucy output.

        The lucy sequence output file is kept open while the sequences are
        being yielded and it is closed when the generator finishes, fails or
        is closed before being exhausted.
        """
        # One copy of the iterator feeds lucy, the other one is used to map
        # the lucy result back onto every sequence
        sequences, sequences_for_lucy = tee(sequences, 2)
        seq_out_fhand = run_lucy_for_seqs(sequences_for_lucy)["sequence"][0]

        # The finally clause is also run if the consumer stops iterating
        # early, so the file is not left open in that case
        try:
            # index the lucy result
            result_index = SeqIO.index(seq_out_fhand.name, "fasta")

            for sequence in sequences:
                yield _lucy_mapper(sequence, result_index)
        finally:
            seq_out_fhand.close()

    return strip_seq_by_quality_lucy
예제 #6
0
def look_for_similar_sequences(sequence, database, blast_program, filters=None):
    '''It returns the sequences of the database that are similar to the given one.

    The sequence is blasted against the database with blast_program and the
    blast result is given, with the filters, to similar_sequences_for_blast.
    '''
    run_blast = create_runner(tool=blast_program,
                              parameters={'database': database})
    # The runner result is indexed by the name of the program
    blast_result = run_blast(sequence)
    return similar_sequences_for_blast(blast_result[blast_program],
                                       filters=filters)
 def test_create_mdust_runner():
     'A runner for mdust can be created and its output is checked'
     mdust_runner = create_runner(tool='mdust')
     dna = Seq('AACTACGTAGCTATGCTGATGCTAGTCTAGAAAAAAAAAAAAAAAAAAAAAAAAAAA')
     mdust_out = mdust_runner(SeqWithQuality(dna))['sequence']
     # The expected fields should appear somewhere in the mdust output
     output = mdust_out.read()
     assert "57\t31\t57" in output
 def xtest_run_iprscan():
     '''It runs iprscan with a protein sequence.

     NOTE(review): the x prefix presumably keeps the test runner from
     collecting this test - confirm.
     '''
     protfile = os.path.join(TEST_DATA_DIR, 'prot.fasta')
     protein = ('MLVNRILKHGKKSLAYQIIYRAMKKIQQKTETNPLSVLRQAIRGVTPDIAVKARRVGGSTH'
                'QVPIEIGSTQGKALAIRWLLGASRKRPGRNMAFKLSSELVDAAKGSGDAIRKKEETHRMAEAN'
                'RAFAHFR*')
     protein_seq = Seq(protein)
     iprscan_runner = create_runner(tool='iprscan',
                                    parameters={'format':'html'})
     # Nothing is asserted, the runner result is only accessed
     iprscan_runner(protein_seq)['result']
예제 #9
0
def look_for_similar_sequences(sequence, database, blast_program, filters=None):
    """It returns the sequences of the database similar to the given sequence.

    The sequence is blasted against the database using blast_program and the
    blast output, together with the filters, is handed to
    similar_sequences_for_blast, whose result is returned.
    """
    run_blast = create_runner(
        tool=blast_program, parameters={"database": database}
    )
    # The runner result is indexed by the name of the program
    blast_output = run_blast(sequence)
    return similar_sequences_for_blast(
        blast_output[blast_program], filters=filters
    )
def _locate_codons_in_orf(sequence, orf, snv):
    '''It locates the snv in the orf coordinate system.

    The dna of the orf is aligned with water against the sequence and the
    resulting relations are used to convert positions between both.

    It returns a tuple (position, codon_start, snv_in_orf):
        - when the snv start can be transformed into the orf coordinates:
          'orf', the first position of its codon in the orf and the position
          of the snv in the orf.
        - otherwise: 'utr5', 'in utr3' or None, depending on where the snv is
          with respect to the limits of the orf in the sequence, and None for
          the other two items.
    '''
    query_name = sequence.name
    orf_dna = orf.qualifiers['dna']
    subject_name = 'subject'

    # water takes the subject from a fasta file
    orf_fhand = NamedTemporaryFile(suffix='.fasta')
    orf_fhand.write('>%s\n%s\n' % (subject_name, orf_dna))
    orf_fhand.flush()

    run_water = create_runner(tool='water',
                              parameters={'subject': orf_fhand.name})
    alignment_fhand = run_water(sequence)['water']
    relations = build_relations_from_aligment(alignment_fhand,
                                              query_name=sequence.name,
                                              subject_name=subject_name)
    coord = CoordSystem(relations=[relations])

    snv_pos = snv.location.start.position

    # A RuntimeError from the transformation is taken as the snv being
    # outside the orf
    try:
        snv_in_orf = coord.transform(from_mol=query_name, to_mol=subject_name,
                                     position=snv_pos)
    except RuntimeError:
        snv_in_orf = None

    # The first and last positions of the orf in the sequence coordinates
    orf_limits_in_seq = [coord.transform(from_mol=subject_name,
                                         to_mol=query_name,
                                         position=orf_limit)
                         for orf_limit in (0, len(orf_dna) - 1)]
    orf_first_in_seq, orf_last_in_seq = orf_limits_in_seq

    if snv_in_orf is not None:
        codon_start = snv_in_orf - snv_in_orf % 3
        return ('orf', codon_start, snv_in_orf)

    # It can be utr5, utr3 or None
    # NOTE(review): 'in utr3' is not built like 'utr5', check what the callers
    # expect before changing it
    if snv_pos < orf_first_in_seq:
        position = 'utr5'
    elif snv_pos > orf_last_in_seq:
        position = 'in utr3'
    else:
        position = None
    return (position, None, None)
 def test_create_blast_runner():
     'A runner for blast can be created and its output is checked'
     blast_env = {'BLASTDB': os.path.join(TEST_DATA_DIR, 'blast')}
     blast_runner = create_runner(tool='blastn',
                                  parameters={'database':'arabidopsis_genes+'},
                                  environment=blast_env)
     dna = Seq('AACTACGTAGCTATGCTGATGCTAGTCTAGCTAGTCGTAGTCTGATCGTAGTCAGTT')
     blast_out = blast_runner(SeqWithQuality(dna))['blastn']
     # Only the first character of the output is checked
     first_char = blast_out.read()[0]
     assert first_char == '<'
 def test_create_lucy_runner():
     'A runner for lucy can be created and its output is checked'
     vector_fpath = os.path.join(TEST_DATA_DIR, 'seq2.fasta')
     lucy_runner = create_runner(tool='lucy',
                              parameters={'vector':(vector_fpath, vector_fpath)})
     dna = 'AACTACGTAGCTATGCTGATGCTAGTCTAGAAAAAAAAAAAAAAAAAAAAAAAAAAA'
     # Every position gets the same quality
     quality = [30] * len(dna)
     seq_with_qual = SeqWithQuality(Seq(dna), qual=quality)
     # The same sequence is given twice to the runner
     lucy_out = lucy_runner([seq_with_qual, seq_with_qual])['sequence']
     assert lucy_out[0].read() == ''
예제 #13
0
    def __init__(self, subject, parameters=None, filters=None):
        '''It inits the class.

        subject is written to a fasta file that is used as the exonerate
        target. parameters is a dict with extra parameters for the runner, it
        is copied, so the dict given by the caller is not modified. filters
        are just stored in the instance.
        '''

        # The target key is added below, so we work with our own copy to avoid
        # changing the dict of the caller
        if parameters is None:
            parameters = {}
        else:
            parameters = dict(parameters)
        self._filters = filters

        self._parser  = get_alignment_parser('exonerate')

        self._subject_fhand = _seq_to_fasta_fhand(subject)
        parameters['target'] = self._subject_fhand.name
        self._aligner = create_runner(tool='exonerate', parameters=parameters)
def create_microsatellite_annotator():
    'It creates a function that adds the features found by sputnik'
    run_sputnik = create_runner(tool='sputnik')

    def search_ssr(sequence):
        '''It does the actual search.

        It returns None when no sequence is given, otherwise the given
        sequence with the features built from the sputnik output appended.
        '''
        if sequence is None:
            return
        sputnik_fhand = run_sputnik(sequence)['sputnik']
        found_features = _get_features_from_sputnik(sputnik_fhand)
        for ssr_feature in found_features:
            sequence.features.append(ssr_feature)
        return sequence
    return search_ssr
예제 #15
0
def create_masker_for_polia():
    'It creates a function that masks the poly-A tracks of a sequence'
    trimpoly_parameters = dict(min_score='10', end='x', incremental_dist='20',
                               fixed_dist=None)
    run_trimpoly = create_runner(tool='trimpoly',
                                 parameters=trimpoly_parameters)

    def mask_polya(sequence):
        '''It adds a mask to the sequence where the poly-A is found.

        It uses trimpoly from the seqclean package. The segments are added
        with trim=False. It returns the given sequence, or None if there is
        no sequence.
        '''
        if sequence is not None:
            trimpoly_fhand = run_trimpoly(sequence)['sequence']
            polya_segments = _segments_from_trimpoly(trimpoly_fhand, sequence)
            _add_trim_segments(polya_segments, sequence, trim=False)
        return sequence
    return mask_polya
예제 #16
0
    def strip_seq_by_quality_trimpoly(sequence):
        """It marks the regions of the sequence found by trimpoly.

        It uses trimpoly from seqclean package with the only_n_trim option.
        This program does not work well with short sequences, so None is
        returned for a missing sequence and for the ones shorter than 80.
        Otherwise the given sequence is returned once the segments have been
        added with vector=False.
        """
        if sequence is None or len(sequence) < 80:
            return None

        # ntrim_above_percent is taken from the enclosing scope
        trimpoly_parameters = {
            "only_n_trim": None,
            "ntrim_above_percent": ntrim_above_percent,
        }
        run_trimpoly = create_runner(
            tool="trimpoly", parameters=trimpoly_parameters
        )
        trimpoly_fhand = run_trimpoly(sequence)["sequence"]
        found_segments = _segments_from_trimpoly(trimpoly_fhand, sequence)
        _add_trim_segments(found_segments, sequence, vector=False)
        return sequence
예제 #17
0
def create_masker_for_polia():
    "It creates a function that masks the poly-A tracks of a sequence"
    trimpoly_parameters = {
        "min_score": "10",
        "end": "x",
        "incremental_dist": "20",
        "fixed_dist": None,
    }
    run_trimpoly = create_runner(tool="trimpoly", parameters=trimpoly_parameters)

    def mask_polya(sequence):
        """It adds a mask to the sequence where the poly-A is found.

        It uses trimpoly from the seqclean package. The segments are added
        with trim=False. It returns the given sequence, or None if there is
        no sequence.
        """
        if sequence is None:
            return None

        trimpoly_fhand = run_trimpoly(sequence)["sequence"]
        _add_trim_segments(
            _segments_from_trimpoly(trimpoly_fhand, sequence),
            sequence,
            trim=False,
        )
        return sequence

    return mask_polya
예제 #18
0
def create_masker_for_low_complexity():
    'It creates a function that masks the low complexity regions using mdust'
    run_mdust = create_runner(tool='mdust')

    def mask_low_complexity(sequence):
        '''It adds a mask to the sequence where low complexity is found.

        It uses mdust from the seqclean package. It returns the given
        sequence, or None if there is no sequence.
        '''
        if sequence is None:
            return None
        mdust_fhand = run_mdust(sequence)['sequence']
        # The last two fields of every line are the limits of a region, one
        # is subtracted from both of them
        region_limits = (line.split()[-2:] for line in mdust_fhand)
        segments = [(int(first) - 1, int(last) - 1)
                    for first, last in region_limits]
        _add_trim_segments(segments, sequence, trim=False)

        return sequence
    return mask_low_complexity
예제 #19
0
def create_unique_contiguous_region_filter(distance, genomic_db,
                                           genomic_seqs_fpath):
    '''It returns a filter that removes snv in a region that give more than one
    match or more than one match_parts.

    distance is the number of residues taken at each side of the snv to build
    the fragment that is blasted.
    genomic_db is the blast database with the genomic sequences.
    genomic_seqs_fpath is the path to the file with the genomic sequences.

    A ValueError is raised if genomic_seqs_fpath or genomic_db are not given.
    '''
    # The arguments are checked before they are used to build the runner
    if not genomic_seqs_fpath:
        msg = 'No genomic sequence file defined for unique SNV filter'
        raise ValueError(msg)
    if not genomic_db:
        msg = 'No genomic blast database defined for unique SNV filter'
        raise ValueError(msg)

    parameters = {'database': genomic_db}
    blast_runner = create_runner(tool='blastn', parameters=parameters)
    blast_parser = get_alignment_parser('blast')
    match_filters = [{'kind'     : 'score_threshold',
                      'score_key': 'similarity',
                      'min_score': 90,
                     },
                     {'kind'            : 'min_length',
                      'min_num_residues': 20,
                      'length_in_query' : True
                     }
                    ]

    # The file handle is only needed to guess the format, the index is built
    # from the file name, so the handle is closed once the index is created
    genomic_seqs_fhand = open(genomic_seqs_fpath)
    try:
        genomic_seqs_index = SeqIO.index(genomic_seqs_fhand.name,
                                     guess_seq_file_format(genomic_seqs_fhand))
    finally:
        genomic_seqs_fhand.close()

    def unique_contiguous_region_filter(sequence):
        '''It filters out the snv in regions repeated in the genome or
        discontiguous.

        For every snv without a previous result a filter result is added:
        True if the region around the snv has more than one match or has
        introns, False otherwise. It returns the given sequence, or None if
        there is no sequence.
        '''
        if sequence is None:
            return None

        for snv in sequence.get_features(kind='snv'):
            # Check if it is already done
            previous_result = _get_filter_result(snv, 'uniq_contiguous',
                                                 threshold=distance)
            if previous_result is not None:
                continue

            # we make a blast with the sequence around the snv
            location = snv.location.start.position
            start = max(location - distance, 0)
            end = location + distance
            seq_fragment = sequence[start:end]
            blast_fhand = blast_runner(seq_fragment)['blastn']
            # the blast output is closed even if something fails
            try:
                # now we parse the blast
                blast_result = blast_parser(blast_fhand)
                alignments = filter_alignments(blast_result,
                                               config=match_filters)
                # are there any similar sequences?
                try:
                    alignment = alignments.next()
                except StopIteration:
                    # if there is no similar sequence we assume that is unique
                    result = False
                else:
                    # how many matches, it should be only one
                    if len(alignment['matches']) > 1:
                        result = True
                    else:
                        # how many match parts have the first match?
                        # we could do it with the blast result, but blast is
                        # not very good aligning, so we realign with
                        # est2genome
                        blast_fhand.seek(0)
                        sim_seqs = similar_sequences_for_blast(blast_fhand)
                        sim_seq = sim_seqs[0] if sim_seqs else None

                        introns = infer_introns_for_cdna(
                                         sequence=seq_fragment,
                                         genomic_seqs_index=genomic_seqs_index,
                                         similar_sequence=sim_seq,
                                         genomic_db=genomic_db)
                        result = bool(introns)
            finally:
                blast_fhand.close()
            _add_filter_result(snv, 'uniq_contiguous', result, distance)
        return sequence

    return unique_contiguous_region_filter