Пример #1
0
def findFragendSites(fasta, resite):
    ''' Build a dictionary of fragment-end (fragend) positions for every
    sequence in a FASTA file.

    Args:
        fasta: Path of the FASTA file to scan.
        resite: Restriction enzyme recognition sequence (any case).

    Returns:
        A dictionary holding the upper-cased recognition site under the
        key 'resite' and, for every FASTA record, two lists keyed by
        (record id, '+') and (record id, '-'). The '+' list holds the
        nt_search hit offset plus the site length for each match of the
        site; the '-' list holds the hit offset plus one for each match
        of the site's reverse complement. A list is empty when there is
        no match.
    '''
    # Normalise the recognition site and seed the output dictionary
    resite = resite.upper()
    frags = {'resite': resite}
    # Sequence objects for the site and its reverse complement
    standard = Seq(resite)
    revcomp = standard.reverse_complement()
    siteLength = len(resite)
    # The context manager closes the file even when parsing or searching
    # raises; the record variable no longer shadows the `fasta` argument.
    with open(fasta) as fastaHandle:
        for record in SeqIO.parse(fastaHandle, 'fasta'):
            # Extract name and sequence
            fName, fSequence = str(record.id), str(record.seq).upper()
            # nt_search returns [pattern, hit, hit, ...]; drop the pattern.
            # A comprehension over an empty hit list already yields [].
            forward = nt_search(fSequence, standard)[1:]
            frags[(fName, '+')] = [x + siteLength for x in forward]
            reverse = nt_search(fSequence, revcomp)[1:]
            frags[(fName, '-')] = [x + 1 for x in reverse]
    return frags
Пример #2
0
def findFragendSites(fasta, resite):
    ''' Collect fragment-end (fragend) positions for both strands of every
    record in a FASTA file.

    Args:
        fasta: Path of the FASTA file to scan.
        resite: Restriction enzyme recognition sequence (any case).

    Returns:
        A dictionary with the upper-cased recognition site stored under
        'resite' plus one list per (record id, '+') and (record id, '-')
        key. '+' entries are the nt_search hit offset plus the site
        length; '-' entries are the hit offset of the reverse-complemented
        site plus one. Records without matches get empty lists.
    '''
    # Process restriction enzyme site and create output dictionary
    resite = resite.upper()
    frags = {'resite': resite}
    # Create sequence objects for the site and its reverse complement
    standard = Seq(resite)
    revcomp = standard.reverse_complement()
    offset = len(resite)
    # `with` guarantees the handle is released if SeqIO or nt_search raises
    with open(fasta) as fastaHandle:
        for entry in SeqIO.parse(fastaHandle, 'fasta'):
            fName = str(entry.id)
            fSequence = str(entry.seq).upper()
            # Element 0 of an nt_search result is the search pattern itself
            plusHits = nt_search(fSequence, standard)[1:]
            minusHits = nt_search(fSequence, revcomp)[1:]
            # Empty hit lists naturally produce empty position lists
            frags[(fName, '+')] = [hit + offset for hit in plusHits]
            frags[(fName, '-')] = [hit + 1 for hit in minusHits]
    return frags
Пример #3
0
def target_seq(genome, query, filetype='fasta'):
    """ Find every occurrence of a target sequence on a genome.

    The species flag returned by ``arguments().sp`` selects the features
    file that supplies the chromosome names and the CSV file the result
    is written to (under ``dependencies_dir``).

    :param genome: sequence file handed to ``SeqIO.parse``
    :param query: sequence to search for (passed to ``nt_search``)
    :param filetype: format name for ``SeqIO.parse`` (default='fasta')
    :returns: DataFrame with one column per chromosome name holding the
        ``nt_search`` hit positions, padded with NaN
    :raises ValueError: if the species flag is not 'cg', 'sc', 'ca' or 'sp'
    """

    print('finding target sequences')

    # Read the species flag once instead of re-invoking arguments() for
    # every record, and resolve input and output names in one place.
    species = arguments().sp
    if species == 'cg':
        features_file, out_name = cg_features_file, cg_hermes_on_chr
    elif species == 'sc':
        features_file, out_name = sc_features_file, sc_hermes_on_chr
    elif species == 'ca':
        features_file, out_name = ca_features_file, ca_hermes_on_chr
    elif species == 'sp':
        features_file, out_name = sp_features_file, sp_hermes_on_chr
    else:
        raise ValueError('Unknown species flag specified')

    chromes = list(pd.read_csv(features_file)['chrom'].unique())
    if species == 'ca':
        chromes = [str(chrom)[:9] for chrom in chromes if str(chrom)[8] == 'A']
    if species == 'sp':
        chromes = ['I', 'II', 'III']

    # Collect the hits first so the frame can be sized to the data. A fixed
    # 250000-row frame silently dropped every hit past row 249999, because
    # assigning a Series aligns it on the frame's index.
    hits_per_record = []
    for record in SeqIO.parse(genome, filetype):
        # For 'sp' only records named in `chromes` are searched
        if species == 'sp' and record.id not in chromes:
            continue
        # nt_search returns [pattern, hit, hit, ...]; keep the hits only
        hits_per_record.append(nt_search(str(record.seq), query)[1:])

    n_rows = max([len(hits) for hits in hits_per_record] or [0])
    chroms_locs = pd.DataFrame(np.nan, index=range(n_rows), columns=chromes)
    # The i-th searched record fills the i-th chromosome column
    for i, hits in enumerate(hits_per_record):
        # float dtype keeps the NaN-padded column type of the old layout
        chroms_locs[chromes[i]] = pd.Series(hits, dtype=float)

    chroms_locs = chroms_locs.dropna(how='all')
    chroms_locs.to_csv(dependencies_dir + out_name)

    return chroms_locs
Пример #4
0
        def testPrimerDirection(primers, record):
            """Assert both primers of a pair are found on ``record``.

            The annealing sequence of ``primers[0]`` is searched as-is and
            that of ``primers[1]`` as its reverse complement. nt_search
            returns the pattern followed by the hits, so a result of
            length >= 2 means at least one hit.
            """
            forward_template = str(record.seq.upper())
            forward_site = str(primers[0].anneal_seq().upper().seq)
            forward_result = nt_search(forward_template, forward_site)
            self.assertGreaterEqual(len(forward_result), 2)

            reverse_template = str(record.seq.upper())
            reverse_site = str(
                primers[1].anneal_seq().reverse_complement().upper().seq)
            reverse_result = nt_search(reverse_template, reverse_site)
            self.assertGreaterEqual(len(reverse_result), 2)
Пример #5
0
 def find_pam_sites(self):
     """Locate PAM sites on both strands of the reference.

     Forward hits come from searching the reference for the PAM; reverse
     hits come from searching the complemented reference for the reversed
     PAM. Every hit becomes a 'pam_site' SeqFeature spanning the PAM
     length, with strand 1 or -1. The list is stored on ``self.pam_sites``
     and returned.
     """
     search_space = self.reference.seq
     pam_seq = self.pam.seq
     pam_len = len(pam_seq)

     def make_site(start, strand):
         # One feature covering the PAM-length window that starts at `start`
         location = FeatureLocation(start, start + pam_len)
         return SeqFeature(location, type='pam_site', strand=strand)

     # nt_search returns [pattern, hit, hit, ...]; the slice drops the pattern
     fwd_hits = nt_search(str(search_space), str(pam_seq))[1:]
     reversed_pam = str(pam_seq)[::-1]
     rev_hits = nt_search(str(search_space.complement()), reversed_pam)[1:]

     pam_sites = [make_site(hit, 1) for hit in fwd_hits]
     pam_sites += [make_site(hit, -1) for hit in rev_hits]
     self.pam_sites = pam_sites
     return pam_sites
Пример #6
0
def find_alignments(kmers):
    '''Map every k-mer to its nt_search hits on each reference.

    Returns a dictionary keyed by k-mer sequence string; each value is a
    dictionary keyed by reference id holding the list of hit positions of
    that k-mer in the reference (taken from the module-level ``refs``).
    '''
    def hits_by_reference(kmer_text):
        # nt_search returns [pattern, hit, hit, ...]; keep the hits only
        return {ref.id: nt_search(str(ref.seq), kmer_text)[1:]
                for ref in refs}

    alignments = {}
    for kmer in kmers:
        alignments[str(kmer.seq)] = hits_by_reference(str(kmer.seq))
    return alignments
Пример #7
0
def freq_appearance(file, query, gffutils_db, filetype='fasta'):
    """ Frequency of occurrence of a query sequence in a file

    Prints the total sequence length, the number of hits and their ratio,
    and writes the per-chromosome hit positions to
    'gc_seq_locations_on_chromosomes.csv'.

    :param file: sequence file
    :param query: sequence query (string)
    :param gffutils_db: gffutils all_features database
    :param filetype: parameter for SeqIO.parse (default='fasta')
    :returns: tuple of (DataFrame of hit positions with one column per
        chromosome name, features object returned by ``get_features``)

    """
    sc_features, chromes = get_features(gffutils_db)

    count = 0
    total_length = 0
    hits_per_record = []
    for record in SeqIO.parse(file, filetype):
        # nt_search returns [pattern, hit, hit, ...]; keep the hits only
        hits = nt_search(str(record.seq), query)[1:]
        count += len(hits)
        total_length += len(record.seq)
        hits_per_record.append(hits)

    print(total_length)
    print(count)
    # An input without any sequence used to raise ZeroDivisionError here
    print(float(count) / total_length if total_length else 0.0)

    # Size the frame to the data: with a fixed 250000-row frame, index
    # alignment on assignment silently discarded every hit past that row.
    n_rows = max([len(hits) for hits in hits_per_record] or [0])
    chroms_locs = pd.DataFrame(np.nan, index=range(n_rows), columns=chromes)
    # The i-th record in the file fills the i-th chromosome column
    for i, hits in enumerate(hits_per_record):
        # float dtype keeps the NaN-padded column type of the old layout
        chroms_locs[chromes[i]] = pd.Series(hits, dtype=float)

    chroms_locs = chroms_locs.dropna(how='all')
    chroms_locs.to_csv('gc_seq_locations_on_chromosomes.csv')

    return chroms_locs, sc_features
Пример #8
0
def complex_pattern_search(sequence, pattern, outfile, strand='+'):
    """
    Write every nt_search() hit of `pattern` in `sequence` to `outfile`.

    nt_search() accepts ambiguous values, like N = A or T or C or G,
    R = A or G ... One tab-separated line is written per hit: sequence id,
    hit start, hit start plus pattern length, sequence description, an
    empty field and the strand.
    """
    pattern_length = len(pattern)
    # Element 0 of the nt_search result is the pattern, the rest are hits
    hit_starts = nt_search(str(sequence.seq), pattern)[1:]
    for hit_start in hit_starts:
        fields = (sequence.id, hit_start, hit_start + pattern_length,
                  sequence.description, '', strand)
        outfile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % fields)
Пример #9
0
def locateWord(ref, area, word):
    """Find every occurrence of ``word`` inside a window of ``ref``.

    Args:
        ref: Record whose ``seq`` attribute is sliced and searched.
        area: Two-item sequence ``(start, end)`` delimiting the window on
            ``ref.seq``, or None.
        word: Pattern to search for; compared case-insensitively.

    Returns:
        A list of ``[start, end]`` pairs expressed in ``ref`` coordinates
        (the window start is added back to each hit), or None when
        ``area`` is None or nothing matches.
    """
    # Identity test: `== None` would invoke a custom or element-wise __eq__
    if area is None:
        return None
    window_start = area[0]
    seq = ref.seq[window_start:area[1]]
    # nt_search returns [pattern, hit, hit, ...]; keep the hits only
    hits = nt_search(str(seq.upper()), word.upper())[1:]
    located = [[window_start + i, window_start + i + len(word)] for i in hits]
    # Callers receive None rather than an empty list
    return located or None
Пример #10
0
def complex_pattern_search(sequence, pattern, outfile, strand='+'):
    """
    Search `sequence` for `pattern` with biopython's nt_search() and write
    one tab-separated line per match to `outfile`.

    Ambiguous values are allowed in the pattern, like N = A or T or C or G,
    R = A or G ... Each line holds the sequence id, the match start, the
    match start plus the pattern length, the sequence description, an empty
    field and the strand.
    """
    span = len(pattern)
    search_result = nt_search(str(sequence.seq), pattern)
    # The first element of the result is the pattern; positions follow it
    for position in search_result[1:]:
        row = (sequence.id, position, position + span,
               sequence.description, '', strand)
        outfile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % row)
Пример #11
0
    def testProduct(self):
        """Check the SS1 region of the stage-5 product against the reference.

        The product is also written to /tmp/output.gb in GenBank format.
        The SS1 region is delimited on both records by the first nt_search
        hit of a fixed upstream sequence and the end of the first hit of a
        fixed downstream sequence.
        """
        product = self.assembly.product(5)
        product.seq.alphabet = IUPAC.IUPACAmbiguousDNA() # TODO: use this alphabet for all sequences
        # Context manager so the output is flushed and the handle is closed
        with open('/tmp/output.gb', 'w') as handle:
            SeqIO.write(product, handle, 'genbank')

        # Compare the SS1 sequence of the predicted product with what is expected

        ss1_up_seq = 'atgatgttgtcaaagagtatgcgtcgttaattttatctcgttgataccgg'.upper()
        ss1_down_seq = 'gcgtcctgcttgccagatgcgatgttgtagcatcttatccagcaaccagg'.upper()

        def ss1_region(record):
            # nt_search returns [pattern, hit, ...]; index 1 is the first hit
            start = nt_search(str(record.seq), ss1_up_seq)[1]
            end = nt_search(str(record.seq), ss1_down_seq)[1] + len(ss1_down_seq)
            return record[start:end]

        product_ss1_seq = ss1_region(product)
        expected_record = SeqIO.read(
            'sequences_v2/stage-5-ss1-v2-vioabedc-gs-with-increased-vioedc-max-expression.gb',
            'genbank')
        expected_ss1_seq = ss1_region(expected_record)

        self.assertEqual(str(product_ss1_seq.seq), str(expected_ss1_seq.seq))
Пример #12
0
def search_for_substring(promoter_list, seq_to_search):
    """Run nt_search for every promoter against ``seq_to_search``.

    Returns a list with one raw nt_search result per promoter, in the
    order the promoters were given.
    """
    return [
        nt_search(str(seq_to_search), promoter)
        for promoter in promoter_list
    ]
Пример #13
0
def find_positions(seqfile, queryseq, filetype='fasta'):
    """ Finds positions of a query sequence in a file

    :param seqfile: sequence file handed to ``SeqIO.parse``
    :param queryseq: sequence to search for (passed to ``nt_search``)
    :param filetype: format name for ``SeqIO.parse`` (default='fasta')
    :returns: list shaped like an ``nt_search`` result: the search pattern
        followed by the hit positions. For a file with several records the
        hits of each record are appended in file order, each position
        being relative to its own record.
    """
    # Searching an empty string yields just [pattern], which keeps the
    # nt_search result shape even when the file holds no records.
    positions = nt_search('', queryseq)
    # SeqIO.parse returns an iterator of records; calling str() on it gives
    # the object's repr, not sequence data, so each record is searched.
    for record in SeqIO.parse(seqfile, filetype):
        positions.extend(nt_search(str(record.seq), queryseq)[1:])
    return positions
Пример #14
0
    def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None):
        """
        search_seqs - method for annotating a BioPython sequence without alignment

        Every feature sequence of the reference is searched for in the
        input sequence with ``nt_search``. Features with exactly one hit
        are mapped, features with several hits are recorded as ambiguous
        and features without a hit are recorded as missing. Positions that
        remain unmapped are grouped into blocks and handed to
        ``self._resolve_unmapped``.

        :param seqrec: The reference sequence
        :type seqrec: SeqRecord
        :param in_seq: The input sequence
        :type in_seq: SeqRecord
        :param locus: The gene locus associated with the sequence.
        :type locus: str
        :param run: The number of runs that have been done
        :type run: int
        :param partial_ann: A partial annotation from a previous step
        :type partial_ann: :ref:`ann`
        :return: A new annotation, or ``partial_ann`` unchanged when every
            feature of the reference is already annotated in it
        :rtype: :ref:`ann`

        Example usage:

            >>> from Bio.Seq import Seq
            >>> from seqann.seq_search import SeqSearch
            >>> inseq = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC')
            >>> sqsrch = SeqSearch()
            >>> ann = sqsrch.search_seqs(refseqs, inseq)

        NOTE(review): the example above omits the required ``locus``
        argument.

        """
        # Extract out the sequences and feature names
        # from the reference sequences

        # The mapped features will be subtracted from seq_covered
        # so the final seq_covered number will reflect the remaining
        # number of base pairs that haven't been mapped.
        #
        # The coordinates and mapping will help determine what positions
        # in the sequence have been mapped and to what features. The
        # missing blocks variable will be generated using these.
        structures = get_structures()
        seq_covered = len(in_seq.seq)
        # Every position 0..len(in_seq.seq) starts out unmapped (value 1)
        coordinates = dict(
            map(lambda x: [x, 1], [i for i in range(0,
                                                    len(in_seq.seq) + 1)]))

        mapping = dict(
            map(lambda x: [x, 1], [i for i in range(0,
                                                    len(in_seq.seq) + 1)]))

        ambig_map = {}
        found_feats = {}
        feat_missing = {}

        method = "nt_search" if not partial_ann else partial_ann.method

        # If the partial annotation is provided
        # then make the found_feats equal to
        # what has already been annotated
        feats = get_features(seqrec)
        if partial_ann:

            found_feats = partial_ann.features

            if self.verbose and self.verbosity > 4:
                self.logger.info("Found partial features:")
                for f in found_feats:
                    self.logger.info(f)

            # Skip references that only have features
            # that have already been annotated
            if len([f for f in feats if f in found_feats]) == len(feats):
                if self.verbose:
                    self.logger.info("Skipping incomplete refseq")
                return partial_ann

            if self.verbose and self.verbosity > 1:
                self.logger.info("Using partial annotation | " + locus + " " +
                                 str(len(partial_ann.features)))

            # Only the positions listed in the partial annotation's blocks
            # are treated as unmapped from here on
            coordinates = dict(
                map(lambda l: [l, 1], [
                    item for sublist in partial_ann.blocks for item in sublist
                ]))
            seq_covered = partial_ann.covered
            mapping = partial_ann.mapping

            if self.verbose and self.verbosity > 2:
                self.logger.info("Partial sequence coverage = " +
                                 str(seq_covered))
                self.logger.info("Partial sequence metho = " + method)

        # exon_8 / three_prime_UTR bookkeeping, used further down to undo
        # their mapping ("Unmap exon 8")
        added_feat = {}
        deleted_coords = {}
        # Features are processed in the order given by structures[locus]
        for feat_name in sorted(feats, key=lambda k: structures[locus][k]):

            # skip if partial annotation is provided
            # and the feat name is not one of the
            # missing features
            if partial_ann and feat_name not in partial_ann.refmissing:
                if self.verbose and self.verbosity > 1:
                    self.logger.info("Skipping " + feat_name +
                                     " - Already annotated")
                continue

            if self.verbose and self.verbosity > 1:
                self.logger.info("Running seqsearch for " + feat_name)

            # Search for the reference feature sequence in the
            # input sequence. Record the coordinates if it's
            # found and if it's found in multiple spots. If it
            # is not found, then record that feature as missing.
            seq_search = nt_search(str(in_seq.seq), str(feats[feat_name]))

            # nt_search returns the pattern followed by the hit positions,
            # so a length of 2 means exactly one hit
            if len(seq_search) == 2:

                if self.verbose and self.verbosity > 0:
                    self.logger.info("Found exact match for " + feat_name)

                seq_covered -= len(str(feats[feat_name]))
                end = int(len(str(feats[feat_name])) + seq_search[1])

                # A three_prime_UTR hit is extended to the end of the
                # input sequence
                if feat_name == 'three_prime_UTR' \
                        and len(str(in_seq.seq)) > end:
                    end = len(str(in_seq.seq))

                # If the feature is found and it's a five_prime_UTR then
                # the start should always be 0, so insertions at the
                # beginning of the sequence will be found.
                start = seq_search[1] if feat_name != 'five_prime_UTR' else 0
                si = seq_search[1]+1 if seq_search[1] != 0 and \
                    feat_name != 'five_prime_UTR' else 0

                # check if this features has already been mapped
                mapcheck = set(
                    [0 if i in coordinates else 1 for i in range(si, end + 1)])

                # Don't map features if they are out of order
                skip = False
                if found_feats and len(found_feats) > 0:
                    for f in found_feats:
                        o1 = structures[locus][feat_name]
                        o2 = structures[locus][f]
                        loctyp = loctype(found_feats[f].location.start,
                                         found_feats[f].location.end, start,
                                         end)

                        if o1 < o2 and loctyp:
                            skip = True
                            if self.verbose:
                                self.logger.info("Skipping map for " +
                                                 feat_name)
                        elif o2 < o1 and not loctyp:
                            skip = True
                            if self.verbose:
                                self.logger.info("Skipping map for " +
                                                 feat_name)

                # Map only when every position in the range is still unmapped
                if 1 not in mapcheck and not skip:
                    for i in range(si, end + 1):
                        if i in coordinates:
                            if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                                deleted_coords.update({i: coordinates[i]})
                            del coordinates[i]
                        else:
                            if self.verbose:
                                self.logger.error(
                                    "seqsearch - should't be here " + locus +
                                    " - " + " - " + feat_name)
                        mapping[i] = feat_name

                    found_feats.update({
                        feat_name:
                        SeqFeature(FeatureLocation(ExactPosition(start),
                                                   ExactPosition(end),
                                                   strand=1),
                                   type=feat_name)
                    })

                    if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                        added_feat.update({feat_name: feats[feat_name]})
                    if self.verbose and self.verbosity > 3:
                        self.logger.info("Coordinates | Start = " +
                                         str(start) + " - End = " + str(end))

            # More than one hit: the feature is ambiguous
            elif (len(seq_search) > 2):
                if self.verbose and self.verbosity > 1:
                    self.logger.info("Found " + str(len(seq_search)) +
                                     " matches for " + feat_name)

                # Keep only the hits whose position (or the position right
                # after it) is still unmapped
                new_seq = [seq_search[0]]
                for i in range(1, len(seq_search)):
                    tnp = seq_search[i] + 1
                    if seq_search[i] in coordinates or tnp in coordinates:
                        new_seq.append(seq_search[i])

                seq_search = new_seq
                if (partial_ann and feat_name == "exon_8" and run > 0):
                    missing_feats = sorted(list(partial_ann.missing.keys()))

                    # * HARD CODED LOGIC * #
                    # > exon8 in class I maps to multiple spots in a sequence,
                    #   often in the 3' UTR. These features need to be mapped
                    #   last to make sure it's not mapping exon8 incorrectly.
                    if (missing_feats == ['exon_8', 'three_prime_UTR']
                            and len(seq_search) <= 3):
                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Resolving exon_8")

                        # NOTE(review): seq_search[1] raises IndexError if the
                        # filtering above removed every hit - confirm that
                        # cannot happen here.
                        seq_covered -= len(str(feats[feat_name]))
                        end = int(len(str(feats[feat_name])) + seq_search[1])

                        # The first remaining hit is used as the exon_8
                        # location
                        start = seq_search[1]
                        si = seq_search[1] + 1 if seq_search[1] != 0 else 0

                        # check if this features has already been mapped
                        # NOTE(review): mapcheck is computed but never read
                        # in this branch.
                        mapcheck = set([
                            0 if i in coordinates else 1
                            for i in range(si, end + 1)
                        ])

                        for i in range(si, end + 1):
                            if i in coordinates:
                                del coordinates[i]
                            else:
                                if self.verbose:
                                    self.logger.error(
                                        "seqsearch - should't be here " +
                                        locus + " - " + " - " + feat_name)
                            mapping[i] = feat_name

                        found_feats.update({
                            feat_name:
                            SeqFeature(FeatureLocation(ExactPosition(start),
                                                       ExactPosition(end),
                                                       strand=1),
                                       type=feat_name)
                        })

                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Coordinates | Start = " +
                                             str(start) + " - End = " +
                                             str(end))
                    else:
                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Adding ambig feature " +
                                             feat_name)
                        feat_missing.update({feat_name: feats[feat_name]})
                        ambig_map.update(
                            {feat_name: seq_search[1:len(seq_search)]})
                else:
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Adding ambig feature " + feat_name)
                    feat_missing.update({feat_name: feats[feat_name]})
                    ambig_map.update(
                        {feat_name: seq_search[1:len(seq_search)]})
            else:
                # No hit at all: the feature is missing
                if self.verbose and self.verbosity > 1:
                    self.logger.info("No match for " + feat_name)
                feat_missing.update({feat_name: feats[feat_name]})

        # Group the positions that are still unmapped into blocks
        blocks = getblocks(coordinates)
        exact_matches = list(found_feats.keys())

        # * HARD CODED LOGIC * #
        # >
        #
        #  HLA-DRB1 exon3 exact match - with intron1 and 3 missing
        if ('exon_3' in exact_matches and run == 99 and locus == 'HLA-DRB1'
                and 'exon_2' in feat_missing
                and (len(blocks) == 1 or len(blocks) == 2)):

            for b in blocks:
                x = b[len(b) - 1]
                # A block ending at the last mapped position is annotated
                # as intron_3, any other block as exon_2
                if x == max(list(mapping.keys())):
                    featname = "intron_3"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                                   ExactPosition(b[len(b) -
                                                                   1]),
                                                   strand=1),
                                   type=featname)
                    })
                else:
                    featname = "exon_2"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                                   ExactPosition(b[len(b) -
                                                                   1]),
                                                   strand=1),
                                   type=featname)
                    })
                    seq_covered -= len(b)

                if self.verbose and self.verbosity > 1:
                    self.logger.info(
                        "Successfully annotated class DRB1 II sequence")

                # NOTE(review): this return sits inside the `for b in blocks`
                # loop, so only the first block is processed even when there
                # are two - confirm this is intended (the class II branch
                # below returns after its loop).
                return Annotation(features=found_feats,
                                  covered=seq_covered,
                                  seq=in_seq,
                                  missing=feat_missing,
                                  ambig=ambig_map,
                                  method=method,
                                  mapping=mapping,
                                  exact_match=exact_matches)

        # If it's a class II sequence and
        # exon_2 is an exact match
        # * HARD CODED LOGIC * #
        # > It's common for exon2 to be fully sequenced
        #   but intron_2 and intron_1 to be partially sequenced,
        #   which can make it hard to annotate those to features.
        #   If there are two missing blocks that is small enough
        #   and they are before and after exon2, then it's very
        #   very likely to be intron_2 and intron_1.
        if 'exon_2' in exact_matches and len(blocks) == 2 \
                and is_classII(locus) and seq_covered < 300:

            if self.verbose and self.verbosity > 1:
                self.logger.info("Running search for class II sequence")

            # r stays True only if both blocks are directly adjacent to a
            # position mapped to exon_2
            r = True
            for b in blocks:
                x = b[len(b) - 1]
                if x == max(list(mapping.keys())):
                    x = b[0] - 1
                else:
                    x += 1
                f = mapping[x]
                if f != 'exon_2':
                    r = False
            if r:
                for b in blocks:
                    x = b[len(b) - 1]
                    if x == max(list(mapping.keys())):
                        featname = "intron_2"
                        found_feats.update({
                            featname:
                            SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                                       ExactPosition(b[len(b) -
                                                                       1]),
                                                       strand=1),
                                       type=featname)
                        })
                    else:
                        featname = "intron_1"
                        found_feats.update({
                            featname:
                            SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                                       ExactPosition(b[len(b) -
                                                                       1]),
                                                       strand=1),
                                       type=featname)
                        })
                    seq_covered -= len(b)

                if self.verbose and self.verbosity > 1:
                    self.logger.info(
                        "Successfully annotated class II sequence")

                return Annotation(features=found_feats,
                                  covered=seq_covered,
                                  seq=in_seq,
                                  missing=feat_missing,
                                  ambig=ambig_map,
                                  method=method,
                                  mapping=mapping,
                                  exact_match=exact_matches)

        # Try to assign the remaining blocks to missing / ambiguous features;
        # mb holds the blocks reported as still missing
        annotated_feats, mb, mapping = self._resolve_unmapped(
            blocks, feat_missing, ambig_map, mapping, found_feats, locus,
            seq_covered)

        # * HARD CODED LOGIC * #
        # Blocks remain but no feature is missing or ambiguous: report the
        # blocks themselves as missing
        if (not mb and blocks and len(feat_missing.keys()) == 0
                and len(ambig_map.keys()) == 0):
            mb = blocks

        if mb:

            # Unmap exon 8
            if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 3000 \
                    and 'exon_8' in exact_matches:
                # Restore the positions taken by exon_8 / three_prime_UTR
                for i in deleted_coords:
                    mapping[i] = 1
                coordinates.update(deleted_coords)
                mb = getblocks(coordinates)
                feat_missing.update(added_feat)

                # Delete from found features
                del exact_matches[exact_matches.index('exon_8')]
                del found_feats['exon_8']

                if 'exon_8' in annotated_feats:
                    del annotated_feats['exon_8']
                if 'three_prime_UTR' in found_feats:
                    del found_feats['three_prime_UTR']
                if 'three_prime_UTR' in annotated_feats:
                    del annotated_feats['three_prime_UTR']

            # Features of the locus structure that were not annotated
            refmissing = [
                f for f in structures[locus] if f not in annotated_feats
            ]

            if self.verbose and self.verbosity > 1:
                self.logger.info("* Annotation not complete *")

            # Print out what features were missing by the ref
            if self.verbose and self.verbosity > 2:
                self.logger.info("Refseq was missing these features = " +
                                 ",".join(list(refmissing)))

            # Print out what features were ambig matches
            if self.verbose and self.verbosity > 1 and len(ambig_map) > 1:
                self.logger.info("Features with ambig matches = " +
                                 ",".join(list(ambig_map)))

            # Print out what features were exact matches
            if self.verbose and self.verbosity > 2 and len(exact_matches) > 1:
                self.logger.info("Features exact matches = " +
                                 ",".join(list(exact_matches)))

            # Print out what features have been annotated
            if self.verbose and self.verbosity > 1 and len(
                    annotated_feats) > 1:
                self.logger.info("Features annotated = " +
                                 ",".join(list(annotated_feats)))

            # Print out what features are missing
            if self.verbose and self.verbosity > 1 and len(feat_missing) > 1:
                self.logger.info("Features missing = " +
                                 ",".join(list(feat_missing)))

            annotation = Annotation(features=annotated_feats,
                                    covered=seq_covered,
                                    seq=in_seq,
                                    missing=feat_missing,
                                    ambig=ambig_map,
                                    blocks=mb,
                                    method=method,
                                    refmissing=refmissing,
                                    mapping=mapping,
                                    exact_match=exact_matches,
                                    annotation=None)
        else:

            mb = None
            # Unmap exon 8
            if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 600 \
                    and 'exon_8' in exact_matches \
                    and 'three_prime_UTR' in annotated_feats\
                    and 'three_prime_UTR' not in exact_matches:

                # Restore the positions taken by exon_8 / three_prime_UTR
                for i in deleted_coords:
                    mapping[i] = 1

                coordinates.update(deleted_coords)
                mb = getblocks(coordinates)
                feat_missing.update(added_feat)
                del exact_matches[exact_matches.index('exon_8')]
                del found_feats['exon_8']
                if 'exon_8' in annotated_feats:
                    del annotated_feats['exon_8']
                if 'three_prime_UTR' in found_feats:
                    del found_feats['three_prime_UTR']
                if 'three_prime_UTR' in annotated_feats:
                    del annotated_feats['three_prime_UTR']

            if self.verbose:
                self.logger.info("* No missing blocks after seq_search *")

            # Print out what features were ambig matches
            if self.verbose and self.verbosity > 0 and len(ambig_map) > 1:
                self.logger.info("Features with ambig matches = " +
                                 ",".join(list(ambig_map)))

            # Print out what features were exact matches
            if self.verbose and self.verbosity > 0 and len(exact_matches) > 1:
                self.logger.info("Features exact matches = " +
                                 ",".join(list(exact_matches)))

            # Print out what features have been annotated
            if self.verbose and self.verbosity > 0 and len(
                    annotated_feats) > 1:
                self.logger.info("Features annotated = " +
                                 ",".join(list(annotated_feats)))

            # Print out what features are missing
            if self.verbose and self.verbosity > 0 and len(feat_missing) > 1:
                self.logger.info("Features missing = " +
                                 ",".join(list(feat_missing)))

            annotation = Annotation(features=annotated_feats,
                                    covered=seq_covered,
                                    seq=in_seq,
                                    missing=feat_missing,
                                    ambig=ambig_map,
                                    method=method,
                                    blocks=mb,
                                    mapping=mapping,
                                    exact_match=exact_matches,
                                    annotation=None)

        return annotation
Пример #15
0
# Running tallies: reads seen, spacers written, reads without a usable spacer
totalreads = totalspacers = totalbadspacers = 0


def my_rev_complement(seq):
    """Wrap *seq* in a ``Seq`` and return its ``reverse_complement()``."""
    sequence = Seq(seq)
    return sequence.reverse_complement()


# Scan every read for a repB hit followed by a repE hit; the stretch between
# them is written (reverse-complemented) as a FASTA-style record, and reads
# lacking either hit are logged to the bad-spacer file.
for seq in input:
    totalreads += 1
    # Progress indicator: echo the running count every 100000 reads.
    if totalreads % 100000 == 0:
        print(totalreads)

    # A result longer than one element means at least one hit; the hit
    # positions are read from index 1 onward below.
    posBf = nt_search(seq, repB)
    has_spacer = len(posBf) > 1
    if has_spacer:
        # Only look for repE once repB has been found.
        posEf = nt_search(seq, repE)
        has_spacer = len(posEf) > 1

    if not has_spacer:
        totalbadspacers += 1
        badspacers.write(seq + "\n")
        continue

    # Slice from 9 characters past the first repB hit up to the first repE
    # hit. NOTE(review): 9 is presumably the repB length -- confirm.
    spacer = seq[posBf[1] + 9:posEf[1]]
    spacer_rev = my_rev_complement(spacer)
    totalspacers += 1
    output.write(">" + str(totalspacers) + "\n" + str(spacer_rev) + "\n")
 def match_target(self, pam):
     """Report whether *pam* is found in this object's target sequence.

     The target is ``self.get_target(len(pam))`` converted to ``str``;
     it is searched with ``nt_search`` and the method returns ``True``
     when the search result holds more than one element.
     """
     target = str(self.get_target(len(pam)))
     hits = nt_search(target, pam)
     return len(hits) > 1
Пример #17
0
                print structure, start, end
                print aln, score

#now guess anticodon and possible_aa based on pref code table (dictionary)
            left, right, not_bad = acodon_loop(structure)
            possible_aa = []
            possible_anticodon = []
            possible_offset = []
            possible_structure = []
            no_left_arm = 'S' in product
            if not_bad:
                # bad case happens if anticodon loop is not found, record will not be stored
                # try to minimize those by sophisticating detection
                if verbose: print 'not bad',
                for aa in pref_code:
                    hits = nt_search(aln[left:right], pref_code[aa])
                    if len(hits) > 1:
                        possible_aa.append(aa)
                        offset = int(hits[1]) + left
                        #      offset = int(hits[1]) + left + 1
                        ##this is not used internally so must be in non-pythonic coordinates! Add "1" for pythonic god!
                        possible_offset.append(offset)
                        possible_anticodon.append(aln[offset + 1:offset + 4])
                        if verbose: print aa, aln[offset + 1:offset + 4],

                        #ended up with 3 lists in the same order:
                        #one with possible aa [0]
                        #the second with respective (real) anticodons [4],
                        #third with their relative positions [6]
                        #make separate record for each valid aa+anticodon combination
                        #avoiding the need for sanitization of product
Пример #18
0
# Reverse complement of the second sequence, then print both reverse
# complements (the one for sequence 1 is computed earlier in the script).
rev_complemento_sequencia2 = sequencia2.reverse_complement()

print("Complemento reverso da Sequência 1: %s" % rev_complemento_sequencia1)
print("Complemento reverso da Sequência 2: %s" % rev_complemento_sequencia2)

#################### FLOW OF GENETIC INFORMATION ###########################
#####
#####
################### Transcription #################
rna_sequencia1 = sequencia1.transcribe()
print("RNA da Sequência 1: %s" % rna_sequencia1)

#################### Reverse transcription #################

# Convert the RNA back and print it next to the original for comparison.
dna2 = rna_sequencia1.back_transcribe()
print("Sequência 1 original\tSequência 1 após transcrição reversa")
print("%s\t%s" % (sequencia1, dna2))

################### Translation #######################
# Possible errors: sequence length is not a multiple of 3.
# NOTE(review): the first message below is labelled as the RNA translation,
# but the value printed comes from sequencia1.translate(), not from
# rna_sequencia1 -- confirm this is intended.
proteina_sequencia1 = sequencia1.translate()
print("Sequência de aminoácidos do RNA de Sequência 1: %s" %
      proteina_sequencia1)
print("Sequência de aminoácidos da Sequência 1: %s" % sequencia1.translate())

## GC content
print(GC(sequencia1))

## searching for a sub-sequence
print(nt_search(str(sequencia1), "TCGA"))