Example #1
def print_enrichment_chart(file_handle, vals, title):
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print("Error while printing. To use this functionality you need to have matplotlib installed.", file=sys.stderr)
    else:
        fig, ax1 = plt.subplots()
        
        xs = list(range(len(vals)))
        ys = vals
        
        ax1.plot(xs, ys)
        
        bar_ys = [int(ys[0] > 0)]
        for i in range(1, len(ys)):
            bar_ys.append(int(ys[i] > ys[i - 1]))
        bar_ys = [bar_ys]
        
        pos = ax1.axes.get_position()
        
        ax0 = fig.add_axes([pos.x0, pos.y1, pos.width, 0.1])
        
        ax0.imshow(bar_ys, cmap=plt.cm.Blues, interpolation='nearest')
        ax0.axes.get_yaxis().set_visible(False)
        ax0.axes.get_xaxis().set_visible(False)
        ax0.set_title(title)
        
        plt.savefig(file_handle, bbox_inches=0)
        plt.close()
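A minimal usage sketch for the function above. It assumes matplotlib is installed, that sys is imported at module level, and that the filename and values are purely illustrative:

def _demo_print_enrichment_chart():
    # Hypothetical enrichment values and output file, for illustration only.
    enrichment_scores = [0.10, 0.35, 0.30, 0.80, 0.65]
    with open("enrichment_chart.png", "wb") as handle:
        print_enrichment_chart(handle, enrichment_scores, "Example enrichment")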
Example #2
    def test_limits(self):
        """Check line graphs."""
        #TODO - Fix GD so that the same min/max is used for all three lines?
        points = 1000
        scale = math.pi * 2.0 / points
        data1 = [math.sin(x*scale) for x in range(points)]
        data2 = [math.cos(x*scale) for x in range(points)]
        data3 = [2*math.sin(2*x*scale) for x in range(points)]

        gdd = Diagram('Test Diagram', circular=False,
                      y=0.01, yt=0.01, yb=0.01,
                      x=0.01, xl=0.01, xr=0.01)
        gdt_data = gdd.new_track(1, greytrack=False)
        gds_data = gdt_data.new_set("graph")
        for data_values, name, color in zip([data1, data2, data3],
                                            ["sin", "cos", "2sin2"],
                                            ["red", "green", "blue"]):
            data = list(zip(range(points), data_values))
            gds_data.new_graph(data, "", style="line",
                               color=color, altcolor=color,
                               center=0)

        gdd.draw(format='linear',
                 tracklines=False,
                 pagesize=(15*cm, 15*cm),
                 fragments=1,
                 start=0, end=points)
        gdd.write(os.path.join('Graphics', "line_graph.pdf"), "pdf")
        #Circular diagram
        gdd.draw(tracklines=False,
                 pagesize=(15*cm, 15*cm),
                 circular=True,  # Data designed to be periodic
                 start=0, end=points, circle_core=0.5)
        gdd.write(os.path.join('Graphics', "line_graph_c.pdf"), "pdf")
Example #3
File: _Motif.py Project: BrianLinSu/rop
    def make_instances_from_counts(self):
        """Creates "fake" instances for a motif created from a count matrix.

        In case the sums of counts are different for different columns, the
        shorter columns are padded with background.
        """
        alpha = "".join(self.alphabet.letters)
        # col[i] is a column taken from aligned motif instances
        col = []
        self.has_instances = True
        self.instances = []
        s = sum(self.counts[nuc][0] for nuc in self.alphabet.letters)
        for i in range(self.length):
            col.append("")
            for n in self.alphabet.letters:
                col[i] = col[i] + n * (self.counts[n][i])
            if len(col[i]) < s:
                print("WARNING, column too short %i %i" % (len(col[i]), s))
                col[i] += (alpha * s)[:(s - len(col[i]))]
            # print("column %i, %s" % (i, col[i]))
        # iterate over instances
        for i in range(s):
            inst = ""  # start with empty seq
            for j in range(self.length):  # iterate over positions
                inst += col[j][i]
            # print("%i %s" % (i, inst))
            inst = Seq(inst, self.alphabet)
            self.add_instance(inst)
        return self.instances
Example #4
    def format_phylip(self, handle):
        """Write data in Phylip format to a given file-like object or handle.

        The output stream is the input distance matrix format used with Phylip
        programs (e.g. 'neighbor'). See:
        http://evolution.genetics.washington.edu/phylip/doc/neighbor.html

        :Parameters:
            handle : file or file-like object
                A writeable file handle or other object supporting the 'write'
                method, such as StringIO or sys.stdout. On Python 3, should be
                open in text mode.

        """
        handle.write("    {0}\n".format(len(self.names)))
        # Phylip needs space-separated, vertically aligned columns
        name_width = max(12, max(map(len, self.names)) + 1)
        value_fmts = ("{" + str(x) + ":.4f}"
                      for x in range(1, len(self.matrix) + 1))
        row_fmt = "{0:" + str(name_width) + "s}" + "  ".join(value_fmts) + "\n"
        for i, (name, values) in enumerate(zip(self.names, self.matrix)):
            # Mirror the matrix values across the diagonal
            mirror_values = (self.matrix[j][i]
                             for j in range(i + 1, len(self.matrix)))
            fields = itertools.chain([name], values, mirror_values)
            handle.write(row_fmt.format(*fields))
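A hedged usage sketch for the method above, assuming it belongs to Biopython's Bio.Phylo.TreeConstruction.DistanceMatrix; the names and distances are made up:

import sys
from Bio.Phylo.TreeConstruction import DistanceMatrix

dm = DistanceMatrix(names=["Alpha", "Beta", "Gamma"],
                    matrix=[[0], [0.23, 0], [0.38, 0.61, 0]])
dm.format_phylip(sys.stdout)  # writes a 3-taxon Phylip distance matrix to stdout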
Example #5
def lowess(x, y, f=2. / 3., iter=3):
    """lowess(x, y, f=2./3., iter=3) -> yest

    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a scatterplot.
    The arrays x and y contain an equal number of elements; each pair
    (x[i], y[i]) defines a data point in the scatterplot. The function returns
    the estimated (smooth) values of y.

    The smoothing span is given by f. A larger value for f will result in a
    smoother curve. The number of robustifying iterations is given by iter. The
    function will run faster with a smaller number of iterations.

    x and y should be numpy float arrays of equal length.  The return value is
    also a numpy float array of that length.

    e.g.
    >>> import numpy
    >>> x = numpy.array([4,  4,  7,  7,  8,  9, 10, 10, 10, 11, 11, 12, 12, 12,
    ...                 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16,
    ...                 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20,
    ...                 20, 22, 23, 24, 24, 24, 24, 25], numpy.float)
    >>> y = numpy.array([2, 10,  4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24,
    ...                 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40,
    ...                 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56,
    ...                 64, 66, 54, 70, 92, 93, 120, 85], numpy.float)
    >>> result = lowess(x, y)
    >>> len(result)
    50
    >>> print("[%0.2f, ..., %0.2f]" % (result[0], result[-1]))
    [4.85, ..., 84.98]
    """
    n = len(x)
    r = int(numpy.ceil(f * n))
    h = [numpy.sort(abs(x - x[i]))[r] for i in range(n)]
    w = numpy.clip(abs(([x] - numpy.transpose([x])) / h), 0.0, 1.0)
    w = 1 - w * w * w
    w = w * w * w
    yest = numpy.zeros(n)
    delta = numpy.ones(n)
    for iteration in range(iter):
        for i in range(n):
            weights = delta * w[:, i]
            weights_mul_x = weights * x
            b1 = numpy.dot(weights, y)
            b2 = numpy.dot(weights_mul_x, y)
            A11 = sum(weights)
            A12 = sum(weights_mul_x)
            A21 = A12
            A22 = numpy.dot(weights_mul_x, x)
            determinant = A11 * A22 - A12 * A21
            beta1 = (A22 * b1 - A12 * b2) / determinant
            beta2 = (A11 * b2 - A21 * b1) / determinant
            yest[i] = beta1 + beta2 * x[i]
        residuals = y - yest
        s = numpy.median(abs(residuals))
        delta[:] = numpy.clip(residuals / (6 * s), -1, 1)
        delta[:] = 1 - delta * delta
        delta[:] = delta * delta
    return yest
Example #6
    def forward_algorithm(self):
        """Calculate sequence probability using the forward algorithm.

        This implements the forward algorithm, as described on p57-58 of
        Durbin et al.

        Returns:

        o A dictionary containing the forward variables. This has keys of the
        form (state letter, position in the training sequence), and values
        containing the calculated forward variable.

        o The calculated probability of the sequence.
        """
        # all of the different letters that the state path can be in
        state_letters = self._seq.states.alphabet.letters

        # -- initialize the algorithm
        #
        # NOTE: My index numbers are one less than what is given in Durbin
        # et al, since we are indexing the sequence going from 0 to
        # (Length - 1) not 1 to Length, like in Durbin et al.
        #
        forward_var = {}
        # f_{0}(0) = 1
        forward_var[(state_letters[0], -1)] = 1
        # f_{k}(0) = 0, for k > 0
        for k in range(1, len(state_letters)):
            forward_var[(state_letters[k], -1)] = 0

        # -- now do the recursion step
        # loop over the training sequence
        # Recursion step: (i = 1 .. L)
        for i in range(len(self._seq.emissions)):
            # now loop over the letters in the state path
            for main_state in state_letters:
                # calculate the forward value using the appropriate
                # method to prevent underflow errors
                forward_value = self._forward_recursion(main_state, i,
                                                        forward_var)

                if forward_value is not None:
                    forward_var[(main_state, i)] = forward_value

        # -- termination step - calculate the probability of the sequence
        first_state = state_letters[0]
        seq_prob = 0

        for state_item in state_letters:
            # f_{k}(L)
            forward_value = forward_var[(state_item,
                                         len(self._seq.emissions) - 1)]
            # a_{k0}
            transition_value = self._mm.transition_prob[(state_item,
                                                         first_state)]

            seq_prob += forward_value * transition_value

        return forward_var, seq_prob
Example #7
    def calculate(self, sequence):
        """Returns the PWM score for a given sequence for all positions.

        Notes:

         - the sequence can only be a DNA sequence
         - the search is performed only on one strand
         - if the sequence and the motif have the same length, a single
           number is returned
         - otherwise, the result is a one-dimensional list or numpy array
        """
        # TODO - Code itself tolerates ambiguous bases (as NaN).
        if not isinstance(self.alphabet, IUPAC.IUPACUnambiguousDNA):
            raise ValueError("PSSM has wrong alphabet: %s - Use only with DNA motifs"
                                 % self.alphabet)
        if not isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA):
            raise ValueError("Sequence has wrong alphabet: %r - Use only with DNA sequences"
                                 % sequence.alphabet)

        # TODO - Force uppercase here and optimise switch statement in C
        # by assuming upper case?
        sequence = str(sequence)
        m = self.length
        n = len(sequence)

        scores = []
        # check if the fast C code can be used
        try:
            from . import _pwm
        except ImportError:
            # use the slower Python code otherwise
            # The C code handles mixed case so Python version must too:
            sequence = sequence.upper()
            for i in range(n - m + 1):
                score = 0.0
                for position in range(m):
                    letter = sequence[i + position]
                    try:
                        score += self[letter][position]
                    except KeyError:
                        score = float("nan")
                        break
                scores.append(score)
        else:
            # get the log-odds matrix into a proper shape
            # (each row contains sorted (ACGT) log-odds values)
            logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)]
            scores = _pwm.calculate(sequence, logodds)
        if len(scores) == 1:
            return scores[0]
        else:
            return scores
Example #8
 def load_seqrecord(self, record):
     """Load a Biopython SeqRecord into the database."""
     bioentry_id = self._load_bioentry_table(record)
     self._load_bioentry_date(record, bioentry_id)
     self._load_biosequence(record, bioentry_id)
     self._load_comment(record, bioentry_id)
     self._load_dbxrefs(record, bioentry_id)
     references = record.annotations.get("references", ())
     for reference, rank in zip(references, list(range(len(references)))):
         self._load_reference(reference, rank, bioentry_id)
     self._load_annotations(record, bioentry_id)
     for seq_feature_num in range(len(record.features)):
         seq_feature = record.features[seq_feature_num]
         self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
Example #9
 def test_illumina_to_sanger(self):
     """Mapping check for FASTQ Illumina (0 to 62) to Sanger (0 to 62)"""
     seq = "N"*63
     qual = "".join(chr(64+q) for q in range(0, 63))
     expected_phred = list(range(63))
     in_handle = StringIO("@Test\n%s\n+\n%s" % (seq, qual))
     out_handle = StringIO()
     SeqIO.write(SeqIO.parse(in_handle, "fastq-illumina"),
                 out_handle, "fastq-sanger")
     out_handle.seek(0)
     record = SeqIO.read(out_handle, "fastq-sanger")
     self.assertEqual(str(record.seq), seq)
     self.assertEqual(record.letter_annotations["phred_quality"],
                      expected_phred)
Example #10
 def _calculate(self, sequence, m, n):
     # The C code handles mixed case so Python version must too:
     sequence = sequence.upper()
     scores = []
     for i in range(n - m + 1):
         score = 0.0
         for position in range(m):
             letter = sequence[i + position]
             try:
                 score += self[letter][position]
             except KeyError:
                 score = float("nan")
                 break
         scores.append(score)
     return scores
Example #11
 def dist_pearson_at(self, other, offset):
     """Return the similarity score based on pearson correlation at the given offset."""
     letters = self.alphabet
     sx = 0.0   # \sum x
     sy = 0.0   # \sum y
     sxx = 0.0  # \sum x^2
     sxy = 0.0  # \sum x \cdot y
     syy = 0.0  # \sum y^2
     norm = max(self.length, offset + other.length) * len(letters)
     for pos in range(min(self.length - offset, other.length)):
         xi = [self[letter, pos + offset] for letter in letters]
         yi = [other[letter, pos] for letter in letters]
         sx += sum(xi)
         sy += sum(yi)
         sxx += sum(x * x for x in xi)
         sxy += sum(x * y for x, y in zip(xi, yi))
         syy += sum(y * y for y in yi)
     sx /= norm
     sy /= norm
     sxx /= norm
     sxy /= norm
     syy /= norm
     numerator = sxy - sx * sy
     denominator = math.sqrt((sxx - sx * sx) * (syy - sy * sy))
     return numerator / denominator
Example #12
    def normalize(self, pseudocounts=None):
        """Create and return a position-weight matrix by normalizing the counts matrix.

        If pseudocounts is None (default), no pseudocounts are added
        to the counts.

        If pseudocounts is a number, it is added to the counts before
        calculating the position-weight matrix.

        Alternatively, the pseudocounts can be a dictionary with a key
        for each letter in the alphabet associated with the motif.
        """
        counts = {}
        if pseudocounts is None:
            for letter in self.alphabet.letters:
                counts[letter] = [0.0] * self.length
        elif isinstance(pseudocounts, dict):
            for letter in self.alphabet.letters:
                counts[letter] = [float(pseudocounts[letter])] * self.length
        else:
            for letter in self.alphabet.letters:
                counts[letter] = [float(pseudocounts)] * self.length
        for i in range(self.length):
            for letter in self.alphabet.letters:
                counts[letter][i] += self[letter][i]
        # Actual normalization is done in the PositionWeightMatrix initializer
        return PositionWeightMatrix(self.alphabet, counts)
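A hedged usage sketch, assuming this is the normalize method of a Bio.motifs counts matrix; the instances are made up:

from Bio import motifs
from Bio.Seq import Seq

m = motifs.create([Seq("TACAA"), Seq("TACGC"), Seq("TACAC")])
pwm = m.counts.normalize(pseudocounts=0.5)
print(pwm)  # per-position A/C/G/T frequencies after adding 0.5 to every count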
Example #13
 def std(self, background=None):
     """Standard deviation of the score of a motif."""
     if background is None:
         background = dict.fromkeys(self._letters, 1.0)
     else:
         background = dict(background)
     total = sum(background.values())
     for letter in self._letters:
         background[letter] /= total
     variance = 0.0
     for i in range(self.length):
         sx = 0.0
         sxx = 0.0
         for letter in self._letters:
             logodds = self[letter, i]
             if math.isnan(logodds):
                 continue
             if math.isinf(logodds) and logodds < 0:
                 continue
             b = background[letter]
             p = b * math.pow(2, logodds)
             sx += p * logodds
             sxx += p * logodds * logodds
         sxx -= sx * sx
         variance += sxx
     variance = max(variance, 0)  # to avoid roundoff problems
     return math.sqrt(variance)
Example #14
 def _get_perms(self, gene_list, perms_no):
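     """Return perms_no shuffled copies of gene_list."""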
     perms = []
     permutation = list(gene_list)
     for _ in range(perms_no):
         random.shuffle(permutation)
         perms.append(list(permutation))
     return perms
Example #15
File: _Motif.py Project: BrianLinSu/rop
    def pwm(self, laplace=True):
        """
        returns the PWM computed for the set of instances

        if laplace=True (default), pseudocounts equal to self.background multiplied by self.beta are added to all positions.
        """
        if self._pwm_is_current:
            return self._pwm
        # we need to compute new pwm
        self._pwm = []
        for i in range(self.length):
            dict = {}
            # filling the dict with 0's
            for letter in self.alphabet.letters:
                if laplace:
                    dict[letter] = self.beta * self.background[letter]
                else:
                    dict[letter] = 0.0
            if self.has_counts:
                # taking the raw counts
                for letter in self.alphabet.letters:
                    dict[letter] += self.counts[letter][i]
            elif self.has_instances:
                # counting the occurrences of letters in instances
                for seq in self.instances:
                    # dict[seq[i]]=dict[seq[i]]+1
                    try:
                        dict[seq[i]] += 1
                    except KeyError:  # we need to ignore non-alphabet letters
                        pass
            self._pwm.append(FreqTable.FreqTable(dict, FreqTable.COUNT, self.alphabet))
        self._pwm_is_current = 1
        return self._pwm
Example #16
 def __str__(self):
     """Get a lower triangular matrix string."""
     matrix_string = '\n'.join(
         [self.names[i] + "\t" + "\t".join([str(n) for n in self.matrix[i]])
          for i in range(0, len(self))])
     matrix_string = matrix_string + "\n\t" + "\t".join(self.names)
     return matrix_string
Example #17
    def _crossover(self, x, no, locs):
        """Generalized Crossover Function:

           arguments:
               - x (int) - genome number [0|1]
               - no ((organism, organism)) - the new organisms
               - locs ((int list, int list)) - lists of locations,
                 [0, +n points+, bound], for each genome (sync'd with x)

            return type: sequence (to replace no[x])
        """
        s = no[x].genome[:locs[x][1]]
        for n in range(1, self._npoints):
            # flipflop between genome_0 and genome_1
            mode = (x+n)%2
            # _generate_locs gives us [0, +n points+, bound]
            #  so we can iterate: { 0:loc(1) ... loc(n):bound }
            t = no[mode].genome[locs[mode][n]:locs[mode][n+1]]
            if (s):
                s = s + t
            else:
                s = t
        return s
Example #18
 def check_general_fails(self, filename, good_count):
     handle = open(filename, _universal_read_mode)
     tuples = QualityIO.FastqGeneralIterator(handle)
     for i in range(good_count):
         title, seq, qual = next(tuples)  # Make sure no errors!
     self.assertRaises(ValueError, next, tuples)
     handle.close()
Example #19
def _gen_random_array(n):
    """Return an array of n random numbers summing to 1.0 (PRIVATE)."""
    randArray = [random.random() for i in range(n)]
    total = sum(randArray)
    normalizedRandArray = [x / total for x in randArray]

    return normalizedRandArray
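A quick, hedged check of the helper above; the individual values are random, so only the sum is predictable:

probs = _gen_random_array(4)
print(probs)
print(sum(probs))  # ~1.0, up to floating-point rounding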
Example #20
File: bai.py Project: jhrf/Playground
def _load_bai(handle):
    indexes = []
    magic = handle.read(4)
    if magic != _BAI_magic:
        raise ValueError("BAM index files should start %r, not %r" % (_BAI_magic, magic))
    assert 4 == struct.calcsize("<i")
    assert 8 == struct.calcsize("<Q")
    data = handle.read(4)
    n_ref = struct.unpack("<i", data)[0]
    # print("%i references" % n_ref)
    for n in range(n_ref):
        indexes.append(_load_ref_index(handle))
    # This is missing on very old samtools index files,
    # and isn't in the SAM/BAM specification yet either.
    # This was reverse engineered vs "samtools idxstats"
    data = handle.read(8)
    if data:
        unmapped = struct.unpack("<Q", data)[0]
        # print("%i unmapped reads" % unmapped)
    else:
        unmapped = None
        # print("Index missing unmapped reads count")
    data = handle.read()
    if data:
        print("%i extra bytes" % len(data))
        print(repr(data))
    return indexes, unmapped
Example #21
def matches_schema(pattern, schema, ambiguity_character='*'):
    """Determine whether or not the given pattern matches the schema.

    Arguments:

    o pattern - A string representing the pattern we want to check for
    matching. This pattern can contain ambiguity characters (which are
    assumed to be the same as those in the schema).

    o schema - A string schema with ambiguity characters.

    o ambiguity_character - The character used for ambiguity in the schema.
    """
    if len(pattern) != len(schema):
        return 0

    # check each position, and return a non-match if the schema and pattern
    # are unambiguous and don't match
    for pos in range(len(pattern)):
        if schema[pos] != ambiguity_character and \
           pattern[pos] != ambiguity_character and \
           pattern[pos] != schema[pos]:

            return 0

    return 1
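A hedged usage sketch; the patterns and schema below are made up:

print(matches_schema("GATC", "GAT*"))  # 1 -- '*' in the schema matches anything
print(matches_schema("GATC", "GAAC"))  # 0 -- mismatch at an unambiguous position
print(matches_schema("GAT", "GATC"))   # 0 -- length mismatch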
Example #22
    def insert(self, name, value, index=None):
        """Insert distances given the name and value.

        :Parameters:
            name : str
                name of a row/col to be inserted
            value : list
                a row/col of values to be inserted

        """
        if isinstance(name, str):
            # insert at the given index or at the end
            if index is None:
                index = len(self)
            if not isinstance(index, int):
                raise TypeError("Invalid index type.")
            # insert name
            self.names.insert(index, name)
            # insert elements of 0, to be assigned
            self.matrix.insert(index, [0] * index)
            for i in range(index, len(self)):
                self.matrix[i].insert(index, 0)
            # assign value
            self[index] = value
        else:
            raise TypeError("Invalid name type.")
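A hedged usage sketch, assuming the method belongs to Biopython's Bio.Phylo.TreeConstruction.DistanceMatrix; the names and values are made up:

from Bio.Phylo.TreeConstruction import DistanceMatrix

dm = DistanceMatrix(names=["Alpha", "Beta"], matrix=[[0], [0.5, 0]])
dm.insert("Gamma", [0.7, 0.9, 0])
print(dm.names)   # ['Alpha', 'Beta', 'Gamma']
print(dm.matrix)  # [[0], [0.5, 0], [0.7, 0.9, 0]]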
Example #23
    def representation(self, sequence):
        """Represent the given input sequence as a bunch of motif counts.

        Arguments:

        o sequence - A Bio.Seq object we are going to represent as schemas.

        This takes the sequence, searches for the motifs within it, and then
        returns counts specifying the relative number of times each motif
        was found. The frequencies are in the order the original motifs were
        passed into the initializer.
        """
        schema_counts = []

        for schema in self._schemas:
            num_counts = self._converter.num_matches(schema, str(sequence))
            schema_counts.append(num_counts)

        # normalize the counts to go between zero and one
        min_count = 0
        max_count = max(schema_counts)

        # only normalize if we've actually found something, otherwise
        # we'll just return 0 for everything
        if max_count > 0:
            for count_num in range(len(schema_counts)):
                schema_counts[count_num] = (float(schema_counts[count_num]) -
                                           float(min_count)) / float(max_count)

        return schema_counts
Example #24
    def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
        """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE).

        Qualifiers should be a dictionary of the form:
            {key : [value1, value2]}
        """
        tag_ontology_id = self._get_ontology_id("Annotation Tags")
        for qualifier_key in qualifiers:
            # Treat db_xref qualifiers differently to sequence annotation
            # qualifiers by populating the seqfeature_dbxref and dbxref
            # tables.  Other qualifiers go into the seqfeature_qualifier_value
            # and (if new) term tables.
            if qualifier_key != "db_xref":
                qualifier_key_id = self._get_term_id(qualifier_key, ontology_id=tag_ontology_id)
                # now add all of the values to their table
                entries = qualifiers[qualifier_key]
                if not isinstance(entries, list):
                    # Could be a plain string, or an int or a float.
                    # However, we expect a list of strings here.
                    entries = [entries]
                for qual_value_rank in range(len(entries)):
                    qualifier_value = entries[qual_value_rank]
                    sql = (
                        r"INSERT INTO seqfeature_qualifier_value "
                        r" (seqfeature_id, term_id, rank, value) VALUES"
                        r" (%s, %s, %s, %s)"
                    )
                    self.adaptor.execute(sql, (seqfeature_id, qualifier_key_id, qual_value_rank + 1, qualifier_value))
            else:
                # The dbxref_id qualifier/value sets go into the dbxref table
                # as dbname, accession, version tuples, with dbxref.dbxref_id
                # being automatically assigned, and into the seqfeature_dbxref
                # table as seqfeature_id, dbxref_id, and rank tuples
                self._load_seqfeature_dbxref(qualifiers[qualifier_key], seqfeature_id)
Example #25
def intermediate_points(start, end, graph_data):
    """Generate intermediate points describing provided graph data.

    Returns a list of (start, end, value) tuples describing the passed
    graph data as 'bins' between position midpoints.
    """
    # print start, end, len(graph_data)
    newdata = []    # data in form (X0, X1, val)
    # add first block
    newdata.append((start,
                    graph_data[0][0] + (graph_data[1][0] - graph_data[0][0]) / 2.,
                    graph_data[0][1]))
    # add middle set
    for index in range(1, len(graph_data) - 1):
        lastxval, lastyval = graph_data[index - 1]
        xval, yval = graph_data[index]
        nextxval, nextyval = graph_data[index + 1]
        newdata.append((lastxval + (xval - lastxval) / 2.,
                        xval + (nextxval - xval) / 2., yval))
    # add last block
    newdata.append((xval + (nextxval - xval) / 2.,
                    end, graph_data[-1][1]))
    # print newdata[-1]
    # print newdata
    return newdata
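A hedged usage sketch with made-up (position, value) pairs; note the function assumes at least three data points:

data = [(10, 1.0), (20, 2.0), (30, 1.5)]
print(intermediate_points(0, 40, data))
# [(0, 15.0, 1.0), (15.0, 25.0, 2.0), (25.0, 40, 1.5)]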
Example #26
def calculate_pseudocounts(motif):
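    """Return per-letter pseudocounts: sqrt(average column total) scaled by the background frequency."""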
    alphabet = motif.alphabet
    background = motif.background

    # It is possible to have unequal column sums so use the average
    # number of instances.
    total = 0
    for i in range(motif.length):
        total += sum(float(motif.counts[letter][i])
                     for letter in alphabet.letters)

    avg_nb_instances = total / motif.length
    sq_nb_instances = math.sqrt(avg_nb_instances)

    if background:
        background = dict(background)
    else:
        background = dict.fromkeys(sorted(alphabet.letters), 1.0)

    total = sum(background.values())
    pseudocounts = {}

    for letter in alphabet.letters:
        background[letter] /= total
        pseudocounts[letter] = sq_nb_instances * background[letter]

    return pseudocounts
Example #27
def kolmogorov_smirnov_rank_test(gene_set, gene_list, adj_corr, plot=False):
    """
    Rank test used in the GSEA method. It measures the dispersion of genes from
    gene_set over a gene_list. Every gene from gene_list has its weight
    specified by adj_corr, where adj_corr are the gene weights (correlation
    with phenotype) already raised to the power of the parameter p, which changes
    the relative importance of the weights. plot defines whether the method should
    also return the running ES for each position in the ranking; if plot=False
    (default) the second returned object is None.

    Reference: http://www.pnas.org/content/102/43/15545.full
    """
    
    cval = 0
    Dn = 0
    Nr = 0
    
    N = len(gene_list)
    Nh = 0
    
    for i in range(N):
        if gene_list[i] in gene_set:
            Nr += adj_corr[i] 
            Nh += 1
    
    if N == Nh:
        miss_pen = 1.
    else:
        miss_pen = float(1) / (N - Nh)
    
    if plot:
        stat_plot = N * [None]
    else:
        stat_plot = None
    for i in range(N):
        if gene_list[i] in gene_set:
            cval += adj_corr[i] / Nr
        else:
            cval -= miss_pen
        if plot:
            stat_plot[i] = cval

        if abs(cval) > abs(Dn):
            Dn = cval
    return (Dn, stat_plot)
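A hedged usage sketch with toy data: three of the five ranked genes belong to the set and every weight is 1.0 (i.e. already raised to the power p):

gene_set = {"g1", "g3", "g4"}
gene_list = ["g1", "g2", "g3", "g4", "g5"]
adj_corr = [1.0] * len(gene_list)
es, curve = kolmogorov_smirnov_rank_test(gene_set, gene_list, adj_corr, plot=True)
print(es)     # ~0.5 for this toy ranking (up to floating-point rounding)
print(curve)  # the running enrichment score at each position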
Example #28
def compare_sequence(old, new):
    """Compare two Seq or DBSeq objects."""
    assert len(old) == len(new), "%i vs %i" % (len(old), len(new))
    assert str(old) == str(new)

    if isinstance(old, UnknownSeq):
        assert isinstance(new, UnknownSeq)
    else:
        assert not isinstance(new, UnknownSeq)

    ln = len(old)
    s = str(old)
    assert isinstance(s, str)

    # Don't check every single element; for long sequences
    # this takes far far far too long to run!
    # Test both positive and negative indices
    if ln < 50:
        indices = list(range(-ln, ln))
    else:
        # A selection of end cases, and the mid point
        indices = [-ln, -ln + 1, -(ln // 2), -1, 0, 1, ln // 2, ln - 2, ln - 1]

    # Test element access,
    for i in indices:
        expected = s[i]
        assert expected == old[i]
        assert expected == new[i]

    # Test slices
    indices.append(ln)  # check copes with overflows
    indices.append(ln + 1000)  # check copes with overflows
    for i in indices:
        for j in indices:
            expected = s[i:j]
            assert expected == str(old[i:j]), \
                   "Slice %s vs %s" % (repr(expected), repr(old[i:j]))
            assert expected == str(new[i:j]), \
                   "Slice %s vs %s" % (repr(expected), repr(new[i:j]))
            # Slicing with step of 1 should make no difference.
            # Slicing with step 3 might be useful for codons.
            for step in [1, 3]:
                expected = s[i:j:step]
                assert expected == str(old[i:j:step])
                assert expected == str(new[i:j:step])

        # Check automatic end points
        expected = s[i:]
        assert expected == str(old[i:])
        assert expected == str(new[i:])

        expected = s[:i]
        assert expected == str(old[:i])
        assert expected == str(new[:i])

    # Check "copy" splice
    assert s == str(old[:])
    assert s == str(new[:])
    return True
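A hedged usage sketch: comparing a Bio.Seq.Seq object against an identical copy (the real tests compare a Seq against a BioSQL DBSeq):

from Bio.Seq import Seq

compare_sequence(Seq("ACGTACGTAA"), Seq("ACGTACGTAA"))  # returns True when every check passes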
Example #29
def _gen_random_array(n):
    """Return an array of n random numbers where the elements sum to 1.0."""
    randArray = [random.random() for i in range(n)]
    total = sum(randArray)
    normalizedRandArray = [x/total for x in randArray]

    return normalizedRandArray
Example #30
 def __init__(self, alphabet, counts):
     GenericPositionMatrix.__init__(self, alphabet, counts)
     for i in range(self.length):
         total = sum(float(self[letter][i]) for letter in alphabet.letters)
         for letter in alphabet.letters:
             self[letter][i] /= total
     for letter in alphabet.letters:
         self[letter] = tuple(self[letter])
Example #31
 def dist_product(self, other):
     """
     A similarity measure taking into account a product probability of generating overlapping instances of two motifs
     """
     max_p = 0.0
     for offset in range(-self.length + 1, other.length):
         if offset < 0:
             p = self.dist_product_at(other, -offset)
         else:  #offset>=0
             p = other.dist_product_at(self, offset)
         if max_p < p:
             max_p = p
             max_o = -offset
     return 1 - max_p / self.dist_product_at(self, 0), max_o
Example #32
 def test_sanger_to_solexa(self):
     """Mapping check for FASTQ Sanger (0 to 93) to Solexa (-5 to 62)"""
     # The point of this test is the writing code doesn't actually use the
     # solexa_quality_from_phred function directly. For speed it uses a
     # cached dictionary of the mappings.
     seq = "N" * 94
     qual = "".join(chr(33 + q) for q in range(0, 94))
     expected_sol = [
         min(62, int(round(QualityIO.solexa_quality_from_phred(q))))
         for q in range(0, 94)
     ]
     in_handle = StringIO("@Test\n%s\n+\n%s" % (seq, qual))
     out_handle = StringIO()
     with warnings.catch_warnings(record=True) as w:
         warnings.simplefilter("always", BiopythonWarning)
         SeqIO.write(SeqIO.parse(in_handle, "fastq-sanger"), out_handle,
                     "fastq-solexa")
         self.assertTrue(len(w) <= 1, w)
     out_handle.seek(0)
     record = SeqIO.read(out_handle, "fastq-solexa")
     self.assertEqual(str(record.seq), seq)
     self.assertEqual(record.letter_annotations["solexa_quality"],
                      expected_sol)
Example #33
    def __str__(self, masked=False):
        """Return string representation of a motif."""
        text = ""
        if self.instances is not None:
            text += str(self.instances)

        if masked:
            for i in range(self.length):
                if self.__mask[i]:
                    text += "*"
                else:
                    text += " "
            text += "\n"
        return text
Example #34
def fmt_cdt(sample_ids, rows):
    """Format as CDT."""
    outheader = ['GID', 'CLID', 'NAME', 'GWEIGHT'] + sample_ids
    header2 = ['AID', '', '', '']
    header2.extend(
        ['ARRY' + str(i).zfill(3) + 'X' for i in range(len(sample_ids))])
    outrows = [header2]
    for i, row in enumerate(rows):
        probe, values = row[0], row[1:]
        outrow = ['GENE%dX' % i, 'IMAGE:%d' % i, probe.label,
                  1]  # or probe.gene?
        outrow.extend(values)
        outrows.append(outrow)
    return outheader, outrows
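A hedged usage sketch; each row is (probe, value, value, ...) where the probe object only needs a .label attribute, so a namedtuple stands in here:

from collections import namedtuple

Probe = namedtuple("Probe", "label")
rows = [[Probe("chr1:100-200"), 0.10, 0.25],
        [Probe("chr1:300-400"), -0.30, 0.40]]
outheader, outrows = fmt_cdt(["sample1", "sample2"], rows)
print(outheader)   # ['GID', 'CLID', 'NAME', 'GWEIGHT', 'sample1', 'sample2']
print(outrows[0])  # ['AID', '', '', '', 'ARRY000X', 'ARRY001X']
print(outrows[1])  # ['GENE0X', 'IMAGE:0', 'chr1:100-200', 1, 0.1, 0.25]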
Example #35
 def test_solexa_quality_from_phred(self):
     """Mapping check for function solexa_quality_from_phred"""
     self.assertEqual(-5, round(QualityIO.solexa_quality_from_phred(0)))
     self.assertEqual(-5, round(QualityIO.solexa_quality_from_phred(1)))
     self.assertEqual(-2, round(QualityIO.solexa_quality_from_phred(2)))
     self.assertEqual(0, round(QualityIO.solexa_quality_from_phred(3)))
     self.assertEqual(2, round(QualityIO.solexa_quality_from_phred(4)))
     self.assertEqual(3, round(QualityIO.solexa_quality_from_phred(5)))
     self.assertEqual(5, round(QualityIO.solexa_quality_from_phred(6)))
     self.assertEqual(6, round(QualityIO.solexa_quality_from_phred(7)))
     self.assertEqual(7, round(QualityIO.solexa_quality_from_phred(8)))
     self.assertEqual(8, round(QualityIO.solexa_quality_from_phred(9)))
     for i in range(10, 100):
         self.assertEqual(i, round(QualityIO.solexa_quality_from_phred(i)))
Example #36
    def __init__(self, names, matrix=None):
        """Initialize matrix by a list of names and a list of
        lower triangular matrix data.
        """
        # check names
        if isinstance(names, list) and all(isinstance(s, str) for s in names):
            if len(set(names)) == len(names):
                self.names = names
            else:
                raise ValueError("Duplicate names found")
        else:
            raise TypeError("'names' should be a list of strings")

        # check matrix
        if matrix is None:
            # create a new one with 0 if matrix is not assigned
            matrix = [[0] * i for i in range(1, len(self) + 1)]
            self.matrix = matrix
        else:
            # check if all elements are numbers
            if (isinstance(matrix, list) and
                all(isinstance(l, list) for l in matrix) and
                all(_is_numeric(n) for n in [item for sublist in matrix
                                             for item in sublist])):
                # check if the same length with names
                if len(matrix) == len(names):
                    # check if is lower triangle format
                    if [len(m) for m in matrix] == list(range(1, len(self) + 1)):
                        self.matrix = matrix
                    else:
                        raise ValueError(
                            "'matrix' should be in lower triangle format")
                else:
                    raise ValueError(
                        "'names' and 'matrix' should be the same size")
            else:
                raise TypeError("'matrix' should be a list of numerical lists")
Example #37
    def backward_algorithm(self):
        """Calculate sequence probability using the backward algorithm.

        This implements the backward algorithm, as described on p58-59 of
        Durbin et al.

        Returns:

        o A dictionary containing the backwards variables. This has keys
        of the form (state letter, position in the training sequence),
        and values containing the calculated backward variable.
        """
        # all of the different letters that the state path can be in
        state_letters = self._seq.states.alphabet.letters

        # -- initialize the algorithm
        #
        # NOTE: My index numbers are one less than what is given in Durbin
        # et al, since we are indexing the sequence going from 0 to
        # (Length - 1) not 1 to Length, like in Durbin et al.
        #
        backward_var = {}

        first_letter = state_letters[0]
        # b_{k}(L) = a_{k0} for all k
        for state in state_letters:
            backward_var[(state, len(self._seq.emissions) - 1)] = \
              self._mm.transition_prob[(state, state_letters[0])]

        # -- recursion
        # first loop over the training sequence backwards
        # Recursion step: (i = L - 1 ... 1)
        all_indexes = list(range(len(self._seq.emissions) - 1))
        all_indexes.reverse()
        for i in all_indexes:
            # now loop over the letters in the state path
            for main_state in state_letters:
                # calculate the backward value using the appropriate
                # method to prevent underflow errors
                backward_value = self._backward_recursion(
                    main_state, i, backward_var)

                if backward_value is not None:
                    backward_var[(main_state, i)] = backward_value

        # skip the termination step to avoid recalculations -- you should
        # get sequence probabilities using the forward algorithm

        return backward_var
Example #38
 def degenerate_consensus(self):
     # Following the rules adapted from
     # D. R. Cavener: "Comparison of the consensus sequence flanking
     # translational start sites in Drosophila and vertebrates."
     # Nucleic Acids Research 15(4): 1353-1361. (1987).
     # The same rules are used by TRANSFAC.
     degenerate_nucleotide = {
         'A': 'A',
         'C': 'C',
         'G': 'G',
         'T': 'T',
         'AC': 'M',
         'AG': 'R',
         'AT': 'W',
         'CG': 'S',
         'CT': 'Y',
         'GT': 'K',
         'ACG': 'V',
         'ACT': 'H',
         'AGT': 'D',
         'CGT': 'B',
         'ACGT': 'N',
     }
     sequence = ""
     for i in range(self.length):
         def get(nucleotide):
             return self[nucleotide][i]
         nucleotides = sorted(self, key=get, reverse=True)
         counts = [self[c][i] for c in nucleotides]
         # Follow the Cavener rules:
         if counts[0] > sum(counts[1:]) and counts[0] > 2 * counts[1]:
             key = nucleotides[0]
         elif 4 * sum(counts[:2]) > 3 * sum(counts):
             key = "".join(sorted(nucleotides[:2]))
         elif counts[3] == 0:
             key = "".join(sorted(nucleotides[:3]))
         else:
             key = "ACGT"
         nucleotide = degenerate_nucleotide.get(key, key)
         sequence += nucleotide
     if isinstance(self.alphabet, Alphabet.DNAAlphabet):
         alpha = IUPAC.ambiguous_dna
     elif isinstance(self.alphabet, Alphabet.RNAAlphabet):
         alpha = IUPAC.ambiguous_rna
     elif isinstance(self.alphabet, Alphabet.ProteinAlphabet):
         alpha = IUPAC.protein
     else:
         raise Exception("Unknown alphabet")
     return Seq(sequence, alphabet=alpha)
Example #39
    def random_motif(self):
        """Create a random motif within the given parameters.

        This returns a single motif string with letters from the given
        alphabet. The size of the motif will be randomly chosen between
        max_size and min_size.
        """
        motif_size = random.randrange(self._min_size, self._max_size)

        motif = ""
        for letter_num in range(motif_size):
            cur_letter = random.choice(self._alphabet.letters)
            motif += cur_letter

        return MutableSeq(motif, self._alphabet)
Example #40
    def __str__(self, masked=False):
        """Return a string representation of a motif."""
        text = ""
        for inst in self.instances:
            text = text + str(inst) + "\n"

        if masked:
            for i in range(self.length):
                if self.mask[i]:
                    text = text + "*"
                else:
                    text = text + " "
            text = text + "\n"
        return text
Example #41
 def __delitem__(self, item):
     """Delete related distances by the index or name."""
     index = None
     if isinstance(item, int):
         index = item
     elif isinstance(item, str):
         index = self.names.index(item)
     else:
         raise TypeError("Invalid index type.")
     # remove distances related to index
     for i in range(index + 1, len(self)):
         del self.matrix[i][index]
     del self.matrix[index]
     # remove name
     del self.names[index]
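A hedged usage sketch, again assuming Biopython's DistanceMatrix; the values are made up:

from Bio.Phylo.TreeConstruction import DistanceMatrix

dm = DistanceMatrix(names=["Alpha", "Beta", "Gamma"],
                    matrix=[[0], [0.23, 0], [0.38, 0.61, 0]])
del dm["Beta"]
print(dm.names)   # ['Alpha', 'Gamma']
print(dm.matrix)  # [[0], [0.38, 0]]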
Example #42
 def test_solexa_to_sanger(self):
     """Mapping check for FASTQ Solexa (-5 to 62) to Sanger (0 to 62)"""
     #The point of this test is the writing code doesn't actually use the
     #solexa_quality_from_phred function directly. For speed it uses a
     #cached dictionary of the mappings.
     seq = "N" * 68
     qual = "".join(chr(64 + q) for q in range(-5, 63))
     expected_phred = [
         round(QualityIO.phred_quality_from_solexa(q))
         for q in range(-5, 63)
     ]
     in_handle = StringIO("@Test\n%s\n+\n%s" % (seq, qual))
     out_handle = StringIO()
     #Want to ignore the data loss warning
     #(on Python 2.6 we could check for it!)
     warnings.simplefilter('ignore', BiopythonWarning)
     SeqIO.write(SeqIO.parse(in_handle, "fastq-solexa"), out_handle,
                 "fastq-sanger")
     warnings.filters.pop()
     out_handle.seek(0)
     record = SeqIO.read(out_handle, "fastq-sanger")
     self.assertEqual(str(record.seq), seq)
     self.assertEqual(record.letter_annotations["phred_quality"],
                      expected_phred)
Example #43
    def mutate(self, organism):
        """Mutate the organism's genome."""
        mutated_org = organism.copy()

        gene_choices = mutated_org.genome.alphabet.letters

        # potentially mutate any gene in the genome
        for gene_index in range(len(mutated_org.genome)):
            mutation_chance = self._mutation_rand.random()
            # if we have a mutation
            if mutation_chance <= self._mutation_rate:
                # get a new letter
                new_letter = self._switch_rand.choice(gene_choices)
                mutated_org.genome[gene_index] = new_letter

        return mutated_org
Example #44
 def anticonsensus(self):
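     """Return the sequence built from the least common letter at each position."""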
     sequence = ""
     for i in range(self.length):
         try:
             minimum = float("inf")
         except ValueError:
             # On Python 2.5 or older that was handled in C code,
             # and failed on Windows XP 32bit
             minimum = 1E400
         for letter in self.alphabet.letters:
             count = self[letter][i]
             if count < minimum:
                 minimum = count
                 sequence_letter = letter
         sequence += sequence_letter
     return Seq(sequence, self.alphabet)
Example #45
    def search_pwm(self, sequence, normalized=0, masked=0, threshold=0.0, both=True):
        """
        a generator function, returning found hits in a given sequence with the pwm score higher than the threshold
        """
        if both:
            rc = self.reverse_complement()

        sequence = str(sequence).upper()
        for pos in range(0, len(sequence) - self.length + 1):
            score = self.score_hit(sequence, pos, normalized, masked)
            if score > threshold:
                yield (pos, score)
            if both:
                rev_score = rc.score_hit(sequence, pos, normalized, masked)
                if rev_score > threshold:
                    yield (-pos, rev_score)
Example #46
    def mutate(self, organism):
        """Mutate the organism's genome."""
        mutated_org = organism.copy()
        gene_choices = mutated_org.genome.alphabet.letters

        mutation_chance = self._mutation_rand.random()
        if mutation_chance <= self._mutation_rate:
            # pick a gene position to mutate at
            mutation_pos = self._pos_rand.choice(
                list(range(len(mutated_org.genome))))

            # get a new letter to replace the position at
            new_letter = self._switch_rand.choice(gene_choices)

            mutated_org.genome[mutation_pos] = new_letter

        return mutated_org
Example #47
    def add_instance(self, instance):
        """
        Add a new instance to the motif.
        """
        self._check_alphabet(instance.alphabet)
        self._check_length(len(instance))
        if self.has_counts:
            for i in range(self.length):
                let = instance[i]
                self.counts[let][i] += 1

        if self.has_instances or not self.has_counts:
            self.instances.append(instance)
            self.has_instances = True

        self._pwm_is_current = False
        self._log_odds_is_current = False
Example #48
    def log_odds(self, laplace=True):
        """
        Return the log-odds matrix computed for the set of instances.
        """
        if self._log_odds_is_current:
            return self._log_odds
        # we need to compute a new log-odds matrix
        self._log_odds = []
        pwm = self.pwm(laplace)
        for i in range(self.length):
            d = {}
            for a in self.alphabet.letters:
                d[a] = math.log(pwm[i][a] / self.background[a], 2)
            self._log_odds.append(d)
        self._log_odds_is_current = 1
        return self._log_odds
Example #49
    def update(self, inputs):
        """Update the values of the nodes using given inputs.

        Arguments:
         - inputs -- A list of inputs into the network -- this must be
           equal to the number of nodes in the layer.

        """
        if len(inputs) != len(self.values) - 1:
            raise ValueError("Inputs do not match input layer nodes.")

        # set the node values from the inputs
        for input_num in range(len(inputs)):
            self.values[input_num + 1] = inputs[input_num]

        # propagate the update to the next layer
        self._next_layer.update(self)
Example #50
 def consensus(self):
     """Return the consensus sequence."""
     sequence = ""
     for i in range(self.length):
         try:
             maximum = float("-inf")
         except ValueError:
             # On Python 2.5 or older that was handled in C code,
             # and failed on Windows XP 32bit
             maximum = - 1E400
         for letter in self.alphabet:
             count = self[letter][i]
             if count > maximum:
                 maximum = count
                 sequence_letter = letter
         sequence += sequence_letter
     return Seq(sequence)
Example #51
def get_background(target_bed, access_bed, avg_bin_size, min_bin_size):
    """Generate background intervals from target intervals.

    Procedure:

    - Invert target intervals
    - Subtract the inverted targets from accessible regions
    - For each of the resulting regions:

        - Shrink by a fixed margin on each end
        - If it's smaller than min_bin_size, skip
        - Divide into equal-size (region_size/avg_bin_size) portions
        - Emit the (chrom, start, end) coords of each portion
    """
    target_chroms = group_coords(RA.read(target_bed).coords())
    if access_bed:
        # Chromosome accessible sequence regions are given -- use them
        access_chroms = group_coords(RA.read(access_bed).coords())
    else:
        # Chromosome accessible sequence regions not known -- use heuristics
        # (chromosome length is endpoint of last probe; skip initial
        # <magic number> of bases that are probably telomeric)
        TELOMERE_SIZE = 150000
        access_chroms = guess_chromosome_regions(target_chroms, TELOMERE_SIZE)

    backgrounds = find_background_regions(access_chroms, target_chroms,
                                          2 * INSERT_SIZE)
    # Emit regions as antitarget bins according to avg_bin_size and min_bin_size
    # Do a set operation on backgrounds to avoid any duplicate regions
    for chrom, start, end in sorted(backgrounds, key=core.sorter_chrom_at(0)):
        span = end - start
        if span >= min_bin_size:
            nbins = int(round(span / avg_bin_size)) or 1
            if nbins == 1:
                yield (chrom, start, end)
            else:
                # Divide the background region into equal-sized bins
                bin_size = span / nbins
                bin_start = start
                bin_end = None
                for i in range(1, nbins):
                    bin_end = start + int(i * bin_size)
                    yield (chrom, bin_start, bin_end)
                    bin_start = bin_end
                yield (chrom, bin_start, end)
Example #52
    def _generate_locs(self, bound):
        """Generalized Location Generator.

        Arguments:

        - bound (int)   - upper bound

        Returns: [0]+x_0...x_n+[bound] where n=self._npoints-1
        and 0 < x_0 < x_1 ... < bound
        """
        results = []
        for increment in range(self._npoints):
            x = random.randint(1, bound - 1)
            while (x in results):  # uniqueness
                x = random.randint(1, bound - 1)
            results.append(x)
        results.sort()  # sorted
        return [0] + results + [bound]  # [0, +n points+, bound]
Example #53
    def dist_pearson(self, other):
        """Return the similarity score based on pearson correlation for the given motif against self.

        We use the Pearson's correlation of the respective probabilities.
        """
        if self.alphabet != other.alphabet:
            raise ValueError("Cannot compare motifs with different alphabets")

        max_p = -2
        for offset in range(-self.length + 1, other.length):
            if offset < 0:
                p = self.dist_pearson_at(other, -offset)
            else:  # offset>=0
                p = other.dist_pearson_at(self, offset)
            if max_p < p:
                max_p = p
                max_o = -offset
        return 1 - max_p, max_o
Example #54
 def search(self, sequence, threshold=0.0, both=True):
     """
     a generator function, returning found hits in a given sequence with the pwm score higher than the threshold
     """
     sequence = sequence.upper()
     n = len(sequence)
     m = self.length
     if both:
         rc = self.reverse_complement()
     for position in range(0, n - m + 1):
         s = sequence[position:position + m]
         score = self.calculate(s)
         if score > threshold:
             yield (position, score)
         if both:
             score = rc.calculate(s)
             if score > threshold:
                 yield (position - n, score)
Example #55
    def __init__(self, num_nodes, has_bias_node):
        """Initialize the layer.

        Arguments:
         - num_nodes -- The number of nodes that are contained in this layer.
         - has_bias_node -- Specify whether or not this node has a bias
           node. This node is not included in the number of nodes in the
           network, but is used in constructing and dealing with the network.

        """
        # specify all of the nodes in the network
        if has_bias_node:
            lower_range = 0
        else:
            lower_range = 1

        self.nodes = list(range(lower_range, num_nodes + 1))

        self.weights = {}
Example #56
    def make_counts_from_instances(self):
        """Creates the count matrix for a motif with instances."""
        # count the letters observed in each column of the aligned instances
        counts = {}
        for a in self.alphabet.letters:
            counts[a] = []
        self.has_counts = True
        s = len(self.instances)
        for i in range(self.length):
            ci = dict((a, 0) for a in self.alphabet.letters)
            for inst in self.instances:
                ci[inst[i]] += 1
            for a in self.alphabet.letters:
                counts[a].append(ci[a])
        self.counts = counts
        return counts
Example #57
    def dist_dpq_at(self, other, offset):
        """
        Calculates the dist_dpq measure with a given offset.

        offset should satisfy 0 <= offset <= len(self)
        """
        def dpq(f1, f2, alpha):
            s = 0
            for n in alpha.letters:
                avg = (f1[n] + f2[n]) / 2
                s += f1[n] * math.log(f1[n] / avg, 2) + f2[n] * math.log(f2[n] / avg, 2)
            return math.sqrt(s)

        s = 0
        for i in range(max(self.length, offset + other.length)):
            f1 = self[i]
            f2 = other[i - offset]
            s += dpq(f1, f2, self.alphabet)
        return s
Example #58
 def test_phred_quality_from_solexa(self):
     """Mapping check for function phred_quality_from_solexa"""
     self.assertEqual(1, round(QualityIO.phred_quality_from_solexa(-5)))
     self.assertEqual(1, round(QualityIO.phred_quality_from_solexa(-4)))
     self.assertEqual(2, round(QualityIO.phred_quality_from_solexa(-3)))
     self.assertEqual(2, round(QualityIO.phred_quality_from_solexa(-2)))
     self.assertEqual(3, round(QualityIO.phred_quality_from_solexa(-1)))
     self.assertEqual(3, round(QualityIO.phred_quality_from_solexa(0)))
     self.assertEqual(4, round(QualityIO.phred_quality_from_solexa(1)))
     self.assertEqual(4, round(QualityIO.phred_quality_from_solexa(2)))
     self.assertEqual(5, round(QualityIO.phred_quality_from_solexa(3)))
     self.assertEqual(5, round(QualityIO.phred_quality_from_solexa(4)))
     self.assertEqual(6, round(QualityIO.phred_quality_from_solexa(5)))
     self.assertEqual(7, round(QualityIO.phred_quality_from_solexa(6)))
     self.assertEqual(8, round(QualityIO.phred_quality_from_solexa(7)))
     self.assertEqual(9, round(QualityIO.phred_quality_from_solexa(8)))
     self.assertEqual(10, round(QualityIO.phred_quality_from_solexa(9)))
     for i in range(10, 100):
         self.assertEqual(i, round(QualityIO.phred_quality_from_solexa(i)))
Example #59
 def score_hit(self, sequence, position, normalized=0, masked=0):
     """
     Give the pwm score for a given position.
     """
     lo = self.log_odds()
     score = 0.0
     for pos in range(self.length):
         a = sequence[position + pos]
         if not masked or self.mask[pos]:
             try:
                 score += lo[pos][a]
             except KeyError:  # ignore letters outside the log-odds alphabet
                 pass
     if normalized:
         if not masked:
             score /= self.length
         else:
             score /= len([x for x in self.mask if x])
     return score
Example #60
    def log_odds(self, background=None):
        """
        returns the Position-Specific Scoring Matrix.

        The Position-Specific Scoring Matrix (PSSM) contains the log-odds
        scores computed from the probability matrix and the background
        probabilities. If the background is None, a uniform background
        distribution is assumed.
        """
        values = {}
        alphabet = self.alphabet
        if background is None:
            background = dict.fromkeys(self._letters, 1.0)
        else:
            background = dict(background)
        total = sum(background.values())
        for letter in alphabet.letters:
            background[letter] /= total
            values[letter] = []
        for i in range(self.length):
            for letter in alphabet.letters:
                b = background[letter]
                if b > 0:
                    p = self[letter][i]
                    if p > 0:
                        logodds = math.log(p / b, 2)
                    else:
                        #TODO - Ensure this has unittest coverage!
                        try:
                            logodds = float("-inf")
                        except ValueError:
                            # On Python 2.5 or older that was handled in C code,
                            # and failed on Windows XP 32bit
                            logodds = -1E400
                else:
                    p = self[letter][i]
                    if p > 0:
                        logodds = float("inf")
                    else:
                        logodds = _nan
                values[letter].append(logodds)
        pssm = PositionSpecificScoringMatrix(alphabet, values)
        return pssm
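A hedged usage sketch, assuming this is the log_odds method of a Bio.motifs position-weight matrix (continuing the normalize example further up the page); the instances are made up:

from Bio import motifs
from Bio.Seq import Seq

m = motifs.create([Seq("TACAA"), Seq("TACGC"), Seq("TACAC")])
pssm = m.counts.normalize(pseudocounts=0.5).log_odds()
print(pssm)        # per-position log2 odds against a uniform background
print(pssm.std())  # standard deviation of the motif score (compare the std method shown above)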