Example #1
    def score(self, cost='chunkiness'):

        transitions = [self.model.chunkiness(n1, n2)
                       for n1, n2 in utils.neighbors(self)]

        transitions = (np.array(transitions) + .001).clip(0, 1)  # smoothing
        return np.prod(transitions) ** (1/len(self.utterance))
Example #2
def train_bigram(graph, corpus):
    for sentence in corpus:
        bigrams = utils.neighbors(sentence.split(' '))
        for word1, word2 in bigrams:
            node1 = graph.get(word1, add=True)
            node2 = graph.get(word2, add=True)
            node1.bump_edge(node2)
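The two snippets above, and several later sequence-based examples, treat utils.neighbors as an iterator over adjacent pairs. The helper itself is not shown on this page (and later examples reuse the name for grid and k-mer variants), so here is a minimal sketch of the assumed behavior; the actual utils.neighbors in these projects may differ.

from itertools import tee


def neighbors(seq):
    """Yield each adjacent pair (seq[i], seq[i+1]) of a sequence.

    Assumed behavior, inferred from how the examples on this page call it.
    """
    a, b = tee(iter(seq))
    next(b, None)  # advance the second iterator by one element
    return zip(a, b)

With that definition, list(neighbors('to be or'.split())) gives [('to', 'be'), ('be', 'or')], which is exactly the bigram stream that train_bigram feeds into bump_edge.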
Example #3
File: main.py Project: fredcallaway/danish
def prepare(corpus, distributed):
    # Nets are trained to predict the next phoneme.
    inputs, targets = zip(*utils.neighbors(corpus))

    # Encode phonemes into numeric representations.
    encoding = corpora.get_encoding(distributed=distributed)

    return [encoding[c] for c in inputs], [encoding[c] for c in targets]
Example #4
File: main.py Project: fredcallaway/danish
def extract_boundaries(corpus):
    """Returns a corpus with boundaries removed, and boundary markers.

    XABXDEFXHXIJKLX -> ABDEFHIJKL, 0100110001"""
    pairs = utils.neighbors(corpus)
    phones_and_boundaries = ((phone, nxt in 'XQ')      # phone, precedes_boundary
                             for phone, nxt in pairs  # for all adjacent pairs
                             if phone != 'X')         # except ones that lead with a boundary
    return phones_and_boundaries
Example #5
def _frequent_words_helper(text: str, k: int, d: int) -> collections.Counter:
    freq_map = collections.Counter()
    n = len(text)
    for i in range(n - k + 1):
        pattern = text[i:i + k]
        neighborhood = neighbors(pattern, d)
        for neighbor in neighborhood:
            freq_map[neighbor] += 1
    return freq_map
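Here neighbors(pattern, d) is the d-neighborhood generator from k-mer counting rather than the pairwise helper above. A self-contained sketch of that routine (the project's own version may differ) is:

def hamming_distance(a: str, b: str) -> int:
    """Number of positions at which two equal-length strings differ."""
    return sum(x != y for x, y in zip(a, b))


def neighbors(pattern: str, d: int) -> set:
    """All DNA strings within Hamming distance d of pattern (a sketch)."""
    if d == 0:
        return {pattern}
    if len(pattern) == 1:
        return {'A', 'C', 'G', 'T'}
    neighborhood = set()
    for suffix in neighbors(pattern[1:], d):
        if hamming_distance(pattern[1:], suffix) < d:
            # A mismatch budget remains, so any first base is allowed.
            for base in 'ACGT':
                neighborhood.add(base + suffix)
        else:
            # The budget is spent; the first base must stay unchanged.
            neighborhood.add(pattern[0] + suffix)
    return neighborhood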
Example #6
def bleu(lst1, lst2, order=2):
    """The percentage of N-grams (N = `order`) that are shared in two lists.

    Note that the metric is sensitive to the number of times a given
    pair occurs in each list. The lists are assumed to contain the
    same elements.
    
    [1,2,3] [3,1,2] -> 0.5
    [1,2,3,1,2], [1,2,2,3,1] -> 0.75
    """
    if order > len(lst1):
        return None
    ngrams1 = Counter(utils.neighbors(lst1, n=order))
    ngrams2 = Counter(utils.neighbors(lst2, n=order))
    num_shared = sum((ngrams1 & ngrams2).values())
    possible = sum(ngrams1.values())
    result = num_shared / possible
    assert 0 <= result <= 1
    return result
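Note that this call passes n=order, so the real utils.neighbors evidently also supports windows longer than two. Working through the docstring's second example with bigrams shows where the 0.75 comes from:

# [1, 2, 3, 1, 2]  -> bigrams {(1, 2): 2, (2, 3): 1, (3, 1): 1}   (4 in total)
# [1, 2, 2, 3, 1]  -> bigrams {(1, 2): 1, (2, 2): 1, (2, 3): 1, (3, 1): 1}
# Counter & Counter keeps the smaller count per key: 1 + 1 + 1 = 3 shared
# num_shared / possible = 3 / 4 = 0.75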
Example #7
def tokenize(w, h, pix):
    """We tokenize an image such that there is a token for each pixel
    and each of its neighboring pixels, so that each neighbor is
    equally likely to occur after any given pixel.

    (And we ignore the outermost pixels for simplicity's sake.)
    """
    for y in range(1, h - 1):
        for x in range(1, w - 1):
            for nx, ny in utils.neighbors(x, y):
                yield pix[x, y]
                yield pix[nx, ny]
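This example, along with #8 and #17 below, calls utils.neighbors(x, y) on pixel coordinates instead of a sequence. A plausible sketch of that variant (the real helper may use a 4-neighborhood or a different ordering) is:

def neighbors(x, y):
    """The eight coordinates surrounding (x, y); no bounds checking,
    since the callers above either avoid the border or filter the
    results themselves."""
    return [(x + dx, y + dy)
            for dx in (-1, 0, 1)
            for dy in (-1, 0, 1)
            if (dx, dy) != (0, 0)]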
Example #8
def random_walk_fill(actual_w, actual_h, target_pix, pix_stream, draw):
    scale = 0.1
    w = int(actual_w * scale)
    h = int(actual_h * scale)
    pixel_radius = int(math.ceil(1 / scale) * 0.5)

    cx = int(w * .33)
    cy = h // 2  # integer division keeps the start cell an int, like cx

    visited = set()
    q = [(cx, cy)]

    def random_sort():
        x_factor = 1 if random.random() < 0.5 else -1
        y_factor = 1 if random.random() < 0.5 else -1
        sort_by_x = random.random() < 0.5

        def sort_func(coord):
            # Tuple parameters are Python 2 syntax; unpack inside the body instead.
            x, y = coord
            x *= x_factor
            y *= y_factor
            return (x, y) if sort_by_x else (y, x)

        return sort_func

    sort_func = random_sort()
    sort_mutation_chance = 0.75

    def is_valid_coord(coord):
        x, y = coord
        return (x, y) not in visited and 0 <= x < w and 0 <= y < h

    while q:
        x, y = q.pop()
        sx, sy = x / scale, y / scale

        bounding_box = [
            (sx - pixel_radius, sy - pixel_radius),
            (sx + pixel_radius, sy + pixel_radius)
        ]
        draw.ellipse(bounding_box, fill=next(pix_stream))

        visited.add((x, y))

        q.extend(
            sorted(
                filter(is_valid_coord, utils.neighbors(x, y)),
                key=sort_func))

        if random.random() < sort_mutation_chance:
            # print 'changing sort func! (q: %s; v: %s; t: %s)' % (
            #     len(q), len(visited), (w * h))
            sort_func = random_sort()
Example #9
def bfs(start, finish):  # --> [(1,2), ...]
    visited = [row[:] for row in maze]
    q = deque([(start, [start])])
    while q:
        curr, path = q.popleft()
        r, c = curr
        visited[r][c] = 1
        for nr, nc in neighbors(curr):
            if (nr, nc) == finish:
                return path + [finish]
            elif visited[nr][nc] == 0:
                visited[nr][nc] = 1
                q.append(((nr, nc), path + [(nr, nc)]))
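In this snippet neighbors(curr) and the module-level maze come from the surrounding script. A sketch of a matching helper, assuming maze is a 0/1 grid where 1 marks walls, might be:

def neighbors(cell):
    """In-bounds four-neighbors of a (row, col) cell of the global maze.
    Walls need no special handling here: bfs() copies the maze into
    visited, so wall cells already read as visited."""
    r, c = cell
    candidates = [(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)]
    return [(nr, nc) for nr, nc in candidates
            if 0 <= nr < len(maze) and 0 <= nc < len(maze[0])]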
Example #10
    def _one_run(self, i):
        active = set()
        neighbors = defaultdict(int)
        for point in self.active:
            for neighbor in utils.neighbors(point):
                neighbors[neighbor] += 1

        for neighbor in neighbors:
            if neighbor in self.active and neighbors[neighbor] in (2, 3):
                active.add(neighbor)
            elif neighbor not in self.active and neighbors[neighbor] == 3:
                active.add(neighbor)

        self.active = active
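Here utils.neighbors(point) receives a whole coordinate tuple, as in a Game-of-Life-style simulation over active cells. A sketch that works in any number of dimensions (an assumption; the project's helper may be fixed to 2D or 3D) is:

from itertools import product


def neighbors(point):
    """Every cell adjacent to point (diagonals included), excluding point itself."""
    return [tuple(c + d for c, d in zip(point, delta))
            for delta in product((-1, 0, 1), repeat=len(point))
            if any(delta)]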
Example #11
def test_word_segmentation(correct_boundaries, predicted_boundaries):
    # get words as tuples of beginning and end indices
    correct_indices = np.nonzero(correct_boundaries)[0]
    predicted_indices = np.nonzero(predicted_boundaries)[0]
    correct_words = set(map(tuple, utils.neighbors(correct_indices)))
    predicted_words = set(map(tuple, utils.neighbors(predicted_indices)))
    joblib.dump((correct_words, predicted_words), 'words.pkl')
    hits = len(correct_words & predicted_words)
    alarms = len(predicted_words - correct_words)
    misses = len(correct_words - predicted_words)

    if hits == 0:
        # avoid zero division
        precision = 0
        recall = 0
        F = 0
    else:
        precision = hits / (hits+alarms)
        recall = hits / (hits+misses)
        F = 2 * (precision * recall) / (precision+recall)

    return {'word_precision': precision,
            'word_recall': recall,
            'word_F': F}
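For intuition, each consecutive pair of boundary indices delimits one word, so the two segmentations agree on a word only if they agree on both of its edges. A small hypothetical case:

# correct boundary indices   [2, 5, 9]     -> words {(2, 5), (5, 9)}
# predicted boundary indices [2, 5, 7, 9]  -> words {(2, 5), (5, 7), (7, 9)}
# hits = 1, alarms = 2, misses = 1
# precision = 1/3, recall = 1/2, F = 0.4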
Example #12
    def try_to_chunk(self) -> None:
        """Attempts to combine two Nodes in memory into one Node.

        Returns True for success, False for failure.
        """

        if len(self.memory) == 1:
            # We can't create a chunk when there's only one node left.
            # This can only happen while processing the tail, so we
            # must be done processing
            self.log.debug('done parsing')
            return None

        # Consider chunking all adjacent nodes in memory, except
        # boundary markers.
        chunkable = (n for n in self.memory if n.id_string != 'ø')
        pairs = list(utils.neighbors(chunkable))
        if not pairs:
            return

        chunkinesses = [self.model.chunkiness(node1, node2)
                        for node1, node2 in pairs]
        
        best_idx = np.argmax(chunkinesses)
        best_chunkiness = chunkinesses[best_idx]

        # See if the best pair already forms a chunk in the graph.
        chunk = self.model.get_chunk(*pairs[best_idx], create=False)
        # If the chunk doesn't exist but its chunkiness exceeds the
        # threshold, create it and add it to the graph.
        if not chunk and best_chunkiness > self.params['CHUNK_THRESHOLD']:
            chunk = self.model.get_chunk(*pairs[best_idx], create=True, add=self.learn)

        if chunk:
            # Replace the two nodes in memory with the single chunk
            best_chunk = self.model.get_chunk(*pairs[best_idx], create=True)
            self.memory[best_idx] = best_chunk
            del self.memory[best_idx+1]
            self.chunkinesses.append(best_chunkiness)
            self.log.debug('create chunk: %s', best_chunk)
            return best_idx

        else:
            self.log.debug('no chunk created')
            return None
Example #13
def main():

    # A PCFG that has free slots for a noun and a determiner.
    toy_pcfg = '''
    S    -> NP VP    [1.0]
    VP   -> V NP     [0.5]
    VP   -> V        [0.5]
    NP   -> Det N    [0.5]
    NP   -> Name     [0.5]
    V    -> 'saw'    [0.5]
    V    -> 'ate'    [0.5]
    N    -> 'boy'    [0.5]
    N    -> '{NOUN}' [0.5]
    Name -> 'Jack'   [0.5]
    Name -> 'Bob'    [0.5]
    Det  -> 'the'    [0.5]
    Det  -> '{DET}'  [0.5]
    '''

    # Instantiate two versions of the above PCFG with (that, table)
    # and (my, bunny) as (DET, NOUN). As a result, neither PCFG can
    # generate "that bunny" or "my table".
    that_table_pcfg = toy_pcfg.format(DET='that', NOUN='table')
    my_bunny_pcfg = toy_pcfg.format(DET='my', NOUN='bunny')

    corpus = (list(pcfg.random_sentences(that_table_pcfg, 100))
              + list(pcfg.random_sentences(my_bunny_pcfg, 100)))

    # Mix sentences from each corpus. This is essential for the dynamic
    # generalization algorithm to work.
    np.random.shuffle(corpus)

    # Check how many times each critical pair occurred.
    bigrams = Counter(itertools.chain(*(list(utils.neighbors(utt.split(' '))) for utt in corpus)))
    for det in 'that', 'my':
        for noun in 'table', 'bunny':
            print(det, noun, ':', bigrams[(det, noun)])
    
    # Train the graph on the markov transitions in the combined corpus.
    graph = VectorGraph(DYNAMIC=True, COMPOSITION=1)
    train_bigram(graph, corpus)
    
    generalization(graph)
    composition(graph)
Example #14
def motif_enumeration(dna: List[str], k: int, d: int) -> Set[str]:
    """Search for all (k, d)-motifs in dna with brute force.
    O(t * n * X * t * n)
        where t = len(dna)
              n = len(dna[0])
              X = time complexity of neighbors()

    >>> motif_enumeration({"ATTTGGC", "TGCCTTA", "CGGTATC", "GAAAATT"}, 3, 1) == {"ATA", "ATT", "GTT", "TTT"}
    True
    """
    seen = set()
    res = set()
    for seq in dna:
        for seed in sliding_window(seq, k):
            if seed in seen:
                continue
            seen.add(seed)
            for pattern in neighbors(seed, d):
                if all(hamming_distance_str(pattern, s) <= d for s in dna):
                    res.add(pattern)
    return res
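Besides neighbors (the d-neighborhood routine sketched under Example #5), this example assumes two more helpers. Minimal versions consistent with how they are called here could be:

def sliding_window(seq: str, k: int):
    """Yield every length-k window of seq, left to right."""
    for i in range(len(seq) - k + 1):
        yield seq[i:i + k]


def hamming_distance_str(pattern: str, text: str) -> int:
    """Smallest Hamming distance between pattern and any same-length window
    of text, so a motif counts as present if some window is within d
    mismatches. (An assumption about the intended semantics.)"""
    k = len(pattern)
    return min(sum(a != b for a, b in zip(pattern, text[i:i + k]))
               for i in range(len(text) - k + 1))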
Example #15
    def __add__(self, x):
        """
        Adds a new element x to the sets that represent islands.
        If the new element neighbors multiple existing sets,
        the corresponding islands are merged.
        :param x: [int] new element to be added
        :return: [None]
        """
        x_neighbors = []
        for s, S in enumerate(self.setList):
            if any([neighbors(x, y, self.width, self.height) for y in S]):
                x_neighbors.append(s)
        if len(x_neighbors) < 1:
            self.setList.append(set([x]))
        elif len(x_neighbors) == 1:
            self.setList[x_neighbors[0]].add(x)
        else:
            new_set = set([x])
            for neighbor_set in x_neighbors:
                new_set.update(self.setList[neighbor_set])
            self.setList = [s for i, s in enumerate(self.setList) if i not in x_neighbors]
            self.setList.append(new_set)
Example #16
    def __init__(self, grid):
        """
        Given a grid of Boolean values, creates an Islands instance containing a
        collection of sets, each representing an island (see above def. of an island).
        :param grid: [2D bools] where True denotes land, False water
        """
        if len(grid) == 0 or len(grid[0]) == 0:
            raise ValueError
        lands = []
        # Rows index the outer list and columns the inner one, so height comes
        # from len(grid) and width from len(grid[0]).
        self.height, self.width = len(grid), len(grid[0])
        for i in range(self.height):
            for j in range(self.width):
                if grid[i][j]:
                    lands.append(i * self.width + j)
        self.sets = DisjointSet(lands)
        edges = []
        for x in lands:
            for y in [x - 1, x + 1, x - self.width, x + self.width]:
                if neighbors(x, y, self.width,
                             self.height):  # if x == y returns False
                    edges.append((x, y))
        for x, y in edges:
            self.sets.union(x, y)
        self.sets.filter()
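Both Islands examples treat neighbors(x, y, width, height) as a predicate over flattened cell indices rather than a generator. A sketch consistent with how it is called above (returning False for x == y and for out-of-grid indices) might be:

def neighbors(a, b, width, height):
    """True when flattened indices a and b denote horizontally or
    vertically adjacent cells of a width x height grid laid out
    row-major; False for identical or out-of-range indices."""
    row_a, col_a = divmod(a, width)
    row_b, col_b = divmod(b, width)
    if not (0 <= row_a < height and 0 <= row_b < height):
        return False
    return abs(row_a - row_b) + abs(col_a - col_b) == 1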
Example #17
def less_simple_fill(w, h, target_pix, pix_stream):
    for y in range(2, h - 2, 2):
        for x in range(2, w - 2, 2):
            target_pix[x, y] = next(pix_stream)
            for nx, ny in utils.neighbors(x, y):
                target_pix[nx, ny] = next(pix_stream)
Example #18
def test_easy(model):
    model.params['CHUNK_THRESHOLD'] = 2
    # One simple utterance 50 times.
    utterance = 'a b a c a b d'
    corpus = [utterance] * 50
    model.parse(corpus[0])
    print(utils.log_parse(model, corpus[0]))
    a, b, c, d = (model.graph[x] for x in 'abcd')  # node objects
    def weight(edge, n1, n2):
        return n1.edge_weight(n2, edge)

    # Check that all connections are positive after one utterance
    for x, y in utils.neighbors(utterance.split(' ')):
        assert weight('ftp', model.graph[x], model.graph[y])
        assert weight('btp', model.graph[y], model.graph[x])

    # Equal conditional probability, but more evidence
    #assert weight('btp', b, a) < weight('btp', c, a)

    model.fit(corpus)

    # Check that weights don't change when they shouldn't change.
    w1 = weight('ftp', a, b)
    model.parse('b c')
    w2 = weight('ftp', a, b)
    assert w1 - w2 < .001

    w1 = weight('btp', b, a)
    model.parse('d a d a d a d a d a d a')
    w2 = weight('btp', b, a)
    assert w1 - w2 < .001

    
    # Check that more common edges are more highly weighted.
    # We vary the conditional (ab | a) and raw (ab) probabilities.
    # Reference: a b a c a b d

    # Higher conditional, higher raw.
    assert weight('ftp', a, b) > weight('ftp', a, c)
    
    # Higher conditional, equal raw.
    assert weight('ftp', c, a) > weight('ftp', b, d)

    return  # TODO

    # Equal conditional, higher raw. But lots of evidence for both.
    print()
    print(weight('btp', c, a, verbose=True))
    assert 0

    assert weight('btp', b, a) - weight('btp', c, a) < 0.001

    
    # This always fails for vector. The edge weights do not really
    # represent probabilities. They are more sensitive to the raw
    # occurrence counts.
    # p(ab | a) = 0.66
    # p(ca | c) = 1
    # p(ab) = 0.4
    # p(ca) = 0.2
    #assert weight('ftp', c, a) > weight('ftp', a, b)

    assert weight('ftp', a, a) < 0.05
    assert weight('ftp', b, b) < 0.05
    assert weight('ftp', c, c) < 0.05
    assert weight('ftp', b, c) < 0.05