Example no. 1
def all_combsi(lol):
    # Enumerate every combination that takes one element from each list in `lol`
    # (the Cartesian product), by decoding a running index in mixed radix.
    # Python 2 code: relies on xrange, the built-in reduce, and integer division.
    lens = [len(x) for x in lol]
    num_combs = reduce(lambda x, y: x * y, lens, 1)
    for i in xrange(num_combs):
        tmp = [0] * len(lol)
        for j in xrange(len(tmp)):
            tmp[j] = lol[j][i % lens[j]]
            i = i / lens[j]
        yield tmp
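For comparison, a minimal Python 3 sketch (not from the original project) that enumerates the same Cartesian product with itertools.product:

from itertools import product

def all_combs_py3(lol):
    # Yields one element from each inner list, as all_combsi does,
    # though possibly in a different order.
    for combo in product(*lol):
        yield list(combo)

# list(all_combs_py3([[1, 2], ['a', 'b']])) -> [[1, 'a'], [1, 'b'], [2, 'a'], [2, 'b']]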
Example no. 3
def pk(ref, hyp, k=None, boundary='1'):
    """
    Compute the Pk metric for a pair of segmentations. A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
    '0.00'

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size; if None, set to half of the average reference segment length
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    """

    if k is None:
        k = int(round(len(ref) / (ref.count(boundary) * 2.)))

    err = 0
    for i in xrange(len(ref) - k + 1):
        r = ref[i:i + k].count(boundary) > 0
        h = hyp[i:i + k].count(boundary) > 0
        if r != h:
            err += 1
    return err / (len(ref) - k + 1.)
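A quick usage sketch for the function above (run where xrange is defined, e.g. Python 2 or with xrange = range); with k=None the window defaults to half the average reference segment length:

ref = '0100' * 100            # reference segmentation, boundaries marked with '1'
hyp = '0010' * 100            # hypothesis with every boundary shifted by one position
print('%.2f' % pk(ref, hyp))          # k defaults to int(round(len(ref) / (2. * ref.count('1')))) == 2
print('%.2f' % pk(ref, hyp, k=3))     # explicit window size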
Example no. 4
    def score(self, text):
        # Sum log-probabilities of every n-gram of length L in the text;
        # unseen n-grams fall back to the floor value.
        score = 0
        ngrams = self.ngrams.__getitem__
        for i in xrange(len(text) - self.L + 1):
            ngram = text[i : i + self.L]
            if ngram in self.ngrams:
                score += ngrams(ngram)
            else:
                score += self.floor
        return score
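A minimal sketch of how the attributes this method relies on (self.L, self.ngrams, self.floor) might be built; the class name and corpus handling are assumptions, not part of the original project:

import math
from collections import Counter

class NgramScorer(object):
    # Hypothetical container for the score() method shown above.
    def __init__(self, training_text, L=4):
        self.L = L
        counts = Counter(training_text[i:i + L]
                         for i in range(len(training_text) - L + 1))
        total = float(sum(counts.values()))
        # log10 probabilities for n-grams seen in training...
        self.ngrams = {g: math.log10(c / total) for g, c in counts.items()}
        # ...and a floor log-probability for everything unseen
        self.floor = math.log10(0.01 / total)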
Example no. 6
def create_col_array(matrix, matrix_terms_len):
    # Convert each column of a sparse term matrix into a dense numpy array.
    import numpy as np
    from nltk.compat import xrange
    array = []
    for i in xrange(matrix_terms_len):
        col = np.array(matrix[:, i].T.toarray())
        array.append(col)
    return array
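A usage sketch assuming matrix is a SciPy sparse matrix (for example a document-term matrix from scikit-learn's CountVectorizer):

import numpy as np
from scipy.sparse import csr_matrix

matrix = csr_matrix(np.array([[1, 0, 2],
                              [0, 3, 0]]))
cols = create_col_array(matrix, matrix.shape[1])
# cols[0] is a dense 1 x n_documents array holding the first term column.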
Example no. 7
def pk(ref, hyp, k=None, boundary="1"):
    """
    Compute the Pk metric for a pair of segmentations. A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> s1 = "00000010000000001000000"
    >>> s2 = "00000001000000010000000"
    >>> s3 = "00010000000000000001000"
    >>> pk(s1, s1, 3)
    0.0
    >>> pk(s1, s2, 3)
    0.095238...
    >>> pk(s2, s3, 3)
    0.190476...

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size; if None, set to half of the average reference segment length
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    """

    if k is None:
        k = int(round(len(ref) / (ref.count(boundary) * 2.0)))

    n_considered_seg = len(ref) - k + 1
    n_same_ref = 0.0
    n_false_alarm = 0.0
    n_miss = 0.0

    for i in xrange(n_considered_seg):
        bsame_ref_seg = False
        bsame_hyp_seg = False

        if boundary not in ref[(i + 1) : (i + k)]:
            n_same_ref += 1.0
            bsame_ref_seg = True
        if boundary not in hyp[(i + 1) : (i + k)]:
            bsame_hyp_seg = True

        if bsame_hyp_seg and not bsame_ref_seg:
            n_miss += 1
        if bsame_ref_seg and not bsame_hyp_seg:
            n_false_alarm += 1

    prob_same_ref = n_same_ref / n_considered_seg
    prob_diff_ref = 1 - prob_same_ref
    prob_miss = n_miss / n_considered_seg
    prob_false_alarm = n_false_alarm / n_considered_seg

    return prob_miss * prob_diff_ref + prob_false_alarm * prob_same_ref
Example no. 8
def createNumpyArray(sentences, windowsize, word2Idx, label2Idx):
    unknownIdx = word2Idx['UNK']
    paddingIdx = word2Idx['MASK']

    xMatrix = []
    yVector = []

    wordCount = 0
    unknownWordCount = 0

    for sentence in sentences:
        targetWordIdx = 0

        for targetWordIdx in xrange(len(sentence)):

            # Get the context of the target word and map these words to the index in the embeddings matrix
            wordIndices = []
            for wordPosition in xrange(targetWordIdx - windowsize, targetWordIdx + windowsize + 1):
                if wordPosition < 0 or wordPosition >= len(sentence):
                    wordIndices.append(paddingIdx)
                    continue

                word = sentence[wordPosition][0]
                wordCount += 1
                if word in word2Idx:
                    wordIdx = word2Idx[word]
                else:
                    # unseen words fall back to the UNK index
                    wordIdx = unknownIdx
                    unknownWordCount += 1

                wordIndices.append(wordIdx)

            # Get the label and map to int
            labelIdx = label2Idx[sentence[targetWordIdx][1]]

            xMatrix.append(wordIndices)
            yVector.append(labelIdx)

    print("Unknowns: %.2f%%" % (unknownWordCount / float(wordCount) * 100))
    return (np.asarray(xMatrix, dtype='int32'), np.asarray(yVector, dtype='int32'))
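A usage sketch with toy inputs (the vocabulary, labels, and sentence below are made up for illustration; sentences are expected to be lists of (word, label) pairs, and xrange must be available):

word2Idx = {'MASK': 0, 'UNK': 1, 'the': 2, 'dog': 3, 'barks': 4}
label2Idx = {'O': 0, 'NOUN': 1, 'VERB': 2}
sentences = [[('the', 'O'), ('dog', 'NOUN'), ('barks', 'VERB')]]

X, y = createNumpyArray(sentences, windowsize=1, word2Idx=word2Idx, label2Idx=label2Idx)
# X has shape (3, 3): one row of context indices per target word, padded with MASK at the edges.
# y has shape (3,): the label index of each target word.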
def test_spinner():
    rand_review = random.choice(positive_reviews)
    s = rand_review.text.lower()
    print("Original:", s)
    word_tokens = nltk.tokenize.word_tokenize(s)
    for index in xrange(len(word_tokens) - 2):
        if random.random() < 0.2:  # 20% chance of replacement
            k = (word_tokens[index], word_tokens[index + 2])
            if k in trigram_probabilities:
                w = random_sample(trigram_probabilities[k])
                word_tokens[index + 1] = w

    spun = " ".join(word_tokens)
    spun = spun.replace(" .", ".").replace(" '", "'").replace(" ,", ",")
    spun = spun.replace("$ ", "$").replace(" !", "!")
    print("Spun:", spun)
    def parse(self, tokens):
        """
        Parses the input tokens with respect to the parser's grammar.  Parsing
        is accomplished by representing the search-space of possible parses as
        a fully-connected directed graph.  Arcs that would lead to ungrammatical
        parses are removed and a lattice is constructed of length n, where n is
        the number of input tokens, to represent all possible grammatical
        traversals.  All possible paths through the lattice are then enumerated
        to produce the set of non-projective parses.

        :param tokens: A list of tokens to parse.
        :type tokens: list(str)
        :return: An iterator of non-projective parses.
        :rtype: iter(DependencyGraph)
        """
        # Create graph representation of tokens
        self._graph = DependencyGraph()

        for index, token in enumerate(tokens):
            self._graph.nodes[index] = {
                'word': token,
                'deps': [],
                'rel': 'NTOP',
                'address': index,
            }

        for head_node in self._graph.nodes.values():
            deps = []
            for dep_node in self._graph.nodes.values():
                if (self._grammar.contains(head_node['word'], dep_node['word'])
                        and head_node['word'] != dep_node['word']):
                    deps.append(dep_node['address'])
            head_node['deps'] = deps

        # Create lattice of possible heads
        roots = []
        possible_heads = []
        for i, word in enumerate(tokens):
            heads = []
            for j, head in enumerate(tokens):
                if (i != j) and self._grammar.contains(head, word):
                    heads.append(j)
            if len(heads) == 0:
                roots.append(i)
            possible_heads.append(heads)

        # Set roots to attempt
        if len(roots) < 2:
            if len(roots) == 0:
                for i in range(len(tokens)):
                    roots.append(i)

            # Traverse lattice
            analyses = []
            for root in roots:
                stack = []
                analysis = [[] for i in range(len(possible_heads))]
                # Iterate over the lattice from this root and fill in the analysis
                i = 0
                forward = True
                while i >= 0:
                    if forward:
                        if len(possible_heads[i]) == 1:
                            analysis[i] = possible_heads[i][0]
                        elif len(possible_heads[i]) == 0:
                            analysis[i] = -1
                        else:
                            head = possible_heads[i].pop()
                            analysis[i] = head
                            stack.append([i, head])
                    if not forward:
                        index_on_stack = False
                        for stack_item in stack:
                            if stack_item[0] == i:
                                index_on_stack = True
                        orig_length = len(possible_heads[i])

                        if index_on_stack and orig_length == 0:
                            for j in xrange(len(stack) - 1, -1, -1):
                                stack_item = stack[j]
                                if stack_item[0] == i:
                                    possible_heads[i].append(stack.pop(j)[1])

                        elif index_on_stack and orig_length > 0:
                            head = possible_heads[i].pop()
                            analysis[i] = head
                            stack.append([i, head])
                            forward = True

                    if i + 1 == len(possible_heads):
                        analyses.append(analysis[:])
                        forward = False
                    if forward:
                        i += 1
                    else:
                        i -= 1

        # Filter parses
        # ensure 1 root, every thing has 1 head
        for analysis in analyses:
            if analysis.count(-1) > 1:
                # there are several root elements!
                continue

            graph = DependencyGraph()
            graph.root = graph.nodes[analysis.index(-1) + 1]

            for address, (token,
                          head_index) in enumerate(zip(tokens, analysis),
                                                   start=1):
                head_address = head_index + 1

                node = graph.nodes[address]
                node.update({
                    'word': token,
                    'address': address,
                })

                if head_address == 0:
                    rel = 'ROOT'
                else:
                    rel = ''
                graph.nodes[head_index + 1]['deps'][rel].append(address)

            # TODO: check for cycles
            yield graph
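A usage sketch for this parse method, assuming it belongs to NLTK's NonprojectiveDependencyParser; the grammar is adapted from the NLTK documentation:

from nltk.grammar import DependencyGrammar
from nltk.parse import NonprojectiveDependencyParser

grammar = DependencyGrammar.fromstring("""
'taught' -> 'play' | 'man'
'man' -> 'the'
'play' -> 'golf' | 'dog' | 'to'
'dog' -> 'his'
""")
parser = NonprojectiveDependencyParser(grammar)
for graph in parser.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']):
    print(graph)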
Example no. 11
    def _positions(self):
        return xrange(self.num_leaves() + 1)
import random

import nltk
from bs4 import BeautifulSoup
from nltk.compat import xrange

positive_reviews = BeautifulSoup(open("positive.review").read())
positive_reviews = positive_reviews.findAll('review_text')

# extract the trigrams
# Key -> first and last word
# value -> possible middle words

trigrams = {}
for review in positive_reviews:
    s = review.text.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    for i in xrange(len(tokens) - 2):
        k = (tokens[i], tokens[i + 2])
        if k not in trigrams:
            trigrams[k] = []
        trigrams[k].append(tokens[i + 1])

# turn each array of middle words into a probability vector
trigram_probabilities = {}
for k, words in trigrams.items():
    # create a dictionary of words -> count
    if len(set(words)) > 1:
        # only do this when there are different possibilities for a middle word
        d = {}
        n = 0
        for w in words:
            if w not in d:
                d[w] = 0
            d[w] += 1
            n += 1
        # normalize the counts into a probability for each middle word
        for w, c in d.items():
            d[w] = float(c) / n
        trigram_probabilities[k] = d
Example no. 14
def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly drawing
    from a sample of the possible permutations. At the limit of the number
    of possible permutations, the significance level is exact. The
    approximate significance level is the proportion of sampled permutations
    whose statistic is at least as extreme as the actual statistic of the
    unpermuted argument lists.

    :return: a tuple containing an approximate significance level, the count
             of the number of times the pseudo-statistic met or exceeded the
             actual statistic, and the number of shuffles
    :rtype: tuple
    :param a: a list of test values
    :type a: list
    :param b: another list of independently generated test values
    :type b: list
    """
    shuffles = kwargs.get('shuffles', 999)
    # there's no point in trying to shuffle beyond all possible permutations
    shuffles = \
        min(shuffles, reduce(operator.mul, xrange(1, len(a) + len(b) + 1)))
    stat = kwargs.get('statistic', lambda lst: sum(lst) / len(lst))
    verbose = kwargs.get('verbose', False)

    if verbose:
        print('shuffles: %d' % shuffles)

    actual_stat = fabs(stat(a) - stat(b))

    if verbose:
        print('actual statistic: %f' % actual_stat)
        print('-' * 60)

    c = 1e-100
    lst = LazyConcatenation([a, b])
    indices = list(range(len(a) + len(b)))

    for i in xrange(shuffles):
        if verbose and i % 10 == 0:
            print('shuffle: %d' % i)

        shuffle(indices)

        pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[:len(a)]))
        pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a):]))
        pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if verbose and i % 10 == 0:
            print('pseudo-statistic: %f' % pseudo_stat)
            print('significance: %f' % ((c + 1) / (i + 1)))
            print('-' * 60)

    significance = (c + 1) / (shuffles + 1)

    if verbose:
        print('significance: %f' % significance)
        if betai:
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print("prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi)))

    return (significance, c, shuffles)
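A usage sketch with made-up score lists (on Python 3 this function additionally needs reduce imported from functools and xrange replaced by range):

scores_a = [0.91, 0.84, 0.88, 0.79, 0.93, 0.85]   # hypothetical per-item scores, system A
scores_b = [0.78, 0.80, 0.75, 0.82, 0.77, 0.81]   # hypothetical per-item scores, system B

significance, count, shuffles = approxrand(scores_a, scores_b, shuffles=999)
print('p ~= %.3f after %d shuffles' % (significance, shuffles))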
Example no. 15
    def apply(self, chart, grammar):
        # Insert an empty edge for every empty production at every chart position.
        for prod in grammar.productions(empty=True):
            for index in xrange(chart.num_leaves() + 1):
                new_edge = FeatureTreeEdge.from_production(prod, index)
                if chart.insert(new_edge, ()):
                    yield new_edge
    def parse(self, tokens):
        """
        Parses the input tokens with respect to the parser's grammar.  Parsing
        is accomplished by representing the search-space of possible parses as
        a fully-connected directed graph.  Arcs that would lead to ungrammatical
        parses are removed and a lattice is constructed of length n, where n is
        the number of input tokens, to represent all possible grammatical
        traversals.  All possible paths through the lattice are then enumerated
        to produce the set of non-projective parses.

        :param tokens: A list of tokens to parse.
        :type tokens: list(str)
        :return: A set of non-projective parses.
        :rtype: list(DependencyGraph)
        """
        # Create graph representation of tokens
        self._graph = DependencyGraph()
        self._graph.nodelist = []  # Remove the default root
        for index, token in enumerate(tokens):
            self._graph.nodelist.append({'word':token, 'deps':[], 'rel':'NTOP', 'address':index})
        for head_node in self._graph.nodelist:
            deps = []
            for dep_node in self._graph.nodelist:
                if self._grammar.contains(head_node['word'], dep_node['word']) and not head_node['word'] == dep_node['word']:
                    deps.append(dep_node['address'])
            head_node['deps'] = deps
        # Create lattice of possible heads
        roots = []
        possible_heads = []
        for i, word in enumerate(tokens):
            heads = []
            for j, head in enumerate(tokens):
                if (i != j) and self._grammar.contains(head, word):
                    heads.append(j)
            if len(heads) == 0:
                roots.append(i)
            possible_heads.append(heads)
        # Set roots to attempt
        if len(roots) > 1:
            print("No parses found.")
            return False
        elif len(roots) == 0:
            for i in range(len(tokens)):
                roots.append(i)
        # Traverse lattice
        analyses = []
        for root in roots:
            stack = []
            analysis = [[] for i in range(len(possible_heads))]
            i = 0
            forward = True
            while(i >= 0):
                if forward:
                    if len(possible_heads[i]) == 1:
                        analysis[i] = possible_heads[i][0]
                    elif len(possible_heads[i]) == 0:
                        analysis[i] = -1
                    else:
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                if not forward:
                    index_on_stack = False
                    for stack_item in stack:
#                       print stack_item
                        if stack_item[0] == i:
                            index_on_stack = True
                    orig_length = len(possible_heads[i])
#                    print len(possible_heads[i])
                    if index_on_stack and orig_length == 0:
                        for j in xrange(len(stack) -1, -1, -1):
                            stack_item = stack[j]
                            if stack_item[0] == i:
                                possible_heads[i].append(stack.pop(j)[1])
#                        print stack
                    elif index_on_stack and orig_length > 0:
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                        forward = True

#                   print 'Index on stack:', i, index_on_stack
                if i + 1 == len(possible_heads):
                    analyses.append(analysis[:])
                    forward = False
                if forward:
                    i += 1
                else:
                    i -= 1
        # Filter parses
        graphs = []
        #ensure 1 root, every thing has 1 head
        for analysis in analyses:
            root_count = 0
            root = []
            for i, cell in enumerate(analysis):
                if cell == -1:
                    root_count += 1
                    root = i
            if root_count == 1:
                graph = DependencyGraph()
                graph.nodelist[0]['deps'] = root + 1
                for i in range(len(tokens)):
                    node = {'word':tokens[i], 'address':i+1}
                    node['deps'] = [j+1 for j in range(len(tokens)) if analysis[j] == i]
                    graph.nodelist.append(node)
#                cycle = graph.contains_cycle()
#                if not cycle:
                graphs.append(graph)
        return graphs