Exemplo n.º 1
0
def getWSJTokens(n, randomize=False):
    """
    Returns a list of the tagged C{Token}s in M{n} files of the Wall Street
    Journal corpus.

    Tokens whose text starts with C{[}, C{]} or C{=} are left out of the
    result.

    @param n: How many files to get C{Token}s from; if there are fewer than
        M{n} files in the corpus, the tokens of all files are returned.
    @type n: int
    @param randomize: C{False} means the tokens are from the first M{n} files
        of the corpus.  C{True} means the tokens are from a random set of M{n}
        files.  The shuffle is seeded with the number of files, so the same
        corpus always yields the same selection.
    @type randomize: Boolean
    @return: The list of tagged tokens
    @rtype: list of C{Token}
    """
    # Work on a private copy: shuffling needs a mutable sequence, and the
    # sequence handed back by the corpus reader should not be reordered in
    # place in case the reader keeps using it.
    items = list(treebank.items('tagged'))
    if randomize:
        # A dedicated generator keeps the selection reproducible without
        # resetting the state of the shared module-level generator.
        random.Random(len(items)).shuffle(items)

    taggedData = []
    for item in items[:n]:
        for sent in treebank.read(item)['SENTS']:
            taggedData.extend(sent['WORDS'])

    # Slicing with [:1] (rather than indexing [0]) keeps a token with empty
    # text from raising IndexError; such a token is simply kept.
    return [tok for tok in taggedData
            if tok['TEXT'][:1] not in ('[', ']', '=')]
Exemplo n.º 2
0
def demo():
    """
    A demonstration of the porter stemmer on a sample taken randomly
    from the Penn Treebank corpus.

    Prints the original text and the stemmed text, each word-wrapped,
    under a banner line.
    """
    import random
    from lib.nltk.corpus import treebank

    def wrap(words):
        # Join the words with single spaces, then turn the whitespace that
        # follows each greedy run of at most 70 characters into a newline.
        return re.sub(r"(.{,70})\s", r'\1\n', ' '.join(words)+' ').rstrip()

    def banner(title):
        # Center the title in 70 columns, pad with '*', and turn the
        # '-' placeholders around the title into spaces.
        return title.center(70).replace(' ', '*').replace('-', ' ')

    # Pick a file from the treebank corpus, and tokenize it.
    # Keep at most 100 tokens.
    item = random.choice(treebank.items('raw'))
    subtokens = treebank.tokenize(item)['SUBTOKENS'][:100]

    # Remove any formatting tokens.
    text = [tok for tok in subtokens
            if tok['TEXT'] != '.START' and
            not tok['TEXT'].startswith('======')]

    # Create a porter stemmer, and run it over the text.
    # The stemmer is expected to store its result under each token's
    # 'STEM' key, which is read back below.
    stemmer = PorterStemmer()
    for tok in text:
        stemmer.stem(tok)

    # Convert the results and the original to word-wrapped strings.
    results = wrap([tok['STEM'] for tok in text])
    original = wrap([tok['TEXT'] for tok in text])

    # Print the results.  Single-argument print(...) behaves the same as
    # the print statement on Python 2 and is also valid on Python 3.
    print(banner('-Original-'))
    print(original)
    print(banner('-Results-'))
    print(results)
    print('*'*70)