# NOTE(review): a second definition of getWSJTokens follows later in this
# file; the later definition is the one in effect after import.
def getWSJTokens(n, randomize=False):
    """
    Returns a list of the tagged C{Token}s in M{n} files of the Wall
    Street Journal corpus.

    @param n: How many files to get C{Token}s from; if there are fewer
        than M{n} files in the corpus, the tokens of all files are
        returned.
    @type n: int
    @param randomize: C{False} means the tokens are from the first M{n}
        files of the corpus.  C{True} means the tokens are from a
        shuffled set of M{n} files.  The shuffle is seeded with the
        number of files, so the same set is picked on every call for a
        given corpus.
    @type randomize: Boolean
    @return: The list of tagged tokens, leaving out every token whose
        text is empty or starts with C{[}, C{]} or C{=}.
    @rtype: list of C{Token}
    """
    # Work on a private copy: shuffling in place would otherwise reorder
    # the sequence owned by the corpus reader (or fail if it is a tuple).
    items = list(treebank.items("tagged"))
    if randomize:
        # A dedicated generator seeded with the file count yields the same
        # order as seeding the module-level generator would, but leaves the
        # global random state of the caller untouched.
        random.Random(len(items)).shuffle(items)

    taggedData = []
    for item in items[:n]:
        document = treebank.read(item)
        for sent in document["SENTS"]:
            taggedData.extend(sent["WORDS"])

    # Drop tokens starting with '[', ']' or '=' (presumably bracket and
    # separator markup in the tagged files -- TODO confirm).  Tokens with
    # empty text are dropped too instead of raising IndexError on [0].
    return [
        tok for tok in taggedData
        if tok["TEXT"] and tok["TEXT"][0] not in "[]="
    ]
# NOTE(review): an earlier definition of getWSJTokens appears above in this
# file; this later definition is the one in effect after import.
def getWSJTokens(n, randomize=False):
    """
    Returns a list of the tagged C{Token}s in M{n} files of the Wall
    Street Journal corpus.

    @param n: How many files to get C{Token}s from; if there are fewer
        than M{n} files in the corpus, the tokens of all files are
        returned.
    @type n: int
    @param randomize: C{False} means the tokens are from the first M{n}
        files of the corpus.  C{True} means the tokens are from a
        shuffled set of M{n} files.  The shuffle is seeded with the
        number of files, so the same set is picked on every call for a
        given corpus.
    @type randomize: Boolean
    @return: The list of tagged tokens, leaving out every token whose
        text is empty or starts with C{[}, C{]} or C{=}.
    @rtype: list of C{Token}
    """
    # Work on a private copy: shuffling in place would otherwise reorder
    # the sequence owned by the corpus reader (or fail if it is a tuple).
    items = list(treebank.items('tagged'))
    if randomize:
        # A dedicated generator seeded with the file count yields the same
        # order as seeding the module-level generator would, but leaves the
        # global random state of the caller untouched.
        random.Random(len(items)).shuffle(items)

    taggedData = []
    for item in items[:n]:
        document = treebank.read(item)
        for sent in document['SENTS']:
            taggedData.extend(sent['WORDS'])

    # Drop tokens starting with '[', ']' or '=' (presumably bracket and
    # separator markup in the tagged files -- TODO confirm).  Tokens with
    # empty text are dropped too instead of raising IndexError on [0].
    return [
        tok for tok in taggedData
        if tok['TEXT'] and tok['TEXT'][0] not in "[]="
    ]
def demo():
    """
    A demonstration of the porter stemmer on a sample taken randomly
    from the Penn Treebank corpus.

    Picks one raw treebank item at random, keeps at most its first 100
    subtokens, removes the formatting tokens, stems what is left, and
    prints the original words followed by their stems.  Both listings
    are word-wrapped and preceded by a 70-column banner.

    Relies on C{re} and C{PorterStemmer} being available in the
    enclosing module.
    """
    import random
    from nltk.corpus import treebank

    def wrap(words):
        # Join the words with spaces, then turn the whitespace that follows
        # each greedy run of up to 70 characters into a newline.  The
        # appended ' ' gives the last word a whitespace to match, and
        # rstrip() removes the newline that this produces at the very end.
        return re.sub(r"(.{,70})\s", r'\1\n', ' '.join(words) + ' ').rstrip()

    def banner(title):
        # Centre the title in 70 columns, fill the padding with '*', and
        # turn the '-' placeholders around the title into spaces.
        return title.center(70).replace(' ', '*').replace('-', ' ')

    # Pick a file from the treebank corpus, and tokenize it.
    # Keep at most 100 tokens.
    item = random.choice(treebank.items('raw'))
    subtokens = treebank.tokenize(item)['SUBTOKENS'][:100]

    # Remove any formatting tokens.
    text = [tok for tok in subtokens
            if tok['TEXT'] != '.START'
            and not tok['TEXT'].startswith('======')]

    # Create a porter stemmer, and run it over the text.  The stems are
    # read back from each token's 'STEM' entry below, so stem() is expected
    # to store its result on the token it is given.
    stemmer = PorterStemmer()
    for tok in text:
        stemmer.stem(tok)

    # Convert the results and the original to word-wrapped strings.
    results = wrap([tok['STEM'] for tok in text])
    original = wrap([tok['TEXT'] for tok in text])

    # Print the results.  A parenthesized single argument is valid both as
    # a Python 2 print statement and as a Python 3 print() call.
    print(banner('-Original-'))
    print(original)
    print(banner('-Results-'))
    print(results)
    print('*' * 70)