def test():
    """Print the extracted interests for the N-th single-column test row.

    N is read from ``sys.argv[1]`` (1-based).  The test file lives at
    ``<project root>/testing/test_interest_text`` and is parsed as CSV with
    ``$`` as the delimiter.  Only rows made of exactly one column are
    counted; the N-th such row is passed to ``InterestExtractor.extract``
    and the result is printed.  Nothing is printed when the file holds
    fewer than N single-column rows.

    Raises:
        IndexError: if no command-line argument was supplied.
        ValueError: if the argument is not an integer.
    """
    import sys
    import csv
    import nlp.nlp_paths as nlp_paths

    # Parse the argument first so a bad invocation fails before any
    # extractor construction or file access happens.
    desired = int(sys.argv[1])

    path = nlp_paths.get_proj_root() + "/testing/test_interest_text"
    extractor = InterestExtractor()

    # "rb" is what the Python 2 csv module expects; the with-block makes
    # sure the handle is closed even if extraction raises.
    with open(path, "rb") as handle:
        seen = 0
        for row in csv.reader(handle, delimiter="$"):
            if len(row) != 1:
                continue
            seen += 1
            if seen == desired:
                # Single-argument parenthesised form prints identically
                # under the Python 2 print statement.
                print(extractor.extract(row[0]))
                # The counter only grows, so no later row can match.
                break
#!/usr/bin/python #By Steve Hanov, 2011. Released to the public domain import time import sys import cPickle import nlp.nlp_paths as paths from nltk.stem.porter import PorterStemmer from nltk.stem.lancaster import LancasterStemmer from nlp.util import chunk_similarity from nltk.metrics import distance MOST_COMMON = paths.get_proj_root()+"/util/1000_most_common.pkl"; DICTIONARY = paths.get_proj_root()+"/freebase_util/interest_lst.pkl"; CUSTOM = ['drinking', 'partying', 'wine'] TOP_K_FILTER = 200 # The Trie data structure keeps a set of words, organized with one node for # each letter. Each node has a branch for each letter that may follow it in the # set of words. class TrieNode: def __init__(self): self.word = None self.children = {} def insert( self, word ): node = self for letter in word: if letter not in node.children: node.children[letter] = TrieNode() node = node.children[letter]