Exemplo n.º 1
0
def test():
    """Print the extraction result for the N-th single-field test row.

    N is taken from ``sys.argv[1]`` and is 1-based.  The test file is read
    as "$"-delimited CSV; only rows that contain exactly one field are
    counted.  The counted row number N is passed to
    ``InterestExtractor.extract`` and the result is printed.  Nothing is
    printed when fewer than N such rows exist.

    Raises:
        IndexError: if no command-line argument was supplied.
        ValueError: if the argument is not an integer.
    """
    import sys
    import csv
    import nlp.nlp_paths as nlp_paths

    # Parse the argument first so bad input fails before any heavier work.
    desired = int(sys.argv[1])

    path = nlp_paths.get_proj_root() + "/testing/test_interest_text"
    extractor = InterestExtractor()

    # "rb" is the mode the Python 2 csv module expects.  The with-block
    # guarantees the handle is closed even if extraction raises.
    with open(path, "rb") as handle:
        seen = 0
        for row in csv.reader(handle, delimiter="$"):
            if len(row) != 1:
                continue
            seen += 1
            if seen == desired:
                # Parenthesised single-argument form prints the same value
                # under the Python 2 print statement.
                print(extractor.extract(row[0]))
                # The counter only grows, so no later row can match.
                break
#!/usr/bin/python
#By Steve Hanov, 2011. Released to the public domain
import time
import sys
import cPickle
import nlp.nlp_paths as paths
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nlp.util import chunk_similarity
from nltk.metrics import distance

# Locations of two .pkl data files, built from the project root.
# (Presumably loaded with cPickle further down -- confirm at the use site.)
MOST_COMMON = paths.get_proj_root() + "/util/1000_most_common.pkl"
DICTIONARY = paths.get_proj_root() + "/freebase_util/interest_lst.pkl"

# Hand-picked extra terms.
# NOTE(review): presumably merged into the interest dictionary -- confirm.
CUSTOM = [
    'drinking',
    'partying',
    'wine',
]

# NOTE(review): presumably how many of the most common words get filtered
# out -- confirm against the code that reads it.
TOP_K_FILTER = 200

# The Trie data structure keeps a set of words, organized with one node for
# each letter. Each node has a branch for each letter that may follow it in the
# set of words.
class TrieNode:
    def __init__(self):
        self.word = None
        self.children = {}

    def insert( self, word ):
        node = self
        for letter in word:
            if letter not in node.children: 
                node.children[letter] = TrieNode()

            node = node.children[letter]