Example #1
from copy import copy

def get_prepared_KB(file_prepared_KB, stemmed=True):
    """Parse a prepared-KB file into a {phrase: [candidate explanations]} dict."""
    phrase2candidates = {}
    with open(file_prepared_KB, 'r', encoding='utf-8') as f:
        p = ''
        candi_list = []
        for line in f:
            if line.startswith('=='):
                # A '==' line starts a new phrase block; store the previous one.
                if p != '':
                    phrase2candidates[p] = copy(candi_list)

                # Begin collecting candidates for the next phrase.
                strs = line.split('\t')
                p = strs[1].strip()
                candi_list.clear()
            else:
                strs = line.split('\t')
                if len(strs) > 1:
                    # Strip the inline markers from the explanation text.
                    explain = strs[1].replace("==SS==", "")
                    explain = explain.replace("==DB==", "")
                    if stemmed:
                        explain = stem_words(explain)
                    candi_list.append(explain)

        # Store the final phrase block (no trailing '==' line follows it).
        if p != '':
            phrase2candidates[p] = copy(candi_list)
    return phrase2candidates
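
For reference, a minimal usage sketch. The file name and the exact KB layout below are assumptions inferred from the parser, not taken from the source:

# Hypothetical file 'kb_prepared.txt'; format inferred from the parser above:
#   ==<TAB>bank
#   1<TAB>financial institution ==SS==
#   2<TAB>sloping land beside a river ==DB==
kb = get_prepared_KB('kb_prepared.txt', stemmed=False)
for phrase, candidates in kb.items():
    print(phrase, '->', candidates)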
def get_meals(tokenized_string, enum=False):
    """
    Returns a list of (index, meal) tuples, or a plain list of meals,
    extracted from a tokenized string.

    >>> raw_input_string = "I want cats for breakfast and dogs for dinner."
    >>> tokenizer = nltk.WordPunctTokenizer()
    >>> tokenized_string = tokenizer.tokenize(raw_input_string)
    >>> for i, w in get_meals(tokenized_string, enum=True): print(i, w)
    4 breakfast
    8 dinner
    """

    stemmed_string = utils.stem_words(tokenized_string)
    stemmed_meals = utils.stem_words(wordlists.meal_types)
    results = _extract_words_from_list(stemmed_meals, stemmed_string, True)
    if enum:
        return [(i, tokenized_string[i]) for i, w in results]
    else:
        return [tokenized_string[i] for i, w in results]
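
The helper _extract_words_from_list is referenced but never defined in these snippets. Below is a minimal sketch consistent with its call sites (it must yield (index, word) pairs); the third parameter's name and meaning are guesses:

def _extract_words_from_list(word_list, tokens, exact=True):
    # Return (index, token) pairs for every token matching an entry in
    # word_list. Whether the third flag means exact membership versus
    # substring matching is an assumption.
    word_set = set(word_list)
    if exact:
        return [(i, w) for i, w in enumerate(tokens) if w in word_set]
    return [(i, w) for i, w in enumerate(tokens)
            if any(entry in w for entry in word_set)]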
def get_cuisines(tokenized_string, enum=False):
    """
    Returns a list of (index, cuisine) tuples, or a plain list of cuisines,
    extracted from a tokenized string.

    >>> raw_input_string = "I want a chinese or mexican dish."
    >>> tokenizer = nltk.WordPunctTokenizer()
    >>> tokenized_string = tokenizer.tokenize(raw_input_string)
    >>> for i, w in get_cuisines(tokenized_string, enum=True): print(i, w)
    3 chinese
    5 mexican
    """

    stemmed_string = utils.stem_words(tokenized_string)
    # Exclude meal words that overlap with the cuisine list, then add
    # adjectival forms (e.g. nationality adjectives).
    cuisines = set.difference(wordlists.cuisines, wordlists.meal_types)
    cuisines = cuisines.union(wordlists.list_of_adjectivals)
    stemmed_cuisines = utils.stem_words(cuisines)
    results = _extract_words_from_list(stemmed_cuisines, stemmed_string, True)
    if enum:
        return [(i, tokenized_string[i]) for i, w in results]
    else:
        return [tokenized_string[i] for i, w in results]
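
wordlists is an external module that neither snippet defines. For local experimentation, a hypothetical stand-in exposing the three attributes used above (real word sets would be far larger):

# Hypothetical stand-in for the real wordlists module; contents are invented.
class wordlists:
    meal_types = {'breakfast', 'lunch', 'dinner'}
    cuisines = {'chinese', 'mexican', 'italian', 'dinner'}  # may overlap meal_types
    list_of_adjectivals = {'thai', 'french'}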
Example #6
def filter_text(self):
    # Tokenise the tweet, deduplicate, drop stopwords, then stem.
    # Note: converting to a set removes duplicates but also discards token order.
    text = self.get_tweet_text()
    text = set(utils.tokenise(text))
    filtered_text = list(self.remove_stopwords(text))
    keywords = utils.stem_words(filtered_text)
    self.set_tweet_tokens(keywords)
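
utils.stem_words appears in every example above but is never defined. A plausible minimal implementation using NLTK's PorterStemmer, handling both the plain-string input of Example #1 and the token lists used elsewhere (this dual behavior is an assumption):

from nltk.stem import PorterStemmer

_stemmer = PorterStemmer()

def stem_words(words):
    # Accept either a whitespace-delimited string or an iterable of tokens.
    if isinstance(words, str):
        return ' '.join(_stemmer.stem(w) for w in words.split())
    return [_stemmer.stem(w) for w in words]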