示例#1
0
def make_trie_helper(text, word=None, phrase=None):
    """
    Shared helper for make_word_trie and make_phrase_trie.

    The text is broken into sentences with tokenize_sentences and every
    sentence is split on single spaces.

    Parameters:
        text: the raw text, as a single string
        word: if truthy, count individual words (takes precedence over
            phrase)
        phrase: if truthy (and word is not), count whole sentences, keyed by
            the tuple of their words

    Returns:
        A Trie mapping each key to the number of times it was seen.  When
        neither flag is truthy the trie is returned empty.
    """
    trie = Trie()
    for sentence in tokenize_sentences(text):
        tokens = sentence.split(' ')
        if word:
            # The loop variable is deliberately NOT named `word`: reusing that
            # name would overwrite the mode flag, so a sentence ending in an
            # empty token would silently switch modes for later sentences.
            for token in tokens:
                if token in trie:
                    trie[token] += 1
                else:
                    trie[token] = 1
        elif phrase:
            key = tuple(tokens)
            if key in trie:
                trie[key] += 1
            else:
                trie[key] = 1
    return trie
示例#2
0
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are the
    words in the text, and whose values are the number of times the associated
    word appears in the text.

    Sentences come from tokenize_sentences; words are the pieces of each
    sentence separated by a space character.  Empty pieces (produced by
    leading or repeated spaces) are not words and are skipped.

    >>> phrase = 'hello my name is darius'
    >>> word_trie = make_word_trie(phrase)
    >>> 'hello' in word_trie
    True
    >>> 'darius' in word_trie
    True
    >>> 'charles' in word_trie
    False
    """
    word_trie = Trie(str)
    for sentence in tokenize_sentences(text):  # e.g. 'how shall i ever repay you'
        for word in sentence.split(' '):
            if not word:
                # Artifact of consecutive/leading spaces, not a real word.
                continue
            if word in word_trie:
                word_trie[word] += 1
            else:
                word_trie[word] = 1
    return word_trie
示例#3
0
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are the
    sentences in the text (as tuples of individual words) and whose values are
    the number of times the associated sentence appears in the text.

    Sentences come from tokenize_sentences; each one is split on the space
    character and empty pieces (from leading or repeated spaces) are dropped
    so that they never appear inside a key.

    >>> phrase = 'hello my name is darius. I am a MIT student. I like to robots. hello my name is darius. I am on the crew team'
    >>> phrase_trie = make_phrase_trie(phrase)
    >>> ('hello','my','name','is','darius') in phrase_trie
    True
    """
    phrase_trie = Trie(tuple)
    for sentence in tokenize_sentences(text):
        key = tuple(word for word in sentence.split(' ') if word)
        if key in phrase_trie:
            phrase_trie[key] += 1
        else:
            phrase_trie[key] = 1
    return phrase_trie
示例#4
0
def frequency_sen(text):
    """
    Count how often each sentence occurs in text.

    Returns a dict mapping every sentence string produced by
    tokenize_sentences to its number of occurrences.
    """
    counts = {}
    for sentence in tokenize_sentences(text):
        counts[sentence] = counts.get(sentence, 0) + 1
    return counts
示例#5
0
def frequency_word(text):
    """
    Count how often each word occurs in text.

    Every sentence produced by tokenize_sentences is split on single spaces;
    the result is a dict mapping each piece to its number of occurrences.
    """
    counts = {}
    for sentence in tokenize_sentences(text):
        for token in sentence.split(' '):
            counts[token] = counts.get(token, 0) + 1
    return counts
示例#6
0
def make_word_trie(text):
    """
    Build a Trie of word frequencies for the given text.

    Keys are the whitespace-separated words of every sentence returned by
    tokenize_sentences; values are how many times each word occurs.
    """
    word_counts = Trie()
    for line in tokenize_sentences(text):
        for token in line.split():
            if token in word_counts:
                word_counts[token] += 1
            else:
                word_counts[token] = 1
    return word_counts
示例#7
0
def make_phrase_trie(text):
    """
    Build a Trie of sentence frequencies for the given text.

    Each sentence returned by tokenize_sentences becomes a key in the form of
    a tuple of its whitespace-separated words; the value is how many times
    that sentence occurs.
    """
    phrase_counts = Trie()
    for line in tokenize_sentences(text):
        key = tuple(line.split())
        if key in phrase_counts:
            phrase_counts[key] += 1
        else:
            phrase_counts[key] = 1
    return phrase_counts
示例#8
0
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, return a Trie whose keys are the
    sentences in the text (as tuples of individual words) and whose values are
    the number of times the associated sentence appears in the text.

    The counting itself is delegated to frequency_sen, whose result is used
    as a mapping from sentence string to occurrence count.
    """
    t = Trie()
    # Iterate the distinct sentences only: each key is written exactly once
    # and the text does not have to be tokenized a second time here.
    for sentence, count in frequency_sen(text).items():
        t[tuple(sentence.split(' '))] = count
    return t
示例#9
0
def make_phrase_trie(text):
    """
    Return a Trie counting the sentences of text.

    Each sentence from tokenize_sentences is keyed by the tuple of its
    whitespace-separated words; the stored value is the number of times that
    sentence appears.
    """
    phrase_trie = Trie()
    for sentence in tokenize_sentences(text):
        key = tuple(sentence.split())
        if key not in phrase_trie:
            phrase_trie[key] = 1
        else:
            phrase_trie[key] += 1
    return phrase_trie
示例#10
0
def make_word_trie(text):
    """
    Build a Trie whose keys are the words of text and whose values are the
    number of times each word appears.

    The counts are taken from the 'word_count' entry of text_data, applied to
    the output of tokenize_sentences.
    """
    sentences = tokenize_sentences(text)
    # Used as a mapping: iterated for its keys and indexed by them below.
    word_counts = text_data(sentences)['word_count']

    word_trie = Trie()
    for entry in word_counts:
        word_trie[entry] = word_counts[entry]
    return word_trie
示例#11
0
def make_phrase_trie(text):
    """
    Build a Trie of sentence frequencies.

    Every sentence from tokenize_sentences is turned into a tuple of its
    whitespace-separated words and used as a key; the value is the number of
    times that sentence occurs in text.
    """
    phrase_trie = Trie()
    for line in tokenize_sentences(text):
        phrase = tuple(line.split())
        if phrase not in phrase_trie:
            phrase_trie[phrase] = 1
        else:
            # Seen before: bump the stored count.
            phrase_trie[phrase] += 1
    return phrase_trie
示例#12
0
def make_word_trie(text):
    """
    Build a Trie of word frequencies.

    Every whitespace-separated word of every sentence from tokenize_sentences
    is a key; the value is the number of times that word occurs in text.
    """
    word_trie = Trie()
    for line in tokenize_sentences(text):
        for token in line.split():
            if token not in word_trie:
                word_trie[token] = 1
            else:
                # Seen before: bump the stored count.
                word_trie[token] += 1
    return word_trie
示例#13
0
def make_word_trie(text):
    """
    Return a Trie mapping each word of text to its number of occurrences.

    Words are the whitespace-separated pieces of the sentences produced by
    tokenize_sentences.
    """
    counts = Trie()
    for sentence in tokenize_sentences(text):
        for token in sentence.split():
            if token not in counts:
                counts[token] = 1
            else:
                counts[token] += 1
    return counts
示例#14
0
def make_word_trie(text):
    """
    Create a Trie keyed by the words of text, where each value is the number
    of times that word appears.

    Sentences come from tokenize_sentences and are split on whitespace.
    """
    word_trie = Trie()
    sentences = tokenize_sentences(text)
    for line in sentences:
        for token in line.split():
            if token in word_trie:
                word_trie[token] += 1
            else:
                word_trie[token] = 1
    return word_trie
示例#15
0
def make_phrase_trie(text):
    """
    Create a Trie keyed by the sentences of text (each as a tuple of its
    words), where each value is the number of times that sentence appears.

    Sentences come from tokenize_sentences and are split on whitespace.
    """
    phrase_trie = Trie()
    for line in tokenize_sentences(text):
        key = tuple(line.split())
        if key not in phrase_trie:
            phrase_trie[key] = 1
        else:
            phrase_trie[key] += 1
    return phrase_trie
示例#16
0
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are the
    sentences in the text (as tuples of individual words) and whose values are
    the number of times the associated sentence appears in the text.

    Sentences come from tokenize_sentences and are split on whitespace.
    Counting is done in a single pass over the sentences rather than with one
    list.count() scan per distinct sentence.
    """
    from collections import Counter

    phraseTrie = Trie()

    # Count by the tuple key itself so that sentences differing only in
    # whitespace are accumulated together instead of overwriting each other.
    phraseCounts = Counter(tuple(s.split()) for s in tokenize_sentences(text))

    for phraseKey, count in phraseCounts.items():
        phraseTrie.set(phraseKey, count)

    return phraseTrie
示例#17
0
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are the
    words in the text, and whose values are the number of times the associated
    word appears in the text.

    Sentences come from tokenize_sentences and are split on whitespace.
    Words are counted in a single pass, avoiding both repeated string
    concatenation and one list.count() scan per distinct word.
    """
    from collections import Counter

    wordTrie = Trie()

    wordCounts = Counter(
        word
        for sentence in tokenize_sentences(text)
        for word in sentence.split()
    )

    for word, count in wordCounts.items():
        wordTrie.set(word, count)
    return wordTrie
示例#18
0
def make_phrase_trie(text):
    """
    Build a Trie of sentence frequencies.

    Sentences from tokenize_sentences are split on single spaces and tallied
    in a plain dict keyed by the resulting tuple; the tallies are then copied
    into a Trie with Trie.set.
    """
    tallies = {}
    for line in tokenize_sentences(text):
        phrase = tuple(line.split(' '))
        tallies[phrase] = tallies.get(phrase, 0) + 1

    trie = Trie()
    for phrase, count in tallies.items():
        trie.set(phrase, count)
    return trie
示例#19
0
def make_word_trie(text):
    """
    Build a Trie of word frequencies.

    Sentences from tokenize_sentences are split on single spaces and every
    piece is tallied in a plain dict; the tallies are then copied into a Trie
    with Trie.set.
    """
    tallies = {}
    for line in tokenize_sentences(text):
        for token in line.split(' '):
            tallies[token] = tallies.get(token, 0) + 1

    trie = Trie()
    for token, count in tallies.items():
        trie.set(token, count)
    return trie
示例#20
0
def make_word_trie(text):
    """
    Create a Trie whose keys are the words of text and whose values are the
    number of times each word appears.

    Sentences come from tokenize_sentences and are split on whitespace.
    """
    word_trie = Trie()
    for line in tokenize_sentences(text):
        for token in line.split():
            if token not in word_trie:
                word_trie[token] = 1
            else:
                word_trie[token] += 1
    return word_trie
示例#21
0
def make_phrase_trie(text):
    """
    Build a Trie whose keys are the sentences of text and whose values are
    the number of times each sentence appears.

    The sentence keys and their counts are taken from text_data, applied to
    the output of tokenize_sentences.
    """
    info = text_data(tokenize_sentences(text))
    # 'sentence_lists' supplies the keys and 'sentence_count' is indexed by
    # them; presumably the keys are hashable word sequences such as tuples --
    # confirm against text_data.
    phrase_counts = info['sentence_count']
    phrases = info['sentence_lists']

    phrase_trie = Trie()
    for phrase in phrases:
        phrase_trie[phrase] = phrase_counts[phrase]
    return phrase_trie
示例#22
0
def make_word_trie(text):
    """
    Build a Trie (keyed by str) mapping every word of text to the number of
    times it appears.  Sentences come from tokenize_sentences and are split
    on single spaces.

    >>> t = make_word_trie("code code and more code")
    >>> t['and']
    1
    >>> t['code']
    3
    """
    counts = Trie(str)
    for sentence in tokenize_sentences(text):
        for token in sentence.split(" "):
            if token not in counts:
                counts[token] = 1
            else:
                counts[token] += 1
    return counts
示例#23
0
def make_phrase_trie(text):
    """
    Build a Trie (keyed by tuple) mapping every sentence of text to the
    number of times it appears.  Sentences come from tokenize_sentences and
    are converted to keys with get_sentence_tuple.

    >>> t = make_phrase_trie("I like waffles. I like french toast.")
    >>> t[('i', 'like', 'waffles')]
    1
    >>> t[('i', 'like', 'french', 'toast')]
    1
    """
    counts = Trie(tuple)
    for sentence in tokenize_sentences(text):
        key = get_sentence_tuple(sentence)
        if key not in counts:
            counts[key] = 1
        else:
            counts[key] += 1
    return counts
示例#24
0
def make_phrase_trie(text):
    """
    Build a Trie (keyed by tuple) mapping every sentence of text, as a tuple
    of its words, to the number of times it appears.  Sentences come from
    tokenize_sentences and are split on single spaces.

    >>> text = 'I am a test. Test.'
    >>> x = make_phrase_trie(text)
    >>> x[('test',)]
    1
    """
    phrase_trie = Trie(tuple)
    for line in tokenize_sentences(text):
        key = tuple(line.split(' '))
        if key in phrase_trie:
            phrase_trie[key] += 1
        else:
            phrase_trie[key] = 1
    return phrase_trie
示例#25
0
def make_phrase_trie(text):
    """
    Build a Trie whose keys are the sentences of text (each as a tuple of
    its whitespace-separated words) and whose values are the number of times
    each sentence appears.
    """
    root = Trie()
    for raw_sentence in tokenize_sentences(text):
        phrase = tuple(raw_sentence.split())
        if root.contains(phrase):
            # Already stored: increase its frequency by one.
            root.set(phrase, root.get(phrase) + 1)
        else:
            # First occurrence of this phrase.
            root.set(phrase, 1)
    return root
示例#26
0
def make_word_trie(text):
    """
    Build a Trie (keyed by str) mapping every word of text to the number of
    times it appears.  Sentences come from tokenize_sentences and are split
    on single spaces.

    >>> text = 'I am a test.'
    >>> x = make_word_trie(text)
    >>> x['test']
    1
    >>> text = 'I am a test test.'
    >>> x = make_word_trie(text)
    >>> x['test']
    2
    """
    word_trie = Trie(str)
    for line in tokenize_sentences(text):
        for token in line.split(' '):
            if token in word_trie:
                word_trie[token] += 1
            else:
                word_trie[token] = 1
    return word_trie
示例#27
0
def make_word_trie(text):
    """
    Build a Trie whose keys are the whitespace-separated words of text and
    whose values are the number of times each word appears.
    """
    root = Trie()
    for raw_sentence in tokenize_sentences(text):
        for token in raw_sentence.split():
            if root.contains(token):
                # Already stored: increase its frequency by one.
                root.set(token, root.get(token) + 1)
            else:
                # First occurrence of this word.
                root.set(token, 1)
    return root
示例#28
0
    return [(word, freq) for word, freq in trie if is_match(word, pattern, 0, 0)]


# you can include test cases of your own in the block below.
if __name__ == '__main__':
    import doctest
    doctest.testmod()

    # Ad-hoc experiments against the sample texts; each file is expected to
    # sit in the current working directory.
    with open("alice.txt", encoding="utf-8") as f:
        text = f.read()
        trie = make_phrase_trie(text)
        print(autocomplete(trie, tuple(), 6))
        # Pasted alongside the call above -- presumably its output from an
        # earlier run (kept as a comment rather than a no-op expression):
        # [('said', 'alice'), ('thought', 'alice'), ('wow',), ('said', 'the', 'caterpillar'), ('said', 'the', 'march', 'hare'), ('beauootiful', 'soooop')]
        print(len(tokenize_sentences(text)))
        print(len(list(trie)))

        trie = make_word_trie(text)
        print(autocorrect(trie, 'hear', 12))

    with open("metamorphosis.txt", encoding='utf-8') as f:
        text = f.read()
        trie = make_word_trie(text)
        print(autocomplete(trie, 'gre', 6))
        print(word_filter(trie, 'c*h'))

    with open("twocities.txt", encoding="utf-8") as f:
        text = f.read()
        # Only builds the trie; nothing is printed for this text.
        trie = make_word_trie(text)
示例#29
0
def get_words(text):
    """
    Return a list with one tuple per sentence of text, each tuple holding the
    sentence's whitespace-separated words.

    Sentences come from tokenize_sentences, called with True as its second
    positional argument (the meaning of that flag is defined by
    tokenize_sentences, not here).
    """
    phrases = []
    for sentence in tokenize_sentences(text, True):
        phrases.append(tuple(sentence.split()))
    return phrases