Пример #1
0
 def __init__(self, directory, stopwords=None):
     """Build the search-engine index from the files in *directory*.

     Args:
         directory (str): directory containing the files to index
         stopwords: container of words to ignore while indexing;
             defaults to an empty list when not supplied
     """
     # BUG FIX: the mutable default argument (stopwords=[]) was shared
     # across calls; replaced with the None-sentinel idiom. Callers that
     # passed a value, or relied on the empty default, are unaffected.
     self.doc_length = HashTableLinear()  # file -> total word count
     self.doc_freqs = HashTableLinear()   # not used in this assignment
     self.term_freqs = HashTableLinear()  # word -> {file: frequency}
     self.stopwords = [] if stopwords is None else stopwords
     self.index_files(directory)
     self.directory = directory
Пример #2
0
    def __init__(self, directory, stopwords):
        """ Build the search index for every file under *directory*.

            Args:
                directory (str): a directory name
                stopwords (HashMap): a hash table containing stopwords
        """

        self.stopwords = stopwords
        self.doc_length = HashTableLinear()  # file -> total word count
        self.term_freqs = HashTableLinear()  # word -> {file: frequency}
        self.file_list = []                  # files discovered during indexing
        self.index_files(directory)
Пример #3
0
 def get_scores(self, terms):
     """Compute a relevancy score for every file matching *terms*.

     For each query term the weighted frequency is accumulated per file,
     then each total is divided by the file's word count.
     Args:
         terms (list) : a list of str
     Returns:
         HashTableLinear : maps filename -> normalized relevancy score
         (the original docstring said "list", but a table is returned)
     """
     scores = HashTableLinear()
     for term in terms:
         if self.term_freqs.contains(term):
             hashtable = self.term_freqs[term]
             for file1 in hashtable.slots:
                 # Skip empty slots; the contains() check filters slots
                 # that are no longer live entries of the table.
                 if file1 is not None and hashtable.contains(file1[0]):
                     if scores.contains(file1[0]):
                         _, val = scores.remove(file1[0])
                         scores.put(file1[0], val + self.get_wf(file1[1]))
                     else:
                         scores.put(file1[0], self.get_wf(file1[1]))
     # BUG FIX: the old loop removed/re-inserted entries while iterating
     # scores.slots directly, so a rehash could normalize an entry twice
     # or skip one.  Snapshot the live entries first, then update.
     for fname, val in [s for s in scores.slots if s is not None]:
         scores.remove(fname)
         scores.put(fname, val / self.doc_length.get(fname))
     return scores
Пример #4
0
    def get_scores(self, terms):
        """ Creates a list of scores for each file in corpus
            The score = weighted frequency / the total word count in file.

            Args:
                terms (list): a list of str
            Returns:
                list: a list of tuples, each containing the file_path_name
                      and its relevancy score
        """

        scores = HashTableLinear()
        for term in terms:
            # BUG FIX: .get() on a term absent from the corpus raises
            # KeyError (the table's documented contract); skip such terms
            # instead of crashing the whole query.
            if not self.term_freqs.contains(term):
                continue
            word_hash_table = self.term_freqs.get(term)
            for file in self.file_list:
                if word_hash_table.contains(file):
                    weight = self.get_wf(word_hash_table[file])
                    if scores.contains(file):
                        scores[file] += weight
                    else:
                        scores[file] = weight

        # Normalize by document length and drop zero-relevancy files.
        score_list = []
        for file in self.file_list:
            if scores.contains(file) and scores[file] > 0:
                norm_score = scores[file] / self.doc_length[file]
                score_list.append((file, norm_score))
        return score_list
 def count_words(self, filename, words):
     """Record word frequencies for *filename* in the index.

     term_freqs is a hash table of hash tables: the outer key is a word,
     the inner key is a file name, and the inner value is how many times
     the word occurs in that file (self.term_freqs[word][filename]).
     The running total of words in the file is kept in
     self.doc_length[filename].  *words* must already exclude stopwords.
     Args:
         filename (str) : the file name
         words (list) : a list of words
     """
     for word in words:
         # First sighting of the word anywhere: create its inner table.
         if not self.term_freqs.contains(word):
             inner = HashTableLinear()
             inner.put(filename, 1)
             self.term_freqs.put(word, inner)
         elif self.term_freqs[word].contains(filename):
             self.term_freqs[word][filename] += 1
         else:
             self.term_freqs[word].put(filename, 1)
         # Every word also bumps the document's total length.
         if self.doc_length.contains(filename):
             self.doc_length[filename] += 1
         else:
             self.doc_length.put(filename, 1)
 def get_scores(self, terms):
     """Create relevancy scores for every file matching *terms*.

     Each term's weighted frequency is accumulated per file, then every
     accumulated score is divided by that file's total word count.
     Args:
         terms (list) : a list of str
     Returns:
         HashTableLinear : filename -> relevancy score
     """
     scores = HashTableLinear()
     for term in terms:
         if term not in self.term_freqs:
             continue
         inner = self.term_freqs[term]
         # Walk the raw slot array; occupied slots are (file, freq) pairs.
         for entry in inner.hash_table:
             if entry is None:
                 continue
             if entry[0] in scores:
                 scores[entry[0]] += self.get_wf(entry[1])
             else:
                 scores.put(entry[0], self.get_wf(entry[1]))
     # Normalize each score by the length of the corresponding document.
     for entry in scores.hash_table:
         if entry is not None:
             scores[entry[0]] /= self.doc_length[entry[0]]
     return scores
    def test_whole_functionality(self):
        """ Exercises the linear-probing hash table end to end: stopword
            import, equality, repr, item access, removal, size and
            collision counting.  (Despite the old docstring, this tests
            HashTableLinear, not a separate-chaining table.)
        """

        filename = 'stop_words.txt'
        hash_table = HashTableLinear()

        hash_table = import_stopwords(filename, hash_table)

        # Missing keys raise KeyError; imported stopwords are present.
        self.assertRaises(KeyError, hash_table.get, 'BubbleGum')
        self.assertTrue('to' in hash_table)

        # Tables with identical contents compare equal...
        second_hash = HashTableLinear()
        second_hash.put('three', 'three')
        third_hash = HashTableLinear()
        third_hash.put('three', 'three')
        self.assertEqual(second_hash, third_hash)
        # ...and unequal to differing tables or non-table objects.
        self.assertNotEqual(hash_table, second_hash)
        self.assertNotEqual(hash_table, 5)
        # repr enumerates all 11 slots; 'three' hashes to slot 4.
        expected = "Hash_val = 0: None\n" \
            "Hash_val = 1: None\n" \
            "Hash_val = 2: None\n" \
            "Hash_val = 3: None\n" \
            "Hash_val = 4: ('three', 'three')\n" \
            "Hash_val = 5: None\n" \
            "Hash_val = 6: None\n" \
            "Hash_val = 7: None\n" \
            "Hash_val = 8: None\n" \
            "Hash_val = 9: None\n" \
            "Hash_val = 10: None\n"

        self.assertEqual(expected, repr(second_hash))

        # __setitem__/__getitem__ mirror put/get.
        second_hash['four'] = 'four'
        self.assertEqual(second_hash['four'], 'four')
        second_hash['five'] = 'five'
        # NOTE(review): expects stopword values of 0, while other examples
        # store the word itself -- confirm which import_stopwords variant
        # this suite targets.
        self.assertEqual(0, hash_table.get('from'))

        self.assertFalse(second_hash.contains('p'))
        self.assertTrue(second_hash.contains('five'))
        second_hash.remove('five')
        self.assertFalse(second_hash.contains('five'))
        # Removing an absent key raises KeyError.
        self.assertRaises(KeyError, second_hash.remove, 'p')

        self.assertEqual(1, third_hash.size())

        self.assertEqual(0, third_hash.collisions())
Пример #8
0
 def test_SE(self):
     """End-to-end test of SearchEngine over the bundled 'docs' corpus.

     NOTE(review): depends on a local 'docs' directory and
     'stop_words.txt', and hard-codes Windows path separators
     ("docs\\test.txt") -- confirm the fixture layout before running.
     """
     SE = SearchEngine(
         "docs", import_stopwords("stop_words.txt", HashTableLinear()))
     # The corpus is expected to contain exactly four indexed files.
     self.assertEqual(SE.doc_length.num_items, 4)
     self.assertEqual(SE.stopwords,
                      import_stopwords("stop_words.txt", HashTableLinear()))
     # search() returns (file, score) Pairs in descending relevancy order.
     self.assertEqual(
         SE.search("Computer Science")[0], Pair("docs\\test.txt", 1.0))
     self.assertEqual(SE.search("ADT")[0][0], "docs\\data_structure.txt")
     self.assertEqual(round(SE.search("ADT")[0][1], 2), 0.01)
     self.assertEqual(
         SE.search("Hash Table")[1][0], "docs\\data_structure.txt")
     self.assertEqual(round(SE.search("Hash Table")[1][1], 2), 0.01)
     # rank() must sort Pairs by score, highest first.
     list_of_pairs = [
         Pair("P", 5),
         Pair("A", 2),
         Pair("R", 1),
         Pair("T", 4),
         Pair("H", 3)
     ]
     self.assertEqual(SE.rank(list_of_pairs), [
         Pair("P", 5),
         Pair("T", 4),
         Pair("H", 3),
         Pair("A", 2),
         Pair("R", 1)
     ])
     self.assertEqual(
         SE.get_scores(["computer", "science"])[0],
         Pair("docs\\test.txt", 1.0))
     # Queries made up entirely of stopwords score no documents.
     self.assertEqual(SE.get_scores(["every", "nothing", "few"]), [])
     # get_wf is ~2.79 for a frequency of 6 and 0 for -6 per these asserts.
     self.assertEqual(round(SE.get_wf(6), 2), 2.79)
     self.assertEqual(SE.get_wf(-6), 0)
     # parse_words lower-cases, tokenizes, and strips the stopwords.
     list1 = [
         "Automated information retrieval systems of ",
         "Information retrieval and afterwards say\n"
     ]
     list2 = [
         'automated', 'information', 'retrieval', 'systems', 'information',
         'retrieval'
     ]
     self.assertEqual(SE.parse_words(list1), list2)
     self.assertEqual(
         SE.parse_words(["and afterwards say\n", "much without the"]), [])
     self.assertEqual(SE.read_file("docs\\test.txt"),
                      ["computer science\n"])
Пример #9
0
 def test_linear3(self):
     """Import the stop-word file and spot-check the resulting table."""
     ht = HashTableLinear()
     # BUG FIX: the filename must be a string literal -- the bare name
     # stop_words.txt was a NameError at runtime.
     stop_words = import_stopwords("stop_words.txt", ht)
     self.assertEqual(stop_words.size(), 305)
     self.assertTrue(0.3 <= stop_words.load_factor() <= 0.4)
     self.assertFalse("collision" in stop_words)
     self.assertTrue("very" in stop_words)
     self.assertFalse("linear" in stop_words)
     self.assertTrue("a" in stop_words)
Пример #10
0
 def test_linear2(self):
     """Insert 22 single-character keys and verify size and membership."""
     table = HashTableLinear()
     for code in range(22):
         table.put(chr(code), code)
     self.assertEqual(table.size(), 22)
     # Resizing must keep the load factor at or below 0.75.
     self.assertTrue(table.load_factor() <= 0.75)
     for code in (0, 1, 19):
         self.assertTrue(table.contains(chr(code)))
     self.assertFalse(table.contains(chr(22)))
Пример #11
0
 def test_import_stopwords(self):
     """import_stopwords behaves identically for all three table kinds."""
     for table_cls in (HashTableSepchain, HashTableLinear,
                       HashTableQuadratic):
         hashtable = import_stopwords("stop_words.txt", table_cls())
         # Stopwords map to themselves; unknown keys raise KeyError.
         self.assertEqual(hashtable["unless"], "unless")
         self.assertRaises(KeyError, hashtable.get, "Parth")
Пример #12
0
 def test_linear1(self):
     """Insert eleven string keys and check size, load, membership."""
     table = HashTableLinear()
     for num in range(11):
         table.put(str(num), num)
     self.assertEqual(table.size(), 11)
     # The table must have resized to keep the load factor bounded.
     self.assertTrue(table.load_factor() <= 0.75)
     for key in ('0', '1', '10'):
         self.assertTrue(table.contains(key))
     self.assertFalse(table.contains('11'))
Пример #13
0
def entry_point(dir_name):
    """Interactive search loop over the files in *dir_name*.

    Repeatedly prompts for a query and prints the scored results;
    entering 'q' quits.
    """
    stop_words = import_stopwords('stop_words.txt', HashTableLinear())
    engine = SearchEngine(dir_name, stop_words)
    while True:
        query = input('Input Search: ')
        if query == 'q':
            break
        print(engine.search(query))
Пример #14
0
def build_stopwords(filename):
    """ Build a hash table of stop words from a text file.
        Args:
            filename (str): path of stop words file
        Returns:
            HashTableLinear: table populated with the stop words
    """
    return import_stopwords(filename, HashTableLinear())
Пример #15
0
def main(directory):
    """Interactive search prompt over the files in *directory*.

    Type 'q' to quit, or 's:' to be prompted for a query whose results
    are printed.

    Args:
        directory (str): location of the files the engine searches
    """
    # Renamed the local `hash` -- it shadowed the built-in hash().
    stopwords = import_stopwords("stop_words.txt", HashTableLinear())
    engine = SearchEngine(directory, stopwords)
    while True:
        command = input("Search here:")
        if command == "q":
            break
        elif command == "s:":
            query = input("Search multiple things:")
            print(engine.search(query))
Пример #16
0
 def count_words(self, filename, words):
     """count words in a file and store the frequency of each
     word in the term_freqs hash table. Words should not contain stopwords.
     Also store the total count of words contained in the file
     in the doc_freqs hash table.
     Args:
     filename (str) : the file name
     words (list) : a list of words
     """
     # BUG FIX: the old code first ran term_freqs.put(word, HashTableLinear())
     # for EVERY word, replacing any inner table built while indexing earlier
     # files and wiping their frequencies.  Inner tables are now created only
     # when a word is seen for the first time.
     for word in words:
         if word in self.term_freqs and filename in self.term_freqs[word]:
             self.term_freqs[word][filename] += 1
         elif word and word in self.term_freqs:
             self.term_freqs[word].put(filename, 1)
         elif word:
             inner = HashTableLinear()
             inner.put(filename, 1)
             self.term_freqs.put(word, inner)
     # Record the file's total word count (net effect of the old 0-then-set).
     self.doc_freqs.put(filename, len(words))
    def test_basic(self):
        """ Tests basic put/contains/get functionality"""

        words = ('every', 'being', 'elsewhere', 'nothing', 'hereby',
                 'latter', 'and', 'afterwards', 'say', 'very', 'few',
                 'well', 'various', 'make', 'regarding', 'take', 'give',
                 'whole', 'i', 'against', 'can')

        hash_table = HashTableLinear()
        hash_table.put('unless', 'unless')
        self.assertTrue(hash_table.contains('unless'))
        # Store each word as its own value...
        for word in words:
            hash_table.put(word, word)
        # ...then read every one back; get() must not raise.
        for word in words:
            hash_table.get(word)
Пример #18
0
 def count_words(self, filename, words):
     """count words in a file and store the frequency of each
        word in the term_freqs hash table (word -> {file: frequency}).
        Words should not contain stopwords.  The file's total word count
        is stored in the doc_length hash table.
     Args:
         filename (str) : the file name
         words (list) : a list of words
     """
     self.doc_length[filename] = len(words)
     for word in words:
         if word not in self.term_freqs:
             self.term_freqs[word] = HashTableLinear()
         inner = self.term_freqs[word]
         if filename in inner:
             inner[filename] += 1
         else:
             inner[filename] = 1
Пример #19
0
def main():
    """Continuously prompt for query terms and print the relevant files.

    Takes a directory name as its command line argument.  Input of the
    form "s:<terms>" runs a search; "q" exits and returns None.
    """
    search_engine = SearchEngine(
        sys.argv[1], import_stopwords("stop_words.txt", HashTableLinear()))
    while True:
        user_input = input(
            "Type 's:' and what you would like to search for or type 'q' to exit: "
        )
        if user_input == "q":
            return
        # BUG FIX: the old check ("s:" in user_input) matched the marker
        # anywhere in the string yet always stripped the first two
        # characters, mangling inputs like "xs:y".  Require the prefix.
        if user_input.startswith("s:"):
            print(search_engine.search(user_input[2:].lower().strip()))
Пример #20
0
 def count_words(self, filename, words):
     """Tally per-file word frequencies into the index.

     Args:
         filename (str) : the file name
         words (list) : a list of words
     """
     for term in words:
         if term in self.term_freqs:
             inner = self.term_freqs[term]
             if filename in inner:
                 inner[filename] += 1
             else:
                 inner[filename] = 1
         else:
             # First sighting of this word anywhere in the corpus.
             self.term_freqs[term] = HashTableLinear()
             self.term_freqs[term][filename] = 1
     # The document's length is simply the number of words passed in.
     self.doc_length.put(filename, len(words))
Пример #21
0
    def test_HashTableLinear(self):
        """Exercise put/get/contains/remove and the collision counter."""
        table = HashTableLinear()

        # A fresh table is empty and raises KeyError on lookups.
        self.assertEqual(table.size(), 0)
        self.assertFalse(table.contains('us'))
        self.assertRaises(KeyError, table.get, 'us')

        table.put('us', 'us')
        self.assertEqual(table.get('us'), 'us')
        self.assertEqual(table['us'], 'us')
        self.assertTrue(table.contains('us'))
        self.assertFalse(table.contains('say'))
        self.assertEqual(table.size(), 1)
        self.assertEqual(table.collisions(), 0)

        # 'say' probes onto an occupied slot, producing one collision.
        table.put('say', 'say')
        self.assertEqual(table.get('say'), 'say')
        self.assertTrue(table.contains('say'))
        self.assertEqual(table.size(), 2)
        self.assertEqual(table.collisions(), 1)

        table.remove('say')
        self.assertFalse(table.contains('say'))
        self.assertTrue(table.contains('us'))
        table.remove('us')
        self.assertEqual(table.size(), 0)

        # Re-insert after removals; lookups must still succeed.
        table.put('us', 'us')
        table.put('say', 'say')
        table.put('the', 'the')

        self.assertTrue(table.contains('us'))
        self.assertTrue(table.contains('the'))
Пример #22
0
    def test_linear4(self):
        """Insert, index, and remove 22 keys, checking errors on misses."""
        table = HashTableLinear()
        for code in range(22):
            table.put(chr(code), code)
        self.assertEqual(table.size(), 22)
        self.assertTrue(table.load_factor() <= 0.75)
        # __getitem__ returns the stored value for present keys.
        for code in (0, 1, 19):
            self.assertEqual(table[chr(code)], code)

        self.assertRaises(KeyError, table.get, 'a')

        # Removing every key empties the table again.
        for code in range(22):
            table.remove(chr(code))
        for code in (0, 1, 19):
            self.assertFalse(table.contains(chr(code)))

        self.assertRaises(KeyError, table.remove, 'a')
Пример #23
0
 def test_hash_linear(self):
     """Check item access, overwrite, collisions and removal."""
     table = HashTableLinear()
     self.assertEqual(table.table_size, 11)
     for key in ("3", "2", "4", "5"):
         table[key] = key
     self.assertEqual("5" in table, True)
     self.assertEqual("6" in table, False)
     self.assertRaises(KeyError, table.get, "6")
     # Overwriting an existing key replaces its value in place.
     table["3"] = "6"
     self.assertEqual(table["3"], "6")
     # chr(40) probes onto an occupied slot, producing one collision.
     table[chr(40)] = "20"
     self.assertEqual(table["3"], "6")
     self.assertEqual(table.num_collisions, 1)
     table.remove("3")
     table.remove("4")
     # Removed keys are gone; removing again raises KeyError.
     self.assertRaises(KeyError, table.get, "4")
     self.assertRaises(KeyError, table.remove, "4")
def main():
    """Command loop: build a SearchEngine and run queries interactively.

    'q' exits, 's' prompts for a query and searches; any other command
    is rejected and the menu is shown again.
    """
    directory = input("please enter a directory name\n")
    # Replaced `yeet = True; while yeet` -- the flag was never cleared,
    # so the plain idiom says what actually happens.
    while True:
        command = input("press q to exit\n"
                        "press s to search\n"
                        "What would you like to do?\n")
        if command == "q":
            break
        elif command == "s":
            search = SearchEngine(
                directory, import_stopwords("stop_words.txt",
                                            HashTableLinear()))
        else:
            print("that is not a valid command\n")
            continue
        new_query = [input("what would you like to search?\n")]
        query_string = search.parse_words(new_query)
        # BUG FIX: the search result was computed and silently discarded;
        # print it so the user actually sees the ranked files.
        print(search.search(query_string))
Пример #25
0
    def search(self, query):
        """ Search for the query terms in files
            Args:
                query (str): query input: e.g. "computer science"
            Returns:
                list: a list of tuples: (files_path_name, score) sorted in
                descending order of relevancy excluding files whose
                relevancy score is 0.
        """

        terms = self.parse_words([query])
        # Drop duplicate terms while preserving first-seen order.
        seen = HashTableLinear()
        unique_terms = []
        for term in terms:
            if not seen.contains(term):
                unique_terms.append(term)
            seen.put(term, term)
        return self.rank(self.get_scores(unique_terms))
Пример #26
0
    def count_words(self, file_path_name, words):
        """ Count words in a file and store the frequency of each word in
            the term_freqs hash table, a hash table of hash tables: the
            outer key is a word, the inner key is a file name and the
            inner value is the word's frequency in that file.  The file's
            total word count is stored in the doc_length hash table.

        Args:
            file_path_name (str): the file name
            words (list): a list of words
        """
        from collections import Counter  # local import, stdlib only

        self.doc_length.put(file_path_name, len(words))

        # BUG FIX: the old implementation counted with list.count() inside
        # a loop (O(n^2)) and destructively emptied *words* as a side
        # effect visible to the caller.  Counter tallies every word in one
        # O(n) pass and leaves the input list untouched.
        for current_word, word_freq in Counter(words).items():
            # Reuse the word's existing per-file table, or start a new one.
            if current_word in self.term_freqs:
                freq_hash = self.term_freqs.get(current_word)
            else:
                freq_hash = HashTableLinear()

            freq_hash.put(file_path_name, word_freq)
            self.term_freqs.put(current_word, freq_hash)
Пример #27
0
 def get_scores(self, terms):
     """creates a list of scores for each file in corpus
        The score = weighted frequency / the total word count in the file.
        Compute this score for each term in a query and sum all the scores.
     Args:
         terms (list) : a list of str
     Returns:
         list : a list of Pairs, each containing the filename and its relevancy score
     """
     scores = HashTableLinear()
     for query in terms:
         if query in self.term_freqs:
             term_hash = self.term_freqs[query]
             # Walk the raw slot array; occupied slots expose .key (the
             # filename) and .data (that file's frequency of the term).
             for i in term_hash.table:
                 if i:
                     if not i.key in scores:
                         scores[i.key] = 0
                     scores[i.key] += self.get_wf(i.data)
     for j in range(len(scores.table)):
         if scores.table[j]:
             # Normalize the accumulated score by the document's length.
             scores.table[j].data /= self.doc_length[scores.table[j].key]
         else:
             # NOTE(review): empty slots are overwritten with Pair(None, 0)
             # -- presumably so keys() can iterate the table uniformly;
             # confirm this does not corrupt the table for later lookups.
             scores.table[j] = Pair(None, 0)
     return keys(scores)
Пример #28
0
 def __init__(self, directory, stopwords):
     """Index every file under *directory*, skipping *stopwords*."""
     self.stopwords = stopwords
     self.doc_length = HashTableLinear()  # file -> total word count
     self.term_freqs = HashTableLinear()  # word -> {file: frequency}
     self.index_files(directory)
 def __init__(self, directory, stopwords):
     """Build the search index for all files under *directory*.

     Args:
         directory (str): directory whose files are indexed
         stopwords: hash table of words to exclude from the index
     """
     # Backing tables for the index (filled in by index_files).
     self.doc_length = HashTableLinear()
     self.term_freqs = HashTableLinear()
     self.stopwords = stopwords
     self.index_files(directory)