Пример #1
0
class WordCountTest(unittest.TestCase):
    """
    Generate test input files and use them for invoking the tests.

    setUp writes five identical two-line text files (1.txt .. 5.txt) into the
    current working directory, plus test_path.txt, which lists their absolute
    paths one per line. tearDown deletes all of them again.
    """

    def setUp(self):
        self.wc = WordCount()
        # Counts the tests expect for a single fixture file.
        self.word_dict_counts = {
            "this": 1, "is": 2, "a": 2, "test": 1, "file": 2,
            "contents": 1, "are": 1, "written": 1, "to": 1, "used": 1,
            "it": 1, "for": 2, "the": 1, "purpose": 1, "testing": 2,
        }
        # Counts the tests expect across all five identical fixture files.
        self.word_dict_counts_total = {
            "this": 5, "is": 10, "a": 10, "test": 5, "file": 10,
            "contents": 5, "are": 5, "written": 5, "to": 5, "used": 5,
            "it": 5, "for": 10, "the": 5, "purpose": 5, "testing": 10,
        }
        self.file_list = []
        # Context managers close both handles even if a write fails.
        with open("test_path.txt", "w") as path_list:
            for i in range(1, 6):
                file_name = str(i) + ".txt"
                with open(file_name, "w") as fixture:
                    fixture.write("This is a test file. Contents are written to the file for testing!\n")
                    fixture.write("It is used for a testing purpose!\n")
                abs_path = os.path.abspath(file_name)
                path_list.write(abs_path)
                path_list.write("\n")
                self.file_list.append(abs_path)

    def test_count_totoal_words(self):
        # The combined result must contain one entry per distinct word.
        countedWords = self.wc.process_input_file("test_path.txt")
        self.assertEqual(len(countedWords), len(self.word_dict_counts_total), FAILURE)

    def test_count_words_single_file_2(self):
        # A single fixture file contains 15 distinct words.
        countedWords = self.wc.count_words("2.txt")
        self.assertEqual(len(countedWords), 15, FAILURE)

    def test_search_word_totoal_count(self):
        countedWords = self.wc.process_input_file("test_path.txt")
        # The lookup result is not inspected; the call only has to succeed.
        self.wc.search_word("file")
        self.assertEqual(countedWords['file']['total_count'], self.word_dict_counts_total["file"], FAILURE)

    def test_search_word_totoal_count_individual_file(self):
        countedWords = self.wc.count_words("2.txt")
        self.assertEqual(countedWords['file']['total_count'], self.word_dict_counts["file"], FAILURE)

    def test_invalid_search_word(self):
        countedWords = self.wc.process_input_file("test_path.txt")
        # Searching for an unknown word must not add it to the counts.
        self.wc.search_word("invalid_word")
        self.assertNotIn("invalid_word", countedWords)

    def test_list_words(self):
        countedWords = self.wc.process_input_file("test_path.txt")
        for word, expected_total in self.word_dict_counts_total.items():
            self.assertEqual(countedWords[word]['total_count'], expected_total, FAILURE)

    def test_search_words_count_across_files(self):
        countedWords = self.wc.process_input_file("test_path.txt")
        word_info = countedWords["for"]
        # The per-file count is keyed by the absolute path of each file.
        for path in self.file_list:
            self.assertEqual(word_info[path], self.word_dict_counts["for"], FAILURE)

    def tearDown(self):
        for path in self.file_list:
            os.remove(path)
        os.remove("test_path.txt")
Пример #2
0
 def __init__(self):
     # The input files are looked up under the path given as the first
     # command-line argument.
     self.files = FileUtils().get_files_from_path(sys.argv[1])
     self.file_count = len(self.files)

     # Scoring constants (presumably the BM25 k1 and b parameters, going
     # by the attribute names -- confirm against the scoring code).
     self.k1, self.b = 1.2, 0.75

     # Starting values for the running maximum and minimum score.
     self.max_score, self.min_score = 0, 100

     self.avg_doc_length = 0
     self.idf = dict()
     self.word_count_calculator = WordCount()
     self.bm_25_scores = dict()
Пример #3
0
 def test_word_occurance8(self):
     # A newline between two words must separate them like a space does.
     expected = {'hello': 1, 'world': 1}
     actual = WordCount.words('hello\nworld')
     self.assertDictEqual(expected, actual,
                          msg='should not count multilines')
Пример #4
0
 def test_word_occurance9(self):
     # A tab between two words must separate them like a space does.
     expected = {'hello': 1, 'world': 1}
     actual = WordCount.words('hello\tworld')
     self.assertDictEqual(expected, actual, msg='should not count tabs')
Пример #5
0
 def test_word_occurance0(self):
     # Two consecutive spaces must not produce an extra (empty) word.
     expected = {'hello': 1, 'world': 1}
     actual = WordCount.words('hello  world')
     self.assertDictEqual(expected, actual,
                          msg='should count multiple spaces as one')
Пример #6
0
 def setUp(self):
     # Builds the fixture: five identical text files (1.txt .. 5.txt) in the
     # current working directory and test_path.txt listing their absolute
     # paths, one per line.
     self.wc = WordCount()
     # Counts the tests expect for a single fixture file.
     self.word_dict_counts = {
         "this": 1, "is": 2, "a": 2, "test": 1, "file": 2,
         "contents": 1, "are": 1, "written": 1, "to": 1, "used": 1,
         "it": 1, "for": 2, "the": 1, "purpose": 1, "testing": 2,
     }
     # Counts the tests expect across all five identical fixture files.
     self.word_dict_counts_total = {
         "this": 5, "is": 10, "a": 10, "test": 5, "file": 10,
         "contents": 5, "are": 5, "written": 5, "to": 5, "used": 5,
         "it": 5, "for": 10, "the": 5, "purpose": 5, "testing": 10,
     }
     self.file_list = []
     # Context managers close both handles even if a write fails.
     with open("test_path.txt", "w") as path_list:
         for i in range(1, 6):
             file_name = str(i) + ".txt"
             with open(file_name, "w") as fixture:
                 fixture.write("This is a test file. Contents are written to the file for testing!\n")
                 fixture.write("It is used for a testing purpose!\n")
             abs_path = os.path.abspath(file_name)
             path_list.write(abs_path)
             path_list.write("\n")
             self.file_list.append(abs_path)
Пример #7
0
 def test_word_occurance2(self):
     # Three distinct words, each appearing exactly once.
     expected = {'one': 1, 'of': 1, 'each': 1}
     actual = WordCount.words("one of each")
     self.assertDictEqual(expected, actual, msg='should count one of each')
Пример #8
0
 def test_word_occurance5(self):
     # Numeric tokens are expected as int keys (1, 2), not as strings.
     expected = {'testing': 2, 1: 1, 2: 1}
     actual = WordCount.words('testing 1 2 testing')
     self.assertDictEqual(expected, actual, msg='should include numbers')
Пример #9
0
 def test_word_occurance6(self):
     # Differently-cased spellings are expected to be counted separately.
     expected = {'go': 1, 'Go': 1, 'GO': 1}
     actual = WordCount.words('go Go GO')
     self.assertDictEqual(expected, actual, msg='should respect case')
Пример #10
0
def main():
	"""Count the words of the configured text file and print the elapsed time.

	Prompts on stdin for ``k``, builds a ``WordCount`` for the file, calls
	``sort_and_pop_word_dict(k)`` on it and finally prints the number of
	seconds the processing took.

	Raises:
		ValueError: if the value entered for ``k`` is not an integer.
	"""
	# NOTE: Change this to name of .txt file you are using within /data/.
	file_path = 'data/book.txt'

	print("Starting up.")
	k = int(input("Enter a value for k:\n"))

	# Start the clock only after the prompt has returned, so the time the
	# user spends typing is not reported as processing time. perf_counter
	# is monotonic, unlike the wall clock.
	start = time.perf_counter()

	# Parse and sort data using collections.
	data = WordCount(file_path)
	data.sort_and_pop_word_dict(k)

	# Parse and sort data using dictionary and max heap.
	# data = WordCountDict('data/book.txt')
	# heap = MaxHeap()
	# heap.insert_dict(data.get_word_dict())
	# heap.pop_top_k_words(k)

	print(time.perf_counter() - start)
Пример #11
0
 def test_word_occurance7(self):
     # Non-ASCII words keep their surrounding punctuation and are counted
     # once each.
     expected = {"¡Hola!": 1, "¿Qué": 1, "tal?": 1, "Привет!": 1}
     actual = WordCount.words('¡Hola! ¿Qué tal? Привет!')
     self.assertDictEqual(
         expected,
         actual,
         msg='should count international characters properly')
Пример #12
0
def generate_counts():
    """Queue a word-count job for every requested URL that is not cached yet.

    Reads the ``urls`` list from the JSON request body. For each URL whose
    cache lookup returns ``None``, an empty string is stored under that URL
    and ``WordCount().run`` is enqueued with the URL as its only argument.

    Returns:
        The tuple ``('Accepted', 202)``, regardless of how many jobs were
        queued.

    Raises:
        KeyError: if the JSON body has no ``urls`` key.
    """
    res = request.get_json()
    urls = res['urls']
    print(urls)
    for url in urls:
        if cache.get(url) is None:
            # Write the placeholder before enqueueing, so the next lookup
            # for this URL no longer returns None.
            cache.set(url, '')
            q.enqueue_call(func=WordCount().run, args=(url, ))

    return ('Accepted', 202)
Пример #13
0
 def test_word_occurance3(self):
     # A repeated word ('fish') accumulates its count.
     expected = {'one': 1, 'fish': 4, 'two': 1, 'red': 1, 'blue': 1}
     actual = WordCount.words("one fish two fish red fish blue fish")
     self.assertDictEqual(expected, actual,
                          msg='should count multiple occurrences')
Пример #14
0
 def test_word_occurance4(self):
     # Punctuation counts as a word on its own and stays attached when it
     # trails a word.
     expected = {
         'car': 1,
         ":": 2,
         'carpet': 1,
         'as': 1,
         'java': 1,
         'javascript!!&@$%^&': 1,
     }
     actual = WordCount.words('car : carpet as java : javascript!!&@$%^&')
     self.assertDictEqual(expected, actual, msg='should include punctuation')
def generate_dict(reviewList):
    """Return the sorted word-frequency dictionary for ``reviewList``.

    The words are counted with ``wc.count_words`` and the result is passed
    through ``wc.sort_freq_dict``; ``wc`` itself is handed to both calls as
    the explicit first argument.
    """
    word_counts = wc.count_words(wc, reviewList)
    return wc.sort_freq_dict(wc, word_counts)
Пример #16
0
# coding=utf-8

from word_count import WordCount
from lookup import lookup


# Values accepted by the `flag` argument of word_topk() and word_freq():
# they select the last-word or the middle-word statistics of `wc`.
FLAG_LAST_WORD = 1
FLAG_MIDDLE_WORD = 2

# Module-level WordCount instance used by word_topk() and word_freq().
wc = WordCount()


def word_topk(k, flag):
    """Return the top-``k`` result for the word position chosen by ``flag``.

    ``FLAG_LAST_WORD`` delegates to ``wc.last_word_topk`` and
    ``FLAG_MIDDLE_WORD`` to ``wc.middle_word_topk``; any other flag yields
    an empty list.
    """
    if flag == FLAG_LAST_WORD:
        top_words = wc.last_word_topk(k)
    elif flag == FLAG_MIDDLE_WORD:
        top_words = wc.middle_word_topk(k)
    else:
        top_words = []
    return top_words


def word_freq(word, flag):
    """Return the commonness coefficient and level."""
    if flag == FLAG_LAST_WORD:
        ret = wc.last_word_freq(word)
    else:
        # Every flag other than FLAG_LAST_WORD uses the middle-word statistics.
        ret = wc.middle_word_freq(word)

    freq, rank, total = ret
    # 1/4 -- 2/4 -- 3/4 -- 1
    # A rank within the first quarter of `total` gets the label below.
    if rank <= total / 4:
        comment = u'大众名'
Пример #17
0
 def test_word_occurance1(self):
     # The smallest case: a single word is counted once.
     expected = {'word': 1}
     actual = WordCount.words('word')
     self.assertDictEqual(expected, actual, msg='should count one word')
Пример #18
0
class TermWeights:
    """Compute normalized BM25 term weights for a set of files.

    The files are looked up under the path given as the first command-line
    argument. ``calculate()`` runs the word counter, derives the IDF of
    every word, stores the per-file BM25 scores in ``bm_25_scores`` and
    finally rescales them with the smallest and largest score seen.
    """

    def __init__(self):
        self.files = FileUtils().get_files_from_path(sys.argv[1])
        self.file_count = len(self.files)
        # BM25 tuning parameters.
        self.k1 = 1.2
        self.b = 0.75
        # Starting values for the running maximum and minimum score.
        self.max_score = 0
        self.min_score = 100
        self.avg_doc_length = 0
        self.idf = {}
        self.word_count_calculator = WordCount()
        self.bm_25_scores = {}

    # Calculates the IDF value for every word and the average document length
    def calculate_idf(self):
        counter = self.word_count_calculator
        for word, count in counter.document_count.items():
            self.idf[word] = math.log10(
                (self.file_count - count + 0.5) / (count + 0.5))
        # With no files there is nothing to average; keep the length at 0
        # instead of failing with ZeroDivisionError.
        if self.file_count:
            self.avg_doc_length = (
                sum(counter.document_length.values()) / self.file_count)
        else:
            self.avg_doc_length = 0

    # normalizes the BM25 scores to values between 0 and 1
    def normalize_score(self, score):
        score_range = self.max_score - self.min_score
        if score_range == 0:
            # All scores are identical, so there is no range to scale into.
            return 0
        return (score - self.min_score) / score_range

    # Primary method that calculates the BM25 score using the word count
    # Also keeps track of the max and min BM25 scores to be used for normalization
    def calculate(self):
        counter = self.word_count_calculator
        counter.calculate()
        self.calculate_idf()
        for file_name, word_count_dict in counter.word_counts.items():
            bm_25 = {}
            for word, count in word_count_dict.items():
                # Only the lookups can raise KeyError; a word with missing
                # statistics is skipped.
                try:
                    # Words that occur only once overall and
                    # single-character words are not scored.
                    if counter.global_count[word] == 1 or len(word) == 1:
                        continue
                    count = int(count)
                    numerator = (count * (self.k1 + 1)) * float(self.idf[word])
                    length_ratio = (int(counter.document_length[file_name]) /
                                    self.avg_doc_length)
                    denominator = count + self.k1 * (
                        1 - self.b + self.b * length_ratio)
                except KeyError:
                    continue
                bm_25_score = numerator / denominator
                bm_25[word] = bm_25_score
                if bm_25_score > self.max_score:
                    self.max_score = bm_25_score
                if bm_25_score < self.min_score:
                    self.min_score = bm_25_score
            self.bm_25_scores[file_name] = bm_25
        self.normalize()

    # Replaces every stored BM25 score with its normalized value
    def normalize(self):
        for bm_25 in self.bm_25_scores.values():
            # Only existing keys are reassigned, so the dict size does not
            # change while it is being iterated.
            for word, score in bm_25.items():
                bm_25[word] = self.normalize_score(score)