Example #1
def test_remove_keyword_via_file(self):
    self.word = KeyWords('./temp.csv')
    self.word.add_keyword('hello')
    self.word.remove_keyword('Hello')
    word2 = KeyWords('./temp.csv')
    self.assertEqual(word2.get_keywords(), [],
                     "KeyWords couldn't remove keywords from a file")
Example #2
import os
import unittest

# KeyWords is the project class under test (the module path is assumed here).
from keywords import KeyWords


class KeyWordUnit(unittest.TestCase):
    def setUp(self):
        self.word = KeyWords()
        self.file = open('temp.csv', 'w')

    def tearDown(self):
        self.word = None
        self.file.close()
        os.remove('temp.csv')

    def test_blank_init(self):
        self.assertEqual(self.word.get_keywords(), [],
                         "KeyWords doesn't initialize empty correctly.")

    def test_bad_path(self):
        with self.assertRaises(FileNotFoundError):
            self.word = KeyWords('./BAD_PATH.NONEXISTENT')

    def test_init_type_error(self):
        with self.assertRaises(TypeError):
            self.word = KeyWords(69)

    def test_add_type_error(self):
        with self.assertRaises(TypeError):
            self.word.add_keyword(69)

    def test_remove_type_error(self):
        with self.assertRaises(TypeError):
            self.word.remove_keyword(69)

    def test_occurrence_type_error(self):
        with self.assertRaises(TypeError):
            self.word.occurrence(69)

    def test_good_path(self):
        self.word = KeyWords('./temp.csv')
        self.assertEqual(
            self.word.get_keywords(), [],
            "KeyWords couldn't access the newly made file correctly.")

    def test_single_word(self):
        self.file.write("hello")
        self.file.close()
        self.word = KeyWords('./temp.csv')
        self.assertEqual(
            self.word.get_keywords(), ['hello'],
            "KeyWords couldn't get keywords from the file correctly.")

    def test_multiple_words(self):
        self.file.write("hello,world")
        self.file.close()
        self.word = KeyWords('./temp.csv')
        self.assertEqual(
            self.word.get_keywords(), ['hello', 'world'],
            "KeyWords couldn't get keywords from the file "
            "correctly.")

    def test_added_keyword(self):
        self.word.add_keyword('hello')
        self.assertEqual(self.word.get_keywords(), ['hello'],
                         "KeyWords couldn't add keywords correctly")

    def test_add_keyword_via_file(self):
        self.word = KeyWords('./temp.csv')
        self.word.add_keyword('hello')
        word2 = KeyWords('./temp.csv')
        self.assertEqual(word2.get_keywords(), ['hello'],
                         "KeyWords couldn't add keywords correctly to a file")

    def test_removed_keyword(self):
        self.word.add_keyword('hello')
        self.word.remove_keyword('Hello')
        self.assertEqual(self.word.get_keywords(), [],
                         "KeyWords couldn't remove a keyword correctly")

    def test_remove_keyword_via_file(self):
        self.word = KeyWords('./temp.csv')
        self.word.add_keyword('hello')
        self.word.remove_keyword('Hello')
        word2 = KeyWords('./temp.csv')
        self.assertEqual(word2.get_keywords(), [],
                         "KeyWords couldn't remove keywords from a file")

    def test_single_occurrence(self):
        self.word.add_keyword('hello')
        self.assertEqual(
            self.word.occurrence(
                "Hello None of this hello text makes yhello much hElLo"),
            [('hello', 3)], "Couldn't count all instances of a keyword")

    def test_multi_occurrence(self):
        self.word.add_keyword('hello')
        self.word.add_keyword('world')
        self.assertEqual(
            self.word.occurrence(
                "Hello None world this helloworld text WoRld yhello much hElLo"
            ), [('hello', 2), ('world', 2)],
            "Couldn't count all instances of multiple keyword")

    def test_empty_occurrenceA(self):
        self.word.add_keyword('hello')
        self.assertEqual(self.word.occurrence(""), [('hello', 0)],
                         "Couldn't handle empty text")

    def test_empty_occurrenceB(self):
        self.assertEqual(self.word.occurrence("This is a fun test text"), [],
                         "Couldn't handle empty KeyWords")
Example #3
# -*- coding: utf-8 -*-
import sys
import codecs
if sys.stdout.encoding != 'cp850':
    sys.stdout = codecs.getwriter('cp850')(sys.stdout.buffer, 'strict')
if sys.stderr.encoding != 'cp850':
    sys.stderr = codecs.getwriter('cp850')(sys.stderr.buffer, 'strict')


from keywords import KeyWords
from nltk.corpus import stopwords

with open('script.txt', 'r') as f:
    data = f.read()

with open('transcript_1.txt', 'r', encoding="utf8") as f1:
    corpus_1 = f1.read()

with open('transcript_2.txt', 'r', encoding="utf8") as f2:
    corpus_2 = f2.read()

with open('transcript_3.txt', 'r', encoding="utf8") as f3:
    corpus_3 = f3.read()

stopWords = stopwords.words('english')
keyword = KeyWords(corpus=corpus_1, stop_words=stopWords, alpha=0.8)
d = keyword.get_keywords(data, n=20)
for i in d:
    print("Keyword : %s \n Score : %f" %(i[0], i[1]))
Example #4
def test_add_keyword_via_file(self):
    self.word = KeyWords('./temp.csv')
    self.word.add_keyword('hello')
    word2 = KeyWords('./temp.csv')
    self.assertEqual(word2.get_keywords(), ['hello'],
                     "KeyWords couldn't add keywords correctly to a file")

# print(len(fdist))
k = int(len(fdist) / 2.8)
top_k_words = fdist.most_common(k)
# print(top_k_words[-10:])
top_k_words, _ = zip(*fdist.most_common(k))
top_k_words = set(top_k_words)
dfToList = df['text'].tolist()

final_list = []
for i in range(len(dfToList)):
    if i % 9 == 0:
        with open('testcorpus.txt', 'r', encoding="utf8") as f1:
            corpus_1 = f1.read()
        stopWords = stopwords.words('english')
        keyword = KeyWords(corpus=corpus_1, stop_words=stopWords, alpha=0.8)
        d = keyword.get_keywords(str(dfToList[i]), n=2)
        #final_list=[]
        for pair in d:
            for kw in word_tokenize(pair[0]):
                final_list.append(kw)

ps = PorterStemmer()

for i in range(len(final_list)):
    top_k_words.add(ps.stem(final_list[i]))

# print(len(top_k_words))
# print(type(top_k_words))
df['tokenized'] = df['tokenized'].apply(keep_top_k_words)
df['doc_len'] = df['tokenized'].apply(lambda x: len(x))
doc_lengths = list(df['doc_len'])
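
keep_top_k_words is not defined in this excerpt. A plausible stand-in, assuming it merely filters each tokenized document down to the vocabulary kept in top_k_words, might be:

def keep_top_k_words(tokens):
    # Drop every token that did not make it into the top_k_words set.
    return [token for token in tokens if token in top_k_words]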