Exemplo n.º 1
0
 def setUp(self):
     """Build a TFIDF fixture over four tiny synthetic documents."""
     corpus = [
         ("document 1 ccc", "aaa aaa aaa ccc"),
         ("document 2 stop", "stop aaa bbb ccc"),
         ("document 3 stop", "aaa"),
         ("document 4 ddd", "aaa bbb ccc ddd eee"),
     ]
     self.s = TFIDF(
         ["aaa", "bbb", "ccc", "ddd", "eee", "fff"],
         corpus,
         Cleaner(["stop"]),
     )
Exemplo n.º 2
0
 def setUp(self):
     """Build a TFIDF fixture from the lab-1 corpus loaded off disk."""
     corpus = Loader.load_documents("data/documents-lab1.txt")
     self.s = TFIDF(
         ["information", "agency", "retrieval"],
         corpus,
         Cleaner(["stop"]),
     )
Exemplo n.º 3
0
 def setUp(self):
     """Build a TFIDF fixture over five short English sentences."""
     corpus = [
         ("D1", "Time fly like an arrow but fruit fly like a banana."),
         ("D2", "It's strange that bees and wasps don't like each other."),
         ("D3", "The fly attendant sprayed the cabin with a strange fruit aerosol."),
         ("D4", "Try not to carry a light, as wasps and bees may fly toward it."),
         ("D5", "Fruit fly fly around in swarms. When fly they flap their wings 220 times a second."),
     ]
     self.s = TFIDF(
         ["bee", "wasp", "fly", "fruit", "like"],
         corpus,
         Cleaner(["stop"]),
     )
Exemplo n.º 4
0
    # CLI options for the interactive TF-IDF search script (Python 2 syntax:
    # print statements and raw_input). `parser` and the -k/--keywords option
    # are presumably defined above this fragment -- TODO confirm.
    parser.add_argument('-s', '--stopwords', help="Stopwords file path",
            default="data/stopwords.txt")
    parser.add_argument('-d', '--documents', help="Documents file path",
            default="data/documents-2.txt")
    parser.add_argument('-n', '--noresults',
            help="Number of displayed results", default="5")
    parser.add_argument('-v', '--version', action='version',
            version='%(prog)s 0.3')
    args = parser.parse_args()

    # Loader, Cleaner and TFIDF are project classes defined elsewhere.
    keywords = Loader.load_keywords(args.keywords)
    stopwords = Loader.load_stopwords(args.stopwords)
    documents = Loader.load_documents(args.documents)
    n = int(args.noresults)  # --noresults arrives as a string

    cleaner = Cleaner(stopwords)
    tfidf = TFIDF(keywords, documents, cleaner)

    # Interactive query loop: answer searches until the user types exit().
    question = raw_input("Enter search string or \"exit()\" and press enter: ")
    while question != "exit()":
            # Show the top-n matches as "<similarity>\t<title>".
            found = tfidf.search(question)           
            for title, similarity, index in found[:n]:
                print "{0:4f}\t{1}".format(similarity, title)
            # NOTE(review): k-means grouping is recomputed on every query;
            # arguments look like (clusters=9, iterations=10) -- confirm
            # against group_kmeans' signature.
            groups = tfidf.group_kmeans(9, 10)
            for i, group in enumerate(groups):
                print "\nGroup {0}:\n".format(i)
                for doc_id in group:
                    print "\t{0}\n".format(documents[doc_id][0])
            question = raw_input("\nEnter search string or \"exit()\" and "
                    "press enter: ")
Exemplo n.º 5
0
from search import TFIDF
from guess import Guesses
import expander

from flask import Flask, render_template, request, jsonify

# Data files consumed at import time; paths are relative to the process CWD.
keywords_path = "data/keywords-2.txt"
stopwords_path = "data/stopwords.txt"
documents_path = "data/documents-2.txt"

# Loader and Cleaner are project classes imported elsewhere in this module.
keywords = Loader.load_keywords(keywords_path)
stopwords = Loader.load_stopwords(stopwords_path)
documents = Loader.load_documents(documents_path, categories=True)

# Build the search engine once at startup; route handlers reuse it.
cleaner = Cleaner(stopwords)
tfidf = TFIDF(keywords, documents, cleaner)
# Autocomplete suggestions are driven by the term-document matrix.
autocomplete = Guesses(tfidf.get_term_document_matrix(), tfidf.keywords, tfidf.keywords_lookup)

app = Flask(__name__)


@app.route('/')
def home():
    found_extended = None
    question = ""
    if 'search' in request.args:
        question = request.args['search']
        found = tfidf.search(question)
        found_extended = [(Cleaner.make_printable(title),
            similarity,
            Cleaner.make_printable(tfidf.documents[index][1]))
Exemplo n.º 6
0
class TestTFIDF(unittest.TestCase):
    """Unit tests for TFIDF on a tiny synthetic four-document corpus."""

    def setUp(self):
        corpus = [
            ("document 1 ccc", "aaa aaa aaa ccc"),
            ("document 2 stop", "stop aaa bbb ccc"),
            ("document 3 stop", "aaa"),
            ("document 4 ddd", "aaa bbb ccc ddd eee"),
        ]
        self.s = TFIDF(
            ["aaa", "bbb", "ccc", "ddd", "eee", "fff"],
            corpus,
            Cleaner(["stop"]),
        )

    def test_keyword_setup(self):
        expected_items = [("aaa", 0), ("bbb", 1), ("ccc", 2),
                          ("ddd", 3), ("eee", 4), ("fff", 5)]
        self.assertEqual(self.s.keywords.items(), expected_items)

    def test_documents_setup(self):
        expected_vectors = {
            0: [3, 0, 2, 0, 0, 0],
            1: [1, 1, 1, 0, 0, 0],
            2: [1, 0, 0, 0, 0, 0],
            3: [1, 1, 1, 2, 1, 0],
        }
        self.assertEqual(self.s.document_vectors, expected_vectors)

    def test_search_with_no_results(self):
        self.assertEqual(self.s.search("fff"), [])

    def test_search_with_only_popular_terms(self):
        # "aaa" appears in every document, so idf is 0 and nothing matches.
        self.assertEqual(self.s.search("aaa"), [])

    def test_tf(self):
        cases = [
            (0, "ccc", 0.6666666666),
            (0, "aaa", 1.0),
            (1, "aaa", 1.0),
            (2, "aaa", 1.0),
            (3, "aaa", 0.5),
        ]
        for doc_index, term, expected in cases:
            vector = self.s.document_vectors[doc_index]
            self.assertAlmostEqual(self.s.tf(vector, term), expected)

    def test_idf(self):
        cases = [
            ("aaa", math.log(1.0, 10)),
            ("bbb", math.log(2.0, 10)),
            ("ccc", math.log(1.3333333333333, 10)),
            ("ddd", math.log(4.0, 10)),
            ("eee", math.log(4.0, 10)),
            ("fff", 0.0),
        ]
        for term, expected in cases:
            self.assertAlmostEqual(self.s.idf(term), expected)
Exemplo n.º 7
0
class TestTFIDF_InfoRetrieval(unittest.TestCase):
    """Unit tests for TFIDF over the lab-1 corpus loaded from disk."""

    def setUp(self):
        self.s = TFIDF(
            ["information", "agency", "retrieval"],
            Loader.load_documents("data/documents-lab1.txt"),
            Cleaner(["stop"]),
        )

    def test_keyword_setup(self):
        # Expected keys are the stemmed forms of the configured keywords.
        expected_items = [("agenc", 0), ("inform", 1), ("retriev", 2)]
        self.assertEqual(self.s.keywords.items(), expected_items)

    def test_documents_setup(self):
        expected_vectors = {0: [0, 2, 2], 1: [0, 0, 4], 2: [2, 1, 1], 3: [2, 0, 2]}
        self.assertEqual(self.s.document_vectors, expected_vectors)

    def test_tf(self):
        cases = [(0, [0, 1, 1]), (1, [0, 0, 1]), (2, [1, 0.5, 0.5]), (3, [1, 0, 1])]
        for doc_index, tf_vector in cases:
            vector = self.s.document_vectors[doc_index]
            for term, position in self.s.keywords.items():
                self.assertEqual(self.s.tf(vector, term), tf_vector[position])

    def test_idf(self):
        cases = [
            ("inform", math.log(2, 10)),
            ("retriev", 0.0),
            ("agenc", math.log(2, 10)),
        ]
        for term, expected in cases:
            self.assertAlmostEqual(self.s.idf(term), expected, places=6)

    def test_tfidf(self):
        cases = [
            (0, [0, math.log(2, 10), 0]),
            (1, [0, 0, 0]),
            (2, [math.log(2, 10), 0.5 * math.log(2, 10), 0]),
            (3, [math.log(2, 10), 0, 0]),
        ]
        for doc_index, tfidf_vector in cases:
            computed = self.s.tfidf(self.s.document_vectors[doc_index])
            for got, want in zip(computed, tfidf_vector):
                self.assertAlmostEqual(got, want, places=6)

    def test_similarity(self):
        question_tfidfs = self.s.tfidf(self.s.phrase_to_vector("information retrieval"))
        for doc_index, expected in [(0, 1), (1, 0), (2, math.sqrt(0.2)), (3, 0)]:
            similarity = self.s.doc_question_similarity(doc_index, question_tfidfs)
            self.assertEqual(similarity, expected)

    def test_search(self):
        self.assertEqual(
            self.s.search("information retrieval"),
            [("Document 1", 1.0, 0), ("Document 3", math.sqrt(0.2), 2)],
        )
Exemplo n.º 8
0
class TestTFIDF_flies(unittest.TestCase):
    """Unit tests for TFIDF on five English sentences about flies."""

    def setUp(self):
        corpus = [
            ("D1", "Time fly like an arrow but fruit fly like a banana."),
            ("D2", "It's strange that bees and wasps don't like each other."),
            ("D3", "The fly attendant sprayed the cabin with a strange fruit aerosol."),
            ("D4", "Try not to carry a light, as wasps and bees may fly toward it."),
            ("D5", "Fruit fly fly around in swarms. When fly they flap their wings 220 times a second."),
        ]
        self.s = TFIDF(
            ["bee", "wasp", "fly", "fruit", "like"],
            corpus,
            Cleaner(["stop"]),
        )

    def test_keyword_setup(self):
        expected_items = [("bee", 0), ("fly", 1), ("fruit", 2), ("like", 3), ("wasp", 4)]
        self.assertEqual(self.s.keywords.items(), expected_items)

    def test_documents_setup(self):
        expected_vectors = {
            0: [0, 2, 1, 2, 0],
            1: [1, 0, 0, 1, 1],
            2: [0, 1, 1, 0, 0],
            3: [1, 1, 0, 0, 1],
            4: [0, 3, 1, 0, 0],
        }
        self.assertEqual(self.s.document_vectors, expected_vectors)

    def test_tf(self):
        cases = [
            (0, [0, 1, 0.5, 1, 0]),
            (1, [1, 0, 0, 1, 1]),
            (2, [0, 1, 1, 0, 0]),
            (3, [1, 1, 0, 0, 1]),
            (4, [0, 1, 0.333333333333333333, 0, 0]),
        ]
        for doc_index, tf_vector in cases:
            vector = self.s.document_vectors[doc_index]
            for term, position in self.s.keywords.items():
                self.assertEqual(self.s.tf(vector, term), tf_vector[position])

    def test_idf(self):
        cases = [
            ("bee", 0.397940009),
            ("fly", 0.096910013),
            ("fruit", 0.22184875),
            ("like", 0.397940009),
            ("wasp", 0.397940009),
        ]
        for term, expected in cases:
            self.assertAlmostEqual(self.s.idf(term), expected, places=6)

    def test_tfidf(self):
        cases = [
            (0, [0, 0.096910013, 0.110924375, 0.397940009, 0]),
            (1, [0.397940009, 0, 0, 0.397940009, 0.397940009]),
            (2, [0, 0.096910013, 0.22184875, 0, 0]),
            (3, [0.397940009, 0.096910013, 0, 0, 0.397940009]),
            (4, [0, 0.096910013, 0.073949583, 0, 0]),
        ]
        for doc_index, tfidf_vector in cases:
            computed = self.s.tfidf(self.s.document_vectors[doc_index])
            for got, want in zip(computed, tfidf_vector):
                self.assertAlmostEqual(got, want, places=6)
def second_func(q, cont):
    """Rank documents for query string *q* and print the best matches.

    Filters stopwords from *q*, looks each distinct term up in the global
    posting list, scores the candidate documents with TF-IDF cosine
    similarity, keeps the 30 best per term, and prints the 25 best overall.

    Parameters:
        q: raw query string.
        cont: forwarded to stopword_remover2 (project helper; semantics
            defined elsewhere -- TODO confirm).

    Returns:
        list of the printed document IDs, best match first.

    Relies on module-level state defined elsewhere in this project:
    ``dict`` (term -> posting count; shadows the builtin), ``full_list``
    (parsed documents), ``set_alph``, ``stopword_remover2``,
    ``disconnector``, ``TFIDF`` and ``cosine_similarity``, plus the
    ``posting_list.txt`` file on disk.
    """
    words = stopword_remover2(q, set_alph, cont)
    print(words)
    list_mul = collections.OrderedDict()
    doc_list = []
    seen = set()
    for word in words:
        # BUGFIX: the original used `break` here, which aborted the whole
        # query on the first repeated term; duplicates should just be skipped.
        if word in seen:
            continue
        seen.add(word)
        if word not in dict:
            continue
        print(word)

        # Locate this term's slice of the posting list. Entries are laid out
        # sequentially, two lines per posting, in dict-key order.
        with open('posting_list.txt') as f:
            f_line = f.readlines()
        index = 0
        for key in dict.keys():
            if key == word:
                break
            index = index + int(dict[key]) * 2
        flag = index + int(dict[word]) * 2

        # Collect the distinct (0-based) document numbers for this term.
        temp_list = []
        while index < flag:
            doc_num = str(int(disconnector(f_line[index])) - 1)
            if doc_num not in temp_list:
                temp_list.append(doc_num)
            index = index + 2

        # Score each candidate by cosine similarity of its TF-IDF vector.
        temp_score = {}
        for doc_id in temp_list:
            doc = full_list[int(doc_id)]
            title = doc.get('Title')
            abstract = doc.get('Abstract')
            # BUGFIX: the original's second if/else overwrote the
            # Title+Abstract body with Abstract alone; a single
            # if/elif/else chain picks the body exactly once.
            if title is not None and abstract is not None:
                body = title + abstract
            elif title is not None:
                body = title
            elif abstract is not None:
                body = abstract
            else:
                body = []  # neither field present; score an empty body
            if doc.get('Authors') is not None:
                body = body + doc.get('Authors')
            (a, b) = TFIDF(dict, body, words, len(full_list))
            temp_score[doc_id] = cosine_similarity(a, b)

        # Keep at most the 30 best-scored documents for this term.
        # (Note: do not reuse `q` as a loop variable -- it is the query.)
        ranked = sorted(temp_score.items(), key=operator.itemgetter(1), reverse=True)
        for ind, sc in ranked[:30]:
            list_mul[ind] = sc

    # Merge the per-term shortlists and keep the 25 best overall.
    best = sorted(list_mul.items(), key=operator.itemgetter(1), reverse=True)[:25]
    for rank, (ind, sc) in enumerate(best, start=1):
        # BUGFIX: the original fetched the document ID BEFORE unpacking
        # (ind, sc), so every printed ID was one iteration stale.
        doc = full_list[int(ind)]
        print(rank)
        id_doc = doc.get('ID')
        print("Document ID:" + id_doc)
        doc_list.append(id_doc)
        tit = doc.get('Title')
        if tit is not None:
            print(" ".join(str(x) for x in tit))
        if doc.get('Authors') is not None:
            print('Author(s): ')
            for author in doc.get('Authors'):
                print(author)
        print("score: " + str(sc))
        print('--------------------------------------------')
    return doc_list