def test_generate(self):
    cfg = {
        "path": "index.md",
        "sort_by": "title",
        "group_by": "tag",
        "insert_toc": 1,
        "insert_hook": 1,
        "reverse_sort": 0,
        "key_filter": 0
    }
    out = Index.generate(self.itens, IndexConfig(cfg))
    expected = """
## Links
- [ed](#ed)
- [fup](#fup)
- [poo](#poo)

## ed
- [@2 d teste 2](b/2/2.md#d-teste-2-ed)
- [@5 f teste 5](b/5/5.md#f-teste-5-ed)

## fup
- [@0 a teste 0](b/0/0.md#a-teste-0-fup-sub-one) [one]

## poo
- [@4 b teste 4](b/4/4.md#b-teste-4-poo)
- [@1 c teste 1](b/1/1.md#c-teste-1-poo)
- [@3 e teste 3](b/3/3.md#e-teste-3-poo)
"""
    self.assertEqual(out, expected)
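The test above relies on a `self.itens` fixture defined elsewhere. Reading the expected output backwards, a hedged sketch of what that fixture might look like follows; the field names are illustrative guesses, not the project's confirmed schema (item @0 apparently also carries a sub-tag `one`, per the `fup-sub-one` anchor).

# Hedged sketch of the fixture implied by the expected output above;
# field names ('key', 'title', 'tag', 'link') are illustrative guesses.
def setUp(self):
    self.itens = [
        {'key': 0, 'title': 'a teste 0', 'tag': 'fup', 'link': 'b/0/0.md'},  # plus sub-tag 'one'
        {'key': 1, 'title': 'c teste 1', 'tag': 'poo', 'link': 'b/1/1.md'},
        {'key': 2, 'title': 'd teste 2', 'tag': 'ed', 'link': 'b/2/2.md'},
        {'key': 3, 'title': 'e teste 3', 'tag': 'poo', 'link': 'b/3/3.md'},
        {'key': 4, 'title': 'b teste 4', 'tag': 'poo', 'link': 'b/4/4.md'},
        {'key': 5, 'title': 'f teste 5', 'tag': 'ed', 'link': 'b/5/5.md'},
    ]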
# Base query class (parent of all query classes)
class Query(object):
    def __init__(self):
        pass

    def get_matches(self, index):
        # TODO: return all document IDs (as a set) from the index
        pass


# Query containing a single search term
class TermQuery(Query):
    def __init__(self, term):
        self.term = term

    def get_matches(self, index):
        # TODO: return all document IDs (as a set) that contain the search term
        pass


# Load index
index = Index()
index.load_from_file("data/index.txt")

# TODO: construct the following (or similar) queries and get results
# - "states"
# - "NOT washington"
# - "united AND states"
# - "(us OR (united AND states)) AND NOT washington"
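The TODOs above are left as an exercise; a hedged sketch of one possible completion follows. The `get_doc_ids` and `get_postings` accessors (and the `doc_id` attribute on postings) are assumed names, not the confirmed Index API, and the composed query reuses the NotQuery/BooleanQuery classes that appear later in this file.

# Hedged sketch of one way to fill in the TODOs; `get_doc_ids` and
# `get_postings` are assumed Index accessors, not confirmed API.
class Query(object):
    def get_matches(self, index):
        # The base query matches every document in the index.
        return set(index.get_doc_ids())  # assumed accessor


class TermQuery(Query):
    def __init__(self, term):
        self.term = term

    def get_matches(self, index):
        # Documents containing the term are those on its postings list.
        return set(p.doc_id for p in index.get_postings(self.term))  # assumed accessors


# The last TODO query can then be composed from the NotQuery/BooleanQuery
# classes used later in this file:
q = BooleanQuery("AND", [
    BooleanQuery("OR", [
        TermQuery("us"),
        BooleanQuery("AND", [TermQuery("united"), TermQuery("states")]),
    ]),
    NotQuery(TermQuery("washington")),
])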
            # Inside retrieve_vsm: accumulate scores for each posting p of query term t
            freq = int(p.payload)
            doclen = index.get_doc_meta(doc_id)['length']
            wtd = tfidf(index, t, freq, doclen)
            scores[doc_id] += wtq * wtd
            doc_norm[doc_id] += wtd * wtd

    # `scores` at this point holds the numerator of the cosine formula;
    # normalize by dividing by sqrt(q_norm * doc_norm).
    for doc_id, score in scores.items():
        scores[doc_id] = score / math.sqrt(q_norm * doc_norm[doc_id])
    return scores


if __name__ == "__main__":
    # Load index
    index = Index()
    index.load_from_file("../data/index.txt", "../data/meta.txt")

    # Input query
    query = "financial japan world news"

    # Retrieve documents using the vector space model
    res = retrieve_vsm(index, query)

    # Print relevance scores and document titles for the top 10 results
    for doc_id in sorted(res, key=res.get, reverse=True)[:10]:
        docmeta = index.get_doc_meta(doc_id)
        print(res[doc_id], docmeta['title'])
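`retrieve_vsm` calls a `tfidf` helper defined elsewhere. As a hedged sketch of a standard length-normalized tf-idf weight (the project's actual weighting may differ), where `get_doc_freq` and `get_num_docs` are assumed Index accessors:

import math

# Hedged sketch of the tf-idf weight used above; `get_doc_freq` and
# `get_num_docs` are assumed Index accessors, not confirmed API.
def tfidf(index, term, freq, doclen):
    tf = freq / float(doclen)  # length-normalized term frequency
    df = index.get_doc_freq(term)  # number of documents containing the term
    idf = math.log(index.get_num_docs() / float(df)) if df else 0.0
    return tf * idf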
def main(args, loader=None, picker_cls=Picker):
    """
    :param args: command-line arguments
    :param loader: loader class
    :param picker_cls: picker class
    :return:
    """
    shorter_esc_delay()
    index = Index()
    picker = picker_cls(args=args)
    picker.index = index
    if args.debug:
        picker.do_debug = True

    if not sys.stdin.isatty():
        # Read lines piped in on stdin into the index; stop at EOF
        # without indexing the empty sentinel line.
        while True:
            stdin_line = sys.stdin.readline()
            if not stdin_line:
                break
            picker.index.add(stdin_line)
    else:
        if loader:
            picker.loader = loader
            picker.load_lines()
        elif args.input:
            file_loader = FileLoader(args.input)
            picker.loader = file_loader
            picker.load_lines()
        else:
            history_loader = HistoryLoader()
            picker.loader = history_loader
            picker.load_lines()

    # Reattach stdin to the terminal so curses can read key presses.
    f = open("/dev/tty")
    os.dup2(f.fileno(), 0)

    picker.win = curses.initscr()
    curses.noecho()
    curses.start_color()
    curses.init_pair(1, curses.COLOR_WHITE, curses.COLOR_BLUE)
    picker.win.timeout(-1)
    picker.win.keypad(1)

    max_y, max_x = picker.get_max_viewport()
    picker.last_lines = picker.index.last_lines[0:max_y]
    logger.debug("lastlines %s", picker.last_lines)
    try:
        picker.refresh_window("")
        # thread.start_new_thread(picker.cursor_blink, ())
        while True:
            char = picker.win.getch()
            picker.key_pressed(char)
    except (KeyboardInterrupt, SystemExit, QuitException):
        pass
    finally:
        picker.win.keypad(0)
        curses.nocbreak()
        curses.echo()
        curses.endwin()
    if picker.do_print:
        print(picker.last_lines[picker.selected_lineno][1])
import pickle

import pandas as pd
from scipy.sparse import hstack


class MyApp:
    def __init__(self, name=''):
        self._name = name
        print('the app is initialized')
        # self.say_name()
        self.index = Index()
        # Note: `error_bad_lines` was removed in pandas 2.0; newer versions
        # use on_bad_lines='skip' instead.
        self.df_amazon = pd.read_csv(
            'Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv',
            error_bad_lines=False, encoding='utf-8-sig')
        num_files = self.index.index_product(self.df_amazon)
        print("indexed %d files" % num_files)
        self.index.index_review(self.df_amazon)

        # init predict
        filename1 = "xgboost.pkl"
        filename2 = "randomForest.pkl"  # currently unused
        filename3 = "word_vectorize.pkl"
        filename4 = "char_vectorize.pkl"
        with open(filename1, "rb") as f1:
            self.xgboostModel = pickle.load(f1)
        with open(filename3, "rb") as f2:
            self.word_vectorizer = pickle.load(f2)
        with open(filename4, "rb") as f3:
            self.char_vectorizer = pickle.load(f3)

    def say_name(self):
        print('my name is {0}'.format(self._name))

    def get_name(self):
        return self._name

    def search_product(self, query):
        # dataList = []
        # for i in range(10):
        #     dataList.append({'id': i + 1, 'name': '{0}-{1}'.format(arg, i + 1)})
        return self.index.search_product(query)

    def get_product_info(self, id):
        info = dict()
        # reviews = [{'id': i, 'text': '{0} is good. [{1}]'.format(id, i)} for i in range(5)]
        labels = self.index.top_frequency_words(id)
        # info['topReviews'] = reviews
        info['labels'] = labels
        review_txt = default_reviews(id, self.df_amazon)
        info['topReviews'] = [{'text': txt} for txt in review_txt]
        # print(info['topReviews'])
        return info

    def search_review(self, id, term):
        # dummy = '{0} of {1} is good'.format(term, id)
        # reviews = []
        # for i in range(10):
        #     reviews.append({'id': i, 'text': '{0}-[{1}]'.format(dummy, i)})
        review_text = self.index.search_review(id, term)
        print('# of reviews={0}'.format(len(review_text)))
        reviews = [{'text': txt} for txt in review_text]
        return reviews

    def predict_score(self, id, text):
        # Vectorize the review text with word- and char-level features,
        # then score it with the pre-trained XGBoost model.
        test1 = [text]
        test1_word = self.word_vectorizer.transform(test1)
        test1_char = self.char_vectorizer.transform(test1)
        test1_features = hstack([test1_char, test1_word])
        test1_features = test1_features.tocsr()
        result = self.xgboostModel.predict(test1_features)
        return {'score': float(result)}
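A minimal usage sketch, assuming the CSV file and the pickled models listed in `__init__` are present in the working directory; the query string and product id below are examples only, not values from the source.

# Hedged usage sketch: requires the CSV and .pkl files loaded in __init__.
app = MyApp('demo')
print(app.search_product('echo'))  # 'echo' is an example query
print(app.predict_score(1, 'Great sound quality and easy setup.'))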
# Retrieve documents matching a query and display the number of results.
# If there are fewer than 20 matching documents, list the titles of the matches.
def run_query(query, index):
    print("Query: ", query.to_string())
    docs = query.get_matches(index)
    print("Results: ", len(docs))
    if len(docs) < 20:
        print("---- RESULTS ----")
        for doc_id in docs:
            docmeta = index.get_doc_meta(doc_id)
            print(docmeta['date'], docmeta['title'])


# Load index
index = Index()
index.load_from_file("../data/index.txt", "../data/meta.txt")

# Term query: `states`
q1 = TermQuery("states")
run_query(q1, index)

# NOT query: `NOT (washington)`
q2 = NotQuery(TermQuery("washington"))
run_query(q2, index)

# AND query: `(united AND states)`
q3 = BooleanQuery("AND", [TermQuery("united"), TermQuery("states")])
run_query(q3, index)

# OR query: `(us OR (united AND states))`
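The snippet breaks off at the OR query comment. Following the q1-q3 pattern above, a plausible continuation (an inference, not the original code) would be:

# Plausible continuation inferred from the q1-q3 pattern (not in the source):
q4 = BooleanQuery("OR", [TermQuery("us"), q3])
run_query(q4, index)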
import unittest
from collections import defaultdict


class IndexerTests(unittest.TestCase):
    """
    Test case for the Index class.
    """

    def setUp(self):
        """
        Set up the index that will be subjected to the tests.
        """
        self.index = Index(sample_stop_words())

    def test_sample_index_words_count(self):
        sample = Indexable(1, "this is an indexable metadata, that is an indexable super metadata")

        expected_words_count = defaultdict(int)
        expected_words_count['this'] = 1
        expected_words_count['that'] = 1
        expected_words_count['super'] = 1
        expected_words_count['is'] = 2
        expected_words_count['metadata,'] = 1  # Exact terms are not yet processed.
        expected_words_count['metadata'] = 1
        expected_words_count['indexable'] = 2
        expected_words_count['an'] = 2

        # assertCountEqual is the Python 3 name for assertItemsEqual.
        self.assertCountEqual(sample.words_count, expected_words_count)

    def test_sample_indexing_and_validate_items(self):
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")

        self.index.build_index([sample1, sample2, sample3])

        expected_term_index = defaultdict(int)
        expected_term_index['indexable'] = [0, 1, 2]

        self.assertCountEqual(self.index.term_index['indexable'],
                              expected_term_index['indexable'])

    def test_invalid_term_search(self):
        """
        Test that the search returns no results when the term is not found.
        """
        sample1 = Indexable(1, "this is an indexable simple metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")
        expected_indices = []

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["not_valid_term"])

        self.assertCountEqual(search_results, expected_indices)

    def test_mixed_valid_invalid_term_search(self):
        """
        Test that the search returns no results when valid and invalid terms are mixed.
        """
        sample1 = Indexable(1, "this is an indexable simple metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")
        expected_indices = []

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["not_valid_term", "super"])

        self.assertCountEqual(search_results, expected_indices)

    def test_one_term_search(self):
        """
        Test that a search for one term returns the expected results.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable super metadata")
        expected_indices = [1, 2]

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["super"])

        self.assertCountEqual(search_results, expected_indices)

    def test_stop_word_search(self):
        """
        Test that stop words are correctly ignored.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable super metadata")
        expected_indices = []

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["this"])

        self.assertCountEqual(search_results, expected_indices)

    def test_two_terms_search(self):
        """
        Test that a search for two terms returns the expected results.
        """
        sample1 = Indexable(1, "this is an indexable simple metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable super metadata")
        expected_indices = [1, 2]

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["indexable", "super"])

        self.assertCountEqual(search_results, expected_indices)

    def test_three_terms_search_with_stop_words(self):
        """
        Test that a search mixing stop words with a regular term returns the expected results.
        """
        sample1 = Indexable(1, "this is an indexable simple metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable super metadata")
        expected_indices = [0, 1, 2]

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["this", "is", "metadata"])

        self.assertCountEqual(search_results, expected_indices)