Example #1
File: tests.py Project: senapk/indexer
    def test_generate(self):
        cfg = {
            "path": "index.md",
            "sort_by": "title",
            "group_by": "tag",
            "insert_toc": 1,
            "insert_hook": 1,
            "reverse_sort": 0,
            "key_filter": 0
        }

        out = Index.generate(self.itens, IndexConfig(cfg))
        expected = """
## Links
- [ed](#ed)
- [fup](#fup)
- [poo](#poo)

## ed

- [@2 d teste 2](b/2/2.md#d-teste-2-ed)
- [@5 f teste 5](b/5/5.md#f-teste-5-ed)

## fup

- [@0 a teste 0](b/0/0.md#a-teste-0-fup-sub-one) [one]

## poo

- [@4 b teste 4](b/4/4.md#b-teste-4-poo)
- [@1 c teste 1](b/1/1.md#c-teste-1-poo)
- [@3 e teste 3](b/3/3.md#e-teste-3-poo)
"""
        self.assertEqual(out, expected)
Example #2
# Base query class (parent of all query classes)
class Query(object):
    def __init__(self):
        pass

    def get_matches(self, index):
        # TODO: return all document IDs (as a set) from the index
        pass


# Query containing a single search term
class TermQuery(Query):
    def __init__(self, term):
        self.term = term

    def get_matches(self, index):
        # TODO: return all document IDs (as a set) that contain the search term
        pass


# Load index
index = Index()
index.load_from_file("data/index.txt")

# TODO, construct the following (or similar) queries and get results
# - "states"
# - "NOT washington"
# - "united AND states"
# - "(us OR (united AND states)) AND NOT washington"
Example #3
            freq = int(p.payload)
            doclen = index.get_doc_meta(doc_id)['length']
            wtd = tfidf(index, t, freq, doclen)
            scores[doc_id] += wtq * wtd
            doc_norm[doc_id] += wtd * wtd

    # `scores` at this point holds the numerator of the cosine formula;
    # we still need to normalize, dividing by sqrt(q_norm * doc_norm)
    for doc_id, score in scores.items():
        scores[doc_id] = score / math.sqrt(q_norm * doc_norm[doc_id])

    return scores


if __name__ == "__main__":

    # Load index
    index = Index()
    index.load_from_file("../data/index.txt", "../data/meta.txt")

    # Input query
    query = "financial japan world news"

    # Retrieve documents using the vector space model
    res = retrieve_vsm(index, query)

    # Print relevance scores and document titles for the top 10 results
    for doc_id in sorted(res, key=res.get, reverse=True)[:10]:
        docmeta = index.get_doc_meta(doc_id)
        print(res[doc_id], docmeta['title'])
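
The helper tfidf(index, t, freq, doclen) is called above but not defined in this snippet. A minimal sketch of a standard length-normalized tf-idf weight, assuming hypothetical index accessors num_docs() and doc_freq(term) (neither name is confirmed by the source):

import math

# A sketch only; the original tfidf() may compute a different weighting.
def tfidf(index, term, freq, doclen):
    tf = freq / float(doclen)                # length-normalized term frequency
    df = index.doc_freq(term)                # number of documents containing the term
    idf = math.log(index.num_docs() / float(df))  # inverse document frequency
    return tf * idf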
Example #4
File: hst.py Project: aliahmet/hst
def main(args, loader=None, picker_cls=Picker):
    """

    :param args: commandline arguments
    :param loader: loader class
    :param picker_cls: picker class
    :return:
    """
    shorter_esc_delay()
    index = Index()

    picker = picker_cls(args=args)
    picker.index = index

    if args.debug:
        picker.do_debug = True

    if not sys.stdin.isatty():
        while True:
            stdin_line = sys.stdin.readline()
            if not stdin_line:
                break
            picker.index.add(stdin_line)
    else:
        if loader:
            picker.loader = loader
            picker.load_lines()
        elif args.input:
            file_loader = FileLoader(args.input)
            picker.loader = file_loader
            picker.load_lines()
        else:
            history_loader = HistoryLoader()
            picker.loader = history_loader
            picker.load_lines()

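    # Reattach stdin to the controlling terminal so curses can read
    # keystrokes after the piped input has been consumed.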
    f = open("/dev/tty")
    os.dup2(f.fileno(), 0)

    picker.win = curses.initscr()
    curses.noecho()
    curses.start_color()

    curses.init_pair(1, curses.COLOR_WHITE, curses.COLOR_BLUE)
    picker.win.timeout(-1)
    picker.win.keypad(1)

    max_y, max_x = picker.get_max_viewport()

    picker.last_lines = picker.index.last_lines[0:max_y]
    logger.debug("lastlines %s", picker.last_lines)

    try:
        picker.refresh_window("")
        # thread.start_new_thread( picker.cursor_blink,())
        while True:
            char = picker.win.getch()
            picker.key_pressed(char)
    except (KeyboardInterrupt, SystemExit, QuitException):
        pass
    finally:
        picker.win.keypad(0)
        curses.nocbreak()
        curses.echo()
        curses.endwin()
        if picker.do_print:
            print(picker.last_lines[picker.selected_lineno][1])
Example #5
class MyApp:
    def __init__(self, name=''):
        self._name = name
        print('the app is initialized')
        # self.say_name()
        self.index = Index()
        self.df_amazon = pd.read_csv(
            'Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv',
            error_bad_lines=False, encoding='utf-8-sig')
        num_files = self.index.index_product(self.df_amazon)
        print("indexed %d files" % num_files)
        self.index.index_review(self.df_amazon)

        # init predict
        filename1 = "xgboost.pkl"
        filename2 = "randomForest.pkl"
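        # note: "randomForest.pkl" is assigned here but never loaded below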
        filename3 = "word_vectorize.pkl"
        filename4 = "char_vectorize.pkl"
        with open(filename1, "rb") as f1:
            self.xgboostModel = pickle.load(f1)
        with open(filename3, "rb") as f2:
            self.word_vectorizer = pickle.load(f2)
        with open(filename4, "rb") as f3:
            self.char_vectorizer = pickle.load(f3)

    def say_name(self):
        print('my name is {0}'.format(self._name))

    def get_name(self):
        return self._name

    def search_product(self, query):
        # dataList = []
        # for i in range(10):
        #     dataList.append({'id': i+1, 'name': '{0}-{1}'.format(arg, i+1)})
        return self.index.search_product(query)

    def get_product_info(self, id):
        info = dict()
        # reviews = [{'id': i, 'text': '{0} is good. [{1}]'.format(id, i)}  for i in range(5)]
        labels = self.index.top_frequency_words(id)
        # info['topReviews'] = reviews
        info['labels'] = labels
        review_txt = default_reviews(id, self.df_amazon)
        info['topReviews'] = [{'text': txt} for txt in review_txt]
        # print(info['topReviews'])
        return info

    def search_review(self, id, term):
        # dummy = '{0} of {1} is good'.format(term, id)
        # reviews = []
        # for i in range(10):
        #     reviews.append({'id': i, 'text': '{0}-[{1}]'.format(dummy, i)})
        review_text = self.index.search_review(id, term)
        print('# of reviews={0}'.format(len(review_text)))
        reviews = [{'text': txt} for txt in review_text]
        return reviews

    def predict_score(self, id, text):
        test1 = text
        test1 = [test1]
        test1_word = self.word_vectorizer.transform(test1)
        test1_char = self.char_vectorizer.transform(test1)
        test1_features = hstack([test1_char, test1_word])
        test1_features = test1_features.tocsr()
        result = self.xgboostModel.predict(test1_features)
        return {'score': float(result)}
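
The class above loads a pre-fitted model and vectorizers from pickle files. A hedged sketch of how those artifacts might be produced; the model type, vectorizer parameters, and toy data below are illustrative assumptions, not taken from the source.

import pickle
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBRegressor  # assumed; the pickled model type is not shown

# Toy training data; the real pipeline would use the Amazon reviews CSV.
texts = ["great product, works well", "terrible, broke after a day"]
scores = [5.0, 1.0]

# Fit word- and character-level vectorizers (parameters are illustrative).
word_vectorizer = TfidfVectorizer(analyzer='word')
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
features = hstack([char_vectorizer.fit_transform(texts),
                   word_vectorizer.fit_transform(texts)]).tocsr()

model = XGBRegressor(n_estimators=10)
model.fit(features, scores)

# Persist the artifacts under the names MyApp.__init__ expects.
for name, obj in [("xgboost.pkl", model),
                  ("word_vectorize.pkl", word_vectorizer),
                  ("char_vectorize.pkl", char_vectorizer)]:
    with open(name, "wb") as f:
        pickle.dump(obj, f)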
Example #6
# Retrieve documents matching a query and display the number of results.
# If there are fewer than 20 matching documents, list the titles of the matches.
def run_query(query, index):
    print("Query:   ", query.to_string())
    docs = query.get_matches(index)
    print("Results: ", len(docs))
    if len(docs) < 20:
        print("---- RESULTS ----")
        for doc_id in docs:
            docmeta = index.get_doc_meta(doc_id)
            print(docmeta['date'], docmeta['title'])


# Load index
index = Index()
index.load_from_file("../data/index.txt", "../data/meta.txt")

# Term query: `states`
q1 = TermQuery("states")
run_query(q1, index)

# NOT query: `NOT (washington)`
q2 = NotQuery(TermQuery("washington"))
run_query(q2, index)

# AND query: `(united AND states)`
q3 = BooleanQuery("AND", [TermQuery("united"), TermQuery("states")])
run_query(q3, index)

# OR query: `(us OR (united AND states))`
q4 = BooleanQuery("OR", [TermQuery("us"),
                         BooleanQuery("AND", [TermQuery("united"), TermQuery("states")])])
run_query(q4, index)
Example #7
class IndexerTests(unittest.TestCase):
    """
    Test case for Index class.
    """

    def setUp(self):
        """
        Setup index that will be subjected to the tests.
        """
        self.index = Index(sample_stop_words())

    def test_sample_index_words_count(self):
        sample = Indexable(1, "this is an indexable metadata, that is an indexable super metadata")

        expected_words_count = defaultdict(int)
        expected_words_count['this'] = 1
        expected_words_count['that'] = 1
        expected_words_count['super'] = 1
        expected_words_count['is'] = 2
        expected_words_count['metadata,'] = 1  # punctuation is not stripped yet, so 'metadata,' counts as its own term
        expected_words_count['metadata'] = 1
        expected_words_count['indexable'] = 2
        expected_words_count['an'] = 2

        self.assertCountEqual(sample.words_count, expected_words_count)

    def test_sample_indexing_and_validate_items(self):
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")

        self.index.build_index([sample1, sample2, sample3])

        expected_term_index = defaultdict(list)
        expected_term_index['indexable'] = [0, 1, 2]

        self.assertCountEqual(self.index.term_index['indexable'], expected_term_index['indexable'])

    def test_invalid_term_search(self):
        """
        Test that the search returns no results when the term is not found.
        """
        sample1 = Indexable(1, "this is an indexable simple metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")

        expected_indices = []

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["not_valid_term"])

        self.assertCountEqual(search_results, expected_indices)

    def test_mixed_valid_invalid_term_search(self):
        """
        Test that the search returns no results when valid and invalid terms are mixed.
        """
        sample1 = Indexable(1, "this is an indexable simple metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")

        expected_indices = []

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["not_valid_term", "super"])

        self.assertCountEqual(search_results, expected_indices)

    def test_one_term_search(self):
        """
        Test if the search for one term returns expected results.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable super metadata")

        expected_indices = [1, 2]

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["super"])

        self.assertCountEqual(search_results, expected_indices)

    def test_stop_word_search(self):
        """
        Test if stop words are correctly ignored.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable super metadata")

        expected_indices = []

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["this"])

        self.assertCountEqual(search_results, expected_indices)

    def test_two_terms_search(self):
        """
        Test if the search for two terms returns expected results.
        """
        sample1 = Indexable(1, "this is an indexable simple metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable super metadata")

        expected_indices = [1, 2]

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["indexable", "super"])

        self.assertCountEqual(search_results, expected_indices)

    def test_three_terms_search_with_stop_words(self):
        """
        Test if a search mixing stop words and a valid term returns expected results.
        """
        sample1 = Indexable(1, "this is an indexable simple metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable super metadata")

        expected_indices = [0, 1, 2]

        self.index.build_index([sample1, sample2, sample3])
        search_results = self.index.search_terms(["this", "is", "metadata"])

        self.assertCountEqual(search_results, expected_indices)
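
For reference, a minimal sketch of an Index that would satisfy the tests above. The internals and attribute names (e.g. Indexable.metadata for the raw text) are assumptions, not the project's actual code.

from collections import defaultdict

# A sketch only; not senapk/indexer's real implementation.
class Index:
    def __init__(self, stop_words):
        self.stop_words = set(stop_words)
        self.term_index = defaultdict(list)  # term -> positions in the indexed list

    def build_index(self, indexables):
        for position, doc in enumerate(indexables):
            for word in set(doc.metadata.split()):  # assumed attribute holding the raw text
                if word not in self.stop_words:
                    self.term_index[word].append(position)

    def search_terms(self, terms):
        # Intersection over non-stop-word terms; stop words are ignored,
        # and any unknown term empties the result set.
        results = None
        for term in terms:
            if term in self.stop_words:
                continue
            postings = set(self.term_index.get(term, []))
            results = postings if results is None else results & postings
        return sorted(results) if results else []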