Пример #1
0
# Base query class (parent of all query classes)
class Query(object):
    """Base class that every query type derives from.

    Subclasses are expected to override ``get_matches``.
    """

    def __init__(self):
        """Create a query; the base class stores no state."""

    def get_matches(self, index):
        """Return the IDs of the documents matching this query.

        Not implemented yet: currently returns ``None`` whatever ``index`` is.
        """
        # TODO: return all document IDs (as a set) from the index
        return None

# Query containing a single search term
class TermQuery(Query):
    """Query made of exactly one search term."""

    def __init__(self, term):
        """Remember the search term this query looks up."""
        self.term = term

    def get_matches(self, index):
        """Return the IDs of the documents that contain the term.

        Not implemented yet: currently returns ``None`` whatever ``index`` is.
        """
        # TODO: return all document IDs (as a set) that contain the search term
        return None


# Load the index from its file on disk.
# NOTE(review): `Index` is not defined or imported in the visible code --
# confirm it is provided elsewhere before running this script.
index = Index()
index.load_from_file("data/index.txt")

# TODO: construct the following (or similar) queries and get their results
# by calling get_matches(index) on each of them:
# - "states"
# - "NOT washington"
# - "united AND states"
# - "(us OR (united AND states)) AND NOT washington"
Пример #2
0
# Base query class (parent of all query classes)
class Query(object):
    """Common parent of all query classes.

    Concrete query types are expected to override ``get_matches``.
    """

    def __init__(self):
        """Initialise the query; nothing is stored on the base class."""

    def get_matches(self, index):
        """Return the document IDs that match this query.

        Still a stub: the result is ``None`` for any ``index``.
        """
        # TODO: return all document IDs (as a set) from the index
        return None


# Query containing a single search term
class TermQuery(Query):
    """Query consisting of a single search term."""

    def __init__(self, term):
        """Store the search term on the instance."""
        self.term = term

    def get_matches(self, index):
        """Return the document IDs whose documents contain the term.

        Still a stub: the result is ``None`` for any ``index``.
        """
        # TODO: return all document IDs (as a set) that contain the search term
        return None


# Load the index from its file on disk.
# NOTE(review): `Index` is not defined or imported in the visible code --
# confirm it is provided elsewhere before running this script.
index = Index()
index.load_from_file("data/index.txt")

# TODO: construct the following (or similar) queries and get their results
# by calling get_matches(index) on each of them:
# - "states"
# - "NOT washington"
# - "united AND states"
# - "(us OR (united AND states)) AND NOT washington"
Пример #3
0
            freq = int(p.payload)
            doclen = index.get_doc_meta(doc_id)['length']
            wtd = tfidf(index, t, freq, doclen)
            scores[doc_id] += wtq * wtd
            doc_norm[doc_id] += wtd * wtd

    # `scores` at this point holds the numerator of the cosine formula;
    # we need to normalize it by dividing by sqrt(q_norm * doc_norm)
    for doc_id, score in scores.iteritems():
        scores[doc_id] = scores[doc_id] / math.sqrt(q_norm * doc_norm[doc_id])

    return scores


if __name__ == "__main__":

    # Build the index from the index file and the document metadata file
    index = Index()
    index.load_from_file("../data/index.txt", "../data/meta.txt")

    # Free-text query to rank the documents against
    query = "financial japan world news"

    # Score the documents with the vector space model
    res = retrieve_vsm(index, query)

    # Keep the 10 highest-scoring document IDs, best first
    top_doc_ids = sorted(res, key=res.get, reverse=True)[:10]

    # Print one line per result: relevance score, a space, then the title
    for doc_id in top_doc_ids:
        docmeta = index.get_doc_meta(doc_id)
        print("%s %s" % (res[doc_id], docmeta['title']))