processed = 0
for doc in docgroup:
    if not doc:
        break
    (title, ns, sha1, text) = doc
    if ns != '0':        # only the main (article) namespace
        continue
    if not text:         # wtf
        continue
    if text[:9].lower() == '#redirect':
        continue
    processed += 1
    text = unwiki(text)
    tokens = normalise_gently(filter(good, utils.tokens(text)))
    tokens_title = normalise_gently(filter(good, utils.tokens(title)))
    round_tokens |= set(tokens_title) | set(tokens)

for w in round_tokens:
    record = bdata.records.add()
    record.key = w
    # Touch the repeated field so it is serialised, then clear it:
    # the record registers the key with an empty postings list.
    record.value.parts.append('')
    del record.value.parts[:]

t2 = time()

# Index
iserver.feedData(bdata, deadline_ms=10)
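The query side below unpickles each `record.value.parts` entry into a `(sha1, positions)` tuple, so feeding a real posting presumably looks like the sketch here. This is an assumption, not code from the source: the helper name `add_posting` and the pickle protocol are illustrative; the `records`/`key`/`value.parts` schema is taken from the surrounding code.

import cPickle

def add_posting(bdata, token, sha1, positions):
    # One record per token; one pickled (sha1, positions) tuple per
    # document the token occurs in. (Sketch; not the author's code.)
    record = bdata.records.add()
    record.key = token
    record.value.parts.append(cPickle.dumps((sha1, positions), 2))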
def __init__(self, query, mongo_cred,
             server='tcp://*****:*****',  # defaults redacted in the source
             store_path=None):
    # URI reconstructed from the format arguments; credentials were
    # redacted in the source.
    MONGO_ADDRESS = 'mongodb://{user}:{password}@{host}/{db}'.format(
        user=mongo_cred['user'],
        password=mongo_cred['password'],
        host=mongo_cred['host'],
        db=mongo_cred['db'])
    self.mongo = MongoClient(MONGO_ADDRESS)
    self.db = self.mongo[mongo_cred['db']]
    index = self.index = IndexServer(server, store_path)

    self._TIME()
    # Spell-correct each query token; each token yields a set of
    # candidate keywords after normalisation.
    query_tokens = map(self.correct_token, tokens(query))
    querysets = set([frozenset(normalise_drop(ts)) for ts in query_tokens])
    querysets = filter(lambda s: s, querysets)
    if not querysets:
        raise NotEnoughEntropy()
    self._TIME('proc')

    kw_docsets = defaultdict(lambda: frozenset())
    doc_poslists = defaultdict(lambda: defaultdict(lambda: []))
    self.freq = freq = defaultdict(lambda: Counter())
    docs = None
    for queryset in querysets:
        matched_docs = set()
        for kw in queryset:
            self._TIME()
            # Exact lookup, retried with growing deadlines.
            try:
                res = index.query(kw, max_mistakes=0, timeout=3)
            except rpcz.RpcDeadlineExceeded:
                try:
                    res = index.query(kw, max_mistakes=0, timeout=4)
                except rpcz.RpcDeadlineExceeded:
                    res = index.query(kw, max_mistakes=0, timeout=5)
            if res.exact_total == 0:
                # Nothing exact: fall back to a fuzzy (1-mistake) lookup.
                try:
                    res = index.query(kw, max_mistakes=1, timeout=3)
                except rpcz.RpcDeadlineExceeded:
                    self.extraquery_deadline = True
            self._TIME('index')

            for record in res.values:
                key = record.key
                if key in kw_docsets:
                    # This keyword's postings were already unpacked.
                    matched_docs |= kw_docsets[key]
                    continue
                data = record.value.parts
                docpostings = map(cPickle.loads, data)
                key_set = set()
                for (sha1, positions) in docpostings:
                    key_set.add(sha1)
                    matched_docs.add(sha1)
                    doc_poslists[sha1][key].append(positions)
                    freq[key][sha1] += len(positions)
                kw_docsets[key] = frozenset(key_set)
            self._TIME('proc')

        # AND across query tokens, OR across each token's variants.
        if docs is None:
            docs = matched_docs
        else:
            docs &= matched_docs
        if not docs:
            break
    self._TIME('proc')

    doc_count = Counter()
    doc_count.update({kw: len(freq[kw]) for kw in freq})
    N = self.N = self.db.articles.count()
    # BM25 IDF, floored at 0.4 so very common keywords still contribute.
    idf = {kw: max(0.4, log((N - doc_count[kw] + 0.5) /
                            (doc_count[kw] + 0.5)))
           for kw in freq}
    self.poslists = {
        sha1: merge_sorted([l
                            for klists in doc_poslists[sha1].values()
                            for l in klists])
        for sha1 in docs}
    self._TIME('proc')

    # Here comes BM25 to save the world!
    scores = []
    avg_size = self.db.service.find_one({'_id': 'avg_len'})['val']
    doc_headers = self.db.articles.find(
        {'_id': {'$in': list(docs)}, 'size': {'$gt': 0}},
        {'size': 1, 'title': 1})
    query_tokens = set([t for qs in query_tokens for t in qs])
    for d in doc_headers:
        score = 0
        sha1 = d['_id']
        size = d['size']
        title = d['title']
        for kw in freq:
            # BM25 with term frequency normalised by document size;
            # float() guards against Py2 integer division.
            rel = float(freq[kw][sha1]) / size
            m = ((rel * (k1 + 1)) /
                 (rel + k1 * (1 - b + b * float(size) / avg_size)))
            score += idf[kw] * m

        # Prioritise title matches (our own heuristic): weighted Jaccard
        # between the query bag and the title bag...
        keywords_bag = Counter(query_tokens)
        title_tokens = normalise_gently(tokens(title))
        title_bag = Counter(title_tokens)
        both = keywords_bag & title_bag
        both_c = sum(both.values())
        ratio = float(both_c) / (len(query_tokens) +
                                 len(title_tokens) - both_c)
        score += 10 * ratio

        # ...and the fraction of matched keywords appearing in the title.
        tokens_title = normalise_drop(title_tokens)
        title_set = set(tokens_title)
        both = set(freq.keys()) & title_set
        ratio = float(len(both)) / len(freq)
        score += 10 * ratio

        scores.append((sha1, score))

    self.scores = sorted(scores, key=lambda p: p[1], reverse=True)
    self._TIME('ranking')
    self.results = map(lambda p: p[0], self.scores)
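For clarity, the per-keyword contribution computed in the scoring loop can be restated as standalone functions. This is a sketch, not code from the source: it assumes `k1` and `b` are module-level constants with typical BM25 values (1.2 and 0.75). Note the variant used here: term frequency is divided by document size before entering the saturation formula.

from math import log

def bm25_term(tf, size, avg_size, idf_kw, k1=1.2, b=0.75):
    """Per-keyword BM25 contribution, mirroring the scoring loop above."""
    rel = float(tf) / size  # the variant here: length-normalised tf
    return idf_kw * (rel * (k1 + 1)) / (
        rel + k1 * (1.0 - b + b * float(size) / avg_size))

def bm25_idf(doc_count_kw, N):
    """BM25 IDF with the 0.4 floor used above."""
    return max(0.4, log((N - doc_count_kw + 0.5) / (doc_count_kw + 0.5)))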