def tokens(string, ilist=None):
    # Slice the token text back out of `string` using (start, end)
    # offset pairs, computing the offsets with itokenise if none
    # were supplied.
    if ilist is None:
        ilist = itokenise(string)
    return [string[f:t] for f, t in ilist]
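A quick illustration of how the helper behaves, assuming itokenise yields (start, end) character offsets for each token; the example offsets below are illustrative, not the real tokeniser's output:

>>> list(itokenise('free as in beer'))
[(0, 4), (5, 7), (8, 10), (11, 15)]
>>> tokens('free as in beer')
['free', 'as', 'in', 'beer']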
from collections import Counter, defaultdict

postings = defaultdict(list)
bdata = index_pb.BuilderData()
docs = []
for doc in docgroup:
    if not doc:
        break
    (title, ns, sha1, text) = doc
    # Index only main-namespace (ns 0) articles.
    if ns != '0':
        continue
    if not text:
        continue  # wtf
    # Skip redirect stubs outright.
    if text[:9].lower() == '#redirect':
        continue
    text = unwiki(text)
    itokens = list(itokenise(text))
    itokens_title = list(itokenise(title))
    tokens = normalise(utils.tokens(text, itokens))
    tokens_title = negate_tokens(
        normalise(utils.tokens(title, itokens_title)))
    tokens_all = tokens_title + tokens
    if not tokens_all:
        continue
    # Per-document term counts and position lists.
    article_tokens = Counter()
    thisdoc_postings = defaultdict(list)
    for i, w in tokens_all:
        article_tokens[w] += 1
        thisdoc_postings[w].append(i)
    # Fold this document's position lists into the global postings,
    # keyed by the article's sha1.
    for w, l in thisdoc_postings.iteritems():
        postings[w].append((sha1, l))
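The finished postings structure maps each normalised term to a list of (article sha1, position list) pairs. As a rough sketch of what negate_tokens is presumably for (its real implementation isn't shown here), title-token positions could be flipped into a negative range so that title hits stay distinguishable from body hits once the two lists are concatenated into tokens_all:

from collections import defaultdict

def negate_tokens(toks):
    # Assumed behaviour, inferred from the name only: map title-token
    # positions to negative values so a query layer can tell title
    # hits from body hits in one combined position list.
    return [(-(i + 1), w) for i, w in toks]

# Resulting index shape: term -> [(article sha1, positions), ...];
# the sha1 and positions below are made up for illustration.
postings = defaultdict(list)
postings['beer'].append(('da39a3ee5e6b...', [-1, 4, 97]))
print(postings['beer'])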