def intersect_with_skips(p1, p2):
    """Intersect two postings lists using skip pointers (IIR figure 2.10).

    Each entry is read as ``entry[0][0]`` for its doc ID. An entry of
    length 2 carries a skip pointer in ``entry[1]``, which is used as an
    index into the same postings list.

    NOTE(review): the merge presumably relies on both lists being sorted
    by ascending doc ID -- confirm against the code that builds them.

    Args:
        p1: first postings list.
        p2: second postings list.

    Returns:
        ``[]`` when either input is empty or no doc ID is shared.
        Otherwise the shared doc IDs are passed to
        ``index.generate_skip_list`` (with the module-level
        ``always_insert_skips``) and every element of its result is
        wrapped in a one-element list.
    """
    if not p1 or not p2:
        return []

    def next_position(postings, ptr, target):
        # Follow skip pointers for as long as the skip target's doc ID does
        # not overshoot ``target``; if no skip could be taken, step by one.
        # Doc IDs ([0][0]) are compared on both sides: comparing the inner
        # list itself against a scalar doc ID never gives a meaningful
        # ordering, which would leave the skip pointers unused.
        skipped = False
        while (len(postings[ptr]) == 2
               and postings[postings[ptr][1]][0][0] <= target):
            ptr = postings[ptr][1]
            skipped = True
        if skipped:
            return ptr
        return ptr + 1

    answer = []
    ptr1 = 0
    ptr2 = 0
    while ptr1 != len(p1) and ptr2 != len(p2):
        doc1 = p1[ptr1][0][0]
        doc2 = p2[ptr2][0][0]
        if doc1 == doc2:
            answer.append(doc1)
            ptr1 += 1
            ptr2 += 1
        elif doc1 < doc2:
            ptr1 = next_position(p1, ptr1, doc2)
        else:
            ptr2 = next_position(p2, ptr2, doc1)

    if not answer:
        return []
    return [[e] for e in index.generate_skip_list(answer, always_insert_skips)]
def union_with_skips(p1, p2):
    """Union two postings lists whose entries keep the doc ID at ``entry[0]``.

    NOTE(review): a later definition of ``union_with_skips`` in this file
    reads ``entry[0][0]`` instead and shadows this one at import time --
    confirm which entry layout is current.

    Args:
        p1: first postings list.
        p2: second postings list.

    Returns:
        The result of ``index.generate_skip_list`` applied to the sorted,
        de-duplicated doc IDs of both lists (with the module-level
        ``always_insert_skips``).
    """
    doc_ids = set(entry[0] for entry in p1)
    doc_ids.update(entry[0] for entry in p2)
    # sorted() returns a list on every Python version, whereas
    # dict.keys() followed by .sort() only works on Python 2.
    answer = sorted(doc_ids)
    return index.generate_skip_list(answer, always_insert_skips)
def union_with_skips(p1, p2):
    """Union two postings lists whose entries keep the doc ID at ``entry[0][0]``.

    Args:
        p1: first postings list.
        p2: second postings list.

    Returns:
        The sorted, de-duplicated doc IDs of both lists are passed to
        ``index.generate_skip_list`` (with the module-level
        ``always_insert_skips``); every element of its result is then
        wrapped in a one-element list.
    """
    doc_ids = set(entry[0][0] for entry in p1)
    doc_ids.update(entry[0][0] for entry in p2)
    # sorted() returns a list on every Python version, whereas
    # dict.keys() followed by .sort() only works on Python 2.
    answer = sorted(doc_ids)
    with_skips = index.generate_skip_list(answer, always_insert_skips)
    return [[e] for e in with_skips]
def process_NOT(a):
    """Return the postings list of every document that is NOT in ``a``.

    The doc IDs of all documents are loaded into the module-level
    ``all_fids`` on the first call (from the first ``all_length`` units at
    the start of the open postings file ``f``) and reused afterwards.

    Args:
        a: either a postings list (a ``list`` whose entries keep the doc
            ID at ``entry[0][0]``) or a key of ``word_dict``; for a key,
            ``word_dict[a][1]`` is used as the seek offset into ``f`` and
            ``word_dict[a][2]`` as the amount to read.

    Returns:
        The doc IDs in ``all_fids`` that do not occur in ``a`` are passed
        to ``index.generate_skip_list`` (with the module-level
        ``always_insert_skips``); every element of its result is wrapped
        in a one-element list.
    """
    global all_fids
    if all_fids is None:
        # NOTE(review): looks like leftover debug output -- confirm whether
        # anything depends on this line being printed.
        print(all_length)
        f.seek(0)
        all_fids = [pair[0][0] for pair in singleline(f.read(all_length))]

    if not isinstance(a, list):
        entry = word_dict[a]
        f.seek(entry[1])
        a_fids = [pair[0][0] for pair in singleline(f.read(entry[2]))]
    else:
        a_fids = [pair[0][0] for pair in a]

    # Set membership keeps the complement linear in len(all_fids); a list
    # comprehension also yields a real list on every Python version.
    excluded = set(a_fids)
    other_fids = [fid for fid in all_fids if fid not in excluded]
    return [[e] for e in index.generate_skip_list(other_fids, always_insert_skips)]
def process_NOT(a):
    """Return the postings list of every document that is NOT in ``a``.

    The doc IDs of all documents are loaded into the module-level
    ``all_fids`` on the first call (from line 1 of "all_id.txt") and
    reused afterwards.

    Args:
        a: either a postings list (a ``list`` whose entries keep the doc
            ID at ``entry[0]``) or a value whose first element is a key of
            ``word_dict``; in the latter case the postings are read from
            ``postings_file`` and decoded with
            ``compress.uncompress_postings_list``.

    Returns:
        The result of ``index.generate_skip_list`` applied to the doc IDs
        in ``all_fids`` that do not occur in ``a`` (with the module-level
        ``always_insert_skips``).
    """
    global all_fids
    if all_fids is None:
        # NOTE(review): eval() executes whatever "all_id.txt" contains.
        # If that file can come from an untrusted source, switch to a
        # safe parser such as ast.literal_eval.
        all_fids = [pair[0] for pair in eval(linecache.getline("all_id.txt", 1))]

    if not isinstance(a, list):
        # NOTE(review): the lookup key is a[0], not a -- confirm that the
        # non-list argument really is a sequence whose first item is the
        # dictionary term.
        entry = word_dict[a[0]]
        start_byte = entry[1]
        num_bytes = entry[2]
        # The with-block closes the handle even if seek/read raises.
        with open(postings_file) as fh:
            fh.seek(start_byte)
            bytestream = fh.read(num_bytes)
        lst = compress.uncompress_postings_list(bytestream)
        a_fids = [pair[0] for pair in lst]
    else:
        a_fids = [pair[0] for pair in a]

    # Set membership keeps the complement linear in len(all_fids); a list
    # comprehension also yields a real list on every Python version.
    excluded = set(a_fids)
    other_fids = [fid for fid in all_fids if fid not in excluded]
    return index.generate_skip_list(other_fids, always_insert_skips)
def intersect_with_skips(p1, p2):
    """Intersect two postings lists using skip pointers (IIR figure 2.10).

    Each entry is read as ``entry[0][0]`` for its doc ID. An entry of
    length 2 carries a skip pointer in ``entry[1]``, which is used as an
    index into the same postings list.

    NOTE(review): the merge presumably relies on both lists being sorted
    by ascending doc ID -- confirm against the code that builds them.

    Args:
        p1: first postings list.
        p2: second postings list.

    Returns:
        ``[]`` when either input is empty or no doc ID is shared.
        Otherwise the shared doc IDs are passed to
        ``index.generate_skip_list`` (with the module-level
        ``always_insert_skips``) and every element of its result is
        wrapped in a one-element list.
    """
    if not p1 or not p2:
        return []

    def next_position(postings, ptr, target):
        # Follow skip pointers for as long as the skip target's doc ID does
        # not overshoot ``target``; if no skip could be taken, step by one.
        # Doc IDs ([0][0]) are compared on both sides: comparing the inner
        # list itself against a scalar doc ID never gives a meaningful
        # ordering, which would leave the skip pointers unused.
        skipped = False
        while (len(postings[ptr]) == 2
               and postings[postings[ptr][1]][0][0] <= target):
            ptr = postings[ptr][1]
            skipped = True
        if skipped:
            return ptr
        return ptr + 1

    answer = []
    ptr1 = 0
    ptr2 = 0
    while ptr1 != len(p1) and ptr2 != len(p2):
        doc1 = p1[ptr1][0][0]
        doc2 = p2[ptr2][0][0]
        if doc1 == doc2:
            answer.append(doc1)
            ptr1 += 1
            ptr2 += 1
        elif doc1 < doc2:
            ptr1 = next_position(p1, ptr1, doc2)
        else:
            ptr2 = next_position(p2, ptr2, doc1)

    if not answer:
        return []
    return [[e] for e in index.generate_skip_list(answer, always_insert_skips)]