def process_OR(pair):
    """ process a pair of terms/postings that are OR'ed """
    # each element of the pair is either a raw term (whose postings must be
    # read from disk) or an already-materialised postings list
    if not isinstance(pair[0], list):
        start_byte = word_dict[pair[0]][1]
        num_bytes = word_dict[pair[0]][2]
        # binary mode keeps seek/read offsets consistent with the byte
        # counts recorded in the dictionary; "with" closes the handle
        with open(postings_file, "rb") as fh:
            fh.seek(start_byte)
            bytestream = fh.read(num_bytes)
        p1 = compress.uncompress_postings_list(bytestream)
    else:
        p1 = pair[0]
    if not isinstance(pair[1], list):
        start_byte = word_dict[pair[1]][1]
        num_bytes = word_dict[pair[1]][2]
        with open(postings_file, "rb") as fh:
            fh.seek(start_byte)
            bytestream = fh.read(num_bytes)
        p2 = compress.uncompress_postings_list(bytestream)
    else:
        p2 = pair[1]
    return union_with_skips(p1, p2)
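# The look-up/seek/decode sequence above recurs in every operator in this
# module; a helper along these lines could replace it. A minimal sketch
# (read_postings is a hypothetical name, not part of the module), assuming
# word_dict maps term -> (doc_freq, start_byte, num_bytes), as the indexing
# in the functions here suggests.
def read_postings(term):
    """ seek to a term's slice of the postings file and decode it """
    start_byte = word_dict[term][1]
    num_bytes = word_dict[term][2]
    with open(postings_file, "rb") as fh:
        fh.seek(start_byte)
        bytestream = fh.read(num_bytes)
    return compress.uncompress_postings_list(bytestream)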
def search_query(query):
    """ parse a boolean query string and return the matching doc IDs """
    # split on spaces, parentheses and quotes, keeping the delimiters
    qlist = re.split(r'([ ()"])', query)
    # remove empty and whitespace-only tokens
    qlist = [tok for tok in qlist if tok != "" and tok != " "]
    for i in range(len(qlist)):
        if qlist[i] not in ops:
            # stem query words; operators pass through untouched
            qlist[i] = index.stemmer.stem_word(qlist[i].lower())
            if qlist[i] not in word_dict:
                # assign an empty list to words not in the dictionary
                qlist[i] = []
    if len(qlist) == 1:
        # qlist only contains one query term
        if isinstance(qlist[0], list):
            res_list = qlist
        else:
            start_byte = word_dict[qlist[0]][1]
            num_bytes = word_dict[qlist[0]][2]
            with open(postings_file, "rb") as fh:
                fh.seek(start_byte)
                bytestream = fh.read(num_bytes)
            result = compress.uncompress_postings_list(bytestream)
            res_list = [result]
    else:
        res_list = process_query_list(qlist)
    # keep only the doc IDs, dropping positional/skip data
    res = [pair[0] for pair in res_list[0]]
    return " ".join(str(a) for a in res)
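# For illustration only: how the tokenisation above behaves on a sample
# query (this demo function and its query string are not part of the
# original module; re is already imported at the top of the file).
def _demo_tokenise():
    tokens = re.split(r'([ ()"])', '(cat AND dog) OR "new york"')
    tokens = [tok for tok in tokens if tok != "" and tok != " "]
    # tokens is now:
    # ['(', 'cat', 'AND', 'dog', ')', 'OR', '"', 'new', 'york', '"']
    return tokens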
def process_AND(lst):
    """ process a list of terms/postings that are AND'ed """
    for i in range(len(lst)):
        if not isinstance(lst[i], list):
            start_byte = word_dict[lst[i]][1]
            num_bytes = word_dict[lst[i]][2]
            with open(postings_file, "rb") as fh:
                fh.seek(start_byte)
                bytestream = fh.read(num_bytes)
            result = compress.uncompress_postings_list(bytestream)
            # strip off the positions, so we don't interfere with the
            # original (doc-ID only) intersection method
            lst[i] = remove_positions(result)
    # sort by document frequency (list length) so the least frequent terms
    # are intersected first, keeping intermediate results small
    lst = sorted(lst, key=len)
    # repeatedly intersect the two shortest lists until one remains
    while len(lst) > 1:
        p1 = lst.pop(0)
        p2 = lst.pop(0)
        lst.insert(0, intersect_with_skips(p1, p2))
    return lst[0]
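# intersect_with_skips() is defined elsewhere in this project. The sketch
# below shows the standard skip-pointer intersection such a function
# implements, under the assumption (hypothetical here) that each posting is
# a (doc_id, skip) pair, where skip is the index of a later posting in the
# same list or None.
def intersect_with_skips_sketch(p1, p2):
    """ merge two sorted skip lists, following skips where they help """
    answer = []
    i = j = 0
    while i < len(p1) and j < len(p2):
        if p1[i][0] == p2[j][0]:
            answer.append((p1[i][0], None))
            i += 1
            j += 1
        elif p1[i][0] < p2[j][0]:
            skip = p1[i][1]
            if skip is not None and p1[skip][0] <= p2[j][0]:
                i = skip  # skip target is still <= the other side: jump
            else:
                i += 1
        else:
            skip = p2[j][1]
            if skip is not None and p2[skip][0] <= p1[i][0]:
                j = skip
            else:
                j += 1
    return answer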
def process_PHRASES(a):
    """ process phrasal queries """
    if [] in a:
        # at least one phrase word is missing from the dictionary,
        # so no document can contain the phrase
        return [[]]
    elif len(a) == 1:
        # a one-word "phrase" is just a term lookup
        start_byte = word_dict[a[0]][1]
        num_bytes = word_dict[a[0]][2]
        with open(postings_file, "rb") as fh:
            fh.seek(start_byte)
            bytestream = fh.read(num_bytes)
        result = compress.uncompress_postings_list(bytestream)
        return [result]
    else:
        # fetch the positional postings for every word in the phrase
        sublist = []
        for elt in a:
            start_byte = word_dict[elt][1]
            num_bytes = word_dict[elt][2]
            with open(postings_file, "rb") as fh:
                fh.seek(start_byte)
                bytestream = fh.read(num_bytes)
            result = compress.uncompress_postings_list(bytestream)
            sublist.append(result)
        result = positional_intersect(sublist[0], sublist[1])
        # remove the processed sublist entries
        sublist.pop(0)
        sublist.pop(0)
        # fold the remaining words into the result, left to right
        while len(sublist) > 0:
            result = positional_intersect(result, sublist.pop(0))
        return [result]
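# positional_intersect() is defined elsewhere; this is a minimal sketch of
# the idea, assuming (hypothetically) that each posting is a
# (doc_id, positions) pair with a sorted list of word offsets, and that the
# merged posting keeps the positions of the later word so the phrase can be
# extended one word at a time in the loop above.
def positional_intersect_sketch(p1, p2):
    """ keep docs where a position in p2 directly follows one in p1 """
    answer = []
    i = j = 0
    while i < len(p1) and j < len(p2):
        if p1[i][0] == p2[j][0]:
            prev = set(p1[i][1])
            matches = [pos for pos in p2[j][1] if pos - 1 in prev]
            if matches:
                answer.append((p1[i][0], matches))
            i += 1
            j += 1
        elif p1[i][0] < p2[j][0]:
            i += 1
        else:
            j += 1
    return answer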
def process_NOT(a):
    """ process a term/postings list that is NOT'ed """
    # read the postings list for all doc IDs into all_fids when this
    # function is first called; the first line of all_id.txt holds it
    # as a Python literal
    global all_fids
    if all_fids is None:
        all_fids = [pair[0] for pair in eval(linecache.getline("all_id.txt", 1))]
    if not isinstance(a, list):
        start_byte = word_dict[a][1]
        num_bytes = word_dict[a][2]
        with open(postings_file, "rb") as fh:
            fh.seek(start_byte)
            bytestream = fh.read(num_bytes)
        lst = compress.uncompress_postings_list(bytestream)
        a_fids = [pair[0] for pair in lst]
    else:
        a_fids = [pair[0] for pair in a]
    # complement: every doc ID the operand does not match
    a_set = set(a_fids)
    other_fids = [fid for fid in all_fids if fid not in a_set]
    return index.generate_skip_list(other_fids, always_insert_skips)
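# index.generate_skip_list() belongs to the indexing module and is not shown
# here. As a rough sketch of the usual technique, assuming (as in the
# intersection sketch above) that a skip-list posting is a (doc_id, skip)
# pair; the exact role of the always_insert_skips flag is not visible from
# this module, so this sketch takes an explicit interval instead.
import math

def generate_skip_list_sketch(doc_ids, interval=None):
    """ attach a skip pointer every ~sqrt(n) postings (a common heuristic) """
    n = len(doc_ids)
    if interval is None:
        interval = max(int(math.sqrt(n)), 2)
    postings = []
    for idx, doc_id in enumerate(doc_ids):
        target = idx + interval
        skip = target if idx % interval == 0 and target < n else None
        postings.append((doc_id, skip))
    return postings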