Example #1
File: search.py  Project: dw6/NUS
def process_OR(pair):
    """
    process a pair of terms/postings that are OR'ed
    """
    if not isinstance(pair[0], list):
        # pair[0] is still a raw term: look up its (offset, length) entry in the
        # dictionary and read its compressed postings list from the postings file
        start_byte = word_dict[pair[0]][1]
        num_bytes = word_dict[pair[0]][2]

        fh = open(postings_file)
        fh.seek(start_byte)
        bytestream = fh.read(num_bytes)
        p1 = compress.uncompress_postings_list(bytestream)
       
    else:
        p1 = pair[0]
    if not isinstance(pair[1], list):
        # same dictionary lookup and disk read for the second operand
        start_byte = word_dict[pair[1]][1]
        num_bytes = word_dict[pair[1]][2]

        fh = open(postings_file)
        fh.seek(start_byte)
        bytestream = fh.read(num_bytes)
        p2 = compress.uncompress_postings_list(bytestream)
 
    else:
        p2 = pair[1]
    return union_with_skips(p1, p2)
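
union_with_skips is defined elsewhere in this project and is not shown here. As a rough illustration only, the following is a minimal sketch of a postings-list union without skip pointers; the function name union_postings and the (doc_id, ...) tuple shape are assumptions, not the project's actual code:

def union_postings(p1, p2):
    # minimal sketch: merge-union of two postings lists sorted by doc ID,
    # without skip pointers; each posting is assumed to be a (doc_id, ...) tuple
    result, i, j = [], 0, 0
    while i < len(p1) and j < len(p2):
        if p1[i][0] == p2[j][0]:
            result.append(p1[i])
            i += 1
            j += 1
        elif p1[i][0] < p2[j][0]:
            result.append(p1[i])
            i += 1
        else:
            result.append(p2[j])
            j += 1
    # whatever remains in either list also belongs to the union
    return result + p1[i:] + p2[j:]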
Example #2
File: search.py  Project: dw6/NUS
def search_query(query):
    qlist = re.split("([ ()\"])", query)                  # split on spaces, parentheses and quotes, keeping the delimiters
    qlist = filter(lambda a: a != "" and a != " ", qlist) # drop empty strings and bare spaces

    for i in range(0, len(qlist)):
        if qlist[i] not in ops:
            # stem query words
            qlist[i] = index.stemmer.stem_word(qlist[i].lower())   
            if qlist[i] not in word_dict:
                # assign empty list to words not in dictionary
                qlist[i] = []

    # qlist only contains one query term
    if len(qlist) == 1:
        if isinstance(qlist[0], list):
            # the lone term was not in the dictionary, so qlist is already [[]]
            res_list = qlist
        else:
            # read the single term's postings list from disk
            start_byte = word_dict[qlist[0]][1]
            num_bytes = word_dict[qlist[0]][2]

            fh = open(postings_file)
            fh.seek(start_byte)
            bytestream = fh.read(num_bytes)
            result = compress.uncompress_postings_list(bytestream)
            res_list = [result]

    else:
        res_list = process_query_list(qlist)
    
    res = [pair[0] for pair in res_list[0]]
    return " ".join(str(a) for a in res)
Example #3
File: search.py  Project: dw6/NUS
def process_AND(lst):
    """
    process a list of terms/postings that are AND'ed
    """
    for i in range(0, len(lst)):
        if not isinstance(lst[i], list):
            # lst[i] is still a raw term: fetch and decompress its postings list
            start_byte = word_dict[lst[i]][1]
            num_bytes = word_dict[lst[i]][2]

            fh = open(postings_file)
            fh.seek(start_byte)
            bytestream = fh.read(num_bytes)
            result = compress.uncompress_postings_list(bytestream)

            # strip off the positions, so we don't interfere with the original method
            lst[i] = remove_positions(result)
    
    # sort the list by frequency so that less frequent terms come first
    lst = sorted(lst, key=len)
    # repeatedly intersect the two shortest lists; the intersection can only
    # shrink, so the result goes back to the front of the queue
    while len(lst) > 1:
        p1 = lst.pop(0)
        p2 = lst.pop(0)

        lst.insert(0, intersect_with_skips(p1, p2))
    
    return lst[0]
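
intersect_with_skips is project code that follows skip pointers embedded in the postings lists. Stripped of the skips, the underlying merge intersection looks like this minimal sketch; the function name and the (doc_id, ...) posting shape are assumptions:

def intersect_postings(p1, p2):
    # minimal sketch: merge-intersection of two postings lists sorted by doc ID
    result, i, j = [], 0, 0
    while i < len(p1) and j < len(p2):
        if p1[i][0] == p2[j][0]:
            result.append(p1[i])
            i += 1
            j += 1
        elif p1[i][0] < p2[j][0]:
            i += 1
        else:
            j += 1
    return result

Sorting by length first is the standard IR heuristic: every intermediate result is at most as long as the shorter input, so intersecting the rarest terms first keeps all later intersections cheap.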
Example #4
File: search.py  Project: dw6/NUS
def process_PHRASES(a):
    """
    process phrasal queries
    """
    # a missing term ([]) means the phrase cannot match any document
    if [] in a:
        return [[]]

    elif len(a) == 1:
        # single-word "phrase": just return that term's postings list
        start_byte = word_dict[a[0]][1]
        num_bytes = word_dict[a[0]][2]
        
        fh = open(postings_file)
        fh.seek(start_byte)
        bytestream = fh.read(num_bytes)
        result = compress.uncompress_postings_list(bytestream)
        return [result]
    
    else:
        sublist = []

        # fetch and decompress the postings list for every term in the phrase
        for elt in a:
            start_byte = word_dict[elt][1]
            num_bytes = word_dict[elt][2]
        
            fh = open(postings_file)
            fh.seek(start_byte)
            bytestream = fh.read(num_bytes)
            result = compress.uncompress_postings_list(bytestream)
            sublist.append(result)
            
        result = positional_intersect(sublist[0], sublist[1])            
        
        # remove processed sublist
        sublist.pop(0)
        sublist.pop(0)
        
        # process the rest of the sublist elements
        while len(sublist) > 0:
            result = positional_intersect(result, sublist.pop(0))
        
        return [result]
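
positional_intersect is also project code. A common textbook formulation, sketched below under the assumption that each posting is a (doc_id, [positions]) pair, keeps a document only when some position of the second term directly follows a matched position of the first, and carries the matched positions forward so the loop above can extend the phrase one term at a time:

def positional_intersect_sketch(p1, p2):
    # minimal sketch: phrase-extending intersection of two positional postings
    # lists sorted by doc ID; the (doc_id, [positions]) shape is an assumption
    result, i, j = [], 0, 0
    while i < len(p1) and j < len(p2):
        if p1[i][0] == p2[j][0]:
            pos2 = set(p2[j][1])
            # keep positions of the second term that directly follow the first
            matched = [p + 1 for p in p1[i][1] if p + 1 in pos2]
            if matched:
                result.append((p1[i][0], matched))
            i += 1
            j += 1
        elif p1[i][0] < p2[j][0]:
            i += 1
        else:
            j += 1
    return result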
Example #5
File: search.py  Project: dw6/NUS
def process_NOT(a):
    # on the first call, read the full list of document IDs into all_fids
    global all_fids
    if all_fids is None:
        all_fids = [pair[0] for pair in eval(linecache.getline("all_id.txt", 1))]
    if not isinstance(a, list):
        # a is a raw term string here, so look it up directly in the dictionary
        start_byte = word_dict[a][1]
        num_bytes = word_dict[a][2]
        
        fh = open(postings_file)
        fh.seek(start_byte)
        bytestream = fh.read(num_bytes)
        lst = compress.uncompress_postings_list(bytestream)
        
        a_fids = [pair[0] for pair in lst]
    else:
        a_fids = [pair[0] for pair in a]
    
    other_fids = filter(lambda fid: fid not in a_fids, all_fids)
    return index.generate_skip_list(other_fids, always_insert_skips)
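
A small design note: the filter above scans a_fids once per document, which is O(len(all_fids) * len(a_fids)). If that ever matters, a set gives a linear-time complement; this is a hedged alternative, not the project's code:

excluded = set(a_fids)  # O(1) membership tests instead of a list scan
other_fids = [fid for fid in all_fids if fid not in excluded]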