Example #1
def get_list_for_word(word, pos, details):
    """
    Gets the list for a single word with its corresponding relevant positions
    :param word: the word to search
    :param pos: the positions wanted
    :param details: true if detailed positions also need to be returned
    :return: the list of documents that contain this word in the wanted positions, and the corresponding list of
    positions if needed
    """
    res = []
    res_pos = []
    if len(pos) == 1:
        # Single-word case: every posting of the word qualifies
        pf.seek(postings_base_pointer + dictionary[word]['ptr'])
        for _ in range(dictionary[word]['df']):
            res.append(read_doc_id(pf))
            tf = read_tf(pf)
            pp = read_position_pointer(pf)
            if details:
                this_pos = []
                next_posting = pf.tell()  # remember where the next posting starts
                pf.seek(pp)
                for _ in range(tf):
                    this_pos.append(read_positional_index(pf))
                res_pos.append(this_pos)
                pf.seek(next_posting)  # restore the posting stream position
        return res, res_pos
    # General case: slide a window of len(pos) consecutive positions over each
    # document's position list and test it against the query's position pattern
    pos_reader = open(postings_file, 'rb')  # separate handle for the position lists
    pf.seek(postings_base_pointer + dictionary[word]['ptr'])
    for _ in range(dictionary[word]['df']):
        doc_id = read_doc_id(pf)
        tf = read_tf(pf)
        pos_ptr = read_position_pointer(pf)
        if tf < len(pos):
            # Fewer positions than the phrase needs: this document cannot match
            continue
        pos_reader.seek(pos_ptr)
        # Sliding window of len(pos) consecutive positions
        current_positions = collections.deque()
        t_id = len(pos)  # index of the next position to inspect
        for _ in range(len(pos)):
            current_positions.append(read_positional_index(pos_reader))
        is_valid = False
        this_pos = []
        pos_list = list(current_positions)
        if is_isomorphic(pos_list, pos):
            is_valid = True
            this_pos.append(pos_list[0])
        while t_id < tf:
            if is_valid and not details:
                break
            # Advance the window by one position
            current_positions.popleft()
            current_positions.append(read_positional_index(pos_reader))
            t_id += 1
            pos_list = list(current_positions)
            if is_isomorphic(pos_list, pos):
                is_valid = True
                this_pos.append(pos_list[0])
        if is_valid:
            res.append(doc_id)
            res_pos.append(this_pos)
    pos_reader.close()
    return res, res_pos
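
The helper is_isomorphic is not shown in these excerpts. A minimal sketch consistent with how it is called above, assuming a window matches when its position gaps equal the gaps of the query's position pattern (a reconstruction, not necessarily the project's actual helper):

def is_isomorphic(window, pos):
    # Hypothetical reconstruction: a window of document positions matches when
    # its consecutive gaps mirror the gaps of the query's position pattern.
    return all(window[i + 1] - window[i] == pos[i + 1] - pos[i]
               for i in range(len(pos) - 1))

For a single-position pattern the generator is empty and the check is trivially true, which matches the behavior the sliding-window loop above relies on.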
Example #2
def run_search(dict_file, postings_file, queries_file, results_file):
    """
    using the given dictionary file and postings file,
    perform searching on the given queries file and output the results to a file
    """
    print('running search on the queries...')

    # Initial setup
    global pf, postings_base_pointer, lengths_base_pointer
    pf = open(postings_file, 'rb')
    df = open(dict_file, 'r')
    qf = open(queries_file, 'r')
    rf = open(results_file, 'w')

    # Load dictionary; the first line holds the postings and lengths base pointers
    pointers = df.readline().strip().split(' ')
    postings_base_pointer = int(pointers[0])
    lengths_base_pointer = int(pointers[1])
    load_dict(df, dictionary)

    # Load document vector lengths as (doc_id, length) pairs until EOF
    pf.seek(lengths_base_pointer)
    while True:
        doc = read_doc_id(pf)
        length = read_float_bin_file(pf)  # an empty result signals EOF
        if not length:
            break
        doc_len[doc] = length[0]

    # Perform search
    query_input = qf.readline()
    # Split the query into sub-queries on the connective 'AND'
    queries = [part.strip() for part in query_input.split('AND')]

    result_lists = []
    for query in queries:
        res = process_query(query)
        result_lists.append(res)

    final_score = compute_harmonic_scores(result_lists)

    result = select_first_k(final_score, 1000)

    if not result:
        rf.write('\n')
    else:
        # Doc IDs are stored relative to smallest_doc_id; add it back for output
        rf.write(str(result[0] + smallest_doc_id))
        for doc_id in result[1:]:
            rf.write(' ')
            rf.write(str(doc_id + smallest_doc_id))
        rf.write('\n')

    # Close files
    pf.close()
    df.close()
    qf.close()
    rf.close()
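
For illustration, a query line of the form used above splits into one sub-query per 'AND' connective; the sample line below is hypothetical:

# Minimal illustration of the split performed in run_search; note that
# str.split('AND') matches the substring anywhere, so a term that itself
# contains "AND" would need a stricter tokenizer.
query_input = '"fox jumps" AND lazy\n'
queries = [part.strip() for part in query_input.split('AND')]
# queries == ['"fox jumps"', 'lazy']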
Example #3
def read_posting(word, index):
    """
    Read the document ID, term frequency, and pointer to the positions list of the index-th document of the given word
    :param word: the term to search
    :param index: the index of the document w.r.t. this term
    :return: tuple (ID, tf, ptr)
    """
    pf.seek(postings_base_pointer + dictionary[word]['ptr'] + doc_byte_width * index)
    doc_id = read_doc_id(pf)
    tf = read_tf(pf)
    ptr = read_position_pointer(pf)
    return doc_id, tf, ptr
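
None of the low-level binary helpers appear in these excerpts. A minimal sketch consistent with how they are called, where every byte width and the big-endian choice are assumptions rather than the project's real constants:

import struct

# Assumed fixed byte widths; the real constants are defined elsewhere.
post_byte_width = 4          # doc id
tf_byte_width = 4            # term frequency
pos_pointer_byte_width = 4   # pointer into the positions region
pos_byte_width = 4           # one positional index
doc_byte_width = post_byte_width + tf_byte_width + pos_pointer_byte_width

def _read_uint(f, width):
    return int.from_bytes(f.read(width), byteorder='big')

def read_doc_id(f):
    return _read_uint(f, post_byte_width)

def read_tf(f):
    return _read_uint(f, tf_byte_width)

def read_position_pointer(f):
    return _read_uint(f, pos_pointer_byte_width)

def read_positional_index(f):
    return _read_uint(f, pos_byte_width)

def read_float_bin_file(f):
    # Returns a one-element tuple, or an empty tuple at EOF, which is why
    # callers test "if not length" and then use length[0].
    data = f.read(8)
    return struct.unpack('d', data) if len(data) == 8 else ()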
Example #4
def intersect_word_list(word, docs, pos_lst, pos, details):
    """
    Gets the list of simple intersection of the two given words
    :param word: the word to search for
    :param docs: the intermediate list of words
    :param pos_lst: the intermediate list of position lists
    :param pos: the relative positions of word and intermediate list
    :param details: true if the details of positions of each phrase needed is needed to be returned
    :return: the list of documents that contain the two words in the wanted relative positions, and the corresponding
    list of lists of positions if needed
    """
    res = []
    res_pos = []
    # Index 0 = the word's on-disk postings, index 1 = the in-memory intermediate list
    doc_reader = open(postings_file, 'rb')
    base_pointer = postings_base_pointer + dictionary[word]['ptr']
    doc_reader.seek(base_pointer)
    if not docs:
        return [], []
    doc_id = [read_doc_id(doc_reader), docs[0]]
    term_freq = [read_tf(doc_reader), len(pos_lst[0])]
    pos_pointer = read_position_pointer(doc_reader)
    doc_count = [1, 1]  # count = next index to inspect
    doc_freq = [dictionary[word]['df'], len(docs)]
    # Sentinels so reading one element past the end of the in-memory list is safe
    docs.append(-1)
    pos_lst.append([])
    # Skip-pointer width: the square root of each list's length
    skip_width = [math.floor(math.sqrt(doc_freq[0])), math.floor(math.sqrt(doc_freq[1]))]
    while doc_count[0] <= doc_freq[0] and doc_count[1] <= doc_freq[1]:
        found = doc_id[0] == doc_id[1]
        if doc_id[0] < doc_id[1]:
            if skip_width[0] > 1:
                while doc_count[0] + skip_width[0] < doc_freq[0]:
                    d_id, _, _ = read_posting(word, doc_count[0] + skip_width[0])
                    if d_id > doc_id[1]:
                        break
                    doc_count[0] += skip_width[0]
            doc_reader.seek(base_pointer + doc_byte_width * (doc_count[0] - 1))
            for _ in range(skip_width[0] + 1):
                d_id = read_doc_id(doc_reader)
                tf = read_tf(doc_reader)
                pp = read_position_pointer(doc_reader)
                doc_count[0] += 1
                if d_id >= doc_id[1]:
                    found = d_id == doc_id[1]
                    doc_id[0] = d_id
                    term_freq[0] = tf
                    pos_pointer = pp
                    break
                if doc_count[0] >= doc_freq[0]:
                    break
        elif doc_id[0] > doc_id[1]:
            if skip_width[1] > 1:
                while doc_count[1] + skip_width[1] < doc_freq[1]:
                    d_id = docs[doc_count[1] + skip_width[1]]
                    if d_id > doc_id[0]:
                        break
                    doc_count[1] += skip_width[1]
            for _ in range(skip_width[1] + 1):
                d_id = docs[doc_count[1]]
                tf = len(pos_lst[doc_count[1]])
                doc_count[1] += 1
                if d_id >= doc_id[0]:
                    found = d_id == doc_id[0]
                    doc_id[1] = d_id
                    term_freq[1] = tf
                    break
                if doc_count[1] >= doc_freq[1]:
                    break
        if found:
            pos_reader = open(postings_file, 'rb')
            pos_reader.seek(pos_pointer)
            position_list = pos_lst[doc_count[1] - 1]
            position_list.append(-1)  # sentinel; note this mutates the caller's list
            position = [read_positional_index(pos_reader), position_list[0]]
            skip_pos_width = [math.floor(math.sqrt(term_freq[0])), math.floor(math.sqrt(term_freq[1]))]
            pos_count = [1, 1]
            found2 = (position[1] - position[0] == pos[1] - pos[0])
            this_pos = []
            if found2:
                this_pos.append(position[0])
            while pos_count[0] <= term_freq[0] and pos_count[1] <= term_freq[1] and (details or not found2):
                if position[1] - position[0] > pos[1] - pos[0]:
                    if skip_pos_width[0] > 1:
                        while pos_count[0] + skip_pos_width[0] < term_freq[0]:
                            p_id = read_position(word, doc_count[0], pos_count[0] + skip_pos_width[0])
                            if position[1] - p_id < pos[1] - pos[0]:
                                break
                            pos_count[0] += skip_pos_width[0]
                    pos_reader.seek(pos_pointer + pos_byte_width * (pos_count[0] - 1))
                    for _ in range(skip_pos_width[0] + 1):
                        p_id = read_positional_index(pos_reader)
                        pos_count[0] += 1
                        if position[1] - p_id <= pos[1] - pos[0]:
                            matches = position[1] - p_id == pos[1] - pos[0]
                            found2 = matches or found2
                            position[0] = p_id
                            if matches:
                                this_pos.append(position[0])
                            break
                        if pos_count[0] >= term_freq[0]:
                            break
                elif position[1] - position[0] < pos[1] - pos[0]:
                    if skip_pos_width[1] > 1:
                        while pos_count[1] + skip_pos_width[1] < term_freq[1]:
                            p_id = position_list[pos_count[1] + skip_pos_width[1]]
                            if position[0] - p_id < pos[0] - pos[1]:
                                break
                            pos_count[1] += skip_pos_width[1]
                    for _ in range(skip_pos_width[1] + 1):
                        p_id = position_list[pos_count[1]]
                        pos_count[1] += 1
                        if position[0] - p_id <= pos[0] - pos[1]:
                            matches = position[0] - p_id == pos[0] - pos[1]
                            found2 = matches or found2
                            position[1] = p_id
                            if matches:
                                this_pos.append(position[0])
                            break
                        if pos_count[1] >= term_freq[1]:
                            break
            if found2:
                res.append(doc_id[0])
                res_pos.append(this_pos)
            pos_reader.close()
            # Advance both lists past the matched document
            doc_id[0] = read_doc_id(doc_reader)
            term_freq[0] = read_tf(doc_reader)
            pos_pointer = read_position_pointer(doc_reader)
            doc_count[0] += 1
            doc_id[1] = docs[doc_count[1]]
            term_freq[1] = len(pos_lst[doc_count[1]])
            doc_count[1] += 1
    doc_reader.close()
    return res, res_pos
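
The document-level walk above is a skip-pointer intersection with skip width floor(sqrt(df)). The same idea on plain in-memory sorted lists, as a simplified illustration (intersect_with_skips is an illustrative name, not part of the project):

import math

def intersect_with_skips(a, b):
    # Intersect two sorted lists, jumping by sqrt-spaced skip pointers
    # whenever a jump does not overshoot the other list's current element.
    skip = [max(1, math.floor(math.sqrt(len(a)))),
            max(1, math.floor(math.sqrt(len(b))))]
    i = j = 0
    out = []
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            out.append(a[i])
            i += 1
            j += 1
        elif a[i] < b[j]:
            if i + skip[0] < len(a) and a[i + skip[0]] <= b[j]:
                i += skip[0]
            else:
                i += 1
        else:
            if j + skip[1] < len(b) and b[j + skip[1]] <= a[i]:
                j += skip[1]
            else:
                j += 1
    return out

# Example: intersect_with_skips([1, 3, 5, 9, 12], [2, 3, 9]) == [3, 9]

The trade-off is the classic one: long runs of misses cost roughly sqrt(df) jumps instead of df steps, at the price of one extra comparison per jump.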
Example #5
def intersect_words(w1, w2, pos, details):
    """
    Gets the list of simple intersection of the two given words
    :param w1: the first word to search for
    :param w2: the second word to search for
    :param pos: the relative positions of w1 and w2
    :param details: True if the positions of each phrase match need to be returned
    :return: the list of documents that contain the two words in the wanted relative positions, and the corresponding
    list of lists of positions if needed
    """
    res = []
    res_pos = []
    # [w1, w2] for the following
    w = [w1, w2]
    doc_reader = [open(postings_file, 'rb'), open(postings_file, 'rb')]
    base_pointer = [postings_base_pointer + dictionary[w1]['ptr'], postings_base_pointer + dictionary[w2]['ptr']]
    doc_reader[0].seek(base_pointer[0])
    doc_reader[1].seek(base_pointer[1])
    doc_id = [read_doc_id(doc_reader[0]), read_doc_id(doc_reader[1])]
    term_freq = [read_tf(doc_reader[0]), read_tf(doc_reader[1])]
    pos_pointer = [read_position_pointer(doc_reader[0]), read_position_pointer(doc_reader[1])]
    doc_count = [1, 1]  # count = next index to inspect
    doc_freq = [dictionary[w1]['df'], dictionary[w2]['df']]
    skip_width = [math.floor(math.sqrt(doc_freq[0])), math.floor(math.sqrt(doc_freq[1]))]
    while doc_count[0] <= doc_freq[0] and doc_count[1] <= doc_freq[1]:
        found = doc_id[0] == doc_id[1]
        if doc_id[0] != doc_id[1]:
            smaller_index = 0
            if doc_id[0] > doc_id[1]:
                smaller_index = 1
            other_index = 1 - smaller_index
            if skip_width[smaller_index] > 1:
                while doc_count[smaller_index] + skip_width[smaller_index] < doc_freq[smaller_index]:
                    d_id, _, _ = read_posting(w[smaller_index], doc_count[smaller_index] + skip_width[smaller_index])
                    if d_id > doc_id[other_index]:
                        break
                    doc_count[smaller_index] += skip_width[smaller_index]
            doc_reader[smaller_index].seek(base_pointer[smaller_index] + doc_byte_width
                                           * (doc_count[smaller_index] - 1))
            for _ in range(skip_width[smaller_index] + 1):
                d_id = read_doc_id(doc_reader[smaller_index])
                tf = read_tf(doc_reader[smaller_index])
                pp = read_position_pointer(doc_reader[smaller_index])
                doc_count[smaller_index] += 1
                if d_id >= doc_id[other_index]:
                    found = d_id == doc_id[other_index]
                    doc_id[smaller_index] = d_id
                    term_freq[smaller_index] = tf
                    pos_pointer[smaller_index] = pp
                    break
                if doc_count[smaller_index] >= doc_freq[smaller_index]:
                    break
        if found:
            diff = [pos[1] - pos[0], pos[0] - pos[1]]  # diff = other - this
            pos_reader = [open(postings_file, 'rb'), open(postings_file, 'rb')]
            pos_reader[0].seek(pos_pointer[0])
            pos_reader[1].seek(pos_pointer[1])
            position = [read_positional_index(pos_reader[0]), read_positional_index(pos_reader[1])]
            skip_pos_width = [math.floor(math.sqrt(term_freq[0])), math.floor(math.sqrt(term_freq[1]))]
            pos_count = [1, 1]
            found2 = (position[1] - position[0] == diff[0])
            this_pos = []
            if found2:
                this_pos.append(position[0])
            while pos_count[0] <= term_freq[0] and pos_count[1] <= term_freq[1] and (details or not found2):
                smaller_index = 0
                if position[1] - position[0] < diff[0]:
                    smaller_index = 1
                other_index = 1 - smaller_index
                if skip_pos_width[smaller_index] > 1:
                    while pos_count[smaller_index] + skip_pos_width[smaller_index] < term_freq[smaller_index]:
                        pf.seek(pos_pointer[smaller_index] +
                                (pos_count[smaller_index] + skip_pos_width[smaller_index]) * pos_byte_width)
                        p_id = read_positional_index(pf)
                        if position[other_index] - p_id < diff[smaller_index]:
                            break
                        pos_count[smaller_index] += skip_pos_width[smaller_index]
                pos_reader[smaller_index].seek(pos_pointer[smaller_index]
                                               + pos_byte_width * (pos_count[smaller_index] - 1))
                for _ in range(skip_pos_width[smaller_index] + 1):
                    p_id = read_positional_index(pos_reader[smaller_index])
                    pos_count[smaller_index] += 1
                    if position[other_index] - p_id <= diff[smaller_index]:
                        matches = position[other_index] - p_id == diff[smaller_index]
                        found2 = matches or found2
                        position[smaller_index] = p_id
                        if matches:
                            this_pos.append(position[0])
                        break
                    if pos_count[smaller_index] >= term_freq[smaller_index]:
                        break
            if found2:
                res.append(doc_id[0])
                res_pos.append(this_pos)
            pos_reader[0].close()
            pos_reader[1].close()
            # Advance both lists past the matched document
            for i in range(2):
                doc_id[i] = read_doc_id(doc_reader[i])
                term_freq[i] = read_tf(doc_reader[i])
                pos_pointer[i] = read_position_pointer(doc_reader[i])
                doc_count[i] += 1
    doc_reader[0].close()
    doc_reader[1].close()
    return res, res_pos
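
A hedged usage sketch: for a two-word phrase such as "hot dog", pos presumably carries each word's offset within the phrase (inferred from how diff is computed above); the call itself is hypothetical:

# Hypothetical call for the phrase "hot dog": w1 at phrase offset 0, w2 at 1.
docs, positions = intersect_words('hot', 'dog', [0, 1], details=True)
# docs: documents where "hot" is immediately followed by "dog";
# positions: per document, the positions of w1 that start a match
# (this_pos above records position[0], i.e. w1's position).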
Example #6
def merge_blocks(block_count, out_dict, out_postings):
    """
    Merges the blocks to write the overall postings file
    :param block_count: the number of blocks
    :param out_dict: the output dictionary file
    :param out_postings: the output postings file
    """
    # Load dictionaries and open files
    index_list = list(range(block_count))
    dictionaries = []
    posting_files = []
    positions_files = []
    lengths_files = []
    for i in index_list:
        df = open(f'dictionary{i}.txt', 'r')
        df.readline()  # skip the block's first (header) line
        dictionary = {}
        load_dict(df, dictionary)
        dictionaries.append(dictionary)
        posting_files.append(open(f'postings{i}.txt', 'rb'))
        positions_files.append(open(f'positions{i}.txt', 'rb'))
        lengths_files.append(open(f'lengths{i}.txt', 'rb'))

    # Prepare to write files
    post_writer = open(out_postings, 'wb')

    # Pass 1: merge and write every word's position lists (the positions region comes first)
    leading_terms = []
    dictionary_iters = []
    pos_pointers = {}
    pointer = 0
    setup_iters(dictionaries, dictionary_iters, index_list, leading_terms)
    while leading_terms:
        leading_term = heappop(leading_terms)
        update_leading_term(dictionary_iters, leading_term, leading_terms)
        word = leading_term[0]
        collections = [leading_term]
        while leading_terms and leading_terms[0][0] == word:
            term_block_info = heappop(leading_terms)
            collections.append(term_block_info)
            update_leading_term(dictionary_iters, term_block_info, leading_terms)
        pos_pointers[word] = {}
        for block in collections:
            block_index = block[1]
            df = dictionaries[block_index][word]['df']
            ptr = dictionaries[block_index][word]['ptr']
            pf = posting_files[block_index]
            posf = positions_files[block_index]
            pf.seek(ptr)
            for _ in range(df):
                doc = read_doc_id(pf)
                tf = read_tf(pf)
                pp = read_position_pointer(pf)
                posf.seek(pp)
                for _ in range(tf):
                    pos = read_positional_index(posf)
                    write_int_bin_file(post_writer, pos, pos_byte_width)
                pos_pointers[word][doc] = pointer
                pointer += tf * pos_byte_width

    # Pass 2: merge and write the postings, recording each word's df and pointer
    dictionary_iters = []
    post_pointers = {}
    doc_freq = {}
    postings_base_pointer = pointer
    pointer = 0
    setup_iters(dictionaries, dictionary_iters, index_list, leading_terms)
    while leading_terms:
        leading_term = heappop(leading_terms)
        update_leading_term(dictionary_iters, leading_term, leading_terms)
        word = leading_term[0]
        post_pointers[word] = pointer
        doc_freq[word] = 0
        collections = [leading_term]
        while leading_terms and leading_terms[0][0] == word:
            term_block_info = heappop(leading_terms)
            collections.append(term_block_info)
            update_leading_term(dictionary_iters, term_block_info, leading_terms)
        for block in collections:
            block_index = block[1]
            df = dictionaries[block_index][word]['df']
            ptr = dictionaries[block_index][word]['ptr']
            pf = posting_files[block_index]
            pf.seek(ptr)
            doc_freq[word] += df
            for _ in range(df):
                doc = read_doc_id(pf)
                tf = read_tf(pf)
                pf.read(pos_pointer_byte_width)
                write_int_bin_file(post_writer, doc, post_byte_width)
                write_int_bin_file(post_writer, tf, tf_byte_width)
                write_int_bin_file(post_writer, pos_pointers[word][doc], pos_pointer_byte_width)
                pointer += doc_byte_width

    # Pass 3: write the dictionary file
    lengths_base_pointer = pointer + postings_base_pointer
    dictionary_iters = []
    setup_iters(dictionaries, dictionary_iters, index_list, leading_terms)
    dict_writer = open(out_dict, 'a')
    dict_writer.write(f'{postings_base_pointer} {lengths_base_pointer}\n')
    while leading_terms:
        leading_term = heappop(leading_terms)
        update_leading_term(dictionary_iters, leading_term, leading_terms)
        word = leading_term[0]
        while leading_terms and leading_terms[0][0] == word:
            term_block_info = heappop(leading_terms)
            update_leading_term(dictionary_iters, term_block_info, leading_terms)
        dict_writer.write(f'{word} {doc_freq[word]} {post_pointers[word]}\n')
    dict_writer.close()

    # Pass 4: append each block's document vector lengths
    for i in index_list:
        lf = lengths_files[i]
        for _ in range(block_size):
            doc = read_doc_id(lf)
            length = read_float_bin_file(lf)  # an empty result signals EOF
            if not length:
                break
            write_int_bin_file(post_writer, doc, post_byte_width)
            write_float_bin_file(post_writer, length)

    # Close files
    post_writer.close()
    for f in posting_files:
        f.close()
    for f in positions_files:
        f.close()
    for f in lengths_files:
        f.close()
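
The merge helpers setup_iters and update_leading_term are not included in the excerpts. A plausible reconstruction consistent with how they are called above; the sorted-iterator detail is an assumption:

from heapq import heappush

def setup_iters(dictionaries, dictionary_iters, index_list, leading_terms):
    # Hypothetical reconstruction: one sorted term iterator per block, with the
    # first (term, block_index) pair of each block seeded into the min-heap.
    for i in index_list:
        term_iter = iter(sorted(dictionaries[i]))
        dictionary_iters.append(term_iter)
        first = next(term_iter, None)
        if first is not None:
            heappush(leading_terms, (first, i))

def update_leading_term(dictionary_iters, leading_term, leading_terms):
    # Push the next term of the block that just yielded leading_term, if any.
    nxt = next(dictionary_iters[leading_term[1]], None)
    if nxt is not None:
        heappush(leading_terms, (nxt, leading_term[1]))

Because the heap orders pairs by (term, block_index), merge_blocks drains every block's occurrence of the same word in block order before moving to the next word, which is what its inner while loops rely on.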
    """