def get_top_members(results, classes, x):
    """Pool the family members of the best-ranked 10% of results.

    results -- ranked list of document ids, best first
    classes -- mapping of document id -> list of family-member ids
    x       -- how many of the pooled members to return

    Documents missing from `classes` are skipped silently (best-effort).
    Returns the x most frequent members, via helper.get_top_k.
    """
    cutoff = int(0.1 * len(results))
    pooled = []
    for doc_id in results[:cutoff]:
        try:
            pooled.extend(classes[doc_id])
        except KeyError:
            # No family-member entry for this document; ignore it.
            pass
    return helper.get_top_k(pooled, x)
def get_top_classes(results, classes, x):
    """Collect the class codes of the best-ranked 10% of results.

    results -- ranked list of document ids, best first
    classes -- mapping of document id -> class code (one value per doc)
    x       -- how many of the collected codes to return

    Documents missing from `classes` are skipped silently (best-effort).
    Returns the x most frequent class codes, via helper.get_top_k.
    """
    cutoff = int(0.1 * len(results))
    collected = []
    for doc_id in results[:cutoff]:
        try:
            collected.append(classes[doc_id])
        except KeyError:
            # No class entry recorded for this document; ignore it.
            pass
    return helper.get_top_k(collected, x)
def _split_pipe_list(text):
    """Split a pipe-delimited XML field into a list of trimmed entries."""
    return [part.strip() for part in text.strip().split('|')]


def build(index_directory, dictionary_file, postings_file):
    """Index every XML patent file in index_directory.

    For each document: tokenizes Title/Abstract, records IPC/UPC class,
    family members and citations, computes document length and the 10 most
    frequent terms, then writes the sorted/grouped inverted index to
    dictionary_file/postings_file and the collected metadata to disk.

    Fixes vs. previous version:
    - remove_file_ext(file_name) was recomputed up to six times per file;
      now hoisted into `doc_id` once per iteration.
    - the identical pipe-split/strip code for 'Family Members' and
      'Cited By' is factored into _split_pipe_list.
    - print uses the parenthesized single-argument form, which behaves
      identically on Python 2 and also compiles on Python 3.
    """
    files_to_index = [f for f in listdir(index_directory) if isfile(join(index_directory, f))]
    index = []
    doc_lengths = {}
    IPC_class = {}
    UPC_class = {}
    family_members = {}
    cited_by = {}
    counter = 0
    doc_top_terms = {}
    for file_name in files_to_index:
        file_path = format_directory_path(index_directory) + file_name
        doc_id = remove_file_ext(file_name)  # hoisted: used as the key everywhere below
        # Read XML
        tree = ET.parse(file_path)
        root = tree.getroot()
        tokens = []
        for child in root:
            attr_name = child.attrib['name']
            if attr_name == 'Title' or attr_name == 'Abstract':
                tokens.extend(build_tokens(child.text))
            elif attr_name == 'IPC Class':
                IPC_class[doc_id] = child.text.strip()
            elif attr_name == 'UPC Class':
                UPC_class[doc_id] = child.text.strip()
            elif attr_name == 'Family Members':
                family_members[doc_id] = _split_pipe_list(child.text)
            elif attr_name == 'Cited By':
                cited_by[doc_id] = _split_pipe_list(child.text)
        tokens = helper.remove_stop_words(helper.filter_invalid_characters(tokens))
        doc_top_terms[doc_id] = helper.get_top_k(tokens, 10)
        # build tokens
        doc_lengths[doc_id] = get_doc_length(tokens)
        index.extend(add_doc_id_to_tokens(tokens, doc_id))
        counter += 1
        if counter % 300 == 0:
            # Progress report roughly every 300 documents.
            print('indexing ............... {}% completed'.format(round(float(counter) / len(files_to_index) * 100, 2)))
    print('Writing index to disk...')
    index = sort_inverted_index(index)
    index = group_index(index)
    write_index_to_disk(index, dictionary_file, postings_file)
    write_meta_data_to_disk(doc_lengths, len(files_to_index), doc_top_terms, UPC_class, IPC_class, family_members, cited_by)
def expand_query(results, doc_top_terms, inverted_index, meta_data):
    """Pseudo-relevance feedback to counter the anomalous-state-of-knowledge problem.

    Takes the top 10% of ranked results, gathers each document's 10 most
    frequent (pre-indexed) terms into one pool, keeps the 10 most frequent
    terms of that pool as a fresh query, and re-runs the search with it.
    Returns the results of the expanded query.
    """
    cutoff = int(0.1 * len(results))
    pooled = [term
              for doc_id in results[:cutoff]
              for term in doc_top_terms[doc_id]]
    new_query = helper.get_top_k(pooled, 10)
    return execute_query([], new_query, [], inverted_index, meta_data)