def load_index(self):
    if os.path.exists(os.path.join(self.data_directory, 'ent2id.txt')):
        self.entity_dict = load_index(
            os.path.join(self.data_directory, 'ent2id.txt'))
        print("Loaded preprocessed entity index")
    elif os.path.exists(os.path.join(self.data_directory, 'ent2ids')):
        self.entity_dict = unserialize(os.path.join(
            self.data_directory, "ent2ids"), form='json')
        print("Loaded raw entity index")
    else:
        print("Entity index does not exist")
        self.entity_dict = {}
    if os.path.exists(os.path.join(self.data_directory, 'relation2id.txt')):
        self.relation_dict = load_index(
            os.path.join(self.data_directory, 'relation2id.txt'))
        print("Loaded preprocessed relation index")
    elif os.path.exists(os.path.join(self.data_directory, 'relation2ids')):
        self.relation_dict = unserialize(os.path.join(
            self.data_directory, "relation2ids"), form='json')
        print("Loaded raw relation index")
    else:
        print("Relation index does not exist")
        self.relation_dict = {}
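# A minimal sketch of the module-level load_index helper the method above
# relies on -- an assumption, not the project's actual implementation: it
# presumably parses one "name<TAB>id" pair per line into a dict (the
# separator and column order are guesses).
def _load_index_sketch(path):
    index = {}
    with open(path) as f:
        for line in f:
            name, idx = line.strip().split('\t')
            index[name] = int(idx)
    return index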
def load_data(self):
    self.data_directory = os.path.join(self.root_directory, "data")
    self.entity_dict = load_index(
        os.path.join(self.data_directory, "ent2id.txt"))
    self.relation_dict = load_index(
        os.path.join(self.data_directory, "relation2id.txt"))
    self.facts_data = translate_facts(
        load_facts(os.path.join(self.data_directory, "train.txt")),
        self.entity_dict, self.relation_dict)
    self.test_support = translate_facts(
        load_facts(os.path.join(self.data_directory, "test_support.txt")),
        self.entity_dict, self.relation_dict)
    self.valid_support = translate_facts(
        load_facts(os.path.join(self.data_directory, "valid_support.txt")),
        self.entity_dict, self.relation_dict)
    self.test_eval = translate_facts(
        load_facts(os.path.join(self.data_directory, "test_eval.txt")),
        self.entity_dict, self.relation_dict)
    self.valid_eval = translate_facts(
        load_facts(os.path.join(self.data_directory, "valid_eval.txt")),
        self.entity_dict, self.relation_dict)
    # augment
    with open(os.path.join(self.data_directory, 'pagerank.txt')) as file:
        self.pagerank = [float(line.strip()) for line in file]
    if os.path.exists(os.path.join(self.data_directory, "fact_dist")):
        self.fact_dist = unserialize(
            os.path.join(self.data_directory, "fact_dist"))
    else:
        self.fact_dist = None
    if os.path.exists(os.path.join(self.data_directory, "train_graphs")):
        self.train_graphs = unserialize(
            os.path.join(self.data_directory, "train_graphs"))
    else:
        self.train_graphs = None
    if os.path.exists(os.path.join(self.data_directory, "evaluate_graphs")):
        print("Use evaluate graphs")
        self.evaluate_graphs = unserialize(
            os.path.join(self.data_directory, "evaluate_graphs"))
    else:
        self.evaluate_graphs = None
    if os.path.exists(os.path.join(self.data_directory, "rel2candidates")):
        self.rel2candidate = unserialize(
            os.path.join(self.data_directory, "rel2candidates"))
    else:
        self.rel2candidate = {}
    # self.rel2candidate = {self.relation_dict[key]: value for key, value in self.rel2candidate.items()
    #                       if key in self.relation_dict}
    self.id2entity = sorted(self.entity_dict.keys(), key=self.entity_dict.get)
    self.id2relation = sorted(self.relation_dict.keys(),
                              key=self.relation_dict.get)
    self.data_loaded = True
def main():
    # Get index file
    idx_desired = input('Which index file do you wish to query '
                        '(press enter for default inverted_index.txt): ')
    if idx_desired == '':
        idx_desired = os.path.join(CACHE_DIR, INDEX_FILE)
    else:
        idx_desired = os.path.join(CACHE_DIR, idx_desired)
    index_filename = idx_desired.lower()

    # Determine whether the query should be stemmed
    while True:
        answer = input('Is this a stemmed index? [Y]es/[N]o: ').lower()
        if answer in ('y', 'yes', 'n', 'no'):
            should_stem = answer in ('y', 'yes')
            break
        print('Please enter a correct response.')

    query_prompt = "Specify which type of query you are making [1] TF-IDF [2] BM25: "
    # for i, qtype in enumerate(QUERY_TYPES):
    #     query_prompt += f'[{i+1}] {qtype}\n'

    # Determine how the documents should be ranked
    while True:
        try:
            desired_query_type = int(input(query_prompt)) - 1
            if 0 <= desired_query_type < len(QUERY_TYPES):
                break
        except ValueError:
            pass
        print(f'Please enter an integer between 1-{len(QUERY_TYPES)}.')

    # Format the query appropriately
    original_query = input('Please enter your query (keep in mind that casing is irrelevant): ')
    tokenizer = nltk.RegexpTokenizer(indexer.TOKENIZING_REGEX)
    query_tokens = indexer.custom_tokenizer(original_query, tokenizer, stem_doc=should_stem)
    index = utils.load_index(input_file=index_filename)
    print(f'\nYour formatted query is: {" ".join(query_tokens)}')

    # Rank documents
    ranked_docs: dict = rank_documents(query_tokens, index, QUERY_TYPES[desired_query_type])
    top15 = sorted(ranked_docs.items(), reverse=True, key=lambda x: x[1])[:15]
    URLs = []
    print()
    for i, (docID, rank) in enumerate(top15, start=1):
        print(f'{i}.\tDoc {docID}\twith rank {round(rank, 3)}')
        with open(os.path.join(CACHE_DIR, 'documents', str(docID), 'url'), mode='r') as f:
            URLs.append(f.read())

    print('\nView those documents in the browser by clicking on their corresponding URLs:\n')
    for i, url in enumerate(URLs):
        print(f'{i+1}.\t{url}')
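# rank_documents itself is not shown. As a reference point only, a generic
# tf-idf term weight -- an assumption about what the TF-IDF query type
# computes, not the project's code:
import math

def tfidf_weight(tf, df, N):
    """(1 + log tf) * log(N / df); zero when the term is absent from the
    document or the collection. A document's score sums this over terms."""
    if tf == 0 or df == 0:
        return 0.0
    return (1.0 + math.log(tf)) * math.log(N / df)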
def run():
    args = init_params()
    print(
        f'\nCompression performed on {args.input_file} and stored in {args.output_file}'
    )
    unfiltered: dict = utils.load_index(args.input_file)
    table = {
        'unfiltered': {
            'tokens': {
                'number': len(unfiltered),
                'delta %': round(0.0, 2),
                'total %': round(0.0, 2)
            },
            'non-positional postings': {
                'number': sum(unfiltered[token][0] for token in unfiltered),
                'delta %': round(0.0, 2),
                'total %': round(0.0, 2)
            }
        }
    }
    no_numbers: dict = remove_numbers(unfiltered.copy())
    table = update_table(table, 'unfiltered', 'no numbers', no_numbers)
    case_folding: dict = case_fold(no_numbers)
    table = update_table(table, 'no numbers', 'case folding', case_folding)
    remove30 = remove_stop_words(case_folding, stop_words[:30])
    table = update_table(table, 'case folding', '30 stop words', remove30)
    remove150 = remove_stop_words(case_folding, stop_words)
    table = update_table(table, '30 stop words', '150 stop words', remove150)
    final = remove150
    utils.save_index_to_disk(final, args.output_file)
    print('\nCompression Table:')
    print(display_table(table))
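# A minimal sketch of one compression step -- assuming case_fold lowercases
# tokens and merges the postings of tokens that collide after folding, with
# the index mapping token -> (doc_frequency, postings). Not the project's
# implementation.
def case_fold_sketch(index):
    folded = {}
    for token, (_, postings) in index.items():
        key = token.lower()
        merged = sorted(set(folded.get(key, (0, []))[1]) | set(postings))
        folded[key] = (len(merged), merged)
    return folded

# e.g. case_fold_sketch({'The': (2, [1, 3]), 'the': (1, [2])})
# returns {'the': (3, [1, 2, 3])}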
def merge_blocks_into_one_index(
        dir=BLOCK_DIR) -> Dict[str, Tuple[int, List[int]]]:
    '''
    Merge all block indices in BLOCK_DIR into a single inverted index.
    '''
    inverted_index = {}
    print('Merging all blocks')
    for block_file in tqdm(os.listdir(dir)):
        block = utils.load_index(os.path.join(dir, block_file))
        for token in block:
            postings: set = inverted_index.get(token, set())
            _, block_postings = block[token]
            postings.update(block_postings)
            inverted_index[token] = postings
    # Convert the accumulated sets into sorted posting lists with frequencies
    for token in inverted_index:
        postings = sorted(inverted_index[token])
        inverted_index[token] = len(postings), postings
    return inverted_index
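# Illustration only (in-memory, no utils or BLOCK_DIR): merging block indices
# of the shape token -> (freq, postings) unions the postings, re-sorts them,
# and recomputes the frequency -- the same semantics as the function above.
def _merge_two_blocks_sketch(block_a, block_b):
    merged = {}
    for block in (block_a, block_b):
        for token, (_, postings) in block.items():
            merged.setdefault(token, set()).update(postings)
    return {t: (len(s), sorted(s)) for t, s in merged.items()}

# e.g. _merge_two_blocks_sketch({'cat': (2, [1, 3])}, {'cat': (1, [2])})
# returns {'cat': (3, [1, 2, 3])}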
    # (fragment: tail of the attendance-insert loop; key/val come from the
    # enclosing iteration over recognized faces)
    dt = val["dt"].strftime("%Y/%m/%d")
    tm = val["dt"].strftime("%H:%M:%S")
    dist = val["dist"]
    values.append((key, 1, dt, tm, dist))
rows = db.addAttendanceMulti(values)
print(f"{rows} inserted....")

u.file_check(DISTANCE_FILE, "recognize_multi.py", "No User exists. Add a user...")
u.file_check(LABELS_FILE, "recognize_multi.py", "User names not found...")
u.file_check(CAM_FILE, "recognize_multi.py", "Cam file not found...")
u.file_check(CONFIG_FILE, "recognize_multi.py", "Config file not found...")

annoy_object = u.load_index(DISTANCE_FILE)
print("[INFO] [recognize_multi.py] Distance file loaded...")
labels = u.read_data(LABELS_FILE)["labels"]
print("[INFO] [recognize_multi.py] Labels file loaded...")
cam_links = u.read_txtfile(CAM_FILE)
print("[INFO] [recognize_multi.py] Cam file loaded...")

configs = eval(u.read_txtfile(CONFIG_FILE)[0])  # config is stored as a Python literal
TIMESTAMP = configs["time_stamp"]
dbConfig = configs["db"]
host, user, passwd, dbname = dbConfig["host"], dbConfig["user"], dbConfig["passwd"], dbConfig["db"]
print("[INFO] [recognize_multi.py] Config file loaded...")

detector = FaceDetectionSSD()
    print('\rindexing {}/{}'.format(i + 1, n_images), end='')
    sys.stdout.flush()
print('')
save_index(index, index_file)
print('{} saved'.format(index_file))

# ---------
# RETRIEVAL
# ---------
vocabulary = load_data(vocabulary_file)
print('loading index ...', end=' ')
sys.stdout.flush()
index = load_index(index_file)
print('OK')

idf = np.log(index['n'] / (index['df'] + 2**-23))  # small epsilon avoids division by zero
idf2 = idf**2.0
n_short_list = 100
score = []
query_list = [image_list[i] for i in range(0, 4 * N_QUERY, 4)]
for fname in query_list:
    imfile = join(base_path, fname)
    # compute low-level features
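# Why idf2? A common choice in bag-of-visual-words retrieval is to fold the
# squared idf weights into the dot product between raw count histograms,
# which equals the cosine between idf-weighted histograms. A sketch under
# that assumption -- not necessarily how this script scores:
import numpy as np

def weighted_cosine(q_hist, d_hist, idf2):
    num = float(np.sum(idf2 * q_hist * d_hist))
    qn = float(np.sqrt(np.sum(idf2 * q_hist ** 2)))
    dn = float(np.sqrt(np.sum(idf2 * d_hist ** 2)))
    return num / (qn * dn + 1e-12)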
def ensure_query_file(qfile):
    if os.path.isfile(qfile):
        return
    with open(qfile, mode='w') as f:
        # write an empty JSON object so utils.load_json_from_disk returns an
        # empty dict instead of raising an exception
        f.write("{}")


if __name__ == '__main__':
    args = init_params()
    utils.ensure_dir_exists('output')
    ensure_query_file(args.output_file)
    if not isinstance(args.query_string, str):
        print("Please provide a query string with the -q flag")
        exit()
    inv_index: Dict[str, Tuple[int, List[int]]] = utils.load_index(args.input_file)
    result: dict = exec_query(args.query_string, inv_index)
    x = result[args.query_string]
    print(f'<{args.query_string}> query was {x["message"]}: {x["frequency"]} hits found')
    queries: dict = utils.load_json_from_disk(args.output_file)
    del x['message']
    queries.update(result)
    utils.write_json_obj_2_disk(queries, args.output_file, indentation=4)
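# Shape of exec_query's result, inferred only from the accesses above
# ("message" is printed and then dropped, "frequency" is printed); the
# values here are illustrative, not from the source:
example_result = {
    "some query": {
        "message": "successful",  # removed before the query log is persisted
        "frequency": 3,           # number of hits found
    }
}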
import time
import os

import utils
from config import *
from tqdm import tqdm

if __name__ == "__main__":
    vocab = utils.get_vocabulary()     # loads vocabulary present in system
    citations = utils.get_citations()  # loads citation counts for doc_ids
    while True:
        print("Enter a word to search the index for:")
        x = input()
        if x in vocab:
            start_time = time.time()
            # Prefer a per-word index file if one exists; otherwise fall back
            # to the shard for the word's first letter.
            if os.path.exists("indexes/inverted_index_" + x + ".pbz2"):
                index = utils.load_index("indexes/inverted_index_" + x)
                loaded = x
            else:
                index = utils.load_index(filename="indexes/inverted_index_" + x[0])
                loaded = x[0]
            end_time = time.time()
            print(("Took {} seconds to load index " + loaded).format(end_time - start_time))
            print(index[x]["doc_frequency"])  # number of docs the term appears in
            # print the top 10 docs for the term: how many times the term
            # occurs in each doc, and each doc's citation count
            for k in list(index[x]["doc_ids"].keys())[:10]:
                print(k, index[x]["doc_ids"][k], citations[k])
        else:
            print("Sorry, this word is not in the index. Try another.")
def main():
    inverted_index = utils.load_index(indexer.INVERTED_INDEX_FILE)
    query_type = {
        'a': 'AND', 'and': 'AND',
        'o': 'OR', 'or': 'OR',
        'r': 'RANKED', 'ranked': 'RANKED',
    }
    requested = ''
    while requested not in query_type:
        if requested != '':
            print(f'\n"{requested}" is not a valid type of search.')
        requested = input("Enter the type of search you want to perform ([a]nd, [o]r, [r]anked): ").lower()
    query_terms = nltk.regexp_tokenize(input("Please enter your query: "), indexer.TOKENIZING_REGEX)
    results = ''
    documentIDs = []
    if query_type[requested] == 'RANKED':
        postings = list(union_postings(query_terms, inverted_index).keys())
        ranking = {}
        L_avg = find_average_document_length()
        for docID in postings:
            ranking[docID] = round(compute_ranking_RSV_11_32(query_terms, docID, L_avg, inverted_index), 2)
        sorted_rankings = sorted(ranking.items(), reverse=True, key=lambda x: x[1])
        top10 = sorted_rankings[:10]
        if len(top10) == 0:
            results = '\nSorry, no documents match your query.'
        for i in range(len(top10)):
            results += f'\n{i+1}. \tDocument ID {top10[i][0]} \twith ranking {top10[i][1]}'
        documentIDs = [t[0] for t in top10]
    elif query_type[requested] == 'AND':
        postings = sorted(intersect_postings(query_terms, inverted_index))
        print(f'\nThere are {len(postings)} documents that contain all query terms.')
        first10 = postings[:10]
        if len(first10) == 0:
            results = '\nSorry, no documents match your query.'
        else:
            print('The first 10 are:')
        for i in range(len(first10)):
            results += f'\n{i+1}. \tDocument ID {first10[i]}'
        documentIDs = first10
    elif query_type[requested] == 'OR':
        doc_vs_occurrence_tuples = union_postings(query_terms, inverted_index).items()
        # sort first by number of query terms appearing in the document, then by ID
        postings = sorted(doc_vs_occurrence_tuples, reverse=True, key=lambda x: (x[1], -x[0]))
        top10 = postings[:10]
        print(f'\nThere are {len(postings)} documents that contain at least one query term.')
        if len(top10) == 0:
            print('Sorry, no documents match your query.')
        for i in range(len(top10)):
            results += f'\n{i+1}. \tDocument ID {top10[i][0]} \tcontains {top10[i][1]} query terms'
        documentIDs = [t[0] for t in top10]
    else:
        raise AssertionError("Program flow should never reach this code block")
    print(results)
    if len(documentIDs) > 0:
        generated_html_path = generate_html(query_terms, query_type[requested], documentIDs)
        print("\nYou can view the contents of those documents in the browser if you open:")
        print(generated_html_path)
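# For reference: the name compute_ranking_RSV_11_32 points at eq. (11.32) of
# Manning et al., "Introduction to Information Retrieval" -- the BM25
# retrieval status value. A minimal per-term sketch; the parameter names and
# defaults (k1, b) are assumptions, not the project's implementation:
import math

def rsv_11_32_term(tf, df, N, L_d, L_avg, k1=1.5, b=0.75):
    """One term's contribution:
    log(N/df) * ((k1+1)*tf) / (k1*((1-b) + b*L_d/L_avg) + tf).
    A document's RSV sums this over the query terms."""
    if df == 0 or tf == 0:
        return 0.0
    idf = math.log(N / df)
    return idf * ((k1 + 1) * tf) / (k1 * ((1 - b) + b * (L_d / L_avg)) + tf)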