def save_data(self):
    # if not os.path.exists(os.path.join(self.root_directory, "ent2ids")):
    #     serialize(self.entity_dict, os.path.join(self.root_directory, "ent2ids"), in_json=True)
    # if not os.path.exists(os.path.join(self.root_directory, "relation2ids")):
    #     serialize(self.relation_dict, os.path.join(self.root_directory, "relation2ids"), in_json=True)
    # if not os.path.exists(os.path.join(self.root_directory, "e1rel_e2.json")):
    #     e1rel_e2 = defaultdict(list)
    #     for head, relation, tail in itertools.chain(self.facts_data, *self.test_tasks.values(),
    #                                                 *self.valid_tasks.values()):
    #         if isinstance(relation, int):
    #             relation = self.id2relation[relation]
    #         e1rel_e2[self.id2entity[head] + relation].append(self.id2entity[tail])
    #     serialize(e1rel_e2, os.path.join(self.root_directory, "e1rel_e2.json"), in_json=True)
    if not os.path.exists(os.path.join(self.data_directory, "rel2candidates.json")):
        # map candidate entity ids back to entity names for the JSON export
        rel2candidates = {
            key: list(map(self.id2entity.__getitem__, value))
            for key, value in self.rel2candidate.items()
        }
        # training relations get the full entity list as candidates
        train_tasks = set(
            map(lambda x: x[1],
                load_facts(os.path.join(self.data_directory, "train.txt"))))
        for task in train_tasks:
            rel2candidates[task] = self.id2entity
        serialize(rel2candidates,
                  os.path.join(self.data_directory, "rel2candidates.json"),
                  in_json=True)
    serialize(self.rel2candidate, os.path.join(self.data_directory, "rel2candidates"))
    save_index(self.id2entity, os.path.join(self.data_directory, "ent2id.txt"))
    save_index(self.id2relation, os.path.join(self.data_directory, "relation2id.txt"))
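# --- Illustrative sketch (not from the original code) -----------------------
# serialize, load_facts and save_index are project utilities that are not part
# of this snippet. A minimal, hypothetical sketch of save_index, under the
# assumption that it writes an id-ordered list as one "name<TAB>id" line per
# entry:

def save_index(id2name, path):
    # id2name is assumed to be a list where position i holds the name of id i
    with open(path, "w", encoding="utf-8") as f:
        for idx, name in enumerate(id2name):
            f.write(f"{name}\t{idx}\n")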
def main():
    possible_values = ['y', 'yes', 'n', 'no']
    # keep prompting until a recognised answer is given
    while True:
        should_stem = input('Do you wish to stem your index? [Y]es/[N]o: ').lower()
        if should_stem in possible_values:
            break
        print('Please enter a correct response.')
    should_stem = should_stem in ['y', 'yes']
    docs = load_and_tokenize_documents(stem_docs=should_stem)
    index = create_index(docs)
    utils.save_index(index)
    print(f'Index contains {len(index)} unique terms.')
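# --- Illustrative sketch (not from the original code) -----------------------
# load_and_tokenize_documents, create_index and utils.save_index are project
# helpers that are not shown in this snippet. As an assumption about the data
# structure main() expects, create_index could build an inverted index mapping
# each term to the documents containing it (so len(index) is the number of
# unique terms, matching the final print):
from collections import defaultdict

def create_index(docs):
    # assumes `docs` maps a document id to its list of tokens
    index = defaultdict(set)
    for doc_id, tokens in docs.items():
        for token in tokens:
            index[token].add(doc_id)
    # freeze postings as sorted lists so the index serialises cleanly
    return {term: sorted(postings) for term, postings in index.items()}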
    # per-image indexing step: quantise descriptors against the vocabulary
    dist2 = distance.cdist(desc, vocabulary, metric='sqeuclidean')
    assignments = np.argmin(dist2, axis=1)
    # histogram of visual-word occurrences for this image
    idx, count = np.unique(assignments, return_counts=True)
    for j, c in zip(idx, count):
        index['dbase'][j].append((imID, c))
    index['n'] += 1          # one more indexed image
    index['df'][idx] += 1    # document frequency of each word present
    # index['norm'][imID] = np.float32(nd)
    index['norm'][imID] = np.linalg.norm(count)
    print('\rindexing {}/{}'.format(i + 1, n_images), end='')
    sys.stdout.flush()
print('')

save_index(index, index_file)
print('{} saved'.format(index_file))

# ---------
# RETRIEVAL
# ---------
vocabulary = load_data(vocabulary_file)

print('loading index ...', end=' ')
sys.stdout.flush()
index = load_index(index_file)
print('OK')

# inverse document frequency (small epsilon avoids division by zero) and its square
idf = np.log(index['n'] / (index['df'] + 2**-23))
idf2 = idf**2.0
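# --- Illustrative sketch (not from the original code) -----------------------
# How idf2 and the inverted index built above are typically combined to score
# a query image against the database. The function name `score_query` and the
# assumption that `query_desc` holds the query's local descriptors are
# hypothetical; norms follow the snippet's choice of raw count norms.
from collections import defaultdict

def score_query(query_desc, vocabulary, index, idf2):
    # quantise the query descriptors exactly as during indexing
    d2 = distance.cdist(query_desc, vocabulary, metric='sqeuclidean')
    words, counts = np.unique(np.argmin(d2, axis=1), return_counts=True)
    q_norm = np.linalg.norm(counts)
    scores = defaultdict(float)
    # walk the postings of each visual word present in the query
    for j, qc in zip(words, counts):
        for imID, dc in index['dbase'][j]:
            scores[imID] += qc * dc * idf2[j]
    # cosine-style normalisation by query and database norms
    return {imID: s / (q_norm * index['norm'][imID]) for imID, s in scores.items()}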