Example #1
File: preprocess.py Project: zxlzr/CogKR
 def save_data(self):
     # if not os.path.exists(os.path.join(self.root_directory, "ent2ids")):
     #     serialize(self.entity_dict, os.path.join(self.root_directory, "ent2ids"), in_json=True)
     # if not os.path.exists(os.path.join(self.root_directory, "relation2ids")):
     #     serialize(self.relation_dict, os.path.join(self.root_directory, "relation2ids"), in_json=True)
     # if not os.path.exists(os.path.join(self.root_directory, "e1rel_e2.json")):
     #     e1rel_e2 = defaultdict(list)
     #     for head, relation, tail in itertools.chain(self.facts_data, *self.test_tasks.values(),
     #                                                 *self.valid_tasks.values()):
     #         if isinstance(relation, int):
     #             relation = self.id2relation[relation]
     #         e1rel_e2[self.id2entity[head] + relation].append(self.id2entity[tail])
     #     serialize(e1rel_e2, os.path.join(self.root_directory, "e1rel_e2.json"), in_json=True)
     if not os.path.exists(
             os.path.join(self.data_directory, "rel2candidates.json")):
         # map candidate entity ids back to entity names for the JSON dump
         rel2candidates = {
             key: list(map(self.id2entity.__getitem__, value))
             for key, value in self.rel2candidate.items()
         }
         train_tasks = set(
             map(lambda x: x[1],
                 load_facts(os.path.join(self.data_directory,
                                         "train.txt"))))
         # training relations fall back to the full entity list as candidates
         for task in train_tasks:
             rel2candidates[task] = self.id2entity
         serialize(rel2candidates,
                   os.path.join(self.data_directory, "rel2candidates.json"),
                   in_json=True)
     # always persist the raw candidate map alongside the id index files
     serialize(self.rel2candidate,
               os.path.join(self.data_directory, "rel2candidates"))
     save_index(self.id2entity,
                os.path.join(self.data_directory, "ent2id.txt"))
     save_index(self.id2relation,
                os.path.join(self.data_directory, "relation2id.txt"))
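
The serialize, save_index, and load_facts helpers belong to the CogKR project and are not shown in this snippet. As a rough sketch of what save_index is assumed to do here (write one name/id pair per line, in id order), under that assumption only:

def save_index(id2name, path):
    # Hypothetical helper, not the project's actual code: persist an
    # id -> name list as "<name>\t<id>" rows so it can be reloaded later.
    with open(path, 'w', encoding='utf-8') as f:
        for idx, name in enumerate(id2name):
            f.write('{}\t{}\n'.format(name, idx))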
Example #2
def main():

    while True:
        should_stem = input(
            'Do you wish to stem your index? [Y]es/[N]o: ').lower()
        possible_values = ['y', 'yes', 'n', 'no']

        if should_stem in possible_values:
            break
        else:
            print('Please enter a correct response.')

    should_stem = should_stem in ('y', 'yes')
    docs = load_and_tokenize_documents(stem_docs=should_stem)
    index = create_index(docs)
    utils.save_index(index)
    print(f'Index contains {len(index)} unique terms.')
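
load_and_tokenize_documents, create_index, and utils.save_index are project-local helpers. A minimal sketch of the inverted index that create_index could plausibly return, assuming docs is a list of token lists (an assumption, not the project's code):

from collections import defaultdict

def create_index(docs):
    # Hypothetical sketch: map each term to the ids of documents containing it.
    index = defaultdict(set)
    for doc_id, tokens in enumerate(docs):
        for token in tokens:
            index[token].add(doc_id)
    return index

With this structure, len(index) counts unique terms, which is what the final print statement reports.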
Example #3
            # assign each local descriptor to its nearest visual word
            dist2 = distance.cdist(desc, vocabulary, metric='sqeuclidean')
            assignments = np.argmin(dist2, axis=1)
            idx, count = np.unique(assignments, return_counts=True)
            # append an (image id, term frequency) posting per visual word
            for j, c in zip(idx, count):
                index['dbase'][j].append((imID, c))
            index['n'] += 1         # number of indexed images
            index['df'][idx] += 1   # document frequency per visual word
            #index['norm'][imID] = np.float32(nd)
            index['norm'][imID] = np.linalg.norm(count)  # L2 norm of the tf vector

            print('\rindexing {}/{}'.format(i + 1, n_images), end='')
            sys.stdout.flush()
        print('')

        save_index(index, index_file)
        print('{} saved'.format(index_file))

    # ---------
    # RETRIEVAL
    # ---------

    vocabulary = load_data(vocabulary_file)

    print('loading index ...', end=' ')
    sys.stdout.flush()
    index = load_index(index_file)
    print('OK')

    # the small epsilon keeps the division finite for words with zero df
    idf = np.log(index['n'] / (index['df'] + 2**-23))
    idf2 = idf**2.0
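
idf and idf2 prepare tf-idf weights for the retrieval step that follows this excerpt. A rough sketch of how a query could be scored against the inverted file built above, assuming query_counts is the query image's visual-word histogram (the rest of the function is not shown, so this is an assumption):

import numpy as np

def score_images(query_counts, index, idf):
    # Hypothetical scoring, not the original code: accumulate idf^2-weighted
    # tf products per image, then normalize by the stored norms.
    scores = {}
    for word, q_tf in enumerate(query_counts):
        if q_tf == 0:
            continue
        for im_id, tf in index['dbase'][word]:
            scores[im_id] = scores.get(im_id, 0.0) + q_tf * tf * idf[word] ** 2
    q_norm = np.linalg.norm(query_counts)
    return {im_id: s / (q_norm * index['norm'][im_id])
            for im_id, s in scores.items()}

The squared idf mirrors idf2 above: in a dot product of two idf-weighted tf vectors, each word contributes tf_query * tf_db * idf**2.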