def load_data(self):
    self.data_path = "data/annotated/" + self.name + ".json"
    make_path(self.data_path)
    if os.path.exists(self.data_path):
        self.ann_data = load_json(self.data_path)
    else:
        self.ann_data = []
    self.raw_data = load_json(config.processed_path)
    self.total_num = len(self.raw_data)
    self.annotated_num = len(self.ann_data)
    self.position = self.annotated_num  # the page showing
def do_niv(X_test, X_train, T_train, Y_train, n_niv_params, dataset_name, fold_idx):
    niv_filename = 'niv_' + dataset_name
    fold_name = 'fold' + str(fold_idx + 1)
    niv_vars = load_json(niv_filename)
    survived_vars = niv_vars.get(fold_name) if niv_vars else None
    if survived_vars:
        print('Stored NIV:', survived_vars)
        X_test = X_test[survived_vars]
        X_train = X_train[survived_vars]
    else:
        niv_start_time = time.time()
        print('Start NIV variable selection')
        survived_vars = niv_variable_selection(X_train, Y_train, T_train,
                                               n_niv_params)
        print('NIV:', list(survived_vars))
        X_train = X_train[survived_vars]
        X_test = X_test[survived_vars]
        niv_end_time = time.time()
        print('NIV time:', niv_end_time - niv_start_time)
        if niv_vars:
            niv_vars.update({fold_name: survived_vars.tolist()})
        else:
            niv_vars = {fold_name: survived_vars.tolist()}
        save_json(niv_filename, niv_vars)
    return X_test, X_train
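# A minimal usage sketch for do_niv inside a cross-validation loop. Everything
# here is hypothetical (pandas DataFrames X/Y/T, a scikit-learn KFold, and the
# parameter values); it only illustrates the call pattern and the per-fold
# caching that do_niv performs via load_json/save_json.
# from sklearn.model_selection import KFold
# kfold = KFold(n_splits=5)
# for fold_idx, (tr, te) in enumerate(kfold.split(X)):
#     X_te, X_tr = do_niv(X.iloc[te], X.iloc[tr], T.iloc[tr], Y.iloc[tr],
#                         n_niv_params=20, dataset_name='hillstrom',
#                         fold_idx=fold_idx)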
def select_relations(visualgenome_path, house_objects_path, model_path):
    ''' Select relations about how often attributes belong to objects of the house domain '''
    attribute_frequency = load_json(
        join(visualgenome_path, 'attribute_frequencies.json'))
    groups = classification(attribute_frequency.values(), model_path)
    save_json(groups, join(visualgenome_path, 'attribute_classes.json'))
    if 'others' in groups:
        del groups['others']
    attribute_knowledge, relations = extract_knowledge(attribute_frequency, groups)
    house_objects = {
        v.replace(' ', '_'): k['dbpedia_uri']
        for v, k in load_json(house_objects_path).items()
    }
    save_json(attribute_knowledge,
              join(visualgenome_path, 'attribute_knowledge.json'))
    create_triples(relations, house_objects, visualgenome_path)
def is_already_config(self):
    if not os.path.isfile(self._CONFIG_FILEPATH):
        return False
    config = load_json(self._CONFIG_FILEPATH)
    config_keys = config.keys()
    if "backup_folder" not in config_keys \
            or "bucket" not in config_keys \
            or "time_interval" not in config_keys:
        return False
    if not os.path.isfile(self._control_key_salt_dir):
        return False
    print("Backup program is already configured")
    self._backup_folder = config["backup_folder"]
    self._bucket = config["bucket"]
    self._time_interval = config["time_interval"]
    self._stat_cache = StatCache(self._stat_cache_dir, self._backup_folder)
    self._object_db = ObjectDB(self._object_db_path)
    self._set_salt()
    while True:
        self._set_control_key()
        if self._correct_password_entered():
            break
    return True
def eval(path1, path2):
    data1 = load_json(path1)
    data2 = load_json(path2)
    count1 = find_ann_num(data1)
    count2 = find_ann_num(data2)
    count_all = min(count1, count2)
    data1 = data1[:count_all]
    data2 = data2[:count_all]
    count = 0
    f_all = 0
    c_e = 0
    c_g = 0
    c_p = 0
    for gt_data, pred_data in zip(data1, data2):
        assert gt_data["context_tokens"] == pred_data["context_tokens"]
        doc_list = gt_data["context_tokens"].split(" ")
        gt_ann = gt_data["ann"]
        pred_ann = pred_data["ann"]
        gt_tokens = get_token_list(doc_list, gt_ann)
        pred_tokens = get_token_list(doc_list, pred_ann)
        f1_matrix = get_f1_matrix(gt_tokens, pred_tokens)
        if len(pred_tokens) != 0:
            p = np.average(np.max(f1_matrix, axis=0))
            r = np.average(np.max(f1_matrix, axis=1))
            f = 2 * p * r / (p + r) if p > 0 and r > 0 else 0
            f_all += f
            count += 1
        c_e += get_em(gt_tokens, pred_tokens)
        c_g += len(gt_tokens)
        c_p += len(pred_tokens)
    print(c_e / c_g)      # exact-match count over ground-truth spans
    print(c_e / c_p)      # exact-match count over predicted spans
    print(f_all / count)  # mean soft F1
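# Worked example of the soft-F1 aggregation above, with a synthetic f1_matrix
# (rows = ground-truth spans, columns = predicted spans); only the numbers are
# made up, the formulas match the function:
import numpy as np

f1_matrix = np.array([[0.8, 0.0],
                      [0.1, 0.5]])
p = np.average(np.max(f1_matrix, axis=0))  # best gt match per prediction: (0.8 + 0.5) / 2 = 0.65
r = np.average(np.max(f1_matrix, axis=1))  # best prediction per gt span:  (0.8 + 0.5) / 2 = 0.65
f = 2 * p * r / (p + r)                    # harmonic mean: 0.65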
def __init__(self, player):
    self.button_events = btn_events(self)
    self.win = main_window(self, self.button_events)
    self.player = player
    # self.player.play()
    self.win.show()
    self.canvasHandler = canvasHandler(self, self.win, self.win.ui.label,
                                       self.player)
    self.canvasHandler.frameSignal.connect(self.win.display_video)
    """
    Initialize the electronic fence and add the drawn areas.
    For the next step, see display_video() in ./ui_controller/view.py.
    """
    self.crossing_detector = crossing_detector("A")
    ok, area = load_json('./area.txt')
    self.crossing_detector.add_area_dict(area)
    ok, area = load_json('./area2.txt')
    self.crossing_detector.add_area_dict(area)
def preprocess(path, pid=None):
    data = load_json(path)
    list_data = []
    for i in data:
        list_data += i["paragraphs"]
    data = list_data
    data = sample(data, config.num_sample)
    count = 0
    tokenizer = StanfordTokenizer()
    examples = []
    tqdm_text = get_tqdm_text(pid, 3)
    for j in tqdm(data, desc=tqdm_text, position=pid):
        c = j["context"].replace("''", '" ').replace("``", '" ')  # .lower()
        tc = tokenizer.tokenize(c)
        if len(tc) > config.max_len:
            continue
        if isinstance(tokenizer, StanfordTokenizer):
            c_idx = tokenizer.character_level_idx()
        else:
            c_idx = convert_idx(tc)
        y1s, y2s = [], []
        answer_texts = []
        qas_sorted = sorted(j["qas"],
                            key=lambda x: x["answers"][0]["answer_start"])
        for k in qas_sorted:
            # q = k["question"].replace("''", '" ').replace("``", '" ').lower()
            # we don't use the question here
            ans = k["answers"][0]
            a_s = ans["answer_start"]
            a = ans["text"].replace("''", '" ').replace("``", '" ')  # .lower()
            a_e = a_s + len(a)
            answer_span = []
            for idx, span in enumerate(c_idx):
                if not (a_e <= span[0] or a_s >= span[1]):
                    answer_span.append(idx)
            assert len(answer_span) > 0, "Didn't find answer span"
            # y1s.append(answer_span[0])
            # y2s.append(answer_span[-1])
            answer_texts.append((a, answer_span[0], answer_span[-1]))
            count += 1
        examples.append({
            "context_tokens": " ".join(tc),
            "answers": answer_texts
            # "ans_starts": y1s,
            # "ans_ends": y2s
        })
    print(count / len(data))
    return examples
def test(args, MODEL_LOC, LABEL_JSON_LOC):
    print_statement('LOAD EMBEDDINGS')
    label_map = load_json(LABEL_JSON_LOC, reverse=True, name='Label Mapping')
    with open('dataset/ind2token', 'rb') as f:
        ind2token = pickle.load(f)
    with open('dataset/token2ind', 'rb') as f:
        token2ind = pickle.load(f)
    with open('dataset/embeddings_vector', 'rb') as f:
        embeddings_vector = pickle.load(f)
    print_value('Embed shape', embeddings_vector.shape)
    print_value('Vocab size', len(ind2token))
    batch_size = args.batch_size
    embedding_size = embeddings_vector.shape[1]
    model = TextCNN(batch_size=batch_size,
                    c_out=args.c_out,
                    output_size=args.num_classes,
                    vocab_size=len(ind2token),
                    embedding_size=embedding_size,
                    embeddings_vector=torch.from_numpy(embeddings_vector),
                    kernel_sizes=args.kernel_sizes,
                    trainable=args.embed_trainable,
                    p=args.p)
    model.to(args.device)
    ckpt = torch.load(MODEL_LOC, map_location=args.device)
    model.load_state_dict(ckpt["state_dict"])
    model.eval()
    print_statement('MODEL TESTING')
    qcdataset = QCDataset(token2ind, ind2token, split='test', batch_first=True)
    dataloader_test = DataLoader(qcdataset,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 collate_fn=qcdataset.collate_fn)
    ct = ClassificationTool(len(label_map))
    accs = []
    length = []
    for batch_inputs, batch_targets in dataloader_test:
        batch_inputs = batch_inputs.to(args.device)
        batch_targets = batch_targets.to(args.device)
        with torch.no_grad():
            output = model(batch_inputs)
        acc = torch.sum(output.argmax(dim=1) == batch_targets)
        accs.append(acc)
        length.append(len(batch_targets))
        ct.update(output, batch_targets)
    test_acc = float(np.sum(accs)) / sum(length)
    print('Testing on {} data:'.format(sum(length)))
    print('+ Overall ACC: {:.3f}'.format(test_acc))
    PREC, REC, F1 = ct.get_result()
    for i, classname in enumerate(label_map.values()):
        print('* {} PREC: {:.3f}, {} REC: {:.3f}, {} F1: {:.3f}'.format(
            classname[:3], PREC[i], classname[:3], REC[i], classname[:3], F1[i]))
def create_dataset(visualgenome_raw_path, visualgenome_parsed_path):
    ''' Create a dataset of objects and their attributes using VisualGenome dataset '''
    visualgenome_data = load_json(
        join(visualgenome_raw_path, 'attributes.json'))
    attribute_synsets = load_json(
        join(visualgenome_raw_path, 'attribute_synsets.json'))
    frequency_data = {}
    for image in visualgenome_data:
        objects = set()
        for attribute_data in image['attributes']:
            if 'attributes' in attribute_data and len(
                    set(attribute_data['synsets'])) == 1:
                object_name = attribute_data['synsets'][0]
                assigned = assign_attribute(object_name,
                                            attribute_data['attributes'],
                                            attribute_synsets, frequency_data)
                if assigned and object_name not in objects:
                    objects.add(object_name)
                    frequency_data[object_name]['images'] += 1
    logging.info('Size: %s objects selected' % len(frequency_data))
    save_json(frequency_data,
              join(visualgenome_parsed_path, 'attribute_frequencies.json'))
def select_relations(frame_parsed_path, house_objects_path):
    ''' Select unique relations of frames about the house's objects '''
    house_objects = {v.replace(' ', '_'): k['dbpedia_uri']
                     for v, k in load_json(house_objects_path).items()}
    house_object_uris = set(house_objects.values())
    house_object_names = set(house_objects.keys())
    frame_instances = load_json(join(frame_parsed_path, 'frame_instances.json'))
    netlemma = map_netlemma()
    wn31db = map_wn31db()
    triple_uris = []
    triple_labels = []
    # iterate over a copy of the keys: entries are deleted during iteration
    for frame_id in list(frame_instances.keys()):
        valid_frame = False
        frame = frame_instances[frame_id]
        frame_uris, frame_labels = create_triples(frame['type'],
                                                  frame['elements'],
                                                  wn31db, netlemma)
        for i in range(len(frame_uris)):
            object_uri = re.match('<(.+)> <(.+)> <(.+)>', frame_uris[i]).group(1)
            if object_uri in house_object_uris and frame_uris[i] not in triple_uris:
                triple_uris.append(frame_uris[i])
                triple_labels.append(frame_labels[i])
                valid_frame = True
            else:
                object_name = re.match('<(.+)> <(.+)> <(.+)>',
                                       frame_labels[i]).group(1)
                if object_name in house_object_names and frame_uris[i] not in triple_uris:
                    triple_uris.append(frame_uris[i])
                    triple_labels.append(frame_labels[i])
                    valid_frame = True
        if not valid_frame:
            del frame_instances[frame_id]
    calculate_statistics(triple_uris, frame_parsed_path)
    save_file(join(frame_parsed_path, 'selected_triples.nt'), triple_uris)
    save_file(join(frame_parsed_path, 'selected_triples_label.nt'), triple_labels)
    save_file(join(frame_parsed_path, 'selected_verbalized.txt'),
              [verbalize_frame(f['type'], f['elements'].items(), netlemma)
               for f in frame_instances.values()])
    logging.info('Total valid relations with URIs: %s' % len(triple_uris))
def load_model(expdir, model_type, ckpt_name='bestmodel'):
    """Load a pre-trained model.

    Args:
        expdir (str): directory where the model checkpoint is saved.
        model_type (str): either "CNN_classifier" or "MLP_regressor", depending
            on what type of model we wish to load.
        ckpt_name (str, optional): identifier of the model checkpoint we wish
            to load.

    Returns:
        model (tf.keras.Model): a pre-trained model.
    """
    param_file = os.path.join(expdir, 'params.json')
    model_params = utils.load_json(param_file)
    ckpt_path = os.path.join(expdir, 'ckpts/{}-1'.format(ckpt_name))
    if model_type == 'CNN_classifier':
        return load_cnn_classifier(model_params, ckpt_path)
    else:
        return load_mlp_regressor(model_params, ckpt_path)
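# A minimal usage sketch (the experiment directory and input batch are
# hypothetical; load_model only needs expdir to contain params.json and the
# ckpts/ subdirectory assembled above):
# model = load_model('experiments/run1', 'CNN_classifier')
# preds = model.predict(x_batch)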
def validate_relations(conceptnet_path):
    ''' Validate that the searched objects are the subjects in the relation '''
    validated_relations = []
    for file_name in os.listdir(conceptnet_path):
        data = load_json(join(conceptnet_path, file_name))
        result = re.match('.+node=(.+)&.+', data['@id'])
        if result:
            object_id = result.group(1)
        else:
            object_id = None
        for element in data['edges']:
            if object_id == element['start']['@id']:
                object1 = element['start']['@id'].split('/')[-1]
                object2 = element['end']['@id'].split('/')[-1]
                relation = element['rel']['@id'].split('/')[-1]
                validated_relations.append((object1, relation, object2))
    return validated_relations
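# Sketch of the JSON shape validate_relations expects, inferred from the keys
# it reads above; the concrete values are illustrative, not real API output:
# {
#   "@id": "/query?node=/c/en/table&other=/c/en",
#   "edges": [
#     {"start": {"@id": "/c/en/table"},
#      "rel":   {"@id": "/r/AtLocation"},
#      "end":   {"@id": "/c/en/kitchen"}}
#   ]
# }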
def _reset(self, data_path, save):
    if not save:
        print("extract arch2vec on DARTS search space ...")
        dataset = load_json(data_path)
        print("length of the dataset: {}".format(len(dataset)))
        self.f_path = os.path.join(self.dir_name, 'arch2vec-darts.pt')
        if os.path.exists(self.f_path):
            print('{} is already saved'.format(self.f_path))
            exit()
        print('save to {}'.format(self.f_path))
        counter = 0
        self.model.eval()
        for k, v in dataset.items():
            adj = torch.Tensor(v[0]).unsqueeze(0).cuda()
            ops = torch.Tensor(one_hot_darts(v[1])).unsqueeze(0).cuda()
            adj, ops, prep_reverse = preprocessing(adj, ops, **cfg['prep'])
            with torch.no_grad():
                x, _ = self.model._encoder(ops, adj)
            self.embedding[counter] = {
                'feature': x.squeeze(0).mean(dim=0).cpu(),
                'genotype': process(v[2])
            }
            print("{}/{}".format(counter, len(dataset)))
            counter += 1
        torch.save(self.embedding, self.f_path)
        print("finished arch2vec extraction")
        exit()
    else:
        self.f_path = os.path.join(self.dir_name, 'arch2vec-darts.pt')
        print("load arch2vec from: {}".format(self.f_path))
        self.embedding = torch.load(self.f_path)
        for ind in range(len(self.embedding)):
            self.features.append(self.embedding[ind]['feature'])
            self.genotype.append(self.embedding[ind]['genotype'])
        self.features = torch.stack(self.features, dim=0)
        print('loading finished. pretrained embeddings shape: {}'.format(
            self.features.shape))
def build_lipreadingnet(self, config_path, weights='', extract_feats=False):
    if os.path.exists(config_path):
        args_loaded = load_json(config_path)
        print('Lipreading configuration file loaded.')
        tcn_options = {
            'num_layers': args_loaded['tcn_num_layers'],
            'kernel_size': args_loaded['tcn_kernel_size'],
            'dropout': args_loaded['tcn_dropout'],
            'dwpw': args_loaded['tcn_dwpw'],
            'width_mult': args_loaded['tcn_width_mult']
        }
    net = Lipreading(tcn_options=tcn_options,
                     backbone_type=args_loaded['backbone_type'],
                     relu_type=args_loaded['relu_type'],
                     width_mult=args_loaded['width_mult'],
                     extract_feats=extract_feats)
    if len(weights) > 0:
        print('Loading weights for lipreading stream')
        net.load_state_dict(torch.load(weights))
    return net
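# Sketch of the lipreading config JSON this builder expects, inferred from the
# keys read above (all values are illustrative assumptions):
# {
#   "tcn_num_layers": 4, "tcn_kernel_size": 3, "tcn_dropout": 0.2,
#   "tcn_dwpw": false, "tcn_width_mult": 1,
#   "backbone_type": "resnet", "relu_type": "prelu", "width_mult": 1.0
# }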
def _reset(self, data_path, save):
    if not save:
        print("extract arch2vec from {}".format(
            os.path.join(self.dir_name, self.model_path)))
        if not os.path.exists(os.path.join(self.dir_name, self.model_path)):
            exit()
        dataset = load_json(data_path)
        self.model = Model(input_dim=5, hidden_dim=128, latent_dim=16,
                           num_hops=5, num_mlp_layers=2, dropout=0,
                           **cfg['GAE']).cuda()
        self.model.load_state_dict(
            torch.load(os.path.join(self.dir_name,
                                    self.model_path).format(args.dim))['model_state'])
        self.model.eval()
        with torch.no_grad():
            print("length of the dataset: {}".format(len(dataset)))
            self.f_path = os.path.join(self.dir_name,
                                       'arch2vec-{}'.format(self.model_path))
            if os.path.exists(self.f_path):
                print('{} is already saved'.format(self.f_path))
                exit()
            print('save to {}'.format(self.f_path))
            for ind in range(len(dataset)):
                adj = torch.Tensor(
                    dataset[str(ind)]['module_adjacency']).unsqueeze(0).cuda()
                ops = torch.Tensor(
                    dataset[str(ind)]['module_operations']).unsqueeze(0).cuda()
                adj, ops, prep_reverse = preprocessing(adj, ops, **cfg['prep'])
                test_acc = dataset[str(ind)]['test_accuracy']
                valid_acc = dataset[str(ind)]['validation_accuracy']
                time = dataset[str(ind)]['training_time']
                x, _ = self.model._encoder(ops, adj)
                self.embedding[ind] = {'feature': x.squeeze(0).mean(dim=0).cpu(),
                                       'valid_accuracy': float(valid_acc),
                                       'test_accuracy': float(test_acc),
                                       'time': float(time)}
            torch.save(self.embedding, self.f_path)
            print("finish arch2vec extraction")
            exit()
    else:
        self.f_path = os.path.join(self.dir_name, self.emb_path)
        print("load arch2vec from: {}".format(self.f_path))
        self.embedding = torch.load(self.f_path)
        for ind in range(len(self.embedding)):
            self.features.append(self.embedding[ind]['feature'])
        self.features = torch.stack(self.features, dim=0)
        print('loading finished. pretrained embeddings shape: {}'.format(
            self.features.shape))
def get_select_people_complex_payload(crm_complex_data, grp1_label_list, grp2_label_list):
    """
    Generate a two-group search-people complex JSON payload.
    :param crm_complex_data: dict of label data keyed by label name.
    :param grp1_label_list: group 1 label list; the groupId for labels in the list should match the yml.
    :param grp2_label_list: group 2 label list, e.g. ['latestRiskScore', 'installDate']
    :return: payload json
    """
    if not isinstance(crm_complex_data, dict) or len(crm_complex_data) <= 0:
        utils.warn('please pass a valid dict parameter')
        return None
    group1_label_data_list, group2_label_data_list = [], []
    for label in grp1_label_list:
        group1_label_data_list.append(crm_complex_data[label])
    for label in grp2_label_list:
        group2_label_data_list.append(crm_complex_data[label])
    search_complex_payload = SearchLabelGroupReq([
        get_logical_condition(group1_label_data_list),
        get_logical_condition(group2_label_data_list)
    ])
    json_str = utils.dump_obj(search_complex_payload)
    payload_json = utils.load_json(json_str)
    return payload_json
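# A minimal usage sketch (label names and data are hypothetical; they only
# need to exist as keys of crm_complex_data):
# payload = get_select_people_complex_payload(
#     crm_complex_data,
#     grp1_label_list=['latestRiskScore'],
#     grp2_label_list=['installDate'])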
def select_relations(conceptnet_raw_path, conceptnet_parsed_path, house_objects_path):
    ''' Select some relations from the ConceptNet JSON files '''
    validated_relations = validate_relations(conceptnet_raw_path)
    wn31db = map_wn31db()
    relations_with_uris = []
    objects_with_uris = {
        v.replace(' ', '_'): k['dbpedia_uri']
        for v, k in load_json(house_objects_path).items()
    }
    triple_labels = []
    triple_uris = []
    for object1, relation, object2 in validated_relations:
        if object2 not in objects_with_uris:
            objects_with_uris[object2] = to_dbpedia(
                get_uri('/c/en/' + object2, 10), wn31db)
    for object1, relation, object2 in validated_relations:
        # object2 must have a URI (by default object1 has one)
        if objects_with_uris[object2]:
            relation_uri = 'http://ns.inria.fr/deko/ontology/deko.owl#' + relation
            relations_with_uris.append((objects_with_uris[object1], relation,
                                        objects_with_uris[object2]))
            triple_uris.append('<%s> <%s> <%s>' % (objects_with_uris[object1],
                                                   relation_uri,
                                                   objects_with_uris[object2]))
            triple_labels.append('<%s> <%s> <%s>' % (object1, relation, object2))
    calculate_statistics(relations_with_uris, conceptnet_parsed_path)
    save_file(join(conceptnet_parsed_path, 'selected_triples.nt'), triple_uris)
    save_file(join(conceptnet_parsed_path, 'selected_triples_label.nt'),
              triple_labels)
    logging.info('Total valid relations with URIs: %s' % len(relations_with_uris))
def load_samples(samples_file):
    return load_json(samples_file)
def load_config(config_file):
    config = load_json(config_file)
    return config
def make_dictionary(self, question_dir, vocab_file, ent_setup, remove_notfound):
    if os.path.exists(vocab_file):
        print("loading vocabularies from " + vocab_file + " ...")
        vocabularies = list(
            map(lambda x: x.strip(),
                codecs.open(vocab_file, encoding="utf-8").readlines()))
    else:
        print("no " + vocab_file + " found, constructing the vocabulary list ...")
        fnames = glob.glob(question_dir + "/training/*.question")
        dataset_dev = load_json(question_dir + "dev1.0.json")
        dataset_test = load_json(question_dir + "test1.0.json")
        # first TRAINING ****************************************
        vocab_set = set()
        n = 0.
        for fname in fnames:
            fp = open(fname)
            fp.readline()
            fp.readline()
            document = fp.readline().split()
            fp.readline()
            query = fp.readline().split()
            fp.close()
            vocab_set |= set(document) | set(query)
            # show progress
            n += 1
            if n % 10000 == 0:
                print('%3d%%' % int(100 * n / len(fnames)))
        # DEV + TEST *******************************************
        assert ent_setup == "ent-anonym" or ent_setup == "ent"
        for datum in dataset_test[DATA_KEY] + dataset_dev[DATA_KEY]:
            document = to_entities(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                   datum[DOC_KEY][CONTEXT_KEY])
            document = document.lower()
            assert document
            for qa in datum[DOC_KEY][QAS_KEY]:
                doc_raw = document.split()
                question = to_entities(qa[QUERY_KEY]).lower()
                assert question
                qry_raw = question.split()
                ans_raw = ""
                for ans in qa[ANS_KEY]:
                    if ans[ORIG_KEY] == "dataset":
                        ans_raw = ("@entity" +
                                   "_".join(ans[TXT_KEY].split())).lower()
                assert ans_raw
                if remove_notfound:
                    if ans_raw not in doc_raw:
                        found_umls = False
                        for ans in qa[ANS_KEY]:
                            if ans[ORIG_KEY] == "UMLS":
                                umls_answer = ("@entity" + "_".join(
                                    ans[TXT_KEY].split())).lower()
                                if umls_answer in doc_raw:
                                    found_umls = True
                                    ans_raw = umls_answer
                        if not found_umls:
                            continue
                if ent_setup == "ent-anonym":
                    entity_dict = {}
                    entity_id = 0
                    lst = doc_raw + qry_raw
                    lst.append(ans_raw)
                    for word in lst:
                        if (word.startswith('@entity')) and (word not in entity_dict):
                            entity_dict[word] = '@entity' + str(entity_id)
                            entity_id += 1
                    qry_raw = [entity_dict[w] if w in entity_dict else w
                               for w in qry_raw]
                    doc_raw = [entity_dict[w] if w in entity_dict else w
                               for w in doc_raw]
                    ans_raw = entity_dict[ans_raw]
                vocab_set |= set(qry_raw)
                vocab_set |= set(doc_raw)
                vocab_set.add(ans_raw)
                # show progress
                n += 1
                if n % 10000 == 0:
                    print(n)
        entities = set(e for e in vocab_set if e.startswith('@entity'))
        # @placeholder, @begin and @end are included in the vocabulary list
        tokens = vocab_set.difference(entities)
        tokens.add(SYMB_BEGIN)
        tokens.add(SYMB_END)
        vocabularies = list(entities) + list(tokens)
        print("writing vocabularies to " + vocab_file + " ...")
        vocab_fp = codecs.open(vocab_file, "w", encoding="utf-8")
        vocab_fp.write('\n'.join(vocabularies))
        vocab_fp.close()
    vocab_size = len(vocabularies)
    word_dictionary = dict(zip(vocabularies, range(vocab_size)))
    char_set = set([c for w in vocabularies for c in list(w)])
    char_set.add(' ')
    char_dictionary = dict(zip(list(char_set), range(len(char_set))))
    num_entities = len([v for v in vocabularies if v.startswith('@entity')])
    print("vocab_size = %d" % vocab_size)
    print("num characters = %d" % len(char_set))
    print("%d anonymized entities" % num_entities)
    print("%d other tokens (including @placeholder, %s and %s)" %
          (vocab_size - num_entities, SYMB_BEGIN, SYMB_END))
    return word_dictionary, char_dictionary, num_entities
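# Illustration of the "ent-anonym" relabeling performed above (tokens are
# hypothetical): ['@entity_aspirin', 'reduces', '@entity_fever'] becomes
# ['@entity0', 'reduces', '@entity1'] via
# entity_dict = {'@entity_aspirin': '@entity0', '@entity_fever': '@entity1'}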
def parse_file(self, file_path, dictionary, use_chars, ent_setup, remove_notfound):
    """
    Parse a *.json dataset file into a list of questions, where each element
    is a tuple (document, query, answer, filename, query_id).
    """
    questions = []
    w_dict, c_dict = dictionary[0], dictionary[1]
    relabeling_dicts = {}
    raw = load_json(file_path)
    for datum in raw[DATA_KEY]:
        document = to_entities(datum[DOC_KEY][TITLE_KEY] + "\n" +
                               datum[DOC_KEY][CONTEXT_KEY])
        document = document.lower()
        assert document
        for qa in datum[DOC_KEY][QAS_KEY]:
            if ent_setup in ["ent-anonym", "ent"]:
                doc_raw = document.split()
                question = to_entities(qa[QUERY_KEY]).lower()
                qry_id = qa[ID_KEY]
                assert question
                ans_raw = ""
                for ans in qa[ANS_KEY]:
                    if ans[ORIG_KEY] == "dataset":
                        ans_raw = ("@entity" +
                                   "_".join(ans[TXT_KEY].split())).lower()
                assert ans_raw
                if remove_notfound:
                    if ans_raw not in doc_raw:
                        found_umls = False
                        for ans in qa[ANS_KEY]:
                            if ans[ORIG_KEY] == "UMLS":
                                umls_answer = ("@entity" + "_".join(
                                    ans[TXT_KEY].split())).lower()
                                if umls_answer in doc_raw:
                                    found_umls = True
                                    ans_raw = umls_answer
                        if not found_umls:
                            continue
                qry_raw = question.split()
                if ent_setup == "ent-anonym":
                    entity_dict = {}
                    entity_id = 0
                    lst = doc_raw + qry_raw
                    lst.append(ans_raw)
                    for word in lst:
                        if (word.startswith('@entity')) and (word not in entity_dict):
                            entity_dict[word] = '@entity' + str(entity_id)
                            entity_id += 1
                    qry_raw = [entity_dict[w] if w in entity_dict else w
                               for w in qry_raw]
                    doc_raw = [entity_dict[w] if w in entity_dict else w
                               for w in doc_raw]
                    ans_raw = entity_dict[ans_raw]
                    inv_entity_dict = {ent_id: ent_ans
                                       for ent_ans, ent_id in entity_dict.items()}
                    assert len(entity_dict) == len(inv_entity_dict)
                    relabeling_dicts[qa[ID_KEY]] = inv_entity_dict
                cand_e = [w for w in doc_raw if w.startswith('@entity')]
                cand_raw = [[e] for e in cand_e]
                # wrap the query with special symbols
                qry_raw.insert(0, SYMB_BEGIN)
                qry_raw.append(SYMB_END)
                try:
                    cloze = qry_raw.index('@placeholder')
                except ValueError:
                    print('@placeholder not found in ', qry_raw, '. Fixing...')
                    at = qry_raw.index('@')
                    qry_raw = qry_raw[:at] + [''.join(qry_raw[at:at + 2])] + qry_raw[at + 2:]
                    cloze = qry_raw.index('@placeholder')
                # tokens/entities --> indexes
                doc_words = list(map(lambda w: w_dict[w], doc_raw))
                # tokens/entities --> indexes
                qry_words = list(map(lambda w: w_dict[w], qry_raw))
                if use_chars:
                    qry_chars = list(
                        map(lambda w: list(
                            map(lambda c: c_dict.get(c, c_dict[' ']),
                                list(w)[:MAX_WORD_LEN])), qry_raw))
                else:
                    qry_chars = []
                ans = list(map(lambda w: w_dict.get(w, 0), ans_raw.split()))
                cand = [list(map(lambda w: w_dict.get(w, 0), c))
                        for c in cand_raw]
                if use_chars:
                    doc_chars = list(
                        map(lambda w: list(
                            map(lambda c: c_dict.get(c, c_dict[' ']),
                                list(w)[:MAX_WORD_LEN])), doc_raw))
                else:
                    doc_chars = []
                questions.append((doc_words, qry_words, ans, cand, doc_chars,
                                  qry_chars, cloze, qry_id))
            elif ent_setup == "no-ent":
                # collect candidate ents using @entity marks
                cand_e = [w for w in to_entities(
                    datum[DOC_KEY][TITLE_KEY] + "\n" +
                    datum[DOC_KEY][CONTEXT_KEY]).lower().split()
                    if w.startswith('@entity')]
                cand_raw = [e[len("@entity"):].split("_") for e in cand_e]
                document = remove_entity_marks(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                               datum[DOC_KEY][CONTEXT_KEY])
                document = document.lower()
                doc_raw = document.split()
                question = remove_entity_marks(qa[QUERY_KEY]).lower()
                qry_id = qa[ID_KEY]
                assert question
                qry_raw = question.split()
                ans_raw = ""
                for ans in qa[ANS_KEY]:
                    if ans[ORIG_KEY] == "dataset":
                        ans_raw = ans[TXT_KEY].lower()
                assert ans_raw
                if remove_notfound:
                    if ans_raw not in doc_raw:
                        found_umls = False
                        for ans in qa[ANS_KEY]:
                            if ans[ORIG_KEY] == "UMLS":
                                umls_answer = ans[TXT_KEY].lower()
                                if umls_answer in doc_raw:
                                    found_umls = True
                                    ans_raw = umls_answer
                        if not found_umls:
                            continue
                relabeling_dicts[qa[ID_KEY]] = None
                # wrap the query with special symbols
                qry_raw.insert(0, SYMB_BEGIN)
                qry_raw.append(SYMB_END)
                try:
                    cloze = qry_raw.index('@placeholder')
                except ValueError:
                    print('@placeholder not found in ', qry_raw, '. Fixing...')
                    at = qry_raw.index('@')
                    qry_raw = qry_raw[:at] + [''.join(qry_raw[at:at + 2])] + qry_raw[at + 2:]
                    cloze = qry_raw.index('@placeholder')
                # tokens/entities --> indexes
                doc_words = list(map(lambda w: w_dict[w], doc_raw))
                qry_words = list(map(lambda w: w_dict[w], qry_raw))
                if use_chars:
                    qry_chars = list(
                        map(lambda w: list(
                            map(lambda c: c_dict.get(c, c_dict[' ']),
                                list(w)[:MAX_WORD_LEN])), qry_raw))
                else:
                    qry_chars = []
                ans = list(map(lambda w: w_dict.get(w, 0), ans_raw.split()))
                cand = [list(map(lambda w: w_dict.get(w, 0), c))
                        for c in cand_raw]
                if use_chars:
                    doc_chars = list(
                        map(lambda w: list(
                            map(lambda c: c_dict.get(c, c_dict[' ']),
                                list(w)[:MAX_WORD_LEN])), doc_raw))
                else:
                    doc_chars = []
                questions.append((doc_words, qry_words, ans, cand, doc_chars,
                                  qry_chars, cloze, qry_id))
            else:
                raise ValueError
    return questions, relabeling_dicts
def make_dictionary(self, question_dir, vocab_file, ent_setup, remove_notfound):
    vocab_file = "{}_stp{}_remove{}_py3".format(vocab_file, ent_setup,
                                                remove_notfound)
    if os.path.exists(vocab_file):
        print("loading vocabularies from " + vocab_file + " ...")
        vocabularies = list(
            map(lambda x: x.strip(),
                codecs.open(vocab_file, encoding="utf-8").readlines()))
    else:
        print("no " + vocab_file + " found, constructing the vocabulary list ...")
        vocab_set = set()
        n = 0.
        dataset_train = load_json(question_dir + "train1.0.json")
        dataset_dev = load_json(question_dir + "dev1.0.json")
        dataset_test = load_json(question_dir + "test1.0.json")
        if ent_setup in ["ent-anonym", "ent"]:  # treats each entity as a single token
            # train here (remove_notfound=True|False), dev/test below
            for datum in dataset_train[DATA_KEY]:
                document = to_entities(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                       datum[DOC_KEY][CONTEXT_KEY])
                document = document.lower()
                assert document
                for qa in datum[DOC_KEY][QAS_KEY]:
                    doc_raw = document.split()
                    question = to_entities(qa[QUERY_KEY]).lower()
                    assert question
                    qry_raw = question.split()
                    ans_raw = ""
                    for ans in qa[ANS_KEY]:
                        if ans[ORIG_KEY] == "dataset":
                            ans_raw = ("@entity" +
                                       "_".join(ans[TXT_KEY].split())).lower()
                    assert ans_raw
                    if remove_notfound:
                        if ans_raw not in doc_raw:
                            found_umls = False
                            for ans in qa[ANS_KEY]:
                                if ans[ORIG_KEY] == "UMLS":
                                    umls_answer = ("@entity" + "_".join(
                                        ans[TXT_KEY].split())).lower()
                                    if umls_answer in doc_raw:
                                        found_umls = True
                                        ans_raw = umls_answer
                            if not found_umls:
                                continue
                    if ent_setup == "ent-anonym":  # anonymize
                        entity_dict = {}
                        entity_id = 0
                        lst = doc_raw + qry_raw
                        lst.append(ans_raw)
                        for word in lst:
                            if (word.startswith('@entity')) and (word not in entity_dict):
                                entity_dict[word] = '@entity' + str(entity_id)
                                entity_id += 1
                        qry_raw = [entity_dict[w] if w in entity_dict else w
                                   for w in qry_raw]
                        doc_raw = [entity_dict[w] if w in entity_dict else w
                                   for w in doc_raw]
                        ans_raw = entity_dict[ans_raw]
                    vocab_set |= set(qry_raw)
                    vocab_set |= set(doc_raw)
                    vocab_set.add(ans_raw)
                    # show progress
                    n += 1
                    if n % 10000 == 0:
                        print(n)
            # treat dev/test separately to allow remove_notfound=False
            for datum in dataset_test[DATA_KEY] + dataset_dev[DATA_KEY]:
                document = to_entities(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                       datum[DOC_KEY][CONTEXT_KEY])
                document = document.lower()
                assert document
                for qa in datum[DOC_KEY][QAS_KEY]:
                    doc_raw = document.split()
                    question = to_entities(qa[QUERY_KEY]).lower()
                    assert question
                    qry_raw = question.split()
                    ans_raw = ""
                    for ans in qa[ANS_KEY]:
                        if ans[ORIG_KEY] == "dataset":
                            ans_raw = ("@entity" +
                                       "_".join(ans[TXT_KEY].split())).lower()
                    assert ans_raw
                    if ent_setup == "ent-anonym":
                        entity_dict = {}
                        entity_id = 0
                        lst = doc_raw + qry_raw
                        lst.append(ans_raw)
                        for word in lst:
                            if (word.startswith('@entity')) and (word not in entity_dict):
                                entity_dict[word] = '@entity' + str(entity_id)
                                entity_id += 1
                        qry_raw = [entity_dict[w] if w in entity_dict else w
                                   for w in qry_raw]
                        doc_raw = [entity_dict[w] if w in entity_dict else w
                                   for w in doc_raw]
                        ans_raw = entity_dict[ans_raw]
                    vocab_set |= set(qry_raw)
                    vocab_set |= set(doc_raw)
                    vocab_set.add(ans_raw)
                    # show progress
                    n += 1
                    if n % 10000 == 0:
                        print(n)
            entities = set(e for e in vocab_set if e.startswith('@entity'))
            # @placeholder, @begin and @end are included in the vocabulary list
            tokens = vocab_set.difference(entities)
            tokens.add(SYMB_BEGIN)
            tokens.add(SYMB_END)
            vocabularies = list(entities) + list(tokens)
        elif ent_setup == "no-ent":  # ignore entity markings
            # train here (remove_notfound=True|False), dev/test below
            for datum in dataset_train[DATA_KEY]:
                document = remove_entity_marks(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                               datum[DOC_KEY][CONTEXT_KEY])
                document = document.lower()
                assert document
                doc_raw = document.split()
                for qa in datum[DOC_KEY][QAS_KEY]:
                    question = remove_entity_marks(qa[QUERY_KEY]).lower()
                    assert question
                    qry_raw = question.split()
                    ans_raw = ""
                    for ans in qa[ANS_KEY]:
                        if ans[ORIG_KEY] == "dataset":
                            ans_raw = ans[TXT_KEY].lower()
                    assert ans_raw
                    if remove_notfound:
                        if ans_raw not in doc_raw:
                            found_umls = False
                            for ans in qa[ANS_KEY]:
                                if ans[ORIG_KEY] == "UMLS":
                                    umls_answer = ans[TXT_KEY].lower()
                                    if umls_answer in document:
                                        found_umls = True
                                        ans_raw = umls_answer
                            if not found_umls:
                                continue
                    vocab_set |= set(qry_raw)
                    vocab_set |= set(doc_raw)
                    vocab_set.add(ans_raw)
                    # show progress
                    n += 1
                    if n % 10000 == 0:
                        print(n)
            # treat dev/test separately to allow remove_notfound=False
            for datum in dataset_test[DATA_KEY] + dataset_dev[DATA_KEY]:
                document = remove_entity_marks(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                               datum[DOC_KEY][CONTEXT_KEY])
                document = document.lower()
                assert document
                doc_raw = document.split()
                for qa in datum[DOC_KEY][QAS_KEY]:
                    question = remove_entity_marks(qa[QUERY_KEY]).lower()
                    assert question
                    qry_raw = question.split()
                    ans_raw = ""
                    for ans in qa[ANS_KEY]:
                        if ans[ORIG_KEY] == "dataset":
                            ans_raw = ans[TXT_KEY].lower()
                    assert ans_raw
                    vocab_set |= set(qry_raw)
                    vocab_set |= set(doc_raw)
                    vocab_set.add(ans_raw)
                    # show progress
                    n += 1
                    if n % 10000 == 0:
                        print(n)
            entities = set(e for e in vocab_set if e.startswith('@entity'))
            # @placeholder, @begin and @end are included in the vocabulary list
            tokens = vocab_set.difference(entities)
            tokens.add(SYMB_BEGIN)
            tokens.add(SYMB_END)
            vocabularies = list(entities) + list(tokens)
        else:
            raise ValueError
        print("writing vocabularies to " + vocab_file + " ...")
        vocab_fp = codecs.open(vocab_file, "w", encoding="utf-8")
        vocab_fp.write('\n'.join(vocabularies))
        vocab_fp.close()
    vocab_size = len(vocabularies)
    word_dictionary = dict(zip(vocabularies, range(vocab_size)))
    char_set = set([c for w in vocabularies for c in list(w)])
    char_set.add(' ')
    char_dictionary = dict(zip(list(char_set), range(len(char_set))))
    num_entities = len([v for v in vocabularies if v.startswith('@entity')])
    print("vocab_size = %d" % vocab_size)
    print("num characters = %d" % len(char_set))
    print("%d anonymized entities" % num_entities)
    print("%d other tokens (including @placeholder, %s and %s)" %
          (vocab_size - num_entities, SYMB_BEGIN, SYMB_END))
    return word_dictionary, char_dictionary, num_entities
parser.add_argument('--data', type=str, default='data.json',  # reconstructed opening; the original line was truncated before help=
                    help='Data file (default: data.json)')
parser.add_argument('--name', type=str, default='darts')
parser.add_argument('--cfg', type=int, default=4,
                    help='configuration (default: 4)')
parser.add_argument('--bs', type=int, default=32,
                    help='batch size (default: 32)')
parser.add_argument('--epochs', type=int, default=10,
                    help='training epochs (default: 10)')
parser.add_argument('--dropout', type=float, default=0.3,
                    help='decoder implicit regularization (default: 0.3)')
parser.add_argument('--normalize', action='store_true', default=True,
                    help='use input normalization')
parser.add_argument('--input_dim', type=int, default=11)
parser.add_argument('--hidden_dim', type=int, default=128)
parser.add_argument('--dim', type=int, default=16,
                    help='feature dimension (default: 16)')
parser.add_argument('--hops', type=int, default=5)
parser.add_argument('--mlps', type=int, default=2)
parser.add_argument('--latent_points', type=int, default=10000,
                    help='latent points for validity check (default: 10000)')
args = parser.parse_args()
cfg = configs[args.cfg]
dataset = load_json(args.data)
print('using {}'.format(args.data))
print('feat dim {}'.format(args.dim))
pretraining_gae(dataset, cfg)
def smooth_exp(data_path, emb_path, supervised_emb_path, output_path,
               data_type, random_path, path_step, straight_path):
    print('experiments:')
    ## load raw architectures
    dataset = load_json(data_path)
    ## load features & test_acc
    feature, test_acc = read_feature(emb_path)
    feature_sup = np.squeeze(np.load(supervised_emb_path))
    feature_nums = len(dataset)
    ## get start points
    start_idx = np.random.choice(feature_nums, random_path,
                                 replace=False).tolist()
    if straight_path > 0:
        straight_idx = get_straight(dataset, num=straight_path)
        start_idx = np.stack(start_idx + straight_idx)
    ## smoothness experiments
    ops = []
    adj = []
    ops_sup = []
    adj_sup = []
    for k, ind in enumerate(start_idx):
        ops_k = []
        adj_k = []
        prev_node = feature[ind].reshape(1, -1)
        mask = np.zeros(feature_nums, dtype=int)
        ## supervised
        ops_k_sup = []
        adj_k_sup = []
        prev_node_sup = feature_sup[ind].reshape(1, -1)
        mask_sup = np.zeros(feature_nums, dtype=int)
        for i in tqdm(range(path_step),
                      desc='smooth experiment {} of {}'.format(k + 1, len(start_idx))):
            dis = linalg.norm(feature - prev_node, axis=1)
            mdis = ma.masked_array(dis, mask)
            idx = np.argmin(mdis)
            mask[idx] = 1
            prev_node = feature[idx].reshape(1, -1)
            ops_k.append(torch.LongTensor(dataset[str(idx)]['module_operations']))
            adj_k.append(torch.LongTensor(dataset[str(idx)]['module_adjacency']))
            ## supervised
            dis_sup = linalg.norm(feature_sup - prev_node_sup, axis=1)
            mdis_sup = ma.masked_array(dis_sup, mask_sup)
            idx_sup = np.argmin(mdis_sup)
            mask_sup[idx_sup] = 1
            prev_node_sup = feature_sup[idx_sup].reshape(1, -1)
            ops_k_sup.append(torch.LongTensor(dataset[str(idx_sup)]['module_operations']))
            adj_k_sup.append(torch.LongTensor(dataset[str(idx_sup)]['module_adjacency']))
        ops_k = torch.stack(ops_k)
        adj_k = torch.stack(adj_k)
        ops.append(ops_k)
        adj.append(adj_k)
        ops_k_sup = torch.stack(ops_k_sup)
        adj_k_sup = torch.stack(adj_k_sup)
        ops_sup.append(ops_k_sup)
        adj_sup.append(adj_k_sup)
    ## convert to graphs
    for i in tqdm(range(len(start_idx)), desc='draw graphs'):
        G = adj2graph(ops[i], adj[i])
        names = []
        temp_path = '.temp'
        G_sup = adj2graph(ops_sup[i], adj_sup[i])
        names_sup = []
        temp_path_sup = '.temp_sup'
        if not os.path.exists(temp_path):
            os.makedirs(temp_path)
        if not os.path.exists(temp_path_sup):
            os.makedirs(temp_path_sup)
        for j in range(path_step):
            namej = plot_DAG(G[j], temp_path, str(j), data_type, backbone=True)
            names.append(namej)
            namej_sup = plot_DAG(G_sup[j], temp_path_sup, str(j), data_type,
                                 backbone=True)
            names_sup.append(namej_sup)
        ## pave each path into a single image
        if not os.path.exists(os.path.join(output_path, 'unsupervised')):
            os.makedirs(os.path.join(output_path, 'unsupervised'))
        images = [[Image.open(name) for name in names]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'unsupervised',
                         '{}_unsupervised.png'.format(start_idx[i])))
        if not os.path.exists(os.path.join(output_path, 'supervised')):
            os.makedirs(os.path.join(output_path, 'supervised'))
        images = [[Image.open(name) for name in names_sup]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'supervised',
                         '{}_supervised.png'.format(start_idx[i])))
        if not os.path.exists(os.path.join(output_path, 'compare')):
            os.makedirs(os.path.join(output_path, 'compare'))
        images = [[Image.open(name) for name in names],
                  [Image.open(name) for name in names_sup]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'compare',
                         '{}_compare.png'.format(start_idx[i])))
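# Minimal runnable sketch of the masked nearest-neighbour walk used by both
# smooth_exp functions (features are synthetic 2-D points; note that the first
# step selects the start point itself, exactly as in the code above, because
# the start index is not pre-masked):
import numpy as np
import numpy.ma as ma
from numpy import linalg

feature = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]])
mask = np.zeros(len(feature), dtype=int)
prev_node = feature[0].reshape(1, -1)
path = []
for _ in range(3):
    dis = linalg.norm(feature - prev_node, axis=1)
    idx = int(np.argmin(ma.masked_array(dis, mask)))  # nearest unvisited point
    mask[idx] = 1
    prev_node = feature[idx].reshape(1, -1)
    path.append(idx)
print(path)  # [0, 1, 2]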
def smooth_exp_nas201(data_path, emb_path, supervised_emb_path, output_path,
                      random_path, path_step):
    print('experiments (NAS 201):')
    ## load raw architectures
    dataset = load_json(data_path)
    ## load features & test_acc
    feature_raw = torch.load(emb_path)
    feature = []
    for i in tqdm(range(len(feature_raw)), desc='load feature'):
        feature.append(feature_raw[i]['feature'].detach().numpy())
    feature = np.stack(feature)
    feature_sup = np.load(supervised_emb_path)
    feature_nums = len(dataset)
    ## get start points
    start_idx = np.random.choice(feature_nums, random_path,
                                 replace=False).tolist()
    ## smoothness experiments
    ops = []
    ops_sup = []
    for k, ind in enumerate(start_idx):
        ops_k = []
        prev_node = feature[ind].reshape(1, -1)
        mask = np.zeros(feature_nums, dtype=int)
        ## supervised
        ops_k_sup = []
        prev_node_sup = feature_sup[ind].reshape(1, -1)
        mask_sup = np.zeros(feature_nums, dtype=int)
        for i in tqdm(range(path_step),
                      desc='smooth experiment {} of {}'.format(k + 1, len(start_idx))):
            dis = linalg.norm(feature - prev_node, axis=1)
            mdis = ma.masked_array(dis, mask)
            idx = np.argmin(mdis)
            mask[idx] = 1
            prev_node = feature[idx].reshape(1, -1)
            ops_k.append(np.argmax(np.array(dataset[str(idx)]['module_operations']),
                                   axis=1))
            ## supervised
            dis_sup = linalg.norm(feature_sup - prev_node_sup, axis=1)
            mdis_sup = ma.masked_array(dis_sup, mask_sup)
            idx_sup = np.argmin(mdis_sup)
            mask_sup[idx_sup] = 1
            prev_node_sup = feature_sup[idx_sup].reshape(1, -1)
            ops_k_sup.append(np.argmax(np.array(dataset[str(idx_sup)]['module_operations']),
                                       axis=1))
        ops_k = np.stack(ops_k)
        ops.append(ops_k)
        ops_k_sup = np.stack(ops_k_sup)
        ops_sup.append(ops_k_sup)
    ## convert to graphs
    num2ops = {0: 'in', 1: '1x1', 2: '3x3', 3: 'pool', 4: 'skip', 5: 'none',
               6: 'out'}
    x = [130, 300, 280, 40, 150, 320]
    y = [550, 500, 350, 400, 250, 200]
    img = mpimg.imread(os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                    'nas201.jpg'))
    for i in tqdm(range(len(start_idx)), desc='draw graphs'):
        names = []
        temp_path = '.temp'
        names_sup = []
        temp_path_sup = '.temp_sup'
        if not os.path.exists(temp_path):
            os.makedirs(temp_path)
        if not os.path.exists(temp_path_sup):
            os.makedirs(temp_path_sup)
        ops0_prev = []
        ops1_prev = []
        for j in range(path_step):
            namej = os.path.join(temp_path, str(j) + '.jpg')
            names.append(namej)
            ops0 = [num2ops[x] for x in ops[i][j]]
            fig, ax = plt.subplots()
            ax.imshow(img)
            ax.set_xticks([])
            ax.set_yticks([])
            ax.spines['top'].set_visible(False)
            ax.spines['bottom'].set_visible(False)
            ax.spines['left'].set_visible(False)
            ax.spines['right'].set_visible(False)
            for k in range(6):
                if len(ops0_prev) == 0 or ops0[k + 1] == ops0_prev[k + 1]:
                    plt.text(x[k], y[k], ops0[k + 1], fontsize=18, color='blue')
                else:
                    plt.text(x[k], y[k], ops0[k + 1], fontsize=18, color='red')
            plt.savefig(namej, bbox_inches='tight')
            plt.close()
            ops0_prev = ops0
            namej_sup = os.path.join(temp_path_sup, str(j) + '.jpg')
            names_sup.append(namej_sup)
            ops1 = [num2ops[x] for x in ops_sup[i][j]]
            fig, ax = plt.subplots()
            ax.imshow(img)
            ax.set_xticks([])
            ax.set_yticks([])
            ax.spines['top'].set_visible(False)
            ax.spines['bottom'].set_visible(False)
            ax.spines['left'].set_visible(False)
            ax.spines['right'].set_visible(False)
            for k in range(6):
                if len(ops1_prev) == 0 or ops1[k + 1] == ops1_prev[k + 1]:
                    plt.text(x[k], y[k], ops1[k + 1], fontsize=18, color='blue')
                else:
                    plt.text(x[k], y[k], ops1[k + 1], fontsize=18, color='red')
            plt.savefig(namej_sup, bbox_inches='tight')
            plt.close()
            ops1_prev = ops1
        ## pave each path into a single image
        if not os.path.exists(os.path.join(output_path, 'unsupervised')):
            os.makedirs(os.path.join(output_path, 'unsupervised'))
        images = [[Image.open(name) for name in names]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'unsupervised',
                         '{}_unsupervised.png'.format(start_idx[i])))
        if not os.path.exists(os.path.join(output_path, 'supervised')):
            os.makedirs(os.path.join(output_path, 'supervised'))
        images = [[Image.open(name) for name in names_sup]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'supervised',
                         '{}_supervised.png'.format(start_idx[i])))
        if not os.path.exists(os.path.join(output_path, 'compare')):
            os.makedirs(os.path.join(output_path, 'compare'))
        images = [[Image.open(name) for name in names],
                  [Image.open(name) for name in names_sup]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'compare',
                         '{}_compare.png'.format(start_idx[i])))
def create_dataset(conceptnet_raw_path, house_objects_path, relations_path):
    ''' Create a dataset of objects and their relations from Conceptnet '''
    objects = load_json(house_objects_path).keys()
    objects = [x.replace(' ', '_') for x in objects]
    relations = [line.rstrip() for line in open(relations_path)]
    collect_relations(objects, relations, conceptnet_raw_path, 7)
def plot_data(dataset_names, only_table=False):
    for dataset_name in dataset_names:
        print('*** Dataset name:', dataset_name)
        qini_dict = load_json(dataset_name + '_qini')
        var_sel_dict = load_json(dataset_name + '_val_sel')
        plot_all(dataset_name, qini_dict, var_sel_dict, only_table)
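# A minimal usage sketch (dataset names are hypothetical; plot_data expects
# the '<name>_qini' and '<name>_val_sel' JSON files to have been written by
# earlier experiment runs):
# plot_data(['hillstrom', 'criteo'], only_table=True)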
parser.add_argument('--verbose', type=int, default=1,  # reconstructed opening (flag name and default assumed); the original line was truncated before help=
                    help='silent: 0, progress bar: 1, detailed: 2')
args = parser.parse_args()

# Create log object.
if args.mode == 'train':
    sys.stdout = Logger(TRAIN_LOG_LOC)
else:
    sys.stdout = Logger(TEST_LOG_LOC)
print_statement('HYPERPARAMETER SETTING', verbose=args.verbose)
print_flags(args, verbose=args.verbose)

# Load data.
print_statement('DATA PROCESSING', verbose=args.verbose)
label_map = load_json(LABEL_JSON_LOC,
                      reverse=True,
                      name='Label Mapping',
                      verbose=args.verbose)
train_data = load_json(TRAIN_JSON_LOC,
                       label_map,
                       name='Training Set',
                       verbose=args.verbose)
val_data = load_json(VAL_JSON_LOC,
                     label_map,
                     name='Validation Set',
                     verbose=args.verbose)
test_data = load_json(TEST_JSON_LOC,
                      label_map,
                      name='Test Set',
                      verbose=args.verbose)

# Train model.
def _reset(self, data_path, save):
    if not save:
        print("extract arch2vec embedding table...")
        dataset = load_json(data_path)
        self.model = Model(input_dim=args.input_dim,
                           hidden_dim=args.hidden_dim,
                           latent_dim=args.latent_dim,
                           num_hops=args.hops,
                           num_mlp_layers=args.mlps,
                           dropout=args.dropout,
                           **cfg['GAE']).cuda()
        model_ckpt_path = os.path.join(self.dir_name, '{}'.format(args.model_path))
        if not os.path.exists(model_ckpt_path):
            print("File {} does not exist.".format(model_ckpt_path))
            exit()
        self.model.load_state_dict(torch.load(model_ckpt_path)['model_state'])
        self.model.eval()
        print("length of the dataset: {}".format(len(dataset)))
        self.f_path = os.path.join(self.dir_name,
                                   '{}-arch2vec.pt'.format(args.dataset_name))
        if os.path.exists(self.f_path):
            print('ATTENTION!!! {} is already saved.'.format(self.f_path))
            exit()
        print('save to {} ...'.format(self.f_path))
        for ind in range(len(dataset)):
            adj = torch.Tensor(
                dataset[str(ind)]['module_adjacency']).unsqueeze(0).cuda()
            ops = torch.Tensor(
                dataset[str(ind)]['module_operations']).unsqueeze(0).cuda()
            adj, ops, prep_reverse = preprocessing(adj, ops, **cfg['prep'])
            test_acc = dataset[str(ind)]['test_accuracy']
            valid_acc = dataset[str(ind)]['validation_accuracy']
            other_info = {
                'valid_accuracy_avg': dataset[str(ind)]['validation_accuracy_avg'],
                'test_accuracy_avg': dataset[str(ind)]['test_accuracy_avg']
            }
            time = dataset[str(ind)]['training_time']
            x, _ = self.model._encoder(ops, adj)
            self.embedding[ind] = {
                'feature': x.mean(dim=1).squeeze(0).cpu(),
                'valid_accuracy': float(valid_acc),
                'test_accuracy': float(test_acc),
                'time': float(time),
                'other_info': other_info
            }
        torch.save(self.embedding, self.f_path)
        print("finished arch2vec extraction")
        exit()
    else:
        self.f_path = os.path.join(self.dir_name,
                                   '{}-arch2vec.pt'.format(args.dataset_name))
        print("load pretrained arch2vec in path: {}".format(self.f_path))
        self.embedding = torch.load(self.f_path)
        random.seed(args.seed)
        random.shuffle(self.embedding)
        for ind in range(len(self.embedding)):
            self.features.append(self.embedding[ind]['feature'])
        self.features = torch.stack(self.features, dim=0)
        print('loading finished. pretrained embeddings shape: {}'.format(
            self.features.shape))
def evaluate_helmet_image_sgcc_score(predicted_file_json_path: str,
                                     gold_json_file_path: str,
                                     iou_threshold: float,
                                     false_detection_weight: float,
                                     missed_detection_weight: float,
                                     object_detection_weight: float):
    """
    Calculate the SGCC helmet image score from the predicted and gold JSON files.
    """
    try:
        gt_data = load_json(gold_json_file_path)
        pred_data = load_json(predicted_file_json_path)
        # load the names of categories
        class_name_list = []
        for class_item in gt_data['categories']:
            if isinstance(class_item['name'], list):
                class_name_list.append(class_item['name'][0])
            else:
                class_name_list.append(class_item['name'])
        class_name_dict = {'wear_helmet_label': 1, 'no_helmet_label': 0}
        # traverse the images, one picture per batch
        false_detection_count = 0
        detection_no_wear_total_count = 0
        missed_detection_count = 0
        gold_no_wear_total_count = 0
        object_detection_correct_count = 0
        object_detection_total_count = 0
        for i in range(len(gt_data['images'])):
            image_id = gt_data['images'][i]['id']
            # load gold annotations: ann_gt = n * [cls_id, x1, y1, x2, y2]
            labels_gt, ann_gt = get_ann(image_id, gt_data['annotations'])
            # load predicted annotations: ann_pred = n * [x1, y1, x2, y2, pred_score, cls_id]
            _, ann_pred = get_ann(image_id, pred_data)
            # sort the predictions by confidence score in descending order
            if len(ann_pred):
                ann_pred = ann_pred[(-ann_pred[:, 4]).argsort()]
            ann_pred = torch.Tensor(ann_pred)
            ann_gt = torch.Tensor(ann_gt)
            # predicted no_wear boxes and labels
            if len(ann_pred) == 0:
                pred_no_wear_indices, pred_no_wear_labels, pred_no_wear_boxes = [], [], []
            else:
                pred_no_wear_indices = torch.where(
                    ann_pred[:, -1] == class_name_dict['no_helmet_label'])
                pred_no_wear_labels = ann_pred[:, -1][pred_no_wear_indices]
                pred_no_wear_boxes = ann_pred[:, :4][pred_no_wear_indices]
            # target no_wear boxes and labels
            if len(ann_gt) == 0:
                target_no_wear_indices, target_no_wear_labels, target_no_wear_boxes = [], [], []
            else:
                target_no_wear_indices = torch.where(
                    ann_gt[:, 0] == class_name_dict['no_helmet_label'])
                target_no_wear_labels = ann_gt[:, 0][target_no_wear_indices]
                target_no_wear_boxes = ann_gt[:, 1:][target_no_wear_indices]
            false_detection_number, detection_no_wear_number = helmet_image_false_detection(
                pred_no_wear_labels=pred_no_wear_labels,
                pred_no_wear_boxes=pred_no_wear_boxes,
                target_no_wear_labels=target_no_wear_labels,
                target_no_wear_boxes=target_no_wear_boxes,
                iou_threshold=iou_threshold)
            false_detection_count += false_detection_number
            detection_no_wear_total_count += detection_no_wear_number
            missed_detection_number, gold_no_wear_number = helmet_image_missed_detection(
                pred_no_wear_labels=pred_no_wear_labels,
                pred_no_wear_boxes=pred_no_wear_boxes,
                target_no_wear_labels=target_no_wear_labels,
                target_no_wear_boxes=target_no_wear_boxes,
                iou_threshold=iou_threshold)
            missed_detection_count += missed_detection_number
            gold_no_wear_total_count += gold_no_wear_number
            object_detection_correct_number, object_detection_total_number = helmet_image_object_detection(
                pred_no_wear_labels=pred_no_wear_labels,
                pred_no_wear_boxes=pred_no_wear_boxes,
                target_no_wear_labels=target_no_wear_labels,
                target_no_wear_boxes=target_no_wear_boxes,
                iou_threshold=iou_threshold)
            object_detection_correct_count += object_detection_correct_number
            object_detection_total_count += object_detection_total_number
        false_detection_rate = (false_detection_count / detection_no_wear_total_count) \
            if detection_no_wear_total_count != 0 else 0
        missed_detection_rate = (missed_detection_count / gold_no_wear_total_count) \
            if gold_no_wear_total_count != 0 else 0
        object_detection_correct_rate = (object_detection_correct_count /
                                         object_detection_total_count) \
            if object_detection_total_count != 0 else 0
        logger.info("false_detection_rate: {} / {} = {}".format(
            false_detection_count, detection_no_wear_total_count,
            false_detection_rate))
        logger.info("missed_detection_rate: {} / {} = {}".format(
            missed_detection_count, gold_no_wear_total_count,
            missed_detection_rate))
        logger.info("object_detection_correct_rate: {} / {} = {}".format(
            object_detection_correct_count, object_detection_total_count,
            object_detection_correct_rate))
        sgcc_helmet_image_score = 1 - (
            false_detection_weight * false_detection_rate +
            missed_detection_weight * missed_detection_rate +
            object_detection_weight * (1 - object_detection_correct_rate))
        logger.info("evaluation for {} and {}\n".format(
            predicted_file_json_path, gold_json_file_path))
        ap_table = [[
            "false detection rate", "missed detection rate",
            "object detection correct rate", "sgcc helmet image score"
        ]]
        ap_table += [[
            false_detection_rate, missed_detection_rate,
            object_detection_correct_rate, sgcc_helmet_image_score
        ]]
        logger.info("\n{}\n".format(AsciiTable(ap_table).table))
        return float('{:.8f}'.format(sgcc_helmet_image_score)), "评测成功"  # "evaluation succeeded"
    except AssertionError:
        # AssertionError must be caught before the generic handler below;
        # in the original order this branch was unreachable
        _, _, tb = sys.exc_info()
        traceback.print_tb(tb)
        tb_info = traceback.extract_tb(tb)
        filename, line, func, text = tb_info[-1]
        logger.info('an error occurred on line {} in statement {}'.format(
            line, text))
        return -1, "格式错误"  # "format error"
    except Exception:
        return -1, "格式错误"  # "format error"
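# Worked example of the final score formula with illustrative rates and
# weights (all numbers hypothetical):
w_false, w_missed, w_object = 0.3, 0.4, 0.3
false_rate, missed_rate, correct_rate = 0.10, 0.20, 0.90
score = 1 - (w_false * false_rate + w_missed * missed_rate +
             w_object * (1 - correct_rate))
print(round(score, 2))  # 1 - (0.03 + 0.08 + 0.03) = 0.86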