def evaluate(config: Config) -> List[str]:
    data = load_pickle(file_path=config.data_path)
    results = _load_results(result_paths=config.result_paths)

    print('Assemble explanations, model weights, ...!')
    scores = _generate_empty_evaluation_results_dict()
    scores[A.global_based][A.method_names] = _get_method_names(
        results=results, sample_based=False)
    scores[A.sample_based][A.method_names] = _get_method_names(
        results=results, sample_based=True)
    scores[A.data_weights] = _get_data_weights(
        result=load_pickle(file_path=config.result_paths[0]))
    scores[A.model_accuracies] = _assemble_model_accuracies(results=results,
                                                            data=data)
    scores[A.model_weights] = _assemble_model_weights(
        results=results, weights=scores[A.data_weights])
    g, s = _assemble_explanations2(results=results, scores=scores)
    scores[A.global_based][A.explanations] = g
    scores[A.sample_based][A.explanations] = s

    print('Calculate scores!')
    pattern_type = int(extract_pattern_type(data_path=config.data_path))
    scores[A.global_based]['roc_auc'] = _assemble_results_roc_analysis(
        explanations=scores[A.global_based][A.explanations],
        weights=scores[A.data_weights],
        pattern_type=pattern_type)
    scores[A.global_based][
        'precision_based_scores'] = _assemble_results_precision_analysis(
            explanations=scores[A.global_based][A.explanations],
            weights=scores[A.data_weights],
            pattern_type=pattern_type)
    scores[A.sample_based]['roc_auc'] = _assemble_results_roc_analysis(
        explanations=scores[A.sample_based][A.explanations],
        weights=scores[A.data_weights],
        pattern_type=pattern_type)
    scores[A.sample_based][
        'precision_based_scores'] = _assemble_results_precision_analysis(
            explanations=scores[A.sample_based][A.explanations],
            weights=scores[A.data_weights],
            pattern_type=pattern_type)

    print('Save results!')
    output_paths = list()
    date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    pattern_type = f'pattern_type_{extract_pattern_type(data_path=config.data_path)}'
    suffix = '_'.join(['evaluation', date, pattern_type])
    output_paths += [
        to_pickle(output_dir=config.output_dir_scores,
                  data=scores,
                  suffix=suffix)
    ]
    return output_paths
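A hedged usage sketch of the evaluation pipeline above. The Config field names are read off the function body; the paths are hypothetical, and the project may build the config via Config.get (as the experiment runner below does) rather than constructing it directly:

config = Config(
    data_path='../data/data_vary_signal_exact_pattern_type_6.pkl',  # hypothetical path
    result_paths=['../results/results_agnostic_sample_based.pkl'],  # hypothetical path
    output_dir_scores='../scores')
score_paths = evaluate(config=config)
# -> ['../scores/evaluation_<date>_pattern_type_6.pkl']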
def main():
    fpath = '../data/data_vary_signal_exact_2021-04-29-14-44-03_pattern_type_6.pkl'
    data = load_pickle(file_path=fpath)
    idx_experiment = 22
    idx_sample = 107
    for weight, data_list in data.items():
        print(weight)
        d = data_list[idx_experiment]
        sample = data_list[idx_experiment]['val']['x'][idx_sample, :]
        model = LogisticRegression(penalty='none',
                                   fit_intercept=False,
                                   max_iter=10,
                                   random_state=123)
        model.fit(X=d['train']['x'], y=d['train']['y'].flatten())
        pred_train = model.predict(d['train']['x'])
        pred_val = model.predict(d['val']['x'])
        print(
            f"Accuracy train: {accuracy_score(y_true=d['train']['y'].flatten(), y_pred=pred_train)}"
        )
        print(
            f"Accuracy val: {accuracy_score(y_true=d['val']['y'].flatten(), y_pred=pred_val)}"
        )
        sns.heatmap(sample.reshape((8, 8)), center=0.0)
        plt.show()
        label = data_list[idx_experiment]['val']['y'][idx_sample]
        print(
            f'Weight: {weight} Prediction: {model.predict(sample.reshape((1, 64)))} Label: {label}'
        )
def main(input_path: str) -> None:
    config = Config.get(input_conf=load_json_file(file_path=input_path))
    data = load_pickle(file_path=config.data_path)
    np.random.seed(seed=config.seed)
    results = generate_empty_results_dict()
    results['method_names'] = config.method_names
    print(f'Input: {asdict(config)}')

    print('Run experiments!')
    for weights, data_list in data.items():
        results_per_weight = list()
        print(f'Run experiments for weights: {weights}')
        # Do not reuse the name `data` for the loop variable: it would shadow
        # the dict being iterated in the outer loop.
        for experiment_data in tqdm(data_list):
            results_per_weight += [
                main_experiment(data=experiment_data, config=config)
            ]
        results['results'][weights] = results_per_weight
        date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        # No leading underscore on the weights part: the join already
        # inserts the separator (the original produced a double underscore).
        suffix = '_'.join(['results_agnostic_sample_based', date, f'{weights}'])
        to_pickle(output_dir=config.output_dir,
                  data=results_per_weight,
                  suffix=suffix)

    date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    pattern_type = f'pattern_type_{extract_pattern_type(data_path=config.data_path)}'
    suffix = '_'.join(['results_agnostic_sample_based', date, pattern_type])
    to_pickle(output_dir=config.output_dir, data=results, suffix=suffix)
def get_aug_examples(self, distance_path, aug_num, mode):
    new_examples = []
    old_examples = []
    old_train = self.get_train_examples()
    if mode == 'train':
        old_examples = old_train
    elif mode == 'dev':
        old_examples = self.get_dev_examples()
    elif mode == 'test':
        old_examples = self.get_test_examples()
    examples_train = load_pickle(distance_path)
    for i, ele in enumerate(old_examples):
        cur_train = {}
        cur_train['ori_sentence'] = ele
        cur_train['aux_sentences'] = []
        sort_list = examples_train[i]
        sort_id = 0
        sort_id_list = []
        while len(cur_train['aux_sentences']) < aug_num:
            sort_sentence_id = sort_list[sort_id]
            # Only accept the neighbour if it is a different sentence.
            if old_train[sort_sentence_id]['id'] != old_examples[i]['id']:
                cur_train['aux_sentences'].append(
                    old_train[sort_sentence_id].copy())
                sort_id_list.append(sort_list[sort_id])
            sort_id += 1
            # Only an error if candidates ran out while we still need more;
            # the original raised even when the last candidate completed the set.
            if sort_id >= len(sort_list) and \
                    len(cur_train['aux_sentences']) < aug_num:
                raise ValueError('Need more sentence ids!')
        new_examples.append(cur_train)
    return new_examples
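A hedged usage sketch of get_aug_examples. The processor class name and paths are assumptions; the method only relies on get_train_examples()/get_dev_examples()/get_test_examples() and an 'id' field per example:

processor = CluenerProcessor(data_dir='../data')  # hypothetical processor class
augmented = processor.get_aug_examples(
    distance_path='../data/train_train.bin',  # pickled neighbour-id lists, one per sentence
    aug_num=5,
    mode='train')
print(augmented[0]['ori_sentence'])        # the original example
print(len(augmented[0]['aux_sentences']))  # 5 distinct nearest-neighbour sentences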
def module_process(ianadir, host, days, ipv6=False, bestonly=False):
    """ Match BGP prefixes in IANA's directory and generate text outputs and
    stats that determine average active prefix counts and average
    de-aggregation for each RIR.

    :param IanaDirectory ianadir: IanaDirectory instance to match against
    :param str host: Host to take BGP feeds from
    :param days: List of days to analyze
    :param bool ipv6: IPv6 flag
    :param bool bestonly: Take only best BGP paths into account
    """
    timeline = []
    timelineavg = []
    for t in days:
        rirpfxlens = {}
        ifn = bgp.bgpdump_pickle(t, host, ipv6)
        if not ifn:
            continue
        bgpdump = common.load_pickle(ifn)
        common.d("ianaspace.module_run: matching prefixes in a tree (%d)" %
                 len(bgpdump))
        for pv in bgpdump:
            if bestonly and not (pv[0] and '>' in pv[0]):
                continue
            net = ipaddr.IPNetwork(pv[1])
            r = ianadir.resolve_network(net)
            if not r:
                common.w("No IANA assignment for", str(pv[1]))
                continue
            name = r[2]
            if r[1] == 'LEGACY' and not name in RIRS:
                name = 'LEGACY'
            if not name in rirpfxlens:
                rirpfxlens[name] = []
            rirpfxlens[name].append(net.prefixlen)

        timeline.append([str(t)] + [len(rirpfxlens[n]) for n in RIRS])
        timelineavg.append([str(t)] + [
            (reduce(lambda x, y: x + y, rirpfxlens[n]) /
             float(len(rirpfxlens[n]))) for n in RIRS])

        outtxt = '%s/rirstats%d-%s.txt' % (common.resultdir(t),
                                           (6 if ipv6 else 4), host)
        common.d("Generating output RIR stats text " + outtxt)
        with open(outtxt, 'w') as f:
            for i, k in enumerate(RIRS):
                f.write('%s: %d (avg pfxlen: %.2f)\n' %
                        (str(k), timeline[-1][1 + i],
                         round(timelineavg[-1][1 + i], 2)))

    if timeline:
        outgraph = '%s/rirpfxcount%d-%s' % (common.resultdir(),
                                            (6 if ipv6 else 4), host)
        common.d("Generating output RIR pfxcount graph with prefix " + outgraph)
        graph.gen_multilineplot(timeline, outgraph, legend=RIRS,
                                ylabel='Pfx count')
    if timelineavg:
        outgraph = '%s/rirpfxlen%d-%s' % (common.resultdir(),
                                          (6 if ipv6 else 4), host)
        common.d("Generating output RIR pfxlen graph with prefix " + outgraph)
        graph.gen_multilineplot(timelineavg, outgraph, legend=RIRS,
                                ylabel='Avg pfx len')
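The timelineavg rows above hold the per-RIR mean prefix length; the reduce expression is just an arithmetic mean. A minimal standalone sketch of the same computation, with hypothetical buckets:

from functools import reduce  # a builtin in the Python 2 code above

rirpfxlens = {'ripencc': [16, 19, 24, 24], 'arin': [8, 22]}  # hypothetical data
for rir, lens in rirpfxlens.items():
    avg = reduce(lambda x, y: x + y, lens) / float(len(lens))  # == sum(lens) / len(lens)
    print('%s: %d prefixes, avg pfxlen %.2f' % (rir, len(lens), avg))
# ripencc: 4 prefixes, avg pfxlen 20.75
# arin: 2 prefixes, avg pfxlen 15.00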
def plot(config: Config, score_paths: List[str]) -> None:
    rnd_state = np.random.default_rng(config.seed)
    idx = rnd_state.integers(low=0, high=100)
    rnd_sample_idx = 107
    scores = load_pickle(file_path=score_paths[0])
    data = load_pickle(file_path=config.data_path)
    data_dict = get_randomized_heat_map_data(scores=scores,
                                             data=data,
                                             rnd_idx=idx)

    print('Create plots!')
    overview_correlation_plot(scores=scores, config=config)
    overall_accuracy_plot(scores=scores, config=config)

    print('Create rain cloud plots!')
    rain_clouds(scores=scores[A.sample_based],
                config=config,
                mode='sample_based',
                score_data_keys=[('roc_auc', 'auc'),
                                 # ('precision_based_scores', 'pr_auc'),
                                 ('precision_based_scores', 'max_precision'),
                                 ('precision_based_scores', 'avg_precision')])
    rain_clouds(scores=scores[A.global_based],
                config=config,
                mode='global',
                score_data_keys=[('roc_auc', 'auc'),
                                 # ('precision_based_scores', 'pr_auc'),
                                 ('precision_based_scores', 'max_precision'),
                                 ('precision_based_scores', 'avg_precision')])

    print('Create box plots!')
    box_plot(scores=scores[A.sample_based],
             config=config,
             mode='sample_based',
             snrs_of_interest=['0.00', '0.04', '0.08'],
             score_data_keys=[('roc_auc', 'auc'),
                              # ('precision_based_scores', 'pr_auc'),
                              ('precision_based_scores', 'max_precision'),
                              # ('precision_based_scores', 'avg_precision')
                              ])
    box_plot(scores=scores[A.global_based],
             config=config,
             mode='global',
             snrs_of_interest=['0.00', '0.04', '0.08'],
             score_data_keys=[('roc_auc', 'auc'),
                              # ('precision_based_scores', 'pr_auc'),
                              ('precision_based_scores', 'max_precision'),
                              # ('precision_based_scores', 'avg_precision')
                              ])

    print('Create heat maps!')
    pattern_type = int(extract_pattern_type(data_path=config.data_path))
    global_heat_maps(scores=scores,
                     config=config,
                     rnd_experiment_idx=idx,
                     pattern_type=pattern_type,
                     snrs_of_interest=['0.00', '0.04', '0.08'])
    sample_based_heat_maps(scores=scores,
                           config=config,
                           data=data_dict,
                           rnd_sample_idx=rnd_sample_idx,
                           pattern_type=pattern_type,
                           snrs_of_interest=['0.00', '0.04', '0.08'])
def load_from_file(self, file_path):
    '''
    Load the vocab mappings from a file.

    :param file_path: path to the pickled mappings
    :return:
    '''
    mappings = load_pickle(input_file=file_path)
    self.idx2word = mappings['idx2word']
    self.word2idx = mappings['word2idx']
def main():
    # fpath = '../data/data_vary_signal_exact_2021-01-18-16-07-37.pkl'
    fpath = '../data/data_vary_signal_exact_2021-02-23-12-45-08.pkl'
    # fpath = '../data/data_vary_signal_exact_2021-02-01-11-36-15.pkl'
    data = load_pickle(file_path=fpath)
    idx_experiment = 22
    new_data = dict()
    for weight, data_list in data.items():
        new_data['w' + 'd'.join(weight.split('.'))] = data_list[idx_experiment]
    savemat(file_name='data_vary_signal_exact_2021-02-23-12-45-08.mat',
            mdict=new_data)
def load_and_cache_examples(args, processor, data_type='train'):
    # Load data features from cache or dataset file
    cached_examples_file = args.data_dir / 'cached_crf-{}_{}_{}'.format(
        data_type,
        args.arch,  # model architecture
        str(args.task_name))
    if cached_examples_file.exists():
        logger.info("Loading features from cached file %s",
                    cached_examples_file)
        examples = load_pickle(cached_examples_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if data_type == 'train':
            examples = processor.get_aug_examples(
                args.data_dir / 'train_train.bin', args.aug_num, data_type)
        elif data_type == 'dev':
            examples = processor.get_aug_examples(
                args.data_dir / 'train_dev.bin', args.aug_num, data_type)
        logger.info("Saving features into cached file %s",
                    cached_examples_file)
        save_pickle(examples, str(cached_examples_file))
    return examples
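A hedged usage sketch of the caching behaviour (the args fields are inferred from the function body; every value here is hypothetical):

from argparse import Namespace
from pathlib import Path

args = Namespace(data_dir=Path('../data'), arch='bert_crf',
                 task_name='cluener', aug_num=5)
# First call parses train_train.bin and writes cached_crf-train_bert_crf_cluener;
# later calls load that pickle directly.
train_examples = load_and_cache_examples(args, processor, data_type='train')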
def gen_bgpdump_pickle(infile, outfile, ipv6=False):
    """ Read Cisco show ip bgp output captured in infile and generate outfile
    (pickle that contains the list of tuples that parse_cisco_bgp_file
    returns).

    infile: in filename (preferably full path to the BGP text file)
    outfile: out filename
    ipv6: IPv6 indicator (needed for prefix normalization)
    """
    if os.path.isfile(outfile):
        return common.load_pickle(outfile)

    o = list(parse_cisco_bgp_file(infile, ipv6))
    common.save_pickle(o, outfile)
    return o
def create_path_matrix(host, days, ipv6=False):
    """ Generate matrix: [t:buckets,...] where buckets (r) contains
    r[16]=[x,y,z,...]; x, y, z are ints. It means that there were prefixes
    with netmask /16, one with AS-path length x, another y, ...
    """
    bucket_matrix = {}
    for t in days:
        bgpfile = bgpdump_pickle(t, host, ipv6)
        if not bgpfile:
            common.d("bgp.create_path_matrix skipping time " + str(t) + "...")
            continue
        common.d("bgp.create_path_matrix processing time " + str(t) + "...")
        bgpdump = common.load_pickle(bgpfile)
        bucket_matrix[t] = gen_buckets(bgpdump, ipv6, bestonly=True)
    return bucket_matrix
def gen_bgpdump_pickle(infile, outfile, ipv6=False):
    """ Read Cisco show ip bgp output captured in infile and generate outfile
    (pickle that contains the list of tuples that parse_cisco_bgp_file
    returns).

    :param str infile: Input filename (preferably full path to the BGP text file)
    :param str outfile: Output filename
    :param bool ipv6: IPv6 indicator (needed for prefix normalization)
    :returns: The parsed Cisco BGP output, either from the pickle or from the primary source
    """
    if os.path.isfile(outfile):
        return common.load_pickle(outfile)

    o = list(parse_cisco_bgp_file(infile, ipv6))
    common.save_pickle(o, outfile)
    return o
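A hedged usage sketch of the memoize-to-disk pattern above (file names hypothetical):

# First call parses the raw capture and writes the pickle; subsequent calls
# short-circuit on os.path.isfile and return the cached parse unchanged.
dump = gen_bgpdump_pickle('bgp/show-ip-bgp.txt', 'bgp/show-ip-bgp.pickle')
dump_again = gen_bgpdump_pickle('bgp/show-ip-bgp.txt', 'bgp/show-ip-bgp.pickle')
assert dump == dump_again  # second call came from the pickle

Note that the cache is keyed only on the outfile's existence, so a stale pickle wins even if infile has changed since it was written.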
def match():
    import deepmatching_wrapper as dm
    import cv2
    import numpy as np

    candidate_matching_database = common.load_pickle(
        Path("temp/candidate_matching_database.pickle"))

    common.prepare_clean_dir(Path("output/"))
    common.prepare_clean_dir(Path("output/images/"))

    output = {}
    for query_file, candidates in candidate_matching_database.items():
        query_name = Path(query_file).stem
        matching_result = []
        for target_class_name, target_images in candidates.items():
            for i, (target_path, similarity) in enumerate(target_images):
                print("Matching", query_file, "with target image", target_path)
                matches, name1, name2, qw, qh, tw, th, img1, img2 = dm.match(
                    query_file, target_path)

                src_pts = np.float32([[m[0], m[1]] for m in matches])
                dst_pts = np.float32([[m[2], m[3]] for m in matches])

                # Keep only RANSAC inliers. The original reused `i` as the
                # inlier counter, which clobbered the enumerate index used in
                # the output file name; collect the inliers instead.
                inlier = []
                M, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC,
                                             setting.RANSAC_THRESHOLD)
                for index, m in enumerate(mask):
                    if np.isclose(m, 1):
                        inlier.append(matches[index])

                output_name = "%s_%s_%02d.jpg" % (query_name,
                                                  target_class_name, i)
                dm.draw(img1, img2, inlier, Path("output/images/") / output_name)

                matching_result.append({
                    "class_name": target_class_name,
                    "inlier": len(inlier)
                })

        # Path(...) makes .name safe whether query_file is a str or a Path.
        output[Path(query_file).name] = sorted(matching_result,
                                               key=lambda x: x["inlier"],
                                               reverse=True)

    common.write_json(Path("output/result.json"), output)
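The RANSAC step above discards matches that disagree with the estimated homography. A minimal self-contained sketch of that filtering with synthetic points (the 5.0 reprojection threshold stands in for setting.RANSAC_THRESHOLD):

import numpy as np
import cv2

src_pts = np.float32([[0, 0], [100, 0], [100, 100], [0, 100], [50, 50]])
dst_pts = np.float32([[10, 10], [110, 10], [110, 110], [10, 110], [90, 20]])  # last pair is an outlier

M, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
inlier_idx = [i for i, m in enumerate(mask.ravel()) if m == 1]
print(inlier_idx)  # [0, 1, 2, 3] -- the translated corners; the outlier is masked out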
def overview_rain_cloud_plot(paths: List[str], config: Config,
                             score_data_key: str, metric_name: str):
    df = pd.DataFrame()
    for score_path in paths:
        scores = load_pickle(file_path=score_path)
        aux_df = create_rain_cloud_data(data=scores[score_data_key],
                                        metric_name=metric_name)
        aux_df = add_column_for_class_of_explanation_method(data=aux_df)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent way to accumulate the frames.
        df = pd.concat([df, aux_df])

    sigma = .5
    sns.set_theme('paper')
    sns.set(font_scale=1)
    with sns.axes_style("whitegrid"):
        g = sns.FacetGrid(df, row='class', col='SNR', height=6, ylim=(0, 1.05))
        g.map_dataframe(pt.RainCloud,
                        x='Method',
                        y=metric_name,
                        data=df,
                        orient='v',
                        bw=sigma,
                        width_viol=.0)
        for ax in g.axes.flat:
            labels = ax.get_xticklabels()
            ax.set_xticklabels(labels, rotation=20)
        g.fig.subplots_adjust(bottom=0.15)

    # Append the extension directly; joining '.png' as a list element would
    # leave a stray underscore before the suffix.
    file_name = '_'.join(['rain_cloud_plot', 'overview', metric_name]) + '.png'
    output_path = join(config.output_dir_plots, file_name)
    save_figure(file_path=output_path, fig=g.fig, dpi=config.dpi)
def create_path_matrix(host, days, ipv6=False):
    """ Generate matrix: [t:buckets,...] where buckets (r) contains
    r[16]=[x,y,z,...]; x, y, z are ints. It means that there were prefixes
    with netmask /16, one with AS-path length x, another y, ...

    :param str host: Host name to analyze
    :param days: List of Day obj. to analyze
    :param bool ipv6: IPv6 flag
    :returns: Bucket matrix
    """
    bucket_matrix = {}
    for t in days:
        bgpfile = bgpdump_pickle(t, host, ipv6)
        if not bgpfile:
            common.d("bgp.create_path_matrix skipping time " + str(t) + "...")
            continue
        common.d("bgp.create_path_matrix processing time " + str(t) + "...")
        bgpdump = common.load_pickle(bgpfile)
        bucket_matrix[t] = gen_buckets(bgpdump, ipv6, bestonly=True)
    return bucket_matrix
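A hedged sketch of consuming the returned structure (host and days are hypothetical; this assumes gen_buckets returns a per-netmask mapping as the docstring describes):

matrix = create_path_matrix('rrc00', days)  # days: list of Day objects
for t, buckets in matrix.items():
    pathlens = buckets[16]  # one AS-path length per /16 prefix seen at time t
    if pathlens:
        print(t, 'avg AS-path length for /16:',
              sum(pathlens) / float(len(pathlens)))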
def load_categories(self, fpath):
    logging.info("Loading categories")
    self.categories = common.load_pickle(fpath)
def load_x(filename):
    return common.load_pickle(filename)
def init():
    global vectorizer, km
    vectorizer = common.load_pickle('vectorizer.pickle')
    km = common.load_pickle('km.pickle')
    logger.info('Initialized')
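A hedged sketch of how the two unpickled objects are typically used together, assuming vectorizer is a fitted sklearn TfidfVectorizer and km a fitted KMeans (the pickle names suggest this, but the snippet does not confirm it):

init()
X = vectorizer.transform(['some document to categorize'])  # hypothetical input
print(km.predict(X))  # array with the cluster id for the document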
def predict(args, processor):
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    config = config_model(args)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, args.output_dir, config, logger)
        test_data = []
        with open(str(args.data_dir / "test.json"), 'r') as f:
            idx = 0
            for line in f:
                tokens = []
                json_d = {}
                line = json.loads(line.strip())
                textlist = list(line['text'])
                for i, word in enumerate(textlist):
                    token = tokenizer.tokenize(word)
                    assert len(token) == 1
                    tokens.extend(token)
                # Leave room for [CLS] and [SEP]; the original `< max_seq_len`
                # check could still overflow after the markers were added.
                assert len(tokens) <= args.max_seq_len - 2
                ntokens = []
                segment_ids = []
                label_ids = []
                ntokens.append("[CLS]")  # mark the sentence start with [CLS]
                segment_ids.append(0)
                for i, token in enumerate(tokens):
                    ntokens.append(token)
                    segment_ids.append(0)
                ntokens.append("[SEP]")
                segment_ids.append(0)
                # append("O") or append("[SEP]") not sure!
                input_ids = tokenizer.convert_tokens_to_ids(ntokens)
                input_len = len(input_ids)
                input_mask = [1] * len(input_ids)
                while len(input_ids) < args.max_seq_len:
                    input_ids.append(0)
                    input_mask.append(0)
                    segment_ids.append(0)
                raw_text = []
                raw_text.append('[CLS]')
                raw_text.extend(textlist)
                raw_text.append('[SEP]')
                assert len(raw_text) == len(ntokens)
                assert len(input_ids) == args.max_seq_len
                assert len(input_mask) == args.max_seq_len
                assert len(segment_ids) == args.max_seq_len
                json_d['id'] = idx
                json_d['input_ids'] = input_ids
                json_d['input_mask'] = input_mask
                json_d['segment_ids'] = segment_ids
                json_d['input_len'] = input_len
                json_d['text'] = raw_text
                idx += 1
                test_data.append(json_d)

        results = []
        train_data = processor.get_train_examples()
        test_train = load_pickle(args.data_dir / 'train_test.bin')
        for step, line in enumerate(test_data):
            a_input_ids = []
            a_input_mask = []
            a_label_ids = []
            a_input_lens = []
            a_segment_ids = []
            aux_sentence = [
                train_data[i] for i in test_train[step][:args.aug_num]
            ]
            for s in aux_sentence:
                a_input_ids.append(s['input_ids'])
                # a_label_ids.append(s['label_ids'])
                # Address augmentation: replace all label info with the
                # address label, i.e. all ones (the input_mask).
                a_label_ids.append(s['input_mask'])
                a_input_mask.append(s['input_mask'])
                a_input_lens.append(s['input_len'])
                a_segment_ids.append(s['segment_ids'])
            input_ids = line['input_ids']
            input_mask = line['input_mask']
            input_lens = line['input_len']
            segment_ids = line['segment_ids']
            batch = {
                'ori': ([input_ids], [input_mask], [[]], [input_lens],
                        [segment_ids]),
                'aug': ([a_input_ids], [a_input_mask], [a_label_ids],
                        [a_input_lens], [a_segment_ids])
            }
            tags = model.evaluate_line(sess, batch)
            label_entities = get_entities(tags[0], args.id2label)
            json_d = {}
            json_d['id'] = step
            tags[0] = [args.id2label[idx] for idx in tags[0]]
            json_d['tag_seq'] = " ".join(tags[0])
            json_d['entities'] = label_entities
            results.append(json_d)
        print(" ")

        output_predic_file = str(args.output_dir / "test_prediction.json")
        output_submit_file = str(args.output_dir / "cluener_submit.json")
        with open(output_predic_file, "w") as writer:
            for record in results:
                writer.write(json.dumps(record) + '\n')
        test_text = []
        test_submit = []
        for x, y in zip(test_data, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            # raw_text carries the added [CLS]/[SEP] markers, so the entity
            # offsets line up with the tagger output.
            words = x['text']
            if len(entities) != 0:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
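The tail of predict folds entity spans into the nested CLUENER submit format. A minimal standalone sketch of that conversion with synthetic spans (setdefault collapses the three-way if/else above):

words = ['[CLS]', '北', '京', '欢', '迎', '你', '[SEP]']  # [CLS]/[SEP] padding as in raw_text
entities = [('address', 1, 2)]  # (tag, start, end), end inclusive
label = {}
for tag, start, end in entities:
    word = ''.join(words[start:end + 1])  # '北京'
    label.setdefault(tag, {}).setdefault(word, []).append([start, end])
print(label)  # {'address': {'北京': [[1, 2]]}}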
def _load_results(result_paths: List[str]) -> List:
    output = list()
    for p in result_paths:
        output += [load_pickle(file_path=p)]
    return output