def save(self, file_path):
    '''
    Save the vocab mappings to a pickle file.
    :param file_path: destination path for the pickled mappings
    :return:
    '''
    mappings = {"word2idx": self.word2idx, 'idx2word': self.idx2word}
    save_pickle(data=mappings, file_path=file_path)
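# Hedged usage sketch (not part of the original source): round-trips the vocab
# mappings written by save() above. It assumes a `load_pickle` helper that
# mirrors `save_pickle` and returns the pickled object; the path below is a
# hypothetical placeholder.
def _demo_reload_vocab(file_path="output/vocab_mappings.pkl"):
    mappings = load_pickle(file_path)
    word2idx, idx2word = mappings["word2idx"], mappings["idx2word"]
    return word2idx, idx2word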
def gen_candidate_database():
    from imagefeature import ImageFeature

    print("Candidate Matching Database Generation Start")
    common.prepare_clean_dir(Path("temp/"))
    IF = ImageFeature()

    # Extract features for every query image.
    query_features, query_pathes, orbs = [], [], []
    for img_file in sorted(Path("input/query/").glob("*")):
        query_pathes.append(img_file)
        x = IF.get_feature(img_file)
        query_features.append(x)
        print("Extracting Query Feature", img_file)

    # Extract features for every target image, grouped by class folder.
    target_class_names, target_features, target_pathes = [], [], []
    for folder in sorted(Path("input/target/").glob("*")):
        class_name = folder.stem
        for img_file in sorted(Path("input/target/%s/" % class_name).glob("*")):
            target_pathes.append(img_file)
            target_class_names.append(class_name)
            feature = IF.get_feature(img_file)
            target_features.append(feature)
            print("Extracting Target Feature", img_file)

    print("Calculating Similarities...")
    sims = cosine_similarity(query_features, target_features)

    # For each query, keep the best-scoring targets per class,
    # capped at setting.MAX_NUMBER_ONE_CLASS entries.
    candidate_matching_database = {}
    for query_index, row in enumerate(sims):
        query_file = query_pathes[query_index]
        candidate_matching_database[query_file] = {}
        args = np.argsort(row)
        args = args[::-1]
        for arg in args:
            target_path = target_pathes[arg]
            target_class_name = target_class_names[arg]
            if target_class_name not in candidate_matching_database[query_file]:
                candidate_matching_database[query_file][target_class_name] = []
            if len(candidate_matching_database[query_file][target_class_name]) < setting.MAX_NUMBER_ONE_CLASS:
                candidate_matching_database[query_file][target_class_name].append((target_path, row[arg]))

    common.save_pickle(Path("temp/candidate_matching_database.pickle"), candidate_matching_database)
    print("Candidate Matching Database Generation Finish")
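# Hedged sketch (illustrative, not from the original source): reads the pickle
# written by gen_candidate_database() and prints the best match per class for
# each query. Assumes `common.load_pickle(path)` is the counterpart of
# `common.save_pickle`.
def _demo_read_candidate_database():
    candidates = common.load_pickle(Path("temp/candidate_matching_database.pickle"))
    for query_file, per_class in candidates.items():
        for class_name, matches in per_class.items():
            # `matches` holds (target_path, cosine_similarity) tuples,
            # best first, capped at setting.MAX_NUMBER_ONE_CLASS.
            best_path, best_score = matches[0]
            print(query_file, class_name, best_path, best_score)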
def analyze(n_preview=10):
    global vectorizer, km

    # Encode:
    logger.info('Encoding...')
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=common.n_features,
                                 min_df=2, stop_words='english')
    common.X = vectorizer.fit_transform(common.doc_texts)
    common.save_pickle(vectorizer, 'vectorizer.pickle')
    common.vocab = np.array(vectorizer.get_feature_names())
    logger.info(f'X: {common.X.shape}')
    common.save_encoded_vocab()

    logger.info('Clustering...')
    # km = MiniBatchKMeans(n_clusters=common.n_topics, init=init_centroids(), init_size=1000, batch_size=1000,
    #                      verbose=0, random_state=common.random_seed)
    # km = MiniBatchKMeans(n_clusters=common.n_topics, verbose=1, random_state=1)
    km = KMeans(n_clusters=common.n_topics, init=init_centroids(), max_iter=3, verbose=1, random_state=2)

    # Analyze:
    common.doc_topics = km.fit_transform(common.X)  # the smaller, the closer
    common.doc_topics_reduced = np.argmin(common.doc_topics, axis=1)
    common.topics = km.cluster_centers_
    common.save_pickle(km, 'km.pickle')
    logger.info(f'doc_topics: {common.doc_topics.shape}')
    logger.info(f'topics: {common.topics.shape}')

    print()
    print('----------------')
    for i, topic_dist in enumerate(common.topics):
        top_words = common.vocab[np.argsort(topic_dist)[-10:][::-1]]
        print(f"Topic {i}: {' '.join(top_words)}")
    print()
    print('----------------')
    for i in range(n_preview):
        print(f'Article {i} (topic: {common.doc_topics_reduced[i]}), {common.doc_titles[i]}')
    print()

    common.save_analyze_result()
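# Hedged sketch (not part of the original source): assigns a topic to an unseen
# document using the artifacts saved by analyze(). Assumes `common.load_pickle`
# mirrors `common.save_pickle` and accepts the same relative filenames.
def _demo_assign_topic(text):
    vec = common.load_pickle('vectorizer.pickle')
    km_model = common.load_pickle('km.pickle')
    x_new = vec.transform([text])
    # km.transform returns distances to each cluster centre; the smaller, the closer.
    return int(np.argmin(km_model.transform(x_new), axis=1)[0])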
def load_and_cache_examples(args, processor, data_type='train'):
    # Load data features from cache or dataset file
    cached_examples_file = args.data_dir / 'cached_crf-{}_{}_{}'.format(
        data_type,
        args.arch,  # architecture
        str(args.task_name))
    if cached_examples_file.exists():
        logger.info("Loading features from cached file %s", cached_examples_file)
        examples = load_pickle(cached_examples_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if data_type == 'train':
            examples = processor.get_aug_examples(args.data_dir / 'train_train.bin', args.aug_num, data_type)
        elif data_type == 'dev':
            examples = processor.get_aug_examples(args.data_dir / 'train_dev.bin', args.aug_num, data_type)
        logger.info("Saving features into cached file %s", cached_examples_file)
        save_pickle(examples, str(cached_examples_file))
    return examples
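# Hedged usage sketch (illustrative only): assumes `args` exposes `data_dir`
# (a pathlib.Path), `arch`, `task_name`, and `aug_num`, and that `processor`
# implements get_aug_examples() as referenced above.
def _demo_load_examples(args, processor):
    train_examples = load_and_cache_examples(args, processor, data_type='train')
    dev_examples = load_and_cache_examples(args, processor, data_type='dev')
    return train_examples, dev_examples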
def gen_bgpdump_pickle(infile, outfile, ipv6=False):
    """ Read Cisco show ip bgp output captured in an infile and
    generate outfile (pickle that contains the list of tuples that
    parse_cisco_bgp_file returns).

    :param str infile: Input filename (preferably full path to the BGP text file)
    :param str outfile: Output filename
    :param bool ipv6: IPv6 indicator (needed for prefix normalization)
    :returns: The parsed Cisco BGP output, either from the pickle cache or from the primary source
    """
    if os.path.isfile(outfile):
        return common.load_pickle(outfile)

    o = list(parse_cisco_bgp_file(infile, ipv6))
    common.save_pickle(o, outfile)
    return o
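# Hedged usage sketch (not part of the original source): the file names are
# hypothetical placeholders. The first call parses the capture and writes the
# pickle; later calls with the same outfile return the cached result.
def _demo_gen_bgpdump_pickle():
    routes_v4 = gen_bgpdump_pickle("captures/show_ip_bgp.txt", "cache/bgp_v4.pickle")
    routes_v6 = gen_bgpdump_pickle("captures/show_bgp_ipv6.txt", "cache/bgp_v6.pickle", ipv6=True)
    return routes_v4, routes_v6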
def get_categories(self, save_path="", cache=True):
    """
    Find categories and return a dictionary with each category's information
    """
    if (cache is True) and (self.categories is not None):
        logging.info("cache categories dictionary...")
        return self.categories

    url = self.url["category"]
    page = common.get_page(url)
    common.sleep_random_between(1, 2)

    cat_container = page.find_all("div", class_="categories__container")
    cat = {}
    if len(cat_container) == 0:
        logging.info("category container is empty, returning empty dictionary...")
        return cat

    for c in cat_container:
        name = c.h2.text
        link = c.h2.a["href"]
        cat_id = re.findall(self.patt["cat_id"], link)
        if len(cat_id) == 0:
            logging.info("could not find category id, passing...")
            continue
        cat_id = cat_id[0]
        sub = self.get_sub_cats(link)
        cat[cat_id] = {"name": name, "link": link, "sub": sub}

    if len(cat) != 0 and save_path != "":
        common.save_pickle(save_path, cat)
    self.categories = cat
    return cat
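# Hedged usage sketch (illustrative only): assumes `scraper` is an instance of
# the class defining get_categories(); the save path is a hypothetical
# placeholder. With cache=True a previously fetched dictionary is reused.
def _demo_get_categories(scraper):
    categories = scraper.get_categories(save_path="data/categories.pickle")
    # Each entry: cat_id -> {"name": ..., "link": ..., "sub": ...}
    return categories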
def main(unused_argv):
    pp = pprint.PrettyPrinter(indent=2, compact=True)

    # Load training and eval data
    (train_x, train_y), (test_x, test_y) = common.load_original_mnist()
    print(train_x.shape, train_x.dtype, train_y.shape, train_y.dtype)
    print(test_x.shape, test_x.dtype, test_y.shape, test_y.dtype)

    def train_model(classifier, log_stats=True):
        start_time = time.time()
        # Train the model
        # profiler_hook = tf.train.ProfilerHook(save_steps=50, output_dir=MODEL_DIR + '/train')
        train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": train_x},
            y=train_y,
            batch_size=TRAINING_BATCH_SIZE,
            num_epochs=None,
            shuffle=True)
        classifier.train(
            input_fn=train_input_fn,
            steps=TRAINING_STEPS,
            # hooks=[profiler_hook]
        )
        duration = round(time.time() - start_time, 3)
        if log_stats:
            print("Training duration: " + common.duration_to_string(duration))
        return duration

    def eval_model(classifier, log_stats=True):
        start_time = time.time()
        tensors_to_log = {
            # "probabilities": "softmax_tensor",
            "pred": "diff"
        }
        logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=1)
        eval_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": test_x},
            y=test_y,
            batch_size=EVAL_BATCH_SIZE,
            shuffle=False)
        result = classifier.evaluate(
            input_fn=eval_input_fn,
            steps=EVAL_STEPS,
            # hooks=[logging_hook]
        )
        duration = round(time.time() - start_time, 3)
        if log_stats:
            print("Eval duration: " + common.duration_to_string(duration))
            print("Eval result:", result)
        return result, duration

    model_stats_map = {}
    for conf_name, config in model_configs.items():
        # if config["skip"]:
        #     continue
        print("RUN CONFIG: %s" % conf_name)
        model_dir = os.path.join(MODEL_DIR, conf_name)
        # common.clean_dir(model_dir)
        mnist_classifier = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir, params=config)

        eval_results = []
        total_train_duration = 0
        total_eval_duration = 0
        for _ in range(TRAINING_EPOCHS):
            # train_duration = train_model(mnist_classifier)
            # total_train_duration += train_duration
            eval_result, eval_duration = eval_model(mnist_classifier)
            eval_results.append(eval_result)
            total_eval_duration += eval_duration

        final_result = common.get_final_eval_result(eval_results)
        print("Eval results:")
        pp.pprint(eval_results)

        model_stats_map[conf_name] = {
            "model_details": model_details,
            "final_result": final_result,
            "total_train_duration": common.duration_to_string(total_train_duration),
            "total_eval_duration": common.duration_to_string(total_eval_duration),
        }
        common.save_pickle(
            model_stats_map[conf_name],
            os.path.join(model_details["model_dir"], "last_result.pkl"))
        common.save_json(
            model_stats_map[conf_name],
            os.path.join(model_details["model_dir"], "last_result.json"))

        print("Total training duration: " + common.duration_to_string(total_train_duration))
        print("Total eval duration: " + common.duration_to_string(total_eval_duration))

    print("Models results:")
    pp.pprint(model_stats_map)
def save_records(X, X_filename):
    common.save_pickle(X_filename, X)