def main():
    # Load json config
    config = json.load(open("config.json"))
    extracted_features_root = config["extracted_features"]

    print("[+] Load features ...")
    X_test_num = utils.load_features(extracted_features_root, "X_test_num")
    X_test_cat = utils.load_features(extracted_features_root, "X_test_cat")
    X_test_desc = utils.load_features(extracted_features_root, "X_test_desc").any()
    X_test_title = utils.load_features(extracted_features_root, "X_test_title").any()
    # X_test_param = utils.load_features(extracted_features_root, "X_test_param").any()
    token_len = utils.load_features(extracted_features_root, "token_len")

    # X_test_text = [X_test_desc, X_test_title, X_test_param]
    X_test_text = [X_test_desc, X_test_title]

    n_folds = config["n_fold"]
    if n_folds:
        predict_fold(config, n_folds, X_test_num, X_test_cat, X_test_text, token_len)
    else:
        predict_one(config, X_test_num, X_test_cat, X_test_text, token_len)
def main(args):
    _, q, x = utils.load_benchmark(args.dataset, args.features)
    q = utils.load_features(q, chunks=(2500, 2048))
    x = utils.load_features(x, chunks=(2500, 2048))

    dim = q.shape[1]

    if args.random_rot is not None:
        rot = args.random_rot
        rot = os.path.join('features', 'random_ortho', f'rand_ortho_{dim}_{rot}.npy')
        rot = np.load(rot).astype(np.float32)
        q = q.dot(rot.T)
        x = x.dot(rot.T)

    # centering
    x_mean = x.mean(axis=0)
    q -= x_mean
    x -= x_mean

    out_dir = os.path.join('features', args.output)
    os.makedirs(out_dir, exist_ok=True)

    _, q_out, x_out = utils.load_benchmark(args.dataset, args.output)

    if not os.path.exists(q_out) or args.force:
        utils.save_as_hdf5(q, q_out, progress=True)

    if not os.path.exists(x_out) or args.force:
        utils.save_as_hdf5(x, x_out, progress=True)
def main():
    parameters = {}
    parameters['axes'] = parameter_indices_to_plot
    assert len(parameters['axes']) == 2, \
        'Plot_3D can only plot over 2 parameters.'
    assert all(0 <= p <= config.P - 1 for p in parameters['axes']), \
        'Provided parameters do not coincide with those in config.'
    parameters['sliders'] = list(set(range(config.P)) - set(parameters['axes']))

    # Store references to plots, otherwise the widgets become unresponsive due
    # to the garbage collector. https://stackoverflow.com/a/42884505
    plots = {}

    scaler = utils.load_scaler()
    for component in config.components:
        model_constructor = utils.models[model_key]
        model = model_constructor()
        model.load(utils.model_dir, component)

        # Initialize and load data structures
        features = {}
        targets = {}
        outputs = {}
        for dataset in ['train', 'test']:
            features[dataset] = utils.load_features(dataset)
            targets[dataset] = utils.load_targets(dataset, component)
            outputs[dataset] = None

        # Create the interactive 3D plot
        plots[component] = Plot_3D(component, outputs, targets, features,
                                   model, parameters, scaler)

    pyplot.show()
def initialize(self):
    print('Initialization in progress...!\n')
    start = time.time()
    yolo = YOLO(**{"model_path": self.model_path,
                   "anchors_path": self.anchors,
                   "classes_path": self.yolo_classes_path,
                   "score": self.confidence,
                   "gpu_num": self.gpu_num,
                   "model_image_size": (416, 416),
                   })

    # load pre-processed features database
    features, _, _ = load_features(self.recog_model)
    with open(self.classes_path, 'rb') as f:
        # img_input, input_labels = pickle.load(f)
        input_feats, input_labels = pickle.load(f)

    # load pre-trained recognition model
    model, preprocessed, input_shape = load_extractor_model(self.recog_model)
    my_preprocess = lambda x: preprocessed(pad_image(x, input_shape))

    # input_feat = extract_features(img_input, model, my_preprocess)
    sim_cutoff, (bins, cdf_list) = similarity_cutoff(input_feats, features, 0.95)

    print("Done...! It took {:.3f} mins\n".format((time.time() - start) / 60))

    self.model_preproc = (yolo, model, my_preprocess)
    self.params = (input_feats, sim_cutoff, bins, cdf_list, input_labels)
    return True
def main():
    for component in config.components:
        D, denom_sq = utils.load_POD_D_and_denom_sq(component)

        features = {}
        targets = {}
        for dataset in ['train', 'validate']:
            features[dataset] = utils.load_features(dataset)
            targets[dataset] = utils.load_targets(dataset, component)

        ## Wrapper for the training routine
        def train_wrapper(tune_config):
            model_constructor = utils.models[model_key]
            model = model_constructor()
            model.set_data(features, targets, D, denom_sq)
            model.train(tune_config)
            model.save(utils.model_dir, component)

        for model_key in models_to_be_trained:
            ## Train without a tuning config
            t0 = time.time()
            train_wrapper(None)
            dt = time.time() - t0
            print(F"Trained {model_key} for {component} in {dt:.4} s")

        for model_key in models_to_be_tuned:
            ## Tune using the defined tuning config
            analysis = tune.run(train_wrapper,
                                local_dir=join(utils.model_dir, model_key),
                                name=component,
                                config=tune_config[model_key],
                                stop={'time_total_s': 1800})
def initialize(filename):
    print('Initialization in progress...!\n')
    start = time.time()
    yolo = YOLO(**{
        "model_path": './model/keras_yolo3/model_data/yolo_weights_logos.h5',
        "anchors_path": './model/keras_yolo3/model_data/yolo_anchors.txt',
        "classes_path": './data/preprocessed/classes.txt',
        "score": 0.05,
        "gpu_num": 1,
        "model_image_size": (416, 416),
    })

    # get Inception/VGG16 model and flavor from filename
    model_name, flavor = model_flavor_from_name(filename)
    ## load pre-processed features database
    features, brand_map, input_shape = load_features(filename)
    ## load inception model
    model, preprocess_input, input_shape = load_extractor_model(model_name, flavor)
    my_preprocess = lambda x: preprocess_input(utils.pad_image(x, input_shape))

    with open('./data/preprocessed/trained_brands.pkl', 'rb') as f:
        img_input, input_labels = pickle.load(f)

    (img_input, feat_input, sim_cutoff, (bins, cdf_list)) = load_brands_compute_cutoffs(
        img_input, (model, my_preprocess), features, sim_threshold)

    print('Done! It took {:.2f} mins.\n'.format((time.time() - start) / 60))

    return (yolo, model, my_preprocess), (feat_input, sim_cutoff, bins, cdf_list, input_labels)
def main(fea_dir, pos_file, neg_file, out_dir, model_file):
    model = train("{0}/all.fea".format(fea_dir), pos_file, neg_file)
    if save_model is not None:
        save_model(model, model_file)
    for f in listdir(fea_dir):
        # if f == "all.fea":
        #     continue
        file_path = "{0}/{1}".format(fea_dir, f)
        ph_fea = load_features(file_path)
        phrase_list = []
        fea_list = []
        for ph in ph_fea:
            phrase_list.append(ph)
            fea_list.append(ph_fea[ph])
        X = np.asarray(fea_list)
        scores = model.decision_function(X)
        items = [(scores[i], phrase_list[i]) for i in range(len(phrase_list))]
        items.sort(reverse=True)
        save_result(items, "{0}/{1}".format(out_dir, f))
def main(args):
    feat, case_ids = load_features(args.src, zscore=True)
    lab = load_labels(args.labsrc)

    ((nepc_f, nepc_lab), (m0_f, m0_lab),
     (m0p_f, m0p_lab), (m1_f, m1_lab)) = split_sets(feat, lab)

    yvect = ['M0'] * m0_f.shape[0] + ['NEPC'] * nepc_f.shape[0]

    ttests = []
    fig = plt.figure()
    for f in feat.columns:
        m0_ = m0_f.loc[:, f]
        nepc_ = nepc_f.loc[:, f]
        tt = ttest_ind(m0_, nepc_)
        if tt.pvalue < 1e-10:
            feature_data = pd.DataFrame({
                'group': yvect,
                'feature': np.concatenate([m0_, nepc_], axis=0)})
            print(f, tt)
            out = os.path.join(args.dst, 'f_{}.png'.format(f))
            plt.clf()
            # sns.boxplot(x='group', y='feature', data=feature_data)
            sns.distplot(m0_, label='M0')
            sns.distplot(nepc_, label='NEPC')
            plt.legend()
            plt.title('Feature {}'.format(f))
            plt.savefig(out, bbox_inches='tight')
def main(args):
    feat, case_ids = load_features(args.src)
    lab = load_labels(args.labsrc)

    feat = drop_high_cor(feat, cor_thresh=0.8)
    print('Features after high cor drop')
    print(feat.head())

    run_tsne(feat, lab)
def __init__(self, args):
    """
    MUSAE and AE machine constructor.
    :param args: Arguments object with the model hyperparameters.
    """
    self.args = args
    self.log = dict()
    self.graph = load_graph(args.graph_input)
    self.features = load_features(args.features_input)
def main():
    # register arguments
    args = register_arguments()
    input_sequences = SeqIO.parse(args.sequences, 'fasta')
    clade_designations = read_in_clade_definitions(
        f"config/clades_{args.lineage}_ha.tsv")
    refname = f"config/reference_{args.lineage}_ha.gb"
    ref = SeqIO.read(refname, 'genbank')
    features = load_features(refname)
    refstr, refCDS, refAA, cds_start, cds_end = get_cds(ref)

    # get clade relatives and internal clades for likeness assignment
    clades_relatives, internal_clades = load_relatives()

    # output files
    prov_out = args.batchName + "_provanence.txt"
    results_out = args.batchName + "_cladeResults.txt"
    errors_out = args.batchName + "_error.txt"
    bucket_out = args.batchName + "_bucket.txt"

    # results
    results_bucket = ResultsBucket()

    for seq in input_sequences:
        seq_container = tmpNode()
        seq_aln = codon_align(seq, refstr, refAA, cds_start, cds_end)

        # error checking
        if seq_aln is None:
            print(f"{seq.id}\tError translating, check lineage and correct",
                  file=sys.stdout)
            with open(errors_out, 'a') as ef:
                print(f"{seq.id}\tError translating, check lineage and correct",
                      file=ef)
            continue

        clade_provanence = get_provanence(seq_aln, features,
                                          clade_designations, ref)

        # write out results
        with open(prov_out, 'a') as cf:
            print(f"{seq.description}\t{', '.join(clade_provanence)}", file=cf)

        # clade_final = get_likeness(seq, clade_provanence, clades_relatives, internal_clades)
        clade_desig, virus_like, desig = get_likeness(
            seq, clade_provanence, clades_relatives, internal_clades)

        with open(results_out, 'a') as rf:
            print(clade_desig, file=rf)
        print(clade_desig, file=sys.stdout)

        results_bucket.add_result(seqno=seq.description, ha_clade=desig,
                                  result=virus_like,
                                  prov=', '.join(clade_provanence))

    print(results_bucket)
    results_bucket.write_results(bucket_out)
def get_features(limit=1000, features=[], stemmer_type="RegexpStemmer",
                 db_name="yelp_train", standardized=False):
    """
    -----------------------------------------------
    Does a bit of optimization:
    loads features from pickle if features for the specified input
    conditions have already been pickled; otherwise fetches them from
    the database (MongoDB).
    -----------------------------------------------
    """
    pickle_name = "%s_%s_%s_%s.pickle" % (limit, db_name, stemmer_type, "-".join(features))
    if os.path.exists(data_dir + "/X_" + pickle_name):
        X = load_features("X_" + pickle_name)
        y = load_features("Y_" + pickle_name)
        z = load_features("Z_" + pickle_name)
    else:
        #! fetch features from database
        X, y, z = extract_and_save_features(limit=limit, features=features,
                                            stemmer_type=stemmer_type,
                                            db_name=db_name,
                                            standardized=standardized)
    return X, y, z
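# A minimal, hypothetical sketch of the pickle-backed helpers that get_features()
# above relies on (the real load_features and the save step inside
# extract_and_save_features live elsewhere in that project). data_dir and the
# exact signatures are assumptions inferred from the call sites, not the
# project's confirmed API.
import os
import pickle

data_dir = "data"


def save_features(filename, obj):
    """Pickle an object under data_dir/filename."""
    with open(os.path.join(data_dir, filename), "wb") as f:
        pickle.dump(obj, f)


def load_features(filename):
    """Load a previously pickled object from data_dir/filename."""
    with open(os.path.join(data_dir, filename), "rb") as f:
        return pickle.load(f)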
def main():
    torch.backends.cudnn.benchmark = True

    # Load json config
    config = json.load(open("config.json"))
    extracted_features_root = config["extracted_features"]

    # Load data and token len of embedding layers
    print("[+] Load features ...")
    y = utils.load_features(extracted_features_root, "y_train")
    token_len = utils.load_features(extracted_features_root, "token_len")
    X_train_num = utils.load_features(extracted_features_root, "X_train_num")
    X_train_cat = utils.load_features(extracted_features_root, "X_train_cat")
    X_train_desc = utils.load_features(extracted_features_root, "X_train_desc").any()
    X_train_title = utils.load_features(extracted_features_root, "X_train_title").any()
    # X_train_word_desc = utils.load_features(extracted_features_root, "X_train_word_description")
    # X_train_word_title = utils.load_features(extracted_features_root, "X_train_word_title")
    embedding_weights = utils.load_bcolz(extracted_features_root, "embedding_weights")
    X_train_word = [utils.load_bcolz(extracted_features_root, "X_train_word")]

    X_train_text = [X_train_desc, X_train_title]
    # X_train_word = [X_train_word_desc, X_train_word_title]

    n_folds = config["n_fold"]
    if n_folds:
        train_fold(config, n_folds, X_train_num, X_train_cat, X_train_text,
                   X_train_word, embedding_weights, y, token_len)
    else:
        train_normal(config, X_train_num, X_train_cat, X_train_text,
                     X_train_word, embedding_weights, y, token_len)
def train():
    path_to_data = '../../data/processed/'
    path_to_output = '../../data/submissions/'
    path_to_preds = '../../data/predictions/'
    version = '1.1'
    random_seed = 8675309
    sample_size = 50000
    n_folds = 5
    params = {
        'nthread': 8,
        'n_estimators': 10000,
        'learning_rate': 0.02,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'silent': -1,
        'verbose': -1
    }

    train, labels, test, train_ids, test_ids = utils.load_features(
        path_to_data, version, sample_size)

    oof_train, oof_test = utils.kfold(classifier_builder=LightGBMWrapper,
                                      base_classifier=lightgbm.LGBMClassifier,
                                      classifier_params=params,
                                      train=train,
                                      labels=labels,
                                      test=test,
                                      n_folds=n_folds,
                                      random_seed=random_seed,
                                      use_smote=True)

    df_oof_train = pd.DataFrame({
        'SK_ID_CURR': train_ids,
        'TARGET': labels,
        'lightgbm': oof_train
    })
    # df_oof_train['SK_ID_CURR'] = df_oof_train['SK_ID_CURR'].astype('int32')
    df_oof_test = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': oof_test})
    # df_oof_test['SK_ID_CURR'] = df_oof_test['SK_ID_CURR'].astype('int32')

    df_oof_train.to_csv(path_to_preds + version + '-lightgbm.csv', index=False)
    df_oof_test.to_csv(path_to_output + version + '-lightgbm.csv', index=False)
def main(args):
    feat_importance = pd.read_csv(args.src, sep='\t', index_col=0, header=None)
    features, _ = load_features(args.featsrc, zscore=True)
    labels = load_labels(args.labelsrc)

    feat_importance.sort_values(1, ascending=False, inplace=True)

    sns.distplot(feat_importance)
    plt.savefig('tile_feature_importance_dist.png', bbox_inches='tight')

    sns.regplot(np.squeeze(feat_importance.index.values),
                np.squeeze(feat_importance.values))

    feat_importance = feat_importance.iloc[:args.n, :]
    print('highest feature importance:')
    for f in feat_importance.index.values:
        print(f, feat_importance.loc[f].values)
def train():
    path_to_data = '../../data/processed/'
    path_to_output = '../../data/submissions/'
    path_to_preds = '../../data/predictions/'
    version = '1.3'
    random_seed = 8675309
    sample_size = None
    n_folds = 5
    xgb_params = {
        'learning_rate': 0.1,
        'n_estimators': 10000,
        'max_depth': 4,
        'min_child_weight': 5,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'binary:logistic',
        'nthread': 8,
        'seed': random_seed,
        'scale_pos_weight': 2.5,
        'reg_alpha': 1.2,
        'early_stopping_rounds': 50,
        'verbose': 20,
        'eval_metric': 'auc'
    }

    train, labels, test, train_ids, test_ids = utils.load_features(
        path_to_data, version, sample_size)

    oof_train, oof_test = utils.kfold(classifier_builder=XgboostWrapper,
                                      base_classifier=XGBClassifier,
                                      classifier_params=xgb_params,
                                      train=train,
                                      labels=labels,
                                      test=test,
                                      n_folds=n_folds,
                                      random_seed=random_seed)

    df_oof_train = pd.DataFrame({'SK_ID_CURR': train_ids,
                                 'TARGET': labels,
                                 'xgboost': oof_train})
    df_oof_train.fillna(0, inplace=True)
    df_oof_train['SK_ID_CURR'] = df_oof_train['SK_ID_CURR'].astype('int32')
    df_oof_test = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': oof_test})
    df_oof_test['SK_ID_CURR'] = df_oof_test['SK_ID_CURR'].astype('int32')

    df_oof_train.to_csv(path_to_preds + version + '-xgboost.csv', index=False)
    df_oof_test.to_csv(path_to_output + version + '-xgboost.csv', index=False)
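# A minimal, hypothetical sketch of the out-of-fold k-fold routine that the
# utils.kfold calls above appear to implement: stratified folds, a fresh model
# per fold, out-of-fold predictions for the train set and fold-averaged
# predictions for the test set. The helper name and signature are illustrative
# assumptions (not the project's API); it assumes numpy arrays and an
# sklearn-compatible classifier whose constructor accepts classifier_params.
import numpy as np
from sklearn.model_selection import StratifiedKFold


def kfold_oof(base_classifier, classifier_params, train, labels, test,
              n_folds=5, random_seed=0):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    oof_train = np.zeros(len(train))
    oof_test = np.zeros(len(test))
    for tr_idx, va_idx in skf.split(train, labels):
        clf = base_classifier(**classifier_params)
        clf.fit(train[tr_idx], labels[tr_idx])
        # out-of-fold predictions for the held-out slice of the train set
        oof_train[va_idx] = clf.predict_proba(train[va_idx])[:, 1]
        # test predictions are averaged across folds
        oof_test += clf.predict_proba(test)[:, 1] / n_folds
    return oof_train, oof_test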
def main():
    dataset = 'test'
    features = utils.load_features(dataset)
    error_table = []  # list of dictionaries

    for component in config.components:
        L = config.num_basis[component]
        D, denom_sq = utils.load_POD_D_and_denom_sq(component)
        eps_pod_sqs = utils.load_error_POD_sq(dataset, component)
        targets = utils.load_targets(dataset, component)

        outputs = {}
        for model_key, model_constructor in utils.models.items():
            model = model_constructor()
            model.load(utils.model_dir, component)
            outputs[model_key] = model.evaluate(features)

        ## for each sample in test set
        for i in range(len(targets)):
            for l in range(L + 1):
                eps_pod_sq = eps_pod_sqs[l, i]
                line = {
                    'component': component,
                    'l': l,
                    'sample': i,
                    'dataset': dataset,
                    'eps_pod_sq': eps_pod_sq
                }
                for model_key in utils.models:
                    q_rb_model = outputs[model_key][i][:l]
                    q_rb_truth = targets[i][:l]
                    D_l = D[:l]
                    eps_reg_sq = np.sum((D_l * (q_rb_model - q_rb_truth))**2) / denom_sq
                    eps_sq = eps_pod_sq + eps_reg_sq
                    eps_key = F'eps_pod{model_key.lower()}_sq'
                    line[eps_key] = eps_sq
                error_table.append(line)

    df = pd.DataFrame(error_table)
    utils.save_error_table(df, dataset)
def initialize(yolo, model_name, DB_path):
    print("\n\nInitialization in progress...!\n")
    start = time.time()

    # load pre-processed features database
    features, _, _ = load_features(model_name)
    with open(args.classes_path, 'rb') as f:
        # img_input, input_labels = pickle.load(f)
        input_feats, input_labels = pickle.load(f)

    # load pre-trained recognition model
    model, preprocessed, input_shape = load_extractor_model(model_name)
    my_preprocess = lambda x: preprocessed(pad_image(x, input_shape))

    # input_feats = extract_features(img_input, model, my_preprocess)
    sim_cutoff, (bins, cdf_list) = similarity_cutoff(input_feats, features, 0.95)

    print("Done...! It took {:.3f} mins\n".format((time.time() - start) / 60))
    return (yolo, model, my_preprocess), (input_feats, sim_cutoff, bins, cdf_list, input_labels)
def main(args): """ Characteristic function embedding wrapper. :param args: Arguments object parsed up. """ if args.model_type == "FEATHER": print("\nFitting a node embedding.\n") graph = load_graph(args.graph_input) features = load_features(args.feature_input) model = FEATHER() model.fit(graph, features) elif args.model_type == "FEATHER-G": print("\nFitting a graph level embedding.\n") graphs = load_graphs(args.graphs_input) model = FEATHERG() model.fit(graphs) else: quit() X = model.get_embedding() save_embedding(X, args.output)
        'Chosen features: "%s". Compute %i Nearest Neighbors of %i randomly chosen postures. '
        'The Results will be saved in "%s".'
        % (args.feature_type, args.nn_per_query, args.number_of_queries, results_fold))
elif 'lstm' in args.feature_type:
    print(
        'Chosen features: "%s". Compute %i Nearest Neighbors of %i randomly chosen sequences. '
        'The Results will be saved in "%s".'
        % (args.feature_type, args.nn_per_query, args.number_of_queries, results_fold))
else:
    raise ValueError(
        'Chosen Features (%s) are not available. Please choose "fc6", "fc7" or "fc6fc7" '
        'for posture features or "lstm" for sequence features.' % args.feature_type)

print('Load features...')
feat, frames, coords, vids = load_features(args.feature_type,
                                           cfg.features_path,
                                           uni_videos.tolist())

############################################
# 2. compute NN and plot it
############################################
k = args.nn_per_query  # number of nearest neighbors per query
nr = min(args.number_of_queries, len(feat))  # number of queries
idx = np.random.permutation(len(feat))[:nr]  # choose queries randomly

# plot queries and NN
if 'fc6' in args.feature_type or 'fc7' in args.feature_type:
    n_mean_nn = 100  # also plot the mean over the 100 nearest neighbors of the queries
    print('Compute %i Nearest Neighbor for %i queries' % (k, nr))
    D, I = compute_NN(feat, min(100 * n_mean_nn, feat.shape[0]), idx)
    nr_rows, r, fig_nr = 10, 0, 1
# Columns that we want to train on (earlier candidate sets kept for reference;
# only the last assignment takes effect)
cols = ['ax_l', 'ay_l', 'az_l', 'ax_r', 'ay_r', 'az_r', 'a_res_l', 'a_res_r']
cols = ['ax_l', 'ay_l', 'az_l', 'ax_r', 'ay_r', 'az_r']
cols = ['ax_diff', 'ay_diff', 'az_diff']
cols = ['ax_diff', 'ay_diff', 'az_diff', 'a_res_diff']
cols = ['ax_l', 'ay_l', 'az_l', 'ax_r', 'ay_r', 'az_r', 'ax_diff', 'ay_diff', 'az_diff']

for event in events:
    for event_type in event_types:
        x = []
        for col in cols:
            directory = get_directory(initial_directory=data_folder,
                                      columns=col,
                                      est_events=True,
                                      event=event,
                                      event_type=event_type)

            # Load features (after extract data has been run)
            X_dictionary, y_dictionary, groups = load_features(data_folder,
                                                               directory,
                                                               est_events=True)
            x.append(X_dictionary)

        X = {}
        for k in X_dictionary.keys():
            concat_list = []
            for idx in x:
                concat_list.append(idx[k])
            X[k] = pd.concat(concat_list, axis=1)

        y = y_dictionary
############################################
# 1. Load sequences and features
############################################
detections = load_table(cfg.detection_file, asDict=False)
det_cohort = np.array(detections['cohort'])  # Used for classifier and plots
det_time = np.array(detections['time'])      # Used for classifier and plots
det_frames = np.array(detections['frames'])
det_videos = np.array(detections['videos'])
uni_videos = np.unique(detections['videos'].values)
# uni_videos = [v for v in uni_videos if '2kmh' in v]
# uni_videos = [v for v in uni_videos if 'H' in v]
uni_videos = np.array([v for v in uni_videos if os.path.isdir(cfg.crops_path + v)])

print('Load features...')
pos_features, pos_frames, pos_coords, pos_videos = load_features('fc6',
                                                                 cfg.features_path,
                                                                 uni_videos.tolist())

############################################
# 2. Posture healthy/impaired assignment
############################################
video_time = np.array([det_time[det_videos == v][0] for v in uni_videos]).astype(int)
video_cohort = np.array([det_cohort[det_videos == v][0] for v in uni_videos]).astype(int)
pos_time = np.concatenate([video_time[uni_videos == v] for v in pos_videos])

healthy, impaired = pos_time == 0, pos_time == 1
h_pos_feat, h_pos_videos = pos_features[healthy], pos_videos[healthy]
h_pos_frames, h_pos_coords = pos_frames[healthy], pos_coords[healthy]
i_pos_feat, i_pos_videos = pos_features[impaired], pos_videos[impaired]
i_pos_frames, i_pos_coords = pos_frames[impaired], pos_coords[impaired]

############################################
def data_processing(files, seg_per_sent=3, debug=False):
    data_all = []
    # stats
    full_sent_pos, seg_num_doc = [], []
    total_instance = 0

    for fi, file in tqdm(enumerate(files)):
        data = load_features(file)  # [(name, [(time_elapsed, parsed_words, segments, full_sent_pos), ...])]

        doc_data = []
        for doc in data:  # doc
            doc_name = doc[0]
            if debug:
                print(doc_name, fi)
            num_sent = len(doc[1])
            seg_nums = 0
            sent_data = []
            for i in range(num_sent):  # sentence
                sent = doc[1][i]
                segments = []
                for ssent in sent:
                    parsed_words, segments_all = ssent[1], ssent[2]
                    num_seg = len(segments_all)
                    sub_segments = []
                    is_full_sent = False
                    sum_list = [0, 1]

                    if isinstance(segments_all, str):
                        full_sent_key = '000{:03}'.format(len(parsed_words))
                        sub_segments.append([
                            full_sent_key, 0, 0, True, True,
                            [0 for _ in sum_list]
                        ])
                        full_sent_pos.append(0)
                    else:
                        segments_all = list(segments_all.items())
                        probs = [[] for _ in sum_list]
                        max_ed = -1
                        for j in range(num_seg):
                            key, prob = segments_all[j][0], segments_all[j][1]
                            for p in range(len(probs)):
                                # P(comma) + P(period)
                                probs[p].append(prob[p][1] + prob[p][2])
                            _, ed = decode_index(key)
                            max_ed = max(max_ed, ed)
                        full_sent_key = '000{:03d}'.format(max_ed)

                        mode = 0  # if > 0, scaling
                        if mode > 0:
                            probs = [scaling(pr, mode) for pr in probs]

                        # ! filtering RULE !
                        quant_ratio = 0.75  # 0.5 for median
                        thres = [np.quantile(prs, quant_ratio) for prs in probs]

                        num_sel_seg = 0
                        for j in range(num_seg):
                            key, prob = segments_all[j][0], segments_all[j][1]
                            st, ed = decode_index(key)
                            rouge_score = 0
                            sel_type = 0
                            # segment selection by threshold
                            if probs[0][j] < thres[0] or probs[1][j] < thres[1]:
                                sel_type = 1
                            # optional RULE
                            if parsed_words[st] == 'and' or parsed_words[ed - 1] == 'and':
                                sel_type = 2
                            if sel_type == 0:
                                num_sel_seg += 1
                            if key == full_sent_key:
                                full_sent_pos.append(j)
                                is_full_sent = True
                            # segment: [key, psum, rouge, is_full_sent, is_sel, probs.]
                            sub_segments.append([
                                key,
                                np.sum([probs[p][j] for p in sum_list]),
                                rouge_score, is_full_sent, sel_type,
                                [probs[p][j] for p in sum_list]
                            ])
                            is_full_sent = False

                        if debug:
                            # raw segments based on XLNet prob. dist.
                            segment_sorted = sorted(sub_segments,
                                                    key=lambda x: x[1],
                                                    reverse=True)
                            print('\n[Full sentence]: ', ' '.join(parsed_words))
                            print('\n[XLNet segments - sorted by prob. sum]')
                            topn_seg = 10
                            for si, seg in enumerate(segment_sorted):
                                if si < topn_seg or si >= len(segment_sorted) - topn_seg:
                                    seg_text = gen_segment_text(seg, parsed_words)
                                    app_txt = 'top-{}'.format(topn_seg) if si < topn_seg \
                                        else 'bot-{}'.format(topn_seg)
                                    fi = '-f' if seg[0] == full_sent_key else ''
                                    prob_txt = 'sel:[{}] Sum:{:.3e}, L-C + L-P:{:.3e}, R-C: + R-P:{:.3e}'.format(
                                        seg[4], seg[1], seg[5][0], seg[5][1])
                                    print('{:03}/{:03} [{}{}]'.format(
                                        si + 1, len(segment_sorted), app_txt, fi),
                                        prob_txt, seg_text)

                        sub_segments = [
                            sseg for sseg in sub_segments
                            if (sseg[4] == 0) and (not sseg[3])
                        ]
                        sub_segments = sorted(sub_segments,
                                              key=lambda x: x[1],
                                              reverse=True)

                        if debug:
                            # selected segments
                            print('\n[candidate segments filtered by median - sorted]')
                            for si, seg in enumerate(sub_segments):
                                seg_text = gen_segment_text(seg, parsed_words)
                                fi = '-f' if seg[3] else ''
                                prob_txt = 'sel:[{}] Sum:{:.3e}, L-C + L-P:{:.3e}, R-C: + R-P:{:.3e}'.format(
                                    seg[4], seg[1], seg[5][0], seg[5][1])
                                print('{:03}/{:03}{}'.format(si + 1, num_sel_seg, fi),
                                      prob_txt, seg_text)

                        # final sub-segments
                        if len(sub_segments) == 0:
                            sub_segments.append([
                                full_sent_key, 0, 0, True, True,
                                [0 for _ in sum_list]
                            ])
                        else:
                            sub_segments = sub_segments[:seg_per_sent]

                        if debug:
                            # selected segments
                            print('\n[final candidate segments]')
                            for si, seg in enumerate(sub_segments):
                                seg_text = gen_segment_text(seg, parsed_words)
                                fi = '-f' if seg[3] else ''
                                prob_txt = 'sel:[{}] Sum:{:.3e}, L-C + L-P:{:.3e}, R-C: + R-P:{:.3e}'.format(
                                    seg[4], seg[1], seg[5][0], seg[5][1])
                                print('{:03}/{:03}{}'.format(si + 1, num_sel_seg, fi),
                                      prob_txt, seg_text)
                            pdb.set_trace()

                    seg_nums += len(sub_segments)

                    # exclude less than 5 words (not chunks)
                    final_sub_segments = []
                    for sseg in sub_segments:
                        seg_text = gen_segment_text(sseg, parsed_words)
                        if len(seg_text.split()) >= 5:
                            final_sub_segments.append(sseg)

                    segments.append((parsed_words, final_sub_segments))
                sent_data.append(segments)
            doc_data.append((doc_name, sent_data))
            seg_num_doc.append(seg_nums)

        data_all = data_all + doc_data
        total_instance += len(data)

    print('data num.: {} {}'.format(total_instance, len(data_all)))
    return data_all, full_sent_pos, seg_num_doc
    print(input_path + '/{}*'.format(split))
    files = sorted(glob.glob(input_path + '/{}*'.format(split)))
    print('{} files are found'.format(len(files)))

    filename = os.path.join(output_path, split + '.pkl')
    filename_stats = os.path.join(output_path, split + '_stats.pkl')
    if not os.path.exists(filename):
        st_time = time.time()
        # merge data based on filtering rule
        data_all, full_sent_pos, seg_num_doc = data_processing(
            files, args.seg_per_sent, args.debug)
        save_features(filename, data_all)
        save_features(filename_stats, [full_sent_pos, seg_num_doc])
        print('total num. sentences', len(full_sent_pos))
        print('elapsed time: {:.3f}s'.format(time.time() - st_time))
    else:
        data_all = load_features(filename)
        full_sent_pos, seg_num_doc = load_features(filename_stats)
        print('data is loaded from {} and {}'.format(filename, filename_stats))

    full_sent_pos_list.append(full_sent_pos)
    print_stats(seg_num_doc, '{}-seg_num_doc'.format(split))

# draw data stats
draw_stats(full_sent_pos_list, splits, data_name)
''' extract feature '''
# print('===> extract features for every videos ...')
# utils.extract_feature_p1(feature_extractor, train_loader, val_loader, args)

''' define loss '''
criterion = nn.CrossEntropyLoss()

''' setup optimizer '''
FC.cuda()
optimizer = torch.optim.Adam(FC.parameters(), lr=args.lr,
                             weight_decay=args.weight_decay)
sched = lr_scheduler.StepLR(optimizer, step_size=50)

''' setup tensorboard '''
writer = SummaryWriter(os.path.join(args.save_dir, 'train_info'))

''' load train and val features '''
print('===> load train and val features and labels ...')
train_features, train_label, valid_features, valid_labels = utils.load_features(args)

''' train model '''
print('===> start training ...')
iters = 0
best_acc = 0
for epoch in range(1, args.epoch + 1):
    FC.train()
    utils.set_requires_grad(FC, True)

    total_length = train_features.shape[0]
    perm_index = torch.randperm(total_length)
    train_X_sfl = train_features[perm_index]
    train_y_sfl = train_label[perm_index]

    # construct training batch
    for index in range(0, total_length, args.train_batch):
        train_info = 'Epoch: [{0}][{1}/{2}]'.format(
            epoch, index + 1, len(train_loader))
def test(filename):
    """
    Test function: runs pipeline for a small set of input images and input brands.
    """
    yolo = YOLO(**{"model_path": 'keras_yolo3/yolo_weights_logos.h5',
                   "anchors_path": 'keras_yolo3/model_data/yolo_anchors.txt',
                   "classes_path": 'data_classes.txt',
                   "score": 0.05,
                   "gpu_num": 1,
                   "model_image_size": (416, 416),
                   })
    save_img_logo, save_img_match = True, True

    test_dir = os.path.join(os.path.dirname(__file__), os.path.pardir, 'data/test')

    # get Inception/VGG16 model and flavor from filename
    model_name, flavor = model_flavor_from_name(filename)
    ## load pre-processed features database
    features, brand_map, input_shape = load_features(filename)

    ## load inception model
    model, preprocess_input, input_shape = load_extractor_model(model_name, flavor)
    my_preprocess = lambda x: preprocess_input(utils.pad_image(x, input_shape).astype(np.float32))

    ## load sample images of logos to test against
    input_paths = ['test_batman.jpg', 'test_robin.png', 'test_lexus.png',
                   'test_champions.jpg', 'test_duff.jpg', 'test_underarmour.jpg',
                   'test_golden_state.jpg']
    input_labels = [s.split('test_')[-1].split('.')[0] for s in input_paths]
    input_paths = [os.path.join(test_dir, 'test_brands/', p) for p in input_paths]

    # compute cosine similarity between input brand images and all LogosInTheWild logos
    (img_input, feat_input, sim_cutoff, (bins, cdf_list)) = load_brands_compute_cutoffs(
        input_paths, (model, my_preprocess), features, sim_threshold, timing=True)

    images = [p for p in os.listdir(os.path.join(test_dir, 'sample_in/')) if p.endswith('.jpg')]
    images_path = [os.path.join(test_dir, 'sample_in/', p) for p in images]

    start = timer()
    times_list = []
    img_size_list = []
    candidate_len_list = []
    for i, img_path in enumerate(images_path):
        outtxt = img_path

        ## find candidate logos in image
        prediction, image = detect_logo(yolo, img_path, save_img=True,
                                        save_img_path=test_dir, postfix='_logo')

        ## match candidate logos to input
        outtxt, times = match_logo(image, prediction, (model, my_preprocess), outtxt,
                                   (feat_input, sim_cutoff, bins, cdf_list, input_labels),
                                   save_img=save_img_match, save_img_path=test_dir,
                                   timing=True)

        img_size_list.append(np.sqrt(np.prod(image.size)))
        candidate_len_list.append(len(prediction))
        times_list.append(times)

    end = timer()
    print('Processed {} images in {:.1f}sec - {:.1f}FPS'.format(
        len(images_path), end - start, len(images_path) / (end - start)))

    fig, axes = plt.subplots(1, 2, figsize=(9, 4))
    for iax in range(2):
        for i in range(len(times_list[0])):
            axes[iax].scatter([candidate_len_list, img_size_list][iax],
                              np.array(times_list)[:, i])
        axes[iax].legend(['read img', 'get box', 'get features', 'match', 'draw', 'save'])
        axes[iax].set(xlabel=['number of candidates', 'image size'][iax],
                      ylabel='Time [sec]')
    plt.savefig(os.path.join(test_dir, 'timing_test.png'))
        sum_path=test_sum_path, is_duc=False, topn_sent=args.topn_sent)

dest_dir = os.path.join(args.base_path,
                        os.path.dirname(args.TAC_data_path[0]),
                        args.data_type)
data_name = 'TAC'

train_ids = None
test_ids = None

if args.data_type == 'xlnet':
    train_file = os.path.join(dest_dir, 'train.pkl')
    train_data = load_features(train_file)
    test_file = os.path.join(dest_dir, 'test.pkl')
    test_data = load_features(test_file)

    data_ext = []
    summary, Y = text_train.ref[:train_ids], text_train.Y[:train_ids]
    name, pos = text_train.name[:train_ids], text_train.pos[:train_ids]
    data_ext.append([summary, Y, name, pos])

    summary, Y = text_test.ref[:test_ids], text_test.Y[:test_ids]
    name, pos = text_test.name[:test_ids], text_test.pos[:test_ids]
    data_ext.append([summary, Y, name, pos])

    text_dir = [
        os.path.join(dest_dir, 'train'),
        os.path.join(dest_dir, 'test')
    ]
features_file = sys.argv[1]
input_file = sys.argv[2]
output_file = sys.argv[3]

target_col = 'SalaryNormalized'

cols2tokenize = ['Title', 'FullDescription']
cols2binarize = ['Loc1', 'Loc2', 'Loc3', 'Loc4', 'Loc5', 'ContractType',
                 'ContractTime', 'Company', 'Category', 'SourceName']
cols2drop = ['SalaryRaw']
# only some features from these columns
cols2filter = ['Title', 'FullDescription', 'FullDescription']

###

print "loading features..."
features_by_col = load_features(features_file)

print "%s ---> %s" % (input_file, output_file)

i_f = open(input_file)
o_f = open(output_file, 'wb')

reader = csv.reader(i_f)
headers = reader.next()

target_index = headers.index(target_col)
indexes2tokenize = map(lambda x: headers.index(x), cols2tokenize)
indexes2binarize = map(lambda x: headers.index(x), cols2binarize)
indexes2drop = map(lambda x: headers.index(x), cols2drop)
indexes2filter = map(lambda x: headers.index(x), cols2filter)
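# The snippet above is Python 2 (print statements, reader.next(), list-returning
# map). A minimal Python 3 sketch of the same header/column-index lookup, kept
# separate so the original behaviour is untouched; it reuses the variable names
# defined above.
import csv

with open(input_file, newline='') as i_f:
    reader = csv.reader(i_f)
    headers = next(reader)                      # reader.next() in Python 2
    target_index = headers.index(target_col)
    indexes2tokenize = [headers.index(c) for c in cols2tokenize]
    indexes2binarize = [headers.index(c) for c in cols2binarize]
    indexes2drop = [headers.index(c) for c in cols2drop]
    indexes2filter = [headers.index(c) for c in cols2filter]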
def train_model(config, _debug, logger, start_dt, train_and_predict):
    """
    Train a model with features; the model and features are designated in config.
    """
    features = config['features']
    label_name = config['label_name']
    id_name = config['id_name']

    # load only train features and label
    x_train_all = load_features(features, _debug, target='train')
    y_train_all = load_target(label_name, _debug)
    gc.collect()
    logger.debug('x_train_all:{0}'.format(x_train_all.shape))
    logger.debug('y_train_all:{0}'.format(y_train_all.shape))

    # save feature names and index
    feature_names = x_train_all.columns.tolist()
    x_train_idx = x_train_all.index

    # convert from df to matrix
    x_train_all = df_to_matrix(x_train_all)

    # load model params
    params = config['params']
    seed = config['seed']
    model_name = config['model_name']

    # generate stratified k-fold instance
    n_splits = config['n_splits']
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # to store results
    y_te_prs = np.zeros(len(y_train_all))
    scores_tr, scores_te = defaultdict(list), defaultdict(list)
    importances_df = pd.DataFrame()
    trained_models = []

    # cross validation
    for _fold, (tr_idx, te_idx) in enumerate(skf.split(x_train_idx, y_train_all)):
        _fold += 1
        logger.debug('------ {0} / {1} fold ------'.format(_fold, n_splits))

        # extract dataset
        x_tr, x_te = x_train_all[tr_idx, :], x_train_all[te_idx, :]
        y_tr, y_te = y_train_all[tr_idx], y_train_all[te_idx]
        logger.debug('x_tr:{0} x_te:{1}'.format(x_tr.shape, x_te.shape))
        logger.debug('y_tr:{0} y_te:{1}'.format(y_tr.shape, y_te.shape))

        # train model
        y_tr_pr, y_te_pr, model = train_and_predict(x_tr, y_tr, x_te, params)

        # save prediction
        y_te_prs[te_idx] += y_te_pr / (n_splits - 1)

        # compute metric
        scores_tr = calc_metrics(scores_tr, y_tr_pr, y_tr)
        scores_te = calc_metrics(scores_te, y_te_pr, y_te)
        logger.debug('[{0}f] train_acc:{1} test_acc:{2}'.format(
            _fold, scores_tr['acc'][-1], scores_te['acc'][-1]))
        logger.debug('[{0}f] train_auc:{1} test_auc:{2}'.format(
            _fold, scores_tr['auc'][-1], scores_te['auc'][-1]))

        # save model
        trained_models.append(model)

        # feature importance
        if hasattr(model, 'feature_importances_'):
            importances_df['{}_fold'.format(_fold)] = model.feature_importances_
        elif hasattr(model, 'coef_'):
            importances_df['{}_fold'.format(_fold)] = model.coef_.flatten()

        del x_tr, x_te, y_tr, y_te, y_tr_pr, y_te_pr, model
        gc.collect()

    # mean metrics
    scores_cv_tr = np.mean(pd.DataFrame(scores_tr), axis=0)
    scores_cv_te = np.mean(pd.DataFrame(scores_te), axis=0)
    logger.debug('------ cross validation ------')
    logger.debug('[cv] train_acc:{0}, test_acc:{1}'.format(
        scores_cv_tr['acc'], scores_cv_te['acc']))
    logger.debug('[cv] train_auc:{0}, test_auc:{1}'.format(
        scores_cv_tr['auc'], scores_cv_te['auc']))

    if importances_df.any(axis=None):
        # mean feature importance
        importances_df = pd.DataFrame({
            'feature': feature_names,
            'importance': np.mean(importances_df, axis=1)
        })

        # save
        file_name = 'importances_{0:%m%d_%H%M%S}_{1:.5f}_{2}'.format(
            start_dt, scores_cv_te['auc'], model_name)
        importances_df.to_csv('../../data/output/{0}.csv'.format(file_name),
                              index=False)

        # plot
        fig = plot_importances(importances_df, file_name)
        fig.savefig('../../figures/feature_importance/{0}.png'.format(file_name))

    # save prediction on te dataset
    train_df = pd.read_pickle('../../data/input/train.pkl')
    if _debug:
        train_df = train_df.iloc[:int(train_df.shape[0] / 100)]
    y_te_prs_df = pd.DataFrame({
        'id': train_df[id_name],
        'pred': y_te_prs,
        'truth': y_train_all
    })
    logger.debug('y_te_prs_df:{0}'.format(y_te_prs_df.shape))
    del train_df
    gc.collect()

    # save prediction on cross-validation test
    y_te_prs_df.to_pickle(
        '../../data/output/val_{0:%m%d_%H%M%S}_{1:.5f}_{2}.pkl'.format(
            start_dt, scores_cv_te['auc'], model_name))
    del y_te_prs_df
    gc.collect()

    # save models
    model_path = '../../models/models_{0:%m%d_%H%M%S}_{1:.5f}_{2}.pkl'.format(
        start_dt, scores_cv_te['auc'], model_name)
    with open(model_path, 'wb') as f:
        pickle.dump(trained_models, f)
import random
import copy

import numpy as np

# ML
import torch
from torch.utils.tensorboard import SummaryWriter

from models import RecursiveNN_Linear
from rosetta import train_model, test_model
from utils import create_loader, load_features

logdir = "./logs/"
folds = 3
dataset, _ = load_features(split=False, nt=False)


def population_generator(pop, pop_size):
    """Generate a random population of size pop_size."""
    for _ in range(pop_size + 1):
        epochs = np.random.randint(low=1, high=100)
        pop.append({
            "N1": np.random.randint(low=4, high=64),
            "N2": np.random.randint(low=4, high=64),
            "lr": np.random.randint(low=1, high=10) * 1e-4,
            "gamma": np.random.random_sample(),
            "batch_size_train": np.random.randint(low=32, high=512),
            "epochs": epochs,
            "out_features": np.random.randint(low=1, high=15),
            "leaky_relu": bool(random.getrandbits(1)),
import argparse
import faiss
import time

import utils

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Train and save an empty FAISS index')
    parser.add_argument('index_type', type=str, help='String for index_factory()')
    parser.add_argument('train_data', type=str, help='Path to train data')
    parser.add_argument('index_file', type=str, help='Output Index file')
    args = parser.parse_args()

    x = utils.load_features(args.train_data, 'rmac')[...]
    n, d = x.shape

    index = faiss.index_factory(d, args.index_type)

    train_time = time.time()
    index.train(x)
    train_time = time.time() - train_time
    print('Training Time:', train_time)

    faiss.write_index(index, args.index_file)
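# A minimal sketch (not part of the script above) of how the trained but still
# empty index could be filled and queried later. The file path, placeholder
# vectors, and k value are illustrative; faiss.read_index, Index.add and
# Index.search are standard FAISS calls.
import faiss
import numpy as np

index = faiss.read_index('trained.index')             # the file written above
xb = np.random.rand(1000, index.d).astype('float32')  # database vectors (placeholder)
xq = np.random.rand(5, index.d).astype('float32')     # query vectors (placeholder)

index.add(xb)                # populate the trained index
D, I = index.search(xq, 10)  # distances and ids of the 10 nearest neighbours
print(I.shape)               # (5, 10)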
sim_file = '{}_sim*'.format(args.split)
file_pattern = [y_name_pos_file, imp_file, sim_file, imp_vector_file]
file_names = ['y_name_pos.pkl', 'imp.pkl', 'sim.pkl', 'imp_vector.pkl']

for i, pf in enumerate(zip(file_pattern, file_names)):
    pattern, fn = pf
    pattern_ = os.path.join(BERT_base_dir, pattern)
    file_n = 'imp_vector.h5' if args.dataset == 2 and i == 3 else fn
    file_name = os.path.join(BERT_output_dir, file_n)

    files = sorted(glob.glob(pattern_))
    print('found {} files for {}'.format(len(files), pattern_))

    if i == 0:
        Y_data, name_data, pos_data = [], [], []
        for file in files:
            data = load_features(file)  # 'Y': Y, 'name': name, 'pos': pos
            Y_data = Y_data + data['Y']
            name_data = name_data + data['name']
            pos_data = pos_data + data['pos']
        save_features(file_name, {
            'Y': Y_data,
            'name': name_data,
            'pos': pos_data
        })
    else:
        data_all = []
        for file in files:
            data = load_features(file)
            data_all = data_all + data
        if args.dataset == 2 and i == 3:
        default='index_img')
    par.add_argument('--input_image', type=str, dest='input_image',
                     help='input image path to search query', required=True)
    return par


def build_search(images_features, file_index, image_feature):
    image_index = utils.index_features(images_features, dims=4096)
    results = utils.search_index_by_value(image_feature, image_index, file_index)
    print(results)


if __name__ == "__main__":
    parser = build_parser()
    options = parser.parse_args()
    features_path = options.features_path
    file_mapping = options.file_mapping
    input_image = options.input_image

    model = utils.load_headless_pretrained_model()
    image = utils.load_img(input_image)
    image_feature = model.predict(image).reshape((4096,))
    print(image_feature.shape)

    images_features, file_index = utils.load_features(features_path, file_mapping)
    build_search(images_features, file_index, image_feature)