def save_corrected_item(item, correct, tags):
    status = "NOT_VALID" if correct == "NOT_VALID" else "VALID"
    session.corrected_items.append({
        "_id": item["_id"],
        "original": item["ingredients_text_fr"],
        "correct": correct,
        "tags": [status] + tags,
    })
    save_dataset(FR_TEST_SET_PATH, session.corrected_items)
def generate_dataset(forms_dict, vect_words, cls_dic):
    rez_dict = {}
    for key in tqdm(forms_dict, desc="Generating dataset"):
        item = forms_dict[key]
        root = item['root']
        x_cls = cls_dic[root['main']]
        x, x_len = vect_words[root['text']]['vect']
        prefix_filter = root['text'][:PREFIX_FILTER_LENGTH]
        prefix_filter_e = prefix_filter.replace('ё', 'е')
        if MIN_WORD_SIZE > len(root['text']):
            continue
        form_dict = {}
        for form in item['items']:
            if MIN_WORD_SIZE > len(form['text']):
                continue
            if 'ad_tags' in form and any(tag in form['ad_tags'] for tag in IGNORE_TAGS):
                # tqdm.write("Ignore form {0} for {1} by tags {2}".format(form['text'], root['text'], form['ad_tags']))
                continue
            if not (form['text'].startswith(prefix_filter)
                    or form['text'].replace('ё', 'е').startswith(prefix_filter_e)):
                # tqdm.write("Ignore form {0} for {1}".format(form['text'], root['text']))
                continue
            y_cls = cls_dic[form['main']]
            if y_cls in form_dict and form_dict[y_cls]['index'] < form['index']:
                # tqdm.write("Ignore duplicate form {0} [{1}] for {2}".format(form['text'], form_dict[y_cls]['text'], root['text']))
                continue
            form_dict[y_cls] = form
        for y_cls in form_dict:
            form = form_dict[y_cls]
            y, y_len = vect_words[form['text']]['vect']
            if y_cls not in rez_dict:
                rez_dict[y_cls] = []
            rez_dict[y_cls].append(dict(
                id=form['inflect_id'],
                x_src=root['text'], x=x, x_cls=x_cls, x_len=x_len,
                y_src=form['text'], y=y, y_cls=y_cls, y_len=y_len,
            ))
    save_dataset(rez_dict, 'inflect')
def predict_save(self, items):
    self.last_experiment_path = new_experiment_path(self.name)
    predictions = self.predict(items)
    # Temporarily attach predictions so they are serialized with the items
    for item, prediction in zip(items, predictions):
        item["prediction"] = prediction
    save_dataset(self.last_experiment_path, items)
    # Restore the items to their original shape
    for item, prediction in zip(items, predictions):
        del item["prediction"]
    return predictions
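# The helpers used by predict_save are not shown. A minimal sketch, assuming
# each experiment is written to its own timestamped JSON file (the directory
# layout and the JSON format are assumptions, not the project's actual code):
import json
import os
from datetime import datetime

def new_experiment_path(name):
    # Hypothetical: one file per run, named after the experiment and a timestamp
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return os.path.join("experiments", f"{name}_{stamp}.json")

def save_dataset(path, items):
    # Hypothetical JSON writer; the real serialization format is not shown
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)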
def generate(vec_words, main_cls_dic):
    dict_words = []
    rez_dict = defaultdict(list)
    for word in tqdm(vec_words, desc="Generating lemma dataset"):
        dic = vec_words[word]
        x_vec = dic['vect']
        for form in dic['forms']:
            main_cls = main_cls_dic[form['main']]
            if 'lemma' in form:
                word_y = form['lemma']
            else:
                continue
            if word_y not in vec_words \
                    or MIN_WORD_SIZE > len(word_y) \
                    or MIN_WORD_SIZE > len(word):
                continue
            if word_y[:PREFIX_FILTER_LENGTH] != word[:PREFIX_FILTER_LENGTH] \
                    and word_y[:PREFIX_FILTER_LENGTH].replace('ё', 'е') != word[:PREFIX_FILTER_LENGTH].replace('ё', 'е') \
                    and form['post'] != 'comp':
                # tqdm.write('Word to dictionary: {0} -> {1}'.format(word, word_y))
                dict_words.append(dict(text=word, text_y=word_y, main=main_cls,
                                       id=form['inflect_id']))
                continue
            y_vec = vec_words[word_y]['vect']
            rez_dict[main_cls].append({
                'id': form['inflect_id'],
                'x_src': word, 'x': x_vec[0], 'x_len': x_vec[1],
                'y_src': word_y, 'y': y_vec[0], 'y_len': y_vec[1],
                'main_cls': main_cls,
            })
    save_dataset(rez_dict, 'lemma')
    save_dictionary_items(dict_words, 'lemma')
def change_nouns(split):
    data = load_dataset(split)
    changed_data = []
    total_length = len(data)
    print(f"Changing noun words for {split}")
    for i, element in enumerate(data):
        instructions = element["instructions"]
        changed_instructions = [replace_nouns(instr) for instr in instructions]
        element["instructions"] = changed_instructions
        changed_data.append(element)
        print_progress(i + 1, total_length, prefix='Progress:',
                       suffix='Complete', bar_length=50)
    save_dataset(split, changed_data, "nouns")
def generate_dataset(vec_words, cls_type, cls_dic):
    ordered_keys = sorted(cls_dic, key=lambda cls: cls_dic[cls])
    weights = [0 for _ in ordered_keys]
    for word in tqdm(vec_words, desc=f"Calculating {cls_type} weights"):
        for form in vec_words[word]['forms']:
            if cls_type in form:
                i = cls_dic[form[cls_type]]
                weights[i] += 1
    # Invert normalized class frequencies so rare classes get larger weights
    weights = normalize(np.asarray(weights).reshape(1, -1))
    weights = np.ones((len(ordered_keys), )) - weights
    rez_items = defaultdict(list)
    cur_cls = None
    for word in tqdm(vec_words, desc=f"Generating classification {cls_type} dataset"):
        y = np.zeros((len(ordered_keys), ), dtype=int)  # np.int was removed in NumPy >= 1.24
        has_classes = False
        for form in vec_words[word]['forms']:
            if cls_type in form:
                cur_cls = form[cls_type]
                y[cls_dic[cur_cls]] = 1
                has_classes = True
        if has_classes:
            rez_items[cur_cls].append({
                'src': word,
                'x': vec_words[word]['vect'],
                'y': y,
                'weight': weights.reshape(-1, 1)[y == 1].max(),
            })
    save_dataset(rez_items, cls_type)
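# Worked example of the weighting above, assuming `normalize` is
# sklearn.preprocessing.normalize (L2 norm by default). Frequent classes end
# up with weights near 0 and rare classes near 1:
import numpy as np
from sklearn.preprocessing import normalize

counts = np.asarray([90, 9, 1]).reshape(1, -1)   # per-class form counts
w = np.ones((3,)) - normalize(counts)            # ~[[0.005, 0.900, 0.989]]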
def create_dataset(file_type, folder, train_diffs, train_msgs,
                   test_diffs, test_msgs, valid_diffs, valid_msgs):
    (train_diffs, train_msgs, train_cnt,
     vocab_diffs, vocab_msgs) = get_dataset(file_type, train_diffs, train_msgs)
    test_diffs, test_msgs, test_cnt, _, _ = get_dataset(
        file_type, test_diffs, test_msgs)
    valid_diffs, valid_msgs, valid_cnt, _, _ = get_dataset(
        file_type, valid_diffs, valid_msgs)
    remove_dir(folder)
    make_dirs(folder)
    save_dataset(folder, "train." + str(train_cnt), train_diffs, train_msgs)
    save_dataset(folder, "test." + str(test_cnt), test_diffs, test_msgs)
    save_dataset(folder, "valid." + str(valid_cnt), valid_diffs, valid_msgs)
    save_vocab(folder, vocab_diffs, vocab_msgs)
def main(task, config):
    # Note: the original compared strings with 'is'; '==' is the correct
    # equality test here ('is' checks object identity, not value).
    if task == 'split_dataset':
        # Split dataset based on review score
        review_scores, review_data = utils.get_review_scores_and_data(
            config['split_dataset']['load_path'])
        utils.save_dataset(review_scores, config['split_dataset']['save_path_scores'])
        for i in range(5):
            score = str(i + 1)
            save_path = config['split_dataset']['save_path_data'] + score + '.json'
            utils.save_dataset(review_data[i], save_path)
    elif task == 'clean_dataset':
        dataset = clean.clean_corpus(config['clean_dataset']['load_path'])
        sents_len = utils.get_sents_len(dataset)
        utils.save_dataset(dataset, config['clean_dataset']['save_path_data'])
        utils.save_dataset(sents_len, config['clean_dataset']['save_path_seq_len'])
    elif task == 'build_vocab':
        dataset = utils.load_dataset(config['build_vocab']['load_path'])
        # Build vocab; drop sentences that are too long or contain
        # words that are not in the vocab
        truncated_dataset = [sent for sent in dataset
                             if len(sent) < config['build_vocab']['max_seq_len']]
        vocab = Vocab.build(truncated_dataset,
                            config['build_vocab']['max_vocab'],
                            config['build_vocab']['min_word_freq'],
                            config['build_vocab']['save_path_vocab_dist'])
        vocab.save(config['build_vocab']['save_path_vocab'])
        train_dataset = utils.remove_unk(truncated_dataset, vocab)
        utils.save_dataset(train_dataset, config['build_vocab']['save_path_data'])
        print('total number of reviews is {}'.format(len(train_dataset)))
    elif task == 'train_model':
        load_path_vocab = config['train_model']['load_path_vocab']
        load_path_data = config['train_model']['load_path_data']
        save_path = config['train_model']['save_path']
        start_epoch = config['train_model']['start_epoch']
        max_epoch = config['train_model']['max_epoch']
        dim_embedding = config['train_model']['model']['dim_embedding']
        dim_hidden = config['train_model']['model']['dim_hidden']
        dim_latent = config['train_model']['model']['dim_latent']
        dropout_prob = config['train_model']['model']['dropout_prob']
        batch_size = config['train_model']['model']['batch_size']
        keep_rate = config['train_model']['model']['keep_rate']
        a = config['train_model']['model']['a']
        b = config['train_model']['model']['b']
        vocab = Vocab.load(load_path_vocab)
        dataset = utils.load_dataset(load_path_data)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if os.path.isfile(save_path):
            vae = torch.load(save_path)
        else:
            vae = Vae(dim_embedding=dim_embedding, dim_hidden=dim_hidden,
                      dim_latent=dim_latent, vocab=vocab,
                      dropout_prob=dropout_prob, device=device)
        vae.to(device)
        optimizer = optim.Adam(vae.parameters())
        for epoch in range(start_epoch, max_epoch):
            train(epoch, vae, optimizer, dataset, batch_size, device, keep_rate, a, b)
            torch.save(vae, save_path)
            utils.set_epoch(epoch)
    elif task == 'detect_anomaly':
        load_path_data = config['detect_anomaly']['load_path_data']
        load_path_model = config['detect_anomaly']['load_path_model']
        save_path = config['detect_anomaly']['save_path']
        sample_size = config['detect_anomaly']['sample_size']
        vae = torch.load(load_path_model)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        vae.to(device)
        vae.eval()
        dataset = utils.load_dataset(load_path_data)
        sents_recon_loss = detect(vae=vae, dataset=dataset,
                                  sample_size=sample_size, device=device)
        utils.save_dataset(sents_recon_loss, save_path)
    elif task == 'test':
        test = []
        dataset = utils.load_dataset('./files/Fashion/anomaly.json')
        for sent in dataset:
            if 5 < len(sent[0][0]) < 50:
                test.append(sent)
        print(len(test))
        utils.save_dataset(test, './files/Fashion/test.json')
    else:
        print('Invalid task')
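# Why the '==' fix in main() matters: 'is' tests object identity, which only
# happens to succeed when CPython interns the string literal. A minimal
# demonstration:
# >>> task = ''.join(['split', '_dataset'])   # built at runtime, not interned
# >>> task == 'split_dataset'
# True
# >>> task is 'split_dataset'
# False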
# best_nrounds = res.shape[0] - 1
# cv_mean = res.iloc[-1, 0]
# cv_std = res.iloc[-1, 1]
# print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))
# print('Best Round: {0}'.format(best_nrounds))
# del dtrain
# del res
# gc.collect()
# exit(0)

cols_ = [x for x in train.columns if 'loss' not in x][1:]
test.to_csv("xgb_other_test.csv")
train.to_csv("xgb_other_train.csv")
utils.save_dataset("xgb_other.npz",
                   train_features=train[cols_].values,  # .as_matrix() was removed in pandas 1.0
                   test_features=test[cols_].values,
                   ids=test["id"],
                   train_labels=train["loss"],
                   feature_names=cols_)

best_nrounds = 20000  # 640 score from the commented-out code above (Faron)
allpredictions = pd.DataFrame()
kfolds = 10  # 10 folds is better!
train_sub = np.zeros(len(train['loss']))
if kfolds > 1:
    # sklearn >= 0.18 API; the original used KFold(n, n_folds=...)
    kf = KFold(n_splits=kfolds, shuffle=True, random_state=int(time.time()))
    for i, (train_index, test_index) in enumerate(kf.split(train)):
        dtest = xgb.DMatrix(test[test.columns[1:]])
        print('Fold {0}'.format(i + 1))
        X_train, X_val = train.iloc[train_index], train.iloc[test_index]
        dtrain = xgb.DMatrix(X_train[cols_], label=X_train.loss)
        dvalid = \
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

import utils

trainf, trainl, testf, test_ids, feature_names = utils.read_dataset('data/no_big_cats1.npz')

extra_trees = ExtraTreesRegressor(n_estimators=100, max_features=.3, max_depth=30,
                                  min_samples_leaf=4, n_jobs=4)
y, trainy = utils.cross_val_model(extra_trees, trainf, trainl, testf)
y = y.reshape((len(y), 1))
trainy = trainy.reshape((len(trainy), 1))
utils.save_dataset("data/random_forest2.npz", train_features=trainy, train_labels=trainl,
                   test_features=y, ids=test_ids, feature_names=['random_forest'])
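# utils.cross_val_model is not shown here. A plausible sketch (an assumption,
# not the repo's actual code) is out-of-fold stacking: fit on K-1 folds,
# predict the held-out fold to build train-set meta-features, and average the
# test-set predictions across folds.
import numpy as np
from sklearn.model_selection import KFold

def cross_val_model(est, trainf, trainl, testf, n_splits=5):
    oof = np.zeros(len(trainf))        # out-of-fold train predictions
    test_pred = np.zeros(len(testf))   # fold-averaged test predictions
    for tr_idx, val_idx in KFold(n_splits=n_splits, shuffle=True,
                                 random_state=0).split(trainf):
        est.fit(trainf[tr_idx], trainl[tr_idx])
        oof[val_idx] = est.predict(trainf[val_idx])
        test_pred += est.predict(testf) / n_splits
    return test_pred, oof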
                   plot_box_reaction_time, plot_detection_false_alarm)
from utils import (skip_run, save_dataset)

# The configuration file
config_path = Path(__file__).parents[1] / 'src/config.yml'
config = yaml.load(open(str(config_path)), Loader=yaml.SafeLoader)

with skip_run('skip', 'create_dataset') as check, check():
    data, dataframe, secondary_dataframe = create_dataframe(
        config['subjects'], config)

    # Save
    save_path = Path(__file__).parents[1] / config['processed_dataframe']
    save_dataset(str(save_path), dataframe, save=True, use_pandas=True)

    save_path = Path(__file__).parents[1] / config['secondary_dataframe']
    save_dataset(str(save_path), secondary_dataframe, save=True, use_pandas=True)

    # save_path = Path(__file__).parents[1] / config['processed_dataset']
    # save_dataset(str(save_path), data, save=True)

with skip_run('skip', 'export_data') as check, check():
    config['save_path'] = 'models/experiment_1/'
    export_dataset(config, name=None)

    config['save_path'] = 'models/experiment_2/'
def train_model(datapath, output, appliance, hparams, doplot=None, reload=True):
    """Train a specific model and appliance."""
    # Load appliance specifications and hyperparameters from settings
    buildings = appliance["buildings"]["train"]
    name = appliance["name"]
    params = appliance["hparams"]
    record_err = np.inf

    # Load whether data transformation is required. See details
    # on data normalization in the documentation.
    transform_enabled = appliance.get("normalization", False)

    # Load the specific network architecture to train
    model_type = appliance.get("model", "ModelPaper")

    # Initialize active settings described in the documentation.
    # Used to identify whether an appliance is classified as active,
    # and to enable oversampling to fix the active/inactive imbalance
    # across sliding windows.
    active_threshold = appliance.get("active_threshold", 0.15)
    active_ratio = appliance.get("active_ratio", 0.5)
    active_oversample = appliance.get("active_oversample", 2)

    transform = None  # Data transformation disabled by default

    # Load train dataset
    my_dataset = InMemoryKoreaDataset(
        datapath,
        buildings,
        name,
        windowsize=params["L"],
        active_threshold=active_threshold,
        active_ratio=active_ratio,
        active_oversample=active_oversample,
        transform_enabled=transform_enabled,
    )
    if transform_enabled:
        # Load dataset transformation parameters from the dataset
        transform = {
            "sample_mean": my_dataset.sample_mean,
            "sample_std": my_dataset.sample_std,
            "target_mean": my_dataset.target_mean,
            "target_std": my_dataset.target_std,
        }
        print(transform)

    # Size train and evaluation datasets
    total_size = len(my_dataset)
    train_size = int(hparams["train_size"] * total_size)
    eval_size = total_size - train_size
    print("============= DATASET =============")
    print(f"Total size: {total_size}")
    print(f"Train size: {train_size}")
    print(f"Eval size: {eval_size}")
    print("===================================")

    print("=========== ARCHITECTURE ==========")
    pprint.pprint(appliance)
    print("===================================")

    # Split and randomize train and evaluation datasets
    train_dataset, eval_dataset = torch.utils.data.random_split(
        my_dataset, (train_size, eval_size)
    )

    # Save the train dataset so it can be reused in later
    # training sessions or for debugging
    filename = os.path.join(output, "dataset.pt")
    save_dataset(transform, train_dataset, eval_dataset, filename)

    # Initialize train dataset loader
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=hparams["batch_size"], shuffle=True
    )
    # Initialize evaluation dataset loader
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset, batch_size=hparams["batch_size"]
    )

    model_type = getattr(nilmmodel, model_type)
    model = model_type(params["L"], params["F"], params["K"], params["H"])
    model = model.to(device)

    # Initialize optimizer
    optimizer = optim.Adam(model.parameters(), hparams["lr"])
    scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

    if reload:
        # Reload a pretrained model to continue previous training sessions
        filename = os.path.join(output, appliance["filename"])
        print("====================================")
        print("Reloading model: ", filename)
        print("====================================")
        transform, record_err = load_model(filename, model, optimizer)

    results = []
    start = datetime.now()
    for epoch in range(hparams["epochs"]):
        # Iterate over training epochs
        filename = os.path.join(output, appliance["filename"] + str(epoch))
        plotfilename = None
        if doplot:
            plotfilename = filename
        err_ = None
        try:
            # Train a single epoch
            loss, err = train_single_epoch(
                epoch,
                model,
                train_loader,
                transform,
                optimizer,
                eval_loader,
                plotfilename,
            )
            print("==========================================")
            print(f"train epoch={epoch} loss={loss:.2f} err={err:.2f}")
            print("==========================================")
            loss_, err_ = eval_single_epoch(model, eval_loader, transform)
            print("==========================================")
            print(f"eval loss={loss_:.2f} err={err_:.2f}")
            print("==========================================")
            # tune.report(eval_loss=loss_)
            results.append([(epoch, loss, err), (epoch, loss_, err_)])
            if err_ < record_err:
                # Compare the current epoch error against the minimum
                # historic error; if the current model beats the best
                # historic one, save it as the new best model.
                filename = os.path.join(output, appliance["filename"])
                save_model(model, optimizer, hparams, appliance, transform,
                           filename, err_)
                record_err = err_
        except Exception as e:
            print(e)
        scheduler.step()
    end = datetime.now()
    total_seconds = (end - start).seconds
    print("------------------------------------------")
    print(f"Total seconds: {total_seconds}")
    print("------------------------------------------")

    # Save model training results
    summary(output, results)
    return model, transform
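# Hypothetical invocation of train_model (all values below are illustrative;
# the real appliance config comes from the project's settings file):
# appliance = {"name": "fridge", "buildings": {"train": [1, 2]},
#              "hparams": {"L": 496, "F": 32, "K": 5, "H": 256},
#              "filename": "fridge.pt"}
# hparams = {"train_size": 0.8, "batch_size": 64, "lr": 1e-3, "epochs": 10}
# model, transform = train_model("data/korea.h5", "output/", appliance, hparams)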
from utils import load_clean_dataset, save_dataset, load_dataset
import numpy as np

trainDocs, ytrain = load_clean_dataset(True)
testDocs, ytest = load_clean_dataset(False)
save_dataset([trainDocs, ytrain], "data/train.pkl")
save_dataset([testDocs, ytest], "data/test.pkl")
# (dataset, labels) = load_dataset("data/train.pkl")
# print(np.array(dataset).shape)
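# save_dataset/load_dataset come from a local utils module that is not shown.
# Given the .pkl paths, a minimal sketch assuming plain pickle serialization:
import pickle

def save_dataset(dataset, filename):
    # Assumption: objects are pickled as-is
    with open(filename, 'wb') as f:
        pickle.dump(dataset, f)

def load_dataset(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)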
    if args.use_citation_model:
        job = p.apply_async(f, [iterator[i], model, tokenizer])
    else:
        job = p.apply_async(f, [iterator[i], None, None])
    jobs.append(job)

results = []
print(f"Started processing graph using {metadata_to_use} data, "
      f"with {num_workers} workers.")
if args.use_citation_model:
    print("I am using the NER model")
else:
    print("I am NOT using the NER model")

for job in tqdm(jobs):
    results.append(job.get())

add_to_graph(results, G, is_goodreads=args.use_goodreads)
if args.use_goodreads:
    pickle.dump(G, open("pickled_graphs/complete_graph_by_script.p", "wb"))
else:
    pickle.dump(G, open("pickled_graphs/small_graph.p", "wb"))
save_dataset(results, 'to_annotate/output_ner_multi.jsonl')
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

import utils

trainf, trainl, testf, test_ids, feature_names = utils.read_dataset('data/factorized1.npz')

extra_trees = ExtraTreesRegressor(n_estimators=100, max_features=50, max_depth=35,
                                  min_samples_leaf=4, n_jobs=4)
y, trainy = utils.cross_val_model(extra_trees, trainf, trainl, testf)
y = y.reshape((len(y), 1))
trainy = trainy.reshape((len(trainy), 1))
utils.save_dataset("data/extra_trees_factorized.npz", train_features=trainy,
                   train_labels=trainl, test_features=y, ids=test_ids,
                   feature_names=['extra_trees'])
                                                          num_songs_genre='')
print('X_train shape:', X_train.shape)
# X_test, y_test, num_frames_test = extract_melgrams(test_songs_list, MULTIFRAMES, process_all_song=False, num_songs_genre=10)

y_train = np.loadtxt(train_songs_tags, dtype=int)
print(X_train.shape, 'train samples')
# print(X_test.shape, 'test samples')

# y_train = np.array(y_train)
# y_test = np.array(y_test)

if SAVE_DB:
    if MULTIFRAMES:
        save_dataset('music_dataset_spectogram/train/music_dataset_multiframe_train.h5',
                     X_train, y_train, num_frames_train)
        # save_dataset('music_dataset/music_dataset_multiframe_test.h5', X_test, y_test, num_frames_test)
    else:
        save_dataset('music_dataset_spectogram/train/music_dataset_FMA.h5',
                     X_train, y_train, 0)

Y_train = np_utils.to_categorical(y_train, nb_classes)
# Y_test = np_utils.to_categorical(y_test, nb_classes)
print('Shape labels y_train: ', Y_train.shape)
# print('Shape labels y_test: ', Y_test.shape)

# Initialize model
model = music_tagger_wrapper(LOAD_WEIGHTS)
# model = MusicTaggerCNN(weights='msd', input_tensor=(1, 96, 1366))
else:
    X_train, y_train, num_frames_train = extract_melgrams(
        train_songs_list, MULTIFRAMES, process_all_song=False, num_songs_genre=20)
    print('X_train shape:', X_train.shape)
    X_test, y_test, num_frames_test = extract_melgrams(
        test_songs_list, MULTIFRAMES, process_all_song=False, num_songs_genre=10)

print(X_train.shape, 'train samples')
print(X_test.shape, 'test samples')

y_train = np.array(y_train)
y_test = np.array(y_test)

if SAVE_DB:
    if MULTIFRAMES:
        save_dataset('music_dataset/music_dataset_multiframe_train.h5',
                     X_train, y_train, num_frames_train)
        save_dataset('music_dataset/music_dataset_multiframe_test.h5',
                     X_test, y_test, num_frames_test)
    else:
        save_dataset('music_dataset/music_dataset.h5', X_train, X_test, y_train, y_test)

Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
print('Shape labels y_train: ', Y_train.shape)
print('Shape labels y_test: ', Y_test.shape)

# Initialize model
model = MusicTaggerCRNN(weights='msd', input_tensor=(1, 96, 1366))
# model = MusicTaggerCNN(weights='msd', input_tensor=(1, 96, 1366))
from argparse import ArgumentParser, Namespace
from typing import Dict, List

import pandas as pd

from utils import read_dataset, save_dataset


def process_dataset(test_dataset: List[Dict[str, str]],
                    train_dataset: List[Dict[str, str]]):
    # Keep only test entities that never appear in the train set
    train_df = pd.DataFrame(train_dataset)
    test_df = pd.DataFrame(test_dataset)
    refined_set = test_df[~test_df.entity_text.isin(train_df.entity_text)]
    return refined_set.drop_duplicates().to_dict('records')


def get_arguments() -> Namespace:  # parse_args() returns a Namespace, not an ArgumentParser
    parser = ArgumentParser()
    parser.add_argument('--train_data_folder')
    parser.add_argument('--test_data_folder')
    parser.add_argument('--save_to')
    return parser.parse_args()


if __name__ == '__main__':
    args = get_arguments()
    train_dataset = read_dataset(args.train_data_folder)
    test_dataset = read_dataset(args.test_data_folder)
    refined_test_set = process_dataset(test_dataset, train_dataset)
    save_dataset(refined_test_set, args.save_to)
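# Tiny illustration of process_dataset on hypothetical in-memory records
# (real data comes from the read_dataset folders): train entities are removed
# from the test set, and duplicates collapse to a single record.
# >>> train = [{'entity_text': 'Paris'}]
# >>> test = [{'entity_text': 'Paris'}, {'entity_text': 'Berlin'}, {'entity_text': 'Berlin'}]
# >>> process_dataset(test, train)
# [{'entity_text': 'Berlin'}]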
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model


trainX, Y = load_dataset('data/train.pkl')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainX)
length = max([len(s.split()) for s in trainX])
vocab_size = len(tokenizer.word_index) + 1
print("Max document length: {:d}".format(length))
print("Vocab size: {:d}".format(vocab_size))

# Encode and pad the data
encoded = tokenizer.texts_to_sequences(trainX)
X = pad_sequences(encoded, maxlen=length, padding='post')

model = define_model(length, vocab_size)
# plot_model(model, to_file='model.png')
model.fit([X, X, X], Y, batch_size=16, epochs=7)
model.save('model.h5')
save_dataset(tokenizer, 'tokenizer.pkl')
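# At inference time the saved artifacts would be restored symmetrically
# (assuming the pickle-based save_dataset/load_dataset from the train.pkl
# snippet earlier; the calls below are illustrative):
# tokenizer = load_dataset('tokenizer.pkl')
# seq = pad_sequences(tokenizer.texts_to_sequences(['new document']),
#                     maxlen=length, padding='post')
# model = keras.models.load_model('model.h5')
# prediction = model.predict([seq, seq, seq])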
    if opts.filename is None:
        filename = os.path.join(
            datadir,
            "{}{}{}_{}_seed{}.pkl".format(
                problem,
                "_{}".format(distribution) if distribution is not None else "",
                graph_size, opts.name, opts.seed))
    else:
        filename = check_extension(opts.filename)

    assert opts.f or not os.path.isfile(check_extension(filename)), \
        "File already exists! Try running with -f option to overwrite."

    np.random.seed(opts.seed)
    if problem == 'tsp':
        dataset = generate_tsp_data(opts.dataset_size, graph_size)
    elif problem == 'vrp':
        dataset = generate_vrp_data(opts.dataset_size, graph_size)
    elif problem == 'pctsp':
        dataset = generate_pctsp_data(opts.dataset_size, graph_size)
    elif problem == "op":
        dataset = generate_op_data(opts.dataset_size, graph_size, prize_type=distribution)
    else:
        assert False, "Unknown problem: {}".format(problem)

    print(dataset[0])
    save_dataset(dataset, filename)
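# check_extension and save_dataset are defined elsewhere in this script's
# repo; a minimal sketch consistent with the .pkl naming above (an assumption,
# not necessarily the exact implementation):
import os
import pickle

def check_extension(filename):
    # Ensure the dataset filename ends in .pkl
    return filename if filename.endswith('.pkl') else filename + '.pkl'

def save_dataset(dataset, filename):
    filedir = os.path.split(filename)[0]
    if filedir and not os.path.isdir(filedir):
        os.makedirs(filedir)
    with open(check_extension(filename), 'wb') as f:
        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)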