def try_list(datapath, modelargs, outfile="ensemble_list.txt"):
    """
    Score a list of models, to find the best of a fixed number of
    combinations for ensembling. Expects `modelargs` to be a list of
    model descriptions; see predict().
    """
    df = pd.read_csv(datapath)
    df = preproc.preproc(df, lower=True)
    with open(datapath) as f:
        gt = f.readlines()[1:]
    while len(modelargs) > 0:
        ensemble = modelargs.pop()
        if len(set(ensemble)) != len(ensemble):
            logger.warning("Removing duplicates from ensemble: {}".format(str(ensemble)))
            ensemble = list(set(ensemble))
        results = predict(list(ensemble), list(df['text']), None)
        lines = []
        for i, result in enumerate(results):
            line = "{},{},{},{}".format(df['text'][i], df['sex'][i],
                                        df['age'][i], result)
            lines.append(line)
        en_score = score(gt, lines)
        print("{}: {}".format(en_score, str(ensemble)))
        with open(outfile, "a") as f:
            f.write("{}: {}\n".format(en_score, str(ensemble)))
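# Usage sketch (hypothetical checkpoint paths): each model description is
# a (family, checkpoint_dir, extra) tuple of the kind try_permutations()
# builds below.
#
#   try_list("valid5000.csv",
#            [[("bert", "cp/bert/01", ""), ("xlnet", "cp/xlnet/09", "")]],
#            outfile="ensemble_list.txt")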
def main(datapath: ("The CSV file to use as input", "option", "d", str) = "valid5000.csv", permute: ("Permute the order of words in sentences randomly", "flag", "p") = False, reverse: ("Reverse the order of words in sentences", "flag", "r") = False, search: ("Do an exhaustive search to find the best ensemble of SEARCH models", "option", "s", int) = 0, models: ("Path to a file with a list of models", "option", "m") = ""): global prediction_cache_file global prediction_cache prediction_cache_file = datapath + ".prediction_cache.pickle" if os.path.exists(prediction_cache_file): with open(prediction_cache_file, "rb") as f: prediction_cache = pickle.load(f) logger.warning("Loaded prediction cache from {}".format( prediction_cache_file)) if search > 0: try_permutations(datapath, num_ensembles=search, outfile="tmp.txt") sys.exit() elif models != "": already_tried = load_tried_set("tmp.txt") already_tried = truncate_ensembles(already_tried) to_try = load_tried_set(models) to_try = truncate_ensembles(to_try) ensembles = to_try - already_tried logger.info("List of {} ensembles to try: {}".format( len(ensembles), ensembles)) try_list(datapath, ensembles, outfile="tmp.txt") sys.exit() ensemble = build_ensemble() df = pd.read_csv(datapath) df_nonums = pd.read_csv(datapath) df = preproc.preproc(df, lower=True) #, spelling=True) df_nonums = preproc.preproc(df_nonums, lower=True, nonumbers=True) if permute: results = permute_predict(ensemble, df) elif reverse: results = reverse_predict(ensemble, df) else: results = predict(ensemble, list(df['text']), list(df_nonums['text'])) lines = [] for i, result in enumerate(results): line = "{},{},{},{}".format(df['text'][i], df['sex'][i], df['age'][i], results[i]) lines.append(line) write_solution(lines) with open(datapath) as f: gt = f.readlines() print("Score: {}".format(score(gt[1:], lines)))
def fit_score(word, pos, batch_pmass):
    """
    Convenience wrapper for the score function found in main.py.
    It also happens to hide the ugliness of passing a ton of
    arguments to it.
    """
    return score(pos, word, english_pmass, batch_pmass,
                 wts["english_freq"], wts["batch_freq"],
                 wts["starts_word"], wts["repetitions"],
                 wts["avoid_vowels"], wts["consecutives"])
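# Sketch of the module-level state fit_score() relies on. The weight
# values are purely illustrative; only the key names come from the call
# above, and english_pmass is assumed to be a precomputed letter
# probability-mass table.
#
#   wts = {
#       "english_freq": 1.0,
#       "batch_freq": 1.0,
#       "starts_word": 1.0,
#       "repetitions": 1.0,
#       "avoid_vowels": 1.0,
#       "consecutives": 1.0,
#   }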
def main(modelpath, datapath):
    import main
    logger.info("Loading model")
    model = Model.load_model(modelpath)
    df = pd.read_csv(datapath)
    logger.info("Predicting")
    ret, _ = model.predict_df(df)
    solution = []
    for i in tqdm.tqdm(range(len(ret))):
        solution.append(ret[i] + 1)
    with open(datapath) as f:
        gt = f.readlines()
    print("Score: {}".format(main.score(gt[1:], solution)))
def batch_fit():
    X, Y, prediction_data, prediction_labels = get_Data()
    print('Train data length = {0}, Test data length = {1}'.format(
        len(X), len(prediction_data)))
    X = np.array(X) / 255.0
    Y = np.array(Y)
    prediction_data = np.array(prediction_data) / 255.0
    prediction_labels = np.array(prediction_labels)
    N, D = X.shape
    M = 100  # hidden layer size
    K = 7    # number of classes

    # Randomly initialize weights
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M + K)
    b2 = np.zeros(K)

    learning_rate = 5 * 10e-7  # = 5e-6
    costs = []
    best_validation_error = 1
    batch_sz = 500
    n_batches = int(N / batch_sz)

    # Momentum terms
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    reg = 0.01

    # RMSProp caches
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 1e-10

    for m in range(1000):
        tmpX, tmpY = shuffle(X, Y)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz)]
            n = len(x)
            # One-hot encode the batch targets
            T = np.zeros((n, K))
            for i in range(n):
                T[i, y[i]] = 1

            output, hidden = forward(x, W1, b1, W2, b2)

            # L2-regularized gradients
            gW2 = derivative_w2(hidden, T, output) + reg * W2
            gb2 = derivative_b2(T, output) + reg * b2
            gW1 = derivative_w1(x, W2, hidden, T, output) + reg * W1
            gb1 = derivative_b1(T, output, W2, hidden) + reg * b1

            # Momentum updates
            dW1 = dW1 * mu + learning_rate * gW1
            dW2 = dW2 * mu + learning_rate * gW2
            db1 = db1 * mu + learning_rate * gb1
            db2 = db2 * mu + learning_rate * gb2

            # RMSProp cache updates
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1

            W2 += (dW2 / (np.sqrt(cache_W2) + eps))
            b1 += (db1 / (np.sqrt(cache_b1) + eps))
            W1 += (dW1 / (np.sqrt(cache_W1) + eps))
            b2 += (db2 / (np.sqrt(cache_b2) + eps))

            if j % (n_batches // 2) == 0:
                c = cost(T, output)
                P = np.argmax(output, axis=1)
                r = score(y, P)
                # nb: computed on the current training batch, despite the name
                e = error_rate(y, P)
                if e < best_validation_error:
                    best_validation_error = e
                print('Cost = {0}, Training Score = {1}, Error = {2}'.format(
                    c, r, e))
                costs.append(c)

    print("best_validation_error:", best_validation_error)
    plt.plot(costs)
    plt.show()

    # Final evaluation on the held-out data
    output, hidden = forward(prediction_data, W1, b1, W2, b2)
    P = np.argmax(output, axis=1)
    print('Test Score {0}'.format(score(prediction_labels, P)))
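# Minimal sketch of the update rule batch_fit uses, isolated for a single
# parameter: a momentum step whose magnitude is rescaled element-wise by an
# RMSProp cache of squared gradients. Names and default values here are
# illustrative; the additive sign convention (+=) mirrors the code above.
import numpy as np

def momentum_rmsprop_step(w, g, dw, cache, lr=5e-6, mu=0.9,
                          decay_rate=0.999, eps=1e-10):
    dw = mu * dw + lr * g                                   # momentum term
    cache = decay_rate * cache + (1 - decay_rate) * g * g   # squared-gradient cache
    w = w + dw / (np.sqrt(cache) + eps)                     # cache-scaled step
    return w, dw, cache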
parser = argparse.ArgumentParser()
parser.add_argument('file', type=str)
args = parser.parse_args()

n_books, n_libs, n_days, scores_of_books, libs = load_libraries(args.file)
scores_of_books_dict = dict(enumerate(scores_of_books))

# Each lib is (n_books_in_lib, signup_days, books_shippable_per_day, book_ids).
library_signup_times = [lib[1] for lib in libs]
library_ship_capacities = [lib[2] for lib in libs]

solution = order_libraries(libs, scores_of_books_dict, n_days)
print(solution)
print(scores_of_books)

scores_of_books = np.asarray(scores_of_books, dtype=np.uint)
s = score(solution, n_days, scores_of_books, library_signup_times,
          library_ship_capacities)
print(s)
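# A minimal sketch of what score() could compute for this book-scanning
# setup, assuming `solution` is an ordered list of (library_id, book_ids)
# pairs. That structure and the body below are assumptions; only the
# argument list mirrors the call above. Libraries sign up sequentially,
# then ship books until the deadline, and each distinct book counts once.
def score_sketch(solution, n_days, book_scores, signup_times, ship_capacities):
    day = 0
    scanned = set()
    for lib_id, book_ids in solution:
        day += signup_times[lib_id]
        if day >= n_days:
            break
        # Total books this library can still ship before the deadline.
        capacity = (n_days - day) * ship_capacities[lib_id]
        scanned.update(book_ids[:capacity])
    return int(sum(book_scores[b] for b in scanned))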
def hello():
    return score()
with open("responses.json") as file: data = json.load(file) d1 = dict(list(data.items())[:len(data) // 2]) d2 = dict(list(data.items())[len(data) // 2:]) propscrDict = {} const1 = 0 const2 = 0 while len(d1) != 0 and len(d2) != 0: scoreDict = {} for men in d2: scoreDict[men] = [] for women in d1: value = score(d2[men], d1[women]) if value > 0: scoreDict[men].append((women, value)) scoreDict[men].sort(key=lambda x: x[1], reverse=True) for proposer in scoreDict: if len(scoreDict[proposer]) == 0: continue if scoreDict[proposer][0][0] not in propscrDict.keys(): propscrDict[scoreDict[proposer][0][0]] = ( proposer, scoreDict[proposer][0][1]) else: if scoreDict[proposer][0][1] > propscrDict[scoreDict[proposer][0] [0]][1]: propscrDict[scoreDict[proposer][0][0]] = (
def try_permutations(datapath, num_ensembles=8, outfile="tmp.txt"):
    """
    Brute-force random walk over the list of available checkpoints.
    Tries combinations at random to find the highest-scoring ensemble.
    Never returns.
    """
    DIRS = [
        "cp/bert/*/pytorch_model.bin",
        "cp/roberta/*/pytorch_model.bin",
        "cp/xlnet/*/pytorch_model.bin",
        "cp/xlnet/09/checkpoint-21000",
        "cp/xlnet/07/checkpoint-42500",
        "cp/distilbert/*/pytorch_model.bin",
    ]
    score_hash = {}
    tried = load_tried_set(outfile)
    for models in tried:
        score_hash[tuple(set(models))] = 0.0
    full_list = []
    df = pd.read_csv(datapath)
    df = preproc.preproc(df, lower=True)
    with open(datapath) as f:
        gt = f.readlines()[1:]
    gt = [int(x.split(',')[-1][:-1]) for x in gt]
    for d in DIRS:
        for path in glob.glob(d):
            full_list.append(path.replace("/pytorch_model.bin", ""))
    logger.warning("Models to permute: {}".format(str(full_list)))
    combo_list = list(itertools.combinations(full_list, num_ensembles))
    random.shuffle(combo_list)
    df_text_list = list(df['text'])
    progbar = tqdm.tqdm(
        combo_list,
        total=len(combo_list) -
        len(list(filter(lambda x: len(x) == num_ensembles, tried))),
        desc="HiScore: 0.0")
    hiscore = 0.0
    buf = []
    for paths in progbar:
        buf.clear()
        for path in paths:
            path = path.strip()
            # Check 'roberta'/'distilbert' before 'bert', since 'bert' is a
            # substring of both.
            if 'roberta' in path:
                buf.append(("roberta", path, ""))
            elif 'distilbert' in path:
                buf.append(("distilbert", path, ""))
            elif 'bert' in path:
                buf.append(("bert", path, ""))
            elif 'xlnet' in path:
                buf.append(('xlnet', path, ''))
            else:
                logger.error("Unrecognized checkpoint path: {}".format(path))
                sys.exit()
        # Deduplicate and skip ensembles that have already been scored.
        buf_tuple = tuple(set(buf))
        if buf_tuple in score_hash:
            continue
        results = predict(buf_tuple, df_text_list, None)
        en_score = score(gt, results)
        score_hash[buf_tuple] = en_score
        if en_score > hiscore:
            hiscore = en_score
            progbar.set_description("HiScore: {}".format(hiscore))
        with open(outfile, "a") as f:
            f.write("{}: {}\n".format(en_score, str(buf)))
def test2(self):
    self.assertEqual(main.score([4, 4, 4, 3, 3]), 400)

def test1(self):
    self.assertEqual(main.score([2, 3, 4, 6, 2]), 0)

def test3(self):
    self.assertEqual(main.score([2, 4, 4, 5, 4]), 450)
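# A minimal sketch of a main.score() consistent with these tests, assuming
# the usual greed-dice rules (the implementation is an assumption; only the
# three expected values above come from the tests): a triple of a face
# scores 100 * face (1000 for three 1s), leftover 1s score 100 each, and
# leftover 5s score 50 each.
from collections import Counter

def score_sketch(dice):
    total = 0
    for face, count in Counter(dice).items():
        if count >= 3:
            total += 1000 if face == 1 else 100 * face
            count -= 3
        if face == 1:
            total += 100 * count
        elif face == 5:
            total += 50 * count
    return total

assert score_sketch([4, 4, 4, 3, 3]) == 400
assert score_sketch([2, 3, 4, 6, 2]) == 0
assert score_sketch([2, 4, 4, 5, 4]) == 450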
nb_train = 2000000

#%%
x0 = data_array[:nb_train, 0:60]  # - data_array[:nb_train, 30:60]
train_y = data_array[:nb_train, 60]
x1 = data_array[nb_train:, 0:60]  # - data_array[nb_train:, 30:60]
test_y = data_array[nb_train:, 60]
# nb: LinearRegression's `normalize` argument was removed in scikit-learn 1.2.
reg = LinearRegression(normalize=True).fit(x0, train_y)

#%%
train_pred = reg.predict(x0)
test_pred = reg.predict(x1)
print('train score: {:.5f}'.format(score(train_y, train_pred)))
print('test score: {:.5f}'.format(score(test_y, test_pred)))

#%% [markdown]
# ### linear model with normalized data

#%%
with open('normed_data.pickle', 'rb') as f:
    normed_data = pickle.load(f)

#%%
nb_train = 2000000
# train data
train_x = normed_data[:nb_train, :60]
train_y = normed_data[:nb_train, 60]
# test data