import argparse

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# WordDataset, prep_data and evaluate come from the project's own modules.


def main():
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--model_file', type=str)
    argparser.add_argument('--cpu', action='store_true')
    argparser.add_argument('--cuda', action='store_true')
    argparser.add_argument('--chunk_len', type=int, default=200)
    argparser.add_argument('--batch_size', type=int, default=300)
    argparser.add_argument('--num_workers', type=int, default=8)
    argparser.add_argument('filename', type=str)
    args = argparser.parse_args()

    # Load the trained model, remapping GPU tensors to CPU if requested.
    if args.cpu:
        decoder = torch.load(args.model_file,
                             map_location=lambda storage, loc: storage)
    else:
        decoder = torch.load(args.model_file)

    dataset = WordDataset(args.filename, args.chunk_len)
    dataloader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            num_workers=args.num_workers,
                            drop_last=True)
    criterion = nn.CrossEntropyLoss()

    # Average the per-batch evaluation loss over the whole file.
    loss, num_samples = 0, 0
    for sample in dataloader:
        input_, target = prep_data(sample['input'], sample['target'], args.cuda)
        loss += evaluate(decoder, criterion, input_, target,
                         args.batch_size, args.chunk_len, args.cuda)
        num_samples += 1
    loss /= num_samples
    print('Loss (BPC): {:.2f}'.format(loss))
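The evaluate() helper used above is project code that is not shown. A minimal sketch follows, assuming the usual char-RNN interface (an init_hidden() method and one-character-at-a-time stepping); the signature matches the call site, but the body is an assumption. Note also that nn.CrossEntropyLoss is measured in nats, so the printed "BPC" is off by a factor of log 2 unless the helper divides it out.

# A minimal sketch of evaluate(), assuming a CharRNN-style API with an
# init_hidden() method; the real helper may differ.
def evaluate(decoder, criterion, input_, target, batch_size, chunk_len, cuda):
    hidden = decoder.init_hidden(batch_size)  # assumed decoder method
    if cuda:
        hidden = hidden.cuda()
    loss = 0
    with torch.no_grad():  # no gradients needed during evaluation
        for c in range(chunk_len):
            output, hidden = decoder(input_[:, c], hidden)
            loss += criterion(output.view(batch_size, -1), target[:, c])
    return loss.item() / chunk_len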
def predict(player_id):
    X = fetch_player_data(player_id)
    date = X['date'].max()
    rolling = prep_data(X)
    goals = round(pipe.predict(rolling)[0], 2)
    df = pd.DataFrame({
        'date_created': pd.Timestamp('now'),
        'player_id': [player_id],
        'last_game': [date],
        'goals': [goals],
    })
    df.to_sql('predictions', con, if_exists='append', index=False)
    print('Success!')
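fetch_player_data() is assumed rather than shown. Given the sqlite setup used elsewhere in the project, a hypothetical version could look like this; the table and column names are assumptions drawn from the other snippets:

# Hypothetical fetch_player_data(): pulls a player's full game log from the
# same `players` table queried in the standalone script. Column names are
# assumed from how the result is used (X['date'].max()).
def fetch_player_data(player_id):
    return pd.read_sql(
        'select * from players where player_id = ? order by date asc',
        con, params=(player_id,))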
import argparse
import os
import sys
import time

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm

# CharRNN, WordDataset, prep_data, train, evaluate, model_file_name,
# time_since and n_characters come from the project's own modules.


def main():
    # Parse command line arguments
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--train_set', type=str, required=True)
    argparser.add_argument('--valid_set', type=str, required=True)
    argparser.add_argument('--model', type=str, default='gru')
    argparser.add_argument('--model_file', type=str, default='None')
    argparser.add_argument('--n_epochs', type=int, default=30)
    argparser.add_argument('--hidden_size', type=int, default=200)
    argparser.add_argument('--n_layers', type=int, default=3)
    argparser.add_argument('--learning_rate', type=float, default=0.01)
    argparser.add_argument('--chunk_len', type=int, default=200)
    argparser.add_argument('--batch_size', type=int, default=300)
    argparser.add_argument('--num_workers', type=int, default=8)
    argparser.add_argument('--cuda', action='store_true')
    argparser.add_argument('--cpu', action='store_true')
    args = argparser.parse_args()

    # Initialize a fresh model, or resume from a checkpoint whose
    # hyperparameters are encoded in its filename.
    if args.model_file == 'None':
        decoder = CharRNN(
            n_characters,
            args.hidden_size,
            n_characters,
            model=args.model,
            n_layers=args.n_layers,
        )
        epoch_from = 1
        prev_valid_loss = sys.maxsize
        old_filename = None
    else:
        if args.cpu:
            decoder = torch.load(args.model_file,
                                 map_location=lambda storage, loc: storage)
        else:
            decoder = torch.load(args.model_file)
        info = args.model_file.split('_')
        args.model = info[0]
        epoch_from = int(info[1][5:]) + 1
        args.n_layers = int(info[2][7:])
        args.hidden_size = int(info[5][2:])
        prev_valid_loss = float(info[7][4:-3])
        old_filename = args.model_file
        print('Successfully loaded model! Continuing from epoch {0} '
              'with validation loss {1}'.format(epoch_from, prev_valid_loss))

    optimizer = torch.optim.Adam(decoder.parameters(), lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    if args.cuda:
        decoder.cuda()

    start = time.time()
    train_dataset = WordDataset(args.train_set, args.chunk_len)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  drop_last=True)
    valid_dataset = WordDataset(args.valid_set, args.chunk_len)
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  drop_last=True)

    try:
        print('Training for maximum {} epochs...'.format(args.n_epochs))
        for epoch in range(epoch_from, args.n_epochs + 1):
            train_loss, num_samples = 0, 0
            for s in tqdm(train_dataloader):
                input_, target = prep_data(s['input'], s['target'], args.cuda)
                train_loss += train(decoder, optimizer, criterion, input_,
                                    target, args.batch_size, args.chunk_len,
                                    args.cuda)
                num_samples += 1
            train_loss /= num_samples

            valid_loss, num_samples = 0, 0
            for s in valid_dataloader:
                input_, target = prep_data(s['input'], s['target'], args.cuda)
                valid_loss += evaluate(decoder, criterion, input_, target,
                                       args.batch_size, args.chunk_len,
                                       args.cuda)
                num_samples += 1
            valid_loss /= num_samples

            elapsed = time_since(start)
            pcnt = epoch / args.n_epochs * 100
            log = ('{} elapsed - epoch #{} ({:.1f}%) - training loss (BPC) '
                   '{:.2f} - validation loss (BPC) {:.2f}')
            print(log.format(elapsed, epoch, pcnt, train_loss, valid_loss))

            # Early stopping: once validation loss rises, we are overfitting.
            if valid_loss > prev_valid_loss:
                print('No longer learning, just overfitting, stopping here.')
                break
            else:
                filename = model_file_name(decoder, epoch, train_loss,
                                           valid_loss)
                torch.save(decoder, filename)
                print('Saved as {}'.format(filename))
                # Keep only the latest checkpoint on disk.
                if old_filename:
                    os.remove(old_filename)
                old_filename = filename
                prev_valid_loss = valid_loss
    except KeyboardInterrupt:
        print('Saving before quit...')
        try:
            valid_loss
        except NameError:  # interrupted before the first validation pass
            valid_loss = 'no_val'
        filename = model_file_name(decoder, epoch, train_loss, valid_loss)
        torch.save(decoder, filename)
        print('Saved as {}'.format(filename))
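Two more helpers are assumed here. model_file_name() can be reverse-engineered from the split('_') parsing in the resume branch, which expects fields like epoch{n}, nlayers{n}, hs{n} and a trailing loss{x}.pt; time_since() is the usual elapsed-time formatter. Both bodies below are inferences, not the project's actual code, and the decoder attributes (model, n_layers, hidden_size) are assumptions.

import math
import time

# Inferred from the filename parsing above (info[1][5:], info[2][7:],
# info[5][2:], info[7][4:-3]); the field order is an assumption.
def model_file_name(decoder, epoch, train_loss, valid_loss):
    return '{}_epoch{}_nlayers{}_train_loss{}_hs{}_valid_loss{}.pt'.format(
        decoder.model, epoch, decoder.n_layers, train_loss,
        decoder.hidden_size, valid_loss)

# Standard elapsed-time helper; formats seconds since `since` as "Xm Ys".
def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    return '{}m {}s'.format(m, int(s - m * 60))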
import pickle
import sqlite3

import pandas as pd

from helpers import prep_data

con = sqlite3.connect('data/hockey.db')

player_id = 'ovechal01'
# Use a parameterized query rather than interpolating the id into the SQL.
new = pd.read_sql(
    '''
    select *
    from players
    where player_id = ?
    order by date asc
    limit 5
    ''', con, params=(player_id,))

X = prep_data(new)

with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

pipe.predict(X)[0]
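For context, the matching save side that would have produced pickles/pipe.pkl is the standard pickle dump; `pipe` stands for whatever fitted estimator the project trained:

# Save side (assumed): serialize the fitted pipeline once after training.
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)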
def predict(player_id):
    raw = fetch_player_data(player_id)
    features = prep_data(raw)
    return round(pipe.predict(features)[0], 2)
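With con, pipe, and the helpers in scope as in the script above, usage is a one-liner:

# Predicted goals for the example player id used earlier.
predict('ovechal01')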
def predict(self, payload):  # receives a user_id, outputs a recommendation_id
    """Called once per request. Runs preprocessing of the request payload,
    inference, and postprocessing of the inference output. Required.

    Args:
        payload: The parsed JSON request payload.

    Returns:
        Prediction or a batch of predictions.
    """
    self.model.connect_db()
    user_id = payload

    # Bail out early if we have no Letterboxd ratings for this user.
    query = "SELECT EXISTS(SELECT 1 FROM user_letterboxd_ratings WHERE user_id=%s);"
    self.model.cursor_dog.execute(query, (user_id,))
    user_exists = self.model.cursor_dog.fetchone()[0]
    if not user_exists:
        self.model.cursor_dog.close()
        self.model.connection.close()
        return "user_id not found"

    self.model.cursor_dog.execute(
        "SELECT date, name, year, letterboxd_uri, rating "
        "FROM user_letterboxd_ratings WHERE user_id=%s;", (user_id,))
    ratings_sql = self.model.cursor_dog.fetchall()
    ratings = pd.DataFrame(
        ratings_sql,
        columns=['Date', 'Name', 'Year', 'Letterboxd URI', 'Rating'])
    ratings = ratings.dropna()

    # Prepare data. Watchlist/watched data could also be passed here,
    # but only the ratings table is used for now.
    good_list, bad_list, hist_list, val_list, ratings_dict = prep_data(
        ratings, watched_df=None, watchlist_df=None,
        good_threshold=3, bad_threshold=2)

    # Run prediction with parameters.
    predictions = self.model.predict(
        good_list, bad_list, hist_list, val_list, ratings_dict,
        n=20, harshness=4, rec_movies=True, scoring=True)

    # Turn predictions into JSON: one dict per recommended movie.
    names = ['Title', 'Year', 'IMDB URL', 'Average Rating',
             'Number of Votes', 'Similarity Score', 'IMDB ID']
    results_dict = [dict(zip(names, prediction)) for prediction in predictions]
    recommendation_json = json.dumps(results_dict)

    # Commit to the database, keyed by an md5 of the recommendation JSON so
    # identical recommendations are stored only once.
    hash_object = hashlib.md5(recommendation_json.encode('ascii'))
    recommendation_id = hash_object.hexdigest()
    query = "SELECT EXISTS(SELECT 1 FROM recommendations WHERE recommendation_id=%s);"
    self.model.cursor_dog.execute(query, (recommendation_id,))
    already_stored = self.model.cursor_dog.fetchone()[0]
    date = datetime.now()
    if already_stored:
        self.model.cursor_dog.close()
        self.model.connection.close()
        return "Already recommended", recommendation_json
    else:
        query = ("INSERT INTO recommendations"
                 "(user_id, recommendation_id, recommendation_json, date) "
                 "VALUES (%s, %s, %s, %s);")
        self.model.cursor_dog.execute(
            query, (user_id, recommendation_id, recommendation_json, date))
        self.model.connection.commit()
        self.model.cursor_dog.close()
        self.model.connection.close()
        return "Recommendation committed to DB with id:", recommendation_id
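The surrounding predictor class is not shown. In a Cortex-style serving setup the framework constructs the class once and then calls predict() per request; a hypothetical local invocation, with the class name and constructor signature as assumptions, would look like:

# Hypothetical local test; the serving framework normally handles
# construction and request routing. PythonPredictor and its config
# argument are assumptions, not confirmed by the snippet above.
predictor = PythonPredictor(config={})
result = predictor.predict('some_user_id')  # payload is just the user_id
print(result)  # "user_id not found", or a (status, id/json) tuple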