def run(ModelType, args):
    """Train and evaluate a LaIR wait-time or help-time model.

    ModelType: LogisticRegression (bucketed classification over time
        ranges) or a regression model class (predicts raw minutes).
    args: parsed CLI args; reads args.time ('w' = wait, 'h' = help) and
        args.buckets (bucket count used only for classification).
    """
    print("\n********* %s %s Model *********" % (
        ("Logistic" if ModelType == LogisticRegression else "Linear"),
        ("Wait" if args.time == 'w' else "Help")))

    vectorizers = init_vectorizers()

    trainLoader = DataLoader()
    evaluateLoader = DataLoader()
    testLoader = DataLoader()
    # Filter out bad requests if we are running on help time
    if args.time == 'h':
        # NOTE(review): the 2-minute floor presumably drops mis-logged
        # requests -- confirm against the dataset's conventions.
        helpFilter = lambda x: x.getHelpTimeMinutes() >= 2.0
        trainLoader.loadData('../dataset/dataset-train.npy', filterFn=helpFilter)
        evaluateLoader.loadData('../dataset/dataset-dev.npy', filterFn=helpFilter)
        testLoader.loadData('../dataset/dataset-test.npy', filterFn=helpFilter)
    else:
        trainLoader.loadData('../dataset/dataset-train.npy')
        evaluateLoader.loadData('../dataset/dataset-dev.npy')
        testLoader.loadData('../dataset/dataset-test.npy')

    # Classification maps continuous minutes into buckets learned from the
    # training split; regression uses the raw minutes unchanged.
    if ModelType == LogisticRegression:
        buckets = make_buckets(trainLoader, args.buckets, args.time)
        mapper = make_bucket_mapper(buckets)
    else:
        mapper = lambda x: x
    labelFn = lambda x: mapper(x.getWaitTimeMinutes() if args.time == 'w'
                               else x.getHelpTimeMinutes())

    trainLabels = trainLoader.getLabels(labelFn)
    trainInputs = trainLoader.applyVectorizers(vectorizers, "train", args.time)
    devLabels = evaluateLoader.getLabels(labelFn)
    devInputs = evaluateLoader.applyVectorizers(vectorizers, "dev", args.time)
    testLabels = testLoader.getLabels(labelFn)
    # BUG FIX: test inputs were previously built from evaluateLoader (the
    # dev split), so the "test" evaluation scored dev inputs against test
    # labels. They must come from testLoader.
    testInputs = testLoader.applyVectorizers(vectorizers, "test", args.time)

    trainedModel = trainModel(ModelType, trainInputs, trainLabels)
    evaluateModel(trainedModel, devInputs, devLabels)
    evaluateModel(trainedModel, testInputs, testLabels)
def main():
    """Load, clean, and analyse the Book-Crossing dataset.

    Returns the pivoted ratings table produced by the analysis pipeline.
    """
    paths = "/home/nikoscf/PycharmProjects/BookRecommendation/configurations/paths.yml"
    load_begin = DataLoader()
    load_begin.read_paths(paths)
    # Uncomment to Execute this one time to get the zip if is .zip, unzip it
    # in absolute dir you set in paths.yaml. Then it checks for .csv and
    # removes the redundant zip folder.
    # load_begin.check_zip_and_csv()

    books = load_begin.read_data("BX-Books.csv")
    users = load_begin.read_data("BX-Users.csv")
    ratings = load_begin.read_data("BX-Book-Ratings.csv")

    # One cleaning spec per table:
    # (raw frame, columns to drop, columns to coerce to numeric/NaN).
    cleaning_plan = [
        (books, ['Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
         ["Year-Of-Publication"]),
        (users, [], ["User-ID", "Age"]),
        (ratings, [], ["User-ID", "ISBN", "Book-Rating"]),
    ]
    clean_books, clean_users, clean_ratings = [
        DataClean(frame).execute_pipeline_cleaning(drop_cols, numeric_cols)
        for frame, drop_cols, numeric_cols in cleaning_plan
    ]

    data_analysis = DataAnalysis()
    return data_analysis.execute_pipeline_data_analysis(
        clean_ratings, clean_users, clean_books)
def init_vectorizers(log=True):
    """Build and return the list of feature vectorizers.

    Vectorizers that need dataset statistics are handed either the
    training split or the full dataset loader; the rest are stateless.
    """
    if log:
        print("Initializing vectorizers...", end="\r")

    # Loaders backing the data-dependent vectorizers.
    train_loader = DataLoader()
    train_loader.loadData('../dataset/dataset-train.npy')
    full_loader = DataLoader()
    full_loader.loadData('../dataset/dataset.npy')

    vectorizers = [
        TFIDFRequestTextVectorizer(train_loader),
        HelperIDVectorizer(full_loader),
        CourseIDVectorizer(),
        RequestTimeVectorizer(),
        StudentVectorizer(full_loader),
        PastRequestsVectorizer(full_loader),
        DueDateVectorizer(),
    ]
    return vectorizers
def generate_inference_file(filename='test.tsv', cache=True):
    """Write a CoNLL-style inference file of tokenized sentence pairs.

    filename: path of the TSV to write (one token per line, tagged 'O',
        blank line between sentences).
    cache: when True, return the previously serialized sentence list
        without tokenizing or writing anything.

    Returns the flat list of (untokenized) sentences.
    """
    if cache:
        # Fast path: reuse the serialized sentences from a prior run.
        # FIX: file handle was previously leaked (open() without close).
        with open(f'{abspath}/ser/sents.ser', 'rb') as cache_file:
            return pickle.load(cache_file)

    sents = []
    tokenized_sents = []
    pairs = DataLoader('srcdata2')
    pairs = pairs + get_augmented_data() + get_test_data()
    for pair in tqdm(pairs, desc='Tokenizing sentences'):
        # Keep both sides of each pair, preserving order.
        sents.append(pair[0])
        tokenized_sents.append(CustomTokenizer(pair[0]))
        sents.append(pair[1])
        tokenized_sents.append(CustomTokenizer(pair[1]))

    # FIX: both output handles are now closed deterministically via `with`
    # (the original leaked them, risking unflushed/truncated output).
    with open(f'{abspath}/ser/sents.ser', 'wb') as cache_file:
        pickle.dump(sents, cache_file)

    with open(filename, 'w') as inf_file:
        for sent in tqdm(tokenized_sents, desc='Writing tokenized sentences'):
            for token in sent:
                inf_file.write(f'{token.strip()}\tO\n')
            inf_file.write('\n')
    return sents
def __init__(self, method): print('USING METHOD: {}'.format(method)) # Read lyrics dataset and get train/test splits dl = DataLoader() self.train_x, self.train_y, self.test_x, self.test_y = dl.load(method)
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Sun Jun 16 17:33:23 2019 @author: ishanshrivastava """ from data.DataLoader import DataLoader #from features.CustomTokenizer import CustomTokenizer pairs = DataLoader("<srcdata>") #print(CustomTokenizer(pairs[3][0]))
import tensorflow as tf
from data.DataLoader import DataLoader
from ClassificationNNModel import ClassificationNNModel
from random import uniform
from RunNN import Config

if __name__ == "__main__":
    # Random search over learning rates for the help-time classification NN.
    config = Config("h", 10, "classification", 5)
    trials = []
    for trial in range(100):
        print("Iteration %i" % trial)
        # Sample the learning rate log-uniformly between 1e-6 and 1e-2.
        config.lr = 10**uniform(-2, -6)
        # Fresh graph per trial so models don't accumulate state.
        with tf.Graph().as_default():
            model = ClassificationNNModel(config)
            loader = DataLoader()
            loader.loadData("../dataset/dataset-train.npy",
                            filterFn=lambda x: x.getHelpTimeMinutes() >= 2.0,
                            log=False)
            model.run(loader, "h", train=True, log=False)
            loss = model.run(loader, "h", train=False, log=False)
        trials.append((config.lr, loss))
    # Report all (lr, loss) pairs, best loss first.
    print(sorted(trials, key=lambda r: r[1]))
import keras from core.utils import AucHistory from core.VideoPixelNetwork import VideoPixelNetwork from data.DataLoader import DataLoader frames_count = 10 frames_step = 5 # >= 1 data_loader = DataLoader('data/ped1_train.txt', 'data/ped1_test.txt', frames_count, frames_step, validation_split=0.1) train_generator = data_loader.train_generator(batch_size=2) validation_generator = data_loader.validation_generator(batch_size=5) frame_shape = train_generator.X_shape filters = 16 k_encoder = 20 lstm_filters = 32 k_decoder = 32 dilation = True decoder_kernel_size = 5 vpn = VideoPixelNetwork(filters, frame_shape, frames_count, k_encoder=k_encoder, lstm_filters=lstm_filters, k_decoder=k_decoder,
from data.DataLoader import DataLoader
import matplotlib.pyplot as plt
from collections import Counter
from util import make_buckets, make_bucket_mapper

if __name__ == "__main__":
    # Load every recorded LaIR request from the full dataset.
    loader = DataLoader()
    loader.loadData('../dataset/dataset.npy')

    help_times = [request.getHelpTimeMinutes() for request in loader.laIRRequests]
    wait_times = [request.getWaitTimeMinutes() for request in loader.laIRRequests]

    # 10-minute bins up to 2 hours, plus a catch-all final bin.
    bin_edges = list(range(0, 120, 10))
    bin_edges.append(float('inf'))

    # Overlay both distributions on a single histogram.
    plt.hist([help_times, wait_times], bin_edges,
             label=["Help Time", "Wait Time"])
    plt.title("CS106 LaIR Wait and Help Times")
    plt.xlabel("Time (minutes)")
    plt.ylabel("# Requests")
    plt.legend()
    plt.show()
from data.DataLoader import DataLoader
import matplotlib.pyplot as plt
import numpy as np
import os
from collections import Counter

if __name__ == '__main__':
    dl = DataLoader()
    train_x, train_y, test_x, test_y = dl.load('binary')
    # Pool train and test labels: we only chart overall genre frequency.
    genres = np.concatenate((train_y, test_y))
    print(len(genres))

    # read genrenames (one per line, normalized to lowercase)
    names = []
    datadir = os.path.join(os.path.dirname(__file__), 'data/genresList.txt')
    with open(datadir) as file:
        for line in file:
            names.append(line.strip().lower())

    # Map numeric labels to their genre names.
    genres = list(map(lambda x: names[x], genres))

    # FIX: count with a single Counter pass instead of the previous
    # O(n*m) filter-over-all-genres loop per genre name; `counts` is
    # identical (per-name frequencies in `names` order, 0 if absent).
    genre_counter = Counter(genres)
    counts = [genre_counter[name] for name in names]

    plt.rc('axes', axisbelow=True)
    # NOTE(review): `b=` was renamed to `visible=` in matplotlib 3.5 --
    # confirm the pinned matplotlib version before upgrading.
    plt.grid(b=True, axis='x', color='#eeeeee', zorder=-1)
    plt.ylabel('Genre', labelpad=15, fontsize=18, color='#555555')
    plt.xlabel('Number of songs', labelpad=15, fontsize=18, color='#555555')
    plt.title('Genre Distribution', pad=15, fontsize=20, color='#555555')