def merge_train_extra(pickles_dir, shuffle=True):
    """ Merges the train and extra datasets, optionally shuffling them as well,
        then saves the merged data as two pickle files:

            X_train_extra_cropped64.pickle
            Y_train_extra.pickle

    Args:
        pickles_dir: (str) directory containing the pickle files
        shuffle:     (bool) Should it shuffle the data (default is True)
    """
    print("#" * 60)
    print((" " * 34) + "MERGE TRAIN AND EXTRA DATA")
    print("#" * 60)

    # OPEN TRAIN
    X_train = pickle2obj(os.path.join(pickles_dir, "X_train_cropped64.pickle"))
    Y_train = pickle2obj(os.path.join(pickles_dir, "Y_train.pickle"))

    # OPEN EXTRA
    X_extra = pickle2obj(os.path.join(pickles_dir, "X_extra_cropped64.pickle"))
    Y_extra = pickle2obj(os.path.join(pickles_dir, "Y_extra.pickle"))

    # CONCATENATE
    X_merged = np.append(X_train, X_extra, axis=0)
    Y_merged = {}
    for key in Y_train.keys():
        Y_merged[key] = np.append(Y_train[key], Y_extra[key], axis=0)

    # SHUFFLE
    if shuffle:
        random_indices = np.random.permutation(Y_merged["N"].shape[0])
        X_merged = X_merged[random_indices]
        for key in Y_merged.keys():
            Y_merged[key] = Y_merged[key][random_indices]

    # SAVE AS:
    obj2pickle(X_merged, file=os.path.join(pickles_dir, "X_train_extra_cropped64.pickle"))
    obj2pickle(Y_merged, file=os.path.join(pickles_dir, "Y_train_extra.pickle"))

    # FEEDBACK
    print()
    print("X")
    print("Train Shape : ", X_train.shape)
    print("Extra Shape : ", X_extra.shape)
    print("Merged Shape: ", X_merged.shape)
    print()
    print("Y")
    for key in Y_merged.keys():
        print("{} : {}".format(key.ljust(10, " "), Y_merged[key].shape))
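# The pickle2obj()/obj2pickle() helpers used throughout these snippets are not
# defined in this section. As a minimal sketch (an assumption about their
# behaviour, not the project's actual implementation), they are treated here as
# thin wrappers around the standard `pickle` module, with an optional verbose flag
# to match how they are called elsewhere:
import pickle

def pickle2obj(file, verbose=False):
    # Load and return whatever object is stored in the given pickle file.
    if verbose:
        print("Loading pickle:", file)
    with open(file, mode="rb") as f:
        return pickle.load(f)

def obj2pickle(obj, file, verbose=False):
    # Save an arbitrary Python object to the given pickle file.
    if verbose:
        print("Saving pickle:", file)
    with open(file, mode="wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)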
def extract_word2vec_embeddings(file, n_words, embed_size, id2word, datadir=None):
    """ Tries to load pretrained word2vec weights from a file. If it does not
        exist, then it trains from scratch and caches the trained embeddings
        to that file.

        Returns a numpy array of the trained embeddings according to the
        word order from `id2word`.
    """
    if not os.path.isfile(file):
        print("Training word2vec embeddings from scratch")
        embeddings = create_word2vec_vectors(datadir, embed_size=embed_size)
        print("Caching word2vec embeddings")
        obj2pickle(embeddings, file)
    else:
        print("Loading cached word2vec embeddings")
        embeddings = pickle2obj(file)

    # Reorder the embeddings to follow the id order in `id2word`
    weights = initialize_embeddings(n_words, embed_size)
    for id, word in enumerate(id2word):
        vector = embeddings.get(word, None)
        if vector is not None:
            weights[id] = vector
    return weights
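# `create_word2vec_vectors()` and `initialize_embeddings()` are project helpers
# that are not shown in this section. A minimal sketch of the latter, assuming
# it simply builds a randomly initialized (n_words, embed_size) matrix that the
# pretrained vectors are then copied into:
import numpy as np

def initialize_embeddings(n_words, embed_size):
    # Small random values, so word ids that are missing from the word2vec
    # vocabulary still start with a usable (trainable) vector.
    return np.random.uniform(low=-0.05, high=0.05,
                             size=(n_words, embed_size)).astype(np.float32)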
def get_data(data_dir, cached_data, vocab_file):
    """ Loads cached data (as sequences of word ids) if it exists, otherwise
        it creates the dataset from the raw IMDB text files and caches the
        processed dataset.

    Args:
        data_dir:    (str) The IMDB root directory containing the "train" and
                     "test" subdirectories.
        cached_data: (str) The path to the pickle file containing the cached data
        vocab_file:  (str) The file that contains the vocabulary, one token per
                     line in order from most frequent to least frequent.

    Returns: (dict)
    """
    if os.path.exists(cached_data):
        print("LOADING CACHED DATA")
        data = pickle2obj(cached_data)
    else:
        print("PROCESSING RAW DATA")
        data = load_data(data_dir=data_dir, vocab_file=vocab_file,
                         valid_ratio=0.3, seed=45)
        print("CACHING DATA")
        obj2pickle(data, cached_data)
    return data
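# Example call, with hypothetical paths (adjust to wherever the IMDB data and
# vocabulary file actually live in your setup):
data = get_data(data_dir="aclImdb",
                cached_data="cached_imdb_data.pickle",
                vocab_file="vocab.txt")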
def create_increased_representation_data(pickles_dir):
    """ Loads the merged train+extra pickles, runs increase_representation()
        on them (min_samples=5000), and saves the results as
        X_aug_train_extra_cropped64.pickle and Y_aug_train_extra.pickle.
    """
    print("-" * 60)
    print((" " * 35) + "INCREASING REPRESENTATION")
    print("-" * 60)

    # LOAD DATA
    X = pickle2obj(os.path.join(pickles_dir, "X_train_extra_cropped64.pickle"))
    Y = pickle2obj(os.path.join(pickles_dir, "Y_train_extra.pickle"))

    # INCREASE REPRESENTATION
    X, Y = increase_representation(X, Y, min_samples=5000)

    # SAVE PICKLES
    obj2pickle(X, os.path.join(pickles_dir, "X_aug_train_extra_cropped64.pickle"), verbose=True)
    obj2pickle(Y, os.path.join(pickles_dir, "Y_aug_train_extra.pickle"), verbose=True)

    # EXPLORATORY PRINTOUT
    explore_data(X=X, labels=Y)
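# increase_representation() is defined elsewhere in the project. The sketch
# below is NOT that implementation; it only illustrates the general oversampling
# idea (randomly duplicating samples of any class that has fewer than
# `min_samples` examples) for a plain 1-D label array:
import numpy as np

def oversample_to_min(X, y, min_samples):
    keep = [np.arange(len(y))]                     # start with every sample
    for label in np.unique(y):
        idx = np.flatnonzero(y == label)
        if len(idx) < min_samples:
            # Duplicate random samples of the under-represented class.
            extra = np.random.choice(idx, size=min_samples - len(idx), replace=True)
            keep.append(extra)
    order = np.random.permutation(np.concatenate(keep))
    return X[order], y[order]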
def __init__(self, d=None, pickle=None, verbose=False):
    """ Creates an Evals object to store evaluation metrics for each epoch.

    Args:
        d:       (dict or None)(optional) initialize Evals object from a dictionary
        pickle:  (str or None)(optional) path to a pickle file of a dictionary
                 to initialize the Evals object.
        verbose: (bool)
    """
    self.stuff = dict()
    self._initialized = True

    # INITIAL BLANK VALUES
    self.pda_train = []
    self.pda_valid = []
    self.wna_train = []
    self.wna_valid = []
    self.iou_train = []
    self.iou_valid = []
    self.time_pred = []
    self.time_train = []
    self.loss = []
    self.alpha = []

    # LOAD EVALS FROM DICTIONARY
    if d is not None:
        verbose_print("Loading evals from a dictionary", verbose=verbose, end="")
        self.stuff.update(copy.deepcopy(d))

    # LOAD EVALS FROM PICKLE FILE (of a dictionary)
    elif pickle is not None:
        short_path = limit_string(pickle, tail=PRINT_WIDTH - 32)
        verbose_print("Loading evals from " + short_path, verbose, end="")
        if os.path.exists(pickle):
            d = pickle2obj(pickle)
            self.stuff.update(copy.deepcopy(d))
        else:
            verbose_print("\n-- file does not exist. Creating blank Evals", verbose, end="")
    else:
        verbose_print("Creating blank Evals", verbose, end="")
    verbose_print_done(verbose)
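# verbose_print() and verbose_print_done() are small printing helpers used
# throughout this codebase but not shown here. A plausible minimal version
# (an assumption based on how they are called above, not the actual helpers):
def verbose_print(message, verbose=True, end="\n"):
    # Only print when verbose mode is on.
    if verbose:
        print(message, end=end)

def verbose_print_done(verbose=True):
    # Finishes off a "Doing something..." style message started with end="".
    if verbose:
        print(" -- [DONE]")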
def get_evals_dict(file):
    """ Loads previously saved evals dict if it exists, otherwise initializes
        a new blank one.
    """
    # KEEP TRACK OF EVALS - loading from file if they already exist
    if os.path.exists(file):
        print("LOADING EXISTING EVALS")
        evals = pickle2obj(file)
    else:
        print("INITIALIZING NEW EVALS")
        evals = {"loss": [],
                 "train_acc": [],
                 "valid_acc": [],
                 "train_time": [],  # Time taken to train each round
                 "eval_time": [],   # Time on evaluation
                 "alpha": [],
                 "step": [],
                 }
    return evals
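# Example usage (hypothetical path): resume the metric history from a previous
# run if the pickle exists, otherwise start from an empty one.
evals = get_evals_dict("results/evals.pickle")
evals["loss"].append(0.37)
evals["step"].append(1000)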
# ESTABLISH MODEL SETTINGS
settings = Settings()
settings.conv_dropout = 0.1
settings.fc_dropout = 0.1
settings.image_chanels = 1
settings.image_size = (54, 54)

# ==============================================================================
# DATA
# ==============================================================================
data_dir = "data"
X_file = os.path.join(data_dir, "X_test_cropped64.pickle")
Y_file = os.path.join(data_dir, "Y_test.pickle")

verbose_print("Loading data", verbose=verbose, end="")
data = DataObj(X=pickle2obj(X_file), Y=pickle2obj(Y_file), batchsize=128)
verbose_print_done(verbose)

data.limit_samples(n=limit, verbose=verbose)

verbose_print("Performing center crops", verbose=verbose, end="")
data.do_center_crops()
verbose_print_done(verbose)

# ==============================================================================
# GRAPH AND SESSION
# ==============================================================================
model = model_a
checkpoint_dir = "results/A_02/checkpoints/"
checkpoint_file = os.path.join(checkpoint_dir, "checkpoint_max.chk")
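# Restoring the saved weights is not part of this excerpt. Assuming the
# TensorFlow 1.x graph/session style the rest of this script appears to use,
# a typical restore step (a sketch, not necessarily this project's exact code)
# would run after `model_a` has built its graph:
import tensorflow as tf

saver = tf.train.Saver()                    # must be created after the model graph exists
with tf.Session() as sess:
    saver.restore(sess, checkpoint_file)    # load the weights stored in checkpoint_max.chk
    # ... evaluate the model on `data` here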
################################################################################
if __name__ == "__main__":
    # SETTINGS
    verbose = True
    opts = parse_settings()
    print_headers(opts.output_name, border="#", align="center", width=PRINT_WIDTH)

    verbose_print("Establishing paths", verbose, end="")
    paths = establish_paths(output_name=opts.output_name, input=opts.input_dir)
    verbose_print_done(verbose)

    # TRAIN DATA
    data = DataObjects()
    data.set_train_data(X=pickle2obj(paths.X_train, verbose=True),
                        Y=pickle2obj(paths.Y_train, verbose=True),
                        batchsize=opts.batch_size)

    # VALID DATA
    data.set_valid_data(n=opts.valid_size, random=False, batchsize=128, verbose=verbose)

    # LIMIT TRAIN DATA - eg during development and debugging
    limit = opts.data_size
    data.train.limit_samples(n=limit, verbose=verbose)

    # PORTION OF THE TRAINING DATA USED FOR EVALUATION
    data.set_train_eval_data(n=1024,
# (fragment: tail end of an annotation call that labels each bar with its value)
                y[i],
                size=10,
                xy=(x[i], y[i]),             # Position of the corresponding bar
                xytext=(0, 2),               # Offset text
                textcoords='offset points',  # Use offset points
                ha='center',                 # Horizontal alignment
                va='center')                 # Vertical alignment
fig.tight_layout()

# SAVE THE PLOT
fig.savefig("imgs/raw_digit_distributions.png")

# ------------------------------------------------------------------------------
# MERGED TRAIN AND EXTRA DATA
# ------------------------------------------------------------------------------
Y_train_extra = pickle2obj(os.path.join(data_dir, "Y_train_extra.pickle"), verbose=verbose)
print("MERGED TRAIN-EXTRA DATA samples: ", Y_train_extra["N"].shape[0])

# ------------------------------------------------------------------------------
# DISTRIBUTION OF EACH DIGIT FOR EACH DIGIT POSITION
# ------------------------------------------------------------------------------
def myFunc(s):
    return s.value_counts(sort=False, dropna=False)

df = pd.DataFrame(Y_train_extra["digits"]).apply(myFunc, axis=0)
df.columns = [1, 2, 3, 4, 5]

fig, axes = plt.subplots(2, 3, figsize=(8, 6), sharex=False, sharey=True)
fig.suptitle('Distribution of digits at each digit position',