Example #1
def merge_train_extra(pickles_dir, shuffle=True):
    """ Merges the train and extra datasets. Optionally shuffles them as well.
        then saves the merged data as two pickle files:
            X_train_extra_cropped64.pickle
            Y_train_extra.pickle

    Args:
        pickles_dir: (str) directory containing the pickle files
        shuffle:     (bool) whether to shuffle the data (default is True)
    """
    print("#" * 60)
    print((" " * 34) + "MERGE TRAIN AND EXTRA DATA")
    print("#" * 60)

    # OPEN TRAIN
    X_train = pickle2obj(os.path.join(pickles_dir, "X_train_cropped64.pickle"))
    Y_train = pickle2obj(os.path.join(pickles_dir, "Y_train.pickle"))

    # OPEN EXTRA
    X_extra = pickle2obj(os.path.join(pickles_dir, "X_extra_cropped64.pickle"))
    Y_extra = pickle2obj(os.path.join(pickles_dir, "Y_extra.pickle"))

    # CONCATENATE
    X_merged = np.append(X_train, X_extra, axis=0)
    Y_merged = {}
    for key in Y_train.keys():
        Y_merged[key] = np.append(Y_train[key], Y_extra[key], axis=0)

    # SHUFFLE
    if shuffle:
        random_indices = np.random.permutation(Y_merged["N"].shape[0])
        X_merged = X_merged[random_indices]
        for key in Y_merged.keys():
            Y_merged[key] = Y_merged[key][random_indices]

    # SAVE AS:
    obj2pickle(X_merged,
               file=os.path.join(pickles_dir,
                                 "X_train_extra_cropped64.pickle"))
    obj2pickle(Y_merged,
               file=os.path.join(pickles_dir, "Y_train_extra.pickle"))

    # FEEDBACK
    print()
    print("X")
    print("Train Shape : ", X_train.shape)
    print("Extra Shape : ", X_extra.shape)
    print("Merged Shape: ", X_merged.shape)
    print()
    print("Y")
    for key in Y_merged.keys():
        print("{} : {}".format(key.ljust(10, " "), Y_merged[key].shape))
Example #2
def extract_word2vec_embeddings(file,
                                n_words,
                                embed_size,
                                id2word,
                                datadir=None):
    """ Tries to load pretrained word2vec weights from a file. If it does
        not exist, then it trains from scratch and caches the trained
        embeddings to that file.

        Returns a numpy array of the trained embeddings according to the
        word order from `id2word`
    """
    if not os.path.isfile(file):
        print("Training word2vec embeddings from scratch")
        embeddings = create_word2vec_vectors(datadir, embed_size=embed_size)
        print("Caching word2vec embeddings")
        obj2pickle(embeddings, file)
    else:
        print("Loading cached word2vec embedings")
        embeddings = pickle2obj(file)

    # Reorder the embeddings
    weights = initialize_embeddings(n_words, embed_size)
    for idx, word in enumerate(id2word):
        vector = embeddings.get(word, None)
        if vector is not None:
            weights[idx] = vector

    return weights
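
A hedged usage sketch; the file path, embedding size, and the tiny id2word list below are illustrative assumptions, not values from the source:

# Hypothetical usage: load cached 300-dim embeddings if present, otherwise
# train word2vec on the data directory and cache the result.
id2word = ["<PAD>", "<UNK>", "the", "movie"]  # normally the full vocabulary
weights = extract_word2vec_embeddings(file="models/word2vec.pickle",
                                      n_words=len(id2word),
                                      embed_size=300,
                                      id2word=id2word,
                                      datadir="data/imdb")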
Example #3
def get_data(data_dir, cached_data, vocab_file):
    """ Loads cached data (as sequences of word ids) if it exists, otherwise it
        creates the dataset from the raw IMDB text files and caches the
        processed dataset.
    
    Args:
        data_dir:       (str) The IMDB root directory containing the "train"
                        and "test" subdirectories.
        cached_data:    (str) The path to the pickle file containing the
                        cached data.
        vocab_file:     (str) The file that contains the vocabulary, one
                        token per line in order from most frequent to
                        least frequent.

    Returns:
        (dict)
    """
    if os.path.exists(cached_data):
        print("LOADING CACHED DATA")
        data = pickle2obj(cached_data)
    else:
        print("PROCESSING RAW DATA")
        data = load_data(data_dir=data_dir,
                         vocab_file=vocab_file,
                         valid_ratio=0.3,
                         seed=45)
        print("CACHING DATA")
        obj2pickle(data, cached_data)

    return data
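
A usage sketch with assumed paths (the directory layout follows the docstring; the exact filenames here are illustrative):

# Hypothetical usage: the first run processes the raw IMDB text files and
# caches them; subsequent runs load the cached pickle directly.
data = get_data(data_dir="aclImdb",
                cached_data="data/cached_data.pickle",
                vocab_file="data/vocab.txt")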
Example #4
def create_increased_representation_data(pickles_dir):
    """ Loads the merged train+extra data, oversamples under-represented
        labels via `increase_representation`, then saves the augmented
        arrays as new pickle files.
    """
    print("-" * 60)
    print((" " * 35) + "INCREASING REPRESENTATION")
    print("-" * 60)

    # LOAD DATA
    X = pickle2obj(os.path.join(pickles_dir, "X_train_extra_cropped64.pickle"))
    Y = pickle2obj(os.path.join(pickles_dir, "Y_train_extra.pickle"))

    # INCREASE REPRESENTATION
    X, Y = increase_representation(X, Y, min_samples=5000)

    # SAVE PICKLES
    obj2pickle(X,
               os.path.join(pickles_dir, "X_aug_train_extra_cropped64.pickle"),
               verbose=True)
    obj2pickle(Y,
               os.path.join(pickles_dir, "Y_aug_train_extra.pickle"),
               verbose=True)

    # EXPLORATORY PRINTOUT
    explore_data(X=X, labels=Y)
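
Given the filenames it loads, this step assumes Example #1 has already produced the merged pickles; a usage sketch with an assumed directory name:

# Hypothetical usage: oversample the merged data and save the augmented
# X/Y pickles alongside the originals.
create_increased_representation_data("pickles")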
Example #5
    def __init__(self, d=None, pickle=None, verbose=False):
        """ Creates an Evals object to store evaluation metrics for each epoch.
        
        Args:
            d:          (dict or None) (optional) initialize the Evals
                        object from a dictionary.
            pickle:     (str or None) (optional) path to a pickle file of a
                        dictionary to initialize the Evals object.
            verbose:    (bool) whether to print progress messages.
        """
        self.stuff = dict()
        self._initialized = True

        # INITIAL BLANK VALUES
        self.pda_train = []
        self.pda_valid = []
        self.wna_train = []
        self.wna_valid = []
        self.iou_train = []
        self.iou_valid = []
        self.time_pred = []
        self.time_train = []
        self.loss = []
        self.alpha = []

        # LOAD EVALS FROM DICTIONARY
        if d is not None:
            verbose_print("Loading evals from a dictionary",
                          verbose=verbose,
                          end="")
            self.stuff.update(copy.deepcopy(d))

        # LOAD EVALS FROM PICKLE FILE (of a dictionary)
        elif pickle is not None:
            short_path = limit_string(pickle, tail=PRINT_WIDTH - 32)
            verbose_print("Loading evals from " + short_path, verbose, end="")
            if os.path.exists(pickle):
                d = pickle2obj(pickle)
                self.stuff.update(copy.deepcopy(d))
            else:
                verbose_print("\n-- file does not exist. Creating blank Evals",
                              verbose,
                              end="")
        else:
            verbose_print("Creating blank Evals", verbose, end="")

        verbose_print_done(verbose)
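
A sketch of how this constructor might be called (the path is an assumption; Evals is the class this __init__ belongs to):

# Hypothetical usage: resume metrics from a previous run if the pickle
# exists, otherwise start from the blank lists set up above.
evals = Evals(pickle="results/evals.pickle", verbose=True)
evals.loss.append(0.25)  # one entry is appended per epoch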
Example #6
File: eval.py Project: ronrest/senty
def get_evals_dict(file):
    """ Loads previously saved evals dict if it exists, otherwise
        initializes a new blank one.
    """
    # KEEP TRACK OF EVALS - loading from file if they already exist
    if os.path.exists(file):
        print("LOADING EXISTING EVALS")
        evals = pickle2obj(file)
    else:
        print("INITIALIZING NEW EVALS")
        evals = {"loss": [],
                 "train_acc": [],
                 "valid_acc": [],
                 "train_time": [], # Time taken to train each round
                 "eval_time": [],  # Time on evaluation
                 "alpha": [],
                 "step": [],
                 }
    return evals
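
A hedged sketch of the bookkeeping a training loop might do around this helper (paths and values are illustrative):

# Hypothetical usage: record one evaluation round, then re-cache the dict
# with the project's obj2pickle helper.
evals = get_evals_dict("results/evals.pickle")
evals["step"].append(1000)
evals["loss"].append(0.42)
obj2pickle(evals, "results/evals.pickle")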
Example #7
# ESTABLISH MODEL SETTINGS
settings = Settings()
settings.conv_dropout = 0.1
settings.fc_dropout = 0.1
settings.image_chanels = 1
settings.image_size = (54, 54)

# ==============================================================================
#                                                                           DATA
# ==============================================================================
data_dir = "data"
X_file = os.path.join(data_dir, "X_test_cropped64.pickle")
Y_file = os.path.join(data_dir, "Y_test.pickle")

verbose_print("Loading data", verbose=verbose, end="")
data = DataObj(X=pickle2obj(X_file), Y=pickle2obj(Y_file), batchsize=128)
verbose_print_done(verbose)

data.limit_samples(n=limit, verbose=verbose)

verbose_print("Performing center crops", verbose=verbose, end="")
data.do_center_crops()
verbose_print_done(verbose)

# ==============================================================================
#                                                              GRAPH AND SESSION
# ==============================================================================
model = model_a
checkpoint_dir = "results/A_02/checkpoints/"
checkpoint_file = os.path.join(checkpoint_dir, "checkpoint_max.chk")
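
The snippet ends after naming the checkpoint file; a minimal sketch, assuming a TensorFlow 1.x graph built by model_a, of how such a checkpoint is typically restored (none of this restore code is from the source):

import tensorflow as tf

# Hypothetical continuation: once the graph's variables exist, restore the
# weights saved in checkpoint_max.chk before running evaluation.
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, checkpoint_file)
    # ... run the evaluation ops on `data` here ...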
Example #8
################################################################################
if __name__ == "__main__":
    # SETTINGS
    verbose = True
    opts = parse_settings()
    print_headers(opts.output_name,
                  border="#",
                  align="center",
                  width=PRINT_WIDTH)
    verbose_print("Establishing paths", verbose, end="")
    paths = establish_paths(output_name=opts.output_name, input=opts.input_dir)
    verbose_print_done(verbose)

    # TRAIN DATA
    data = DataObjects()
    data.set_train_data(X=pickle2obj(paths.X_train, verbose=True),
                        Y=pickle2obj(paths.Y_train, verbose=True),
                        batchsize=opts.batch_size)

    # VALID DATA
    data.set_valid_data(n=opts.valid_size,
                        random=False,
                        batchsize=128,
                        verbose=verbose)

    # LIMIT TRAIN DATA - eg during development and debugging
    limit = opts.data_size
    data.train.limit_samples(n=limit, verbose=verbose)

    # PORTION OF THE TRAINING DATA USED FOR EVALUATION
    data.set_train_eval_data(n=1024)
Example #9
# Annotate each bar of the digit-count distribution with its value.
for i in range(len(x)):
    ax.annotate(
        y[i],
        size=10,
        xy=(x[i], y[i]),  # Position of the corresponding bar
        xytext=(0, 2),  # Offset text
        textcoords='offset points',  # Use offset points
        ha='center',  # Horizontal alignment
        va='center')  # Vertical alignment
fig.tight_layout()

# SAVE THE PLOT
fig.savefig("imgs/raw_digit_distributions.png")

# ------------------------------------------------------------------------------
#                                                    MERGED TRAIN AND EXTRA DATA
# ------------------------------------------------------------------------------
Y_train_extra = pickle2obj(os.path.join(data_dir, "Y_train_extra.pickle"),
                           verbose=verbose)
print("MERGED TRAIN-EXTRA DATA samples: ", Y_train_extra["N"].shape[0])


# ------------------------------------------------------------------------------
#                             DISTRIBUTION OF EACH DIGIT FOR EACH DIGIT POSITION
# ------------------------------------------------------------------------------
def myFunc(s):
    return s.value_counts(sort=False, dropna=False)


df = pd.DataFrame(Y_train_extra["digits"]).apply(myFunc, axis=0)
df.columns = [1, 2, 3, 4, 5]

fig, axes = plt.subplots(2, 3, figsize=(8, 6), sharex=False, sharey=True)
fig.suptitle('Distribution of digits at each digit position',