    values_test = np.array(values_test, dtype="float32")
    labels_test = np.array(labels_test, dtype="float32")

else:

    # initialize word vector embedder
    embedder = WordVectorEmbedder(embedder_model)

    # initialize lists (will be converted later into numpy arrays)
    values = []
    labels = []

    # get equal-sized subsets of each class
    data_sampler = DataSampler(klass, file_path=data_params["path"], num_classes=2)
    data = data_sampler.sample_balanced(
        min_samples=data_args.get("min_samples", None),
        rng_seed=data_args.get("load", {}).get("rng_seed", None),
    )

    # load dataset
    logger.info("processing {} samples from {}...".format(len(data), data_params["path"]))
    profile_results = timed_dataload(loader, data, data_args, embedder, values, labels)

    # store loading time
    seconds_loading = profile_results.timer.total_tt

    # shuffle if necessary
    if data_args["shuffle_after_load"]:

        # store new lists
        values_shuffled = []
        labels_shuffled = []
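# DataSampler.sample_balanced is called above but never defined in these
# fragments. The function below is a generic sketch of what balanced sampling
# means here (equal-sized per-class subsets, an optional cap, a seedable RNG);
# it is NOT the repository's actual DataSampler implementation, and the
# (text, label) pair format is an assumption.
import random
from collections import defaultdict

def sample_balanced_sketch(samples, min_samples=None, rng_seed=None):
    """Return an equal number of (text, label) pairs per class."""
    rng = random.Random(rng_seed)
    by_class = defaultdict(list)
    for text, label in samples:
        by_class[label].append((text, label))

    # per-class quota: the smallest class size, optionally capped by min_samples
    quota = min(len(items) for items in by_class.values())
    if min_samples is not None:
        quota = min(quota, min_samples)

    balanced = []
    for items in by_class.values():
        balanced.extend(rng.sample(items, quota))
    rng.shuffle(balanced)
    return balanced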
labels = []

# initialize vector embedder
prebuilt_model_path = data_args.get('models', {}).get(embedder_model, {}).get('prebuilt_model_path', None)
embedder = WordVectorEmbedder(embedder_model, prebuilt_model_path)

# load pre-sampled data from disk
if prebuilt_model_path:
    with open(WordVectorBuilder.filename_train(prebuilt_model_path), 'rb') as f:
        data = pickle.load(f)
else:
    # get equal-sized subsets of each class
    min_samples = data_args.get('min_samples', None)
    data_sampler = DataSampler(klass, file_path=data_params['path'], num_classes=2)
    data = data_sampler.sample_balanced(min_samples)

# load dataset
logger.info("processing {} samples from {}...".format(len(data), data_params['path']))
profile_results = timed_dataload(data, data_args, values, labels)

# store loading time
seconds_loading = profile_results.timer.total_tt

# shuffle if necessary
if data_args['shuffle_after_load']:
    indices = np.arange(len(labels))
    np.random.shuffle(indices)
    values = [values[i] for i in indices]
    labels = [labels[i] for i in indices]
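# seconds_loading above is read from profile_results.timer.total_tt. The
# decorator below is only a minimal illustration of a profiler exposing a
# similar attribute; the real timed_dataload may differ, and the names
# timed_sketch and SimpleNamespace fields are assumptions for this sketch.
import time
from types import SimpleNamespace

def timed_sketch(func):
    """Run func, reporting its wall-clock duration as .timer.total_tt."""
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        return SimpleNamespace(result=result,
                               timer=SimpleNamespace(total_tt=time.time() - start))
    return wrapper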
    # shuffle if necessary
    if data_args['shuffle_after_load']:
        # apply one permutation per split so each value stays paired with its
        # label; shuffling values and labels independently would misalign them
        train_indices = np.random.permutation(len(values_train))
        values_train = values_train[train_indices]
        labels_train = labels_train[train_indices]
        test_indices = np.random.permutation(len(values_test))
        values_test = values_test[test_indices]
        labels_test = labels_test[test_indices]

else:

    # initialize lists (will be converted later into numpy arrays)
    values = []
    labels = []

    # get equal-sized subsets of each class
    data_sampler = DataSampler(klass, file_path=data_params['path'], num_classes=2)
    data = data_sampler.sample_balanced(
        min_samples=data_args.get('min_samples', None),
        rng_seed=data_args.get('load', {}).get('rng_seed', None),
    )

    # load dataset
    logger.info("processing {} samples from {}...".format(len(data), data_params['path']))
    profile_results = timed_dataload(data, data_args, values, labels)

    # store loading time
    seconds_loading = profile_results.timer.total_tt

    # convert into nparray for sklearn
    values = np.array(values, dtype="float32")
    labels = np.array(labels, dtype="float32")
    logger.info("Loaded {} samples...".format(len(values)))

    # shuffle if necessary
    if data_args['shuffle_after_load']:
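# A minimal, self-contained demonstration of the shared-permutation idiom used
# above; the toy arrays and _demo names here are illustrative only.
import numpy as np

values_demo = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]], dtype="float32")
labels_demo = np.array([0.0, 1.0, 0.0], dtype="float32")

# one permutation applied to both arrays preserves the value/label pairing;
# two separate np.random.shuffle calls would draw two different permutations
indices_demo = np.random.permutation(len(labels_demo))
values_demo = values_demo[indices_demo]
labels_demo = labels_demo[indices_demo]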