def make_ct_datasets(configs, paths):
    TRAIN_SIZE = 0.9
    o_img_paths = np.array(
        sorted(glob(os.path.join(paths['data']['path'], 'Original/*'))))
    f_img_paths = np.array(
        sorted(glob(os.path.join(paths['data']['path'], 'Filtered/*'))))
    img_paths_train = {
        'original': o_img_paths[:int(TRAIN_SIZE * len(o_img_paths))],
        'filtered': f_img_paths[:int(TRAIN_SIZE * len(f_img_paths))]
    }
    img_paths_val = {
        'original': o_img_paths[int(TRAIN_SIZE * len(o_img_paths)):],
        'filtered': f_img_paths[int(TRAIN_SIZE * len(f_img_paths)):]
    }
    crop_size = configs['data_params']['augmentation_params']['crop_size']
    transforms_train = Compose([RandomCrop(crop_size), ToFloat(), ToTensor()])
    transforms_val = Compose([RandomCrop(1344), ToFloat(), ToTensor()])
    train_loader = DataLoader(
        Dataset(img_paths_train, transforms_train),
        batch_size=configs['data_params']['batch_size'],
        num_workers=configs['data_params']['num_workers'],
        shuffle=True)
    val_loader = DataLoader(Dataset(img_paths_val, transforms_val),
                            batch_size=1,
                            num_workers=configs['data_params']['num_workers'],
                            shuffle=False)
    return train_loader, val_loader
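# Hypothetical usage sketch for make_ct_datasets above (not taken from the
# original project): the dictionaries contain only the keys the function
# reads, and the data path, crop size, batch size and worker count are
# assumed example values.
example_configs = {
    'data_params': {
        'augmentation_params': {'crop_size': 256},
        'batch_size': 4,
        'num_workers': 2,
    }
}
example_paths = {'data': {'path': '/path/to/ct_data'}}  # expects Original/ and Filtered/ subdirs
train_loader, val_loader = make_ct_datasets(example_configs, example_paths)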
def __init__(
    self,
    config,
    name: str,
    device=torch.device('cuda'),
    model_path: str = None,
):
    self.name = name
    self.config = config
    self.device = device
    self.model = Network(config).to(self.device)
    if model_path is not None:
        chckpt = torch.load(model_path)
        self.model.load_state_dict(chckpt)
    self.optimizer = torch.optim.Adam(self.model.parameters(),
                                      lr=self.config.lr)
    self.writer = SummaryWriter(
        os.path.join(self.config.work_dir, self.name))
    self.training_dataset = Dataset(dataset_type='training', config=config)
    self.validation_dataset = Dataset(dataset_type='validation', config=config)
    self.training_dataloader = torch.utils.data.DataLoader(
        self.training_dataset,
        batch_size=self.config.batch_size,
        shuffle=True,
        drop_last=True,
    )
    self.validation_dataloader = torch.utils.data.DataLoader(
        self.validation_dataset,
        batch_size=self.config.batch_size,
    )
    self.criterion = torch.nn.CrossEntropyLoss()
def convert():
    # Load model
    image_shape = (224, 224)
    detector = Detector(image_shape, 'models')
    model = detector.model

    # Data pipeline
    batch_size = 64
    ds = Dataset(image_shape, batch_size)
    pipeline, _ = ds.pipeline()

    def representative_dataset_gen():
        for tensor in pipeline.take(1):
            raw_imgs, mask_imgs = tensor
            img = np.array([raw_imgs[0]])
            yield [img]  # Shape (1, height, width, channel)

    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = representative_dataset_gen
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    converter.inference_input_type = tf.uint8
    converter.inference_output_type = tf.uint8
    tflite_quant_model = converter.convert()

    MODEL = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        '../models/tpu/ohmnilabs_floornet_224_quant_postprocess.tflite')
    with open(MODEL, 'wb') as f:
        f.write(tflite_quant_model)
def train():
    # Config params
    image_shape = (224, 224)
    batch_size = 64
    epochs = 30

    # Dataset & model
    detector = Detector(image_shape)
    ds = Dataset(image_shape, batch_size)
    training_pipeline, validation_pipeline = ds.pipeline()
    steps_per_epoch = ds.num_training // batch_size

    # Start training
    model_history = detector.train(
        training_pipeline,
        epochs,
        steps_per_epoch,
        validation_pipeline,
    )

    # Visualize loss
    loss = model_history.history['loss']
    val_loss = model_history.history['val_loss']
    range_of_epochs = range(epochs)
    plt.figure()
    plt.plot(range_of_epochs, loss, 'r', label='Training loss')
    plt.plot(range_of_epochs, val_loss, 'bo', label='Validation loss')
    plt.title('Training Loss and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss Value')
    plt.ylim([0, 1])
    plt.legend()
    plt.show()
def test_encode(filename, seq_length, text):
    dataset = Dataset([filename], seq_length)
    encoded = dataset.encode(text)
    assert len(encoded) == len(text)
    for label in encoded:
        assert sum(label) == 1
        assert len(label) == dataset.vocab_size
def main():
    network_dataset = Dataset('twitters2')
    nl = read_file_to_dict(os.path.join(DATASET_PATH, 'TwitterSample2.txt'))
    # 10% sampling
    nbunch = nl[0:int(len(nl) // 2)]
    network_dataset.graph = network_dataset.graph.subgraph(nbunch)
    server_list = [Server(k) for k in range(0, 512)]
    vp_number = 0
    node_list = list(network_dataset.graph.nodes)
    random.shuffle(node_list)
    print('Dataset information: TwitterSample2\nNodes Number:',
          network_dataset.graph.order(), '\nEdge Number:',
          network_dataset.graph.size())
    print('Using Random Partitioning Method...\nServer Number:',
          len(server_list), '\nVirtual Primary Copy Number:', vp_number,
          '\nWrite Frequency of Nodes: 1')
    start = time.time()
    m = RandomP(server_list, network_dataset, node_list)
    m.add_new_primary_node(server_list, vp_number)
    m.check_server_load()
    m.check_locality()
    end = time.time()
    print('Random Partitioning Time:', end - start, 'seconds')
    m.compute_inter_sever_cost()
    path = RANDOM_GRAPH_PATH
    m.save_all(path)
def test_load(filename, start_seq):
    seq_length = 25
    dataset = Dataset([filename], seq_length)
    model = RNNTextGenerator(25,
                             dataset.vocab_size,
                             meta_graph='./model/RNNTextGenerator')
    print(model.generate(dataset, start_seq, 50))
def load_data():
    # read
    training_df = pd.read_csv(os.path.join(DEFAULT_DATA_FOLDER, "training"),
                              sep="\t",
                              dtype={
                                  "user_id": str,
                                  "item_id": str
                              })
    test_df = pd.read_csv(os.path.join(DEFAULT_DATA_FOLDER, "test"),
                          sep="\t",
                          dtype={
                              "user_id": str,
                              "item_id": str
                          })
    item_info_long = pd.read_csv(os.path.join(DEFAULT_DATA_FOLDER,
                                              "item_features"),
                                 sep="\t",
                                 dtype={"item_id": str})
    item_info_wide = item_info_long.pivot(
        index="item_id", columns="feature",
        values="value").reset_index().fillna(0)

    # extract ratings as labels (plain float: np.float is removed in recent NumPy)
    y_train = training_df.rating.values.astype(float)
    training_df = training_df.drop(columns=["rating"])
    y_test = test_df.rating.values.astype(float)
    test_df = test_df.drop(columns=["rating"])
    return Dataset(training_df, y_train, test_df, y_test, item_info_wide)
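# Small self-contained demo of the long-to-wide pivot used in load_data above,
# with made-up feature rows (item ids and feature names are hypothetical);
# only pandas is assumed.
import pandas as pd

demo_long = pd.DataFrame({
    'item_id': ['i1', 'i1', 'i2'],
    'feature': ['genre_action', 'year', 'genre_action'],
    'value': [1, 1999, 0],
})
# One row per item_id, one column per feature; missing features become 0.
demo_wide = demo_long.pivot(index='item_id', columns='feature',
                            values='value').reset_index().fillna(0)
print(demo_wide)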
def test_EncodedDataset_constructor(self):
    dataset = ch.datasets.TupleDataset([
        Entry("entry1", [Example(([10, 20, 30], ), 10)],
              dict([["HEAD", True], ["SORT", False]])),
        Entry("entry2", [Example(([30, 20, 10], ), [10, 20, 30])],
              dict([["HEAD", False], ["SORT", True]]))
    ])
    cdataset = EncodedDataset(
        Dataset(dataset, DatasetMetadata(1, set(["HEAD", "SORT"]), 256, 5)))
    [(types0, values0, attribute0),
     (types1, values1, attribute1)] = list(cdataset)
    self.assertTrue(np.all([[[0, 1], [1, 0]]] == types0))
    self.assertTrue(
        np.all([[[266, 276, 286, 512, 512],
                 [266, 512, 512, 512, 512]]] == values0))
    self.assertTrue(np.all(np.array([1, 0]) == attribute0))
    self.assertTrue(np.all([[[0, 1], [0, 1]]] == types1))
    self.assertTrue(
        np.all([[[286, 276, 266, 512, 512],
                 [266, 276, 286, 512, 512]]] == values1))
    self.assertTrue(np.all(np.array([0, 1]) == attribute1))
def __init__(self, raw_dataframe, data_config):
    self.raw = raw_dataframe
    if "test_ratio" in data_config.keys() and data_config.test_ratio is not None:
        self.train_test_split = True
        train_data, test_data = train_test_split(
            self.raw,
            test_size=data_config.test_ratio,
            random_state=0,
            stratify=self.raw[["label"]])
        self.train = Dataset(train_data)
        self.test = Dataset(test_data)
    else:
        self.train_test_split = False
        train_data = self.raw
        self.train = pd.DataFrame(raw_dataframe)
def setUp(self):
    name = "cassandra20200615"
    mode = "train"
    repositories = [{
        "name": "cassandra20200615",
        "url": "",
        "CommitTarget": "",
        "filterFile": "",
        "codeIssueJira": "",
        "projectJira": ""
    }]
    parameters = {}
    option = {
        "name": name,
        "mode": mode,
        "repositories": repositories,
        "parameters": parameters  # not needed when running inference.
    }
    option = Option(option)
    self.dataset = Dataset(option.getRepositorieImproved())
    self.repository = repositories[0]
    print(
        os.path.join(UtilPath.Test(), "testDataset",
                     self.repository["name"], "repository"))
    self.gr = GitRepository(
        os.path.join(UtilPath.Test(), "testDataset",
                     self.repository["name"], "repository"))
def create_algo(server_count=4, node_count=10):
    data = Dataset(dataset_str='facebook')
    data.graph = nx.Graph()
    for i in range(node_count):
        data.graph.add_node(i)
    server_list = [Server(serer_id=i) for i in range(server_count)]
    algo = OfflineAlgo(server_list=server_list, network_dataset=data)
    return algo
def show_predictions():
    image_shape = (224, 224)
    detector = Detector(image_shape)
    ds = Dataset(image_shape)
    pipeline, _ = ds.pipeline()
    for image, mask in pipeline.take(1):
        pred_mask = detector.predict(image)
        __display([image[0], mask[0], __create_mask(pred_mask)])
def load_data(x_data, source_data, length_data, batch_size):
    data_loader = None
    if x_data != '':
        X = pickle.load(open(x_data, 'rb'))
        source = pickle.load(open(source_data, 'rb'))
        length = pickle.load(open(length_data, 'rb'))
        data = Dataset(X, source, length)
        data_loader = DataLoader(data, batch_size=batch_size, shuffle=True)
    return data_loader
def test_shuffle(self):
    data = Dataset(os.path.join("test", "test_dataset"),
                   shuffle=True,
                   rng=np.random.RandomState(0))
    grammar = Grammar(data.node_types, data.rules,
                      data.tokens(0) + [CLOSE_NODE])
    data.prepare({"foo": 1, "bar": 2, "<unknown>": 0}, grammar)
    d = data.next()
    self.assertEqual(d.annotation.query, ["test"])
    self.assertEqual(d.annotation.mappings, {"foo": "bar"})
def test():
    logger.debug('Loading test dataset.')
    test_data = Dataset(test_path, photo_json, photo_path, w2v, config)
    test_dlr = DataLoader(
        test_data,
        batch_size=config.batch_size,
        collate_fn=lambda x: batch_loader(x, config.review_net_only))

    logger.info('Start to test.')
    if config.multi_gpu:
        model = torch.nn.DataParallel(
            torch.load(config.model_path)).to(config.device)
    else:
        model = torch.load(config.model_path)
    test_loss = evaluate_mse(model, test_dlr)
    logger.info(f"Test end, test mse is {test_loss:.6f}")
def train():
    try:
        train_data, valid_data = pickle.load(
            open(config.data_dir + '/dataset.pkl', 'rb'))
        logger.info('Loaded dataset from dataset.pkl!')
    except Exception:
        logger.debug('Loading train dataset.')
        train_data = Dataset(train_path, photo_json, photo_path, w2v, config)
        logger.debug('Loading valid dataset.')
        valid_data = Dataset(valid_path, photo_json, photo_path, w2v, config)
        pickle.dump([train_data, valid_data],
                    open(config.data_dir + '/dataset.pkl', 'wb'))
    logger.info(f'Training dataset contains {len(train_data.data[0])} samples.')

    train_dlr = DataLoader(
        train_data,
        batch_size=config.batch_size,
        shuffle=True,
        collate_fn=lambda x: batch_loader(x, config.review_net_only))
    valid_dlr = DataLoader(
        valid_data,
        batch_size=config.batch_size,
        collate_fn=lambda x: batch_loader(x, config.review_net_only))

    if config.multi_gpu:
        model = torch.nn.DataParallel(
            UMPR(config, w2v.embedding)).to(config.device)
    else:
        model = UMPR(config, w2v.embedding).to(config.device)

    training(train_dlr, valid_dlr, model, config, config.model_path)
def main():
    args = Parser().get_parser().parse_args()
    print("=====Configurations=====\n", args)

    # Load Configuration and data
    config = Config(args)
    dataset = Dataset(config)
    start = time.time()
    outer_tracking = {}
    # TODO Load data once across all folds
    headers = ['O_EPOCH', 'I_EPOCH', 'TR_F1', 'VAL_LOSS', 'VAL_F1',
               'k-MICRO-F1', 'k-MACRO-F1', 'MICRO-F1', 'MACRO-F1',
               'MC_ACC', 'ML_ACC', 'BAE']
    perc_results = [[]] * len(config.train_percents)
    for perc_id, train_percent in enumerate(config.train_percents):
        print('\n\n############################ Percentage: ', train_percent,
              '#####################################')
        # config.train_percent = train_percent
        fold_results = [[]] * len(config.train_folds)
        for fold_id, fold in enumerate(config.train_folds):
            print('\n------- Fold: ', fold)
            # config.train_fold = fold
            dataset.load_indexes(train_percent, fold)
            values = train_model(dataset)
            if config.prop_model_name == 'propagation_gated':
                np.save(
                    path.join(config.paths['experiment'],
                              config.dataset_name + '-' + str(fold) + '-' +
                              str(config.max_depth) + '_gating_scores.npy'),
                    scores)
            outer_tracking[fold_id] = values
            fold_results[fold_id] = values[-1]
            if not config.save_model:
                remove_directory(config.paths['perc_' + train_percent] + '_' + fold)
        fold_results = np.vstack(fold_results)
        file_name = os.path.join(config.paths['perc_' + train_percent], 'metrics.txt')
        np.savetxt(file_name, fold_results, header=str(headers), comments='', fmt='%1.5f')
        perc_results[perc_id] = np.mean(fold_results, axis=0)
        if not config.save_model:
            remove_directory(config.paths['perc_' + train_percent])

    results = np.vstack(perc_results)
    file_name = os.path.join(config.paths['experiment'], 'metrics.txt')
    np.savetxt(file_name, results, header=str(headers), comments='', fmt='%1.5f')
    print('Micro: ', results[0][8], '| Macro: ', results[0][9])
    np.save(
        path.join(config.paths['experiment'],
                  config.dataset_name + str(config.max_depth) + '_batch_results.npy'),
        outer_tracking)
    # TODO code inference - Load model and run test
    print('Time taken:', time.time() - start)
def main():
    cfg = Config()
    cfg.print_summary()
    train_set = Dataset(cfg, "train")
    val_set = Dataset(cfg, "val")

    # Parse command line args

    # Make dirs for model saving and logging
    root_dir = os.path.abspath("")
    model_dir = os.path.join(root_dir, "models")
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    model_dir = os.path.join(model_dir, cfg.MODEL_NAME)
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    #train(cfg, model_dir, train_set, val_set)
    production_test(cfg, model_dir, val_set)
def get_dataset(self):
    with open(f"../experiments/configs/{self.dataset_name}.yaml", "r") as yamlfile:
        self.data = yaml.load(yamlfile, Loader=yaml.FullLoader)
        self.logger.info(
            f"Successfully read config file for dataset {self.dataset_name}")
    targ = self.data[self.experiment_id]['target']
    dataset = Dataset(self.dataset_name,
                      self.data[self.experiment_id]['features'],
                      self.data[self.experiment_id]['target'],
                      self.gender_sep, self.logger)
    return dataset
def test_node_types(self):
    data = Dataset(os.path.join("test", "test_dataset"))
    expected = set([
        ROOT,
        NodeType("ImportFrom", False),
        NodeType("str", False),
        NodeType("alias", True),
        NodeType("int", False),
        NodeType("alias", False),
        NodeType("str", False)
    ])
    self.assertEqual(set(data.node_types), expected)
def test_sample(filename, batch_size, seq_length):
    dataset = Dataset([filename], seq_length)
    count = 0
    batch = dataset.sample(batch_size)
    for seq in batch.inputs:
        assert len(seq) == seq_length
        for i in range(seq_length):
            # One-hot encoded
            assert sum(seq[i]) == 1
            assert len(seq[i]) == dataset.vocab_size
        count += 1
    assert count == batch_size
def main():
    if len(sys.argv) < 3:
        print('Usage:\tpython ' + sys.argv[0] + ' <from scratch> <input file>\n')
        print('\t<from scratch> : Determines whether the solution will be calculated')
        print('\t from scratch or from a greedy starting point')
        print('\t Options: 0 | 1\n')
        print('\t<input file> : The file from which the dataset will be extracted')
        print('\t Options: a_example\n'
              '\t b_should_be_easy\n'
              '\t c_no_hurry\n'
              '\t d_metropolis\n'
              '\t e_high_bonus\n')
        return

    from_scratch = bool(int(sys.argv[1]))
    entry_file = sys.argv[2]
    dataset = Dataset(entry_file, from_scratch)

    print('\nHill Climbing - Standard')
    hc_standard = HillClimbing(dataset)
    hc_standard.solve()

    print('\nHill Climbing - Random')
    hc_random = HillClimbing(dataset, random=True)
    hc_random.solve()

    print('\nHill Climbing (Steepest Ascent) - Standard')
    hc_sa_standard = SteepestAscent(dataset)
    hc_sa_standard.solve()

    print('\nHill Climbing (Steepest Ascent) - Random')
    hc_sa_random = SteepestAscent(dataset, random=True)
    hc_sa_random.solve()

    print('\nSimulated Annealing - Standard')
    sa_standard = SimulatedAnnealing(dataset)
    sa_standard.solve()

    print('\nSimulated Annealing - Random')
    sa_random = SimulatedAnnealing(dataset, random=True)
    sa_random.solve()

    print()
    return 0
def test_batch(filename, batch_size, seq_length):
    dataset = Dataset([filename], seq_length)
    for batch in dataset.batch(batch_size):
        # The number of elements in the batch is `batch_size`
        assert len(batch.inputs) == batch_size
        assert len(batch.targets) == batch_size
        for i in range(batch_size):
            # Each element in the batch is a sequence
            assert len(batch.inputs[i]) == seq_length
            assert len(batch.targets[i]) == seq_length
            for j in range(seq_length):
                # One-hot encoded
                assert sum(batch.inputs[i][j]) == 1
                assert len(batch.inputs[i][j]) == dataset.vocab_size
def test(self):
    seq_length = 25
    filename = './data/alice.txt'
    dataset = Dataset([filename], seq_length)
    params = {
        'rnn_cell': [tf.contrib.rnn.BasicRNNCell],
        'n_neurons': np.arange(1, 1000),
        'optimizer': [
            tf.train.AdamOptimizer,
        ],
        'learning_rate': np.linspace(0, 1, 10000, endpoint=False),
        'epoch': np.arange(1, 6),
        'batch_size': np.arange(25, 100),
    }
    print(test_model_selector(dataset, params, 3))
def main(args):
    nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
    if args.command == 'train':
        if args.comment is not None:
            model_path = args.model_path + '_' + args.comment
        else:
            model_path = args.model_path
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        config = read_config(args.model_config)
        hparams = read_hparams(args.train_specs)
        print('Loading data...', flush=True)
        dataset = Dataset(args.data_root,
                          nlp=nlp,
                          image_size=(224, 224),
                          size=args.ds,
                          split='train',
                          random_seed=RANDOM_SEED)
        print('Creating new model...', flush=True)
        model = create_model(config,
                             args={'image_shape': dataset[0][0].shape,
                                   'vocab_size': dataset.vocab_size},
                             cuda=args.cuda)
        print('Model initialized!', flush=True)
        print('Training model...', flush=True)
        train(model, hparams, dataset, model_path, log_interval=6)
    elif args.command == 'test':
        if not os.path.exists(args.model_path):
            print("Model doesn't exist!")
            exit(0)
        print('Loading data...', flush=True)
        dataset = Dataset(args.data_root, nlp=nlp, image_size=(224, 224), split='val')
        print('Loading model...', flush=True)
        model = load_model(args.model_path,
                           args={'image_shape': dataset[0][0].shape,
                                 'vocab_size': dataset.vocab_size},
                           cuda=args.cuda,
                           weights=not args.test_init)
        print('Model loaded!', flush=True)
        print('Testing model...', flush=True)
        test(model, dataset, args.model_path)
def test_restore(filename, start_seq):
    seq_length = 25
    batch_size = 25
    learning_rate = 0.01
    epoch = 5
    dataset = Dataset([filename], seq_length)
    model = RNNTextGenerator(
        seq_length,
        dataset.vocab_size,
        learning_rate=learning_rate,
        epoch=epoch,
        batch_size=batch_size,
    )
    model.restore()
    print('Using restored model')
    print(model.generate(dataset, start_seq, 50))
def test_rules(self):
    data = Dataset(os.path.join("test", "test_dataset"))
    expected = set([
        Rule(ROOT, (Node("-", NodeType("ImportFrom", False)), )),
        Rule(NodeType("ImportFrom", False),
             (Node("module", NodeType("str", False)),
              Node("names", NodeType("alias", True)),
              Node("level", NodeType("int", False)))),
        Rule(NodeType("alias", True),
             (Node("val0", NodeType("alias", False)), )),
        Rule(NodeType("alias", False),
             (Node("-", NodeType("alias", False)), )),
        Rule(NodeType("alias", False),
             (Node("name", NodeType("str", False)), ))
    ])
    self.assertEqual(set(data.rules), expected)
def main(args):
    if args.seed != 0:
        random.seed(args.seed)
        np.random.seed(args.seed)
        th.manual_seed(args.seed)

    start_time = time.time()
    device = torch.device(
        f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    dir_path = os.path.dirname(os.path.realpath(__file__))
    parent_path = os.path.abspath(os.path.join(dir_path, os.pardir))
    data_file = os.path.join(parent_path, f'data/{args.dataset}.txt')
    split_ratio = float(args.split_ratio)
    dataset = Dataset(emb_dim=args.embed_dim,
                      data_file=data_file,
                      split_ratio=split_ratio,
                      include_all=args.include_all,
                      few_shot_ratio=args.few_shot_ratio)
    args.n_labels = dataset.n_labels

    logger = init_logger(args)
    logger.info(f'[TAG]: {args.tag}')
    print_config(args, logger)

    model = eval(args.model)(args.n_node,
                             dataset.n_vocab,
                             args.n_labels,
                             args.embed_dim,
                             args.h_dim,
                             args.z_dim,
                             pretrained_embeddings=None,
                             freeze_embeddings=False,
                             teacher_forcing=args.teacher_forcing,
                             device=device)
    try:
        model.train_model(args, dataset, logger)
    except KeyboardInterrupt:
        save_model(model, logger, args.log_dir)
        exit('KeyboardInterrupt!')

    logger.info("total cost time: {} ".format(
        timedelta(seconds=(time.time() - start_time))))
    if args.few_shot_ratio == 1.0:
        save_model(model, logger, args.log_dir)
def test_relocate_process(self):
    data = Dataset(dataset_str='facebook')
    data.graph = nx.Graph()
    for i in range(10):
        data.graph.add_node(i)
    data.graph.add_edge(0, 1)
    data.graph.add_edge(0, 2)
    data.graph.add_edge(0, 3)
    data.graph.add_edge(0, 4)
    server_list = [Server(serer_id=i) for i in range(8)]
    algo = OfflineAlgo(server_list=server_list, network_dataset=data)
    node_list = list(data.graph.nodes)
    node_len = len(node_list)
    for i in range(node_len):
        n = node_list[i]
        algo.add_new_primary_node(node_id=n, write_freq=Constant.WRITE_FREQ)
    algo.node_relocation_process()