def fit(self, patches, label_input='prob', batch_size=8, n_epochs=10,
        valid_patches=None, valid_label_input='prob', **kwargs):
    """Train the wrapped model on ``patches``.

    Args:
        patches: training samples, passed to ``preprocess``.
        label_input: label format of ``patches``, forwarded to ``preprocess``.
        batch_size: batch size passed to ``self.model.fit``.
        n_epochs: number of epochs passed to ``self.model.fit``.
        valid_patches: optional validation samples; ``None`` disables
            validation.
        valid_label_input: label format of ``valid_patches``.
        **kwargs: forwarded unchanged to ``self.model.fit``.

    Returns:
        None.
    """
    # makedirs(exist_ok=True) creates missing parent directories and avoids
    # the check-then-create race of os.path.exists() followed by os.mkdir().
    os.makedirs(self.model_path, exist_ok=True)

    X, y = preprocess(patches, n_classes=self.n_classes,
                      label_input=label_input)

    validation_data = None
    if valid_patches is not None:
        validation_data = preprocess(valid_patches,
                                     n_classes=self.n_classes,
                                     label_input=valid_label_input)

    # The scoring callback is always installed below, so it is always handed
    # the validation data of this run (None when there is none).
    self.valid_score_callback.valid_data = validation_data

    self.model.fit(x=X,
                   y=y,
                   batch_size=batch_size,
                   epochs=n_epochs,
                   verbose=1,
                   callbacks=self.call_backs + [self.valid_score_callback],
                   validation_data=validation_data,
                   **kwargs)
def cascade4(filenames, debug = False):
    """Run a read -> preprocess -> map -> reduce pipeline over forked processes.

    The calling process reads every line of ``filenames`` and feeds it to
    three forked workers chained by ``multiprocessing.Queue`` objects.
    ``None`` is the end-of-stream sentinel on every queue.

    Args:
        filenames: iterable of paths handed to ``data.extractData``.
        debug: when true, print a banner and a summary of the reduced result.

    Returns:
        None. The reduced dictionary is only reported via the debug print.
    """
    if debug:
        print("Cascade of 4 processes with queues. read->process->map->reduce")
    lines = [item for sublist in [list(data.extractData(fn)) for fn in filenames] for item in sublist]
    finalDict = {}
    masterq = multiprocessing.Queue()
    lpq = multiprocessing.Queue()
    mapq = multiprocessing.Queue()
    redq = multiprocessing.Queue()

    lineProc = os.fork()
    if lineProc == 0:
        # Child 1: preprocess raw lines and forward the non-empty pieces.
        for toProcess in iter(lpq.get, None):
            # Preprocess once per line (the result used to be computed twice).
            processed = data.preprocess(toProcess)
            if processed is not None:
                for item in processed:
                    if len(item) > 0:
                        mapq.put(item)
        mapq.put(None)
        # Give the queue's feeder thread time to flush before the hard exit.
        time.sleep(0.3)
        os._exit(0)

    mapProc = os.fork()
    if mapProc == 0:
        # Child 2: map every preprocessed item.
        for toMap in iter(mapq.get, None):
            for item in mapper.map(toMap):
                if len(item) > 0:
                    redq.put(item)
        redq.put(None)
        time.sleep(0.2)
        os._exit(0)

    reducProc = os.fork()
    if reducProc == 0:
        # Child 3: reduce incrementally and stream every result back.
        r = myReducer.reducer()
        for toReduce in iter(redq.get, None):
            result = r.onlineReduce(toReduce)
            masterq.put(result)
        masterq.put(None)
        time.sleep(0.1)
        os._exit(0)

    # Parent: feed the pipeline, then collect the reduced pairs.
    for line in lines:
        lpq.put(line)
    lpq.put(None)
    for k, v in iter(masterq.get, None):
        finalDict[k] = v
    if debug:
        # len()/values() work on both Python 2 and 3, unlike iteritems().
        print("\t{} files processed. Dictionary of {} instances of {} words made".format(
            len(filenames), len(finalDict), sum(finalDict.values())))
    # Reap all three children; a single os.wait() left two of them as zombies.
    for pid in (lineProc, mapProc, reducProc):
        os.waitpid(pid, 0)
    return
def tokenize(input_sentence):
    """Return the whitespace-separated tokens of the preprocessed sentence.

    The sentence is passed through ``data.preprocess`` with punctuation
    removal and lower-casing enabled before it is split.
    """
    cleaned = data.preprocess(
        input_sentence,
        remove_punct=True,
        lower_case=True,
    )
    return cleaned.split()
def train(train_csv):
    """Fit a freshly generated model on the dataset stored at ``train_csv``.

    Args:
        train_csv: path passed to ``read_csv``.

    Returns:
        The model created by ``model_generate()`` after 10 epochs of fitting.
    """
    data = read_csv(train_csv)
    dict_data = preprocess(data)
    X, Y = convert_to_input(dict_data)
    model = model_generate()
    # The original fitted and returned ``modela``, a name never bound in this
    # function, while the model generated above went unused.
    model.fit(X, Y, epochs=10)
    return model
def train(options):
    """Train a model on the training split and score it on the validation split.

    Reads ``options.normalize`` and ``options.save_model``; when the latter is
    not ``None`` the trained model is saved there. Returns whatever
    ``get_binary_class_scores`` computes for the validation predictions.
    """
    train_x, train_y = preprocess(load_train(), normalize=options.normalize)
    val_x, val_y = preprocess(load_val(), normalize=options.normalize)

    # The model is sized by the number of attribute columns.
    model = get_model(options, train_x.shape[1])
    model.train(train_x, train_y, val_x, val_y)

    # Optionally persist the trained model.
    save_path = options.save_model
    if save_path is not None:
        model.save(save_path)

    # Validation scores.
    val_predictions = model.predict(val_x)
    return get_binary_class_scores(val_y, val_predictions)
def telemetry(sid, data):
    """Handle one telemetry event: predict steering, send controls, save frame.

    With an empty ``data`` payload a ``'manual'`` event is emitted instead.
    """
    if not data:
        # NOTE: DON'T EDIT THIS.
        sio.emit('manual', data={}, skip_sid=True)
        return

    # Current steering angle, throttle and speed reported by the car.
    steering_angle = data["steering_angle"]
    throttle = data["throttle"]
    speed = data["speed"]

    # Current image from the center camera of the car.
    encoded_frame = data["image"]
    image = Image.open(BytesIO(base64.b64decode(encoded_frame)))

    # The frame must be preprocessed the same way the training images were.
    frame = preprocess(np.asarray(image))
    batch = frame[None, :, :, :]

    steering_angle = float(model.predict(batch, batch_size=1))
    throttle = controller.update(float(speed))
    print(steering_angle, throttle)
    send_control(steering_angle, throttle)

    # Save the frame when an output folder was requested.
    if args.image_folder != '':
        timestamp = datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S_%f')[:-3]
        target = os.path.join(args.image_folder, timestamp)
        image.save('{}.jpg'.format(target))
def predict(sentence):
    """Greedy-decode ``sentence`` with the restored encoder/decoder pair.

    Restores the latest checkpoint from ``hparams.ckpt_dir``, encodes the
    preprocessed sentence and decodes at most ``max_length_target`` steps,
    stopping at the ``'end'`` token. Returns the decoded words concatenated
    with every space removed.
    """
    checkpoint.restore(tf.train.latest_checkpoint(hparams.ckpt_dir))

    cleaned = preprocess(sentence)
    # Words missing from the input vocabulary map to index 3.
    token_ids = [input_token.word_index.get(word, 3)
                 for word in cleaned.split(' ')]
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        [token_ids], maxlen=max_length_input, padding='post')
    inputs = tf.convert_to_tensor(padded)

    initial_state = [tf.zeros((1, hparams.units))]
    enc_out, dec_hidden = encoder(inputs, initial_state)
    dec_input = tf.expand_dims([target_token.word_index['start']], 0)

    pieces = []
    for _ in range(max_length_target):
        predictions, dec_hidden, _attention = decoder(
            dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        word = target_token.index_word[predicted_id]
        if word == 'end':
            break
        pieces.append(str(word))
        # Feed the prediction back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return "".join(pieces).replace(" ", "")
def predict(self, patches, label_input='prob'):
    """Return softmax class probabilities for ``patches``.

    Args:
        patches: either a ``list`` of raw patches (run through ``preprocess``
            first) or an ``np.ndarray`` that is fed to the model as is.
        label_input: label format forwarded to ``preprocess`` for list input.

    Returns:
        The model output with a softmax applied over the last axis.

    Raises:
        ValueError: if ``patches`` is neither a list nor an ndarray.
    """
    # isinstance() also accepts subclasses, unlike comparing __class__.
    if isinstance(patches, list):
        X, _ = preprocess(patches, label_input=label_input)
        y_pred = self.model.predict(X)
    elif isinstance(patches, np.ndarray):
        y_pred = self.model.predict(patches)
    else:
        raise ValueError("Input format not supported")
    return scipy.special.softmax(y_pred, -1)
def test_net():
    """Run the detector on ``./dog.jpg`` and display the kept detections.

    Loads the weights of the most recent checkpoint, runs a single forward
    pass and draws every box whose probability exceeds
    ``cfg.out_thruth_thresh``.

    Raises:
        ValueError: if the checkpoint file does not exist.
        FileNotFoundError: if the test image cannot be read.
    """
    data_set = TestDataset()
    classes = data_set.classes
    net = MyNet(classes)

    _, _, last_time_model = get_check_point()
    # assign directly
    # last_time_model='./weights/weights_21_110242'
    if not os.path.exists(last_time_model):
        raise ValueError("no model existed...")
    # Both branches of the former cfg.test_use_offline_feat check loaded the
    # state dict the same way, so the check was dropped.
    net.load_state_dict(torch.load(last_time_model))
    print("Using the model from the last check point:`%s`" % (last_time_model))
    net.eval()

    is_cuda = cfg.use_cuda
    did = cfg.device_id

    image_path = './dog.jpg'
    img_src = cv2.imread(image_path)  # BGR
    if img_src is None:
        # cv2.imread returns None instead of raising on a missing file.
        raise FileNotFoundError(image_path)
    img = img_src[:, :, ::-1]  # RGB
    h, w, _ = img.shape
    img = img.transpose(2, 0, 1)  # [c,h,w]
    img = preprocess(img)
    img = torch.tensor(img[None])  # add the batch axis

    if is_cuda:
        net.cuda(did)
        img = img.cuda(did)

    boxes, labels, probs = net(img, torch.tensor([[w, h]]).type_as(img))[0]

    # Keep only detections above the probability threshold.
    prob_mask = probs > cfg.out_thruth_thresh
    boxes = boxes[prob_mask]
    labels = labels[prob_mask].long()
    probs = probs[prob_mask]

    draw_box(img_src,
             boxes,
             color='pred',
             text_list=[
                 classes[label] + '[%.3f]' % (prob)
                 for label, prob in zip(labels, probs)
             ])
    show_img(img_src, -1)
def main():
    """Train a WGAN on the non-functional features and optionally save it.

    Training and validation sets are built the same way from ``load_train()``
    and ``load_val()``. The model is sized from the validation attributes.
    """
    options = parse_arguments()

    def build_split(raw):
        # Only the non-functional feature groups are used.
        _, attack_nff, _, benign_nff = split_features(
            raw, selected_attack_class=options.attack)
        attack_attrs, attack_labels = preprocess(
            attack_nff, normalize=options.normalize)
        benign_attrs, benign_labels = preprocess(
            benign_nff, normalize=options.normalize)
        split = (benign_attrs, attack_attrs, benign_labels, attack_labels)
        return split, attack_attrs.shape[1]

    trainingset, _ = build_split(load_train())
    validationset, n_attributes = build_split(load_val())

    model = WGAN(options, n_attributes)
    model.train(trainingset, validationset)

    # save model
    if options.save_model is not None:
        target_dir = os.path.join(options.save_model, options.name)
        os.makedirs(target_dir, exist_ok=True)
        model.save(target_dir)
def branching(filenames, debug = False):
    """Split extraction and map/reduce work over four processes via two forks.

    The first fork splits ``filenames`` between two processes. Each of them
    extracts its lines and forks again to split those lines, so four processes
    (labelled a, b, c, d in the debug output) each reduce their own share.

    Args:
        filenames: file names handed to ``chunkList`` / ``data.extractData``.
        debug: when true, print a banner and a per-process summary.

    Returns:
        None; forked children leave through ``os._exit(0)``.
    """
    if debug:
        print("Data Extraction Split Randomly Over 4 processes:\n a \n / \ \n b c\n /\nd")
    outref = os.fork()
    split1 = chunkList(filenames)
    # Drawn after the fork in both processes; presumably both see the same
    # value because the RNG state is inherited - TODO confirm.
    c = random.choice([0,1])
    if outref == 0:
        # First-level child takes one half of the files.
        message = "\tb"
        fns = split1[c]
    else:
        # Original process takes the other half.
        message = "\ta"
        fns = split1[1 - c]
    lines = [item for sublist in [list(data.extractData(fn)) for fn in fns] for item in sublist]
    split2 = chunkList(lines)
    cc = random.choice([0,1])
    inref = os.fork()
    if inref == 0:
        # Second-level child: "c" under the original process, "d" under "b".
        dlines = split2[cc]
        message = "\tc"
        if outref == 0:
            message = "\td"
    else:
        dlines = split2[1 - cc]
    # Flatten the preprocessed lines, skipping those preprocess() rejected.
    prep = [item for sublist in [data.preprocess(d) for d in dlines] if sublist is not None for item in sublist ]
    mapd = [item for sublist in [list(mapper.map(l)) for l in prep] if len(sublist) > 0 for item in sublist ]
    r = myReducer.reducer()
    for d in mapd:
        r.reduce(d)
    if debug:
        # NOTE(review): ``fn`` is only bound by the list comprehension above,
        # which leaks its variable on Python 2 only; ``iteritems`` is also
        # Python 2 only.
        print(message + "({})".format(os.getpid()) + ": ({}|{})".format(fn,len(list(r.dictionary.iteritems()))))
    # Second-level children exit; their parents wait for them.
    if(inref == 0):
        os._exit(0)
    else:
        os.wait()
    # Same for the first-level fork.
    if(outref == 0):
        os._exit(0)
    else:
        os.wait()
    return
def telemetry(sid, data):
    """Predict a steering angle for the incoming frame and send controls.

    Throttle is 0.2 above a speed of 5 and 1.0 otherwise.
    """
    # Current steering angle, throttle and speed reported by the car.
    steering_angle = data["steering_angle"]
    throttle = data["throttle"]
    speed = data["speed"]

    # Decode the center-camera frame and preprocess it.
    raw_frame = base64.b64decode(data["image"])
    image = Image.open(BytesIO(raw_frame))
    frame = preprocess(np.asarray(image))

    # Add the leading batch axis for the model.
    batch = frame[None, :, :, :]
    steering_angle = float(model.predict(batch, batch_size=1))

    if float(speed) > 5:
        throttle = .2
    else:
        throttle = 1.

    print(steering_angle, throttle)
    send_control(steering_angle, throttle)
def train_model(self):
    """Train on the labelled 10% of MNIST and keep the test split on ``self``.

    Stores ``x_test`` / ``y_test`` and runs ``train_model_supervised`` for
    ``self.args.num_epochs`` epochs.
    """
    proportion_labeled = 0.1
    assert proportion_labeled == 0.1

    mnist = get_mnist_np(root='./data', download=True)
    train_data_np, train_labels_np, test_data_np, test_labels_np = mnist

    splits = preprocess(
        train_data_np=train_data_np,
        train_labels_np=train_labels_np,
        test_data_np=test_data_np,
        test_labels_np=test_labels_np,
        proportion_labeled=proportion_labeled,
    )
    # Only the labelled and test parts are used here.
    labeled_x, _, test_x, labeled_y, _, _, test_y = splits

    self.x_test = test_x
    self.y_test = test_y
    self.train_model_supervised(
        x=labeled_x, y=labeled_y, num_epochs=self.args.num_epochs)
def train_model(self):
    """Alternate supervised and unsupervised steps on MNIST.

    Each iteration runs one labelled step and one unlabelled step weighted by
    ``self.unlabelled_weight_schedule``. Every 100 iterations the losses are
    printed and logged and ``self.test_model`` is called.
    """
    proportion_labeled = 0.1
    assert proportion_labeled == 0.1

    mnist = get_mnist_np(root='./data', download=True)
    train_data_np, train_labels_np, test_data_np, test_labels_np = mnist

    splits = preprocess(
        train_data_np=train_data_np,
        train_labels_np=train_labels_np,
        test_data_np=test_data_np,
        test_labels_np=test_labels_np,
        proportion_labeled=proportion_labeled,
    )
    labeled_x, unlabelled_x, test_x, labeled_y, _, unlabelled_y, test_y = splits

    self.x_test = test_x
    self.y_test = test_y

    progress_template = 'Iteration [%d/%d], Loss: %.6f, Grad Loss: %.8f'
    weight_template = 'Current synthetic gradient weigth is: %.4f'

    for i in range(self.args.num_iterations):
        step = i + 1
        loss, grad_loss = self.train_model_helper(
            x=labeled_x, y=labeled_y, is_supervised=True)
        _, _ = self.train_model_helper(
            x=unlabelled_x,
            y=unlabelled_y,
            is_supervised=False,
            weight=self.unlabelled_weight_schedule(i))

        if step % 100 != 0:
            continue

        # Periodic report: the same text goes to stdout and to the log.
        progress = progress_template % (
            step, self.args.num_iterations, loss.item(), grad_loss.item())
        print(progress)
        logging.info(progress)
        if self.unlabelled_weight_schedule(i) != 0.0:
            print(weight_template % (self.unlabelled_weight_schedule(i)))
            logging.info(weight_template % (self.unlabelled_weight_schedule(i)))
        self.test_model(step)
def singleCore(filenames, debug = False, maxTime = 0):
    """Repeatedly fit an n-gram Markov model and resample, in one process.

    The first round is fitted on the preprocessed file contents and every
    later round on the samples of the previous model. The loop body always
    runs at least once and stops once more than ``maxTime`` seconds have
    passed since the start. Returns None.
    """
    if debug:
        print("One process")
    started = time.time()

    raw_lines = []
    for fn in filenames:
        raw_lines.extend(data.extractData(fn))
    nDatapoints = len(raw_lines)

    # Drop the lines that preprocess() rejects.
    cleaned = []
    for raw in raw_lines:
        result = data.preprocess(raw)
        if result is not None:
            cleaned.append(result)

    split = []
    for line in cleaned:
        split.extend(data.splitify(line))

    while True:
        # Gather every n-gram first, then train a fresh model on them.
        grams = []
        for words in split:
            found = list(markov.nGrams(words))
            if len(found) > 0:
                grams.extend(found)
        mod = markov.markovNGramModel()
        for gram in grams:
            mod.update(gram)

        # The next round trains on the model's own samples.
        samples = [mod.sampleGen() for _ in range(nDatapoints)]
        split = [sample.split(" ") for sample in samples]

        if time.time() - started > maxTime:
            break
def main():
    """Learn and predict.

    Fits the character CNN on the text files under ``data/test`` and writes
    the weights, the training metrics and the predictions for both the test
    and the training inputs to files in the working directory.
    """
    def read_lines(path):
        with open(path) as handle:
            return handle.read().splitlines()

    # read and prepare data
    sources = [
        read_lines('data/test/xtrain.txt'),
        read_lines('data/test/ytrain.txt'),
        read_lines('data/test/xtest.txt'),
    ]
    xtrain, ytrain, xtest, vocab, max_len, n_classes = data.preprocess(*sources)

    # compile model
    model = compiled(char_cnn(len(vocab), max_len, n_classes))

    # tensorflow specific
    callbacks = [TensorBoard(write_images=True)]

    # fit model and log out to tensorboard
    history = fit(model, xtrain, ytrain, callbacks)
    model.save_weights('weights.h5')

    # evaluation
    print(history.history)
    with open('metrics.txt', 'w') as out:
        out.write(json.dumps(history.history, indent=1))

    # predictions for the test inputs, then for the training inputs
    # (the latter for inspection)
    for inputs, target in ((xtest, 'ytest.txt'),
                           (xtrain, 'ytrain.predicted.txt')):
        _, predicted = predict(model, inputs)
        with open(target, 'w') as out:
            out.write('\n'.join(map(str, predicted)))
def extract_into_array(text='Agrimet 15min.csv', imgdir='RADAR DATA/Dataset'):
    """Pair CSV timestamps with preprocessed values and index image folders.

    Returns ``(rows, images, minimum, maximum)``. ``rows`` holds
    ``[datetime, value]`` pairs built from the first CSV column and the
    values returned by ``preprocess``. ``images`` holds
    ``[datetime, dirpath]`` pairs for every directory below ``imgdir``,
    sorted by directory path.
    """
    data15, minimum, maximum = preprocess(text)

    # First CSV column holds the date/time strings.
    stamps = np.genfromtxt(text, delimiter=',', dtype='string')[:, 0]
    rows = []
    for stamp, value in zip(stamps, data15):
        moment = datetime.datetime.strptime(stamp, '%m/%d/%Y %H:%M')
        rows.append([moment, value])

    # One entry per image folder; the date is sliced out of the folder name.
    images = []
    for walked in os.walk(imgdir):
        dirpath = walked[0]
        if dirpath == imgdir:
            continue
        folder = dirpath.split('/')[-1]
        day = datetime.datetime.strptime(folder[21:29], '%Y%m%d')
        images.append([day, dirpath])
    images.sort(key=lambda pair: pair[1])

    return rows, images, minimum, maximum
parser.add_argument('--eval_batch_size', type=int, default=32, metavar='N', help='eval batch size') parser.add_argument('--seed', type=int, default=1234, help='set random seed') parser.add_argument('--cuda', action='store_true', help='use CUDA device') parser.add_argument('--gpu_id', type=int, help='GPU device id used') args = parser.parse_args() if args.model_type == 'baseline': # data preprocess and prepare data_path = './data/dev.txt' split_ratio = 0.3 preprocess(data_path, split_ratio) # dataset load and plot train_dataset = EmojiDataset('./data/Xtrain.npy', './data/ytrain.npy') plotdata(np.load('./data/Xtrain.npy', allow_pickle=True), np.load('./data/ytrain.npy', allow_pickle=True)) test_dataset = EmojiDataset('./data/Xtest.npy', './data/ytest.npy') train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=False, collate_fn=collate_fn) test_dataloader = DataLoader(test_dataset, batch_size=args.eval_batch_size, shuffle=False, collate_fn=collate_fn)
if __name__ == "__main__": args = parse_args() # preprocess and get word dict if args.task == 'task3': data, label, vocab = preprocess_3label(task=args.task, lang=args.lang) # data = [train_pos, train_neg, dev_pos, dev_neg, test_pos, test_neg] train_pos, train_neg, train_neutral, dev_pos, dev_neg, dev_neutral, _, _, _ = data # build datasets trainset = ThreeLabelDataset(train_pos, train_neg, train_neutral, vocab, args.max_seq_length) valset = ThreeLabelDataset(dev_pos, dev_neg, dev_neutral, vocab, args.max_seq_length) else: data, label, vocab = preprocess(task=args.task, lang=args.lang) # data = [train_pos, train_neg, dev_pos, dev_neg, test_pos, test_neg] train_pos, train_neg, dev_pos, dev_neg, _, _ = data # build datasets trainset = TFDataset(train_pos, train_neg, vocab, args.max_seq_length) valset = TFDataset(dev_pos, dev_neg, vocab, args.max_seq_length) args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') ae_model = NewTransformer(vocab.size, device=args.device).to(args.device) cls_model = Classifier(latent_size=args.latent_size, output_size=args.label_size).to(args.device) args.vocab = vocab trainer = Trainer(trainset, valset, ae_model, cls_model, args)
def cascadeMarkovSameProcess(filenames, debug = False, maxIterations = -1, maxTime = 0):
    """Run four Markov-model processes that pass generated data round a ring.

    Two forks create four processes, each pinned to a chunk of the CPUs. Each
    process preprocesses its share of the data and puts it on the queue of
    the next process in the ring. It then repeatedly builds a model from what
    arrives on its own queue and forwards 100 samples, until the iteration or
    time limit is reached and ``None`` is passed on.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file.

    Args:
        filenames: file names handed to ``chunkList`` / ``data.extractData``.
        debug: when true, print progress information.
        maxIterations: iteration budget; negative values disable the counter.
        maxTime: time budget in seconds, measured from after the initial data
            has been queued.

    Returns:
        None; forked children leave through ``os._exit(0)``.
    """
    # One chunk of CPU ids per process.
    procs = listChunks(range(psutil.cpu_count()), 4)
    q = 0  # index of this process's own queue in ``dataq``
    if debug:
        print("4 markov models passing each other generated data in a cycle, then rebuilding on the new data:\n a \n / \ \n b c\n /\nd".format(maxTime))
    dataq = [multiprocessing.Queue() for _ in range(4)]
    outref = os.fork()
    split1 = chunkList(filenames)
    # Drawn after the fork in both processes; presumably both see the same
    # value because the RNG state is inherited - TODO confirm.
    c = random.choice([0,1])
    if outref == 0:
        message = "\tb"
        fns = split1[c]
        op = psutil.Process(os.getpid())
        op.cpu_affinity(procs[0])
        q = 0
        # op.nice(-10)
    else:
        message = "\ta"
        fns = split1[1 - c]
        ip = psutil.Process(os.getpid())
        q = 1
        ip.cpu_affinity(procs[1])
        # ip.nice(-10)
    lines = [item for sublist in [list(data.extractData(fn)) for fn in fns] for item in sublist]
    split2 = chunkList(lines)
    cc = random.choice([0,1])
    inref = os.fork()
    if inref == 0:
        # Second-level child: "c" by default, "d" when its parent is "b".
        oip = psutil.Process(os.getpid())
        oip.cpu_affinity(procs[2])
        q = 2
        # oip.nice(-10)
        dlines = split2[cc]
        message = "\tc"
        if outref == 0:
            q = 3
            iip = psutil.Process(os.getpid())
            iip.cpu_affinity(procs[3])
            message = "\td"
    else:
        dlines = split2[1 - cc]
    ic = maxIterations
    stopCondition = False;
    nDatapoints = len(dlines)
    # Drop the lines that preprocess() rejects, then split into word lists.
    prep = [l for l in [data.preprocess(d) for d in dlines] if l is not None]
    split = [item for sublist in [list(data.splitify(line)) for line in prep] for item in sublist]
    # Seed the next process in the ring with this process's real data.
    dataq[(q + 1) % 4].put(split)
    time.sleep(3)
    initialT = time.time()
    if debug:
        print("Data preprocessing ({}) complete, starting timer".format(q))
    # Consume batches from this process's queue until a None arrives.
    for toProcess in iter(dataq[q].get, None):
        nGrams = [item for sublist in [list(markov.nGrams(l)) for l in toProcess] if len(sublist) > 0 for item in sublist ]
        mod = markov.markovNGramModel()
        for d in nGrams:
            mod.update(d)
        dlines = [mod.sampleGen() for _ in range(100)]
        split = [line.split(" ") for line in dlines]
        if(stopCondition):
            # Budget exhausted: tell the next process to stop.
            time.sleep(0.1)
            dataq[(q + 1) % 4].put(None)
        else:
            if(debug):
                print("{}: Sample: {}".format(q,dlines[0]))
            dataq[(q + 1) % 4].put([line.split(" ") for line in dlines])
        if(maxIterations >= 0):
            ic -= 1
        if(ic == 0 or time.time() - initialT > maxTime):
            stopCondition = True
    time.sleep(0.1)
    dataq[(q + 1) % 4].put(None)
    if debug:
        # NOTE(review): ``fn`` is only bound by the list comprehension above
        # (leaks on Python 2 only), ``iteritems`` is Python 2 only, and
        # ``mod`` is unbound if the loop above never ran.
        print(message + "({})".format(os.getpid()) + ": ({}|{})".format(fn,len(list(mod.model.iteritems()))))
    # Second-level children exit; their parents wait for them.
    if(inref == 0):
        os._exit(0)
    else:
        os.wait()
    if(outref == 0):
        os._exit(0)
    return
def preprocessing(self, text_str):
    """Convert ``text_str`` into a list of word indices.

    The string goes through ``data.preprocess``, is split with
    ``text.text_to_word_sequence`` and every word is mapped with
    ``self.word_to_index``.
    """
    cleaned = data.preprocess(text_str)
    words = text.text_to_word_sequence(cleaned)
    return [self.word_to_index(word) for word in words]
node_output = mygraph.get_tensor_by_name('prefix/'+outputs_name+':0') with tf.Session(graph=mygraph) as sess: #simulate network with some data if(0): # Test if localizing from random data works works randomstorm = np.random.randn(batch_size,256,256,1) randomstorm = randomstorm* (randomstorm > 0.9) for i_image in range(0, batch_size): randomstorm[i_image,:,:,:] = gaussian_filter(randomstorm[i_image,:,:,:], sigma=9) # randomstorm = randomstorm-np.min(randomstorm) randomstorm = randomstorm/np.max(randomstorm) randomstorm = data.preprocess(randomstorm) elif(0): # Test if localizing from a TIFF works mytiffile_name = 'test_if_it_works.tif' import tifffile as tif import scipy.misc randomstorm = np.zeros((batch_size,256,256)) for i_image in range(0, batch_size): myframe = tif.imread(mytiffile_name, key=i_image) myframe = myframe/np.max(myframe) myframe = data.preprocess(myframe) # resize to scale_size myframe = scipy.misc.imresize(myframe, size = (256, 256), interp='bilinear', mode='F')
# Remaining command-line options, then the training run itself.
parser.add_argument('--num-workers', type=int, default=0, metavar='W',
                    help='How many subprocesses to use for data loading (default: 0)')
parser.add_argument('--epochs', type=int, default=100, metavar='N',
                    help='Number of epochs to train (default: 100)')
parser.add_argument('--patience', type=int, default=10, metavar='P',
                    help='Number of epochs with no improvement after which training will be stopped (default: 10)')
parser.add_argument('--lr', type=float, default=0.0001, metavar='LR',
                    help='Learning rate (default: 0.0001)')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='Random seed (default: 1)')
parser.add_argument('--checkpoint', type=str, default='model.pt', metavar='M',
                    help='checkpoint file name (default: model.pt)')
args = parser.parse_args()

# Seed torch's random number generator from --seed.
torch.manual_seed(args.seed)

# Data Initialization and Loading
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# preprocess() is called for its side effects only; its return value is unused.
preprocess(args.data)
train_loader, valid_loader = get_train_loaders(
    args.data, device, args.batch_size, args.num_workers, args.class_count)

# Neural Network and Optimizer
model = TrafficSignNet().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=args.lr)

# Training and Validation
fit(args.epochs, model, criterion, optimizer,
    train_loader, valid_loader, args.patience, args.checkpoint)
def validate_dataset(csv_path):
    """Load the CSV at ``csv_path`` and return it as an ``(X, Y)`` tuple.

    The file goes through ``read_csv``, ``preprocess`` and
    ``convert_to_input``, in that order.
    """
    X_valid, Y_valid = convert_to_input(preprocess(read_csv(csv_path)))
    return X_valid, Y_valid
def main():
    """Predict skeleton keypoints for an audio file, then plot and save them.

    Parses the command line, loads the pretrained checkpoint, runs
    ``MovementNet`` on the preprocessed audio and de-normalizes the output
    with the keypoint statistics stored in the checkpoint. The result is
    plotted to ``--plot_path`` and pickled to ``--output_path``.
    """
    # Parser
    parser = parse()
    parser.add_argument('--inference_audio', type=str, default='inference.wav',
                        help='the path of input wav file', required=True)
    parser.add_argument('--plot_path', type=str, default='inference.mp4',
                        help='plot skeleton and add audio')
    parser.add_argument('--output_path', type=str, default='inference.pkl',
                        help='save skeletal data')
    args = parser.parse_args()

    # Device
    if torch.cuda.is_available():
        os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids

    # Load pretrain model
    downloader = Download()
    downloader.pretrain_model()
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    checkpoint = torch.load(downloader.pretrain_model_dst, map_location=device)
    keypoints_mean = checkpoint['keypoints_mean']
    keypoints_std = checkpoint['keypoints_std']
    aud_mean = checkpoint['aud_mean']
    aud_std = checkpoint['aud_std']

    # Audio pre-processing
    aud = preprocess(args.inference_audio, aud_mean, aud_std)

    # Model
    movement_net = MovementNet(
        args.d_input, args.d_output_body, args.d_output_rh, args.d_model,
        args.n_block, args.n_unet, args.n_attn, args.n_head, args.max_len,
        args.dropout, args.pre_lnorm, args.attn_type)
    movement_net = movement_net.to(device)
    movement_net.load_state_dict(checkpoint['model_state_dict']['movement_net'])
    movement_net.eval()

    with torch.no_grad():
        print('inference...')
        X_test = torch.tensor(aud, dtype=torch.float32).to(device).unsqueeze(0)
        # Batch of one: the sequence length is the size of axis 1.
        lengths = torch.tensor(X_test.size(1)).to(device).unsqueeze(0)
        full_output = movement_net.forward(X_test, lengths)
        pred = full_output.squeeze(0).data.cpu().numpy()

    # Transform keypoints to world coordinate
    pred = pred * keypoints_std + keypoints_mean
    pred = np.reshape(pred, [len(pred), -1, 3])

    plot(args.inference_audio, args.plot_path, pred)
    with open(args.output_path, 'wb') as out:
        pickle.dump(pred, out)
import sys
import config as cfg
import data

## Inputs:
#   PathToData: Training Data
#   Num:        Number of training samples
#   StartNum:   Start processing from this sample number
## Outputs:
#   voxels_preprocessed.vtu

dataPath = cfg.Data_path_ps
num = cfg.num_simulations_ps
startNum = cfg.startNum_simulations_ps + 1

# Count the samples for which data.preprocess reports success.
valid = 0
for i in range(startNum, startNum + num):
    print("{}/{}".format(i, num))
    converted = data.preprocess(dataPath, i)
    if converted:
        valid += 1

print("Converted {:d} samples.".format(valid))
def preprocess_step(data_path, preprocess_cache):
    """Run ``data.preprocess`` on ``data_path`` and return its result.

    ``data`` is imported inside the function, so it is only loaded when the
    step actually runs.
    """
    from data import preprocess

    return preprocess(data_path, preprocess_cache)
def cascadeMarkovMapReduce(filenames, debug = False, maxIterations = -1, maxTime = 0):
    """Run a map -> reduce -> markov -> sample loop across four processes.

    The calling process maps the real data lines first, then keeps mapping
    the samples that come back from the pipeline. Three forked workers
    reduce, model and select samples, linked by ``multiprocessing.Queue``
    objects with ``None`` as the end-of-stream sentinel.

    NOTE(review): the nesting below, especially in the two ``while`` loops of
    the parent branch, was reconstructed from a whitespace-collapsed source;
    confirm it against the original file.

    Args:
        filenames: file names handed to ``data.extractData``.
        debug: when true, print progress information.
        maxIterations: not read anywhere in this function.
        maxTime: time budget in seconds, compared with the time elapsed since
            the data was queued.

    Returns:
        None.
    """
    # One chunk of CPU ids per process.
    procs = listChunks(range(psutil.cpu_count()), 4)
    if debug:
        print("System of 4 processes with queues. map->reduce->markov->sample, running for {} seconds".format(maxTime))
    #Initial Setup: Get the data from the files and split it up
    lines = [item for sublist in [list(data.extractData(fn)) for fn in filenames] for item in sublist]
    prep = [l for l in [data.preprocess(d) for d in lines] if l is not None]
    datalines = [" ".join(item) for sublist in [list(data.splitify(line)) for line in prep] for item in sublist]
    initialSize = len(datalines)
    finalDict = {}
    dataq = multiprocessing.Queue()
    markovq = multiprocessing.Queue()
    selectq = multiprocessing.Queue()
    redq = multiprocessing.Queue()
    sampleq = multiprocessing.Queue()
    for d in datalines:
        dataq.put(d)
    initialT = time.time()
    redProc = os.fork()
    stopCondition = False;
    if redProc == 0:
        # Reducer process: reduce every mapped batch and pass it on.
        rp = psutil.Process(os.getpid())
        rp.cpu_affinity(procs[0])
        # rp.nice(-10)
        red = myReducer.reducer()
        for toProcess in iter(redq.get, None):
            val = [red.onlineReduce(m) for m in toProcess]
            markovq.put(val)
        markovq.put(None)
        time.sleep(0.3)
        os._exit(0)
    else:
        markovProc = os.fork()
        if(markovProc == 0):
            # Markov process: update the model, then sample from each word.
            mp = psutil.Process(os.getpid())
            mp.cpu_affinity(procs[1])
            # mp.nice(-10)
            mod = markov.markovNGramModel()
            for toModel in iter(markovq.get, None):
                for ng in markov.nGrams([w for w,_ in toModel]):
                    mod.update(ng)
                scores = {word : score for word,score in toModel}
                samples = [mod.sampleGen(w,) for w,_ in toModel]
                selectq.put((samples,scores))
            selectq.put(None)
            time.sleep(0.2)
            os._exit(0)
        else:
            selectProc = os.fork()
            if(selectProc == 0):
                # Selection process: score the samples and forward a random
                # slice of the sorted list.
                sp = psutil.Process(os.getpid())
                # sp.nice(-10)
                r = myReducer.reducer()
                for toScore in iter(selectq.get, None):
                    samples = toScore[0]
                    scores = toScore[1]
                    sampleScores = []
                    # NOTE(review): the inner loop iterates ``samples`` rather
                    # than ``s``, so every sample gets the same total -
                    # confirm whether that is intended.
                    for s in [w for w in samples]:
                        total = 0
                        for w in samples:
                            if w in scores:
                                total += scores[w]
                        sampleScores.append(total)
                    scoredSamples = sorted(zip(samples,sampleScores),
                                           key=lambda t: t[1])
                    # Slice end is +num or -num, chosen at random.
                    coin = random.choice([1,-1])
                    num = random.choice(range(len(samples)))
                    for winner,score in scoredSamples[:coin*num]:
                        sampleq.put(winner)
                sampleq.put(None)
                time.sleep(0.1)
                os._exit(0)
            else:
                # Parent process: map the real data, then the samples.
                dp = psutil.Process(os.getpid())
                # dp.nice(-10)
                count = 0
                t = 0
                # Phase 1: map each of the queued real data lines.
                while(count < initialSize):
                    count += 1
                    toProcess = dataq.get()
                    maps = [item for item in mapper.map(toProcess)]
                    redq.put(maps)
                    t = time.time() - initialT
                    if(toProcess is None):
                        stopCondition = True
                if(debug):
                    print("{} examples of real data processed in {} seconds".format(count, t))
                tick = 0
                # Phase 2: keep mapping samples until the time budget is
                # exceeded or a None sample arrives.
                while(not stopCondition):
                    if(debug):
                        count += 1
                    t = time.time() - initialT
                    if(tick < t // 1):
                        tick = t // 1
                        print("Sample at {} seconds: {}".format(t, toProcess))
                    if(toProcess is None):
                        stopCondition = True
                    if(t > maxTime):
                        stopCondition = True
                    toProcess = sampleq.get()
                    maps = [item for item in mapper.map(toProcess)]
                    redq.put(maps)
                # Tell the reducer process to stop.
                redq.put(None)
                if debug:
                    print("Last Sample: {}".format(toProcess))
                    print("{} examples used, {} samples generated".format(initialSize, count))
                # NOTE(review): a single os.wait() reaps only one of the
                # three forked children.
                os.wait()
                return
:param hour: input pandas dataframe """ #split data into test and train datatrain = hour[hour["train"] == 1] datatest = hour[hour["train"]!=1] logging.info('Split data into test and train.') #log transform reponse variable "cnt" - bike count y = datatrain["cnt"] ylabelslog = np.log1p(y) X=datatrain.drop(["cnt", "train"], 1) logging.info('Applied log transformation to response variable bike count.') #train random forest model rfmodel = RandomForestRegressor(n_estimators=100) rfmodel.fit(X, ylabelslog) logging.info('Trained a random forest model.') #create pickle file model_name = 'rf.pkl' model_pkl = open(model_name, 'wb') pickle.dump(rfmodel, model_pkl) model_pkl.close() logging.info('Saved model in a pkl file.') if __name__ == "__main__": rfmodel(data.preprocess("data","hour.csv"))
import tensorflow as tf import numpy as np import data import network from sklearn.model_selection import train_test_split from sklearn.utils import shuffle rate = 0.01 batch_size = 1000 train_step = 10000 filepath = "./data/train.csv" data_load = data.load_data(filepath) features, labels = data.preprocess(data_load) labels = np.reshape(labels, (-1, 1)) # features_train,features_validate,labels_train,labels_validate = train_test_split(features,labels,test_size=0.3) x = tf.placeholder(tf.float32, [None, 57], name="x") y = tf.placeholder(tf.float32, [None, 1]) pred = network.network(x) loss = tf.reduce_mean(tf.abs(y - pred)) train_opration = tf.train.AdamOptimizer(rate).minimize(loss) saver = tf.train.Saver() with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for i in range(train_step): features_train, labels_train = shuffle(features, labels) sess.run(train_opration, feed_dict={