def main():
    dataset = Dataset()
    client = Client(Config.IOT_ENDPOINT, Config.ROOT_CA_PATH,
                    Config.PRIVATE_KEY_PATH, Config.CERT_PATH)
    client.connect()
    # Publish a random reading at QoS 0, then wait before the next message.
    while True:
        client.publish(Config.TOPIC_PREFIX + Config.DEVICE_NAME,
                       dataset.get_random(), 0)
        time.sleep(Config.MESSAGE_FREQUENCY)
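# The Config object above is imported from elsewhere in the project. A
# minimal sketch of the fields this publisher relies on -- every value
# below is an illustrative assumption, not the project's real settings:
class Config:
    IOT_ENDPOINT = "example-ats.iot.us-east-1.amazonaws.com"  # hypothetical endpoint
    ROOT_CA_PATH = "certs/root-ca.pem"
    PRIVATE_KEY_PATH = "certs/private.pem.key"
    CERT_PATH = "certs/certificate.pem.crt"
    TOPIC_PREFIX = "devices/"
    DEVICE_NAME = "sensor-01"
    MESSAGE_FREQUENCY = 5  # seconds to sleep between publishes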
def main():
    create_dir_if_not_exists('predicted_maps/')

    # Arguments parsing
    global args
    args = parser.parse_args()
    cuda = args.cuda == 'true' and torch.cuda.is_available()
    device_id = args.device_id

    # torch.save(model, ...) stored the whole module, so torch.load restores
    # it directly; move it to the GPU afterwards if requested.
    model = torch.load('model/raptorx_model')
    if cuda:
        model.cuda(device_id)
    model.eval()

    # Load dataset
    dataset = Dataset()

    # Test on the test split
    accuracy_infos = {}
    for _ in range(dataset.test.size):
        _id, _1d_feature, _2d_feature, contact_map = dataset.next_target(
            TEST_MODE, cuda, device_id)
        print('Testing on target {}'.format(_id))

        # Apply the trained model
        network_outputs = model(_1d_feature, _2d_feature)
        network_outputs = torch.softmax(network_outputs, -1)

        contact_map = contact_map.cpu().data.numpy().squeeze()
        network_outputs = network_outputs.cpu().data.numpy().squeeze()
        accuracy_info = evaluate(contact_map, network_outputs)
        accuracy_infos[_id] = accuracy_info
        print(accuracy_info)
        np.save('predicted_maps/{}.npy'.format(_id), network_outputs[:, :, 1])

    # Save accuracy_infos and compute the final averaged accuracy scores
    with open('predicted_maps/accuracy_infos.json', 'w') as outfile:
        json.dump(accuracy_infos, outfile)
    print('\n')
    for r in [10, 5, 2, 1]:
        for contact_type in ['short', 'medium', 'long']:
            top_l_r = 'L/{}'.format(r)
            score = np.average(
                [info[top_l_r][contact_type] for info in accuracy_infos.values()])
            print('For {} and {}-range contacts: {}'.format(
                top_l_r, contact_type, score))
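# create_dir_if_not_exists is a small project helper used by the training
# and testing scripts. A minimal sketch of the assumed behaviour, using
# only the standard library:
import os

def create_dir_if_not_exists(path):
    """Create *path* (and parents) if it does not already exist."""
    os.makedirs(path, exist_ok=True)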
def main(args):
    # Read (and optionally truncate) the training and validation data.
    train_data = Dataset.from_path(args.train_path)
    if args.max_train_chunks is not None:
        size = args.max_train_chunks * args.chunk_size
        train_data.truncate_seqs(size)
    valid_data = Dataset.from_path(args.valid_path)
    if args.max_valid_chunks is not None:
        size = args.max_valid_chunks * args.chunk_size
        valid_data.truncate_seqs(size, keep_first=True)

    num_users = train_data.num_users
    num_items = train_data.num_items
    tot_size = train_data.num_triplets + valid_data.num_triplets

    train_data.prepare_batches(args.chunk_size, args.batch_size)
    valid_data.prepare_batches(args.chunk_size, args.batch_size,
                               batches_like=train_data)

    settings = {
        "chunk_size": args.chunk_size,
        "batch_size": args.batch_size,
        "hidden_size": args.hidden_size,
        "learning_rate": args.learning_rate,
        "rho": args.rho,
    }

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_normal_initializer(
            mean=0, stddev=1 / sqrt(args.hidden_size))
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            train_model = CollaborativeRNN(num_users, num_items,
                                           is_training=True, **settings)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            valid_model = CollaborativeRNN(num_users, num_items,
                                           is_training=False, **settings)
        tf.global_variables_initializer().run()
        session.run(train_model.rms_reset)

        for i in range(1, args.num_epochs + 1):
            order = np.random.permutation(train_data.num_batches)
            train_iter = train_data.iter_batches(order=order)
            valid_iter = valid_data.iter_batches(order=order)
            train_err, valid_err = run_epoch(session, train_model, valid_model,
                                             train_iter, valid_iter, tot_size)
            print("Epoch {}, train log-loss: {:.3f}".format(i, train_err))
            print("Epoch {}, valid log-loss: {:.3f}".format(i, valid_err))
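# main(args) expects an argparse namespace. The parser is defined elsewhere
# in the project; a sketch of the arguments main() actually reads (flag
# names and defaults are assumptions inferred from the attribute accesses
# above, not the project's real CLI):
import argparse

def build_parser():
    parser = argparse.ArgumentParser(description="Train a collaborative RNN.")
    parser.add_argument("--train-path", required=True)
    parser.add_argument("--valid-path", required=True)
    parser.add_argument("--max-train-chunks", type=int, default=None)
    parser.add_argument("--max-valid-chunks", type=int, default=None)
    parser.add_argument("--chunk-size", type=int, default=64)
    parser.add_argument("--batch-size", type=int, default=16)
    parser.add_argument("--hidden-size", type=int, default=128)
    parser.add_argument("--learning-rate", type=float, default=0.1)
    parser.add_argument("--rho", type=float, default=0.9)
    parser.add_argument("--num-epochs", type=int, default=10)
    return parser

# main(build_parser().parse_args())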
def main():
    # Arguments parsing
    global args
    args = parser.parse_args()
    cuda = args.cuda == 'true' and torch.cuda.is_available()
    device_id = args.device_id
    save_path = 'model/raptorx_model'
    batch_size = 1
    iterations = args.iterations
    learning_rate = args.learning_rate
    log_interval = args.log_interval

    # Load dataset
    dataset = Dataset()

    # Load model
    model = RaptorXModel(feature_1d_dim=24, feature_2d_dim=3)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    if cuda:
        model.cuda(device_id)
    criterion = nn.CrossEntropyLoss()

    # Start training
    for itx in range(iterations):
        model.train()
        model.zero_grad()
        _, _1d_feature, _2d_feature, contact_map = dataset.next_target(
            TRAIN_MODE, cuda, device_id)
        network_outputs = model(_1d_feature, _2d_feature)
        loss = criterion(network_outputs.view(-1, 2), contact_map.view(-1))
        loss.backward()
        optimizer.step()
        print('loss = {}'.format(loss.item()))

        if itx > 0 and itx % log_interval == 0:
            create_dir_if_not_exists('model')
            torch.save(model, save_path)
            print('Saved the model')
def test_emitters():
    assert next(emit_rows(2015)) == [
        2015, '20160624', 28, 11, 0,
        'Открытое акционерное общество', 'Энерготехмаш', '34',
        '3435900517', '00110467', '10000', '16', '384',
        '23616', '47666', '124323', '171989', '223076', '33574',
        '-250123', '227579', '194533', '171989', '39311', '-49052',
        '29000', '-229430', '27492', '42114', '347639', '389753',
        '-32497', '223076', '233014', '4806', '189236', '389753',
        '335342', '-30226', '25572', '-62270', '26572', '16333',
        '3123', '23721', '0', '0', '0', '0'
    ]
    assert next(emit_rows(2015)) == Dataset(2015).nth(0)
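# The second assert above pins down the contract: Dataset(year).nth(i) must
# return the i-th row that emit_rows(year) yields. A minimal free-standing
# sketch of that contract (an assumption, not the project's implementation):
from itertools import islice

def nth_row(year, i):
    """Return the i-th (0-based) row emitted for *year*."""
    return next(islice(emit_rows(year), i, None))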
def read_dataset(year):
    df = Dataset(year).read_df()

    @print_elapsed_time
    def _drop_tabooed(df):
        # Filter out firms with small total assets, then drop known bad records.
        df = df[df.ta > 100]
        tabooed = [
            1001096,  # 'Научно-Производственный Финансовый Концерн "ИНТЭКОТЕРРА"'
            1002168,  # Gazprom?
        ]
        return df.drop(tabooed)

    return _drop_tabooed(df)
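# @print_elapsed_time is defined elsewhere in the project; a minimal sketch
# of what such a timing decorator is assumed to do:
import functools
import time

def print_elapsed_time(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.monotonic()
        result = func(*args, **kwargs)
        print("{} took {:.2f}s".format(func.__name__, time.monotonic() - start))
        return result
    return wrapper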
def main():
    # Arguments parsing
    global args
    args = parser.parse_args()
    cuda = args.cuda == 'true' and torch.cuda.is_available()
    device_id = args.device_id
    input_types = args.input_types
    output_types = args.output_types
    model_type = args.model_type

    # Sanity checks
    for input_type in input_types:
        assert input_type in SUPPORTED_INPUT_TYPES
    for output_type in output_types:
        assert output_type in SUPPORTED_OUTPUT_TYPES
    assert model_type in SUPPORTED_MODELS

    # The save path encodes the model type and its input types.
    save_path = 'model/{}'.format(model_type)
    for input_type in input_types:
        save_path = save_path + '_' + input_type.lower()

    iterations = args.iterations
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    log_interval = args.log_interval

    # Load dataset
    dataset = Dataset(input_types, output_types)
    print('Loaded dataset')

    # Load model
    model = _initialize_model(model_type, dataset.input_dim, dataset.output_dim)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    if cuda:
        model.cuda(device_id)

    # Start training
    best_dev_avg_mae = None
    for itx in range(iterations):
        model.train()
        model.zero_grad()
        _, lengths, inputs, outputs, masks, _ = dataset.next_batch(
            batch_size, TRAIN_MODE)
        if cuda:
            inputs = inputs.cuda(device_id)
            outputs = outputs.cuda(device_id)
            masks = masks.cuda(device_id)
        network_outputs = model(inputs, lengths)
        l2_loss = get_l2loss_from_predictions(network_outputs, outputs, masks)
        l2_loss.backward()
        optimizer.step()

        if itx % log_interval == 0:
            model.eval()
            dev_maes = evaluate(model, dataset, DEV_MODE, len(output_types),
                                cuda, device_id)
            test_maes = evaluate(model, dataset, TEST_MODE, len(output_types),
                                 cuda, device_id)
            print("---------------------")
            print("iters:", itx)
            print("Dev MAEs:", dev_maes)
            print("Test MAEs:", test_maes)
            dev_avg_mae = np.mean(dev_maes)
            if best_dev_avg_mae is None or dev_avg_mae < best_dev_avg_mae:
                # Keep the checkpoint with the best dev MAE so far.
                create_dir_if_not_exists('model')
                torch.save(model, save_path)
                print('Saved the model')
                best_dev_avg_mae = dev_avg_mae
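# get_l2loss_from_predictions is defined elsewhere in the project. Given that
# the masks zero out padded positions, a masked mean-squared-error consistent
# with the call above might look like this (a sketch under that assumption,
# not the project's exact implementation):
def masked_l2_loss(predictions, targets, masks):
    # Zero out padded positions, then average over the valid ones only.
    diff = (predictions - targets) * masks
    return (diff ** 2).sum() / masks.sum().clamp(min=1)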
def get_df(year):
    # 'var' is a module-level list of column names defined elsewhere in this file.
    return Dataset(year).read_df().nlargest(200, 'ta')[var].sort_values('ta')
def create_pyreader(args, file_name, feed_list, place, model='lac',
                    reader=None, return_reader=False, mode='train'):
    # Init reader
    pyreader = fluid.io.PyReader(feed_list=feed_list,
                                 capacity=300,
                                 use_double_buffer=True,
                                 iterable=True)

    if model == 'lac':
        if reader is None:
            reader = Dataset(args)
        # Create the LAC pyreader.
        if mode == 'train':
            pyreader.decorate_sample_list_generator(
                paddle.batch(
                    paddle.reader.shuffle(
                        reader.file_reader(file_name),
                        buf_size=args.traindata_shuffle_buffer),
                    batch_size=args.batch_size),
                places=place)
        else:
            pyreader.decorate_sample_list_generator(
                paddle.batch(reader.file_reader(file_name, mode=mode),
                             batch_size=args.batch_size),
                places=place)

    elif model == 'ernie':
        # Create the ERNIE pyreader.
        if reader is None:
            reader = task_reader.SequenceLabelReader(
                vocab_path=args.vocab_path,
                label_map_config=args.label_map_config,
                max_seq_len=args.max_seq_len,
                do_lower_case=args.do_lower_case,
                in_tokens=False,
                random_seed=args.random_seed)
        if mode == 'train':
            pyreader.decorate_batch_generator(
                reader.data_generator(file_name, args.batch_size, args.epoch,
                                      shuffle=True, phase="train"),
                places=place)
        else:
            pyreader.decorate_batch_generator(
                reader.data_generator(file_name, args.batch_size, epoch=1,
                                      shuffle=False, phase=mode),
                places=place)

    if return_reader:
        return pyreader, reader
    return pyreader
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 27 15:36:24 2016

@author: Евгений
"""
from reader import Dataset

df = Dataset(2015).read_df()
df0 = Dataset(2013).read_df()

df.nlargest(20, 'ta')[['title', 'inn', 'unit']]  # suspicious

df[(df.tp_cap == df.tp) & (df.ta == df.ta_fix)][['title', 'ta']].sort_values('ta')

# Other suspicious INNs
bad_inns = ['7702844336', '7710244903', '7707089648', '7707322083']
df[df.inn.isin(bad_inns)].transpose()

# 7707089648
# http://kommersant.ru/doc/125895
def create_pyreader(args, file_name, feed_list, place, model='lac',
                    reader=None, return_reader=False, mode='train'):
    # Init reader; the global batch size is split evenly across devices.
    device_count = len(fluid.cuda_places()) if args.use_cuda else len(
        fluid.cpu_places())

    if model == 'lac':
        pyreader = fluid.io.DataLoader.from_generator(feed_list=feed_list,
                                                      capacity=50,
                                                      use_double_buffer=True,
                                                      iterable=True)
        if reader is None:
            reader = Dataset(args)
        # Create the LAC data loader.
        if mode == 'train':
            pyreader.set_sample_list_generator(
                fluid.io.batch(
                    fluid.io.shuffle(
                        reader.file_reader(file_name),
                        buf_size=args.traindata_shuffle_buffer),
                    batch_size=args.batch_size // device_count),
                places=place)
        else:
            pyreader.set_sample_list_generator(
                fluid.io.batch(reader.file_reader(file_name, mode=mode),
                               batch_size=args.batch_size // device_count),
                places=place)

    elif model == 'ernie':
        # Create the ERNIE data loader.
        pyreader = fluid.io.DataLoader.from_generator(feed_list=feed_list,
                                                      capacity=50,
                                                      use_double_buffer=True,
                                                      iterable=True)
        if reader is None:
            reader = SequenceLabelReader(
                vocab_path=args.vocab_path,
                label_map_config=args.label_map_config,
                max_seq_len=args.max_seq_len,
                do_lower_case=args.do_lower_case,
                random_seed=args.random_seed)
        if mode == 'train':
            pyreader.set_batch_generator(
                reader.data_generator(file_name, args.batch_size, args.epoch,
                                      shuffle=True, phase="train"),
                places=place)
        else:
            pyreader.set_batch_generator(
                reader.data_generator(file_name, args.batch_size, epoch=1,
                                      shuffle=False, phase=mode),
                places=place)

    if return_reader:
        return pyreader, reader
    return pyreader
def subset2(df):
    year = df.year.loc[1]
    print("Firms with sales > 1 bln rub")
    BLN = 10 ** 3  # df is already in RUB million
    bln = df[df.sales > BLN]
    fn = make_path_for_user_output(year, "bln")
    to_csv(bln, fn)
    fn = make_path_for_user_output(year, 'xl_bln', ext=".xlsx")
    bln.to_excel(fn, *FMT)
    print("Saved:", fn)


if __name__ == '__main__':
    df = Dataset(2015).read_df()

# import matplotlib.pyplot as plt
# import matplotlib.ticker as ticker
# import numpy as np
# df = get_df(2013)
# t = 200  # mln
# # z = df[(df.ta < t) & (df.sales < t) & (df.ta > 0) & (df.sales >= 0)][['ta', 'sales']]
# z = df[(df.ta > 0) & (df.sales >= 0)][['ta', 'sales']]
# z['ta_log'] = z.ta.apply(lambda x: np.log10(x))
# df.nlargest(100, 'ta', keep='first')[['inn', 'title']].to_csv("inn.txt", index=False)
#
# plt.figure()
# ax = z.ta_log.hist(bins=10, cumulative=True, edgecolor='none')
# ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: ('%.0f') % (y * 1e-3)))
# ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: ('10^%.0f') % x))
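# make_path_for_user_output and FMT are project helpers defined elsewhere; a
# hypothetical sketch of the path helper, consistent with how subset2 calls
# it above (the "output" directory name is an assumption):
import os

def make_path_for_user_output(year, tag, ext=".csv"):
    return os.path.join("output", "{}_{}{}".format(year, tag, ext))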
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 27 14:26:25 2016

@author: Евгений
"""
from reader import Dataset

df15 = Dataset(2015).read_df()
df13 = Dataset(2013).read_df()


def get(df):
    z = df[df.unit == 385][['title', 'inn', 'ta']]
    # z['ta2'] = z['ta'] * 1000
    return z.sort_values('ta')[['title', 'inn', 'ta']]


z = get(df15)
w = get(df13)
c = ['unit_x', 'unit_y']
df = z.merge(w, on='inn', how='left')

ex1 = df.ta_x != df.ta_y
print("# Exclusions 1 (values do not change between 2015 and 2013):")
print("ex1 =", df[~ex1].inn.tolist())

q = df[ex1].sort_values('ta_x')[['title_x', 'ta_y', 'ta_x', 'inn']]
ex2 = q.ta_y.isnull()
print("# Exclusions 2 (values not present in 2013, unit eq 385):")
print("ex2 =", q[ex2].inn.tolist())
from download import download
from reader import Dataset

download(2012)
dataset = Dataset(2012)
dataset.to_csv()
df = dataset.read_dataframe()
df[:1000].to_excel("2012_1000.xls")

"""
WARNING: this script operates on files up to 1.6 GB in size. This may slow
down your machine and fail in case of memory overflow or disk space
shortage. In particular, Dataset(year).read_dataframe() is known to
exhaust memory.
"""

# from remote import download, unpack
# from reader import Dataset
#
# # YEARS = [2012, 2013, 2014, 2015, 2016]
# YEARS = [2012]
# for year in YEARS:
#     download(year)
#     unpack(year)
#     Dataset(year).to_csv()
#
# df = dict()
# for year in YEARS:
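# If read_dataframe() exhausts memory, reading the CSV in chunks with pandas
# is one workaround. A sketch -- "2012.csv" is a hypothetical stand-in for
# wherever dataset.to_csv() wrote its output:
import pandas as pd

def head_to_excel(csv_path, out_path, n=1000):
    """Write the first *n* rows of a large CSV to an Excel file."""
    first_chunk = next(pd.read_csv(csv_path, chunksize=n))
    first_chunk.to_excel(out_path)

# head_to_excel("2012.csv", "2012_1000.xls")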