def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct, try_reuse_data, batch_size, execution_config):
    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = False

    checkpoint_dir = "checkpoints/"

    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    cursor = cnxn.cursor()

    # make required database calls
    out, out_array_header = FetchLabeledDataFromDatabase(max_cells, cursor, adl, DEBUG)
    cnxn.close()

    with open('Categories_base.txt', 'r') as f:
        Categories = f.read().splitlines()
    Categories = sorted(Categories)
    category_count = len(Categories)

    nx, ny = out.shape
    if DEBUG:  # orient the user a bit
        print("read data is: ")
        print(out)
        print("fixed categories are: ")
        print(Categories)
        print("The size of the read raw data is %d rows by %d columns" % (nx, ny))
        print("corresponding header length is: %d" % len(out_array_header))
        # print(out_array_header)

    # specify data for the transfer-learning experiment
    raw_data = out[0:max_cells, :]  # truncate the rows (max_cells)
    header = out_array_header

    # transpose the data
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # do other processing and encode the data
    if execution_config is None:
        raise TypeError
    Classifier = Simon(encoder={})  # dummy text classifier

    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']
    encoder.categories = Categories

    # build classifier model
    Classifier = Simon(encoder=encoder)  # text classifier for unit test
    model = Classifier.generate_transfer_model(maxlen, max_cells, category_count, category_count,
                                               checkpoint, checkpoint_dir)

    model_compile = lambda m: m.compile(loss='binary_crossentropy',
                                        optimizer='adam', metrics=['binary_accuracy'])
    model_compile(model)

    # encode the data and evaluate model
    X, y = encoder.encode_data(raw_data, header, maxlen)
    if DEBUG:
        print("DEBUG::y is:")
        print(y)
        print("DEBUG::The encoded labels (first row) are:")
        print(y[0, :])
    data = Classifier.setup_test_sets(X, y)

    max_cells = encoder.cur_max_cells

    start = time.time()
    history = Classifier.train_model(batch_size, checkpoint_dir, model, nb_epoch, data)
    end = time.time()
    print("Time for training is %f sec" % (end - start))

    config = {'encoder': encoder,
              'checkpoint': Classifier.get_best_checkpoint(checkpoint_dir)}
    Classifier.save_config(config, checkpoint_dir)
    Classifier.plot_loss(history)  # comment out on docker images...

    Classifier.evaluate_model(max_cells, model, data, encoder, p_threshold)
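
# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): one way the main() above could be invoked. The
# argument names mirror main()'s signature, but the concrete values -- epoch
# count, batch size, and the execution-config file name -- are illustrative
# placeholders, not values taken from the original script.
#
#   if __name__ == '__main__':
#       main(checkpoint=None,            # resolved from the execution config inside main()
#            data_count=500, data_cols=None, should_train=True,
#            nb_epoch=5, null_pct=None, try_reuse_data=False,
#            batch_size=64,
#            execution_config='Base.pkl')  # assumed name of a previously saved config in checkpoints/
# ---------------------------------------------------------------------------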
def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct, try_reuse_data, batch_size, execution_config):
    maxlen = 20
    max_cells = 500

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = True

    checkpoint_dir = "checkpoints/"

    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    cursor = cnxn.cursor()

    # make required database calls
    out, out_array_header = FetchLabeledDataFromDatabase(max_cells, cursor, adl, False)
    cnxn.close()

    with open('Categories_base_stat_geo.txt', 'r') as f:
        Categories = f.read().splitlines()
    Categories = sorted(Categories)
    category_count = len(Categories)

    nx, ny = out.shape
    if DEBUG:  # orient the user a bit
        print("read data is: ")
        print(out)
        print("fixed categories are: ")
        print(Categories)
        print("The size of the read raw data is %d rows by %d columns" % (nx, ny))
        print("corresponding header length is: %d" % len(out_array_header))
        # print(out_array_header)

    # specify data for the transfer-learning experiment
    raw_data = out[0:max_cells, :]  # truncate the rows (max_cells)
    header = out_array_header

    ## ADD GEODATA
    import logging
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    # read in relevant data
    in_file_postal = 'data/geodata/allCountries_postal.txt'
    reader_postal = GeonamesCountriesTxtFileReader(in_file_postal)
    df_postal = reader_postal.read_csv_postal()

    # read in country codes
    in_file_cc = 'data/geodata/country_codes_csv.csv'
    df_countries = pd.read_csv(in_file_cc, dtype=str)

    # now, build up the geo-data training set
    N_SAMPLES = 500  # number of sample columns for each additional category
    start = time.time()
    # draw the first random column; start with the 'country' category
    data = np.asarray(df_countries['Name'].sample(n=max_cells, replace=True))[np.newaxis].T
    data_header = list([['country', 'text'], ])
    for i in np.arange(1, N_SAMPLES):  # draw the remaining random columns
        # print(data)
        data = np.concatenate((data, np.asarray(df_countries['Name'].sample(n=max_cells, replace=True))[np.newaxis].T), axis=1)
        data_header.append(list(['country', 'text']))
    # now, handle the remaining new geo categories
    NewCategories = list(['state', 'city', 'postal_code', 'latitude', 'longitude', 'country_code'])
    for category in NewCategories:
        for i in np.arange(N_SAMPLES):
            data = np.concatenate((data, np.asarray(df_postal[category].sample(n=max_cells, replace=True))[np.newaxis].T), axis=1)
            if (category == 'latitude') or (category == 'longitude'):
                data_header.append(list([category, 'float']))
            elif category == 'postal_code':
                # label as 'int' where appropriate (one may also need to label as 'text' sometimes, but going with this for now)
                data_header.append(list([category, 'int']))
            else:
                data_header.append(list([category, 'text']))
    if DEBUG:
        print("DEBUG::the shape of geo data is:")
        print(data.shape)
        print("DEBUG::the length of the corresponding geo data header is:")
        print(len(data_header))
        print("DEBUG::the time elapsed to build the geo data set is (sec):")
        print(time.time() - start)
        print("DEBUG::merging geo data with datalake data...")
    raw_data = np.column_stack((raw_data, data))
    header.extend(data_header)
    if DEBUG:
        print("DEBUG::done!!")
        print("DEBUG::the shape of final merged data is:")
        print(raw_data.shape)
        print("DEBUG::the length of final merged data header is:")
        print(len(header))
    ## FINISHED ADDING GEODATA

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    start_time_guess = time.time()
    guesses = []
    print("Beginning guessing categorical/ordinal for geo+datalake data...")
    category_count = 0
    ordinal_count = 0
    for i in np.arange(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types='category')
        if tmp[0] == 'category':
            category_count += 1
            header[i].append('categorical')
            if ('int' in header[i]) or ('float' in header[i]) \
                    or ('datetime' in header[i]):
                ordinal_count += 1
                header[i].append('ordinal')
        guesses.append(tmp)
    if DEBUG:
        # print(guesses)
        # print(len(guesses))
        print("DEBUG::The number of categorical columns is %d" % category_count)
        print("DEBUG::The number of ordinal columns is %d" % ordinal_count)
        # print(header)
    elapsed_time = time.time() - start_time_guess
    print("Total guessing time is : %.2f sec" % elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

    # transpose the data
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # do other processing and encode the data
    if execution_config is None:
        raise TypeError
    Classifier = Simon(encoder={})  # dummy text classifier

    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']
    encoder.categories = Categories
    category_count = len(Categories)  # reset: the variable was reused above as a categorical-column counter

    # build classifier model
    Classifier = Simon(encoder=encoder)  # text classifier for unit test
    model = Classifier.generate_transfer_model(maxlen, max_cells, category_count - 9, category_count,
                                               checkpoint, checkpoint_dir)

    model_compile = lambda m: m.compile(loss='binary_crossentropy',
                                        optimizer='adam', metrics=['binary_accuracy'])
    model_compile(model)

    # encode the data and evaluate model
    X, y = encoder.encode_data(raw_data, header, maxlen)
    if DEBUG:
        print("DEBUG::y is:")
        print(y)
        print("DEBUG::The encoded labels (first row) are:")
        print(y[0, :])
    data = Classifier.setup_test_sets(X, y)

    ## Build p_threshold per class
    # p_threshold = np.sum(data.y_train, axis=0) * 1 / (data.y_train.shape[0])
    p_threshold = 0.5  # fall back to a fixed scalar threshold for every class
    if DEBUG:
        print("DEBUG::per class p_threshold is:")
        print(p_threshold)

    max_cells = encoder.cur_max_cells

    start = time.time()
    history = Classifier.train_model(batch_size, checkpoint_dir, model, nb_epoch, data)
    end = time.time()
    print("Time for training is %f sec" % (end - start))

    config = {'encoder': encoder,
              'checkpoint': Classifier.get_best_checkpoint(checkpoint_dir)}
    Classifier.save_config(config, checkpoint_dir)
    Classifier.plot_loss(history)  # comment out on docker images...

    Classifier.evaluate_model(max_cells, model, data, encoder, p_threshold)
def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct, try_reuse_data, batch_size, execution_config):
    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = False

    checkpoint_dir = "checkpoints/"

    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    cursor = cnxn.cursor()

    # make required database calls
    out, out_array_header = FetchLabeledDataFromDatabase(max_cells, cursor, adl, DEBUG)
    cnxn.close()

    with open('Categories_base_stat.txt', 'r') as f:
        Categories = f.read().splitlines()
    Categories = sorted(Categories)
    category_count = len(Categories)

    nx, ny = out.shape
    if DEBUG:  # orient the user a bit
        print("read data is: ")
        print(out)
        print("fixed categories are: ")
        print(Categories)
        print("The size of the read raw data is %d rows by %d columns" % (nx, ny))
        print("corresponding header length is: %d" % len(out_array_header))
        # print(out_array_header)

    # specify data for the transfer-learning experiment
    raw_data = out[0:max_cells, :]  # truncate the rows (max_cells)
    # read a "post-processed" header from file; this includes categorical and ordinal classifications...
    # with open('datalake_labels', 'r', newline='\n') as myfile:
    #     reader = csv.reader(myfile, delimiter=',')
    #     header = []
    #     for row in reader:
    #         header.append(row)
    # OR
    header = out_array_header

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    start_time_guess = time.time()
    guesses = []
    print("Beginning guessing categorical/ordinal for datalake data...")
    category_count = 0
    ordinal_count = 0
    for i in np.arange(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types='category')
        if tmp[0] == 'category':
            category_count += 1
            header[i].append('categorical')
            if ('int' in header[i]) or ('float' in header[i]) \
                    or ('datetime' in header[i]):
                ordinal_count += 1
                header[i].append('ordinal')
        guesses.append(tmp)
    if DEBUG:
        # print(guesses)
        # print(len(guesses))
        print("DEBUG::The number of categorical columns is %d" % category_count)
        print("DEBUG::The number of ordinal columns is %d" % ordinal_count)
        # print(header)
    elapsed_time = time.time() - start_time_guess
    print("Total guessing time is : %.2f sec" % elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

    # transpose the data
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # do other processing and encode the data
    if execution_config is None:
        raise TypeError
    Classifier = Simon(encoder={})  # dummy text classifier

    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']
    encoder.categories = Categories
    category_count = len(Categories)  # reset: the variable was reused above as a categorical-column counter

    # build classifier model
    Classifier = Simon(encoder=encoder)  # text classifier for unit test
    model = Classifier.generate_transfer_model(maxlen, max_cells, category_count - 2, category_count,
                                               checkpoint, checkpoint_dir)

    model_compile = lambda m: m.compile(loss='categorical_crossentropy',
                                        optimizer='adam', metrics=['binary_accuracy'])
    model_compile(model)

    # encode the data and evaluate model
    X, y = encoder.encode_data(raw_data, header, maxlen)
    if DEBUG:
        print("DEBUG::y is:")
        print(y)
        print("DEBUG::The encoded labels (first row) are:")
        print(y[0, :])
    data = Classifier.setup_test_sets(X, y)

    max_cells = encoder.cur_max_cells

    start = time.time()
    history = Classifier.train_model(batch_size, checkpoint_dir, model, nb_epoch, data)
    end = time.time()
    print("Time for training is %f sec" % (end - start))

    config = {'encoder': encoder,
              'checkpoint': Classifier.get_best_checkpoint(checkpoint_dir)}
    Classifier.save_config(config, checkpoint_dir)
    Classifier.plot_loss(history)  # comment out on docker images...

    Classifier.evaluate_model(max_cells, model, data, encoder, p_threshold)