def prepare_data(input_file, colname, label_colname=None):
    if label_colname is None:
        X = read_data(input_file, colname)
    else:
        X, Y = read_data(input_file, colname, label_colname)
    # c = Cleaner()
    print("starting to clean data..")
    # X = [c.full_clean(text) for text in X]
    X = [text for text in X]
    if label_colname is None:
        return X
    return X, Y
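# Hedged usage sketch (not from the original source): 'train.csv',
# 'question_text', and 'target' are hypothetical arguments chosen only to
# illustrate the two return shapes of prepare_data defined above.
if __name__ == '__main__':
    texts = prepare_data('train.csv', 'question_text')
    texts, labels = prepare_data('train.csv', 'question_text', 'target')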
def test():
    filename = 'r03.1_R_H_20.dow' if not len(argv) > 1 else argv[1]
    data = read_data(filename)
    start = time.time()
    lagrange_relaxation(data)
    stop = time.time()
    print('Lagrange relaxation done. Time: {} seconds'.format(stop - start))
def button_load_ref(event):
    # handler for BUTTON_LREF clicked
    global CURR_REF_DATA, CURR_REF_PLOT
    is_tia = messagebox.askquestion('TIA?', 'Do you want to load a TIA measurement?')
    fname = filedialog.askopenfilename()
    print(fname)
    if type(fname) is str:
        CURR_REF_DATA, extra_info, _, soffset = read_data(fname)
        print(is_tia)
        if is_tia == 'yes':
            CURR_REF_DATA = np.array(CURR_REF_DATA) / 2.
        CURR_REF_DATA = np.histogram(CURR_REF_DATA, bins=NBIN, range=(-5e6, 3.5e7))
        if CURR_REF_PLOT is not None:
            CURR_REF_PLOT.remove()
            del CURR_REF_PLOT
        print(CURR_REF_DATA[0])
        rD = CURR_REF_DATA[1][1] - CURR_REF_DATA[1][0]
        CURR_REF_PLOT = ax.errorbar(CURR_REF_DATA[1][:-1] + rD / 2, CURR_REF_DATA[0],
                                    np.sqrt(CURR_REF_DATA[0]), ls='none', c='k',
                                    label='Reference', zorder=1000000)
        plt.legend()
def test():
    time_start = time.time()
    global DEBUG
    DEBUG = False
    data = read_data('c38_R_H_5.dow')
    heuristic_main(data, return_primal=True)
    time_finish = time.time()
    print('Total time: {} s'.format(time_finish - time_start))
def test():
    time_start = time.time()
    global DEBUG
    DEBUG = False
    filename = 'r01.1_R_H_10.dow' if len(argv) <= 1 else argv[1]
    data = read_data(filename)
    objective, open_arcs = heuristic_main(data)
    objective, model = make_local_branching_model(data, 2, open_arcs, objective)
    print('objective: {}'.format(objective))
    time_finish = time.time()
    print('Total time: {} s'.format(time_finish - time_start))
def main():
    filename = 'r03.1_R_H_20.dow' if len(argv) <= 1 else argv[1]
    data = read_data(filename)
    start = time()
    data.graph = make_graph(data)
    subproblems = populate_dual_subproblem(data)
    master = populate_master(data)
    master = add_in_out_cuts(master, subproblems, data)
    master_callback = callback_data(subproblems, data)
    master.optimize(master_callback)
    stop = time()
    print('Total time: {} seconds'.format(round(stop - start, 0)))
def test():
    root_path = '../../../MPMCFP_DataGen/'
    data_path_c = 'c_Instances_Dec_Fixed_Cost/'
    c_trial, r_trial = 'c33_R_H_10.dow', 'r01.1_R_H_10.dow'
    filename = root_path + data_path_c
    filename += c_trial
    data = read_data(filename)
    models = []
    for t in range(data.periods - 1):
        model = Model(data, t)
        delta = np.ones(shape=data.commodities)
        model.solve(data)
        print('period: {} model status: {}'.format(t, model.status))
        models.append(model)
def main():
    filename = 'r03.1_R_H_20.dow' if len(argv) <= 1 else argv[1]
    data = read_data(filename)
    start = time()
    data.graph = make_graph(data)
    # objective, open_arcs = heuristic(data, 2, 'u')[:2]
    # print('Heuristic objective value: {}'.format(objective))
    subproblems = populate_dual_subproblem(data)
    master = populate_master(data)
    master_callback = callback_data(subproblems, data)
    master.optimize(master_callback)
    stop = time()
    print('Total time: {} seconds'.format(round(stop - start, 0)))
def test():
    time_start = time.time()
    global DEBUG
    DEBUG = False
    root_path = './'
    r_trial = 'r03.1_R_H_20.dow'
    filename = root_path
    filename += r_trial if len(argv) <= 1 else argv[1]
    filename = argv[1] if platform.system() == 'Linux' else filename
    data = read_data(filename)
    # Uncomment for uncapacitated problems
    # data.capacity = np.array([10e+9] * len(data.capacity), dtype=float)
    objective, open_arcs = heuristic_main(data)
    print('objective: {}'.format(objective))
    time_finish = time.time()
    print('Total time: {} s'.format(time_finish - time_start))
def test():
    time_start = time.time()
    global DEBUG
    DEBUG = False
    root_path = '../../../DataDeterministicFC/'
    data_path_c = 'c_Instances_Dec_Fixed_Cost/'
    data_path_r = 'r_Instances_Dec_Fixed_Cost/'
    c_trial, r_trial = 'c33_R_H_10.dow', 'r01.1_R_H_10.dow'
    filename = root_path + data_path_c
    filename += c_trial if len(argv) <= 1 else argv[1]
    filename = argv[1] if platform.system() == 'Linux' else filename
    data = read_data(filename)
    # treat the instance as uncapacitated
    data.capacity = np.array([10e+9] * len(data.capacity), dtype=float)
    objective, open_arcs, flow_cost = heuristic_main(data)
    objective, open_arcs = make_local_branching_model(data, 10, open_arcs)
    print('objective: {}'.format(objective))
    time_finish = time.time()
    print('Total time: {} s'.format(time_finish - time_start))
def create_TD(self, src_directory=SRC_DIRECTORY, src_extension=SRC_EXTENSION,
              txt_directory=TXT_DIRECTORY, indices_file=INDICES_FILE,
              concat_file=CONCAT_FILE, batch_size=BATCH_SIZE, from_XML=True):
    if from_XML:
        # src_directory contains the XML sources
        src_to_txts(src_directory, src_extension, txt_directory)
    concat_txts(txt_directory=txt_directory, target_name=concat_file)
    vocab = read_data(open(concat_file, "r"))
    # dict maps words to indices, r_dict the opposite
    c_indices, count, dict, r_dict = collect_data(vocab, VOC_SIZE)
    # create files where each line is the index of the word
    self.input_idx, self.output_idx = write_indices_files(concat_file, dict, indices_file)
    self.input_scrubbed = scrub(self.input_idx)
    self.output_scrubbed = scrub(self.output_idx)
    # use the above files to create our training data files
    self.input_sorted, self.output_sorted = sort(self.input_scrubbed, self.output_scrubbed, BATCH_SIZE)
    self.input_padded = pad(self.input_sorted, batch_size)
    self.output_padded = pad(self.output_sorted, batch_size)
    self.input_reversed = reverse(self.input_padded)
    self.input_reversed, self.output_padded = remove_bad_batches(
        self.input_reversed, self.output_padded, batch_size)
    self.batchlist, self.input_td, self.input_valid, self.tdcount, self.validcount = make_input_val(
        VAL_PCT, self.input_reversed, BATCH_SIZE)
    self.output_td, self.output_valid = make_output_val(self.output_padded, BATCH_SIZE, self.batchlist)
def main():
    config = helpers.read_config()
    elogger = logger.get_logger()

    # initialize grids for short-term and long-term traffic features
    speed_array = 'speeds'
    time_array = 'times'
    short_ttf = [[collections.defaultdict(lambda: {speed_array: [], time_array: []})
                  for _ in range(256)] for _ in range(256)]
    long_ttf = [[collections.defaultdict(lambda: {speed_array: [], time_array: []})
                 for _ in range(256)] for _ in range(256)]

    for data_file in config['data']:
        elogger.info('Generating G and T paths and extracting traffic features on {} ...'.format(data_file))
        data = helpers.read_data(data_file)
        define_travel_grid_path(data, config['coords'], short_ttf, long_ttf, args.grid_size)
        elogger.info('Saving data extended with G and T paths to {}{}.\n'.format(
            args.data_destination_folder, data_file))
        helpers.save_processed_data(data, args.data_destination_folder, data_file)

    elogger.info('Aggregate historical traffic features ...')
    helpers.aggregate_historical_data(short_ttf, long_ttf)
    elogger.info('Saving extracted traffic features in {}'.format(args.ttf_destination_folder))
    helpers.save_extracted_traffic_features(short_ttf, long_ttf, args.ttf_destination_folder)
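# Hedged sketch (not from the original source): shows how the short_ttf /
# long_ttf structures built above are indexed. Each cell (row, col) of the
# 256x256 grid maps a key to lists of observed speeds and travel times; the
# key 'some_slot' and the appended values are made-up placeholders.
import collections

grid = [[collections.defaultdict(lambda: {'speeds': [], 'times': []})
         for _ in range(256)] for _ in range(256)]
grid[10][42]['some_slot']['speeds'].append(13.5)   # hypothetical speed value
grid[10][42]['some_slot']['times'].append(37.0)    # hypothetical travel time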
def test():
    data = read_data('small7.dow')
    graph = make_graph(data)
    make_adjacency_matrix(graph, data)
from helpers import read_data, get_settings, package_translation
import api

settings = get_settings()
article_map = read_data('article_map')
locales = ['de', 'es', 'fr', 'ja', 'pt-br']

for article in article_map:
    url = '{}/articles/{}/translations/missing.json'.format(settings['src_root'], article)
    missing_locales = api.get_resource_list(url, list_name='locales', paginate=False)
    for locale in locales:
        if locale in missing_locales:
            # if the translation is missing in the source hc, there is nothing to move
            continue
        print('Moving {} translation for article {}'.format(locale, article))
        # get translation in src hc
        url = '{}/articles/{}/translations/{}.json'.format(settings['src_root'], article, locale)
        translation = api.get_resource(url)
        # create translation in dest hc
        url = '{}/articles/{}/translations.json'.format(settings['dst_root'], article_map[article])
        payload = package_translation(translation)
        api.post_resource(url, payload)

print('\nFinished moving translations.\n')
def main(data, num_generations, num_trees, fold, seed, model_file, blackbox_model):
    kf = StratifiedKFold(shuffle=True, n_splits=10, random_state=seed)
    X, y = read_data("data/" + data + ".csv")

    # Split the data based on the fold of this run
    train_index, test_index = list(kf.split(X, y))[fold]
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # H2O requires specially formatted data
    h2_train = h2o.H2OFrame(python_obj=np.hstack((X_train, y_train)))
    h2_test = h2o.H2OFrame(python_obj=np.hstack((X_test, y_test)))

    # =================
    # Train the complex model
    # =================
    blackbox_options = {
        "RF": H2ORandomForestEstimator(ntrees=100),
        "GB": H2OGradientBoostingEstimator(ntrees=100),
        "DL": H2ODeepLearningEstimator(epochs=1000),
    }

    # Choose the model based on the given parameter
    blackbox = blackbox_options[blackbox_model]
    blackbox.train(x=h2_train.columns[:-1], y=h2_train.columns[-1], training_frame=h2_train)

    # We use the predictions from the model as the new "labels" for training the surrogates.
    blackbox_train_predictions = blackbox.predict(h2_train)["predict"].as_data_frame().values
    blackbox_train_score = scorer(blackbox_train_predictions, y_train)

    blackbox_test_predictions = blackbox.predict(h2_test)["predict"].as_data_frame().values
    blackbox_test_score = scorer(blackbox_test_predictions, y_test)

    print("The " + blackbox.__class__.__name__ + " achieved", "%.2f" % blackbox_train_score,
          "on the train set and", "%.2f" % blackbox_test_score, "on the test set")

    # =================
    # Train the surrogates
    # =================
    dt_training_recreating_pct, dt_testing_recreating_pct, dt_complexity = \
        decision_tree(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions)
    print("DT was able to recreate %.2f%%" % dt_training_recreating_pct,
          "of them on the train, and %.2f%%" % dt_testing_recreating_pct, "on the test set")

    sdt_training_recreating_pct, sdt_testing_recreating_pct, sdt_complexity = \
        simplified_decision_tree(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions)
    print("SDT was able to recreate %.2f%%" % sdt_training_recreating_pct,
          "of them on the train, and %.2f%%" % sdt_testing_recreating_pct, "on the test set")

    ds_training_recreating_pct, ds_testing_recreating_pct, ds_complexity = \
        decision_stump(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions)
    print("DS was able to recreate %.2f%%" % ds_training_recreating_pct,
          "of them on the train, and %.2f%%" % ds_testing_recreating_pct, "on the test set")

    lr_training_recreating_pct, lr_testing_recreating_pct, lr_complexity = \
        logistic_regression(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions)
    print("LR was able to recreate %.2f%%" % lr_training_recreating_pct,
          "of them on the train, and %.2f%%" % lr_testing_recreating_pct, "on the test set")

    # The BRL surrogate is disabled; keep placeholders so the return list keeps its shape
    brl_training_recreating_pct = brl_testing_recreating_pct = brl_complexity = None
    '''
    brl_training_recreating_pct, brl_testing_recreating_pct, brl_complexity = \
        bayesian_rule_list(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions)
    print("BRL was able to recreate %.2f%%" % brl_training_recreating_pct,
          "of them on the train, and %.2f%%" % brl_testing_recreating_pct, "on the test set")
    '''

    gp_training_recreating_pct, gp_testing_recreating_pct, gp_complexity = \
        genetic_program(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions,
                        num_generations, num_trees, model_file)
    print("GP was able to recreate %.2f%%" % gp_training_recreating_pct,
          "of them on the train, and %.2f%%" % gp_testing_recreating_pct, "on the test set")

    return [blackbox_train_score, blackbox_test_score,
            dt_training_recreating_pct, dt_testing_recreating_pct, dt_complexity,
            sdt_training_recreating_pct, sdt_testing_recreating_pct, sdt_complexity,
            ds_training_recreating_pct, ds_testing_recreating_pct, ds_complexity,
            lr_training_recreating_pct, lr_testing_recreating_pct, lr_complexity,
            brl_training_recreating_pct, brl_testing_recreating_pct, brl_complexity,
            gp_training_recreating_pct, gp_testing_recreating_pct, gp_complexity]
import arrow
from helpers import read_data, write_data, get_settings
import api

settings = get_settings()
sync_dates = read_data('sync_dates')
last_sync = arrow.get(sync_dates['article_votes'])
article_map = read_data('article_map')

for src_article in article_map:
    dst_article = article_map[src_article]
    print('\nGetting votes for article {}...'.format(src_article))
    url = '{}/{}/articles/{}/votes.json'.format(settings['src_root'], settings['locale'], src_article)
    votes = api.get_resource_list(url)
    if not votes:
        print('- no votes found')
        continue
    for vote in votes:
        if last_sync < arrow.get(vote['created_at']):
            print('- adding vote {} to article {}'.format(vote['id'], dst_article))
            if vote['value'] == -1:
                url = '{}/articles/{}/down.json'.format(settings['dst_root'], dst_article)
            else:
                url = '{}/articles/{}/up.json'.format(settings['dst_root'], dst_article)
            payload = {
import arrow
from helpers import read_data, write_data, get_settings, package_article, verify_author, write_js_redirects
import api

settings = get_settings()
sync_dates = read_data('sync_dates')
last_sync = arrow.get(sync_dates['articles'])
section_map = read_data('section_map')
article_map = read_data('article_map')
exceptions = read_data('exceptions')

for section in section_map:
    # # test-only section (ref docs) -> comment out for sync
    # if section != "206223768":
    #     continue
    dst_section = section_map[section]
    print('\nGetting articles in section {}...'.format(section))
    url = '{}/{}/sections/{}/articles.json'.format(settings['src_root'], settings['locale'], section)
    articles = api.get_resource_list(url)
    for src_article in articles:
        if str(src_article['id']) in exceptions:
            print('{} is an exception. Skipping...'.format(src_article['id']))
            continue
        if last_sync < arrow.get(src_article['created_at']):
            print('- adding article {} to destination section {}'.format(src_article['id'], dst_section))
            src_article['author_id'] = verify_author(src_article['author_id'], settings['team_user'])
            url = '{}/{}/sections/{}/articles.json'.format(
import arrow
from helpers import read_data, write_data, get_settings
import api

settings = get_settings()
sync_dates = read_data('sync_dates')
last_sync = arrow.get(sync_dates['comment_votes'])
article_map = read_data('article_map')
comment_map = read_data('comment_map')
comment_article_map = read_data('comment_article_map')

for src_comment in comment_map:
    src_article = comment_article_map[src_comment]
    dst_article = article_map[src_article]
    dst_comment = comment_map[src_comment]
    print('Getting votes for comment {}...'.format(src_comment))
    url = '{}/{}/articles/{}/comments/{}/votes.json'.format(
        settings['src_root'], settings['locale'], src_article, src_comment)
    votes = api.get_resource_list(url)
    if not votes:
        print('- no votes found')
        continue
    for vote in votes:
        if last_sync < arrow.get(vote['created_at']):
            print('- adding vote {} to comment {}'.format(vote['id'], dst_comment))
            if vote['value'] == -1:
                url = '{}/articles/{}/comments/{}/down.json'.format(
                    settings['dst_root'], dst_article, dst_comment)
            else:
def test():
    data = read_data('c33_R_H_5.dow')
    heur_solution = heuristic_main(data=data, return_primal=True, track_time=True)[2]
    master_model = make_master(data=data, heur_solution=heur_solution)
# In[]:
from helpers import read_data

# In[]:
Aquifer_Auser = read_data('Aquifer_Auser.csv')
Aquifer_Doganella = read_data('Aquifer_Doganella.csv')
Aquifer_Luco = read_data('Aquifer_Luco.csv')
Aquifer_Petrignano = read_data('Aquifer_Petrignano.csv')
Lake_Bilancino = read_data('Lake_Bilancino.csv')
River_Arno = read_data('River_Arno.csv')
Water_Spring_Amiata = read_data('Water_Spring_Amiata.csv')
Water_Spring_Lupa = read_data('Water_Spring_Lupa.csv')
Water_Spring_Madonna_di_Canneto = read_data('Water_Spring_Madonna_di_Canneto.csv')

# In[]:
River_Arno.head().T

# In[]:
print('Gabriel was here')
from helpers import read_data, get_settings
import api

settings = get_settings()
section_map = read_data('section_map')

for section in section_map:
    dst_section = section_map[section]
    print('\nGetting subscriptions for section {}...'.format(section))
    url = '{}/{}/sections/{}/subscriptions.json'.format(
        settings['src_root'], settings['locale'], section)
    subscriptions = api.get_resource_list(url)
    if not subscriptions:
        print('- no subscriptions found')
        continue
    for sub in subscriptions:
        print('- adding subscription {} to section {}'.format(sub['id'], dst_section))
        url = '{}/sections/{}/subscriptions.json'.format(settings['dst_root'], dst_section)
        if sub['include_comments'] is True:
            payload = {'subscription': {'source_locale': settings['locale'],
                                        'user_id': sub['user_id'],
                                        'include_comments': True}}
        else:
            payload = {'subscription': {'source_locale': settings['locale'],
                                        'user_id': sub['user_id']}}
        response = api.post_resource(url, payload)
        if response is False:
            print('Skipping subscription {}'.format(sub['id']))
import sys
import importlib
import numpy as np

helper_dir = r'C:\Users\oyina\src\senior_2019-2020\lab\bijsterbosch\project\oyin'
sys.path.append(helper_dir)
import helpers as help

#%% updating help
help = importlib.reload(help)

#%% read in data from the two sites
siteB_file = "FNETs_siteB.txt"
siteH_file = "FNETs_siteH.txt"
num_regions = 10
site_B_data = help.read_data(siteB_file)
site_H_data = help.read_data(siteH_file)

#%% create x data
# initialize arrays in which to hold site data; the second dimension is the channel dimension
site_B_connectomes = np.ones((len(site_B_data), 1, num_regions, num_regions))
site_H_connectomes = np.ones((len(site_H_data), 1, num_regions, num_regions))

# create data matrices
for person in range(len(site_B_data)):
    site_B_connectomes[person, :, :, :] = help.list_to_connectome(site_B_data[person], num_regions)
for person in range(len(site_H_data)):
    site_H_connectomes[person, :, :, :] = help.list_to_connectome(site_H_data[person], num_regions)

#%% create y data
# parse parameters
IS_TIA = '--tia' in sys.argv
sys.argv = [i for i in sys.argv if i[0] != '-']
if len(sys.argv) != 3:
    print(USAGE)
    exit(1)
print('sys.argv', sys.argv)
fname = sys.argv[1]
iv = eval(sys.argv[2])
assert type(iv) == tuple
print('parameter format: %s' % repr(IV_FORMAT))
print('using %s as iv' % repr(iv))

# read measurement data
cmp_data, extra_info, _, soffset = read_data(fname)
cmp_data = np.array(cmp_data)
if IS_TIA:
    cmp_data /= 2.

# histogram the measurement data
hist, bins = np.histogram(cmp_data, bins=NBIN, range=BIN_RANGE)
cmp_Y = hist
D = bins[1] - bins[0]
cmp_X = bins[:-1] + D / 2.
fname = ('.'.join(fname.split('.')[:-1])).split('/')[-1]

# do the noise fit
sel = cmp_Y > (max(cmp_Y) / 10.)
noise_fit_X = cmp_X[sel]
def train(input_file="clean_train.csv",
          text_col="question_text",
          label_col="target",
          valid_ratio=0.2,
          max_sentence_length=91,
          sample_percent=1,
          class_weights=None,
          cell_type="gru",
          embedding="word2vec",
          embedding_path="GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin",
          embedding_dim=300,
          rnn_layers=3,
          hidden_size=128,
          one_minus_dropout=0.5,
          l2_reg=3.0,
          batch_size=32,
          epochs=5,
          learning_rate=1e-3,
          allow_soft_placement=True,
          log_device_placement=False,
          display_every=10,
          evaluate_every=100,
          checkpoint_every=100,
          num_checkpoints=5):
    # Load and split data
    print("Loading data..")
    X, Y = read_data(input_file, text_col, label_col, sample_percent=sample_percent)

    # Create a vocabulary processor.
    # Its job is to assign each unique word an integer; our sentences then replace
    # each word with its corresponding integer. These mappings are later used again
    # to substitute each word with its embedding. It also trims sentences or adds
    # trailing zeros to pad each sentence to a fixed length.
    print("Setting up vocabulary..")
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_sentence_length)
    X = np.array(list(vocab_processor.fit_transform(X)))
    print("Vocabulary Size: ", len(vocab_processor.vocabulary_))
    num_classes = len(Y[0])

    # split into train and validation
    X, Y, x_val, y_val = split_data(X, Y, valid_ratio)

    # initialize tensorflow config
    print("Initializing tensorflow session..")
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                                      log_device_placement=log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            print("Initializing our RNN:")
            print("\nseq_length : ", X.shape[1],
                  "\nnum_classes : ", Y.shape[1],
                  "\nvocab_size : ", len(vocab_processor.vocabulary_),
                  "\nembedding_size : ", embedding_dim,
                  "\ncell_type : ", cell_type,
                  "\nhidden_size : ", hidden_size,
                  "\nl2 : ", l2_reg,
                  "\nclass_weights : ", class_weights,
                  "\nbatch_size : ", batch_size,
                  "\nrnn_layers : ", rnn_layers)

            # Initialize our RNN
            rnn = RNN(seq_length=X.shape[1],
                      num_classes=Y.shape[1],
                      vocab_size=len(vocab_processor.vocabulary_),
                      embedding_size=embedding_dim,
                      cell_type=cell_type,
                      hidden_size=hidden_size,
                      l2=l2_reg,
                      class_weights=class_weights,
                      batch_size=batch_size,
                      rnn_layers=rnn_layers)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(learning_rate).minimize(rnn.loss, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", rnn.loss)
            acc_summary = tf.summary.scalar("accuracy", rnn.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Validation summaries
            val_summary_op = tf.summary.merge([loss_summary, acc_summary])
            val_summary_dir = os.path.join(out_dir, "summaries", "val")
            val_summary_writer = tf.summary.FileWriter(val_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists,
            # so we need to create it.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "text_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Initialize pretrained embeddings if the embedding flag is set
            if embedding:
                # initial matrix with random uniform values
                initW = np.random.uniform(-0.25, 0.25, (len(vocab_processor.vocabulary_), embedding_dim))
                # For GloVe, loading embeddings is straightforward: the first token of each
                # line is the word, and everything else on the line is its embedding vector.
                if "glove" in embedding:
                    with open(embedding_path, "r", encoding="utf8") as f:
                        for line in f:
                            first_word = line.partition(' ')[0]
                            rest = line[line.index(' ') + 1:]
                            # Find the word in our vocabulary
                            idx = vocab_processor.vocabulary_.get(first_word)
                            if idx != 0:
                                # If present, substitute the GloVe embedding for the random one
                                initW[idx] = np.fromstring(rest, dtype='float32', sep=" ")
                # For word2vec, we are given a binary file
                elif "word2vec" in embedding:
                    with open(embedding_path, "rb") as f:
                        # The first line is a header with the number of records and the size of one record
                        header = f.readline()
                        vocab_size, layer1_size = map(int, header.split())
                        # Number of bytes in each record = (size of a float) * size of one record
                        binary_len = np.dtype('float32').itemsize * layer1_size
                        # for each record
                        for line in range(vocab_size):
                            word = []
                            while True:
                                # Keep reading characters until a space; the word is then complete
                                ch = f.read(1).decode('latin-1')
                                if ch == ' ':
                                    word = ''.join(word)
                                    break
                                if ch != '\n':
                                    word.append(ch)
                            # Try to find the word in our vocabulary
                            idx = vocab_processor.vocabulary_.get(word)
                            if idx != 0:
                                # If found, substitute the corresponding embedding vector for the random one
                                initW[idx] = np.fromstring(f.read(binary_len), dtype='float32')
                            else:
                                f.read(binary_len)
                sess.run(rnn.W_text.assign(initW))
                print("Successfully loaded ", embedding, "!\n")

            # With the embeddings and basic tensorflow setup done,
            # we now start the actual training routine.
            # Generate batches
            itr = batch_iterator(X, Y, batch_size, epochs)
            # For each batch
            for x_batch, y_batch, start, end in itr:
                # Train
                feed_dict = {
                    rnn.input_text: x_batch,
                    rnn.input_label: y_batch,
                    rnn.keep_prob: one_minus_dropout
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, rnn.loss, rnn.accuracy], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

                # Evaluation
                if step % evaluate_every == 0:
                    print("\nEvaluation:")
                    total_preds = np.zeros(y_val.shape)
                    itr2 = batch_iterator(x_val, y_val, batch_size, 1, shuffle=False)
                    avg_acc = 0
                    avg_loss = 0
                    steps = 0
                    for x_eval_batch, y_eval_batch, s, e in itr2:
                        feed_dict_val = {
                            rnn.input_text: x_eval_batch,
                            rnn.input_label: y_eval_batch,
                            rnn.keep_prob: 1.0
                        }
                        summaries_val, loss, accuracy, preds = sess.run(
                            [val_summary_op, rnn.loss, rnn.accuracy, rnn.predictions], feed_dict_val)
                        val_summary_writer.add_summary(summaries_val, step)
                        k = np.array([one_hot_encode(num_classes, label) for label in preds])
                        avg_acc += accuracy
                        avg_loss += loss
                        steps += 1
                        total_preds[s:e] = k
                    cf, f_score = confusion_matrix(y_val, total_preds, 2)
                    avg_acc /= steps
                    avg_loss /= steps
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: loss {:g}, acc {:g}, fscore {:g}\n".format(time_str, avg_loss, avg_acc, f_score))
                    print("Confusion Matrix")
                    print(cf)

                # Model checkpoint
                if step % checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))
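# Hedged usage sketch (not part of the original source): calls train() defined
# above with a few keyword arguments overridden. The file path is the default
# from the signature and may not exist locally; embedding=None simply skips
# loading pretrained vectors.
if __name__ == "__main__":
    train(input_file="clean_train.csv",
          cell_type="gru",
          embedding=None,
          epochs=1,
          batch_size=64)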
import arrow
from helpers import read_data, write_data, get_settings, package_article, verify_author, write_js_redirects
import api

settings = get_settings()
sync_dates = read_data('sync_dates')
last_sync = arrow.get(sync_dates['articles'])
section_map = read_data('section_map')
article_map = read_data('article_map')
user_segment_map = read_data('user_segment_map')
permission_group_map = read_data('permission_group_map')
exceptions = read_data('exceptions')

for section in section_map:
    # # test-only section (ref docs) -> comment out for sync
    # if section != "206223768":
    #     continue
    dst_section = section_map[section]
    print('\nGetting articles in section {}...'.format(section))
    url = '{}/{}/sections/{}/articles.json'.format(settings['src_root'], settings['locale'], section)
    articles = api.get_resource_list(url)
    for src_article in articles:
        if settings['cross_instance']:
            if src_article['user_segment_id'] is not None:
                dst_user_segment = user_segment_map[str(src_article['user_segment_id'])]
                dst_permission_group = permission_group_map[str(src_article['permission_group_id'])]
                src_article['user_segment_id'] = dst_user_segment
def inference(self, weights_file=None, load=False):
    vocab = read_data(open(CONCAT_FILE, "r"))
    # dict maps words to indices, r_dict the opposite
    c_indices, count, dict, r_dict = collect_data(vocab, VOC_SIZE)

    # Build the encoder model from the trained layers
    inference_encInput = Input(shape=(None,), dtype="int32", name="input_seqinf")
    temp = self.emb_A(inference_encInput)
    temp = self.enc1(temp)
    # temp = self.enc2(temp)
    temp = self.enc3(temp)
    temp, h, c = self.enc4(temp)
    # the encoder states (h, c) are our context for decoding
    enc_model = Model(inputs=inference_encInput, outputs=[temp, h, c])

    # Load the weights obtained during training. by_name attaches the weights
    # according to the names of the layers; reusing the instance variables
    # keeps those names constant.
    if load:
        enc_model.load_weights(weights_file, by_name=True)

    # Build the decoder model
    inference_decInput = Input(shape=(None,), dtype="int32", name="input_decinf")
    dec_state_input_h = Input(shape=(HID_DIM,))
    dec_state_input_c = Input(shape=(HID_DIM,))
    dec_states_inputs = [dec_state_input_h, dec_state_input_c]
    temp = self.emb_A(inference_decInput)
    temp = self.dec1(temp, initial_state=dec_states_inputs)
    # temp = self.dec2(temp)
    temp = self.dec3(temp)
    inference_decOutput, state_h, state_c = self.dec4(temp)
    dec_states = [state_h, state_c]
    inference_decOutput = TimeDistributed(self.emb_to_vocab)(inference_decOutput)
    dec_model = Model(inputs=[inference_decInput, dec_state_input_h, dec_state_input_c],
                      outputs=[inference_decOutput] + dec_states)
    if load:
        dec_model.load_weights(weights_file, by_name=True)

    print("Begin conversation. Try not to use contractions.\n"
          "Punctuate end of sentences. "
          "Capitalization does not matter. Input <quit> to quit.\n")

    # each iteration of this loop takes input and predicts a response
    while True:
        indices = [dict["<sos>"]]
        eos = False
        count = 0
        text = input("")
        text = text.strip().split()
        if len(text) == 1 and text[0] == "<quit>":
            print("Conversation ended.\n")
            break
        for word in text:
            temp = ""
            if eos:
                indices.append(dict["<sos>"])
                eos = False
            # change the word into something the model understands
            word = word.lower()
            if word[-1] in [".", "!", "?"]:
                eos = True
                word = word[:-1]
            for char in word:
                if char.isalpha():
                    temp += char
            word = temp
            if word in dict:
                indices.append(dict[word])
                if eos:
                    indices.append(dict["<eos>"])
            else:
                indices.append(0)
                if eos:
                    indices.append(dict["<eos>"])
        context_as_strings = [str(idx) for idx in indices]
        context_string = " ".join(context_as_strings)
        print("the context string is: " + context_string)

        # indices now contains our input sequence of integers; run it through the encoder
        indices = list(reversed(indices))
        indices_arr = np.array([indices])
        print("indices arr is " + str(indices_arr))
        # the encoder outputs the sequence output plus the hidden and cell state
        enc_output, e_h, e_c = enc_model.predict(indices_arr)
        predicted_states = [e_h, e_c]
        print("encoder states have shape " + str(e_h.shape) + " and " + str(e_c.shape))

        decoder_input = [dict["<sos>"]]
        output_string = []
        output_token = ""
        token_index = 0
        tokens = []
        decoder_array = np.array(decoder_input)
        print("dec_array has shape " + str(decoder_array.shape))
        while r_dict[token_index] != "<eos>" and count < 15:
            # a single batch containing one sequence, plus the encoder states
            input1_batch = np.array([decoder_array])
            print(input1_batch.shape)
            input2_batch = np.array(e_h)
            print(input2_batch.shape)
            input3_batch = np.array(e_c)
            print(input3_batch.shape)
            output_tokens, h, c = dec_model.predict([input1_batch, input2_batch, input3_batch])
            # output_tokens is a softmaxed vector of dimension VOC_SIZE, so take the argmax
            token_index = np.argmax(output_tokens[0, -1, :], axis=0)
            decoder_input.append(token_index)
            print("output word is " + str(token_index) + " ", end='')
            print("decoder input is now " + str(decoder_input))
            # the token is still an index, so append its word to the prediction so far
            output_string.append(r_dict[token_index])
            decoder_array = np.array(decoder_input)
            print("decoder_array is " + str(decoder_array))
            # instead of feeding the extended sequence to the decoder each time,
            # we could save its state and decode one word at a time
            # predicted_states = [h, c]
            count += 1
        output = " ".join(output_string)
        print("output is: " + output + "\n")
import arrow
from helpers import read_data, write_data, get_settings
import api

settings = get_settings()
sync_dates = read_data('sync_dates')
last_sync = arrow.get(sync_dates['attachments'])
article_map = read_data('article_map')
attachment_map = read_data('attachment_map')
attachment_article_map = read_data('attachment_article_map')

for src_article in article_map:
    dst_article = article_map[src_article]
    print('\nGetting attachments in article {}...'.format(src_article))
    url = '{}/{}/articles/{}/attachments.json'.format(settings['src_root'], settings['locale'], src_article)
    attachments = api.get_resource_list(url, list_name='article_attachments', paginate=False)
    if not attachments:
        print('- no attachments found')
        continue
    for src_attachment in attachments:
        if last_sync < arrow.get(src_attachment['created_at']):
            print('- adding new attachment {} to article {}'.format(
                src_attachment['file_name'], dst_article))
            print(src_attachment)
            url = '{}/articles/{}/attachments.json'.format(settings['dst_root'], dst_article)