def write_negative_instances(num_ps_rows, num_qs_rows, prods_db_c, num_random_qs_per_prod, quests_db_c, outc, outdb):
    # Iterate over the products; for each product, pick num_random_qs_per_prod random questions as negative examples
    for rowid in range(1, num_ps_rows + 1):
        p_id = MyUtils_dbs.search_in_alltables_db(
            prods_db_c, "SELECT id FROM", "WHERE rowid = " + str(rowid))[0][0]
        neg_qs_ids = []
        neg_qs_indices = np.random.choice(a=range(1, num_qs_rows),
                                          size=num_random_qs_per_prod,
                                          replace=False)
        for neg_qs_index in neg_qs_indices:
            neg_qs_ids.append(
                MyUtils_dbs.search_in_alltables_db(
                    quests_db_c, "SELECT id FROM",
                    "WHERE `index` = " + str(neg_qs_index))[0][0])
        # Filter: the product and all of its negative questions must have all the features
        if product_has_allfeatures(prods_db_c, p_id) and allquestions_have_allfeatures(
                quests_db_c, str(neg_qs_ids)):
            insertion_sequence = [(p_id, q_id, 0) for q_id in neg_qs_ids]
            outc.executemany("INSERT INTO instances VALUES (?,?,?)", insertion_sequence)
        else:
            logging.info("Product %s excluded from the instances due to not having all the features", p_id)
        # Progress logging and periodic commit at each 10% checkpoint (guard against categories with < 10 products)
        if num_ps_rows >= 10 and rowid % (num_ps_rows // 10) == 0:
            logging.info("Working on category: +10%...")
            outdb.commit()
def sort_candidates(candidates_db_path, ranked_candidates_outdb_path, prod_reps_dbpath, quest_reps_dbpath):
    MyUtils.init_logging("Rank_candidates_nn.log")
    ### Connecting to the databases: candidates, test products, test questions
    candidates_nn_db = sqlite3.connect(candidates_db_path)
    cands_db_c = candidates_nn_db.cursor()
    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()

    f = open(ranked_candidates_outdb_path, "w"); f.close()
    outdb = sqlite3.connect(ranked_candidates_outdb_path)
    outdb_c = outdb.cursor()
    outdb_c.execute('''CREATE TABLE candidates(
                            p_id varchar(63),
                            q_id varchar(63),
                            distance int )''')
    ###
    test_products_ids = cands_db_c.execute("SELECT DISTINCT p_id FROM candidates").fetchall()
    logging.info(test_products_ids[0])
    #logging.debug(test_products_ids)
    for tpl_pid in test_products_ids:
        pid = tpl_pid[0]
        product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                                    "WHERE id = '" + str(pid) + "'")[0]
        product_tuple = MyUtils.prodls_tonamedtuple(product_representation, offset=1)
        quests_ids = list(map(lambda results_tpl: results_tpl[0],
                              cands_db_c.execute("SELECT q_id FROM candidates WHERE p_id = ?", tpl_pid).fetchall()))
        logging.debug(quests_ids)
        product_qs_sorted = sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c)
        outdb.executemany("INSERT INTO candidates VALUES (?,?,?)", product_qs_sorted)
        outdb.commit()
def shuffle_db_table(db_path):
    db_conn = sqlite3.connect(db_path)
    c = db_conn.cursor()

    temp_db = db_path + "_temp"
    f = open(temp_db, "w")
    f.close()  # clean outdb
    outdb_conn = sqlite3.connect(temp_db)
    outc = outdb_conn.cursor()
    outc.execute('''CREATE TABLE instances(p varchar(63), q varchar(63), y tinyint) ''')
    outdb_conn.commit()

    tot_num_of_rows = MyUtils_dbs.get_tot_num_rows_db(c)
    rand_indices = np.random.choice(range(1, tot_num_of_rows + 1), tot_num_of_rows, replace=False)
    for ind in rand_indices:
        picked_row = c.execute("SELECT * FROM instances WHERE rowid = " + str(ind)).fetchone()
        p = picked_row[0]
        q = picked_row[1]
        y = picked_row[2]
        outc.execute('''INSERT INTO instances VALUES (?,?,?);''', (str(p), str(q), str(y)))
    outdb_conn.commit()
    logging.info("Instances have been shuffled.")

    # close both connections before replacing the original file with the shuffled copy;
    # os.replace overwrites an existing destination on every platform
    db_conn.close()
    outdb_conn.close()
    os.replace(src=temp_db, dst=db_path)
def obtain_category_instances(category_dirpath, categ_products_db, categ_questions_db, max_neg_cardinality):
    logging.info("Extracting instances for the category: %s", os.path.basename(category_dirpath))
    prods_db_c = categ_products_db.cursor()
    quests_db_c = categ_questions_db.cursor()

    outdbname = MyUtils_flags.FLAG_INSTANCEIDS + ".db"
    f = open(os.path.join(category_dirpath, outdbname), "w")
    f.close()
    outdb = sqlite3.connect(os.path.join(category_dirpath, outdbname))
    outc = outdb.cursor()
    outc.execute('''CREATE TABLE instances(p varchar(63), q varchar(63), y tinyint) ''')
    outdb.commit()

    ### Get the number of Ps and Qs. Generally, |Ps| << |Qs| (e.g. 119 vs 43608)
    num_ps_rows = MyUtils_dbs.get_tot_num_rows_db(prods_db_c)
    logging.info("Number of products in category: %s", num_ps_rows)
    num_qs_rows = MyUtils_dbs.get_tot_num_rows_db(quests_db_c)
    logging.info("Number of questions in category: %s", num_qs_rows)

    num_possible_instances = num_ps_rows * num_qs_rows
    logging.info("Potential total number of instances from the category: %s", num_possible_instances)
    cardinality = min(num_possible_instances, max_neg_cardinality)
    logging.info("Considering the upper boundary, the number of negative instances to include in the category dataset is: %s ",
                 cardinality)
    num_random_qs_per_prod = cardinality // num_ps_rows
    logging.info("Number of random negative examples per product: %s", num_random_qs_per_prod)

    write_positive_instances(num_ps_rows, prods_db_c, quests_db_c, outc)
    outdb.commit()
    write_negative_instances(num_ps_rows, num_qs_rows, prods_db_c, num_random_qs_per_prod, quests_db_c, outc, outdb)
    outdb.commit()

    shuffle_db_table(os.path.join(category_dirpath, outdbname))

    categ_products_db.close()
    categ_questions_db.close()
    outdb.close()
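# Hedged usage sketch (not part of the original code): how obtain_category_instances could be driven
# over all category directories. The per-category .db filenames below are hypothetical placeholders;
# the real names are resolved elsewhere via the MyUtils_flags file-name flags, and the cap is illustrative.
# for category_dirpath in MyUtils_filesystem.get_category_dirpaths():
#     categ_products_db = sqlite3.connect(os.path.join(category_dirpath, "products.db"))    # hypothetical filename
#     categ_questions_db = sqlite3.connect(os.path.join(category_dirpath, "questions.db"))  # hypothetical filename
#     obtain_category_instances(category_dirpath, categ_products_db, categ_questions_db,
#                               max_neg_cardinality=10**5)  # illustrative value only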
def write_positive_instances(num_ps_rows, prods_db_c, quests_db_c, outc):
    ### Iterate over the products:
    for rowid in range(1, num_ps_rows + 1):
        p_id = MyUtils_dbs.search_in_alltables_db(
            prods_db_c, "SELECT id FROM", "WHERE rowid = " + str(rowid))[0][0]
        ### Get all the Qs asked for the selected Ps; they will always be part of the dataset, since there are so few Ps
        q_ids_results = MyUtils_dbs.search_in_alltables_db(
            quests_db_c, "SELECT id FROM", "WHERE id LIKE '" + str(p_id) + "%'")
        q_ids_ls = [tpl[0] for tpl in q_ids_results]
        # filter: p and qs must have all features
        if product_has_allfeatures(prods_db_c, p_id) and allquestions_have_allfeatures(
                quests_db_c, str(q_ids_ls)):
            insertion_sequence = [(p_id, q_id, 1) for q_id in q_ids_ls]
            outc.executemany("INSERT INTO instances VALUES (?,?,?)", insertion_sequence)
        else:
            logging.info("Product %s excluded from the instances due to not having all the features", p_id)
def generator_of_batches(batch_size, dataset_type):
    if dataset_type == MyUtils_flags.FLAG_TRAIN:
        dataset_length = MyUtils_dbs.get_nn_dataset_length(MyUtils_flags.FLAG_TRAIN)
        db_conn = sqlite3.connect(F.NN_TRAIN_INSTANCES_DB)
    elif dataset_type == MyUtils_flags.FLAG_VALID:
        dataset_length = MyUtils_dbs.get_nn_dataset_length(MyUtils_flags.FLAG_VALID)
        db_conn = sqlite3.connect(F.NN_VALID_INSTANCES_DB)
    elif dataset_type == MyUtils_flags.FLAG_TEST:
        dataset_length = MyUtils_dbs.get_nn_dataset_length(MyUtils_flags.FLAG_TEST)
        db_conn = sqlite3.connect(F.NN_TEST_INSTANCES_DB)
    else:
        raise ValueError("Unknown dataset_type: " + str(dataset_type))
    c = db_conn.cursor()

    num_of_batches = dataset_length // batch_size + 1
    # the generator assumes the positive instances occupy the first half of the table and the negatives the second half
    half_mark_offset = dataset_length // 2

    for i in range(0, num_of_batches):
        start_index_pos = i * (batch_size // 2)
        end_index_pos = min((i + 1) * (batch_size // 2), half_mark_offset)
        start_index_neg = half_mark_offset + i * (batch_size // 2)
        # cap the negative-half slice at the end of the table, mirroring the cap on the positive half
        end_index_neg = min(half_mark_offset + (i + 1) * (batch_size // 2), dataset_length)

        c.execute("SELECT p_id, q_id, x,y FROM instances WHERE rowid IN " +
                  str(tuple(range(start_index_pos, end_index_pos))))
        rows = c.fetchall()
        c.execute("SELECT p_id, q_id, x,y FROM instances WHERE rowid IN " +
                  str(tuple(range(start_index_neg, end_index_neg))))
        rows_neg = c.fetchall()
        rows.extend(rows_neg)

        batch = list(map(lambda elem: (str(elem[0]), str(elem[1]), json.loads(elem[2]), int(elem[3])), rows))
        yield batch
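# Hedged usage sketch (not part of the original code): consuming the generator. The batch size is
# illustrative; each yielded element is a (p_id, q_id, x, y) tuple, where x is the decoded JSON
# feature dictionary and y the 0/1 label, as built in generator_of_batches above.
# for batch in generator_of_batches(batch_size=32, dataset_type=MyUtils_flags.FLAG_TRAIN):
#     for p_id, q_id, x, y in batch:
#         pass  # e.g. assemble the feed_dict for one training step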
def sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c):
    distance_list = []  # list of tuples, (p_id, q_id, distance), sorted on tpl[2]
    for quest_id in quests_ids:
        question_representation = MyUtils_dbs.search_in_alltables_db(testquests_rep_c, "SELECT * FROM ",
                                                                     "WHERE id = '" + str(quest_id) + "'")[0]
        logging.debug("Question representation: %s", question_representation)
        question_tuple = MyUtils.quest_lstonamedtuple(question_representation, offset=1)
        pq_dist = CD.compute_dist_pq(product_tuple, question_tuple)
        distance_list.append((product_tuple.id, quest_id, pq_dist))
    distance_list_sorted = sorted(distance_list, key=lambda tpl: tpl[2])
    return distance_list_sorted
def get_instance_encoded_dictionary(prod_id, question_id, ps_db_c, qs_db_c, d2v_model):
    product_row = MyUtils_dbs.search_in_alltables_db(
        ps_db_c, "SELECT * FROM ", "WHERE id = '" + prod_id + "'")
    question_row = MyUtils_dbs.search_in_alltables_db(
        qs_db_c, "SELECT * FROM ", "WHERE id = '" + str(question_id) + "'")
    prod_tuple = MyUtils.prodls_tonamedtuple(product_row[0])
    q_tuple = MyUtils.quest_lstonamedtuple(question_row[0])

    instance_x = {}
    instance_x["p_descvec"] = MyUtils_strings.fromstring_toarray(prod_tuple.descvec)
    instance_x["p_titlevec"] = MyUtils_strings.fromstring_toarray(prod_tuple.titlevec)
    instance_x["p_kwsVectors"] = MyUtils_strings.fromlls_toarrays(prod_tuple.kwsVectors)
    #logging.debug("instance_x['p_kwsVectors'].shape : %s", np.array(instance_x["p_kwsVectors"]).shape)
    instance_x["p_mdcategories"] = MyUtils_strings.categories_to_vecs_lls(
        MyUtils_strings.fromlls_toarrays(prod_tuple.mdcategories), d2v_model)
    if len(np.array(instance_x["p_mdcategories"]).shape) >= 3:
        logging.debug("instance_x['p_mdcategories'].shape : %s", np.array(instance_x["p_mdcategories"]).shape)
        instance_x["p_mdcategories"] = instance_x["p_mdcategories"][0]
    instance_x["q_questionVec"] = MyUtils_strings.fromstring_toarray(q_tuple.questionVec)
    instance_x["q_questionType"] = q_tuple.questionType
    instance_x["q_kwsVectors"] = MyUtils_strings.fromlls_toarrays(q_tuple.kwsVectors)

    instance_y = 1 if q_tuple.id[0:10] in prod_id else 0
    instance = namedtuple('instance', 'x y')
    inst = instance(x=instance_x, y=instance_y)
    return inst
def train_NN(learning_rate=0.01, max_epochs=1000, batch_size=32, dropout_rate=0, hiddenlayers_ls=None):
    MyUtils.init_logging("train_NN.log")

    hiddenlayers_ls_str = [str(num_elems) for num_elems in hiddenlayers_ls]
    tensorboard_dir_path = os.path.join(F.TENSORBOARD_ANN_DIR,
                                        'trainingset_' + str(MyUtils_dbs.get_nn_dataset_length("train")),
                                        "bs_" + str(batch_size),
                                        "hls_" + "-".join(hiddenlayers_ls_str),
                                        "lr_" + str(learning_rate),
                                        "drop_" + str(dropout_rate) + "eps_" + str(max_epochs))
    if not os.path.exists(tensorboard_dir_path):
        os.makedirs(tensorboard_dir_path)
    MyUtils_filesystem.clean_directory(tensorboard_dir_path)

    tf.reset_default_graph()
    session = tf.Session()

    logging.info("Creating the placeholders for input and labels...")
    (input_placeholder, labels_placeholder) = NN.get_model_placeholders(batch_size)
    placeholders = (input_placeholder, labels_placeholder)

    logging.info("Connecting the loss computation and forward structure...")
    train_loss = NN.nn_loss_computation(logits=NN.nn_inference(input=input_placeholder,
                                                               layers_hidden_units_ls=hiddenlayers_ls,
                                                               dropout_rate=dropout_rate),
                                        labels=labels_placeholder)
    lrate_tensor = tf.placeholder(shape=[], dtype=tf.float32, name="lrate_tensor")

    ####### Defining the optimizer
    if str(learning_rate).lower() == MyUtils_flags.FLAG_ADAM:
        starting_lrate = MyUtils_flags.FLAG_ADAM
        optimizer = tf.train.AdamOptimizer()
    else:
        if str(learning_rate).lower() == MyUtils_flags.FLAG_RMSPROP:
            starting_lrate = MyUtils_flags.FLAG_RMSPROP
            optimizer = tf.train.RMSPropOptimizer(0.001)
        else:
            starting_lrate = learning_rate
            optimizer = tf.train.GradientDescentOptimizer(lrate_tensor)
    if str(learning_rate).lower() == MyUtils_flags.FLAG_CLR:
        _best_lr, min_lr, max_lr = CLR.find_cyclical_lrate_loop(placeholders, batch_size, hiddenlayers_ls, dropout_rate)

    # Summaries, and gathering information:
    train_loss_summary = tf.summary.scalar('Cross-entropy', train_loss)
    predictions = tf.argmax(tf.nn.softmax(logits=NN.nn_inference(input_placeholder, hiddenlayers_ls, dropout_rate)),
                            axis=1, name="predictions")
    tf_metric, tf_metric_update = tf.metrics.accuracy(labels=labels_placeholder, predictions=predictions,
                                                      name="accuracy")
    accuracy_summary = tf.summary.scalar('Accuracy', tf_metric_update)

    logging.info("Defining the optimizer's minimization task on the loss function...")
    minimizer_task = optimizer.minimize(train_loss)

    # Global variables are initialized after the graph structure
    tf.global_variables_initializer().run(session=session)

    # defining the tasks that will be run inside the training loop
    training_tasks = [minimizer_task, train_loss, predictions, tf_metric_update]
    validation_tasks = [tf_metric_update, predictions]
    validation_writing_tasks = [accuracy_summary]
    train_writing_tasks = [train_loss_summary, accuracy_summary]
    tasks_dictionary = {MyUtils_flags.FLAG_TRAIN_TASKS: training_tasks,
                        MyUtils_flags.FLAG_WRITING_TRAIN_TASKS: train_writing_tasks,
                        MyUtils_flags.FLAG_VALIDATION_TASKS: validation_tasks,
                        MyUtils_flags.FLAG_WRITING_VALIDATION_TASKS: validation_writing_tasks}

    # connection to the validation dataset
    valid_db_conn = sqlite3.connect(F.NN_VALID_INSTANCES_DB)
    valid_db_cursor = valid_db_conn.cursor()

    if str(learning_rate).lower() == MyUtils_flags.FLAG_CLR:
        CLR.training_loop_clr(tasks_dictionary, placeholders, batch_size, max_epochs, min_lr, max_lr,
                              valid_db_cursor, tensorboard_dir_path)
    else:
        training_loop(tasks_dictionary, placeholders, starting_lrate, batch_size, max_epochs,
                      valid_db_cursor, tensorboard_dir_path, session)
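# Hedged usage sketch (not part of the original code): the hyperparameter values below are
# illustrative only. learning_rate also accepts the string flags handled above
# (MyUtils_flags.FLAG_ADAM, FLAG_RMSPROP, FLAG_CLR) instead of a numeric value, and
# hiddenlayers_ls must be provided explicitly.
# train_NN(learning_rate=0.01, max_epochs=100, batch_size=32,
#          dropout_rate=0.1, hiddenlayers_ls=[512, 128])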
def define_negative_examples(doc2vec_model, dataset_typeflag):
    MyUtils.init_logging("NN_Dataset_Instances-define_negative_examples.log", logging.INFO)
    f = open(F.PRODS_WITH_NOTASKEDQUESTS_IDS, "w")
    f.close()
    prodsnegativeqs_outfile = open(F.PRODS_WITH_NOTASKEDQUESTS_IDS, "a")
    prodsnegativeqs_outfile.write("id_questionsNotAsked\n")

    ### Connect with the database to read from: candidate negative examples
    db_conn = sqlite3.connect(F.CANDIDATE_NEGQS_DB)
    c = db_conn.cursor()

    ### If we are creating the training dataset, then before allowing a question Q asked for P2
    ### to be a negative example for P1, we check the similarity between P1 and P2 (it must not be too high)
    if dataset_typeflag == MyUtils_flags.FLAG_TRAIN:
        ### Determining the maximum allowed similarity between products. Creates the similarity db if it does not exist
        if os.path.exists(F.SIMILARITY_PRODUCTS_DB) == True:
            p_sim_breakpoint = ES.get_products_similarity_breakpoint(fraction=0.97)
        else:
            p_sim_breakpoint = ES.explore_products_similarity(N=500, fraction=0.97)
        ### Connect with the databases of product and questions representations, to be able to pick the products P1 and P2
        product_reps_dbconn = sqlite3.connect(F.PRODUCTS_FINAL_TRAIN_DB)
        product_reps_c = product_reps_dbconn.cursor()

    segment_size = 10**4
    for input_segment in pd.read_csv(F.PRODSWITHQUESTS_IDS, sep="_", chunksize=segment_size):
        for id_askedqs_t in input_segment.itertuples():
            prod_id = id_askedqs_t.id
            #logging.debug("Reading from F.PRODSWITHQUESTS_IDS, the product.id is: %s", prod_id)
            asked_qs = ast.literal_eval(id_askedqs_t.questionsAsked)
            t = (prod_id,)
            c.execute('SELECT * FROM prodnegatives WHERE prod_id=?', t)
            row = c.fetchone()
            if row is None:
                # i.e. if the product in the file PRODSWITHQUESTS_IDS was excluded from the previous random subsampling
                continue
            candidatenegativeqs_rawstring = row[1]
            candidatenegativeqs_string = "[" + candidatenegativeqs_rawstring[:-1] + "]"
            candidatenegativeqs_ls = ast.literal_eval(candidatenegativeqs_string)
            candidatenegativeqs_ls1 = [q_id for q_id in candidatenegativeqs_ls if q_id not in asked_qs]

            if dataset_typeflag == MyUtils_flags.FLAG_TRAIN:
                p1_row = MyUtils_dbs.search_in_alltables_db(
                    dbcursor=product_reps_c,
                    query_pretext="SELECT * FROM",
                    query_aftertext=" WHERE id='" + str(prod_id) + "'")[0]
                candidatenegativeqs_asins = list(map(lambda q_id: q_id[0:10], candidatenegativeqs_ls1))
                p2_rows = MyUtils_dbs.search_in_alltables_db(
                    dbcursor=product_reps_c,
                    query_pretext="SELECT * FROM",
                    query_aftertext="WHERE id IN " + str(tuple(candidatenegativeqs_asins)))
                qids_and_p2rows = list(zip(candidatenegativeqs_ls1, p2_rows))
                for q_id, p2_row in qids_and_p2rows:
                    #logging.debug("p1_row : %s", p1_row)
                    if p2_row is not None and len(p2_row) > 0:
                        # there are questions without corresponding products, in which case no similarity check is to be done
                        p1_tuple = MyUtils.prodls_tonamedtuple(p1_row)  #[1:]?
                        p2_tuple = MyUtils.prodls_tonamedtuple(p2_row)
                        p1_p2_sim, _simparts = PS.compute_2products_similarity_singleprocess(
                            prod1_tuple=p1_tuple, prod2_tuple=p2_tuple, d2v_model=doc2vec_model)
                        if p1_p2_sim > p_sim_breakpoint:
                            candidatenegativeqs_ls1.remove(q_id)
                            logging.info(
                                "Removing question from the candidate negative examples, " +
                                "because the similarity between %s and %s is > %s",
                                prod_id, p2_tuple.id, p_sim_breakpoint)
                logging.info("Choosing negative examples: P-to-p similarity checks done for product: %s", prod_id)

            random_indices = sorted(
                np.random.choice(a=range(len(candidatenegativeqs_ls1)),
                                 size=min(len(candidatenegativeqs_ls1), len(asked_qs)),
                                 replace=False, p=None))
            #logging.info(candidatenegativeqs_ls1)
            negativeqs_ls = [candidatenegativeqs_ls1[i] for i in random_indices]
            #logging.info(negativeqs_ls)
            prodsnegativeqs_outfile.write(prod_id + "_" + str(negativeqs_ls) + "\n")

    prodsnegativeqs_outfile.close()
def get_num_training_iterations(batch_size):
    trainset_length = MyUtils_dbs.get_nn_dataset_length(MyUtils_flags.FLAG_TRAIN)
    max_iter = trainset_length // batch_size  # in 1 epoch, you can not have more iterations than batches
    logging.info("Number of iterations per epoch: %s", max_iter)
    return max_iter
def run_hedge(actions_ls=None, eta=None, max_T=None, balanced_instances=True, restart_candidates=True):
    MyUtils.init_logging("Hedge-run_hedge.log")

    ### initialization: either we use the single global balanced dataset, or the imbalanced category datasets
    if balanced_instances:
        dbs_paths = [(F.ONLINE_INSTANCEIDS_GLOBAL_DB, F.PRODUCTS_FINAL_TRAIN_DB, F.QUESTIONS_FINAL_TRAIN_DB)]
    else:
        category_dirpaths = MyUtils_filesystem.get_category_dirpaths()
        dbs_paths = []  # list of tuples, with 3 elements: instances_db, products_db, questions_db
        for c_dir_path in category_dirpaths:
            for fname in os.listdir(c_dir_path):
                if "db" in fname:
                    if MyUtils_flags.FLAG_INSTANCEIDS in fname:
                        categ_instances_dbpath = os.path.join(c_dir_path, fname)
                    elif MyUtils_flags.FLAG_PRODUCTS in fname:
                        categ_prods_dbpath = os.path.join(c_dir_path, fname)
                    else:
                        categ_qs_dbpath = os.path.join(c_dir_path, fname)
            dbs_paths.append((categ_instances_dbpath, categ_prods_dbpath, categ_qs_dbpath))

    ### connecting with the database containing the candidates
    if balanced_instances:
        output_candidates_dbpath = F.CANDIDATES_ONLINE_BALANCED_DB
    else:
        output_candidates_dbpath = F.CANDIDATES_ONLINE_UNBALANCED_DB
    if restart_candidates:
        f = open(output_candidates_dbpath, "w")
        f.close()
    output_candidates_db = sqlite3.connect(output_candidates_dbpath)
    output_candidates_c = output_candidates_db.cursor()
    if restart_candidates:
        output_candidates_c.execute("""CREATE TABLE candidates ( p_id varchar(63), q_id varchar(63) )""")

    # For each dataset: connect to databases of instances, Ps, and Qs
    for (instances_dbpath, prods_dbpath, quests_dbpath) in dbs_paths:
        instances_db = sqlite3.connect(instances_dbpath)
        instances_ids_c = instances_db.cursor()
        prods_db = sqlite3.connect(prods_dbpath)
        ps_c = prods_db.cursor()
        quests_db = sqlite3.connect(quests_dbpath)
        qs_c = quests_db.cursor()

        chosen_dataset_name = os.path.basename(os.path.dirname(instances_dbpath))
        logging.info("Online Learning: operating on dataset: %s", chosen_dataset_name)

        #### define the number of rounds
        if max_T is None:
            max_T = MyUtils_dbs.get_tot_num_rows_db(instances_ids_c)
        logging.info("Total number of rounds (i.e. instances in the training set): %s", max_T)

        #### define the actions
        if actions_ls is None:
            if balanced_instances == False:
                actions_ls = ACD.get_actionsforcategories()
            else:
                actions_ls = AGD.get_actionsforbalanced()

        #### define the "learning rate"
        if eta is None:
            eta = np.sqrt((2 * np.log(len(actions_ls))) / max_T)

        #### output directory for Tensorboard logging
        results_dirpath = os.path.join('OnlineLearning', 'Experiments_results', str(chosen_dataset_name),
                                       'numactions_' + str(len(actions_ls)), 'instances_' + str(max_T))
        # datetime.datetime.today().strftime('%Y-%m-%d')
        if not os.path.exists(results_dirpath):
            os.makedirs(results_dirpath)
            MyUtils_filesystem.clean_directory(results_dirpath)
            #### the actual core of the algorithm
            hedge_loop(eta, max_T, actions_ls, instances_ids_c, ps_c, qs_c, output_candidates_db,
                       balanced_instances, results_dirpath)
        else:
            logging.info("Online Learning results already computed for : %s", results_dirpath)
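# The body of hedge_loop is not shown here. As a reference, the following is a minimal, self-contained
# sketch of the standard Hedge (multiplicative-weights) update that the eta computed in run_hedge would
# drive; the function name, the loss convention (losses in [0, 1], lower is better) and the example
# values are assumptions for illustration only, not the project's actual loop.
def hedge_update_sketch(weights, losses, eta):
    """One Hedge round: exponentially down-weight the actions that incurred higher loss."""
    new_weights = weights * np.exp(-eta * np.asarray(losses))
    return new_weights / np.sum(new_weights)  # renormalize to a probability distribution

# Example: with 3 actions and a uniform start, the low-loss action gains probability mass:
# hedge_update_sketch(np.ones(3) / 3, [0.9, 0.1, 0.5], eta=0.8)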
def attach_text_to_candidates(ranked_candidates_dbpath, prods_initial_dbpath, quests_initial_dbpath,
                              prod_reps_dbpath, quest_reps_dbpath, final_outdb_path):
    MyUtils.init_logging("Attach_text_to_candidates.log")
    candidates_nn_db = sqlite3.connect(ranked_candidates_dbpath)
    cands_db_c = candidates_nn_db.cursor()

    f = open(F.RANKING_TEMP_DB, 'w'); f.close()
    temp_db = sqlite3.connect(F.RANKING_TEMP_DB)
    temp_db_c = temp_db.cursor()

    testprods_initial_c = sqlite3.connect(prods_initial_dbpath).cursor()
    testquests_initial_c = sqlite3.connect(quests_initial_dbpath).cursor()
    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()

    temp_db_c.execute('''CREATE TABLE candidates(
                            p_id varchar(63),
                            q_id varchar(63),
                            distance int,
                            p_titletext varchar(1023),
                            p_descriptiontext varchar(8191),
                            p_categorytext varchar (4095),
                            q_text varchar (8191) )''')

    num_of_candidates = MyUtils_dbs.get_tot_num_rows_db(cands_db_c)
    logging.info(num_of_candidates)

    counter_questionsameid = 0
    last_prod_id = 'x'
    for rowindex in range(1, num_of_candidates + 1):
        row = cands_db_c.execute("SELECT * FROM candidates WHERE rowid = ?", (rowindex,)).fetchone()
        #logging.info("info: %s", row)
        prod_id = row[0]
        quest_id = row[1]
        distance = row[2]

        if last_prod_id != prod_id:
            product_titleinfo, product_descinfo, product_categinfo = \
                MyUtils_dbs.search_in_alltables_db(testprods_initial_c,
                                                   "SELECT title, description, categories FROM",
                                                   "WHERE asin = '" + str(prod_id) + "'")[0]
            product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                                        "WHERE id = '" + str(prod_id) + "'")[0]
            prod_tpl = MyUtils.prodls_tonamedtuple(product_representation, offset=1)
            counter_questionsameid = 0
            last_prod_id = prod_id  # remember the product we just looked up, so the lookup runs once per product

        ### get the question's unixTime
        if len(quest_id) < 21:  # format : @nan0
            base_endpoint = 14
            question_unixTime = str(quest_id[11:base_endpoint])
        else:
            base_endpoint = 23
            question_unixTime = str(quest_id[11:base_endpoint])
        logging.debug("Question unixTime: %s", question_unixTime)

        if base_endpoint == 23:  # if we have a valid unixTime specification
            possible_questions_text = MyUtils_dbs.search_in_alltables_db(
                testquests_initial_c, "SELECT question FROM",
                "WHERE asin = '" + str(quest_id[0:10]) + "'" +
                " AND unixTime LIKE '" + question_unixTime + "%'")
        else:  # if we have NULL in the unixTime field
            possible_questions_text = MyUtils_dbs.search_in_alltables_db(
                testquests_initial_c, "SELECT question FROM",
                "WHERE asin = '" + str(quest_id[0:10]) + "'" + " AND unixTime IS NULL")

        base_q_id = str(quest_id[0:23])
        possible_questions_reps = MyUtils_dbs.search_in_alltables_db(
            testquests_rep_c, "SELECT * FROM ", "WHERE id LIKE '" + str(base_q_id) + "%'")
        logging.debug("possible_questions_reps: %s", possible_questions_reps)
        logging.debug("possible_questions_text:%s", possible_questions_text)

        if len(possible_questions_text) > 1:
            possible_questions_tuples = list(map(lambda q_ls: MyUtils.quest_lstonamedtuple(q_ls, offset=1),
                                                 possible_questions_reps))
            possible_questions_distances = list(map(lambda q_tpl: CD.compute_dist_pq(prod_tpl, q_tpl),
                                                    possible_questions_tuples))
            qs_dist_lts = list(zip(possible_questions_tuples, possible_questions_distances))
            qs_dist_lts_sorted = sorted(qs_dist_lts, key=lambda tpl: tpl[1])
            #logging.info("sorted question tuples: %s", qs_dist_lts_sorted)
            question_textinfo = possible_questions_text[counter_questionsameid][0]
            counter_questionsameid = counter_questionsameid + 1
        else:
            question_textinfo = possible_questions_text[0][0]
        logging.debug("question_textinfo: %s", question_textinfo)

        temp_db_c.execute("INSERT INTO candidates VALUES (?,?,?,?,?,?,?)",
                          (prod_id, quest_id, distance, product_titleinfo, product_descinfo,
                           product_categinfo, question_textinfo))
        logging.debug("***")

    temp_db.commit()
    os.rename(F.RANKING_TEMP_DB, final_outdb_path)
def find_cyclical_lrate_loop(placeholders, batch_size, hiddenlayers_ls, drop_rate, lrate_start=10**(-7), lrate_end=0.2):
    trainset_length = MyUtils_dbs.get_nn_dataset_length(MyUtils_flags.FLAG_TRAIN)
    max_iter = trainset_length // batch_size  # in 1 epoch, you can not have more iterations than batches

    hiddenlayers_ls_str = [str(num_elems) for num_elems in hiddenlayers_ls]
    tensorboard_dir_path = os.path.join(F.TENSORBOARD_ANN_DIR,
                                        'trainingset_' + str(MyUtils_dbs.get_nn_dataset_length("train")),
                                        "bs_" + str(batch_size),
                                        "hls_" + "-".join(hiddenlayers_ls_str),
                                        "lr_clr",
                                        "drop_" + str(drop_rate) + "_explore")
    if not os.path.exists(tensorboard_dir_path):
        os.makedirs(tensorboard_dir_path)

    session = tf.Session()  # separate session for trying to find the optimal l.r. for the Cyclical Learning Rate
    logging.info("*** Session: Cyclical Learning Rate")
    (input_placeholder, labels_placeholder) = placeholders

    train_loss = NN.nn_loss_computation(logits=NN.nn_inference(input=input_placeholder,
                                                               layers_hidden_units_ls=hiddenlayers_ls,
                                                               dropout_rate=drop_rate),
                                        labels=labels_placeholder)
    #train_loss_summary = tf.summary.scalar('Cross-entropy', train_loss)
    predictions = tf.argmax(tf.nn.softmax(logits=NN.nn_inference(input_placeholder, hiddenlayers_ls, drop_rate)),
                            axis=1)
    #tf_metric, tf_metric_update = tf.metrics.accuracy(labels=labels_placeholder, predictions=predictions,
    #                                                  name="CLR_train_accuracy")
    running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="CLR_train_accuracy")
    #train_accuracy_summary = tf.summary.scalar('Accuracy', tf_metric_update)

    lrate_tensor = tf.placeholder(shape=[], dtype=tf.float32, name="lrate_tensor")
    optimizer = tf.train.GradientDescentOptimizer(lrate_tensor)
    minimizer_task = optimizer.minimize(train_loss)

    # Global variables are initialized after the graph structure
    tf.global_variables_initializer().run(session=session)
    running_vars_initializer = tf.variables_initializer(var_list=running_vars)
    session.run(running_vars_initializer)

    logging.info("Number of iterations per epoch (and linear steps in the search for the learning rate): %s", max_iter)
    lrate_increase = (lrate_end - lrate_start) / max_iter
    logging.info("Step increase of the learning rate in the exploration epochs: %s", round(lrate_increase, 7))

    trial_epochs = 5
    loss_matrix = np.zeros((trial_epochs, max_iter))
    accuracy_matrix = np.zeros((trial_epochs, max_iter))

    for i in range(1, trial_epochs + 1):
        start_epoch_time = time()
        logging.info("Search for the base learning rate; Starting training epoch n. %s", i)
        batch_generator = NN.generator_of_batches(batch_size, MyUtils_flags.FLAG_TRAIN)

        # Train, in the current epoch
        for j in range(0, max_iter):
            session.run(running_vars_initializer)  # new batch: re-initializing the accuracy computation
            batch = batch_generator.__next__()
            current_iteration_feed_dict = NN.fill_feed_dict(batch, input_placeholder, labels_placeholder)
            learning_rate = lrate_start + lrate_increase * j
            current_iteration_feed_dict.update({lrate_tensor: learning_rate})
            if j % (max_iter // 20) == 0:
                logging.info("Iteration: %s on %s .", j, max_iter)
            _, current_loss, b_predictions, b_labels = session.run(
                [minimizer_task, train_loss, predictions, labels_placeholder],
                feed_dict=current_iteration_feed_dict)
            loss_matrix[i - 1][j] = current_loss
            accuracy_matrix[i - 1][j] = get_batch_accuracy(b_predictions, b_labels)

        end_epoch_time = time()
        logging.info("Searching for the base values for the cyclical learning rate. " +
                     "Training on epoch %s executed. Time elapsed: %s",
                     i, round(end_epoch_time - start_epoch_time, 3))

    best_lr, min_lr, max_lr = pick_lr_boundaries(loss_matrix, lrate_start, lrate_increase)
    session.close()

    # write the lr to a logfile
    lrfile = open(os.path.join(tensorboard_dir_path, "found_lr.log"), "w")
    lrfile.write("Cyclical Learning rate: applying the LR test on " + str(trial_epochs) + " epochs ;\n " +
                 "the average learning rate granting the steepest descent of the loss function is: " + str(best_lr))
    lrfile.close()
    return best_lr, min_lr, max_lr