def write_training_samples(prod_features_flags, quest_feature_flags, dataset_type):
    MyUtils.init_logging("WriteTrainingSamples.log")
    if "train" in dataset_type:
        instances_db = F.NN_TRAIN_INSTANCES_DB
    elif "valid" in dataset_type:
        instances_db = F.NN_VALID_INSTANCES_DB
    else:  # "test"
        instances_db = F.NN_TEST_INSTANCES_DB
    f = open(instances_db, "w")
    f.close()  # clean the output db between runs

    db_conn = sqlite3.connect(instances_db)
    c = db_conn.cursor()
    c.execute('''CREATE TABLE instances( p_id varchar(63),
                                         q_id varchar(63),
                                         x varchar(8191),
                                         y tinyint )''')
    db_conn.commit()
    # n: since there are no deletes, I can use the rowid as the 'index', avoiding the need for an autoincrement field.
    # First all positive instances, then all negative instances.
    # The training batches are later extracted in such a way that they are random and balanced, anyway.
    write_part_training_samples(True, prod_features_flags, quest_feature_flags, db_conn, dataset_type)
    write_part_training_samples(False, prod_features_flags, quest_feature_flags, db_conn, dataset_type)
    db_conn.close()
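# --- Illustrative sketch (assumption, not part of the original pipeline) ---
# The comment above states that batches are later drawn randomly and balanced.
# A minimal way to do that with the table created here, assuming the first half of the
# rowids holds positive instances and the second half negative ones; the helper name
# and batch_size parameter are made up for illustration.
import sqlite3
import numpy as np

def sample_balanced_batch_sketch(instances_db_path, batch_size=32):
    """Return batch_size/2 positive and batch_size/2 negative rows, picked at random by rowid."""
    conn = sqlite3.connect(instances_db_path)
    c = conn.cursor()
    total = c.execute("SELECT COUNT(*) FROM instances").fetchone()[0]
    half = total // 2
    pos_ids = np.random.choice(range(1, half + 1), batch_size // 2, replace=False)
    neg_ids = np.random.choice(range(half + 1, total + 1), batch_size // 2, replace=False)
    rows = []
    for rowid in list(pos_ids) + list(neg_ids):
        rows.append(c.execute("SELECT * FROM instances WHERE rowid = ?", (int(rowid),)).fetchone())
    conn.close()
    return rows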
def filter_matches_allfeatures():
    MyUtils.init_logging("OnlineLearning_DefineInstances.log")
    ps_db = sqlite3.connect(F.PRODUCTS_FINAL_TRAIN_DB)
    qs_db = sqlite3.connect(F.QUESTIONS_FINAL_TRAIN_DB)
    ps_db_c = ps_db.cursor()
    qs_db_c = qs_db.cursor()

    pqs_allmatches_file = open(F.ONLINE_PQMATCHES, "r")
    pqs_allmatches_df = pd.read_csv(pqs_allmatches_file, sep="_")
    filtered_matches = []
    for pqs_t in pqs_allmatches_df.itertuples():
        condition_p = product_has_allfeatures(ps_db_c, pqs_t.id)
        logging.info(pqs_t.id)
        condition_q = allquestions_have_allfeatures(qs_db_c, pqs_t.questionsAsked)
        if condition_p and condition_q:
            filtered_matches.append(pqs_t)

    pqs_allmatches_df = pd.DataFrame(filtered_matches)
    pqs_filteredmatches_file = open(F.ONLINE_PQMATCHES_FILTERED, "w")
    pqs_allmatches_df.to_csv(pqs_filteredmatches_file, sep="_")
    logging.info("Number of products with matching questions, that have valid values for all the features: %s",
                 len(filtered_matches))
    pqs_allmatches_file.close()
    pqs_filteredmatches_file.close()
    del pqs_allmatches_df
def organize_category_datasets():
    MyUtils.init_logging("OnlineLearning-organize_category_datasets.log")
    qs_csv_fpaths = get_csvs_filepaths()
    category_dir_paths = organize_questions(qs_csv_fpaths)
    for category_dir_p in category_dir_paths:
        attach_category_products(category_dir_p)
def create_categories_dbs():
    categ_dirpaths = get_category_dirpaths()
    MyUtils.init_logging("OnlineLearning_create_categories_dbs.log")
    for categ_dir_p in categ_dirpaths:
        base_name = os.path.basename(categ_dir_p)
        RD.clean_representations_dbs(categ_dir_p)
        for filename in os.listdir(categ_dir_p):
            if not (utilities.MyUtils_flags.FLAG_PRODUCTS in filename) and not (utilities.MyUtils_flags.FLAG_INITIAL in filename):
                quests_csv_fname = filename
                quests_csv_path = os.path.join(categ_dir_p, quests_csv_fname)
                logging.info("Questions csv file: %s", quests_csv_path)
            elif utilities.MyUtils_flags.FLAG_PRODUCTS in filename:
                prods_csv_fname = filename
                prods_csv_path = os.path.join(categ_dir_p, prods_csv_fname)
                logging.info("Products csv file: %s", prods_csv_path)

        quests_db_path = os.path.join(categ_dir_p,
                                      utilities.MyUtils_strings.remove_string_end(quests_csv_fname, '.csv') + '.db')
        RD.create_representations_db(quests_csv_path, quests_db_path)
        logging.info("Category: %s. Created database for the questions. Proceeding to create the db for products...",
                     base_name)

        prods_db_path = os.path.join(categ_dir_p,
                                     utilities.MyUtils_strings.remove_string_end(prods_csv_fname, '.csv') + '.db')
        RD.create_representations_db(prods_csv_path, prods_db_path)
        logging.info("Category: %s. Created database for the products", base_name)
def create_questions_representations():
    MyUtils.init_logging("OnlineLearning_create_questions_representations.log")
    categ_dirpaths = get_category_dirpaths()
    (d2v_model, phrases_model) = RC.load_the_models()
    for categ_dir in categ_dirpaths:
        create_category_qs_representations(categ_dir, d2v_model, phrases_model)
        collect()
def sort_candidates(candidates_db_path, ranked_candidates_outdb_path, prod_reps_dbpath, quest_reps_dbpath):
    MyUtils.init_logging("Rank_candidates_nn.log")

    ### Connecting to the databases: candidates, test products, test questions
    candidates_nn_db = sqlite3.connect(candidates_db_path)
    cands_db_c = candidates_nn_db.cursor()
    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()

    f = open(ranked_candidates_outdb_path, "w"); f.close()  # clean the output db between runs
    outdb = sqlite3.connect(ranked_candidates_outdb_path)
    outdb_c = outdb.cursor()
    outdb_c.execute('''CREATE TABLE candidates( p_id varchar(63),
                                                q_id varchar(63),
                                                distance int )''')
    ###

    test_products_ids = cands_db_c.execute("SELECT DISTINCT p_id FROM candidates").fetchall()
    logging.info(test_products_ids[0])
    # logging.debug(test_products_ids)

    for tpl_pid in test_products_ids:
        pid = tpl_pid[0]
        product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                                    "WHERE id = '" + str(pid) + "'")[0]
        product_tuple = MyUtils.prodls_tonamedtuple(product_representation, offset=1)
        quests_ids = list(map(lambda results_tpl: results_tpl[0],
                              cands_db_c.execute("SELECT q_id FROM candidates WHERE p_id = ?", tpl_pid).fetchall()))
        logging.debug(quests_ids)
        product_qs_sorted = sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c)
        outdb.executemany("INSERT INTO candidates VALUES (?,?,?)", product_qs_sorted)
        outdb.commit()
def compute_encoding_questions(questions_filepath, out_db_filepath):
    MyUtils.init_logging("ComputeEncodingQuestions.log")
    questions_file = open(questions_filepath, "r")

    f = open(out_db_filepath, mode="w"); f.close()  # clean between runs
    db_conn = sqlite3.connect(out_db_filepath)
    c = db_conn.cursor()
    c.execute('''CREATE TABLE qs_numenc(q_id varchar(63) NOT NULL,
                                        has_questionType tinyint,
                                        has_questionVec tinyint,
                                        has_kwsVectors tinyint,
                                        encoding_questionType varchar(15),
                                        encoding_questionVec varchar(8191),
                                        encoding_kwsVectors varchar(8191),
                                        PRIMARY KEY (q_id) )''')
    db_conn.commit()

    segment_size = 5 * 10**3
    segment_id = 1
    for input_segment in pd.read_csv(questions_file, sep="_", chunksize=segment_size):
        segment_start = time()
        for quest_t in input_segment.itertuples():
            if len(quest_t.id) >= 5:  # filter out undue headers
                # logging.info(quest_t)
                (q_flags, encodings) = get_num_encoding_quest(quest_t)
                c.execute('''INSERT INTO qs_numenc VALUES (?,?,?,?,?,?,?);''',
                          (quest_t.id, int(q_flags[0]), int(q_flags[1]), int(q_flags[2]),
                           str(encodings[0]), str(encodings[1]), str(encodings[2])))
        segment_end = time()
        db_conn.commit()
        logging.info("Encoded questions' chunk n. %s ... Time elapsed = %s",
                     segment_id, round(segment_end - segment_start, 3))
        segment_id = segment_id + 1
def process_all_mdinfo(prods_in_df, outfilepath, phrases_model, d2v_model):
    MyUtils.init_logging("ExtractMetadataInfo.log")
    f = open(outfilepath, "w")
    f.close()  # clean between runs
    sw_pattern = PPD.getStopwordsPattern(includePunctuation=True)

    logging.info("Started postprocessing other metadata info")
    segment_nrows = 5 * 10**4
    logging.info("Number of elements in a segment: %s", str(segment_nrows))

    with open(outfilepath, "a") as out_file:
        out_file.write("_id_description_price_titlevec_mdcategories\n")
        for input_segment in pd.read_csv(prods_in_df, chunksize=segment_nrows, sep="_"):
            chunk_start = time()
            mdinfo_lts = []
            for prod_tupl in input_segment.itertuples():
                prodinfo_tuple = process_prodinfo(prod_tupl, phrases_model, d2v_model, sw_pattern)
                mdinfo_lts.append(prodinfo_tuple)
            pd.DataFrame(mdinfo_lts).to_csv(out_file, mode="a", header=False, sep="_")
            chunk_end = time()
            logging.info("Processing: other metadata info. Segment completed in time: %s seconds",
                         str(round(chunk_end - chunk_start, 3)))
    logging.info("Completed: processing product metadata.")
def test():
    MyUtils.init_logging("VectorizeDescriptions.log")
    docs_percent_touse = 1  # on the full training set, 0.3 is probably advisable.
    chunk_size = 10**5
    doc_filenames = [F.DESCDOCS]  # , F.QADOCS_FILEPATH
    trainingset_ls = []
    for doc_filename in doc_filenames:
        for descdocs_chunk in pd.read_csv(doc_filename, chunksize=chunk_size):
            len_c = len(descdocs_chunk)
            indices = list(sorted(numpy.random.choice(len_c, int(docs_percent_touse * len_c), replace=False)))
            selected_rows = descdocs_chunk.iloc[indices]
            docs = []
            for tupl in selected_rows.itertuples():
                docs.append(D2V.TaggedDocument(words=ast.literal_eval(tupl.words), tags=ast.literal_eval(tupl.tags)))
            trainingset_ls.extend(docs)
            logging.info("Reading in the documents' words. Chunk processed...")
        logging.info("Completed: reading in a set of documents.")

    d2v_model = load_model()
    subset = trainingset_ls[0:5]
    logging.debug("%s", str(subset))
    for doc in subset:
        tag = doc.tags
        logging.debug("*** : %s", str(tag))
        logging.debug("XXX : %s", str(tag[0]))
        logging.debug("%s", str(d2v_model.docvecs[tag[0]]))
def vectorize_keywords(in_kwsdf_filepath, phrases_model, d2v_model, out_kwvecs_filepath):
    MyUtils.init_logging(logfilename="MyRAKE_vectorizekws.log")
    logging.info("Started to vectorize the keywords")
    f = open(out_kwvecs_filepath, "w")
    f.close()  # clean between runs

    segment_nrows = 10**4
    logging.info("Number of elements in a segment: %s", str(segment_nrows))
    current_segment = 1
    max_len = 10  # n of keywords to extract

    with open(out_kwvecs_filepath, "a") as outfile:
        outfile.write(",id,kwsVectors\n")
        for input_segment in pd.read_csv(in_kwsdf_filepath, chunksize=segment_nrows):
            executor = pathos.pools.ThreadPool(multiprocessing.cpu_count())
            t00 = time()
            args = ((elem_kws, phrases_model, d2v_model, max_len) for elem_kws in input_segment.itertuples())
            kws_vecs = list(executor.map(vectorize_kw_ls, args))
            pd.DataFrame(kws_vecs).to_csv(outfile, mode='a', header=False)
            logging.info("Keyword vectorization; segment n.%s of the input dataframe has been processed...",
                         current_segment)
            current_segment = current_segment + 1
            t11 = time()
            logging.info("Time elapsed for a segment: %s", str(round(t11 - t00, 3)))
            executor.terminate()
            executor.restart()
    logging.info("Keyword vectorization: finished.")
def createQuestionDocuments():
    MyUtils.init_logging("PreprocessQuestions.log")
    ds = open(F.QADOCS_RAW, 'w')  # cleaning the file between runs
    ds.close()

    start_creatingInput = time.time()
    # Method: itertuples + pickle. Objective: preprocess text and create TaggedDocuments
    sw_pattern = PPD.getStopwordsPattern(includePunctuation=False)
    punct_pattern = re.compile(r'([!"#$%&()*+,./:;<=>?@\[\\\]^_`{|}-~\'])|([--])')
    chunk_length = 0.5 * (10**5)

    with open(F.QADOCS_RAW, "a") as qadocs_file:
        qadocs_file.write(",words,tags\n")
        for input_segment in pd.read_csv(RQ.QA_TRAIN_DFPATH, chunksize=chunk_length, sep="_"):
            chunk_0 = map(lambda tupl: createDocForRow(tupl, sw_pattern, punct_pattern), input_segment.itertuples())
            chunk_1 = list(filter(lambda x: x is not None, chunk_0))
            print(getsizeof(chunk_1) // (2**10))  # debugging: size of the chunk in KiB; also works as a progress update
            pd.DataFrame(chunk_1).to_csv(path_or_buf=qadocs_file, mode="a", header=False)
            logging.info("Chunk of documents created...")

    end_creatingInput = time.time()
    logging.info("Time spent creating the Documents: %s", str(round(end_creatingInput - start_creatingInput, 3)))
def create_onlinelearning_traininstances():
    MyUtils.init_logging("create_onlinelearning_traininstances.log")

    file = open(F.ONLINE_INSTANCEIDS_GLOBAL_DB, "w")
    file.close()
    out_db = sqlite3.connect(F.ONLINE_INSTANCEIDS_GLOBAL_DB)
    out_c = out_db.cursor()
    out_c.execute('''CREATE TABLE positiveinstances( p varchar(63), qs_ls varchar(8191) )''')
    out_db.commit()

    # IE.register_matches()
    # filter_matches_allfeatures()
    pqs_filteredmatches_df = pd.read_csv(F.ONLINE_PQMATCHES_FILTERED, sep="_")
    prods_ids_ls = []
    for pid_qs_t in pqs_filteredmatches_df.itertuples():
        out_c.execute("INSERT INTO positiveinstances VALUES (?,?)", (pid_qs_t.id, pid_qs_t.questionsAsked))
        prods_ids_ls.append(pid_qs_t.id)
    out_db.commit()
    logging.info("Creating balanced training instances for Online learning: Positive instances determined...")

    del pqs_filteredmatches_df
    collect()

    IE.get_negative_indices(prods_ids_ls)
    collect()
    IE.assign_candidate_negative_examples(prods_ids_ls)
    collect()
    IE.define_negative_examples()

    unpack_question_lists()
    shuffle_balancedinstances_db()
def organize_qa_all():
    MyUtils.init_logging("ReadQuestions.log", loglevel=logging.INFO)
    clean_old_files(all=False)
    filenames = get_filenames()
    core_filenames = list(map(lambda s: utilities.MyUtils_strings.remove_string_end(s, ".json.gz"), filenames))
    # logging.info(str(core_filenames))
    for cf in core_filenames:
        organize_qa_subfile(cf)
    clean_empty_files()
def test_my_rake():
    the_stopwords_pattern = PD.getStopwordsPattern(includePunctuation=True)
    md_df = RM.load_md(RM.READKEYWORD_TRAINSUBSET)
    elem = MyUtils.pickRandomElement(md_df)
    while elem.description == "nan" or len(elem.description) == 0:
        # a null value may be nan (for prods) or '' (for quests)
        elem = MyUtils.pickRandomElement(md_df)
    apply_my_rake(elem.description, the_stopwords_pattern)
def explore_cosine_similarity(n_products=100, n_questions=100, p_featurename="descvec",
                              q_featurename="questionVec", fraction=0.75):
    MyUtils.init_logging("CosineSimilarity.log")
    logging.info("Computing a cosine similarity breakpoint at fraction %s, between P:%s and Q:%s ...",
                 fraction, p_featurename, q_featurename)
    prods_representations_db = sqlite3.connect(F.PRODUCTS_FINAL_TRAIN_DB)
    ps_c = prods_representations_db.cursor()
    quests_representations_db = sqlite3.connect(F.QUESTIONS_FINAL_TRAIN_DB)
    qs_c = quests_representations_db.cursor()

    ###### Get Doc2Vec vectors from the randomly selected products
    random_indices = get_random_indices(ps_c, n_products)
    random_indices_querystring = str(tuple(random_indices)) if len(random_indices) > 1 \
        else "(" + str(random_indices[0]) + ")"
    selected_pf_strings_ts = utilities.MyUtils_dbs.search_in_alltables_db(
        ps_c, "SELECT " + str(p_featurename) + " FROM", "WHERE rowid IN " + random_indices_querystring)
    # Unpacking the tuples (each tuple is simply a container for one feature).
    selected_pf_strings = list(map(lambda t: t[0], selected_pf_strings_ts))

    d2v_model = D2V.Doc2Vec.load(F.D2V_MODEL)  # it is loaded to obtain the vectors for the mdcategories
    ps_vectors = get_products_vectors(p_featurename, n_products, selected_pf_strings, d2v_model)
    del d2v_model
    collect()
    ######

    ###### Get Doc2Vec vectors from the randomly selected questions
    random_indices = get_random_indices(qs_c, n_questions)
    random_indices_querystring = str(tuple(random_indices)) if len(random_indices) > 1 \
        else "(" + str(random_indices[0]) + ")"
    selected_qf_strings_ts = utilities.MyUtils_dbs.search_in_alltables_db(
        qs_c, "SELECT " + str(q_featurename) + " FROM", "WHERE rowid IN " + random_indices_querystring)
    selected_qf_strings = list(map(lambda t: t[0], selected_qf_strings_ts))
    qs_vectors = get_questions_vectors(q_featurename, n_questions, selected_qf_strings)
    ######

    M = compute_matrix_cosinesims(ps_vectors, qs_vectors)
    breakpoint = compute_breakpoint(M, fraction)
    show_graphic_cosinesim(M, p_featurename, q_featurename, breakpoint, fraction)
    update_breakpoints_db(breakpoint, p_featurename, q_featurename, n_products, n_questions, fraction)
    return breakpoint
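# --- Illustrative sketch (assumption; the project's own compute_matrix_cosinesims /
# compute_breakpoint helpers are not shown here) ---
# It builds the pairwise cosine-similarity matrix between product and question vectors and
# picks the similarity value below which the given fraction of pairs falls.
import numpy as np

def cosine_sims_and_breakpoint_sketch(ps_vectors, qs_vectors, fraction=0.75):
    P = np.asarray(ps_vectors, dtype=float)        # shape: (n_products, dim)
    Q = np.asarray(qs_vectors, dtype=float)        # shape: (n_questions, dim)
    P_norm = P / np.linalg.norm(P, axis=1, keepdims=True)
    Q_norm = Q / np.linalg.norm(Q, axis=1, keepdims=True)
    M = P_norm @ Q_norm.T                          # pairwise cosine similarities
    breakpoint_value = np.quantile(M, fraction)    # `fraction` of the pairs lie below this similarity
    return M, breakpoint_value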
def shuffle_training_halves():
    MyUtils.init_logging("Shuffle_training_halves.log", logging.INFO)
    db_conn = sqlite3.connect(F.NN_TRAIN_INSTANCES_DB)
    c = db_conn.cursor()

    f = open(F.NN_TEMP_INSTANCES_DB, "w")
    f.close()  # clean the output db
    outdb_conn = sqlite3.connect(F.NN_TEMP_INSTANCES_DB)
    outc = outdb_conn.cursor()
    outc.execute('''CREATE TABLE instances( p_id varchar(63),
                                            q_id varchar(63),
                                            x varchar(8191),
                                            y tinyint )''')
    outdb_conn.commit()

    tot_num_of_rows = utilities.MyUtils_dbs.get_nn_dataset_length(utilities.MyUtils_flags.FLAG_TRAIN)
    half_mark = tot_num_of_rows // 2
    # Positive instances occupy rowids 1..half_mark; drawing all of them without replacement permutes them.
    ids_pos = np.random.choice(range(1, half_mark + 1), half_mark, replace=False)
    # Negative instances occupy rowids half_mark+1..tot_num_of_rows.
    ids_neg = np.random.choice(range(half_mark + 1, tot_num_of_rows + 1),
                               tot_num_of_rows - half_mark, replace=False)

    for id_pos in ids_pos:
        picked_row = c.execute("SELECT * FROM instances WHERE rowid = " + str(id_pos)).fetchone()
        p_id = picked_row[0]
        q_id = picked_row[1]
        x = picked_row[2]
        y = picked_row[3]
        outc.execute('''INSERT INTO instances VALUES (?,?,?,?);''', (p_id, q_id, str(x), y))
    outdb_conn.commit()
    logging.info("Training set: Positive instances have been shuffled. Proceeding to shuffle negative instances...")

    for id_neg in ids_neg:
        picked_row = c.execute("SELECT * FROM instances WHERE rowid = " + str(id_neg)).fetchone()
        p_id = picked_row[0]
        q_id = picked_row[1]
        x = picked_row[2]
        y = picked_row[3]
        outc.execute('''INSERT INTO instances VALUES (?,?,?,?);''', (p_id, q_id, str(x), y))
    outdb_conn.commit()
    logging.info("Training set: Negative instances have been shuffled.")

    os.rename(src=F.NN_TEMP_INSTANCES_DB, dst=F.NN_TRAIN_INSTANCES_DB)
def explore():
    MyUtils.init_logging("ExploreStopwordsMethods.log")
    # Read a small sample of the raw description documents and rebuild their TaggedDocuments,
    # as done elsewhere for the Doc2Vec training set.
    descDocs_df = pandas.read_csv(F.DESCDOCS_RAW).head(10)
    descDocs_ls = [D2V.TaggedDocument(words=ast.literal_eval(t.words), tags=ast.literal_eval(t.tags))
                   for t in descDocs_df.itertuples()]
    print(descDocs_ls[0:5])  # exploration & debug

    model_forVocabulary = D2V.Doc2Vec()
    model_forVocabulary.build_vocab(documents=descDocs_ls, update=False, progress_per=1000,
                                    keep_raw_vocab=True, trim_rule=None)

    # Convert the vocabulary dictionary into a list for ordering
    vocab_list = [(k, v) for k, v in model_forVocabulary.raw_vocab.items()]
    vocab_list.sort(key=lambda tuple: tuple[1], reverse=True)  # sorted in place, descendingly, depending on the value
    logging.info("Length of the whole vocabulary : " + str(len(vocab_list)))
    logging.info(str(vocab_list[0:400]))
    pandas.DataFrame(vocab_list[0:400]).to_csv("stopwords/wordFrequencies_ls.csv")

    singletons_vocab_list = list(filter(lambda tupl: getSingletons(tupl), vocab_list))
    logging.info("Number of singletons : " + str(len(singletons_vocab_list)))
    logging.info(str(singletons_vocab_list[0:1000]))

    urls_vocab_list = list(filter(lambda tupl: getURLs(tupl), vocab_list))
    logging.info("Number of URLs : " + str(len(urls_vocab_list)))
    logging.info(str(urls_vocab_list[0:1000]))

    wordDocFrequency_dict = dict.fromkeys(model_forVocabulary.raw_vocab.keys(), 0)
    for taggedDocument in descDocs_ls:
        already_encountered = []
        words_ls = taggedDocument.words
        for word in words_ls:
            if word not in already_encountered:
                wordDocFrequency_dict[word] = wordDocFrequency_dict[word] + 1
                already_encountered.append(word)

    # It would be log(N / df(w)). For ordering purposes, (N / df(w)) suffices, or even (1 / df(w)).
    # Therefore, to pick the words with lowest IDF we must pick those with a higher df(w)
    docFreq_list = [(k, v) for k, v in wordDocFrequency_dict.items()]
    docFreq_list.sort(key=lambda tuple: tuple[1], reverse=True)  # sorted in place, descendingly, depending on the value
    logging.info("The Doc-frequencies of the words have been determined.")
    logging.info(str(docFreq_list[0:400]))
    pandas.DataFrame(docFreq_list[0:400]).to_csv("stopwords/docFrequencies_ls.csv")
def shuffle_balancedinstances_db():
    MyUtils.init_logging("Shuffle_training_halves.log", logging.INFO)
    db_conn = sqlite3.connect(F.ONLINE_INSTANCEIDS_GLOBAL_DB)
    c = db_conn.cursor()

    f = open(F.ONLINE_TEMP_DB, "w")
    f.close()  # clean the output db
    outdb_conn = sqlite3.connect(F.ONLINE_TEMP_DB)
    outc = outdb_conn.cursor()
    outc.execute('''CREATE TABLE positiveinstances(p varchar(63), qs_ls varchar(8191) )''')
    outc.execute('''CREATE TABLE negativeinstances(p varchar(63), qs_ls varchar(8191) )''')
    outdb_conn.commit()

    num_pos_instances = c.execute("SELECT COUNT(*) FROM positiveinstances").fetchone()[0]
    num_neg_instances = c.execute("SELECT COUNT(*) FROM negativeinstances").fetchone()[0]
    ids_pos = np.random.choice(a=range(1, num_pos_instances + 1), size=num_pos_instances, replace=False)
    ids_neg = np.random.choice(a=range(1, num_neg_instances + 1), size=num_neg_instances, replace=False)

    for id_pos in ids_pos:
        picked_row = c.execute("SELECT * FROM positiveinstances WHERE rowid = " + str(id_pos)).fetchone()
        p = picked_row[0]
        qs = picked_row[1]
        outc.execute('''INSERT INTO positiveinstances VALUES (?,?);''', (str(p), qs))
    outdb_conn.commit()
    logging.info("Training set: Positive instances have been shuffled. Proceeding to shuffle negative instances...")

    for id_neg in ids_neg:
        picked_row = c.execute("SELECT * FROM negativeinstances WHERE rowid = " + str(id_neg)).fetchone()
        p = picked_row[0]
        qs = picked_row[1]
        outc.execute('''INSERT INTO negativeinstances VALUES (?,?);''', (str(p), qs))
    outdb_conn.commit()
    logging.info("Training set: Negative instances have been shuffled.")

    rename(src=F.ONLINE_TEMP_DB, dst=F.ONLINE_INSTANCEIDS_GLOBAL_DB)
def create_phrases_model():
    MyUtils.init_logging("Encode_Common.log")
    logging.info("Starting preparation of phrases...")
    docs_percent_touse = 1  # 0.5
    chunk_size = 10**5

    doc_filenames = [F.DESCDOCS_RAW, F.QADOCS_RAW]
    doc_files = [open(doc_filename, "r") for doc_filename in doc_filenames]
    all_docwords = []
    for doc_file in doc_files:
        for docs_chunk in pd.read_csv(doc_file, chunksize=chunk_size):
            len_c = len(docs_chunk)
            words_chunk = []
            indices = list(sorted(numpy.random.choice(len_c, int(docs_percent_touse * len_c), replace=False)))
            selected_rows = docs_chunk.iloc[indices]
            for tupl in selected_rows.itertuples():
                word_ls = ast.literal_eval(tupl.words)
                words_chunk.append(word_ls)
            all_docwords.extend(words_chunk)
            logging.info("Reading in the documents' words. Chunk processed...")
        logging.info("Completed: reading in a set of documents' words")  # @ time = " + str(round(time1 - start, 3))
        doc_file.close()
    logging.info("Number of documents to use in the Phrases model: %s", str(len(all_docwords)))
    del doc_filenames
    del doc_files
    collect()

    phrases_model = phrases.Phrases(sentences=all_docwords, min_count=20, threshold=300,
                                    delimiter=b'_', max_vocab_size=30 * 10**6)
    # phraser_model = phrases.Phraser(phrases_model)
    # time2 = time()
    logging.info("Phrases model created")  # @ time = " + str(round(time2 - start, 3))
    logging.info("Memory size in MBs = %s", str(mem.asizeof(phrases_model) // 2**20))
    phrases_model.save(F.PHRASES_MODEL)
    return phrases_model
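# --- Illustrative sketch (assumption, not part of the original module) ---
# Once saved, the Phrases model can be applied to a new list of tokens to merge frequent
# collocations into single '_'-joined tokens. F.PHRASES_MODEL is the path used above;
# the example tokens are made up.
from gensim.models import phrases

def apply_phrases_sketch(tokens):
    phrases_model = phrases.Phrases.load(F.PHRASES_MODEL)
    return phrases_model[tokens]  # e.g. ["memory", "card"] may become ["memory_card"]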
def prepare_dq_documents():
    MyUtils.init_logging("Encode_Common.log")
    start = time()
    phrases_model = phrases.Phrases.load(F.PHRASES_MODEL)
    logging.info("Started updating the TaggedDocuments according to the Phrases model...")

    doc_filenames = [(F.DESCDOCS_RAW, F.DESCDOCS), (F.QADOCS_RAW, F.QADOCS)]
    for tupl_fn in doc_filenames:
        input_filename = tupl_fn[0]
        output_filename = tupl_fn[1]
        f = open(output_filename, "w")
        f.close()  # clean output file between runs
        with open(output_filename, "a") as newdocs_file:
            newdocs_file.write(",words,tags\n")
            with open(input_filename, "r") as rawdocs_file:
                chunk_n_elems = 10**5
                for segment in pd.read_csv(rawdocs_file, chunksize=chunk_n_elems):
                    new_docs_chunk = []
                    for tupl in segment.itertuples():
                        try:
                            old_words = ast.literal_eval(tupl.words)  # evaluates the string back into a list
                            new_words = phrases_model[old_words]
                            new_docs_chunk.append(D2V.TaggedDocument(words=new_words, tags=tupl.tags))
                        except ValueError:
                            logging.warning("Info: literal evaluation did not apply to element: %s", str(tupl.tags))
                    pd.DataFrame(new_docs_chunk).to_csv(newdocs_file, mode="a", header=False)
                    logging.info("Documents updated with phrases: a chunk has been processed")
        logging.info("Completed: a set of documents has been updated with Phrases")

    time3 = time()
    logging.info("New documents created, in time = %s", str(round(time3 - start, 3)))
def order_test(finalfile_path):
    MyUtils.init_logging("temp.log")
    segment_size = 10**4
    segment_counter = 1
    for segment in pd.read_csv(finalfile_path, chunksize=segment_size, sep='_'):
        # for tpl in segment.itertuples():
        #     logging.info(tpl)
        asins = segment.id
        ordered = utilities.MyUtils.check_series_ids_sorted(asins, len(asins))
        logging.info("Is the chunk of elements n.%s ordered?: %s", segment_counter, ordered)
        # if not ordered:
        #     logging.info(asins)
        segment_counter = segment_counter + 1
    collect()
def get_keywords_keywords_distance(prod_tuple, q_tuple):
    try:
        p_keywords = np.array(MyUtils_strings.fromlls_toarrays(prod_tuple.kwsVectors))
        q_keywords = np.array(MyUtils_strings.fromlls_toarrays(q_tuple.kwsVectors))
        m = len(p_keywords)
        n = len(q_keywords)
        sim_matrix = np.ones(shape=(m, n)) * -1
        for i in range(m):
            kw_vec_1 = p_keywords[i]
            for j in range(n):
                kw_vec_2 = q_keywords[j]
                sim_matrix[i][j] = 1 - distance.cosine(u=kw_vec_1, v=kw_vec_2)
        # logging.debug("\nThe sim.matrix : %s", sim_matrix)
        max_similarities = MyUtils.pick_maxmatches_matrix(sim_matrix)
        min_distances = list(map(lambda sim: 1 - sim, max_similarities))
        avg_min_distance = np.average(min_distances)
        return avg_min_distance
    except NameError:
        return None
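# --- Illustrative sketch (assumption, not part of the original module) ---
# A self-contained restatement of the "average of the best-match distances" idea on plain
# numpy arrays, without the project's MyUtils helpers; it assumes one plausible reading of
# pick_maxmatches_matrix (best match per product keyword). The toy vectors are made up.
import numpy as np
from scipy.spatial import distance

def avg_best_match_distance_sketch(p_keywords, q_keywords):
    sims = np.array([[1 - distance.cosine(p, q) for q in q_keywords] for p in p_keywords])
    best_per_product_kw = sims.max(axis=1)          # best matching question keyword for each product keyword
    return float(np.mean(1 - best_per_product_kw))  # turn similarities back into distances and average

# Example with made-up 3-dimensional keyword vectors:
# avg_best_match_distance_sketch([[1, 0, 0], [0, 1, 0]], [[1, 0.1, 0], [0, 0, 1]])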
def explore_phrase2vec(min_freq, phrases_threshold):
    MyUtils.init_logging("Explore_Phrase2Vec.log")

    doc_filenames = [F.DESCDOCS_RAW, F.QADOCS_RAW]
    doc_files = [open(doc_filename, "r") for doc_filename in doc_filenames]
    all_docwords = []
    chunk_size = 10**5
    for doc_file in doc_files:
        for docs_chunk in pd.read_csv(doc_file, chunksize=chunk_size):
            len_c = len(docs_chunk)
            words_chunk = []
            # indices = list(sorted(numpy.random.choice(len_c, int(docs_percent_touse * len_c), replace=False)))
            # selected_rows = docs_chunk.iloc[indices]
            for tupl in docs_chunk.itertuples():
                # words = tupl.words.replace("'", '"')
                # word_ls = json.loads(words)  # ast.literal_eval(tupl.words)
                word_ls = eval(tupl.words, {'__builtins__': {}})
                words_chunk.append(word_ls)
            all_docwords.extend(words_chunk)
            logging.info("Added chunk from file %s to documents list...", doc_file)
        doc_file.close()
    logging.info("Number of documents: %s", len(all_docwords))

    phrases_model = phrases.Phrases(sentences=all_docwords, min_count=min_freq,
                                    threshold=phrases_threshold, delimiter=b'_')
    # logging.info("***The Phrases model's frequency vocabulary: %s", str(phrases_model.vocab))

    # Show how the model transforms a few of the input documents (while the model is still in memory)
    for i in range(min(4, len(all_docwords))):
        print(str(phrases_model[all_docwords[i]]))

    phrases_vocab = phrases_model.vocab
    del phrases_model
    collect()

    sorted_vocabulary = sorted(list(phrases_vocab.items()), key=lambda tpl: tpl[1], reverse=True)
    # An entry is a phrase (rather than a single word) if its key contains the '_' delimiter
    phrases_sorted_vocabulary = list(filter(lambda tpl: '_' in str(tpl[0]), sorted_vocabulary))
    individual_words_sorted_vocabulary = list(filter(lambda tpl: not ('_' in str(tpl[0])), sorted_vocabulary))
    logging.info("***The vocabulary of phrases, ordered by frequency : %s ", phrases_sorted_vocabulary)
    logging.info("***The vocabulary of words, ordered by frequency : %s ", individual_words_sorted_vocabulary)
    # phrases_model.save("Exploration_phrasesModel_mincount" + str(min_freq) + "_T" + str(phrases_threshold) + ".model")
def apply_NN_on_testset():
    with tf.Session() as session:
        MyUtils.init_logging("GetCandidatesNN.log")
        test_db = sqlite3.connect(F.NN_TEST_INSTANCES_DB)
        test_db_c = test_db.cursor()

        saver = tf.train.import_meta_graph(F.SAVED_NN + '.meta')
        saver.restore(session, tf.train.latest_checkpoint(os.path.dirname(F.SAVED_NN)))

        MyUtils_filesystem.clean_directory(F.NN_TEST_OUTPUT_DIR)
        test_filewriter = tf.summary.FileWriter(os.path.join(F.NN_TEST_OUTPUT_DIR), session.graph)

        # with tf.variable_scope("reuse_fortest_scope", reuse=tf.AUTO_REUSE):
        graph = tf.get_default_graph()
        input_placeholder = graph.get_tensor_by_name("input_pl:0")
        labels_placeholder = graph.get_tensor_by_name("labels_pl:0")
        placeholders = (input_placeholder, labels_placeholder)
        test_accuracy_summary = graph.get_tensor_by_name("Accuracy:0")
        predictions = graph.get_tensor_by_name("predictions:0")

        tf_metric, tf_metric_update = tf.metrics.accuracy(labels=labels_placeholder,
                                                          predictions=predictions,
                                                          name="Test_accuracy")
        tasks = [tf_metric_update, predictions]
        writing_tasks = [test_accuracy_summary]

        accuracy_variables = list(filter(lambda var: "accuracy" in var.name, tf.local_variables()))
        logging.debug("Accuracy variables: %s", accuracy_variables)
        session.run(tf.variables_initializer(accuracy_variables))

        (test_accuracy_value, foundcandidates_mapls) = \
            compute_output_dataset_accuracy(1, placeholders, test_db_c, tasks, writing_tasks,
                                            [test_filewriter], session)
        foundcandidates_ls = sorted(foundcandidates_mapls, key=lambda elem: elem[0])
        logging.info("Test accuracy value: %s", test_accuracy_value)
        logging.info("Candidates found: %s", foundcandidates_ls)

        candidates_outdb = sqlite3.connect(F.CANDIDATES_NN_DB)
        # outdb_c = candidates_outdb.cursor()
        candidates_df = pd.DataFrame(foundcandidates_ls, columns=["p_id", "q_id"], dtype=str)
        candidates_df.to_sql(name="candidates", con=candidates_outdb, if_exists="replace",
                             dtype={"p_id": "varchar(63)", "q_id": "varchar(63)"})
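# --- Illustrative sketch (assumption; this is not the project's compute_output_dataset_accuracy) ---
# Once the graph is restored as above, a single batch could be classified by feeding the named
# placeholders and fetching the "predictions:0" tensor. batch_x / batch_y are made-up inputs.
import tensorflow as tf

def run_single_batch_sketch(session, batch_x, batch_y):
    graph = tf.get_default_graph()
    feed = {graph.get_tensor_by_name("input_pl:0"): batch_x,
            graph.get_tensor_by_name("labels_pl:0"): batch_y}
    return session.run(graph.get_tensor_by_name("predictions:0"), feed_dict=feed)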
def create_docvectors_model():
    MyUtils.init_logging("VectorizeDescriptions.log")
    start = time()
    docs_percent_touse = 1  # on the full training set, 0.3 is probably advisable.
    chunk_size = 10**5

    doc_filenames = [F.DESCDOCS, F.QADOCS]
    doc_files = [open(doc_filename, "r") for doc_filename in doc_filenames]
    trainingset_ls = []
    for doc_file in doc_files:
        for descdocs_chunk in pd.read_csv(doc_file, chunksize=chunk_size):
            len_c = len(descdocs_chunk)
            indices = list(sorted(numpy.random.choice(len_c, int(docs_percent_touse * len_c), replace=False)))
            selected_rows = descdocs_chunk.iloc[indices]
            docs = []
            for tupl in selected_rows.itertuples():
                docs.append(D2V.TaggedDocument(words=ast.literal_eval(tupl.words), tags=ast.literal_eval(tupl.tags)))
            trainingset_ls.extend(docs)
            logging.info("Reading in the documents' words. Chunk processed...")
        logging.info("Completed: reading in a set of documents.")
        doc_file.close()
    del doc_files
    del doc_filenames
    collect()

    print(trainingset_ls[0:1])
    logging.info("Total number of documents in the corpus: %s", len(trainingset_ls))
    logging.info("Starting to build vocabulary and Doc2Vec model.")
    # Ignore singletons; create vectors of 200 dims; use PV-DM
    model = D2V.Doc2Vec(min_count=4, size=200, dm=1, workers=cpu_count(),
                        docvecs_mapfile=os.path.join("gensim_models", "doc2vec_memorymapped_vectors"))
    # Create the overall vocabulary, from the descriptions and the questions:
    model.build_vocab(documents=trainingset_ls, update=False, progress_per=10000, keep_raw_vocab=False)
    logging.info("D2V Vocabulary created")
    model.train(documents=trainingset_ls, total_examples=len(trainingset_ls), epochs=10,
                start_alpha=0.025, end_alpha=0.001, word_count=0, queue_factor=2, report_delay=1.0)

    model.save(F.D2V_MODEL)
    end = time()
    logging.info("Doc2Vec model saved. Time elapsed = %s", str(round(end - start, 3)))
    logging.info("Memory size in MBs = %s", str(mem.asizeof(model) // 2**20))
    return model
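# --- Illustrative sketch (assumption, not part of the original module) ---
# Once the model above is saved, it can be reloaded to infer a vector for unseen text, or to
# fetch the stored vector of a training document by tag. The token list and the "example_tag"
# tag are made up; the docvecs lookup assumes the gensim 3.x API used elsewhere in this code.
import gensim.models.doc2vec as D2V

def docvec_usage_sketch():
    model = D2V.Doc2Vec.load(F.D2V_MODEL)
    inferred = model.infer_vector(["new", "product", "description", "tokens"])  # vector for unseen text
    # stored = model.docvecs["example_tag"]  # vector of a document seen at training time
    return inferred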
def clean_0_ids(outfile_tofilter_path, ps_or_qs_stringflag):
    MyUtils.init_logging("temp.log")
    logging.info(outfile_tofilter_path)
    new_outfile_path = F.ELEMENTS_TEMP
    f = open(new_outfile_path, 'w')
    f.close()
    new_outfile = open(new_outfile_path, 'a')

    if utilities.MyUtils_flags.FLAG_QUESTS in ps_or_qs_stringflag or utilities.MyUtils_flags.FLAG_QUESTIONS in ps_or_qs_stringflag:
        # new_outfile.write("_id_questionType_questionVec_kwsVectors\n")
        # logging.info("Writing: _id_questionType_questionVec_kwsVectors")
        the_header = ["id", "questionType", "questionVec", "kwsVectors"]
    if utilities.MyUtils_flags.FLAG_PRODS in ps_or_qs_stringflag or utilities.MyUtils_flags.FLAG_PRODUCTS in ps_or_qs_stringflag:
        # new_outfile.write("_id_price_titlevec_descvec_mdcategories_kwsVectors\n")
        # logging.info("Writing: _id_price_titlevec_descvec_mdcategories_kwsVectors")
        the_header = ["id", "price", "titlevec", "descvec", "mdcategories", "kwsVectors"]

    segment_size = 5 * 10**4
    segment_counter = 1
    for segment in pd.read_csv(outfile_tofilter_path, chunksize=segment_size, sep='_'):
        segment_buffer = []
        for elem_tuple in segment.itertuples():
            if len(elem_tuple.id) >= 9:
                segment_buffer.append(elem_tuple)
            else:
                logging.info(elem_tuple.id)
        # temp_columns = ['_4','_5','_6','NODESCVEC','_8','NOKWSVECTORS']
        segment_df = pd.DataFrame(segment_buffer)[the_header]
        segment_df.columns = the_header
        segment_df.to_csv(new_outfile_path, mode="a", header=bool(segment_counter == 1), sep="_", index=False)
        logging.info("Filtered '0' ids from segment n. %s...", segment_counter)
        segment_counter = segment_counter + 1
        collect()

    os.rename(src=new_outfile_path, dst=outfile_tofilter_path)
def attach_category_products(category_dir_path=None):
    MyUtils.init_logging("OnlineLearning-attach_category_products.log", logging.INFO)
    segment_size = 10**4

    # The files PRODUCTS_FINAL_TRAIN and PRODUCTS_FINAL_VALID are already sorted
    products_train_fpath = F.PRODUCTS_FINAL_TRAIN
    products_valid_fpath = F.PRODUCTS_FINAL_VALID

    for filename in os.listdir(category_dir_path):
        if filename.endswith(".csv") and utilities.MyUtils_flags.FLAG_INITIAL in filename:
            category_products_filepath = os.path.join(
                category_dir_path,
                utilities.MyUtils_flags.FLAG_PRODUCTS +
                utilities.MyUtils_strings.remove_string_start(filename, utilities.MyUtils_flags.FLAG_INITIAL))
            logging.info("File in which to store the products belonging to the category: %s",
                         category_products_filepath)
            category_qs_fpath = os.path.join(category_dir_path, filename)
            # logging.info("%s", category_qs_fpath)
            # register_products(products_train_fpath, category_qs_fpath, category_products_filepath, append=False)
            register_products_db(F.PRODUCTS_FINAL_TRAIN_DB, category_qs_fpath, category_products_filepath, append=False)
            register_products_db(F.PRODUCTS_FINAL_VALID_DB, category_qs_fpath, category_products_filepath, append=True)
def readfirst_qa():
    MyUtils.init_logging("ReadQuestions.log")
    clean_old_files(all=True)
    filenames = get_filenames()
    core_filenames = list(map(lambda s: utilities.MyUtils_strings.remove_string_end(s, ".json.gz"), filenames))
    nameslist = list(zip(filenames, core_filenames))

    for (fname, core_fname) in nameslist:
        with gzip.open(F.QA_DIR_PATH + "/" + fname, 'rb') as qa_file:  # use gzip.open and 'rb' since the file is compressed
            chunk = qa_file.readlines()  # returns a list of strings, one for each line
            qa_df = pd.DataFrame(create_dict_from_data(chunk))
            qa_df = qa_df.set_index(keys="asin")  # This sets the 'asin' as the index (n: but also drops the column)
            qa_df.to_csv(F.QA_DIR_PATH + "/" + core_fname + ".csv", sep="_")
            logging.info("Did read questions subset: %s", str(fname))

    clean_empty_files()
    clean_empty_files(F.QA_DIR_PATH)
def create_representations_db(represented_elements_filepath, outdb_filepath, id_column_name="id"):
    MyUtils.init_logging("CreatePs&Qs_RepresentationDatabases.log")
    elements_file = open(represented_elements_filepath, "r")
    f = open(outdb_filepath, "w")
    f.close()  # clean the output db between runs

    segment_size = 2 * 10**4
    segment_id = 1
    db_conn = sqlite3.connect(outdb_filepath)
    c = db_conn.cursor()
    for in_segment in pd.read_csv(elements_file, sep="_", chunksize=segment_size, dtype="str", quotechar='"'):
        # for tpl in in_segment.itertuples():
        #     logging.info(tpl)
        #     raise Exception
        start = time()
        tablename = "elements" + str(segment_id)
        in_segment.to_sql(tablename, db_conn, chunksize=10**4, if_exists='append',
                          dtype={id_column_name: "varchar(63)"})
        collect()
        c.execute("CREATE INDEX indexid_" + str(segment_id) + " ON " + tablename + " (" + id_column_name + ");")
        logging.info("The segment n.%s, with %s represented elements, has been copied from final_file to database, "
                     "and indexed; time elapsed: %s seconds",
                     segment_id, segment_size, round(time() - start, 3))
        segment_id = segment_id + 1
    db_conn.commit()
    db_conn.close()
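# --- Illustrative sketch (assumption) ---
# The database built above is split into per-segment tables elements1, elements2, ..., so a
# lookup has to try each table in turn. This is only a plausible rendition of what the
# project's MyUtils_dbs.search_in_alltables_db helper might do, not its actual implementation.
import sqlite3

def find_element_by_id_sketch(db_filepath, element_id, id_column_name="id"):
    conn = sqlite3.connect(db_filepath)
    c = conn.cursor()
    tablenames = [row[0] for row in
                  c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'elements%'")]
    for tablename in tablenames:
        row = c.execute("SELECT * FROM " + tablename + " WHERE " + id_column_name + " = ?",
                        (element_id,)).fetchone()
        if row is not None:
            conn.close()
            return row
    conn.close()
    return None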
def unpack_question_lists():
    MyUtils.init_logging("unpack_question_lists.log", logging.INFO)
    db_conn = sqlite3.connect(F.ONLINE_INSTANCEIDS_GLOBAL_DB)
    c = db_conn.cursor()

    f = open(F.ONLINE_TEMP_DB, "w")
    f.close()  # clean the output db
    outdb_conn = sqlite3.connect(F.ONLINE_TEMP_DB)
    outc = outdb_conn.cursor()
    outc.execute('''CREATE TABLE positiveinstances(p varchar(63), q varchar(63) )''')
    outc.execute('''CREATE TABLE negativeinstances(p varchar(63), q varchar(63) )''')
    outdb_conn.commit()

    p_qs_lts = c.execute("SELECT * FROM positiveinstances").fetchall()
    logging.info("(Positive examples): Unpacking the lists of questions from %s products", len(p_qs_lts))
    for p_qs_t in p_qs_lts:
        p = p_qs_t[0]
        qs_str = p_qs_t[1]
        qs_ls = json.loads(qs_str.replace("'", '"'))
        for q in qs_ls:
            outc.execute('''INSERT INTO positiveinstances VALUES (?,?);''', (str(p), str(q)))
    outdb_conn.commit()

    p_qs_lts = c.execute("SELECT * FROM negativeinstances").fetchall()
    logging.info("(Negative examples): Unpacking the lists of questions from %s products", len(p_qs_lts))
    for p_qs_t in p_qs_lts:
        p = p_qs_t[0]
        qs_str = p_qs_t[1]
        qs_ls = json.loads(qs_str.replace("'", '"'))
        for q in qs_ls:
            outc.execute('''INSERT INTO negativeinstances VALUES (?,?);''', (str(p), str(q)))
    outdb_conn.commit()

    rename(src=F.ONLINE_TEMP_DB, dst=F.ONLINE_INSTANCEIDS_GLOBAL_DB)
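# --- Illustrative sketch (assumption) ---
# The qs_ls column stores a Python-style list of question ids as a string, e.g. "['q1', 'q2']";
# the replace("'", '"') trick above turns it into valid JSON so json.loads can parse it.
# ast.literal_eval would work on the original string as well. The example value is made up.
import json, ast

qs_str_example = "['q1', 'q2', 'q3']"
assert json.loads(qs_str_example.replace("'", '"')) == ast.literal_eval(qs_str_example) == ['q1', 'q2', 'q3']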