Example #1
def write_training_samples(prod_features_flags, quest_feature_flags,
                           dataset_type):
    MyUtils.init_logging("WriteTrainingSamples.log")
    if "train" in dataset_type:
        instances_db = F.NN_TRAIN_INSTANCES_DB
    elif "valid" in dataset_type:
        instances_db = F.NN_VALID_INSTANCES_DB
    else:  #"test"
        instances_db = F.NN_TEST_INSTANCES_DB

    f = open(instances_db, "w")
    f.close()  #clean outdb between runs
    db_conn = sqlite3.connect(instances_db)
    c = db_conn.cursor()
    c.execute('''CREATE TABLE instances(    p_id varchar(63),
                                            q_id varchar(63),
                                            x varchar(8191),
                                            y tinyint                      
                                            )''')
    db_conn.commit()

    #n: since there are no deletes, I can use the rowid as the 'index', avoiding the need for an autoincrement field

    #first all positive instances, then all negative instances.
    #The training batches are later extracted in such a way that they are random and balanced, anyway
    write_part_training_samples(True, prod_features_flags, quest_feature_flags,
                                db_conn, dataset_type)
    write_part_training_samples(False, prod_features_flags,
                                quest_feature_flags, db_conn, dataset_type)
    db_conn.close()
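
A minimal sketch (not the project's actual extraction code, which is not shown here) of how random, balanced batches could later be drawn from the `instances` table created above, assuming, as the comments describe, that positive rows occupy rowids 1..N/2 and negative rows the second half:

import sqlite3
import numpy as np

def sample_balanced_batch_sketch(instances_db_path, batch_size, total_rows):
    # Pick batch_size/2 positive and batch_size/2 negative rowids at random.
    half = total_rows // 2
    pos_ids = np.random.choice(range(1, half + 1), batch_size // 2, replace=False)
    neg_ids = np.random.choice(range(half + 1, total_rows + 1), batch_size // 2, replace=False)
    chosen = [int(i) for i in pos_ids] + [int(i) for i in neg_ids]
    conn = sqlite3.connect(instances_db_path)
    placeholders = ",".join("?" * len(chosen))
    rows = conn.execute("SELECT p_id, q_id, x, y FROM instances WHERE rowid IN (" + placeholders + ")",
                        chosen).fetchall()
    conn.close()
    return rows
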
def filter_matches_allfeatures():
    MyUtils.init_logging("OnlineLearning_DefineInstances.log")
    ps_db = sqlite3.connect(F.PRODUCTS_FINAL_TRAIN_DB)
    qs_db = sqlite3.connect(F.QUESTIONS_FINAL_TRAIN_DB)
    ps_db_c = ps_db.cursor()
    qs_db_c = qs_db.cursor()

    pqs_allmatches_file = open(F.ONLINE_PQMATCHES, "r")
    pqs_allmatches_df = pd.read_csv(pqs_allmatches_file, sep="_")
    filtered_matches = []

    for pqs_t in pqs_allmatches_df.itertuples():
        condition_p = product_has_allfeatures(ps_db_c, pqs_t.id)
        logging.info(pqs_t.id)
        condition_q = allquestions_have_allfeatures(qs_db_c,
                                                    pqs_t.questionsAsked)
        if condition_p and condition_q:
            filtered_matches.append(pqs_t)
    pqs_allmatches_df = pd.DataFrame(filtered_matches)
    pqs_filteredmatches_file = open(F.ONLINE_PQMATCHES_FILTERED, "w")
    pqs_allmatches_df.to_csv(pqs_filteredmatches_file, sep="_")

    logging.info(
        "Number of products with matching questions that have valid values for all the features: %s",
        len(filtered_matches))
    pqs_allmatches_file.close()
    pqs_filteredmatches_file.close()
    del pqs_allmatches_df
def organize_category_datasets():
    MyUtils.init_logging("OnlineLearning-organize_category_datasets.log")

    qs_csv_fpaths = get_csvs_filepaths()
    category_dir_paths = organize_questions(qs_csv_fpaths)
    for category_dir_p in category_dir_paths:
        attach_category_products(category_dir_p)
Example #4
def create_categories_dbs():
    categ_dirpaths = get_category_dirpaths()
    MyUtils.init_logging("OnlineLearning_create_categories_dbs.log")

    for categ_dir_p in categ_dirpaths:
        base_name = (os.path.basename(categ_dir_p))
        RD.clean_representations_dbs(categ_dir_p)

        for filename in os.listdir(categ_dir_p):
            if not (utilities.MyUtils_flags.FLAG_PRODUCTS
                    in filename) and not (utilities.MyUtils_flags.FLAG_INITIAL
                                          in filename):
                quests_csv_fname = filename
                quests_csv_path = os.path.join(categ_dir_p, quests_csv_fname)
                logging.info("Questions csv file: %s", quests_csv_path)
            elif (utilities.MyUtils_flags.FLAG_PRODUCTS in filename):
                prods_csv_fname = filename
                prods_csv_path = os.path.join(categ_dir_p, prods_csv_fname)
                logging.info("Products csv file: %s", prods_csv_path)

        quests_db_path = os.path.join(
            categ_dir_p,
            utilities.MyUtils_strings.remove_string_end(
                quests_csv_fname, '.csv') + '.db')
        RD.create_representations_db(quests_csv_path, quests_db_path)
        logging.info(
            "Category: %s. Created database for the questions. Proceeding to create the db for the products...",
            base_name)
        prods_db_path = os.path.join(
            categ_dir_p,
            utilities.MyUtils_strings.remove_string_end(
                prods_csv_fname, '.csv') + '.db')
        RD.create_representations_db(prods_csv_path, prods_db_path)
        logging.info("Category: %s .Created database for the products",
                     base_name)
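
The helper utilities.MyUtils_strings.remove_string_end used above is not shown in these snippets; a hypothetical stand-in matching its usage here (strip a known suffix such as '.csv') could be:

def remove_string_end(s, end):
    # Strip the given suffix if present; otherwise return the string unchanged.
    return s[:-len(end)] if s.endswith(end) else s

print(remove_string_end("questions_Electronics.csv", ".csv"))  # questions_Electronics
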
Example #5
def create_questions_representations():
    MyUtils.init_logging("OnlineLearning_create_questions_representations.log")
    categ_dirpaths = get_category_dirpaths()
    (d2v_model, phrases_model) = RC.load_the_models()
    for categ_dir in categ_dirpaths:
        create_category_qs_representations(categ_dir, d2v_model, phrases_model)
        collect()
Example #6
def sort_candidates(candidates_db_path, ranked_candidates_outdb_path, prod_reps_dbpath, quest_reps_dbpath):
    MyUtils.init_logging("Rank_candidates_nn.log")
    ### Connecting to the databases: candidates, test products, test questions
    candidates_nn_db = sqlite3.connect(candidates_db_path)
    cands_db_c = candidates_nn_db.cursor()

    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()

    f = open(ranked_candidates_outdb_path, "w"); f.close()
    outdb = sqlite3.connect(ranked_candidates_outdb_path)
    outdb_c = outdb.cursor()
    outdb_c.execute('''CREATE TABLE candidates(    p_id varchar(63),
                                                   q_id varchar(63),
                                                   distance int        
                                            )''')
    ###

    test_products_ids = cands_db_c.execute("SELECT DISTINCT p_id FROM candidates").fetchall()
    logging.info(test_products_ids[0])
    #logging.debug(test_products_ids)
    for tpl_pid in test_products_ids:
        pid = tpl_pid[0]
        product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                                    "WHERE id = '" + str(pid) + "'")[0]
        product_tuple = MyUtils.prodls_tonamedtuple(product_representation, offset=1)
        quests_ids = [results_tpl[0] for results_tpl in
                      cands_db_c.execute("SELECT q_id FROM candidates WHERE p_id = ?", tpl_pid).fetchall()]
        logging.debug(quests_ids)
        product_qs_sorted = sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c)
        outdb.executemany("INSERT INTO candidates VALUES (?,?,?)", product_qs_sorted)
    outdb.commit()
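
A short usage sketch: reading the ranked candidates for one product back out of the output database written above (schema: candidates(p_id, q_id, distance)); the path and product id below are placeholders.

import sqlite3

ranked_db_path = "ranked_candidates.db"  # placeholder; use the ranked_candidates_outdb_path given to sort_candidates
conn = sqlite3.connect(ranked_db_path)
rows = conn.execute(
    "SELECT q_id, distance FROM candidates WHERE p_id = ? ORDER BY distance ASC",
    ("B000EXAMPLE",)).fetchall()  # "B000EXAMPLE" is a placeholder product id
conn.close()
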
def compute_encoding_questions(questions_filepath, out_db_filepath):
    MyUtils.init_logging("ComputeEncodingQuestions.log")
    questions_file = open(questions_filepath, "r")

    f = open(out_db_filepath, mode="w");
    f.close()  # clean between runs
    db_conn = sqlite3.connect(out_db_filepath)
    c = db_conn.cursor()
    c.execute('''CREATE TABLE qs_numenc(q_id varchar(63) NOT NULL,
                                        has_questionType tinyint,    
                                        has_questionVec tinyint,
                                        has_kwsVectors tinyint,
                                        encoding_questionType varchar(15),
                                        encoding_questionVec varchar(8191),
                                        encoding_kwsVectors varchar(8191),
                                        PRIMARY KEY (q_id) )''')
    db_conn.commit()

    segment_size = 5 * 10**3
    segment_id = 1
    for input_segment in pd.read_csv(questions_file, sep="_", chunksize=segment_size):
        segment_start = time()
        for quest_t in input_segment.itertuples():
            if len(quest_t.id) >= 5:  # filter out undue headers
                #logging.info(quest_t)
                (q_flags, encodings) = get_num_encoding_quest(quest_t)
                c.execute('''INSERT INTO qs_numenc VALUES (?,?,?,?,?,?,?);''',
                          (quest_t.id, int(q_flags[0]), int(q_flags[1]), int(q_flags[2]),
                                 str(encodings[0]), str(encodings[1]), str(encodings[2])))
        segment_end = time()
        db_conn.commit()
        logging.info("Encoded questions' chunk n. %s ... Time elapsed = %s", segment_id, round(segment_end - segment_start,3))
        segment_id = segment_id+1
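
A small sketch of querying the qs_numenc table created above, e.g. to check which features a question actually has; the database path and question id are placeholders.

import sqlite3

conn = sqlite3.connect("questions_numenc.db")  # placeholder; use the out_db_filepath passed above
row = conn.execute(
    "SELECT has_questionType, has_questionVec, has_kwsVectors FROM qs_numenc WHERE q_id = ?",
    ("SOME_QUESTION_ID",)).fetchone()  # placeholder question id
conn.close()
print(row)  # e.g. (1, 1, 0) if the question has no keyword vectors
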
def process_all_mdinfo(prods_in_df, outfilepath, phrases_model, d2v_model):
    MyUtils.init_logging("ExtractMetadataInfo.log")

    f = open(outfilepath, "w")
    f.close()  #clean between runs
    sw_pattern = PPD.getStopwordsPattern(includePunctuation=True)
    logging.info("Started postprocessing other metadata info")

    segment_nrows = 5 * 10**4
    logging.info("Number of elements in a segment: %s", str(segment_nrows))
    with open(outfilepath, "a") as out_file:
        out_file.write("_id_description_price_titlevec_mdcategories\n")
        for input_segment in pd.read_csv(prods_in_df,
                                         chunksize=segment_nrows,
                                         sep="_"):
            chunk_start = time()
            mdinfo_lts = []
            for prod_tupl in input_segment.itertuples():
                prodinfo_tuple = process_prodinfo(prod_tupl, phrases_model,
                                                  d2v_model, sw_pattern)
                mdinfo_lts.append(prodinfo_tuple)
            pd.DataFrame(mdinfo_lts).to_csv(out_file,
                                            mode="a",
                                            header=False,
                                            sep="_")
            chunk_end = time()
            logging.info(
                "Processing: other metadata info. Segment completed in time : %s seconds",
                str(round(chunk_end - chunk_start, 3)))
    logging.info("Completed: processing product metadata.")
def test():
    MyUtils.init_logging("VectorizeDescriptions.log")
    docs_percent_touse = 1  # on the full training set, 0.3 is probably advisable.
    chunk_size = 10 ** 5

    doc_filenames = [F.DESCDOCS] #, F.QADOCS_FILEPATH
    trainingset_ls = []
    for doc_filename in doc_filenames:
        for descdocs_chunk in pd.read_csv(doc_filename, chunksize=chunk_size):
            len_c = len(descdocs_chunk)
            indices = list(sorted(numpy.random.choice(len_c, int(docs_percent_touse * len_c), replace=False)))
            selected_rows = descdocs_chunk.iloc[indices]
            docs = []
            for tupl in selected_rows.itertuples():
                docs.append(D2V.TaggedDocument(words=ast.literal_eval(tupl.words), tags=ast.literal_eval(tupl.tags)))
            trainingset_ls.extend(docs)
            logging.info("Reading in the documents' words. Chunk processed...")
        logging.info("Completed: reading in a set of documents.")

    d2v_model = load_model()

    subset = trainingset_ls[0:5]
    logging.debug("%s", str(subset))
    for doc in  subset:
        tag = doc.tags
        logging.debug("*** : %s" , str(tag))
        logging.debug("XXX : %s" , str(tag[0]))
        logging.debug("%s",str(d2v_model.docvecs[tag[0]]))
Example #10
def vectorize_keywords(in_kwsdf_filepath, phrases_model, d2v_model,
                       out_kwvecs_filepath):
    MyUtils.init_logging(logfilename="MyRAKE_vectorizekws.log")
    logging.info("Started to vectorize the keywords")
    f = open(out_kwvecs_filepath, "w")
    f.close()  # clean between runs
    segment_nrows = 10**4  #10**4
    logging.info("Number of elements in a segment: %s", str(segment_nrows))

    current_segment = 1
    max_len = 10  #n of keywords to extract
    with open(out_kwvecs_filepath, "a") as outfile:
        outfile.write(",id,kwsVectors\n")
        for input_segment in pd.read_csv(in_kwsdf_filepath,
                                         chunksize=segment_nrows):
            executor = pathos.pools.ThreadPool(multiprocessing.cpu_count())
            t00 = time()
            args = ((elem_kws, phrases_model, d2v_model, max_len)
                    for elem_kws in input_segment.itertuples())
            kws_vecs = list(executor.map(vectorize_kw_ls, args))
            pd.DataFrame(kws_vecs).to_csv(outfile, mode='a', header=False)
            logging.info(
                "Keyword vectorization ; segment n.%s of the input dataframe has been processed...",
                current_segment)
            current_segment = current_segment + 1
            t11 = time()
            logging.info("Time elapsed for a segment : %s",
                         str(round(t11 - t00, 3)))
            executor.terminate()
            executor.restart()
    logging.info("Keyword vectorization : finished.")
def createQuestionDocuments():
    MyUtils.init_logging("PreprocessQuestions.log")
    ds = open(F.QADOCS_RAW, 'w')  #cleaning the file between runs
    ds.close()
    start_creatingInput = time.time()

    # Method: itertuples + pickle. Objective: preprocess text and create TaggedDocuments
    sw_pattern = PPD.getStopwordsPattern(includePunctuation=False)
    punct_pattern = re.compile(
        r'([!"#$%&()*+,./:;<=>?@\[\\\]^_`{|}-~\'])|([--])')
    chunk_length = 5 * (10**4)  # chunksize for pd.read_csv must be an integer
    with open(F.QADOCS_RAW, "a") as qadocs_file:
        qadocs_file.write(",words,tags\n")
        for input_segment in pd.read_csv(RQ.QA_TRAIN_DFPATH,
                                         chunksize=chunk_length,
                                         sep="_"):
            chunk_0 = map(
                lambda tupl: createDocForRow(tupl, sw_pattern, punct_pattern),
                input_segment.itertuples())
            chunk_1 = list(filter(lambda x: x is not None, chunk_0))
            print(getsizeof(chunk_1) // (2**10))  # debugging: size of the chunk in kilobytes; also serves as a progress update
            pd.DataFrame(chunk_1).to_csv(path_or_buf=qadocs_file,
                                         mode="a",
                                         header=False)
            logging.info("Chunk of documents created...")

    end_creatingInput = time.time()
    logging.info("Time spent creating the Documents: %s",
                 str(round(end_creatingInput - start_creatingInput, 3)))
def create_onlinelearning_traininstances():
    MyUtils.init_logging("create_onlinelearning_traininstances.log")

    file = open(F.ONLINE_INSTANCEIDS_GLOBAL_DB, "w")
    file.close()
    out_db = sqlite3.connect(F.ONLINE_INSTANCEIDS_GLOBAL_DB)
    out_c = out_db.cursor()
    out_c.execute('''CREATE TABLE positiveinstances(    p varchar(63),
                                                        qs_ls varchar(8191)                      
                                                    )''')
    out_db.commit()

    #IE.register_matches()
    #filter_matches_allfeatures()

    pqs_filteredmatches_df = pd.read_csv(F.ONLINE_PQMATCHES_FILTERED, sep="_")
    prods_ids_ls = []
    for pid_qs_t in pqs_filteredmatches_df.itertuples():
        out_c.execute("INSERT INTO positiveinstances VALUES (?,?)",
                      (pid_qs_t.id, pid_qs_t.questionsAsked))
        prods_ids_ls.append(pid_qs_t.id)
    out_db.commit()
    logging.info(
        "Creating balanced training instances for Online learning: Positive instances determined..."
    )
    del pqs_filteredmatches_df
    collect()
    IE.get_negative_indices(prods_ids_ls)
    collect()
    IE.assign_candidate_negative_examples(prods_ids_ls)
    collect()
    IE.define_negative_examples()
    unpack_question_lists()
    shuffle_balancedinstances_db()
Example #13
def organize_qa_all():
    MyUtils.init_logging("ReadQuestions.log", loglevel=logging.INFO)
    clean_old_files(all=False)
    filenames = get_filenames()
    core_filenames = list(map(lambda s: utilities.MyUtils_strings.remove_string_end(s, ".json.gz"), filenames))
    #logging.info(str(core_filenames))
    for cf in core_filenames:
        organize_qa_subfile(cf)
    clean_empty_files()
Example #14
def test_my_rake():
    the_stopwords_pattern = PD.getStopwordsPattern(includePunctuation=True)
    md_df = RM.load_md(RM.READKEYWORD_TRAINSUBSET)
    elem = MyUtils.pickRandomElement(md_df)
    while elem.description == "nan" or len(
            elem.description
    ) == 0:  #a null value may be nan (for prods) or '' (for quests)
        elem = MyUtils.pickRandomElement(md_df)
    apply_my_rake(elem.description, the_stopwords_pattern)
Example #15
def explore_cosine_similarity(n_products=100,
                              n_questions=100,
                              p_featurename="descvec",
                              q_featurename="questionVec",
                              fraction=0.75):
    MyUtils.init_logging("CosineSimilarity.log")
    logging.info(
        "Computing a cosine similarity breakpoint at fraction %s, between P:%s and Q:%s ...",
        fraction, p_featurename, q_featurename)

    prods_representations_db = sqlite3.connect(F.PRODUCTS_FINAL_TRAIN_DB)
    ps_c = prods_representations_db.cursor()

    quests_representations_db = sqlite3.connect(F.QUESTIONS_FINAL_TRAIN_DB)
    qs_c = quests_representations_db.cursor()

    ###### Get Doc2Vec vectors from the randomly selected products
    random_indices = get_random_indices(ps_c, n_products)
    random_indices_querystring = str(tuple(random_indices)) if len(random_indices) > 1 \
        else "(" + str(random_indices[0]) + ")"
    selected_pf_strings_ts = utilities.MyUtils_dbs.search_in_alltables_db(
        ps_c, "SELECT " + str(p_featurename) + " FROM",
        "WHERE rowid IN " + random_indices_querystring)
    # Unpacking the tuples (each tuple is simply a container for one feature).
    selected_pf_strings = list(map(lambda t: t[0], selected_pf_strings_ts))

    d2v_model = D2V.Doc2Vec.load(
        F.D2V_MODEL)  #it is loaded to obtain the vectors for the mdcategories
    ps_vectors = get_products_vectors(p_featurename, n_products,
                                      selected_pf_strings, d2v_model)
    del d2v_model
    collect()
    ######
    ###### Get Doc2Vec vectors from the randomly selected questions
    random_indices = get_random_indices(qs_c, n_questions)
    random_indices_querystring = str(tuple(random_indices)) if len(random_indices) > 1 \
        else "(" + str(random_indices[0]) + ")"
    selected_qf_strings_ts = utilities.MyUtils_dbs.search_in_alltables_db(
        qs_c, "SELECT " + str(q_featurename) + " FROM",
        "WHERE rowid IN " + random_indices_querystring)

    selected_qf_strings = list(map(lambda t: t[0], selected_qf_strings_ts))
    qs_vectors = get_questions_vectors(q_featurename, n_questions,
                                       selected_qf_strings)
    ######

    M = compute_matrix_cosinesims(ps_vectors, qs_vectors)

    breakpoint = compute_breakpoint(M, fraction)

    show_graphic_cosinesim(M, p_featurename, q_featurename, breakpoint,
                           fraction)

    update_breakpoints_db(breakpoint, p_featurename, q_featurename, n_products,
                          n_questions, fraction)
    return breakpoint
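
compute_breakpoint is not shown in these snippets; one plausible reading of "a cosine similarity breakpoint at fraction f" is the f-quantile of all product-question similarities, sketched below (an assumption, not the project's actual definition).

import numpy as np

def compute_breakpoint_sketch(M, fraction):
    # Similarity value below which `fraction` of all product-question pairs fall.
    return float(np.quantile(M.flatten(), fraction))

M = np.array([[0.1, 0.4], [0.7, 0.9]])
print(compute_breakpoint_sketch(M, 0.75))  # the 0.75 quantile of {0.1, 0.4, 0.7, 0.9}
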
Example #16
def shuffle_training_halves():
    MyUtils.init_logging("Shuffle_training_halves.log", logging.INFO)
    db_conn = sqlite3.connect(F.NN_TRAIN_INSTANCES_DB)
    c = db_conn.cursor()

    f = open(F.NN_TEMP_INSTANCES_DB, "w")
    f.close()  #clean outdb
    outdb_conn = sqlite3.connect(F.NN_TEMP_INSTANCES_DB)
    outc = outdb_conn.cursor()
    outc.execute('''CREATE TABLE instances(   p_id varchar(63),
                                              q_id varchar(63),
                                              x varchar(8191),
                                              y tinyint
                                              )''')
    outdb_conn.commit()

    tot_num_of_rows = utilities.MyUtils_dbs.get_nn_dataset_length(
        utilities.MyUtils_flags.FLAG_TRAIN)
    half_mark = tot_num_of_rows // 2
    ids_pos = np.random.choice(range(1, half_mark + 1),
                               half_mark,
                               replace=False)
    # the previous line permutes the rowids of the positive instances (1 .. half_mark)
    ids_neg = np.random.choice(range(half_mark + 1, tot_num_of_rows + 1),
                               half_mark,
                               replace=False)
    # the negative instances occupy rowids half_mark+1 .. tot_num_of_rows

    for id_pos in ids_pos:
        picked_row = c.execute("SELECT * FROM instances WHERE rowid = " +
                               str(id_pos)).fetchone()
        p_id = picked_row[0]
        q_id = picked_row[1]
        x = picked_row[2]
        y = picked_row[3]
        outc.execute('''INSERT INTO instances VALUES (?,?,?,?);''',
                     (p_id, q_id, str(x), y))
    outdb_conn.commit()
    logging.info(
        "Training set: Positive instances have been shuffled. Proceeding to shuffle negative instances..."
    )
    for id_neg in ids_neg:
        picked_row = c.execute("SELECT * FROM instances WHERE rowid = " +
                               str(id_neg)).fetchone()
        p_id = picked_row[0]
        q_id = picked_row[1]
        x = picked_row[2]
        y = picked_row[3]
        outc.execute('''INSERT INTO instances VALUES (?,?,?,?);''',
                     (p_id, q_id, str(x), y))
    outdb_conn.commit()
    logging.info("Training set: Negative instances have been shuffled.")

    os.rename(src=F.NN_TEMP_INSTANCES_DB, dst=F.NN_TRAIN_INSTANCES_DB)
def explore():
    MyUtils.init_logging("ExploreStopwordsMethods.log")
    # Read the first 10 raw documents and rebuild them as TaggedDocuments
    # (list(pandas.read_csv(...)) would only return the column names).
    # Requires `import ast` at module level, as in the other snippets.
    descDocs_df = pandas.read_csv(F.DESCDOCS_RAW, nrows=10)
    descDocs_ls = [
        D2V.TaggedDocument(words=ast.literal_eval(t.words),
                           tags=ast.literal_eval(t.tags))
        for t in descDocs_df.itertuples()
    ]

    print(descDocs_ls[0:5])  #exploration & debug
    model_forVocabulary = D2V.Doc2Vec()
    model_forVocabulary.build_vocab(documents=descDocs_ls,
                                    update=False,
                                    progress_per=1000,
                                    keep_raw_vocab=True,
                                    trim_rule=None)

    #convert the vocabulary dictionary into a list for ordering
    vocab_list = [(k, v) for k, v in model_forVocabulary.raw_vocab.items()]
    vocab_list.sort(
        key=lambda tuple: tuple[1],
        reverse=True)  #sorted in place, descendingly, depending on the value
    logging.info("Length of the whole vocabulary : " + str(len(vocab_list)))
    logging.info(str(vocab_list[0:400]))
    pandas.DataFrame(
        vocab_list[0:400]).to_csv("stopwords/wordFrequencies_ls.csv")

    singletons_vocab_list = list(
        filter(lambda tupl: getSingletons(tupl), vocab_list))
    logging.info("Number of singletons : " + str(len(singletons_vocab_list)))
    logging.info(str(singletons_vocab_list[0:1000]))

    urls_vocab_list = list(filter(lambda tupl: getURLs(tupl), vocab_list))
    logging.info("Number of URLs : " + str(len(urls_vocab_list)))
    logging.info(str(urls_vocab_list[0:1000]))

    wordDocFrequency_dict = dict.fromkeys(model_forVocabulary.raw_vocab.keys(),
                                          0)

    for taggedDocument in descDocs_ls:
        already_encountered = []
        words_ls = taggedDocument.words
        for word in words_ls:
            if word not in already_encountered:
                wordDocFrequency_dict[word] = wordDocFrequency_dict[word] + 1
                already_encountered.append(word)

    # It would be log (N / df(w)). For ordering purposes, (N / df(w)) suffices, or even (1 / df(w))
    # Therefore, to pick the words with lowest IDF we must pick those with a higher df(w)
    docFreq_list = [(k, v) for k, v in wordDocFrequency_dict.items()]
    docFreq_list.sort(
        key=lambda tuple: tuple[1],
        reverse=True)  # sorted in place, descendingly, depending on the value
    logging.info("The Doc-frequencies of the words have been determined.")
    logging.info(str(docFreq_list[0:400]))
    pandas.DataFrame(
        docFreq_list[0:400]).to_csv("stopwords/docFrequencies_ls.csv")
def shuffle_balancedinstances_db():
    MyUtils.init_logging("Shuffle_training_halves.log", logging.INFO)
    db_conn = sqlite3.connect(F.ONLINE_INSTANCEIDS_GLOBAL_DB)
    c = db_conn.cursor()

    f = open(F.ONLINE_TEMP_DB, "w")
    f.close()  # clean outdb
    outdb_conn = sqlite3.connect(F.ONLINE_TEMP_DB)
    outc = outdb_conn.cursor()
    outc.execute('''CREATE TABLE positiveinstances(p varchar(63),
                                            qs_ls varchar(8191) )  ''')
    outc.execute('''CREATE TABLE negativeinstances(p varchar(63),
                                                qs_ls varchar(8191) )  ''')
    outdb_conn.commit()

    num_pos_instances = c.execute(
        "SELECT COUNT(*) FROM positiveinstances").fetchone()[0]
    num_neg_instances = c.execute(
        "SELECT COUNT(*) FROM negativeinstances").fetchone()[0]
    ids_pos = np.random.choice(a=range(1, num_pos_instances + 1),
                               size=num_pos_instances,
                               replace=False)
    ids_neg = np.random.choice(a=range(1, num_neg_instances + 1),
                               size=num_neg_instances,
                               replace=False)

    for id_pos in ids_pos:
        picked_row = c.execute(
            "SELECT * FROM positiveinstances WHERE rowid = " +
            str(id_pos)).fetchone()
        p = picked_row[0]
        qs = picked_row[1]
        outc.execute('''INSERT INTO positiveinstances VALUES (?,?);''',
                     (str(p), qs))
    outdb_conn.commit()
    logging.info(
        "Training set: Positive instances have been shuffled. Proceeding to shuffle negative instances..."
    )
    for id_neg in ids_neg:
        picked_row = c.execute(
            "SELECT * FROM negativeinstances WHERE rowid = " +
            str(id_neg)).fetchone()
        p = picked_row[0]
        qs = picked_row[1]
        outc.execute('''INSERT INTO negativeinstances VALUES (?,?);''',
                     (str(p), qs))
    outdb_conn.commit()
    logging.info("Training set: Negative instances have been shuffled.")

    rename(src=F.ONLINE_TEMP_DB, dst=F.ONLINE_INSTANCEIDS_GLOBAL_DB)
Example #19
def create_phrases_model():
    MyUtils.init_logging("Encode_Common.log")
    logging.info("Starting preparation of phrases...")
    docs_percent_touse = 1  #0.5.
    chunk_size = 10**5

    doc_filenames = [F.DESCDOCS_RAW, F.QADOCS_RAW]
    doc_files = [open(doc_filename, "r") for doc_filename in doc_filenames]
    all_docwords = []
    for doc_file in doc_files:
        for docs_chunk in pd.read_csv(doc_file, chunksize=chunk_size):
            len_c = len(docs_chunk)
            words_chunk = []
            indices = list(
                sorted(
                    numpy.random.choice(len_c,
                                        int(docs_percent_touse * len_c),
                                        replace=False)))
            selected_rows = docs_chunk.iloc[indices]
            for tupl in selected_rows.itertuples():
                word_ls = ast.literal_eval(tupl.words)
                words_chunk.append(word_ls)
            all_docwords.extend(words_chunk)
            logging.info("Reading in the documents' words. Chunk processed...")
        logging.info("Completed: reading in a set of documents' words"
                     )  # @ time = " + str(round(time1 - start, 3)))

    logging.info("Number of documents to use in the Phrases model: %s",
                 str(len(all_docwords)))
    del doc_filenames
    del doc_files
    collect()

    phrases_model = phrases.Phrases(sentences=all_docwords,
                                    min_count=20,
                                    threshold=300,
                                    delimiter=b'_',
                                    max_vocab_size=30 * 10**6)
    #phraser_model = phrases.Phraser(phrases_model)
    #time2 = time();
    logging.info(
        "Phrases model created")  #@ time = " + str(round(time2 - start, 3)))
    logging.info("Memory size in MBs = %s",
                 str(mem.asizeof(phrases_model) // 2**20))

    phrases_model.save(F.PHRASES_MODEL)

    return phrases_model
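
Usage sketch for the saved model: once reloaded, a gensim Phrases model maps a token list to the same list with frequent collocations merged into single "word_word" tokens (the path and example tokens below are placeholders).

from gensim.models import phrases

phrases_model = phrases.Phrases.load("phrases.model")  # placeholder path; the code above saves to F.PHRASES_MODEL
tokens = ["the", "stainless", "steel", "water", "bottle"]
print(phrases_model[tokens])  # e.g. ["the", "stainless_steel", "water", "bottle"] if that bigram was learned
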
Example #20
def prepare_dq_documents():
    MyUtils.init_logging("Encode_Common.log")
    start = time()
    phrases_model = phrases.Phrases.load(F.PHRASES_MODEL)
    logging.info(
        "Started updating the TaggedDocuments according to the Phrases model..."
    )
    doc_filenames = [(F.DESCDOCS_RAW, F.DESCDOCS), (F.QADOCS_RAW, F.QADOCS)]

    for tupl_fn in doc_filenames:
        input_filename = tupl_fn[0]
        output_filename = tupl_fn[1]
        f = open(output_filename, "w")
        f.close()  # clean output file between runs

        with open(output_filename, "a") as newdocs_file:
            newdocs_file.write(",words,tags\n")

            with open(input_filename, "r") as rawdocs_file:
                chunk_n_elems = 10**5
                for segment in pd.read_csv(rawdocs_file,
                                           chunksize=chunk_n_elems):
                    new_docs_chunk = []
                    for tupl in segment.itertuples():
                        try:
                            old_words = ast.literal_eval(
                                tupl.words
                            )  #evaluates the string back into a list
                            new_words = phrases_model[old_words]
                            new_docs_chunk.append(
                                D2V.TaggedDocument(words=new_words,
                                                   tags=tupl.tags))
                        except ValueError:
                            logging.warning(
                                "Info: literal evaluation did not apply to element: %s",
                                str(tupl.tags))
                    pd.DataFrame(new_docs_chunk).to_csv(newdocs_file,
                                                        mode="a",
                                                        header=False)
                    logging.info(
                        "Documents updated with phrases: a chunk has been processed"
                    )
        logging.info(
            "Completed: a set of documents has been updated with Phrases")

    time3 = time()
    logging.info("New documents created, in time = %s",
                 str(round(time3 - start, 3)))
Example #21
def order_test(finalfile_path):
    MyUtils.init_logging("temp.log")
    segment_size = 10**4
    segment_counter = 1
    for segment in pd.read_csv(finalfile_path, chunksize=segment_size,
                               sep='_'):
        #for tpl in segment.itertuples():
        #    logging.info(tpl)
        asins = segment.id
        ordered = utilities.MyUtils.check_series_ids_sorted(asins, len(asins))
        logging.info("Is the chunks of elements n.%s ordered?: %s",
                     segment_counter, ordered)
        #if not ordered:
        #    logging.info(asins)
        segment_counter = segment_counter + 1
        collect()
Example #22
def get_keywords_keywords_distance(prod_tuple, q_tuple):
    try:
        p_keywords = np.array(
            MyUtils_strings.fromlls_toarrays(prod_tuple.kwsVectors))
        q_keywords = np.array(
            MyUtils_strings.fromlls_toarrays(q_tuple.kwsVectors))
        m = len(p_keywords)
        n = len(q_keywords)
        sim_matrix = np.ones(shape=(m, n)) * -1
        for i in range(m):
            kw_vec_1 = p_keywords[i]
            for j in range(n):
                kw_vec_2 = q_keywords[j]
                sim_matrix[i][j] = 1 - distance.cosine(u=kw_vec_1, v=kw_vec_2)
        # logging.debug("\nThe sim.matrix : %s", sim_matrix)

        max_similarities = MyUtils.pick_maxmatches_matrix(sim_matrix)

        min_distances = list(map(lambda sim: 1 - sim, max_similarities))

        avg_min_distance = np.average(min_distances)

        return avg_min_distance
    except NameError:
        return None
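
A self-contained worked example of the distance logic above, with a hypothetical stand-in for MyUtils.pick_maxmatches_matrix (assumed here to keep each row's best similarity) and two dummy keyword vectors per side:

import numpy as np
from scipy.spatial import distance

def pick_row_maxima(sim_matrix):
    # Hypothetical stand-in: for each product keyword (row), keep its best similarity.
    return sim_matrix.max(axis=1)

p_keywords = np.array([[1.0, 0.0], [0.0, 1.0]])
q_keywords = np.array([[1.0, 0.0], [0.7, 0.7]])
sim_matrix = np.array([[1 - distance.cosine(p, q) for q in q_keywords] for p in p_keywords])
min_distances = [1 - s for s in pick_row_maxima(sim_matrix)]
print(np.average(min_distances))  # low average distance: every product keyword has a close question keyword
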
Example #23
def explore_phrase2vec(min_freq, phrases_threshold):
    MyUtils.init_logging("Explore_Phrase2Vec.log")
    doc_filenames = [F.DESCDOCS_RAW, F.QADOCS_RAW]
    doc_files = [open(doc_filename, "r") for doc_filename in doc_filenames]
    all_docwords = []
    chunk_size = 10**5
    for doc_file in doc_files:
        for docs_chunk in pd.read_csv(doc_file, chunksize=chunk_size):
            len_c = len(docs_chunk)
            words_chunk = []
            #indices = list(sorted(numpy.random.choice(len_c, int(docs_percent_touse * len_c), replace=False)))
            #selected_rows = docs_chunk.iloc[indices]
            for tupl in docs_chunk.itertuples():
                #words = tupl.words.replace("'",'"')
                #logging.info(words)
                #word_ls = json.loads(words)#ast.literal_eval(tupl.words)
                word_ls = eval(tupl.words, {'__builtins__': {}})
                words_chunk.append(word_ls)
            all_docwords.extend(words_chunk)
            logging.info("Added chunk from file %s to documents list...",
                         doc_file)

    logging.info("Number of documents: %s", len(all_docwords))
    phrases_model = phrases.Phrases(sentences=all_docwords,
                                    min_count=min_freq,
                                    threshold=phrases_threshold,
                                    delimiter=b'_')
    #logging.info("***The Phrases model's frequency vocabulary: %s", str(phrases_model.vocab))
    phrases_vocab = phrases_model.vocab
    collect()  # keep phrases_model alive: it is used again in the demonstration loop below
    sorted_vocabulary = sorted(list(phrases_vocab.items()),
                               key=lambda tpl: tpl[1],
                               reverse=True)
    phrases_sorted_vocabulary = list(
        filter(lambda tpl: '_' in str(tpl[1]), sorted_vocabulary))
    individual_words_sorted_vocabulary = list(
        filter(lambda tpl: not ('_' in str(tpl[1])), sorted_vocabulary))
    logging.info("***The vocabulary of phrases, ordered by frequency : %s ",
                 phrases_sorted_vocabulary)
    logging.info("***The vocabulary of words, ordered by frequency : %s ",
                 individual_words_sorted_vocabulary)
    #phrases_model.save("Exploration_phrasesModel_mincount"+ str(min_freq) + "_T"+str(phrases_threshold) + ".model")

    # Print the phrase-merged version of the first quarter of the documents
    for i in range(len(all_docwords) // 4):
        print(str(phrases_model[all_docwords[i]]))
Example #24
def apply_NN_on_testset():
    with tf.Session() as session:
        MyUtils.init_logging("GetCandidatesNN.log")

        test_db = sqlite3.connect(F.NN_TEST_INSTANCES_DB)
        test_db_c = test_db.cursor()

        saver = tf.train.import_meta_graph(F.SAVED_NN+'.meta')
        saver.restore(session, tf.train.latest_checkpoint(os.path.dirname(F.SAVED_NN)))

        MyUtils_filesystem.clean_directory(F.NN_TEST_OUTPUT_DIR)
        test_filewriter = tf.summary.FileWriter(os.path.join(F.NN_TEST_OUTPUT_DIR), session.graph)


    #with tf.variable_scope("reuse_fortest_scope", reuse=tf.AUTO_REUSE):
        graph = tf.get_default_graph()
        input_placeholder = graph.get_tensor_by_name("input_pl:0")
        labels_placeholder = graph.get_tensor_by_name("labels_pl:0")
        placeholders = (input_placeholder, labels_placeholder)

        test_accuracy_summary = graph.get_tensor_by_name("Accuracy:0")
        predictions = graph.get_tensor_by_name("predictions:0")
        tf_metric, tf_metric_update = tf.metrics.accuracy(labels=labels_placeholder, predictions=predictions,
                                                          name="Test_accuracy")

        tasks = [tf_metric_update, predictions]
        writing_tasks = [test_accuracy_summary]

        accuracy_variables = list(filter(lambda var: "accuracy" in var.name, tf.local_variables() ) )
        logging.debug("Accuracy variables: %s",  accuracy_variables)
        session.run(tf.variables_initializer(accuracy_variables))

        (test_accuracy_value, foundcandidates_mapls)= \
            compute_output_dataset_accuracy(1, placeholders, test_db_c,tasks, writing_tasks, [test_filewriter], session)

        foundcandidates_ls = sorted(foundcandidates_mapls, key=lambda elem: elem[0])

        logging.info("Test accuracy value: %s", test_accuracy_value)
        logging.info("Candidates found: %s", foundcandidates_ls)

        candidates_outdb = sqlite3.connect(F.CANDIDATES_NN_DB)
        #outdb_c = candidates_outdb.cursor()

        candidates_df = pd.DataFrame(foundcandidates_ls, columns=["p_id", "q_id"], dtype=str)
        candidates_df.to_sql(name="candidates", con=candidates_outdb, if_exists="replace", dtype={"p_id":"varchar(63)",
                                                                                                  "q_id":"varchar(63)"})
def create_docvectors_model():
    MyUtils.init_logging("VectorizeDescriptions.log")
    start = time()

    docs_percent_touse = 1  # on the full training set, 0.3 is probably advisable.
    chunk_size = 10 ** 5

    doc_filenames = [F.DESCDOCS, F.QADOCS]
    doc_files = [open(doc_filename, "r") for doc_filename in doc_filenames]
    trainingset_ls = []
    for doc_file in doc_files:
        for descdocs_chunk in pd.read_csv(doc_file, chunksize=chunk_size):
            len_c = len(descdocs_chunk)
            indices = list(sorted(numpy.random.choice(len_c, int(docs_percent_touse * len_c), replace=False)))
            selected_rows = descdocs_chunk.iloc[indices]
            docs = []
            for tupl in selected_rows.itertuples():
                docs.append(D2V.TaggedDocument(words=ast.literal_eval(tupl.words), tags=ast.literal_eval(tupl.tags)))
            trainingset_ls.extend(docs)
            logging.info("Reading in the documents' words. Chunk processed...")
        logging.info("Completed: reading in a set of documents.")
        doc_file.close()
    del doc_files; del doc_filenames; collect()


    print(trainingset_ls[0:1])
    logging.info("Total number of documents in the corpus: %s", len(trainingset_ls))
    logging.info("Starting to build vocabulary and Doc2Vec model.")

    model = D2V.Doc2Vec(min_count=4, size=200, dm=1, workers=cpu_count(), # ignore words seen fewer than 4 times; 200-dim vectors; PV-DM
                        docvecs_mapfile = os.path.join("gensim_models", "doc2vec_memorymapped_vectors"))
    # create the overall vocabulary, from the descriptions and the questions:
    model.build_vocab(documents=trainingset_ls, update=False, progress_per=10000, keep_raw_vocab=False)

    logging.info("D2V Vocabulary created")

    model.train(documents=trainingset_ls, total_examples=len(trainingset_ls), epochs=10, start_alpha=0.025,
                end_alpha=0.001, word_count=0, queue_factor=2, report_delay=1.0)

    model.save(F.D2V_MODEL)

    end = time()
    logging.info("Doc2Vec model saved. Time elapsed = %s", str(round(end - start , 3)))
    logging.info("Memory size in MBs = %s", str(mem.asizeof(model) // 2 ** 20))

    return model
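
Usage sketch: the saved Doc2Vec model can be reloaded to infer a vector for an unseen, already-tokenised document via gensim's infer_vector (the path and tokens below are placeholders).

from gensim.models import doc2vec as D2V

d2v_model = D2V.Doc2Vec.load("doc2vec.model")  # placeholder path; the code above saves to F.D2V_MODEL
new_doc_words = ["waterproof", "hiking", "boots"]
vector = d2v_model.infer_vector(new_doc_words)
print(vector.shape)  # (200,) with the 200-dimensional settings used above
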
Example #26
def clean_0_ids(outfile_tofilter_path, ps_or_qs_stringflag):

    MyUtils.init_logging("temp.log")
    logging.info(outfile_tofilter_path)
    new_outfile_path = F.ELEMENTS_TEMP
    f = open(new_outfile_path, 'w')
    f.close()

    if utilities.MyUtils_flags.FLAG_QUESTS in ps_or_qs_stringflag or utilities.MyUtils_flags.FLAG_QUESTIONS in ps_or_qs_stringflag:
        #     new_outfile.write("_id_questionType_questionVec_kwsVectors\n")
        #     logging.info("Writing: _id_questionType_questionVec_kwsVectors", )
        the_header = ["id", "questionType", "questionVec",
                      "kwsVectors"]  #or the_header?
    if utilities.MyUtils_flags.FLAG_PRODS in ps_or_qs_stringflag or utilities.MyUtils_flags.FLAG_PRODUCTS in ps_or_qs_stringflag:
        #     new_outfile.write("_id_price_titlevec_descvec_mdcategories_kwsVectors\n")
        #     logging.info("Writing: _id_price_titlevec_descvec_mdcategories_kwsVectors", )
        the_header = [
            "id", "price", "titlevec", "descvec", "mdcategories", "kwsVectors"
        ]

    segment_size = 5 * 10**4
    segment_counter = 1
    for segment in pd.read_csv(outfile_tofilter_path,
                               chunksize=segment_size,
                               sep='_'):
        segment_buffer = []
        for elem_tuple in segment.itertuples():
            if len(elem_tuple.id) >= 9:  #.id
                segment_buffer.append(elem_tuple)
            else:
                logging.info(elem_tuple.id)
        #temp_columns = ['_4','_5','_6','NODESCVEC','_8','NOKWSVECTORS']
        #segment_header = True if segment_counter==1 else False
        segment_df = pd.DataFrame(segment_buffer)[the_header]
        segment_df.columns = the_header
        segment_df.to_csv(new_outfile_path,
                          mode="a",
                          header=bool(segment_counter == 1),
                          sep="_",
                          index=False)
        logging.info("Filtered '0' ids from segment n. %s...", segment_counter)
        segment_counter = segment_counter + 1
        collect()

    os.rename(src=new_outfile_path, dst=outfile_tofilter_path)
def attach_category_products(category_dir_path=None):
    MyUtils.init_logging("OnlineLearning-attach_category_products.log", logging.INFO)
    segment_size = 10**4
    #the files PRODUCTS_FINAL_TRAIN and PRODUCTS_FINAL_VALID are already sorted
    products_train_fpath = F.PRODUCTS_FINAL_TRAIN
    products_valid_fpath = F.PRODUCTS_FINAL_VALID

    for filename in os.listdir(category_dir_path):
        if filename.endswith(".csv") and utilities.MyUtils_flags.FLAG_INITIAL in filename:
            category_products_filepath = os.path.join(category_dir_path, utilities.MyUtils_flags.FLAG_PRODUCTS + utilities.MyUtils_strings.remove_string_start(filename,
                                                                                                                                                               utilities.MyUtils_flags.FLAG_INITIAL))
            logging.info("File in which to store the products belonging to the category:%s", category_products_filepath)
            category_qs_fpath = os.path.join(category_dir_path,filename)
            #logging.info("%s", category_qs_fpath)
            #register_products(products_train_fpath, category_qs_fpath, category_products_filepath, append=False)
            register_products_db(F.PRODUCTS_FINAL_TRAIN_DB, category_qs_fpath, category_products_filepath, append=False)
            register_products_db(F.PRODUCTS_FINAL_VALID_DB, category_qs_fpath, category_products_filepath, append=True)
Example #28
def readfirst_qa():
    MyUtils.init_logging("ReadQuestions.log")
    clean_old_files(all=True)
    filenames = get_filenames()
    core_filenames = list(map(lambda s: utilities.MyUtils_strings.remove_string_end(s, ".json.gz"), filenames))
    nameslist = list(zip(filenames, core_filenames))
    for (fname, core_fname) in nameslist:

        with gzip.open(F.QA_DIR_PATH + "/" + fname, 'rb') as qa_file:  # use gzip.open and 'rb' if the file is compressed
            chunk = qa_file.readlines()  # returns a list of strings, one for each line
            qa_df = pd.DataFrame(create_dict_from_data(chunk))
            qa_df = qa_df.set_index(keys="asin")  # sets 'asin' as the index (n: but also drops the column)

            qa_df.to_csv(F.QA_DIR_PATH + "/" + core_fname + ".csv", sep="_")
            logging.info("Did read questions subset: %s", str(fname))
    clean_empty_files()
    clean_empty_files(F.QA_DIR_PATH)
def create_representations_db(represented_elements_filepath,
                              outdb_filepath,
                              id_column_name="id"):
    MyUtils.init_logging("CreatePs&Qs_RepresentationDatabases.log")

    elements_file = open(represented_elements_filepath, "r")
    f = open(outdb_filepath, "w")
    f.close()  #clean outdb between runs

    segment_size = 2 * 10**4
    segment_id = 1
    db_conn = sqlite3.connect(outdb_filepath)
    c = db_conn.cursor()

    for in_segment in pd.read_csv(elements_file,
                                  sep="_",
                                  chunksize=segment_size,
                                  dtype="str",
                                  quotechar='"'):
        #for tpl in in_segment.itertuples():
        #    logging.info(tpl)
        #    raise Exception
        start = time()

        tablename = "elements" + str(segment_id)

        in_segment.to_sql(tablename,
                          db_conn,
                          chunksize=10**4,
                          if_exists='append',
                          dtype={id_column_name: "varchar(63)"})
        collect()

        c.execute("CREATE INDEX indexid_" + str(segment_id) + " ON " +
                  tablename + " (" + id_column_name + ");")
        logging.info(
            "The segment n.%s, with %s represented elements, has been copied from final_file to database, and indexed; "
            + "time elapsed: %s seconds", segment_id, segment_size,
            round(time() - start, 3))
        segment_id = segment_id + 1
        db_conn.commit()
    db_conn.close()
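
Because this function spreads the representations over per-segment tables (elements1, elements2, ...), the companion helper MyUtils_dbs.search_in_alltables_db used in other snippets presumably queries every table and concatenates the hits; a hypothetical stand-in consistent with that usage:

import sqlite3

def search_in_alltables_db_sketch(cursor, select_part, where_part):
    # Enumerate every table in the database and concatenate the matching rows.
    table_names = [r[0] for r in cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table'").fetchall()]
    results = []
    for table_name in table_names:
        results.extend(cursor.execute(select_part + " " + table_name + " " + where_part).fetchall())
    return results
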
def unpack_question_lists():
    MyUtils.init_logging("unpack_question_lists.log", logging.INFO)
    db_conn = sqlite3.connect(F.ONLINE_INSTANCEIDS_GLOBAL_DB)
    c = db_conn.cursor()

    f = open(F.ONLINE_TEMP_DB, "w")
    f.close()  # clean outdb
    outdb_conn = sqlite3.connect(F.ONLINE_TEMP_DB)
    outc = outdb_conn.cursor()
    outc.execute('''CREATE TABLE positiveinstances(p varchar(63),
                                            q varchar(63) )  ''')
    outc.execute('''CREATE TABLE negativeinstances(p varchar(63),
                                                q varchar(63) )  ''')
    outdb_conn.commit()

    p_qs_lts = c.execute("SELECT * FROM positiveinstances").fetchall()
    logging.info(
        "(Positive examples): Unpacking the lists of questions from %s products",
        len(p_qs_lts))
    for p_qs_t in p_qs_lts:
        p = p_qs_t[0]
        qs_str = p_qs_t[1]
        qs_ls = json.loads(qs_str.replace("'", '"'))
        for q in qs_ls:
            outc.execute('''INSERT INTO positiveinstances VALUES (?,?);''',
                         (str(p), str(q)))
    outdb_conn.commit()

    p_qs_lts = c.execute("SELECT * FROM negativeinstances").fetchall()
    logging.info(
        "(Negative examples): Unpacking the lists of questions from %s products",
        len(p_qs_lts))
    for p_qs_t in p_qs_lts:
        p = p_qs_t[0]
        qs_str = p_qs_t[1]
        qs_ls = json.loads(qs_str.replace("'", '"'))
        for q in qs_ls:
            outc.execute('''INSERT INTO negativeinstances VALUES (?,?);''',
                         (str(p), str(q)))
    outdb_conn.commit()
    rename(src=F.ONLINE_TEMP_DB, dst=F.ONLINE_INSTANCEIDS_GLOBAL_DB)