Example #1
def write_negative_instances(num_ps_rows, num_qs_rows, prods_db_c,
                             num_random_qs_per_prod, quests_db_c, outc, outdb):

    for rowid in (range(1, num_ps_rows + 1)):
        p_id = MyUtils_dbs.search_in_alltables_db(
            prods_db_c, "SELECT id FROM", "WHERE rowid = " + str(rowid))[0][0]
        neg_qs_ids = []
        neg_qs_indices = np.random.choice(a=range(1, num_qs_rows),
                                          size=num_random_qs_per_prod,
                                          replace=False)
        for neg_qs_index in neg_qs_indices:
            neg_qs_ids.append(
                MyUtils_dbs.search_in_alltables_db(
                    quests_db_c, "SELECT id FROM",
                    "WHERE `index` = " + str(neg_qs_index))[0][0])

        if (product_has_allfeatures(prods_db_c, p_id)
                and allquestions_have_allfeatures(quests_db_c, str(neg_qs_ids))):
            insertion_sequence = [(p_id, q_id, 0) for q_id in neg_qs_ids]
            outc.executemany("INSERT INTO instances VALUES (?,?,?)",
                             insertion_sequence)
        else:
            logging.info(
                "Product %s excluded from the instances due to not having all the features",
                p_id)

        if num_ps_rows >= 10 and rowid % (num_ps_rows // 10) == 0:
            logging.info("Working on category: +10%...")
            outdb.commit()
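
A minimal driver sketch for write_negative_instances, assuming the product/question databases already exist and that the project's helper modules (MyUtils_dbs, the feature-check functions) are importable; every path, table name and the sample size below is a placeholder, not something taken from the original code.

# Hypothetical usage sketch: paths, table names and num_random_qs_per_prod are assumptions.
import sqlite3

prods_db = sqlite3.connect("products_train.db")        # assumed path
quests_db = sqlite3.connect("questions_train.db")      # assumed path
out_db = sqlite3.connect("instances_train.db")         # assumed path
out_c = out_db.cursor()
out_c.execute("CREATE TABLE IF NOT EXISTS instances(p_id varchar(63), q_id varchar(63), label int)")

prods_c = prods_db.cursor()
quests_c = quests_db.cursor()
num_ps_rows = prods_c.execute("SELECT COUNT(*) FROM products").fetchone()[0]    # assumed table name
num_qs_rows = quests_c.execute("SELECT COUNT(*) FROM questions").fetchone()[0]  # assumed table name

write_negative_instances(num_ps_rows, num_qs_rows, prods_c,
                         num_random_qs_per_prod=5, quests_db_c=quests_c,
                         outc=out_c, outdb=out_db)
out_db.commit()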
Example #2
def sort_candidates(candidates_db_path, ranked_candidates_outdb_path, prod_reps_dbpath, quest_reps_dbpath):
    MyUtils.init_logging("Rank_candidates_nn.log")
    ### Connecting to the databases: candidates, test products, test questions
    candidates_nn_db = sqlite3.connect(candidates_db_path)
    cands_db_c = candidates_nn_db.cursor()

    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()

    # Create/truncate the output database file before connecting to it
    f = open(ranked_candidates_outdb_path, "w")
    f.close()
    outdb = sqlite3.connect(ranked_candidates_outdb_path)
    outdb_c = outdb.cursor()
    outdb_c.execute('''CREATE TABLE candidates(    p_id varchar(63),
                                                   q_id varchar(63),
                                                   distance int        
                                            )''')
    ###

    test_products_ids = cands_db_c.execute("SELECT DISTINCT p_id FROM candidates").fetchall()
    logging.info(test_products_ids[0])
    #logging.debug(test_products_ids)
    for tpl_pid in test_products_ids:
        pid = tpl_pid[0]
        product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                                    "WHERE id = '" + str(pid) + "'")[0]
        product_tuple = MyUtils.prodls_tonamedtuple(product_representation, offset=1)
        quests_ids = [results_tpl[0] for results_tpl in
                      cands_db_c.execute("SELECT q_id FROM candidates WHERE p_id = ?", tpl_pid).fetchall()]
        logging.debug(quests_ids)
        product_qs_sorted = sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c)
        outdb.executemany("INSERT INTO candidates VALUES (?,?,?)", product_qs_sorted)
    outdb.commit()
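
A short invocation sketch for the function above; the four database paths are placeholders, not names used by the project.

# Hypothetical invocation; all file paths are placeholders.
if __name__ == "__main__":
    sort_candidates(candidates_db_path="candidates_nn.db",
                    ranked_candidates_outdb_path="ranked_candidates.db",
                    prod_reps_dbpath="test_products_reps.db",
                    quest_reps_dbpath="test_questions_reps.db")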
Example #3
def write_positive_instances(num_ps_rows, prods_db_c, quests_db_c, outc):
    ###Iterate over the products:
    for rowid in (range(1, num_ps_rows + 1)):
        p_id = MyUtils_dbs.search_in_alltables_db(
            prods_db_c, "SELECT id FROM", "WHERE rowid = " + str(rowid))[0][0]

        ### Get all the Qs asked for the selected P; they will always be part of the dataset, since there are so few Ps
        q_ids_results = MyUtils_dbs.search_in_alltables_db(
            quests_db_c, "SELECT id FROM",
            "WHERE id LIKE '" + str(p_id) + "%'")
        q_ids_ls = [tpl[0] for tpl in q_ids_results]
        #filter: p and qs must have all features
        if (product_has_allfeatures(prods_db_c, p_id)
                and allquestions_have_allfeatures(quests_db_c, str(q_ids_ls))):
            insertion_sequence = [(p_id, q_id, 1) for q_id in q_ids_ls]
            outc.executemany("INSERT INTO instances VALUES (?,?,?)",
                             insertion_sequence)
        else:
            logging.info(
                "Product %s excluded from the instances due to not having all the features",
                p_id)
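
A companion sketch pairing the positive-instance writer with the negative one from Example #1; the paths and table names are assumptions, as before.

# Hypothetical driver for the positive-instance writer; paths/table names are assumptions.
import sqlite3

prods_c = sqlite3.connect("products_train.db").cursor()      # assumed path
quests_c = sqlite3.connect("questions_train.db").cursor()    # assumed path
out_db = sqlite3.connect("instances_train.db")               # assumed path
out_c = out_db.cursor()
out_c.execute("CREATE TABLE IF NOT EXISTS instances(p_id varchar(63), q_id varchar(63), label int)")

num_ps_rows = prods_c.execute("SELECT COUNT(*) FROM products").fetchone()[0]  # assumed table name
write_positive_instances(num_ps_rows, prods_c, quests_c, out_c)
out_db.commit()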
Example #4
def sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c):

    distance_list = [] #list of tuples, (p_id, q_id, distance), sorted on tpl[2]

    for quest_id in quests_ids:
        question_representation = MyUtils_dbs.search_in_alltables_db(testquests_rep_c, "SELECT * FROM ",
                                                             "WHERE id = '" + str(quest_id) + "'")[0]
        logging.debug("Question representation: %s", question_representation)
        question_tuple = MyUtils.quest_lstonamedtuple(question_representation, offset=1)
        pq_dist = CD.compute_dist_pq(product_tuple, question_tuple)
        distance_list.append((product_tuple.id, quest_id, pq_dist))

    distance_list_sorted = sorted(distance_list, key=lambda tpl : tpl[2])
    return distance_list_sorted
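
A follow-up sketch showing how the sorted output might be consumed, assuming product_tuple, quests_ids and the two cursors were prepared as in sort_candidates (Example #2); the cut-off of 10 is illustrative only.

# Hypothetical follow-up: keep only the k closest questions for one product.
# product_tuple, quests_ids and the cursors are assumed to come from the caller shown in Example #2.
ranked = sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c)
for p_id, q_id, dist in ranked[:10]:      # (p_id, q_id, distance), ascending distance
    logging.info("%s -> %s : %s", p_id, q_id, dist)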
Example #5
def get_instance_encoded_dictionary(prod_id, question_id, ps_db_c, qs_db_c,
                                    d2v_model):

    product_row = MyUtils_dbs.search_in_alltables_db(
        ps_db_c, "SELECT * FROM ", "WHERE id = '" + prod_id + "'")
    question_row = MyUtils_dbs.search_in_alltables_db(
        qs_db_c, "SELECT * FROM ", "WHERE id = '" + str(question_id) + "'")
    prod_tuple = MyUtils.prodls_tonamedtuple(product_row[0])
    q_tuple = MyUtils.quest_lstonamedtuple(question_row[0])

    instance_x = {}
    instance_x["p_descvec"] = MyUtils_strings.fromstring_toarray(
        prod_tuple.descvec)
    instance_x["p_titlevec"] = MyUtils_strings.fromstring_toarray(
        prod_tuple.titlevec)
    instance_x["p_kwsVectors"] = MyUtils_strings.fromlls_toarrays(
        prod_tuple.kwsVectors)
    #logging.debug("instance_x['p_kwsVectors'].shape : %s", np.array(instance_x["p_kwsVectors"]).shape)
    instance_x["p_mdcategories"] = MyUtils_strings.categories_to_vecs_lls(
        MyUtils_strings.fromlls_toarrays(prod_tuple.mdcategories), d2v_model)
    if len(np.array(instance_x["p_mdcategories"]).shape) >= 3:
        logging.debug("instance_x['p_mdcategories'].shape : %s",
                      np.array(instance_x["p_mdcategories"]).shape)
        instance_x["p_mdcategories"] = instance_x["p_mdcategories"][0]

    instance_x["q_questionVec"] = MyUtils_strings.fromstring_toarray(
        q_tuple.questionVec)
    instance_x["q_questionType"] = q_tuple.questionType
    instance_x["q_kwsVectors"] = MyUtils_strings.fromlls_toarrays(
        q_tuple.kwsVectors)

    # Label: 1 if the question id's asin prefix matches the product id (i.e. the question was asked for this product)
    instance_y = 1 if q_tuple.id[0:10] in prod_id else 0
    instance = namedtuple('instance', 'x y')
    inst = instance(x=instance_x, y=instance_y)

    return inst
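
A usage sketch for the encoder above, assuming d2v_model is a gensim Doc2Vec model (as suggested by the categories_to_vecs_lls call); the database paths and the product/question ids are placeholders.

# Hypothetical usage: encode a single (product, question) pair.
# Paths, ids and the Doc2Vec model file are placeholders/assumptions.
import sqlite3
from gensim.models import Doc2Vec

ps_c = sqlite3.connect("products_final.db").cursor()     # assumed path
qs_c = sqlite3.connect("questions_final.db").cursor()    # assumed path
d2v_model = Doc2Vec.load("doc2vec.model")                # assumed path

inst = get_instance_encoded_dictionary("B00EXAMPLE", "B00EXAMPLE_1451606400000",  # placeholder ids
                                       ps_c, qs_c, d2v_model)
logging.info("label=%s, features=%s", inst.y, sorted(inst.x.keys()))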
Example #6
def define_negative_examples(doc2vec_model, dataset_typeflag):
    MyUtils.init_logging("NN_Dataset_Instances-define_negative_examples.log",
                         logging.INFO)

    f = open(F.PRODS_WITH_NOTASKEDQUESTS_IDS, "w")
    f.close()
    prodsnegativeqs_outfile = open(F.PRODS_WITH_NOTASKEDQUESTS_IDS, "a")
    prodsnegativeqs_outfile.write("id_questionsNotAsked\n")

    ### Connect with the database to read from: candidate negative examples
    db_conn = sqlite3.connect(F.CANDIDATE_NEGQS_DB)
    c = db_conn.cursor()

    ### IF we are working to create the training dataset,
    ### then, before allowing a question Q asked for P2 to be a negative example for P1,
    ### we check the similarity between P1 and P2 (it must not be too high)
    if dataset_typeflag == MyUtils_flags.FLAG_TRAIN:

        ### Determining the maximum allowed similarity between products. Creates the similarity db if it does not exist
        if os.path.exists(F.SIMILARITY_PRODUCTS_DB):
            p_sim_breakpoint = ES.get_products_similarity_breakpoint(
                fraction=0.97)
        else:
            p_sim_breakpoint = ES.explore_products_similarity(N=500,
                                                              fraction=0.97)

        ### Connect with the databases of product and questions representations, to be able to pick the products P1 and P2
        product_reps_dbconn = sqlite3.connect(F.PRODUCTS_FINAL_TRAIN_DB)
        product_reps_c = product_reps_dbconn.cursor()

    segment_size = 10**4
    for input_segment in pd.read_csv(F.PRODSWITHQUESTS_IDS,
                                     sep="_",
                                     chunksize=segment_size):
        for id_askedqs_t in input_segment.itertuples():
            prod_id = id_askedqs_t.id
            #logging.debug("Reading from F.PRODSWITHQUESTS_IDS, the product.id is: %s", prod_id)
            asked_qs = ast.literal_eval(id_askedqs_t.questionsAsked)
            t = (prod_id, )
            c.execute('SELECT * FROM prodnegatives WHERE prod_id=?', t)
            row = c.fetchone()
            if row is None:  #i.e. if the product in the file PRODSWITHQUESTS_IDS was excluded from the previous random subsampling
                continue
            candidatenegativeqs_rawstring = row[1]
            candidatenegativeqs_string = "[" + candidatenegativeqs_rawstring[:-1] + "]"

            candidatenegativeqs_ls = ast.literal_eval(
                candidatenegativeqs_string)
            candidatenegativeqs_ls1 = [
                q_id for q_id in candidatenegativeqs_ls if q_id not in asked_qs
            ]

            if dataset_typeflag == MyUtils_flags.FLAG_TRAIN:
                p1_row = MyUtils_dbs.search_in_alltables_db(
                    dbcursor=product_reps_c,
                    query_pretext="SELECT * FROM",
                    query_aftertext=" WHERE id='" + str(prod_id) + "'")[0]
                candidatenegativeqs_asins = list(
                    map(lambda q_id: q_id[0:10], candidatenegativeqs_ls1))

                # Build an explicit quoted IN list: str(tuple(...)) would leave a trailing comma when there is a single asin
                qs_asins_inlist = "('" + "','".join(candidatenegativeqs_asins) + "')"
                p2_rows = MyUtils_dbs.search_in_alltables_db(
                    dbcursor=product_reps_c,
                    query_pretext="SELECT * FROM",
                    query_aftertext="WHERE id IN " + qs_asins_inlist)
                qids_and_p2rows = list(zip(candidatenegativeqs_ls1, p2_rows))

                for q_id, p2_row in qids_and_p2rows:
                    #logging.debug("p1_row : %s", p1_row)
                    if p2_row is not None and len(p2_row) > 0:
                        #there are questions without corresponding products, in which case no similarity check is to be done

                        p1_tuple = MyUtils.prodls_tonamedtuple(p1_row)  #[1:]?
                        p2_tuple = MyUtils.prodls_tonamedtuple(p2_row)
                        p1_p2_sim, _simparts = PS.compute_2products_similarity_singleprocess(
                            prod1_tuple=p1_tuple,
                            prod2_tuple=p2_tuple,
                            d2v_model=doc2vec_model)
                        if p1_p2_sim > p_sim_breakpoint:
                            candidatenegativeqs_ls1.remove(q_id)
                            logging.info(
                                "Removing question from the candidate negative examples, "
                                +
                                "because the similarity between %s and %s is > %s",
                                prod_id, p2_tuple.id, p_sim_breakpoint)
                logging.info(
                    "Choosing negative examples: P-to-p similarity checks done for product: %s",
                    prod_id)

            random_indices = sorted(
                np.random.choice(a=range(len(candidatenegativeqs_ls1)),
                                 size=min(len(candidatenegativeqs_ls1),
                                          len(asked_qs)),
                                 replace=False,
                                 p=None))
            #logging.info(candidatenegativeqs_ls1)
            negativeqs_ls = [
                candidatenegativeqs_ls1[i] for i in random_indices
            ]
            #logging.info(negativeqs_ls)
            prodsnegativeqs_outfile.write(prod_id + "_" + str(negativeqs_ls) +
                                          "\n")

    prodsnegativeqs_outfile.close()
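
An invocation sketch for the function above; the Doc2Vec model path is a placeholder, while the F.* constants and MyUtils_flags.FLAG_TRAIN come from the project's own modules.

# Hypothetical invocation; the model path is a placeholder.
from gensim.models import Doc2Vec

doc2vec_model = Doc2Vec.load("doc2vec.model")   # assumed path
define_negative_examples(doc2vec_model, dataset_typeflag=MyUtils_flags.FLAG_TRAIN)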
Example #7
def attach_text_to_candidates(ranked_candidates_dbpath, prods_initial_dbpath, quests_initial_dbpath, prod_reps_dbpath, quest_reps_dbpath, final_outdb_path):
    MyUtils.init_logging("Attach_text_to_candidates.log")

    candidates_nn_db = sqlite3.connect(ranked_candidates_dbpath)
    cands_db_c = candidates_nn_db.cursor()
    # Create/truncate the temporary output database file
    f = open(F.RANKING_TEMP_DB, 'w')
    f.close()
    temp_db = sqlite3.connect(F.RANKING_TEMP_DB)
    temp_db_c = temp_db.cursor()
    testprods_initial_c = sqlite3.connect(prods_initial_dbpath).cursor()
    testquests_initial_c = sqlite3.connect(quests_initial_dbpath).cursor()
    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()

    temp_db_c.execute('''CREATE TABLE candidates(  p_id varchar(63),
                                                   q_id varchar(63),
                                                   distance int,
                                                   p_titletext varchar(1023),
                                                   p_descriptiontext varchar(8191),
                                                   p_categorytext varchar (4095),
                                                   q_text varchar (8191)         
                                            )''')

    num_of_candidates = MyUtils_dbs.get_tot_num_rows_db(cands_db_c)
    logging.info(num_of_candidates)
    counter_questionsameid = 0
    last_prod_id = 'x'

    for rowindex in range(1, num_of_candidates + 1):
        row = cands_db_c.execute("SELECT * FROM candidates WHERE rowid = ?", (rowindex,)).fetchone()
        #logging.info("info: %s", row)
        prod_id = row[0]
        quest_id = row[1]
        distance = row[2]

        if last_prod_id != prod_id:
            product_titleinfo,product_descinfo, product_categinfo = \
                MyUtils_dbs.search_in_alltables_db(testprods_initial_c, "SELECT title, description, categories FROM",
                                                                  "WHERE asin = '" + str(prod_id) + "'")[0]
            product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                                        "WHERE id = '" + str(prod_id) + "'")[0]
            prod_tpl = MyUtils.prodls_tonamedtuple(product_representation, offset=1)

            counter_questionsameid = 0
            last_prod_id = prod_id  # remember the current product so the per-product lookups and counter reset happen only once

        ###get question's unixTime
        if len(quest_id) < 21:  #format : @nan0
            base_endpoint = 14
        else:
            base_endpoint = 23
        question_unixTime = str(quest_id[11:base_endpoint])
        logging.debug("Question unixTime: %s", question_unixTime)

        if base_endpoint == 23: #if we have a valid unixTime specification

            possible_questions_text = MyUtils_dbs.search_in_alltables_db(testquests_initial_c, "SELECT question FROM",
                                                                  "WHERE asin = '" + str(quest_id[0:10]) + "'"
                                                              + " AND unixTime LIKE '" + question_unixTime + "%'")
        else: #if we have NULL in the unixTime field
            possible_questions_text = MyUtils_dbs.search_in_alltables_db(testquests_initial_c, "SELECT question FROM",
                                                                         "WHERE asin = '" + str(quest_id[0:10]) + "'"
                                                                         + " AND unixTime IS NULL")
        base_q_id = str(quest_id[0:23])
        possible_questions_reps = MyUtils_dbs.search_in_alltables_db(testquests_rep_c, "SELECT * FROM ",
                                                             "WHERE id LIKE '" + str(base_q_id) + "%'")
        logging.debug("possible_questions_reps: %s", possible_questions_reps)
        logging.debug("possible_questions_text:%s", possible_questions_text)

        if len(possible_questions_text) > 1:
            possible_questions_tuples = list(map ( lambda q_ls : MyUtils.quest_lstonamedtuple(q_ls, offset=1), possible_questions_reps))
            possible_questions_distances = list(map (lambda q_tpl : CD.compute_dist_pq(prod_tpl, q_tpl) , possible_questions_tuples))

            qs_dist_lts = list(zip(possible_questions_tuples, possible_questions_distances))
            qs_dist_lts_sorted = sorted( qs_dist_lts, key=lambda tpl : tpl[1])
            #logging.info("sorted question tuples: %s", qs_dist_lts_sorted)
            question_textinfo = possible_questions_text[counter_questionsameid][0]
            counter_questionsameid= counter_questionsameid+1
        else:
            question_textinfo = possible_questions_text[0][0]
        logging.debug("question_textinfo: %s", question_textinfo)

        temp_db_c.execute("INSERT INTO candidates VALUES (?,?,?,?,?,?,?)", (prod_id, quest_id, distance,
                                                                          product_titleinfo, product_descinfo, product_categinfo, question_textinfo))
        logging.debug("***")

    temp_db.commit()
    temp_db.close()  # close the connection before renaming the database file
    os.rename(F.RANKING_TEMP_DB, final_outdb_path)
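
A sketch of the full ranking step, chaining sort_candidates (Example #2) with the function above, assuming both functions are in scope; every path is a placeholder.

# Hypothetical end-to-end ranking step; all file paths are placeholders.
sort_candidates("candidates_nn.db", "ranked_candidates.db",
                "test_products_reps.db", "test_questions_reps.db")
attach_text_to_candidates(ranked_candidates_dbpath="ranked_candidates.db",
                          prods_initial_dbpath="test_products_initial.db",
                          quests_initial_dbpath="test_questions_initial.db",
                          prod_reps_dbpath="test_products_reps.db",
                          quest_reps_dbpath="test_questions_reps.db",
                          final_outdb_path="ranked_candidates_with_text.db")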