# Exemplo n.º 1
# 0
def sort_candidates(candidates_db_path, ranked_candidates_outdb_path, prod_reps_dbpath, quest_reps_dbpath):
    """Rank the candidate questions of each test product and persist the result.

    Reads the (p_id, q_id) candidate pairs from the SQLite db at
    `candidates_db_path`, looks up the vector representations of products and
    questions in `prod_reps_dbpath` / `quest_reps_dbpath`, sorts each product's
    candidates via sort_product_candidates(), and writes (p_id, q_id, distance)
    rows into a freshly created SQLite db at `ranked_candidates_outdb_path`.
    """
    MyUtils.init_logging("Rank_candidates_nn.log")
    ### Connecting to the databases: candidates, test products, test questions.
    # Keep the connection objects (not just cursors) so they can be closed.
    candidates_nn_db = sqlite3.connect(candidates_db_path)
    testprods_rep_db = sqlite3.connect(prod_reps_dbpath)
    testquests_rep_db = sqlite3.connect(quest_reps_dbpath)
    outdb = None
    try:
        cands_db_c = candidates_nn_db.cursor()
        testprods_rep_c = testprods_rep_db.cursor()
        testquests_rep_c = testquests_rep_db.cursor()

        # Truncate the output db file between runs, then (re)create the schema.
        open(ranked_candidates_outdb_path, "w").close()
        outdb = sqlite3.connect(ranked_candidates_outdb_path)
        outdb_c = outdb.cursor()
        outdb_c.execute('''CREATE TABLE candidates(    p_id varchar(63),
                                                       q_id varchar(63),
                                                       distance int        
                                                )''')
        ###

        test_products_ids = cands_db_c.execute("SELECT DISTINCT p_id FROM candidates").fetchall()
        if test_products_ids:  # guard: avoid IndexError on an empty candidates table
            logging.info(test_products_ids[0])
        for tpl_pid in test_products_ids:
            pid = tpl_pid[0]
            # NOTE(review): the id is interpolated into the query text; assumes
            # ids never contain single quotes -- confirm upstream sanitization.
            product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                                        "WHERE id = '" + str(pid) + "'")[0]
            product_tuple = MyUtils.prodls_tonamedtuple(product_representation, offset=1)
            quests_ids = [results_tpl[0] for results_tpl in
                          cands_db_c.execute("SELECT q_id FROM candidates WHERE p_id = ?", tpl_pid).fetchall()]
            logging.debug(quests_ids)
            product_qs_sorted = sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c)
            outdb.executemany("INSERT INTO candidates VALUES (?,?,?)", product_qs_sorted)
        outdb.commit()
    finally:
        # Fix: the original leaked all four connections (three of them were
        # unreachable after the connect(...).cursor() chaining).
        candidates_nn_db.close()
        testprods_rep_db.close()
        testquests_rep_db.close()
        if outdb is not None:
            outdb.close()
# Exemplo n.º 2
# 0
def get_instance_encoded_dictionary(prod_id, question_id, ps_db_c, qs_db_c,
                                    d2v_model):
    """Assemble one labeled (x, y) instance for a product-question pair.

    Looks up the encoded rows of the product and the question in their
    respective databases, decodes the stored string fields into numeric
    arrays, and labels the instance 1 when the question's id prefix occurs
    in the product id, 0 otherwise.
    """
    p_rows = MyUtils_dbs.search_in_alltables_db(
        ps_db_c, "SELECT * FROM ", "WHERE id = '" + prod_id + "'")
    q_rows = MyUtils_dbs.search_in_alltables_db(
        qs_db_c, "SELECT * FROM ", "WHERE id = '" + str(question_id) + "'")
    p_nt = MyUtils.prodls_tonamedtuple(p_rows[0])
    q_nt = MyUtils.quest_lstonamedtuple(q_rows[0])

    # Product-side features.
    features = {
        "p_descvec": MyUtils_strings.fromstring_toarray(p_nt.descvec),
        "p_titlevec": MyUtils_strings.fromstring_toarray(p_nt.titlevec),
        "p_kwsVectors": MyUtils_strings.fromlls_toarrays(p_nt.kwsVectors),
        "p_mdcategories": MyUtils_strings.categories_to_vecs_lls(
            MyUtils_strings.fromlls_toarrays(p_nt.mdcategories), d2v_model),
    }
    # Some category encodings carry an extra leading axis; unwrap it.
    if len(np.array(features["p_mdcategories"]).shape) >= 3:
        logging.debug("instance_x['p_mdcategories'].shape : %s",
                      np.array(features["p_mdcategories"]).shape)
        features["p_mdcategories"] = features["p_mdcategories"][0]

    # Question-side features.
    features["q_questionVec"] = MyUtils_strings.fromstring_toarray(
        q_nt.questionVec)
    features["q_questionType"] = q_nt.questionType
    features["q_kwsVectors"] = MyUtils_strings.fromlls_toarrays(q_nt.kwsVectors)

    # Positive label when the question id's first 10 chars occur in prod_id.
    label = 1 if q_nt.id[0:10] in prod_id else 0
    instance = namedtuple('instance', 'x y')
    return instance(x=features, y=label)
def register_matches(product_featureflags, quest_featureflags, dataset_type,
                     use_existing_file):
    """Find and record the product-question matches for one dataset split.

    Walks the products and questions CSV files of the chosen split in
    lockstep (a merge-join: both files are assumed sorted by id), matching a
    question to a product when the first 10 characters of the question id
    equal the product id.  Matches that pass the feature-flag filters are
    written as "<prod_id>_<[question ids]>" lines to F.PRODSWITHQUESTS_IDS,
    which is finally copied to F.PRODSWITHQUESTS_IDS_ALL + dataset_type.

    :param product_featureflags: flags forwarded to featurefilter_prod().
    :param quest_featureflags: flags forwarded to featurefilter_quest().
    :param dataset_type: MyUtils_flags.FLAG_VALID or FLAG_TEST; any other
        value selects the training files.
    :param use_existing_file: if True and a non-empty matches file already
        exists, only count its distinct products and return, skipping the scan.
    :return: the number of products that have at least one matching question.
    """
    allmatches_filepath = F.PRODSWITHQUESTS_IDS_ALL + dataset_type
    if use_existing_file:
        if os.path.exists(allmatches_filepath):
            if os.path.getsize(allmatches_filepath) > 0:
                logging.info(
                    "The P-Q matches for the requested dataset were already found. They are located in the file:%s",
                    allmatches_filepath)
                last_prod_id = "x"  # sentinel, shorter than any real product id
                allmatches_file = open(file=allmatches_filepath,
                                       mode="r",
                                       newline='')
                reader = csv.reader(allmatches_file,
                                    delimiter='_',
                                    quotechar='"')
                reader.__next__()  #skip header
                count_ps_withmatches = 0
                # Count distinct product ids; the file is grouped by product,
                # so comparing with the previous id is sufficient.
                while True:
                    try:
                        p_ls = reader.__next__()
                        prod_id = p_ls[0]
                        if prod_id != last_prod_id:
                            count_ps_withmatches = count_ps_withmatches + 1
                            last_prod_id = prod_id
                    except StopIteration:
                        break
                allmatches_file.close()
                return count_ps_withmatches

    # Select the per-split input files and numeric-encoding databases.
    if dataset_type == MyUtils_flags.FLAG_VALID:
        ps_db_filepath = F.PRODS_NUMENCODING_DB_VALID
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_VALID
        prods_filepath = F.PRODUCTS_FINAL_VALID
        quests_filepath = F.QUESTIONS_FINAL_VALID
    elif dataset_type == MyUtils_flags.FLAG_TEST:
        ps_db_filepath = F.PRODS_NUMENCODING_DB_TEST
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_TEST
        prods_filepath = F.PRODUCTS_FINAL_TEST
        quests_filepath = F.QUESTIONS_FINAL_TEST
    else:  #"train"
        ps_db_filepath = F.PRODS_NUMENCODING_DB_TRAIN
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_TRAIN
        prods_filepath = F.PRODUCTS_FINAL_TRAIN
        quests_filepath = F.QUESTIONS_FINAL_TRAIN

    MyUtils.init_logging("RegisterMatches.log", logging.INFO)
    start = time()
    f = open(F.PRODSWITHQUESTS_IDS, "w")
    f.close()  #clean outfile between runs
    ids_outfile = open(F.PRODSWITHQUESTS_IDS, "a")
    ids_outfile.write("id_questionsAsked\n")

    #connecting with the products, to filter them, based on the features we chose to include
    ps_db_conn = sqlite3.connect(ps_db_filepath)
    ps_db_cursor = ps_db_conn.cursor()
    # connecting with the questions, to filter them, based on the features we chose to include
    qs_db_conn = sqlite3.connect(qs_db_filepath)
    qs_db_cursor = qs_db_conn.cursor()

    prods_filehandler = open(prods_filepath, "r", newline='')
    quests_filehandler = open(quests_filepath, "r", newline='')
    reader_1 = csv.reader(prods_filehandler, delimiter='_', quotechar='"')
    reader_2 = csv.reader(quests_filehandler, delimiter='_', quotechar='"')

    num_prods_withmatches = 0
    num_products_reviewed = 0
    num_questions_reviewed = 0
    last_prod_id = "x"  # sentinel; its length <= 5 also skips the first write below
    questionsasked_ids_ls = []
    ### init:
    # NOTE(review): each reader is advanced twice before the first record is
    # read -- presumably the files carry two header-ish lines; confirm the
    # CSV layout, otherwise the first data row is silently dropped.
    reader_1.__next__()
    reader_2.__next__()
    reader_1.__next__()
    reader_2.__next__()  #skip headers
    p_ls = reader_1.__next__()
    q_ls = reader_2.__next__()
    prod_t = MyUtils.prodls_tonamedtuple(p_ls, offset=0)
    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
    q_prod = (quest_t.id)[0:10]  # the product id embedded in the question id
    #loop: merge-join over the two sorted streams
    while True:
        try:
            match = False
            while not (match):
                # Advance the product stream while its id sorts before the
                # question's product id.  NOTE(review): the extra length
                # comparison presumably compensates for ids of unequal width
                # in the lexicographic order -- confirm the id format.
                while q_prod > prod_t.id or (len(q_prod) > len(prod_t.id)):
                    logging.debug("%s < %s", prod_t.id, q_prod)
                    p_ls = reader_1.__next__()  #advance product
                    num_products_reviewed = num_products_reviewed + 1
                    prod_t = MyUtils.prodls_tonamedtuple(p_ls, offset=0)

                # Symmetrically advance the question stream.
                while q_prod < prod_t.id or (len(q_prod) < len(prod_t.id)):
                    logging.debug("%s > %s", prod_t.id, q_prod)
                    q_ls = reader_2.__next__()  #advance question
                    num_questions_reviewed = num_questions_reviewed + 1
                    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
                    q_prod = (quest_t.id)[0:10]

                if q_prod == prod_t.id:
                    match = True
                    #barrier: feature filtering on products and questions; DB lookup:
                    if featurefilter_prod(prod_t.id, product_featureflags, ps_db_cursor) == True and \
                       featurefilter_quest(quest_t.id, quest_featureflags, qs_db_cursor) == True:
                        logging.info("Match: product: %s , \t question: %s",
                                     prod_t.id, quest_t.id)
                        #positive_qs_ids_file.write(str(quest_t.id) + "\n")#store the question id (positive example)
                        if len(prod_t.id) > 5:  # ignore malformed/short ids
                            if prod_t.id != last_prod_id:
                                # New product: flush the previous product's
                                # accumulated question ids (unless it was the
                                # initial "x" sentinel), then start a new group.
                                if len(last_prod_id) > 5:
                                    ids_outfile.write(
                                        str(last_prod_id) + "_" +
                                        str(questionsasked_ids_ls) +
                                        "\n")  #write the previous p and qs
                                questionsasked_ids_ls = [
                                ]  #reset, and then append
                                questionsasked_ids_ls.append(quest_t.id)
                                last_prod_id = prod_t.id
                                num_prods_withmatches = num_prods_withmatches + 1  #n: matches = number of products that have questions
                            else:
                                logging.info("***")
                                questionsasked_ids_ls.append(
                                    quest_t.id
                                )  #same product as previously; only append
                    else:
                        pass  # filtered out by the feature flags
                    #on to the next question:
                    q_ls = reader_2.__next__()
                    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
                    q_prod = (quest_t.id)[0:10]

        except StopIteration:
            # Normal termination: one of the two readers is exhausted.
            # NOTE(review): the last product's question group is never
            # flushed to the file here -- confirm whether that is intended.
            exc_info = sys.exc_info()
            logging.warning("Exception information: %s", exc_info)
            break
    logging.info("Total number products that have matching questions: %s",
                 num_prods_withmatches)
    logging.info("Products reviewed: %s", num_products_reviewed)
    logging.info("Questions reviewed: %s", num_questions_reviewed)

    # Publish the result under the split-specific filename.
    # NOTE(review): `copy` is presumably shutil.copy, imported elsewhere.
    copy(src=F.PRODSWITHQUESTS_IDS,
         dst=F.PRODSWITHQUESTS_IDS_ALL + dataset_type)

    end = time()
    logging.info("Time elapsed: %s", round(end - start, 4))
    ids_outfile.close()
    prods_filehandler.close()
    quests_filehandler.close()
    #positive_qs_ids_file.close()
    return num_prods_withmatches
def define_negative_examples(doc2vec_model, dataset_typeflag):
    """Select negative-example questions for every product that has questions.

    Reads the candidate negatives from F.CANDIDATE_NEGQS_DB and the positive
    matches from F.PRODSWITHQUESTS_IDS; for each product, keeps the candidate
    questions that were not actually asked about it and samples as many of
    them as there are positives.  Output lines "<prod_id>_<[q_ids]>" are
    written to F.PRODS_WITH_NOTASKEDQUESTS_IDS.

    For the training split only, a candidate question asked about product P2
    is discarded as a negative for P1 when sim(P1, P2) exceeds a similarity
    breakpoint (too-similar products would make the negative label unreliable).

    :param doc2vec_model: doc2vec model forwarded to the similarity computation.
    :param dataset_typeflag: one of the MyUtils_flags dataset split flags.
    """
    MyUtils.init_logging("NN_Dataset_Instances-define_negative_examples.log",
                         logging.INFO)

    # Truncate the output file between runs, then reopen it for appending.
    f = open(F.PRODS_WITH_NOTASKEDQUESTS_IDS, "w")
    f.close()
    prodsnegativeqs_outfile = open(F.PRODS_WITH_NOTASKEDQUESTS_IDS, "a")
    prodsnegativeqs_outfile.write("id_questionsNotAsked\n")

    ### Connect with the database to read from: candidate negative examples
    db_conn = sqlite3.connect(F.CANDIDATE_NEGQS_DB)
    c = db_conn.cursor()

    ### IF we are working to create the training dataset,
    ### then before allowing a question Q asked for P2 to be a negative example for P1,
    ### we check the similarity between P1 and P2 (it must not be too high)
    if dataset_typeflag == MyUtils_flags.FLAG_TRAIN:

        ### Determining the maximum allowed similarity between products. Creates the similarity db if it does not exist
        if os.path.exists(F.SIMILARITY_PRODUCTS_DB) == True:
            p_sim_breakpoint = ES.get_products_similarity_breakpoint(
                fraction=0.97)
        else:
            p_sim_breakpoint = ES.explore_products_similarity(N=500,
                                                              fraction=0.97)

        ### Connect with the databases of product and questions representations, to be able to pick the products P1 and P2
        product_reps_dbconn = sqlite3.connect(F.PRODUCTS_FINAL_TRAIN_DB)
        product_reps_c = product_reps_dbconn.cursor()

    segment_size = 10**4
    for input_segment in pd.read_csv(F.PRODSWITHQUESTS_IDS,
                                     sep="_",
                                     chunksize=segment_size):
        for id_askedqs_t in input_segment.itertuples():
            prod_id = id_askedqs_t.id
            asked_qs = ast.literal_eval(id_askedqs_t.questionsAsked)
            t = (prod_id, )
            c.execute('SELECT * FROM prodnegatives WHERE prod_id=?', t)
            row = c.fetchone()
            if row is None:  #i.e. if the product in the file PRODSWITHQUESTS_IDS was excluded from the previous random subsampling
                continue
            # The stored string is a separator-terminated sequence; wrap it in
            # brackets so it parses as a Python list literal.
            candidatenegativeqs_rawstring = row[1]
            candidatenegativeqs_string = "[" + candidatenegativeqs_rawstring[:
                                                                             -1] + "]"

            candidatenegativeqs_ls = ast.literal_eval(
                candidatenegativeqs_string)
            # Keep only the questions that were NOT asked about this product.
            candidatenegativeqs_ls1 = [
                q_id for q_id in candidatenegativeqs_ls if q_id not in asked_qs
            ]

            if dataset_typeflag == MyUtils_flags.FLAG_TRAIN:
                p1_row = MyUtils_dbs.search_in_alltables_db(
                    dbcursor=product_reps_c,
                    query_pretext="SELECT * FROM",
                    query_aftertext=" WHERE id='" + str(prod_id) + "'")[0]
                candidatenegativeqs_asins = list(
                    map(lambda q_id: q_id[0:10], candidatenegativeqs_ls1))

                # Fix: str(tuple(...)) yields "('x',)" for a single element and
                # "()" for none -- both invalid SQL. Build the IN list manually
                # and skip the lookup entirely when there are no candidates.
                if candidatenegativeqs_asins:
                    in_list = "(" + ",".join(
                        "'" + asin + "'"
                        for asin in candidatenegativeqs_asins) + ")"
                    p2_rows = MyUtils_dbs.search_in_alltables_db(
                        dbcursor=product_reps_c,
                        query_pretext="SELECT * FROM",
                        query_aftertext="WHERE id IN " + in_list)
                else:
                    p2_rows = []
                # NOTE(review): assumes the returned rows are aligned 1:1 with
                # candidatenegativeqs_ls1 -- confirm the ordering guarantees of
                # search_in_alltables_db before relying on this zip.
                qids_and_p2rows = list(zip(candidatenegativeqs_ls1, p2_rows))

                for q_id, p2_row in qids_and_p2rows:
                    if p2_row is not None and len(p2_row) > 0:
                        #there are questions without corresponding products, in which case no similarity check is to be done
                        p1_tuple = MyUtils.prodls_tonamedtuple(p1_row)
                        p2_tuple = MyUtils.prodls_tonamedtuple(p2_row)
                        p1_p2_sim, _simparts = PS.compute_2products_similarity_singleprocess(
                            prod1_tuple=p1_tuple,
                            prod2_tuple=p2_tuple,
                            d2v_model=doc2vec_model)
                        if p1_p2_sim > p_sim_breakpoint:
                            candidatenegativeqs_ls1.remove(q_id)
                            logging.info(
                                "Removing question from the candidate negative examples, "
                                +
                                "because the similarity between %s and %s is > %s",
                                prod_id, p2_tuple.id, p_sim_breakpoint)
                logging.info(
                    "Choosing negative examples: P-to-p similarity checks done for product: %s",
                    prod_id)

            # Sample as many negatives as positives (or all, if fewer remain).
            random_indices = sorted(
                np.random.choice(a=range(len(candidatenegativeqs_ls1)),
                                 size=min(len(candidatenegativeqs_ls1),
                                          len(asked_qs)),
                                 replace=False,
                                 p=None))
            negativeqs_ls = [
                candidatenegativeqs_ls1[i] for i in random_indices
            ]
            prodsnegativeqs_outfile.write(prod_id + "_" + str(negativeqs_ls) +
                                          "\n")

    prodsnegativeqs_outfile.close()
def register_matches():
    """Find and record product-question matches for the training split.

    Online-learning variant of the parameterized register_matches() above:
    no feature-flag filtering, hard-coded training files, output written to
    F.ONLINE_PQMATCHES as "<prod_id>_<[question ids]>" lines.

    NOTE(review): this definition has the same name as the earlier
    register_matches(...) and therefore shadows it at module level -- confirm
    both are really meant to coexist in one module.

    :return: the number of products that have at least one matching question.
    """
    prods_filepath = F.PRODUCTS_FINAL_TRAIN
    quests_filepath = F.QUESTIONS_FINAL_TRAIN

    MyUtils.init_logging("OnlineLearning_RegisterMatches.log", logging.INFO)
    start = time()
    f = open(F.ONLINE_PQMATCHES, "w"); f.close()#clean outfile between runs
    ids_outfile = open(F.ONLINE_PQMATCHES, "a")
    ids_outfile.write("id_questionsAsked\n")

    prods_filehandler =  open(prods_filepath, "r", newline='')
    quests_filehandler = open(quests_filepath, "r", newline='')
    reader_1 = csv.reader(prods_filehandler, delimiter='_', quotechar='"')
    reader_2 = csv.reader(quests_filehandler, delimiter='_', quotechar='"')

    num_prods_withmatches = 0
    num_products_reviewed = 0
    num_questions_reviewed = 0
    last_prod_id = "x"  # sentinel; its length <= 5 also skips the first write below
    questionsasked_ids_ls = []
    ### init:
    # NOTE(review): each reader is advanced twice before the first record is
    # read -- presumably two header-ish lines per file; confirm the layout.
    # Also note the mix of MyUtils and utilities.MyUtils below -- presumably
    # the same module under two import paths; verify the imports.
    reader_1.__next__(); reader_2.__next__() ; reader_1.__next__(); reader_2.__next__()  #skip headers
    p_ls = reader_1.__next__()
    q_ls = reader_2.__next__()
    prod_t = MyUtils.prodls_tonamedtuple(p_ls, offset=0)
    quest_t = utilities.MyUtils.quest_lstonamedtuple(q_ls, offset=0)
    q_prod = (quest_t.id)[0:10]  # the product id embedded in the question id
    #loop: merge-join over the two id-sorted streams
    while True:
        try:
            match = False
            while not(match):
                # Advance the product stream while its id sorts before the
                # question's product id (length compared as a tiebreaker).
                while q_prod > prod_t.id or (len(q_prod) > len(prod_t.id)):
                    logging.debug("%s < %s", prod_t.id , q_prod)
                    p_ls = reader_1.__next__() #advance product
                    num_products_reviewed = num_products_reviewed + 1
                    prod_t = utilities.MyUtils.prodls_tonamedtuple(p_ls, offset=0)

                # Symmetrically advance the question stream.
                while q_prod < prod_t.id or (len(q_prod) < len(prod_t.id)):
                    logging.debug("%s > %s", prod_t.id, q_prod)
                    q_ls = reader_2.__next__() #advance question
                    num_questions_reviewed = num_questions_reviewed + 1
                    quest_t = utilities.MyUtils.quest_lstonamedtuple(q_ls, offset=0)
                    q_prod = (quest_t.id)[0:10]

                if q_prod == prod_t.id:
                    match = True
                    #barrier: feature filtering on products and questions; DB lookup:
                    logging.info("Match: product: %s , \t question: %s", prod_t.id, quest_t.id)
                    #positive_qs_ids_file.write(str(quest_t.id) + "\n")#store the question id (positive example)
                    if len(prod_t.id) > 5:  # ignore malformed/short ids
                        if prod_t.id != last_prod_id:
                            # New product: flush the previous product's group
                            # (unless it was the initial sentinel), then reset.
                            if len(last_prod_id) > 5:
                                ids_outfile.write(str(last_prod_id) + "_" + str(questionsasked_ids_ls) + "\n")#write the previous p and qs
                            questionsasked_ids_ls = [] #reset, and then append
                            questionsasked_ids_ls.append(quest_t.id)
                            last_prod_id = prod_t.id
                            num_prods_withmatches = num_prods_withmatches +1 #n: matches = number of products that have questions
                        else:
                            logging.info("***")
                            questionsasked_ids_ls.append(quest_t.id)#same product as previously; only append
                    #on to the next question:
                    q_ls = reader_2.__next__()
                    quest_t = utilities.MyUtils.quest_lstonamedtuple(q_ls, offset=0)
                    q_prod = (quest_t.id)[0:10]

        except StopIteration:
            # Normal termination: one of the two readers is exhausted.
            # NOTE(review): `exc_info` is presumably `from sys import exc_info`
            # -- verify the module imports; the last product's group is never
            # flushed to the file here, confirm whether that is intended.
            logging.warning("Exception information: %s", exc_info())
            break
    logging.info("Total number products that have matching questions: %s", num_prods_withmatches)
    logging.info("Products reviewed: %s", num_products_reviewed)
    logging.info("Questions reviewed: %s", num_questions_reviewed)

    end = time()
    logging.info("Time elapsed: %s", round(end - start,4))
    ids_outfile.close()
    prods_filehandler.close()
    quests_filehandler.close()
    #positive_qs_ids_file.close()
    return num_prods_withmatches
# Exemplo n.º 6
# 0
def attach_text_to_candidates(ranked_candidates_dbpath, prods_initial_dbpath, quests_initial_dbpath, prod_reps_dbpath, quest_reps_dbpath, final_outdb_path):
    """Join each ranked (p_id, q_id, distance) candidate with its original text.

    For every row of the ranked-candidates db, looks up the product's title,
    description and categories in the initial products db and the question's
    text in the initial questions db, then writes the enriched rows into a
    temporary SQLite db which is finally renamed to `final_outdb_path`.

    NOTE(review): q_id appears to encode "<asin(10)>@<unixTime>..." -- the
    slicing below (chars 11:14 vs 11:23) depends on that exact layout; confirm
    the id format used when the candidates were generated.
    """
    MyUtils.init_logging("Attach_text_to_candidates.log")

    candidates_nn_db = sqlite3.connect(ranked_candidates_dbpath)
    cands_db_c = candidates_nn_db.cursor()
    # Truncate the temp db file between runs, then create the output schema.
    f = open(F.RANKING_TEMP_DB, 'w'); f.close()
    temp_db = sqlite3.connect(F.RANKING_TEMP_DB)
    temp_db_c = temp_db.cursor()
    testprods_initial_c = sqlite3.connect(prods_initial_dbpath).cursor()
    testquests_initial_c = sqlite3.connect(quests_initial_dbpath).cursor()
    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()

    temp_db_c.execute('''CREATE TABLE candidates(  p_id varchar(63),
                                                   q_id varchar(63),
                                                   distance int,
                                                   p_titletext varchar(1023),
                                                   p_descriptiontext varchar(8191),
                                                   p_categorytext varchar (4095),
                                                   q_text varchar (8191)         
                                            )''')

    num_of_candidates = MyUtils_dbs.get_tot_num_rows_db(cands_db_c)
    logging.info(num_of_candidates)
    # Counter used to disambiguate several questions sharing the same base id
    # for the same product; reset whenever the product changes.
    counter_questionsameid = 0
    last_prod_id = 'x'  # sentinel so the first row triggers a product lookup

    # sqlite rowids are 1-based, hence the shifted range.
    for rowindex in range(1, num_of_candidates + 1):
        row = cands_db_c.execute("SELECT * FROM candidates WHERE rowid = ?", (rowindex,)).fetchone()
        prod_id = row[0]
        quest_id = row[1]
        distance = row[2]

        # Product-level info is fetched once per run of rows with the same
        # p_id; the variables persist across iterations for the reused rows.
        if last_prod_id != prod_id:
            product_titleinfo,product_descinfo, product_categinfo = \
                MyUtils_dbs.search_in_alltables_db(testprods_initial_c, "SELECT title, description, categories FROM",
                                                                  "WHERE asin = '" + str(prod_id) + "'")[0]
            product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                                        "WHERE id = '" + str(prod_id) + "'")[0]
            prod_tpl = MyUtils.prodls_tonamedtuple(product_representation, offset=1)

            counter_questionsameid = 0

        ###get question's unixTime from the q_id slice
        if len(quest_id)< 21: #format : @nan0  -- i.e. the unixTime field was NULL
            base_endpoint = 14
            question_unixTime = str(quest_id[11:base_endpoint])
        else:
            base_endpoint = 23
            question_unixTime = str(quest_id[11:base_endpoint])
        logging.debug("Question unixTime: %s", question_unixTime)

        if base_endpoint == 23: #if we have a valid unixTime specification
            # Prefix match on unixTime: the id may carry a truncated timestamp.
            possible_questions_text = MyUtils_dbs.search_in_alltables_db(testquests_initial_c, "SELECT question FROM",
                                                                  "WHERE asin = '" + str(quest_id[0:10]) + "'"
                                                              + " AND unixTime LIKE '" + question_unixTime + "%'")
        else: #if we have NULL in the unixTime field
            possible_questions_text = MyUtils_dbs.search_in_alltables_db(testquests_initial_c, "SELECT question FROM",
                                                                         "WHERE asin = '" + str(quest_id[0:10]) + "'"
                                                                         + " AND unixTime IS NULL")
        base_q_id = str(quest_id[0:23])
        possible_questions_reps = MyUtils_dbs.search_in_alltables_db(testquests_rep_c, "SELECT * FROM ",
                                                             "WHERE id LIKE '" + str(base_q_id) + "%'")
        logging.debug("possible_questions_reps: %s", possible_questions_reps)
        logging.debug("possible_questions_text:%s", possible_questions_text)

        if len(possible_questions_text) > 1:
            # Several questions share the base id: rank them by distance.
            # NOTE(review): qs_dist_lts_sorted is computed but the text is then
            # picked by the running counter, not by the sorted order -- confirm
            # which disambiguation was intended.
            possible_questions_tuples = list(map ( lambda q_ls : MyUtils.quest_lstonamedtuple(q_ls, offset=1), possible_questions_reps))
            possible_questions_distances = list(map (lambda q_tpl : CD.compute_dist_pq(prod_tpl, q_tpl) , possible_questions_tuples))

            qs_dist_lts = list(zip(possible_questions_tuples, possible_questions_distances))
            qs_dist_lts_sorted = sorted( qs_dist_lts, key=lambda tpl : tpl[1])
            question_textinfo = possible_questions_text[counter_questionsameid][0]
            counter_questionsameid= counter_questionsameid+1
        else:
            question_textinfo = possible_questions_text[0][0]
        logging.debug("question_textinfo: %s", question_textinfo)

        temp_db_c.execute("INSERT INTO candidates VALUES (?,?,?,?,?,?,?)", (prod_id, quest_id, distance,
                                                                          product_titleinfo, product_descinfo, product_categinfo, question_textinfo))
        logging.debug("***")

    temp_db.commit()
    # NOTE(review): the temp db connection is still open when the file is
    # renamed -- this works on POSIX but would fail on Windows; confirm.
    os.rename(F.RANKING_TEMP_DB , final_outdb_path)