Example #1
def run_offline_paper():
    client = Milvus(host=milvus_ip, port='19530')
    cur.execute("SELECT ID ,doc_vector FROM paper")
    papers = cur.fetchall()
    for i in papers:
        try:
            id = i[0]
            vec = [float(j) for j in i[1].split(",")]  # doc_vector is stored as a comma-separated string
            res = client.search(collection_name='ideaman',
                                query_records=[vec],
                                top_k=51)

            # The 0.x pymilvus SDK returns a status plus the result set.
            status = res[0].code
            if status == 0:
                topKqueryResult = [str(j) for j in res[-1]._id_array[0]]
                # Drop the first hit: a paper's nearest neighbour is itself.
                paper_vecs = ",".join(topKqueryResult[1:])
                sql = 'INSERT INTO offline_paper(paper_id , recs) VALUES({} , "{}")'.format(
                    id, paper_vecs)
                cur.execute(sql)
                try:
                    conn.commit()
                except Exception:
                    conn.rollback()
        except Exception:
            # Skip papers with malformed vectors or failed searches.
            pass
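
The INSERT above interpolates values straight into the SQL string. A safer, parameterized variant is sketched below, assuming cur is a DB-API cursor such as PyMySQL's (which uses %s placeholders); this is an illustration, not part of the original example.

sql = "INSERT INTO offline_paper(paper_id, recs) VALUES (%s, %s)"
# The driver handles quoting and escaping of both values.
cur.execute(sql, (id, paper_vecs))
conn.commit()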
Example #2
def get_user(u_id):
    """
    根据u_id,生成用户,如果该用户的所有感兴趣的tags交集中很少或者没有论文,重新生成.
    :param u_id: 用户id
    :return: name:用户名
    :return: tags:用户感兴趣的tags,格式为list
    :return: tag:用户感兴趣的tags,格式为字符串,如"1,2,3"
    :return SQL_2:用于获得该用户感兴趣tags的论文
    """
    # Create the user
    name = "tmp_%05d" % u_id
    tags = random.sample(interest_tags, random.randint(1, 3))
    tag = ",".join(tags)
    id = u_id + 7015
    id2tags[id] = tag

    # 2. For each user, click a random 2 to 8 papers that match all of their tags.
    conditions = []
    for t_id in tags:
        conditions.append('FIND_IN_SET("{}",tags)'.format(t_id))
    condition = " AND ".join(conditions)

    SQL_2 = 'SELECT id FROM paper WHERE ' + condition + ' ORDER BY RAND() LIMIT {}'.format(random.randint(2, 8))
    cur.execute(SQL_2)
    if len(cur.fetchall()) <= 4:
        # Too few papers match this tag combination: resample and retry.
        return get_user(u_id)
    else:
        return name, tags, tag, id, SQL_2
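
A hypothetical call site for get_user; interest_tags, id2tags, cur, and conn are module-level globals the function assumes:

# Hypothetical usage: create one synthetic user and fetch matching papers.
name, tags, tag, uid, SQL_2 = get_user(1)
cur.execute(SQL_2)
papers = cur.fetchall()
print(name, uid, tags, len(papers))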
Example #3
def get_total_recall():
    sql = "SELECT user_id , total_recall  FROM user_rec LIMIT 20;"
    cur.execute(sql)
    dic = {}
    res = cur.fetchall()
    for user_id, total_recall in res:
        dic[user_id] = total_recall
    for user_id in dic:
        print(user_id, dic[user_id])
    return dic
Example #4
def click(user_id, paper_id, flag=1):
    """
    将点击信息写入到mysql中
    :param user_id: 用户id
    :param paper_id:点击的论文id
    :return: null
    """
    SQL = 'INSERT INTO click_log(u_id,item_id,event_type) VALUES("{}","{}","{}")'.format(user_id, paper_id, flag)
    try:
        cur.execute(SQL)
        conn.commit()
    except Exception:
        conn.rollback()
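
A hypothetical usage, logging one positive and one negative event (the ids are made up):

# event_type 1 = clicked, 0 = shown but not clicked
click(7016, 42)           # user 7016 clicked paper 42
click(7016, 99, flag=0)   # user 7016 skipped paper 99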
Example #5
def gen_UIR():
    # Write positive samples: uid,itemid,rating,timestamp per line
    cur.execute(
        "SELECT u_id,item_id,event_type,add_time FROM click_log where event_type = 1"
    )
    res = cur.fetchall()
    with open("dataset/UIR.csv", "w", encoding='utf-8') as file:
        for item in res:
            uid, itemid, rating = item[0], item[1], item[2]
            timestamp = item[3].timestamp()  # add_time is a datetime column
            file.write(",".join(
                str(i) for i in [uid, itemid, rating, timestamp]) + "\n")

    # Write negative samples: one line per user, item ids space-separated
    cur.execute("""SELECT
        u_id,
        GROUP_CONCAT(DISTINCT item_id ORDER BY item_id ASC SEPARATOR ' ') as neg
    FROM
        click_log 
    WHERE
        event_type = 0 
    GROUP BY
      u_id
    ORDER BY
        LENGTH(neg) DESC""")
    res = cur.fetchall()
    with open("./dataset/UIR_negative.csv", "w", encoding='utf-8') as file:
        for uid, negs in res:
            file.write("{} {}\n".format(uid, negs))
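
To sanity-check the generated files, a minimal sketch using pandas (an assumption; any CSV reader works):

import pandas as pd

# UIR.csv has no header row: user id, item id, rating, unix timestamp.
uir = pd.read_csv("dataset/UIR.csv",
                  names=["uid", "itemid", "rating", "timestamp"])
print(uir.head())

# UIR_negative.csv is space-separated: user id, then space-joined item ids.
with open("dataset/UIR_negative.csv", encoding="utf-8") as f:
    uid, negs = f.readline().split(" ", 1)
    print(uid, negs.split()[:5])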
Example #6
def main(user_number=8000):
    """
    用于生成用户点击论文数据,
        1.生产指定数量的用户,指定每一位用户的感兴趣的标签[1,3]。将deleted位置设置为1
        2.随机为每一位用户的每个标签点击随机(2,8)篇的论文。
        3.随机选择用户,85%的概率对相应感兴趣的便签进行点击,15%的概率随机点击其他的论文,执行 8w,平均每个用户执行10次
    """

    # 1. Create the users with their interest tags; set deleted to 1.
    for u_id in range(1, user_number + 1):
        name, tags, tag, id, SQL_2 = get_user(u_id)
        cur.execute(SQL_2)
        paper_list = list(cur.fetchall())

        SQL_1 = 'INSERT INTO `user`(username,deleted,interest_tags) VALUES("{}", 1,"{}")'.format(name, tag)
        cur.execute(SQL_1)
        conn.commit()
        for paper_id in paper_list:
            click(id, paper_id[0])
        if u_id % 100 == 0:
            print("Users completed:", u_id)

    SQL_5 = "SELECT id,interest_tags FROM `user` WHERE id >= 10;"
    cur.execute(SQL_5)
    for u_item in cur.fetchall():
        id2tags[u_item[0]] = u_item[1].split(",")
    # 3. Randomly pick users and generate clicks.
    for num in range(user_number * 10):
        id = random.choice(list(id2tags.keys()))
        rand_num = random.random()
        if rand_num < 0.55:
            # 55%: positive click on a paper from one of the user's tags
            t_id = random.sample(id2tags.get(id), 1)[0]
            SQL_3 = 'SELECT id FROM paper WHERE FIND_IN_SET("{}",tags) ORDER BY RAND() LIMIT {}'.format(t_id, 1)
            cur.execute(SQL_3)
            paper_item = cur.fetchall()[0]
            paper_id = paper_item[0]
            click(id, paper_id)
        elif 0.55 <= rand_num < 0.9:
            # 35%: negative sample, a paper not carrying one of the user's tags
            t_id = random.sample(id2tags.get(id), 1)[0]
            SQL_3 = 'SELECT id FROM paper WHERE NOT FIND_IN_SET("{}",tags) ORDER BY RAND() LIMIT {}'.format(t_id, 1)
            cur.execute(SQL_3)
            paper_item = cur.fetchall()[0]
            paper_id = paper_item[0]
            click(id, paper_id, 0)
        else:
            # 10%: click a random paper
            SQL_4 = 'SELECT id FROM paper ORDER BY RAND() LIMIT 1'
            cur.execute(SQL_4)
            paper_item = cur.fetchall()[0]
            paper_id = paper_item[0]
            click(id, paper_id)
        if num % 1000 == 0:
            print("Clicks completed: %d" % num)
Example #7
def getPredictData(user_id: int, paper_id: int):
    # Initialisation: padding lengths for the feature blocks
    tags_pad, author_pad, doc2vec_pad = 16, 100, 160
    user_interest_tags_pad = 10

    model_name = 'bert-base-uncased'
    MODEL_PATH = './bert-base-uncased/'
    # Load the tokenizer from the vocabulary
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # Load the model config
    model_config = BertConfig.from_pretrained(model_name)
    # Load the weights from the local path with that config
    bert_model = BertModel.from_pretrained(MODEL_PATH, config=model_config)

    # Fetch the user record
    cur.execute("SELECT id, interest_tags FROM `user` WHERE id = %d" %
                (user_id, ))
    res = cur.fetchone()
    userid = [str(res[0])]
    user_interest_tags = str(res[1]).split(",")
    user_interest_tags += ['0'] * (user_interest_tags_pad -
                                   len(user_interest_tags))

    # Fetch the paper record
    cur.execute("""
        SELECT
            id,
            title,
            description,
            tags,
            AUTHORS,
            doc_vector 
        FROM
            paper 
        WHERE
            pwc_tasks <> ''
            AND doc_vector IS NOT NULL
            AND `authors` NOT LIKE '%one%'
            AND tags NOT LIKE '%one%'
            AND id = {}
    """.format(paper_id))
    item = cur.fetchone()

    paper_id = [str(item[0])]
    title = item[1]
    description = item[2]
    tags = item[3]
    authors = item[4]
    doc2vec = item[5]

    # Encode the title with BERT (pooler output)
    encoded_input = tokenizer(title, return_tensors='pt')
    title = bert_model(**encoded_input)['pooler_output'].tolist()[0]
    title = [str(i) for i in title]

    # Encode the description with BERT
    encoded_input = tokenizer(description, return_tensors='pt')
    description = bert_model(**encoded_input)['pooler_output'].tolist()[0]
    description = [str(i) for i in description]

    # Map tags to indices and pad
    tags = [add2Map(tags2idx, i) for i in tags.split(",")]
    tags += ['0'] * (tags_pad - len(tags))

    # Map authors to indices and pad
    authors = [add2Map(authors2idx, i) for i in authors.split(",")]
    authors += ['0'] * (author_pad - len(authors))

    doc2vec = doc2vec.split(",")
    doc2vec += ['0'] * (doc2vec_pad - len(doc2vec))

    line = userid + user_interest_tags + paper_id + title + description + tags + authors + doc2vec
    line = [float(i) for i in line]  # every field is a numeric string

    return line
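
A hypothetical call, producing one flat feature vector for a (user, paper) pair. With the padding lengths above and bert-base-uncased's 768-dim pooler output, the layout is 1 + 10 + 1 + 768 + 768 + 16 + 100 + 160 = 1824 values:

# Hypothetical usage: build the model input for user 7016 and paper 42.
features = getPredictData(7016, 42)
print(len(features))  # expected: 1824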
Example #8
def gen_Tensor():
    """
    第一步:下载数据
    1. 从相邻主机的Mysql中下载数据,格式:userid , user_interest_tags , paper_id ,title , description ,tags , authors ,
    doc_vector , add_time ,label
    2. 原文文本中包含逗号,都用|代替
    3. 将作者转换为独热编码,并补齐
    4. 将tags转换为独热编码并补齐
    5. 将pwc_tasks转换为独热编码并补齐
    """
    model_name = 'bert-base-uncased'
    MODEL_PATH = './bert-base-uncased/'
    # Load the tokenizer from the vocabulary
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # Load the model config
    model_config = BertConfig.from_pretrained(model_name)
    # Load the weights from the local path with that config
    bert_model = BertModel.from_pretrained(MODEL_PATH, config=model_config)
    sql = """SELECT
        log.u_id AS userid,
        `user`.interest_tags AS user_interest_tags,
        paper.id AS paperid,
        paper.title AS title,
        paper.description AS description,
        paper.tags AS tags,
        paper.`AUTHORS` AS `authors`,
        paper.doc_vector AS doc_vector,
        log.add_time AS add_time,
        log.event_type AS label 
    FROM
        ( SELECT u_id, item_id, event_type, add_time FROM click_log where u_id <= 10000) AS log
        INNER JOIN ( SELECT id, interest_tags FROM `user` ) AS `user`
        INNER JOIN (
        SELECT
            id,
            title,
            description,
            tags,
            AUTHORS,
            doc_vector 
        FROM
            paper 
        WHERE
            pwc_tasks <> ''
            AND doc_vector IS NOT NULL
            AND `authors` NOT LIKE '%one%'
            AND tags NOT LIKE '%one%'
        ) AS paper ON log.u_id = `user`.id 
        AND log.item_id = paper.id"""

    cur.execute(sql)
    res = cur.fetchall()

    tags2idx, authors2idx, pwc_tasks2idx = {}, {}, {}
    tags_pad, author_pad, doc2vec_pad = 16, 100, 160
    user_interest_tags_pad = 10
    f = open('dataset/tensor.csv', 'w', encoding='utf-8')

    for index, item in enumerate(res):
        userid = [str(item[0])]
        user_interest_tags = str(item[1]).split(",")
        user_interest_tags += ['0'] * (user_interest_tags_pad -
                                       len(user_interest_tags))

        paper_id = [str(item[2])]
        title = item[3]
        description = item[4]
        tags = item[5]
        authors = item[6]
        doc2vec = item[7]
        label = [str(item[9])]

        # Encode the title with BERT (pooler output)
        encoded_input = tokenizer(title, return_tensors='pt')
        title = bert_model(**encoded_input)['pooler_output'].tolist()[0]
        title = [str(i) for i in title]

        # Encode the description with BERT
        encoded_input = tokenizer(description, return_tensors='pt')
        description = bert_model(**encoded_input)['pooler_output'].tolist()[0]
        description = [str(i) for i in description]

        # Map tags to indices and pad
        tags = [add2Map(tags2idx, i) for i in tags.split(",")]
        tags += ['0'] * (tags_pad - len(tags))

        # Map authors to indices and pad
        authors = [add2Map(authors2idx, i) for i in authors.split(",")]
        authors += ['0'] * (author_pad - len(authors))

        doc2vec = doc2vec.split(",")
        doc2vec += ['0'] * (doc2vec_pad - len(doc2vec))

        line = ",".join(userid + user_interest_tags + paper_id + title +
                        description + tags + authors + doc2vec + label)
        f.write(line + '\n')
    f.close()

    map2file(authors2idx, 'dataset/authors2idx')
    map2file(tags2idx, 'dataset/tags2idx')
    map2file(pwc_tasks2idx, 'dataset/pwc_task2idx')
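
add2Map and map2file are not shown in these examples. A minimal sketch of what they plausibly do, inferred from the call sites (indices start at 1 so '0' stays free as the padding value; the mapping is dumped one pair per line):

# Hypothetical helpers matching the call sites above.
def add2Map(mapping, key):
    """Return the index for key, assigning the next free one on first sight."""
    if key not in mapping:
        mapping[key] = str(len(mapping) + 1)
    return mapping[key]

def map2file(mapping, path):
    """Dump a key -> index mapping as 'key,index' lines."""
    with open(path, "w", encoding="utf-8") as f:
        for k, v in mapping.items():
            f.write("{},{}\n".format(k, v))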