Example #1
    def __init__(self):
        self.logger = logging.getLogger(__name__)

        try:
            if os.path.getsize(self.proxy_filename) and utils.file_len(self.proxy_filename) > 0:
                with open(self.proxy_filename) as f:
                    self._proxy_list_file = list(filter(None, (line.rstrip() for line in f)))
            else:
                self.is_use_proxy = False

            self.dataCache.names_list = []

            if os.path.getsize(self.names_filename) and utils.file_len(self.names_filename) > 0:
                with open(self.names_filename, encoding="utf8") as f:
                    self.dataCache.names_list = list(filter(None, (line.rstrip() for line in f)))
            else:
                self.logger.error("Please input a names list.")
        except IOError:
            self.logger.error("proxy/name file I/O error")
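
Every example on this page leans on a file_len helper to count a file's lines before processing it. Each project ships its own version in a utils module; below is a minimal sketch of the common line-counting idiom, assuming nothing more than a readable text file. The actual helpers may differ.

def file_len(fname):
    """Return the number of lines in fname (0 for an empty file)."""
    count = 0
    with open(fname, encoding="utf8", errors="ignore") as f:
        for count, _ in enumerate(f, start=1):
            pass
    return count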
Example #2
    def filter(cls, field, value, infile=None, outfile=None):
        """Filters a GTF file based on a column in the data.
    
        :param field: Str field to examine.
        :type field: str.
        :param value: Criteria to filter column on.
        :type column: str.
        """
        # Set the output file (defaults to stdout)
        log('Opening GTF output file: %s', outfile or 'stdout')
        fo = sys.stdout
        if outfile is not None:
            fo = open(outfile, 'wb')

        # Use a different loop depending on the input source
        log('Opening GTF input file: %s', infile or 'stdin')
        fi = sys.stdin
        count = 0
        if infile is not None:
            # Infile was provided
            try:
                fi = open(infile, 'rb')
            except IOError:
                sys.stderr.write('Cannot open file %s\n' % infile)
            log('Estimating time to compute...')
            flen = file_len(infile)
            log('CSV reader starting...')
            csvfile = csv.reader(fi, delimiter='\t')
            log('Beginning import')
            count = 0
            prev = None
            for row in csvfile:
                percent = int(float(count) / float(flen) * 100)
                if len(row) == 9:
                    gtf = GTF(row)
                    if prev != percent:
                        sys.stderr.write("\rPercent Complete: %d%%" % percent)
                        sys.stderr.flush()
                    if getattr(gtf, field) == value:
                        fo.write('\t'.join(row) + '\n')
                count += 1
                prev = percent
            fi.close()
        else:
            count = 0
            interval = 100
            for line in sys.stdin:
                gtf = GTF(line.strip('\n').split('\t'))
                if getattr(gtf, field) == value:
                    fo.write(line)
                if count % interval == 0:
                    sys.stderr.write("\rLines Read: %d" % count)
                    sys.stderr.flush()
                count += 1
        if outfile is not None:
            fo.close()
        log('\nFinished filtering file')
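
A hypothetical invocation of the method above, assuming GTF exposes the nine GTF columns as attributes; the field name 'feature' and the file paths are illustrative only.

# Hypothetical usage: keep only rows whose feature column equals 'exon'.
GTF.filter('feature', 'exon', infile='annotations.gtf', outfile='exons.gtf')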
Example #3
    def __init__(self):
        self.last_line = ""
        self.in_house_rooms = []
        # Only rooms being parsed
        self.open_rooms = []
        self.enqueued_rooms = []

        self.room_count = 0
        self.lineLimit = 50

        self.lines_in_file = utils.file_len("maps/rooms.txt")
Example #4
def set_data():
    section_id = request.values.get("section_id")
    path = "current_dataset/" + section_id + ".csv"
    if file_len(path) == 60:
        # Keep a rolling window of 60 rows: drop the oldest line, append the new values.
        with open(path) as file_:
            lines = file_.readlines()[1:]
        lines.append(request.values.get("values"))
        with open(path, "w") as file_:
            file_.writelines(lines)
    else:
        with open(path, "a") as file_:
            file_.write(request.values.get("values"))
    return "OK"
Example #5
def extract_image_urls(source, save_loc='image_urls.csv', batch_size=20000):
    articles = []
    num = 0
    _max = file_len(source)
    print(_max)
    with open(source, 'rb') as file_reader:
        for article in jl.reader(file_reader):
            articles.append(article)
            num += 1
            if num % batch_size == 0:
                save_as_csv(get_image_url(articles), path=save_loc)
                print('Processed {} of {}'.format(num, _max))
                articles = []
    # Save any remaining articles once the reader is exhausted.
    if articles:
        save_as_csv(get_image_url(articles), path=save_loc)
        print('Processed {} of {}'.format(num, _max))
    print('Finished extracting URLs')
Example #6
def image_fetcher(source,
                  save_loc=os.curdir,
                  start_index=0,
                  batch_size=50,
                  _max=None,
                  max_retries=1):
    const_start = start_index
    # _max is resolved here (an explicit bound relative to the start, or the
    # file's line count), so it is never None in the loop below.
    _max = int(const_start + _max) if _max is not None else file_len(source)
    start_of_process = time.time()
    total = 0
    while True:
        start_time = time.time()
        if start_index >= _max:
            return
        get_images(filepath=source,
                   save_loc=save_loc,
                   batch_size=batch_size,
                   start_index=start_index,
                   _max=_max,
                   recursion_depth=max_retries)
        start_index += batch_size
        end_time = time.time()
        total_time = end_time - start_of_process
        total += batch_size
        print('Processed {} in {:.3g} seconds'.format(batch_size,
                                                      end_time - start_time))
    print('Progress: {} in ~{:.3g} minutes'.format(total, total_time / 60))
        print(
            '---------------------------------------------------------------------'
        )
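
A hypothetical call to the function above. Because _max is added to start_index, it reads as "this many more lines from here"; the file name is illustrative.

# Hypothetical usage: resume at line 10000 and fetch 5000 further images.
image_fetcher('image_urls.csv', save_loc='images/', start_index=10000,
              batch_size=50, _max=5000)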
Example #7
def main():
    if os.path.exists(twpc_helper.save_path):
        twpc_helper.save_path = options(twpc_helper.save_path)
        print(f"New path: {twpc_helper.save_path}")
    articles = []
    end = file_len(twpc_helper.source_path)
    with open(twpc_helper.source_path, "rb") as f:
        for i, article in enumerate(jl_reader(f), start=1):
            article["contents"] = list(filter(None, article["contents"]))
            article["category"], article["subcategory"] = get_categories(
                article)
            # Ugly code, but significantly speeds up the process if a category is set
            if twpc_helper.category is not None and not article[
                    "category"] == twpc_helper.category:
                if i % twpc_helper.batch_size == 0 or i == end:
                    if len(articles) > 0:
                        save_as_csv(articles)
                        articles = []
                    print(f"Progress: {i} / {end}.")
                continue
            article["text"] = stringify_contents(article)
            article["date"], article["time"] = unix_to_dt(article)
            article["image_url"], article[
                "image_caption"] = get_image_url_and_caption(article)
            article["author_bio"] = get_author_bio(article)
            if article["title"] is None or article["title"] == "":
                article["title"] = np.nan
            if article["author"] == "":
                article["author"], article[
                    'subtype'] = get_author_if_compilation(article)
            else:
                article["subtype"] = "standalone"
            discard_properties(article)
            articles.append(article)
            if i % twpc_helper.batch_size == 0 or i == end:
                save_as_csv(articles)
                articles = []
                print(f"Progress: {i} / {end}.")
Example #8
def main():
    print "load model..."
    param = cPickle.load(open(workdir + "model_dns_ori.pkl"))  # load the pretrained user/item parameters (probably the same data as the .txt files, in pickle format)
    generator = GEN(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.0 / BATCH_SIZE, param=param, initdelta=INIT_DELTA,
                    learning_rate=0.001)
    discriminator = DIS(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.1 / BATCH_SIZE, param=None, initdelta=INIT_DELTA,
                        learning_rate=0.001)

    # ---- allocate GPU memory on demand ----
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())  # initialize all variables
    #-----------------------------------------------------------

    print "gen ", simple_test(sess, generator)#这里不知道在干什么
    print "dis ", simple_test(sess, discriminator)

    dis_log = open(workdir + 'dis_log.txt', 'w')  # metric logs for D and G
    gen_log = open(workdir + 'gen_log.txt', 'w')
    # minimax training: D learns from real positives mixed with negatives generated by G
    best = 0.  # best precision@5 seen so far
    for epoch in range(15):
        if epoch >= 0:
            for d_epoch in range(100):
                if d_epoch % 5 == 0:
                    generate_for_d(sess, generator, DIS_TRAIN_FILE)  # sample items for each user from G's distribution into the D training file
                    train_size = ut.file_len(DIS_TRAIN_FILE)  # number of lines in dis-train.txt
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:  # a full batch still fits in the remaining lines
                        input_user, input_item, input_label = ut.get_batch_data(DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        input_user, input_item, input_label = ut.get_batch_data(DIS_TRAIN_FILE, index,
                                                                                train_size - index + 1)
                    index += BATCH_SIZE
        ###########################################################################################################
                    _ = sess.run(discriminator.d_updates,  # one gradient step on D
                                 feed_dict={discriminator.u: input_user, discriminator.i: input_item,
                                            discriminator.label: input_label})  # feed the sampled batch into D's placeholders

####################### Train G: policy-gradient updates on items sampled from G #######################
            for g_epoch in range(50):  # 50
                for u in user_pos_train:  # users in the training set
                    sample_lambda = 0.2
                    pos = user_pos_train[u]
                    # importance-sampling setup
                    rating = sess.run(generator.all_logits, {generator.u: u})  # fetch G's logits for user u
                    exp_rating = np.exp(rating)  # softmax numerator
                    prob = exp_rating / np.sum(exp_rating)  # prob is the generator distribution p_\theta

                    pn = (1 - sample_lambda) * prob
                    pn[pos] += sample_lambda * 1.0 / len(pos)
                    # Now, pn is the Pn in importance sampling, prob is generator distribution p_\theta
                    # pn mixes p_\theta with extra probability mass on the user's positive items
                    sample = np.random.choice(np.arange(ITEM_NUM), 2 * len(pos), p=pn)  # draw 2 * len(pos) item indices according to pn
                    ###########################################################################
                    # Get reward from D and adapt it with importance sampling
                    ###########################################################################
                    reward = sess.run(discriminator.reward, {discriminator.u: u, discriminator.i: sample})  # feed the user and sampled items into D to get rewards
                    reward = reward * prob[sample] / pn[sample]
                    ###########################################################################
                    # Update G
                    ###########################################################################
                    _ = sess.run(generator.gan_updates,
                                 {generator.u: u, generator.i: sample, generator.reward: reward})

                result = simple_test(sess, generator)
                print "epoch ", epoch, "gen: ", result
                buf = '\t'.join([str(x) for x in result])  # join metrics into one tab-separated line
                gen_log.write(str(epoch) + '\t' + buf + '\n')  # log this epoch's metrics
                gen_log.flush()

                p_5 = result[1]
                if p_5 > best:
                    print 'best: ', result
                    best = p_5
                    generator.save_model(sess, "ml-100k/gan_generator.pkl")

    gen_log.close()
    dis_log.close()
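
The G-step above is importance sampling: items are drawn from pn (p_theta with extra mass on the positive items) rather than from p_theta itself, so D's reward is reweighted by p_theta / pn. A self-contained sketch of that correction with toy numbers, not taken from the original experiment:

import numpy as np

sample_lambda = 0.2
prob = np.array([0.1, 0.4, 0.2, 0.2, 0.1])   # generator distribution p_theta over 5 items
pos = [1, 3]                                  # the user's positive items

pn = (1 - sample_lambda) * prob               # proposal distribution Pn
pn[pos] += sample_lambda / len(pos)           # extra mass on positives; pn still sums to 1
sample = np.random.choice(np.arange(len(prob)), 2 * len(pos), p=pn)

reward = np.ones(len(sample))                 # stand-in for D's reward on the sampled items
reward = reward * prob[sample] / pn[sample]   # importance-sampling correction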
Example #9
def main():
    print "load model..."
    param = cPickle.load(open(workdir + "model_dns_ori.pkl"))
    generator = GEN(ITEM_NUM,
                    USER_NUM,
                    EMB_DIM,
                    lamda=0.0 / BATCH_SIZE,
                    param=None,
                    initdelta=INIT_DELTA,
                    learning_rate=0.001)
    discriminator = DIS(ITEM_NUM,
                        USER_NUM,
                        EMB_DIM,
                        lamda=0.1 / BATCH_SIZE,
                        param=None,
                        initdelta=INIT_DELTA,
                        learning_rate=0.001)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    print "gen ", simple_test(sess, generator)
    print "dis ", simple_test(sess, discriminator)

    dis_log = open(workdir + 'dis_log.txt', 'w')
    gen_log = open(workdir + 'gen_log.txt', 'w')

    # minimax training
    best = 0.
    for epoch in range(300):
        if epoch >= 0:
            for d_epoch in range(100):
                if d_epoch % 5 == 0:
                    generate_for_d(sess, generator, DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE

                    _ = sess.run(discriminator.d_updates,
                                 feed_dict={
                                     discriminator.u: input_user,
                                     discriminator.i: input_item,
                                     discriminator.label: input_label
                                 })

            # Train G
            for g_epoch in range(50):  # 50
                for u in user_pos_train:
                    sample_lambda = 0.2
                    pos = user_pos_train[u]

                    rating = sess.run(generator.all_logits, {generator.u: u})
                    exp_rating = np.exp(rating)
                    prob = exp_rating / np.sum(
                        exp_rating)  # prob is generator distribution p_\theta

                    pn = (1 - sample_lambda) * prob
                    pn[pos] += sample_lambda * 1.0 / len(pos)
                    # Now, pn is the Pn in importance sampling, prob is generator distribution p_\theta

                    sample = np.random.choice(np.arange(ITEM_NUM),
                                              2 * len(pos),
                                              p=pn)
                    ###########################################################################
                    # Get reward and adapt it with importance sampling
                    ###########################################################################
                    reward = sess.run(discriminator.reward, {
                        discriminator.u: u,
                        discriminator.i: sample
                    })
                    reward = reward * prob[sample] / pn[sample]
                    ###########################################################################
                    # Update G
                    ###########################################################################
                    _ = sess.run(
                        generator.gan_updates, {
                            generator.u: u,
                            generator.i: sample,
                            generator.reward: reward
                        })

                result = simple_test(sess, generator)
                print "epoch ", epoch, "gen: ", result
                buf = '\t'.join([str(x) for x in result])
                gen_log.write(str(epoch) + '\t' + buf + '\n')
                gen_log.flush()

                p_5 = result[4]
                if p_5 > best:
                    print 'best: ', result
                    best = p_5
                    generator.save_model(sess, "ml-100k/gan_generator.pkl")

    gen_log.close()
    dis_log.close()
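
Several of these training loops repeat the same batching idiom: walk a 1-indexed file in BATCH_SIZE steps and shrink the final batch to whatever remains (hence the test index + BATCH_SIZE <= train_size + 1). Below is the same logic factored into a generator; batch_bounds is a hypothetical helper name, not part of the ut module.

def batch_bounds(train_size, batch_size):
    """Yield (start_index, size) pairs covering lines 1..train_size."""
    index = 1
    while index <= train_size:
        # a full batch if it fits, otherwise the remaining tail
        if index + batch_size <= train_size + 1:
            size = batch_size
        else:
            size = train_size - index + 1
        yield index, size
        index += batch_size

# e.g. list(batch_bounds(10, 4)) -> [(1, 4), (5, 4), (9, 2)]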
Example #10
def main():
    print("loading model...")
    generator = GEN(ITEM_NUM,USER_NUM,EMB_DIM,lamda = 0.0 / BATCH_SIZE,param = None,initdelta= INIT_DELTA,
                    learning_rate = 0.001)
    discriminator = DIS(ITEM_NUM,USER_NUM,EMB_DIM,lamda = 0.1/BATCH_SIZE,param=None,initdelta = INIT_DELTA,
                        learning_rate = 0.001)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        print("gen ",simple_test(sess,generator))
        print("dis ",simple_test(sess,discriminator))

        dis_log = open(workdir + 'dis_log.txt','w')
        gen_log = open(workdir + 'gen_log.txt','w')

        best = 0.
        for epoch in range(15):
            if epoch >= 0:
                for d_epoch in range(100):
                    if d_epoch % 5 == 0:
                        generate_for_d(sess,generator,DIS_TRAIN_FILE)
                        train_size = ut.file_len(DIS_TRAIN_FILE)
                    index = 1
                    while True:
                        if index > train_size:
                            break
                        if index + BATCH_SIZE <= train_size + 1:
                            input_user,input_item,input_label = ut.get_batch_data(DIS_TRAIN_FILE,index,BATCH_SIZE)
                        else:
                            input_user,input_item,input_label = ut.get_batch_data(DIS_TRAIN_FILE,index,train_size-index+1)
                        index += BATCH_SIZE

                        _ = sess.run(discriminator.d_updates,feed_dict={
                            discriminator.u:input_user,discriminator.i:input_item,discriminator.label:input_label
                        })

                for g_epoch in range(50):
                    for u in user_pos_train:
                        sample_lambda = 0.2
                        pos = user_pos_train[u]

                        rating = sess.run(generator.all_logits,{generator.u:u})
                        exp_rating = np.exp(rating)
                        prob = exp_rating / np.sum(exp_rating)

                        pn = (1-sample_lambda) * prob
                        pn[pos] += sample_lambda * 1.0 / len(pos)

                        sample = np.random.choice(np.arange(ITEM_NUM), 2 * len(pos), p=pn)

                        reward = sess.run(discriminator.reward, {discriminator.u: u, discriminator.i: sample})
                        reward = reward * prob[sample] / pn[sample]

                        _ = sess.run(generator.gan_updates,
                                     {generator.u: u, generator.i: sample, generator.reward: reward})

                    result = simple_test(sess, generator)
                    print("epoch ", epoch, "gen: ", result)
                    buf = '\t'.join([str(x) for x in result])
                    gen_log.write(str(epoch) + '\t' + buf + '\n')
                    gen_log.flush()

                    p_5 = result[1]
                    if p_5 > best:
                        print('best: ', result)
                        best = p_5


        gen_log.close()
        dis_log.close()
Example #11
def main():
    p_best_val = 0.0
    ndcg_best_val = 0.0

    for epoch in range(30):
        if epoch >= 0:
            print('Training D ...')
            for d_epoch in range(100):
                if d_epoch % 30 == 0:
                    generate_for_d(DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)

                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE

                    pred_data = []
                    pred_data.extend(input_pos)
                    pred_data.extend(input_neg)
                    pred_data = np.asarray(pred_data)

                    pred_data_label = [1.0] * len(input_pos)
                    pred_data_label.extend([0.0] * len(input_neg))
                    pred_data_label = np.asarray(pred_data_label)

                    loss_d = discriminator(torch.tensor(pred_data), torch.tensor(pred_data_label)) \
                            + WEIGHT_DECAY * (criterion(D_w1) + criterion(D_w2)
                                           + criterion(D_b1) + criterion(D_b2))
                    optimizer_D.zero_grad()
                    loss_d.backward()
                    optimizer_D.step()
                print("\r[D Epoch %d/%d] [loss: %f]" %
                      (d_epoch, 100, loss_d.item()))

        print('Training G ...')
        for g_epoch in range(30):
            num = 0
            for query in query_pos_train.keys():
                pos_list = query_pos_train[query]
                pos_set = set(pos_list)
                all_list = query_index_url[query]

                all_list_feature = [
                    query_url_feature[query][url] for url in all_list
                ]
                all_list_feature = np.asarray(all_list_feature)
                # pdb.set_trace()
                with torch.cuda.device(device[0]):
                    all_list_score = generator.module.pred_score(
                        torch.tensor(all_list_feature).cuda())
                all_list_score = all_list_score.detach().cpu().numpy()
                # softmax for all
                exp_rating = np.exp(all_list_score - np.max(all_list_score))
                prob = exp_rating / np.sum(exp_rating)

                prob_IS = prob * (1.0 - LAMBDA)

                for i in range(len(all_list)):
                    if all_list[i] in pos_set:
                        prob_IS[i] += (LAMBDA / (1.0 * len(pos_list)))
                # pdb.set_trace()
                choose_index = np.random.choice(np.arange(len(all_list)),
                                                [5 * len(pos_list)],
                                                p=prob_IS.reshape(-1, ))
                choose_list = np.array(all_list)[choose_index]
                choose_feature = [
                    query_url_feature[query][url] for url in choose_list
                ]
                choose_IS = np.array(prob)[choose_index] / np.array(
                    prob_IS)[choose_index]

                choose_index = np.asarray(choose_index)
                choose_feature = np.asarray(choose_feature)
                choose_IS = np.asarray(choose_IS)
                with torch.cuda.device(device[0]):
                    choose_reward = discriminator.module.get_reward(
                        torch.tensor(choose_feature).cuda())
                choose_reward.detach_()

                loss_g = generator(torch.tensor(all_list_feature).cuda(), torch.tensor(choose_index), choose_reward, torch.tensor(choose_IS)) \
                        + WEIGHT_DECAY * (criterion(G_w1) + criterion(G_w2)
                                   + criterion(G_b1) + criterion(G_b2))
                # pdb.set_trace()

                optimizer_G.zero_grad()
                loss_g.backward()
                optimizer_G.step()
                num += 1
                # if num == 200:
                #     pdb.set_trace()
            print("\r[G Epoch %d/%d] [loss: %f]" %
                  (g_epoch, 30, loss_g.item()))
            # pdb.set_trace()
            p_5 = precision_at_k(device,
                                 generator,
                                 query_pos_test,
                                 query_pos_train,
                                 query_url_feature,
                                 k=5)
            ndcg_5 = ndcg_at_k(device,
                               generator,
                               query_pos_test,
                               query_pos_train,
                               query_url_feature,
                               k=5)

            if p_5 > p_best_val:
                p_best_val = p_5
                ndcg_best_val = ndcg_5
                print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)
            elif p_5 == p_best_val:
                if ndcg_5 > ndcg_best_val:
                    ndcg_best_val = ndcg_5
                    print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)
            #validation
            # p_5 = precision_at_k(val_loader, 5)
            # if p_5 > p_best_val:
            #     p_best_val = p_5
            #     print("Best:", "gen p@5 ", p_5)
            #     torch.save(recipe_emb.state_dict(), 'saved_models/recipe_emb_%d_%.3f.pth' % (epoch, p_5))
            #     param_num = 1
            #     for param in DG_param:
            #         torch.save(param, 'saved_models/param%d_%d_%.3f.pt' % (param_num, epoch, p_5))
            #         param_num += 1
    p_1_best = precision_at_k(device,
                              generator,
                              query_pos_test,
                              query_pos_train,
                              query_url_feature,
                              k=1)
    p_3_best = precision_at_k(device,
                              generator,
                              query_pos_test,
                              query_pos_train,
                              query_url_feature,
                              k=3)
    p_5_best = precision_at_k(device,
                              generator,
                              query_pos_test,
                              query_pos_train,
                              query_url_feature,
                              k=5)
    p_10_best = precision_at_k(device,
                               generator,
                               query_pos_test,
                               query_pos_train,
                               query_url_feature,
                               k=10)

    ndcg_1_best = ndcg_at_k(device,
                            generator,
                            query_pos_test,
                            query_pos_train,
                            query_url_feature,
                            k=1)
    ndcg_3_best = ndcg_at_k(device,
                            generator,
                            query_pos_test,
                            query_pos_train,
                            query_url_feature,
                            k=3)
    ndcg_5_best = ndcg_at_k(device,
                            generator,
                            query_pos_test,
                            query_pos_train,
                            query_url_feature,
                            k=5)
    ndcg_10_best = ndcg_at_k(device,
                             generator,
                             query_pos_test,
                             query_pos_train,
                             query_url_feature,
                             k=10)

    # map_best = MAP(sess, generator, query_pos_test, query_pos_train, query_url_feature)
    # mrr_best = MRR(sess, generator, query_pos_test, query_pos_train, query_url_feature)

    print("Best ", "p@1 ", p_1_best, "p@3 ", p_3_best, "p@5 ", p_5_best,
          "p@10 ", p_10_best)
    print("Best ", "ndcg@1 ", ndcg_1_best, "ndcg@3 ", ndcg_3_best, "ndcg@5 ",
          ndcg_5_best, "p@10 ", ndcg_10_best)
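
precision_at_k in this example scores the generator by ranking each query's candidate URLs and counting how many of the top k are known positives. A simplified sketch of the metric with an illustrative signature (raw scores, candidates, and a positive set rather than the project's device/generator arguments):

import numpy as np

def precision_at_k(scores, candidates, pos_set, k=5):
    """Fraction of the k highest-scoring candidates that are positives."""
    top_k = np.argsort(-np.asarray(scores))[:k]
    hits = sum(1 for i in top_k if candidates[i] in pos_set)
    return hits / float(k)

# e.g. precision_at_k([0.9, 0.1, 0.8], ['a', 'b', 'c'], {'a', 'b'}, k=2) -> 0.5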
Example #12
discriminator = Discriminator(
    ITEM_NUM, USER_NUM,EMB_DIM, lamda=0.0 / BATCH_SIZE,
    param=None, initdelta=INIT_DELTA)

g_optimizer = torch.optim.SGD(
    generator.parameters(), lr=0.001, momentum=0.9)

d_optimizer = torch.optim.SGD(
    discriminator.parameters(), lr=0.001, momentum=0.9)

for epoch in range(15):
    if epoch >= 0:
        for d_epoch in range(100):
            if d_epoch % 5 == 0:
                generate_for_d(generator, DIS_TRAIN_FILE)
                train_size = ut.file_len(DIS_TRAIN_FILE)
            index = 1
            while True:
                if index > train_size:
                    break
                if index + BATCH_SIZE <= train_size + 1:
                    users, items, labels = ut.get_batch_data(
                        DIS_TRAIN_FILE, index, BATCH_SIZE)
                else:
                    users, items, labels = ut.get_batch_data(
                        DIS_TRAIN_FILE, index, train_size - index + 1)
                index += BATCH_SIZE  # advance to the next batch

                loss_d = discriminator(users, items, labels)
                d_optimizer.zero_grad()
                loss_d.backward()
                d_optimizer.step()
Example #13
def main():
    discriminator = DIS(FEATURE_SIZE,
                        HIDDEN_SIZE,
                        WEIGHT_DECAY,
                        D_LEARNING_RATE,
                        param=None)
    generator = GEN(FEATURE_SIZE,
                    HIDDEN_SIZE,
                    WEIGHT_DECAY,
                    G_LEARNING_RATE,
                    temperature=TEMPERATURE,
                    param=None)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())

    print('start adversarial training')

    p_best_val = 0.0
    ndcg_best_val = 0.0

    for epoch in range(30):
        if epoch >= 0:
            # G generate negative for D, then train D
            print('Training D ...')
            for d_epoch in range(100):
                if d_epoch % 30 == 0:
                    generate_for_d(sess, generator, DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)

                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE

                    pred_data = []
                    pred_data.extend(input_pos)
                    pred_data.extend(input_neg)
                    pred_data = np.asarray(pred_data)

                    pred_data_label = [1.0] * len(input_pos)
                    pred_data_label.extend([0.0] * len(input_neg))
                    pred_data_label = np.asarray(pred_data_label)

                    _ = sess.run(discriminator.d_updates,
                                 feed_dict={
                                     discriminator.pred_data: pred_data,
                                     discriminator.pred_data_label:
                                     pred_data_label
                                 })
        # Train G
        print('Training G ...')
        for g_epoch in range(30):
            for query in query_pos_train.keys():
                pos_list = query_pos_train[query]
                pos_set = set(pos_list)
                all_list = query_index_url[query]

                all_list_feature = [
                    query_url_feature[query][url] for url in all_list
                ]
                all_list_feature = np.asarray(all_list_feature)
                all_list_score = sess.run(
                    generator.pred_score,
                    {generator.pred_data: all_list_feature})

                # softmax for all
                exp_rating = np.exp(all_list_score - np.max(all_list_score))
                prob = exp_rating / np.sum(exp_rating)

                prob_IS = prob * (1.0 - LAMBDA)

                for i in range(len(all_list)):
                    if all_list[i] in pos_set:
                        prob_IS[i] += (LAMBDA / (1.0 * len(pos_list)))

                choose_index = np.random.choice(np.arange(len(all_list)),
                                                [5 * len(pos_list)],
                                                p=prob_IS)
                choose_list = np.array(all_list)[choose_index]
                choose_feature = [
                    query_url_feature[query][url] for url in choose_list
                ]
                choose_IS = np.array(prob)[choose_index] / np.array(
                    prob_IS)[choose_index]

                choose_index = np.asarray(choose_index)
                choose_feature = np.asarray(choose_feature)
                choose_IS = np.asarray(choose_IS)

                choose_reward = sess.run(
                    discriminator.reward,
                    feed_dict={discriminator.pred_data: choose_feature})

                _ = sess.run(generator.g_updates,
                             feed_dict={
                                 generator.pred_data: all_list_feature,
                                 generator.sample_index: choose_index,
                                 generator.reward: choose_reward,
                                 generator.important_sampling: choose_IS
                             })

            p_5 = precision_at_k(sess,
                                 generator,
                                 query_pos_test,
                                 query_pos_train,
                                 query_url_feature,
                                 k=5)
            ndcg_5 = ndcg_at_k(sess,
                               generator,
                               query_pos_test,
                               query_pos_train,
                               query_url_feature,
                               k=5)

            if p_5 > p_best_val:
                p_best_val = p_5
                ndcg_best_val = ndcg_5
                generator.save_model(sess, GAN_MODEL_BEST_FILE)
                print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)
            elif p_5 == p_best_val:
                if ndcg_5 > ndcg_best_val:
                    ndcg_best_val = ndcg_5
                    generator.save_model(sess, GAN_MODEL_BEST_FILE)
                    print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)

    sess.close()
    param_best = cPickle.load(open(GAN_MODEL_BEST_FILE))
    assert param_best is not None
    generator_best = GEN(FEATURE_SIZE,
                         HIDDEN_SIZE,
                         WEIGHT_DECAY,
                         G_LEARNING_RATE,
                         temperature=TEMPERATURE,
                         param=param_best)
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())

    p_1_best = precision_at_k(sess,
                              generator_best,
                              query_pos_test,
                              query_pos_train,
                              query_url_feature,
                              k=1)
    p_3_best = precision_at_k(sess,
                              generator_best,
                              query_pos_test,
                              query_pos_train,
                              query_url_feature,
                              k=3)
    p_5_best = precision_at_k(sess,
                              generator_best,
                              query_pos_test,
                              query_pos_train,
                              query_url_feature,
                              k=5)
    p_10_best = precision_at_k(sess,
                               generator_best,
                               query_pos_test,
                               query_pos_train,
                               query_url_feature,
                               k=10)

    ndcg_1_best = ndcg_at_k(sess,
                            generator_best,
                            query_pos_test,
                            query_pos_train,
                            query_url_feature,
                            k=1)
    ndcg_3_best = ndcg_at_k(sess,
                            generator_best,
                            query_pos_test,
                            query_pos_train,
                            query_url_feature,
                            k=3)
    ndcg_5_best = ndcg_at_k(sess,
                            generator_best,
                            query_pos_test,
                            query_pos_train,
                            query_url_feature,
                            k=5)
    ndcg_10_best = ndcg_at_k(sess,
                             generator_best,
                             query_pos_test,
                             query_pos_train,
                             query_url_feature,
                             k=10)

    map_best = MAP(sess, generator_best, query_pos_test, query_pos_train,
                   query_url_feature)
    mrr_best = MRR(sess, generator_best, query_pos_test, query_pos_train,
                   query_url_feature)

    print("Best ", "p@1 ", p_1_best, "p@3 ", p_3_best, "p@5 ", p_5_best,
          "p@10 ", p_10_best)
    print("Best ", "ndcg@1 ", ndcg_1_best, "ndcg@3 ", ndcg_3_best, "ndcg@5 ",
          ndcg_5_best, "p@10 ", ndcg_10_best)
    print("Best MAP ", map_best)
    print("Best MRR ", mrr_best)
Example #14
def main():
    print("loading model...")
    generator = GEN(ITEM_NUM,
                    USER_NUM,
                    EMB_DIM,
                    lamda=0.0 / BATCH_SIZE,
                    param=None,
                    initdelta=INIT_DELTA,
                    learning_rate=0.001)
    discriminator = DIS(ITEM_NUM,
                        USER_NUM,
                        EMB_DIM,
                        lamda=0.1 / BATCH_SIZE,
                        param=None,
                        initdelta=INIT_DELTA,
                        learning_rate=0.001)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        print("gen ", simple_test(sess, generator))
        print("dis ", simple_test(sess, discriminator))

        dis_log = open(workdir + 'dis_log.txt', 'w')
        gen_log = open(workdir + 'gen_log.txt', 'w')

        best = 0.
        for epoch in range(15):
            if epoch >= 0:
                for d_epoch in range(100):
                    if d_epoch % 5 == 0:
                        generate_for_d(sess, generator, DIS_TRAIN_FILE)
                        train_size = ut.file_len(DIS_TRAIN_FILE)
                    index = 1
                    while True:
                        if index > train_size:
                            break
                        if index + BATCH_SIZE <= train_size + 1:
                            input_user, input_item, input_label = ut.get_batch_data(
                                DIS_TRAIN_FILE, index, BATCH_SIZE)
                        else:
                            input_user, input_item, input_label = ut.get_batch_data(
                                DIS_TRAIN_FILE, index, train_size - index + 1)
                        index += BATCH_SIZE

                        _ = sess.run(discriminator.d_updates,
                                     feed_dict={
                                         discriminator.u: input_user,
                                         discriminator.i: input_item,
                                         discriminator.label: input_label
                                     })

                for g_epoch in range(50):
                    for u in user_pos_train:
                        sample_lambda = 0.2
                        pos = user_pos_train[u]

                        rating = sess.run(generator.all_logits,
                                          {generator.u: u})
                        exp_rating = np.exp(rating)
                        prob = exp_rating / np.sum(exp_rating)

                        pn = (1 - sample_lambda) * prob
                        pn[pos] += sample_lambda * 1.0 / len(pos)

                        sample = np.random.choice(np.arange(ITEM_NUM),
                                                  2 * len(pos),
                                                  p=pn)

                        reward = sess.run(discriminator.reward, {
                            discriminator.u: u,
                            discriminator.i: sample
                        })
                        reward = reward * prob[sample] / pn[sample]

                        _ = sess.run(
                            generator.gan_updates, {
                                generator.u: u,
                                generator.i: sample,
                                generator.reward: reward
                            })

                    result = simple_test(sess, generator)
                    print("epoch ", epoch, "gen: ", result)
                    buf = '\t'.join([str(x) for x in result])
                    gen_log.write(str(epoch) + '\t' + buf + '\n')
                    gen_log.flush()

                    p_5 = result[1]
                    if p_5 > best:
                        print('best: ', result)
                        best = p_5

        gen_log.close()
        dis_log.close()
Example #15
                    os.path.basename(os.path.normpath(csv)))

    return unique_events, events_to_sources


csv = pd.read_csv(args.data,
                  header=0,
                  names=['timestamp', 'source', 'event'],
                  dtype={
                      'source': str,
                      'event': str
                  },
                  parse_dates=[0],
                  chunksize=args.chunksize)

length = file_len(args.data)
print('There are {} lines of data in {}.'.format(length, args.data))

t_start = time.time()

for i, chunk in enumerate(csv):
    data = defaultdict(list)

    t0 = time.time()

    for row in chunk.itertuples(index=False):
        timestamp, source, event = row
        if pd.notnull(source):
            data[source].append((timestamp, event))

    print('Demuxing chunk: {} seconds.'.format(pretty_float(time.time() - t0)),
Example #16
def main():
    #call discriminator, generator
    discriminator = DIS(FEATURE_SIZE, HIDDEN_SIZE, WEIGHT_DECAY, D_LEARNING_RATE)
    generator = GEN(FEATURE_SIZE, HIDDEN_SIZE, WEIGHT_DECAY, G_LEARNING_RATE, temperature=TEMPERATURE)
    print('start adversarial training')
    p_best_val = 0.0
    ndcg_best_val = 0.0
    for epoch in range(30):
        if epoch >= 0:
            # G generate negative for D, then train D
            print('Training D ...')
            for d_epoch in range(100):
                if d_epoch % 30 == 0:
                    generate_for_d(generator, DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_pos, input_neg = ut.get_batch_data(DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        input_pos, input_neg = ut.get_batch_data(DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE
                    pred_data = []
                    #prepare pos and neg data
                    pred_data.extend(input_pos)
                    pred_data.extend(input_neg)
                    pred_data = np.asarray(pred_data)
                    #prepare pos and neg labels
                    pred_data_label = [1.0] * len(input_pos)
                    pred_data_label.extend([0.0] * len(input_neg))
                    pred_data_label = np.asarray(pred_data_label)
                    #train
                    discriminator.train(pred_data, pred_data_label)
        # Train G
        print('Training G ...')
        for g_epoch in range(10):
            start_time = time.time()
            print('now G_epoch: ', str(g_epoch))
            for query in query_pos_train.keys():
                pos_list = query_pos_train[query]
                pos_set = set(pos_list)
                #all url
                all_list = query_index_url[query]
                #all feature
                all_list_feature = [query_url_feature[query][url] for url in all_list]
                all_list_feature = np.asarray(all_list_feature)
                # G generate all url prob
                prob = generator.get_prob(all_list_feature[np.newaxis, :])
                prob = prob[0]
                prob = prob.reshape([-1])
                #important sampling, change doc prob
                prob_IS = prob * (1.0 - LAMBDA)
            
                for i in range(len(all_list)):
                    if all_list[i] in pos_set:
                        prob_IS[i] += (LAMBDA / (1.0 * len(pos_list)))
                # G generates some urls (5 * positive doc count)
                choose_index = np.random.choice(np.arange(len(all_list)), [5 * len(pos_list)], p=prob_IS)
                #choose url
                choose_list = np.array(all_list)[choose_index]
                #choose feature
                choose_feature = [query_url_feature[query][url] for url in choose_list]
                #prob / importance-sampling prob (loss => prob * reward * prob / importance-sampling prob)
                choose_IS = np.array(prob)[choose_index] / np.array(prob_IS)[choose_index]
                choose_index = np.asarray(choose_index)
                choose_feature = np.asarray(choose_feature)
                choose_IS = np.asarray(choose_IS)
                #get reward ((prob - 0.5) * 2)
                choose_reward = discriminator.get_preresult(choose_feature)
                #train
                generator.train(choose_feature[np.newaxis, :], choose_reward.reshape([-1])[np.newaxis, :], choose_IS[np.newaxis, :])       
            print("train end--- %s seconds ---" % (time.time() - start_time))
            p_5 = precision_at_k(generator, query_pos_test, query_pos_train, query_url_feature, k=5)
            ndcg_5 = ndcg_at_k(generator, query_pos_test, query_pos_train, query_url_feature, k=5)            
            if p_5 > p_best_val:
                p_best_val = p_5
                ndcg_best_val = ndcg_5
                generator.save_model(GAN_MODEL_BEST_FILE)
                print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)
            elif p_5 == p_best_val:
                if ndcg_5 > ndcg_best_val:
                    ndcg_best_val = ndcg_5
                    generator.save_model(GAN_MODEL_BEST_FILE)
                    print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)           
Example #17

start_time = time.time()
user_directory = sys.argv[1]
user_prediction_file_path = sys.argv[2]

set_ids = set()
output_directory = sys.argv[3]
output_csv = open(output_directory, "a")
with open(user_prediction_file_path) as open_file_object:
    for line in open_file_object:
        infos = line.rstrip("\n").split(',')
        user_id = infos[0]
        set_ids.add(user_id)
        user_prediction = infos[1]
        file_length = file_len(user_directory + '/' + user_id + '.csv')  # number of records for a given user
        output_csv.write(user_id + ',' + user_prediction + ',' + str(file_length) + '\n')

# Write user_ids that have no prediction (i.e., not among accio's matches)
files = os.listdir(user_directory)
for file in files:
    cur_id = file.split('.')[0]
    if not (cur_id in set_ids):
        # print(cur_id)
        output_csv.write(cur_id + ',' + ',' + str(file_len(user_directory + '/' + file)) + '\n')

output_csv.close()

# files = [filename for filename in os.listdir(user_directory)]
# output_csv = open(output_directory + '/' + 'users_data.csv', "a")
# for file in files:
Example #18
def main():
    start_time = time.time()
    options = get_arguments()
    logging.info("Options")
    logging.info(options)
    logging_level = logging.DEBUG if options["verbose"] else logging.ERROR
    print options['verbose']
    print logging_level
    logging.getLogger().setLevel(logging_level)

    filepath = options['filepath']
    clear_file(output_path)
    print "Start mask generation for file " + filepath
    if not os.path.isfile(filepath):
        print("File path {} does not exist. Exiting...".format(filepath))
        sys.exit()
    # split files if requested, get line counts
    if options['split']:
        total_lines, rejected_lines = split_files(filepath,
                                                  options["max_line_length"])
    else:
        total_lines = file_len(filepath)
        rejected_lines = file_len(split_path + "/rejected_lines")
    all_masks = []
    cumulated_generated_space = 0
    treated_lines = 0
    #only open split files of correct length
    for filename in os.listdir(split_path):
        if filename == "rejected_lines":
            continue

        if int(filename.split("file_")[1]) <= options['max_line_length']:
            with open(os.path.join(split_path, filename), 'r') as fp:
                #  lines_read, generated_space, masks = learning_algorithm(fp)
                lines_read, generated_space, masks = stat_algorithm(
                    fp, options["max_mask_combinations"],
                    options["mask_rejection_ratio"])
                treated_lines += lines_read
                cumulated_generated_space += generated_space
                print_status(lines_read, len(masks), cumulated_generated_space)
                print_masks_to_file(masks, lines_read, generated_space)
                all_masks += masks
                logging.info("--- %s seconds ---" % (time.time() - start_time))
    else:  # for/else: runs once the loop over the split files completes
        total_hits = 0
        total_generated_space = 0
        for mask in all_masks:
            total_hits += mask.hitcount
            total_generated_space += mask.generated_space
        else:
            rejection_ratio = rejected_lines / float(total_lines) * 100
            coverage_ratio = total_hits / float(total_lines) * 100
            logging.info("Total Lines : " + str(total_lines))
            logging.info("Total Rejected Lines : " + str(rejected_lines))
            logging.info("Rejection Ratio : " + str(rejection_ratio))
            logging.info("\n")
            logging.info("Total treated lines : " + str(treated_lines))
            logging.info("Total hits : " + str(total_hits))
            logging.info("Coverage Ratio: {0:.2f}%".format(coverage_ratio))
        logging.info("Generated space " + str(total_generated_space))

        print "Masks Generated : " + str(len(all_masks))
        for mask in all_masks:
            print mask.maskstring
        if total_generated_space > options['max_generated_space']:
            print "Game Over"
        else:
            print "Victory"
        print_masks_to_file(all_masks, total_lines, total_generated_space)
        logging.info("--- %s seconds ---" % (time.time() - start_time))
Example #19
def main():
    discriminator = DIS(FEATURE_SIZE,
                        HIDDEN_SIZE,
                        WEIGHT_DECAY,
                        D_LEARNING_RATE,
                        param=None)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())

    print('start random negative sampling with log ranking discriminator')
    generate_uniform(DIS_TRAIN_FILE)
    train_size = ut.file_len(DIS_TRAIN_FILE)

    p_best_val = 0.0
    ndcg_best_val = 0.0

    for epoch in range(200):
        index = 1
        while True:
            if index > train_size:
                break
            if index + BATCH_SIZE <= train_size + 1:
                input_pos, input_neg = ut.get_batch_data(
                    DIS_TRAIN_FILE, index, BATCH_SIZE)
            else:
                input_pos, input_neg = ut.get_batch_data(
                    DIS_TRAIN_FILE, index, train_size - index + 1)
            index += BATCH_SIZE

            pred_data = []
            pred_data.extend(input_pos)
            pred_data.extend(input_neg)
            pred_data = np.asarray(pred_data)

            pred_data_label = [1.0] * len(input_pos)
            pred_data_label.extend([0.0] * len(input_neg))
            pred_data_label = np.asarray(pred_data_label)

            _ = sess.run(discriminator.d_updates,
                         feed_dict={
                             discriminator.pred_data: pred_data,
                             discriminator.pred_data_label: pred_data_label
                         })

        p_5 = precision_at_k(sess,
                             discriminator,
                             query_pos_test,
                             query_pos_train,
                             query_url_feature,
                             k=5)
        ndcg_5 = ndcg_at_k(sess,
                           discriminator,
                           query_pos_test,
                           query_pos_train,
                           query_url_feature,
                           k=5)

        if p_5 > p_best_val:
            p_best_val = p_5
            discriminator.save_model(sess, MLE_MODEL_BEST_FILE)
            print("Best: ", " p@5 ", p_5, "ndcg@5 ", ndcg_5)
        elif p_5 == p_best_val:
            if ndcg_5 > ndcg_best_val:
                ndcg_best_val = ndcg_5
                discriminator.save_model(sess, MLE_MODEL_BEST_FILE)
                print("Best: ", " p@5 ", p_5, "ndcg@5 ", ndcg_5)

    sess.close()
    param_best = cPickle.load(open(MLE_MODEL_BEST_FILE))
    assert param_best is not None
    discriminator_best = DIS(FEATURE_SIZE,
                             HIDDEN_SIZE,
                             WEIGHT_DECAY,
                             D_LEARNING_RATE,
                             param=param_best)

    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())

    p_1_best = precision_at_k(sess,
                              discriminator_best,
                              query_pos_test,
                              query_pos_train,
                              query_url_feature,
                              k=1)
    p_3_best = precision_at_k(sess,
                              discriminator_best,
                              query_pos_test,
                              query_pos_train,
                              query_url_feature,
                              k=3)
    p_5_best = precision_at_k(sess,
                              discriminator_best,
                              query_pos_test,
                              query_pos_train,
                              query_url_feature,
                              k=5)
    p_10_best = precision_at_k(sess,
                               discriminator_best,
                               query_pos_test,
                               query_pos_train,
                               query_url_feature,
                               k=10)

    ndcg_1_best = ndcg_at_k(sess,
                            discriminator_best,
                            query_pos_test,
                            query_pos_train,
                            query_url_feature,
                            k=1)
    ndcg_3_best = ndcg_at_k(sess,
                            discriminator_best,
                            query_pos_test,
                            query_pos_train,
                            query_url_feature,
                            k=3)
    ndcg_5_best = ndcg_at_k(sess,
                            discriminator_best,
                            query_pos_test,
                            query_pos_train,
                            query_url_feature,
                            k=5)
    ndcg_10_best = ndcg_at_k(sess,
                             discriminator_best,
                             query_pos_test,
                             query_pos_train,
                             query_url_feature,
                             k=10)

    map_best = MAP(sess, discriminator_best, query_pos_test, query_pos_train,
                   query_url_feature)
    mrr_best = MRR(sess, discriminator_best, query_pos_test, query_pos_train,
                   query_url_feature)

    print("Best ", "p@1 ", p_1_best, "p@3 ", p_3_best, "p@5 ", p_5_best,
          "p@10 ", p_10_best)
    print("Best ", "ndcg@1 ", ndcg_1_best, "ndcg@3 ", ndcg_3_best, "ndcg@5 ",
          ndcg_5_best, "p@10 ", ndcg_10_best)
    print("Best MAP ", map_best)
    print("Best MRR ", mrr_best)
Example #20
0
def main():
    print("load model...")
    # .pkl files are pickle's serialization format for Python objects.
    with open(workdir + "model_dns_ori.pkl", 'rb') as data_file:
        param = pickle.load(data_file, encoding='bytes')
    print(param)
    generator = GEN(ITEM_NUM,
                    USER_NUM,
                    EMB_DIM,
                    lamda=0.0 / BATCH_SIZE,
                    param=param,
                    initdelta=INIT_DELTA,
                    learning_rate=0.001)
    discriminator = DIS(ITEM_NUM,
                        USER_NUM,
                        EMB_DIM,
                        lamda=0.1 / BATCH_SIZE,
                        param=None,
                        initdelta=INIT_DELTA,
                        learning_rate=0.001)

    # ConfigProto is passed when creating the Session to configure its
    # runtime options, e.g. GPU device placement.
    config = tf.ConfigProto()
    # With allow_growth, TensorFlow starts with a small GPU allocation and
    # grows it on demand; memory is never released, so it can fragment.
    config.gpu_options.allow_growth = True
    # Running any of the ops defined above needs a Session for the graph;
    # the Session also allocates the memory holding current variable values.
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    print("gen ", simple_test(sess, generator))
    print("dis ", simple_test(sess, discriminator))

    dis_log = open(workdir + 'dis_log.txt', 'w')
    gen_log = open(workdir + 'gen_log.txt', 'w')

    # minimax training
    best = 0.
    for epoch in range(15):
        if epoch >= 0:
            for d_epoch in range(100):
                if d_epoch % 5 == 0:
                    generate_for_d(sess, generator, DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE

                    _ = sess.run(discriminator.d_updates,
                                 feed_dict={
                                     discriminator.u: input_user,
                                     discriminator.i: input_item,
                                     discriminator.label: input_label
                                 })

            # Train G
            for g_epoch in range(50):  # 50
                for u in user_pos_train:
                    sample_lambda = 0.2
                    pos = user_pos_train[u]

                    rating = sess.run(generator.all_logits, {generator.u: u})
                    exp_rating = np.exp(rating)
                    prob = exp_rating / np.sum(
                        exp_rating)  # prob is generator distribution p_\theta

                    pn = (1 - sample_lambda) * prob
                    pn[pos] += sample_lambda * 1.0 / len(pos)
                    # Now, pn is the Pn in importance sampling, prob is generator distribution p_\theta

                    sample = np.random.choice(np.arange(ITEM_NUM),
                                              2 * len(pos),
                                              p=pn)
                    ###########################################################################
                    # Get reward and adapt it with importance sampling
                    ###########################################################################
                    reward = sess.run(discriminator.reward, {
                        discriminator.u: u,
                        discriminator.i: sample
                    })
                    reward = reward * prob[sample] / pn[sample]
                    ###########################################################################
                    # Update G
                    ###########################################################################
                    _ = sess.run(
                        generator.gan_updates, {
                            generator.u: u,
                            generator.i: sample,
                            generator.reward: reward
                        })

                result = simple_test(sess, generator)
                print("epoch ", epoch, "gen: ", result)
                buf = '\t'.join([str(x) for x in result])
                gen_log.write(str(epoch) + '\t' + buf + '\n')
                gen_log.flush()

                p_5 = result[1]
                if p_5 > best:
                    print('best: ', result)
                    best = p_5
                    generator.save_model(sess, "ml-100k/gan_generator.pkl")

    gen_log.close()
    dis_log.close()
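
The G step in Example #20 is importance sampling in miniature: items are drawn from a proposal pn that mixes the generator distribution with uniform mass on the known positives, and the discriminator's reward is corrected by prob/pn. A self-contained numpy sketch of just that step, with toy sizes (ITEM_NUM, the positives, and the logits are made up here):

import numpy as np

ITEM_NUM = 10
pos = [2, 5]                                     # toy positive items for one user
logits = np.random.randn(ITEM_NUM)
prob = np.exp(logits) / np.sum(np.exp(logits))   # generator distribution p_theta

sample_lambda = 0.2
pn = (1 - sample_lambda) * prob                  # proposal Pn
pn[pos] += sample_lambda / len(pos)              # still sums to 1

sample = np.random.choice(np.arange(ITEM_NUM), 2 * len(pos), p=pn)
reward = np.random.rand(len(sample))             # stand-in for D's reward
reward = reward * prob[sample] / pn[sample]      # importance-sampling correction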
Example #21
0
        unique_events.update(local_events)
        
        for event in eoi.event:
            if event in local_events:
                events_to_sources[safe_filename(event)].append(os.path.basename(os.path.normpath(csv)))
                    
    return unique_events, events_to_sources
                    
csv = pd.read_csv(args.data,
                  header=0,
                  names=['timestamp', 'source', 'event'],
                  dtype={'source': str, 'event': str},
                  parse_dates=[0],
                  chunksize=args.chunksize)

length = file_len(args.data)
print('There are {} lines of data in {}.'.format(length, args.data))

t_start = time.time()

for i, chunk in enumerate(csv):
    data = defaultdict(list)

    t0 = time.time()

    for row in chunk.itertuples(index=False):
        timestamp, source, event = row
        if pd.notnull(source):
            data[source].append((timestamp, event))

    print('Demuxing chunk: {} seconds.'.format(pretty_float(time.time()-t0)), end=' ')
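
Example #21 uses file_len only for progress reporting: the total line count is known before the chunked read starts, so each processed chunk can be expressed as a percentage. A hedged sketch of that bookkeeping (the helper below is illustrative; only length, chunk, and t_start come from the snippet):

import time

def report_progress(processed, total, t_start):
    # Percentage complete plus elapsed wall time, printed on one line.
    pct = 100.0 * processed / total
    print('\r{:.1f}% of {} lines, {:.1f}s elapsed'.format(
        pct, total, time.time() - t_start), end='')

Inside the chunk loop this would be driven by processed += len(chunk).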
Example #22
0
def main():
    print("load initial model ...")

    param_nn = cPickle.load(open(DIS_MODEL_FILE_NN, 'rb'))
    assert param_nn is not None

    discriminator = DIS(FEATURE_SIZE,
                        HIDDEN_SIZE,
                        D_WEIGHT_DECAY,
                        D_LEARNING_RATE,
                        loss='log',
                        param=param_nn)
    generator = GEN(FEATURE_SIZE,
                    HIDDEN_SIZE,
                    G_WEIGHT_DECAY,
                    G_LEARNING_RATE,
                    param=param_nn)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())

    print('start adversarial training')

    p_best_val = 0.0
    ndcg_best_val = 0.0

    for epoch in range(30):
        if epoch > 0:
            # G generate negative for D, then train D
            print('Training D ...')
            generate_for_d(sess, generator, DIS_TRAIN_FILE)
            train_size = ut.file_len(DIS_TRAIN_FILE)

            for d_epoch in range(30):
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE

                    _ = sess.run(discriminator.d_updates,
                                 feed_dict={
                                     discriminator.pos_data: input_pos,
                                     discriminator.neg_data: input_neg
                                 })

                p_5 = precision_at_k(sess,
                                     discriminator,
                                     query_pos_test,
                                     query_pos_train,
                                     query_url_feature,
                                     k=5)
                ndcg_5 = ndcg_at_k(sess,
                                   discriminator,
                                   query_pos_test,
                                   query_pos_train,
                                   query_url_feature,
                                   k=5)

                if p_5 > p_best_val:
                    p_best_val = p_5
                    ndcg_best_val = ndcg_5
                    discriminator.save_model(sess, GAN_MODEL_BEST_FILE)
                    print("Best: ", "dis p@5 ", p_5, "dis ndcg@5 ", ndcg_5)
                elif p_5 == p_best_val:
                    if ndcg_5 > ndcg_best_val:
                        ndcg_best_val = ndcg_5
                        discriminator.save_model(sess, GAN_MODEL_BEST_FILE)
                        print("Best: ", "dis p@5 ", p_5, "dis ndcg@5 ", ndcg_5)

        # Train G
        print('Training G ...')
        for g_epoch in range(50):  # 50
            for query in query_pos_train.keys():
                pos_list = query_pos_train[query]
                # candidate_list = list(set(query_url_feature[query].keys()) - set(pos_list))
                candidate_list = list(query_url_feature[query].keys())

                if len(candidate_list) <= 0:
                    continue

                candidate_list_feature = [
                    query_url_feature[query][url] for url in candidate_list
                ]
                candidate_list_feature = np.asarray(candidate_list_feature)
                candidate_list_score = sess.run(
                    generator.pred_score,
                    {generator.pred_data: candidate_list_feature})

                # softmax for all
                exp_rating = np.exp(candidate_list_score)
                prob = exp_rating / np.sum(exp_rating)

                neg_index = np.random.choice(np.arange(len(candidate_list)),
                                             size=[len(pos_list)],
                                             p=prob)
                neg_list = np.array(candidate_list)[neg_index]

                pos_list_feature = [
                    query_url_feature[query][url] for url in pos_list
                ]
                neg_list_feature = [
                    query_url_feature[query][url] for url in neg_list
                ]
                neg_index = np.asarray(neg_index)
                # every negative samples have a reward
                neg_reward = sess.run(discriminator.reward,
                                      feed_dict={
                                          discriminator.pos_data:
                                          pos_list_feature,
                                          discriminator.neg_data:
                                          neg_list_feature
                                      })

                # Method 1: softmax before gather
                _ = sess.run(generator.gan_updates,
                             feed_dict={
                                 generator.pred_data: candidate_list_feature,
                                 generator.sample_index: neg_index,
                                 generator.reward: neg_reward
                             })

    print('Best p@5: ', p_best_val, 'Best ndcg@5: ', ndcg_best_val)
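
Examples #20 and #22 both keep the checkpoint whose (p@5, ndcg@5) pair wins lexicographically: p@5 decides, and ndcg@5 breaks ties. The repeated if/elif blocks are equivalent to a single tuple comparison; a sketch (save_fn stands in for discriminator.save_model):

def maybe_save_best(p_5, ndcg_5, best, save_fn):
    # best is the incumbent (p@5, ndcg@5) pair; tuples compare element-wise.
    if (p_5, ndcg_5) > best:
        save_fn()
        print("Best: ", "p@5 ", p_5, "ndcg@5 ", ndcg_5)
        return (p_5, ndcg_5)
    return best

Initialized with best = (0.0, 0.0), this reproduces Example #22's checkpointing behavior exactly.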
Example #23
0
def main():
    i_file_output = 0
    print "load model..."
    generator = GEN(AUTHER_NUM,
                    EMB_DIM,
                    lamda=0.0 / BATCH_SIZE,
                    param=None,
                    initdelta=INIT_DELTA,
                    learning_rate=FLAGS.init_lr_gen,
                    lr_decay_step=FLAGS.lr_decay_iter_gen)
    discriminator = DIS(AUTHER_NUM,
                        EMB_DIM,
                        lamda=0.01 / BATCH_SIZE,
                        param=None,
                        initdelta=INIT_DELTA,
                        learning_rate=FLAGS.init_lr_dis,
                        lr_decay_step=FLAGS.lr_decay_iter_dis)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    dis_log = open(outputdir + 'dis_log.txt', 'w')
    gen_log = open(outputdir + 'gen_log.txt', 'w')

    # minimax training
    best_gen = 0.
    best_dis = 0.
    draw_count_D = 0
    draw_count_G = 0
    for epoch in range(FLAGS.epochs):  #5000
        if epoch >= 0:
            # Train D
            generate_for_d(sess, generator, DIS_TRAIN_FILE)
            train_size = ut.file_len(DIS_TRAIN_FILE)  # generate file length
            for d_epoch in range(5):
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_auther, input_coauther_real, input_coauther_fake = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        input_auther, input_coauther_real, input_coauther_fake = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE

                    _ = sess.run(
                        [discriminator.d_updates, discriminator.clip_D],
                        feed_dict={
                            discriminator.auther: input_auther,
                            discriminator.co_real: input_coauther_real,
                            discriminator.co_fake: input_coauther_fake
                        })
            result = simple_test(sess, discriminator)
            buf = '\t'.join([str(x) for x in result])
            dis_log.write(str(epoch) + '\t' + buf + '\n')
            dis_log.flush()

            p_5 = result[2]
            if p_5 > best_dis:
                print('best_dis: ', epoch, result)
                best_dis = p_5
                discriminator.save_model(sess,
                                         outputdir + "gan_discriminator.pkl")
        # Train G
        for g_epoch in range(1):
            for u in auther_pos_train:
                sample_lambda = 0.2
                pos = list(set(auther_pos_train[u]))
                sample_times = 128

                rating = sess.run(generator.softmax_logits,
                                  {generator.auther: [u]})
                prob = np.reshape(rating, [-1])

                sample = np.random.choice(np.arange(AUTHER_NUM),
                                          size=sample_times,
                                          p=prob)
                ###########################################################################
                # Get reward and adapt it with importance sampling
                ###########################################################################
                reward = sess.run(
                    discriminator.reward, {
                        discriminator.auther: np.tile(u, (sample_times)),
                        discriminator.co_fake: sample
                    })
                ###########################################################################
                # Update G
                ###########################################################################
                _ = sess.run(
                    generator.gan_updates, {
                        generator.auther: np.tile(u, (sample_times)),
                        generator.co: sample,
                        generator.reward: reward
                    })
        result = simple_test(sess, generator)
        buf = '\t'.join([str(x) for x in result])
        gen_log.write(str(epoch) + '\t' + buf + '\n')
        gen_log.flush()

        p_5 = result[2]
        if p_5 > best_gen:
            print('best_gen: ', epoch, result)
            best_gen = p_5
            generator.save_model(sess, outputdir + "gan_generator.pkl")
            draw_count_G += 1
    gen_log.close()
    dis_log.close()
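
In Example #23 each D step runs discriminator.clip_D alongside d_updates, which suggests WGAN-style weight clipping after every update. A sketch of what such a clip op typically looks like in TF1 (the variable scope name and the 0.01 bound are assumptions, not taken from the project):

# Clamp every trainable discriminator weight into a small box after each step.
clip_D = [w.assign(tf.clip_by_value(w, -0.01, 0.01))
          for w in tf.trainable_variables(scope='discriminator')]

Running sess.run([d_updates, clip_D], ...) then applies the gradient step and the projection together, as the training loop above does.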
Example #24
0
def main():

    best = 0.
    gen_log = open(workdir + 'gen_log.txt', 'w')
    for epoch in range(15):
        if epoch >= 0:
            for d_epoch in range(100):
                if d_epoch % 5 == 0:
                    generate_for_d(generator, DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE
                    # pre_logits = discriminator.module.pre_logits(input_user, input_item)
                    D_loss = discriminator(input_user, input_item, torch.tensor(input_label)) \
                            + lamda * (criterion(D_user_embeddings) + criterion(D_item_embeddings) + criterion(D_item_bias))

                    optimizer_D.zero_grad()
                    D_loss.backward()
                    optimizer_D.step()
                print("\r[D Epoch %d/%d] [loss: %f]" %
                      (d_epoch, 100, D_loss.item()))

            for g_epoch in range(50):
                for u in user_pos_train:
                    sample_lambda = 0.2
                    pos = user_pos_train[u]
                    rating = generator.module.all_logits(u)
                    rating = rating.detach_().cpu().numpy()

                    exp_rating = np.exp(rating)
                    prob = exp_rating / np.sum(
                        exp_rating)  # prob is generator distribution p_\theta

                    pn = (1 - sample_lambda) * prob
                    pn[pos] += sample_lambda * 1.0 / len(pos)
                    # Now, pn is the Pn in importance sampling, prob is generator distribution p_\theta

                    sample = np.random.choice(np.arange(ITEM_NUM),
                                              2 * len(pos),
                                              p=pn)
                    ###########################################################################
                    # Get reward and adapt it with importance sampling
                    ###########################################################################
                    reward = discriminator.module.get_reward(u, sample)
                    reward = (reward.detach_().cpu().numpy()
                              * prob[sample] / pn[sample])
                    ###########################################################################
                    # Update G
                    ###########################################################################
                    with torch.cuda.device(device[0]):
                        G_loss = generator(u, torch.tensor(sample),
                                           torch.tensor(reward))
                    optimizer_G.zero_grad()
                    G_loss.backward()
                    optimizer_G.step()
                print("\r[G Epoch %d/%d] [loss: %f]" %
                      (g_epoch, 50, G_loss.item()))
                result = simple_test(generator)
                print("epoch ", epoch, "gen: ", result)
                buf = '\t'.join([str(x) for x in result])
                gen_log.write(str(epoch) + '\t' + buf + '\n')
                gen_log.flush()

    gen_log.close()
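
The PyTorch generator in Example #24 plays the same role as gan_updates in the TF examples: a REINFORCE-style loss that scales the log-probability of each sampled item by its importance-weighted reward. A minimal sketch of such a loss, assuming the generator exposes per-item logits (this illustrates the technique, not the project's actual module):

import torch
import torch.nn.functional as F

def reinforce_loss(all_logits, sample, reward):
    # -E[log p_theta(sample) * reward]; sample holds sampled item indices.
    log_prob = F.log_softmax(all_logits, dim=-1)[sample]
    reward = torch.as_tensor(reward, dtype=log_prob.dtype)
    return -(log_prob * reward).mean()

Minimizing this pushes probability mass toward samples the discriminator rewarded, which is what G_loss.backward() accomplishes above.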