Example #1
def train():
    # Build the graph once so the training op is not re-created inside the loop.
    train_op = build_model()

    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)

        test_feed_dict = {
            X: mnist.test.images.reshape(-1, 28, 28, 1),
            Y: mnist.test.labels,
            p_keep_conv: p_keep_conv_value,
            p_keep_hidden: p_keep_hidden_value
        }
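        # Train epoch by epoch and evaluate accuracy on the full test set after each epoch.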

        for epoch in range(training_epochs):
            total_batch = int(mnist.train.num_examples / batch_size)

            for step in range(total_batch):
                batch_xs, batch_ys = mnist.train.next_batch(batch_size)

                feed_dict = {
                    X: batch_xs.reshape(-1, 28, 28, 1),
                    Y: batch_ys,
                    p_keep_conv: p_keep_conv_value,
                    p_keep_hidden: p_keep_hidden_value
                }
                sess.run(train_op, feed_dict=feed_dict)

            check_prediction = tf.equal(tf.argmax(model, 1), tf.argmax(Y, 1))
            accuracy = tf.reduce_mean(tf.cast(check_prediction, tf.float32))
            accuracy_rates = sess.run(accuracy, feed_dict=test_feed_dict)

            log.info("Epoch: {:04d} / Accuracy = {}".format(
                epoch + 1, accuracy_rates))
Example #2
def message_Clustering():
    data = pd.read_csv("antbot/datasets/question_45/root_q_a").values.tolist()
    search_msg = [j for i in data for j in i]
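    # For each seed question, run a two-level phrase search in ES and dump the matching Q&A rows to a CSV.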
    for hit_0 in tqdm(search_msg):
        root_msg = hit_0
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "match_phrase": {
                            "q": "{}".format(root_msg)
                        }
                    }
                }
            }
        }

        questions_one = es.search(index="bot_entity_tmp_new",
                                  body=body,
                                  size=100)

        for hit_1 in questions_one['hits']['hits']:
            res = hit_1['_source']['q']
            body = {
                "query": {
                    "bool": {
                        "filter": {
                            "match_phrase": {
                                "q": "{}".format(res)
                            }
                        }
                    }
                }
            }
            questions_two = es.search(index="bot_entity_tmp",
                                      body=body,
                                      size=100)
            out = []
            for hit_2 in questions_two['hits']['hits']:
                res_two = hit_2['_source']
                rows = {
                    'q': str(res_two['q']).replace("\n", ""),
                    'a': str(res_two['a']).replace("\n", ""),
                    'roomId': (res_two['roomId']),
                    'tenantId': (res_two['tenantId']),
                    'lanlordId': (res_two['lanlordId']),
                    'id': str(res_two['id']).replace("\n", "")
                }
                out.append(rows)
            if len(out) < 1:
                continue
            df = pd.DataFrame(out)
            save_name = '{}.csv'.format(replace_symbol(out[0]['q']))
            save_dir = "/home/duyp/mayi_datasets/seed/entity"
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            save_path = os.path.join(save_dir, "{}".format(save_name))

            log.info("{}".format(save_path))
            df.to_csv(save_path, index=None)
Example #3
def get_features_corr():
    data = pd.read_csv(
        os.path.join("yancheng/datasets/results/train",
                     "total_by_day_ex_sorted.csv"))
    X = data['cnt']
    y = data['week']
    corr = np.corrcoef(X, y)
    log.info("{}".format(corr))
Example #4
def get_features():
    data = pd.read_csv(
        os.path.join("yancheng/datasets/results", "total_by_day.csv"))
    X = data['cnt']
    y = data['week']
    corr = np.corrcoef(X, y)
    log.info("{}".format(corr))
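    # Group by day-of-week and write one CSV per weekday.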
    data_grouped = data.groupby(by='week')
    for i, j in data_grouped:
        j.to_csv("yancheng/datasets/results/week/{}.csv".format(i), index=None)
Example #5
def save_question_45_to_es(index_name="question_cd_update"):
    # Bulk insert
    try:
        es.indices.delete(index_name)
        log.info("{} have delete ".format(index_name))
        setting = {"number_of_shards": 6, "number_of_replicas": 0}
        mapping = {
            "timestamp": {
                "enabled": "true"
            },
            "properties": {
                "logdate": {
                    "type": "date",
                    "format": "dd/MM/yyy HH:mm:ss"
                }
            }
        }

        settings = {"settings": setting, "mappings": mapping}
        es.indices.create(index=index_name, ignore=400, body=settings)
    except Exception:
        # The index may not exist yet; ignore delete/create errors here.
        pass

    file_dir = "antbot/datasets/city_questions_740432.csv"
    if not os.path.isfile(file_dir):
        raise FileNotFoundError("Data file not found")
    data = pd.read_csv(file_dir).values.tolist()

    line_number = 0
    all_data = []
    source = ''
    for m in tqdm(data):
        # TODO: build the document source from the CSV row `m`; it is left empty here.
        body = {
            '_index': '{}'.format(index_name),
            '_type': 'post',
            '_id': line_number,
            '_source': source
        }
        all_data.append(body)
        line_number += 1
        if line_number % 10000 == 0:
            try:
                success, _ = bulk(es,
                                  all_data,
                                  index=index_name,
                                  raise_on_error=True)
                all_data = []
                log.info(
                    "==================== success :{}/{} ====================".
                    format(line_number, len(data)))
            except Exception as e:
                log.debug("Bulk insert failed: {}".format(e))

    # Flush the remaining documents that did not fill a complete 10000-row batch.
    if all_data:
        bulk(es, all_data, index=index_name, raise_on_error=True)
Example #6
def cut_data():
    out = []
    data_name = os.path.join(root_path, 'datasets/cd_by_nosplit.txt')
    with open(data_name, 'r') as fr:
        lines = fr.readlines()
        for line in tqdm(lines):
            line_cut = cut(replace_symbol(line), add_stopwords=True)
            for x in line_cut:
                out.append(x)
    log.info(" Length: {} ".format(len(out)))
    fw = open(os.path.join(root_path, "datasets/cd.txt"), 'w')
    fw.writelines(" ".join(out))
    fw.close()
Example #7
def geturls():
    all_urls = []
    files = ['shehui', 'tiyu', 'renwu', 'ziran', 'wenhua', 'lishi', 'dili', 'keji']
    for file in files:
        rooturl = 'http://baike.baidu.com/{}'.format(file)
        log.info("{}".format(rooturl))
        html = urlhelper(rooturl)
        soup = BeautifulSoup(html, "lxml")
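        # Collect "view" links from the category page, then follow each link one level deeper.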
        for a in soup.find_all('a', href=True):
            href = a['href']
            if "view" in href:
                log.info("href: {}".format(href))
                all_urls.append(href)
                hrefhtml = urlhelper(href)
                hrefsoup = BeautifulSoup(hrefhtml, "lxml")
                for a in hrefsoup.find_all('a', href=True):
                    href = a['href']
                    if "view" in href and "http" not in href:
                        nexthref = "http://baike.baidu.com" + href
                        log.info("nexthref: {}".format(nexthref))
                        all_urls.append(nexthref)

                    elif "view" in href and "http" in href:
                        nexthref = href
                        log.info("nexthref: {}".format(nexthref))
                        all_urls.append(nexthref)
                    else:
                        continue

    df = pd.DataFrame(all_urls)
    df.to_csv("./localdatasets/baike/urls.txt", index=None)
Example #8
File: model.py Project: logonmy/alvin_py
def GP():
    data = pd.read_csv("./datasets/results/data_train.csv").values
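    # Product kernel: constant amplitude times an RBF, with bounds for hyper-parameter search.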
    kernel = C(0.1, (0.001, 0.1)) * RBF(0.5, (1e-4, 10))
    reg = GaussianProcessRegressor(kernel=kernel,
                                   n_restarts_optimizer=10,
                                   alpha=0.1)
    train_x, train_y = data[:, :-1], data[:, -1]
    log.info("{}  {}".format(train_x.shape, train_y.shape))
    reg.fit(train_x, train_y)
    test_data = pd.read_csv("./datasets/test_A_20171225.txt", sep="\t")
    # TODO: keep retraining the model as new data arrives and predict the new results
    resuluts = []
    for x in tqdm(test_data.values):
        predict_x = np.array([[x[0], x[1]]])
        p = reg.predict(predict_x)[0]
        resuluts.append(int(p))
        train_x_update = list_reverse_pop(train_x, x)
        new_train_x = np.array(train_x_update)
        train_y_update = list_reverse_pop(train_y, p)
        new_train_y = np.array(train_y_update)
        log.info("{}, {}".format(new_train_x.shape, new_train_y.shape))
        reg.fit(new_train_x, new_train_y)
        log.info("predict: {}".format(resuluts))

    test_data_copy = test_data.copy()
    test_data_copy['predict'] = resuluts
    test_data_copy.to_csv("./datasets/results/predict_GP.csv", index=None)

    log.info('{}'.format(resuluts))
Example #9
def get_date_features():
    root = 'yancheng/datasets/results/train/'
    log.info("Total files :{}".format(len(os.listdir(root))))
    res = []
    for file in tqdm(os.listdir(root)):
        rows = OrderedDict()
        file_name = os.path.join(root, file)
        data = pd.read_csv(file_name)
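        # Sum the daily counts and keep the date and weekday from each per-day file.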
        cnt = np.sum(data['cnt'].values)
        rows['date'] = data['date'].values[0]
        rows['week'] = data['day_of_week'].values[0]
        rows['cnt'] = cnt
        res.append(rows)
    df = pd.DataFrame(res).sort_values(by='date')
    df.to_csv(os.path.join("yancheng/datasets/results", "total_by_day.csv"),
              index=None)
Example #10
def conact_csv():
    out = []
    path = 'word2vector_code/datasets/rawdata'
    for file in tqdm(os.listdir(path)):
        data_name = os.path.join(path, file)
        with open(data_name, 'r') as fr:
            lines = fr.readlines()
            for line in lines:
                for x in line.split():
                    if x.isdigit():
                        continue
                    out.append(x)
        log.info(" NEXT ")
    log.info("{}".format(len(out)))

    fw = open("word2vector_code/datasets/train.csv", 'w')
    fw.writelines(" ".join(out))
    fw.close()
Example #11
def nntest(model_dir="datasets/results/models", class_number=2):
    root = os.path.dirname(os.path.realpath(__file__))

    pos = pd.read_csv(
        "Order_predicts/datasets/results/test/action_pos_features.csv")
    neg = pd.read_csv(
        "Order_predicts/datasets/results/test/action_neg_features.csv")
    data = pd.concat([pos, neg])
    data = shuffle(data)
    ids = data['id']
    data = data.fillna(-1).replace(np.inf, 100)
    del data['16_tmode']
    del data['10_t9']
    del data['28_tmode']
    del data['27_atmedian']
    del data['29_atptp']
    del data['continent']
    del data['province']
    del data['country']
    del data['city']
    del data['age']

    x_data_holder = tf.placeholder(tf.float32, [None, 33], name='inputs_x')
    y_prediction = neural_networks(x_data_holder, 33, class_number)

    # Create one session and restore the checkpoint once, instead of once per id.
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(os.path.join(root, model_dir))
    if not (ckpt and ckpt.model_checkpoint_path):
        log.info("No checkpoint found under {}".format(
            os.path.join(root, model_dir)))
        return
    saver.restore(sess, ckpt.model_checkpoint_path)

    for i in ids:
        batch_x = data[data['id'].isin([i])]
        del batch_x['id']
        y_pre = sess.run(y_prediction,
                         feed_dict={x_data_holder: batch_x.values})
        log.info("{}".format(y_pre))
        normal, spam = y_pre[0][0], y_pre[0][1]
        log.info("{}, {}".format(normal, spam))
        res = {}
        if normal > spam:
            res['pos'] = normal
        elif normal < spam:
            res['neg'] = spam
        print(res)
Example #12
def model(input_tensor):
    log.info("input_tensor: {}".format(input_tensor))
    with tf.device("/gpu:0"):
        weights = []
        conv_00_w = tf.get_variable(
            "conv_00_w", [3, 3, 3, 64],
            initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9)))
        conv_00_b = tf.get_variable("conv_00_b", [64],
                                    initializer=tf.constant_initializer(0))
        weights.append(conv_00_w)
        weights.append(conv_00_b)
        tensor = tf.nn.relu(
            tf.nn.bias_add(
                tf.nn.conv2d(input_tensor,
                             conv_00_w,
                             strides=[1, 1, 1, 1],
                             padding='SAME'), conv_00_b))
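        # 18 hidden 3x3 conv layers, 64 channels each, with ReLU activations.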
        for i in range(18):
            conv_w = tf.get_variable("conv_%02d_w" % (i + 1), [3, 3, 64, 64],
                                     initializer=tf.random_normal_initializer(
                                         stddev=np.sqrt(2.0 / 9 / 64)))
            conv_b = tf.get_variable("conv_%02d_b" % (i + 1), [64],
                                     initializer=tf.constant_initializer(0))
            weights.append(conv_w)
            weights.append(conv_b)
            tensor = tf.nn.relu(
                tf.nn.bias_add(
                    tf.nn.conv2d(tensor,
                                 conv_w,
                                 strides=[1, 1, 1, 1],
                                 padding='SAME'), conv_b))
        conv_w = tf.get_variable("conv_20_w", [3, 3, 64, 1],
                                 initializer=tf.random_normal_initializer(
                                     stddev=np.sqrt(2.0 / 9 / 64)))
        conv_b = tf.get_variable("conv_20_b", [1],
                                 initializer=tf.constant_initializer(0))
        weights.append(conv_w)
        weights.append(conv_b)
        tensor = tf.nn.bias_add(
            tf.nn.conv2d(tensor, conv_w, strides=[1, 1, 1, 1], padding='SAME'),
            conv_b)
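        # Residual connection: add the input back so the network learns the residual.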
        tensor = tf.add(tensor, input_tensor)
        log.info("out tensor :{}".format(tensor))
        return tensor, weights
Example #13
File: main.py Project: logonmy/alvin_py
def randomforest():
    # Feature selection
    pos = pd.read_csv(
        "Order_predicts/datasets/results/train/action_pos_features.csv")
    posfillna = pos.fillna(pos.median()).replace(np.inf, 100)
    neg = pd.read_csv(
        "Order_predicts/datasets/results/train/action_neg_features.csv")
    negfillna = neg.fillna(neg.median()).replace(np.inf, 100)
    data = pd.concat([posfillna, negfillna])
    data = shuffle(data)
    data.to_csv("Order_predicts/datasets/results/train.csv", index=None)
    log.info("train data save succes ...")
    del data['id']
    Y = data['label']
    del data['label']
    X = data
    names = data.columns
    rf = RandomForestRegressor(n_estimators=10,
                               criterion="mse",
                               max_depth=None,
                               min_samples_split=2,
                               min_samples_leaf=1,
                               min_weight_fraction_leaf=0.,
                               max_features="log2",
                               max_leaf_nodes=None,
                               min_impurity_decrease=0.,
                               min_impurity_split=None,
                               bootstrap=True,
                               oob_score=False,
                               n_jobs=1,
                               random_state=1,
                               verbose=0,
                               warm_start=False)
    rf.fit(X, Y)

    res = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_),
                     names),
                 reverse=True)

    for x in res:
        log.info("{}: {}".format(x[0], x[1]))
Example #14
def read_newdata():
    path = '/home/duyp/mayi_datasets/question/question_new'
    number = 0

    for file in os.listdir(path):
        filename = os.path.join(path, file)
        data = pd.read_csv(filename, lineterminator="\n").values
        out = []
        for x in tqdm(data):
            msg = x[0]
            if isinstance(msg, str):
                msgcut = cut(replace_symbol(msg), add_stopwords=True)
                for i in msgcut:
                    out.append(i)
                    number += 1
            else:
                continue
        fw = open(os.path.join(root_path, "datasets/rawdata/{}".format(file)),
                  'w')
        fw.writelines(" ".join(out))
        fw.close()
    log.info("{}".format(number))
Example #15
File: model.py Project: logonmy/alvin_py
def linear_model():
    data = pd.read_csv(
        "yancheng/datasets/results/train/total_by_day_ex_sorted.csv")
    x, y = data['week'].values, data['cnt'].values
    train_x = np.array(x[:int(len(x) * 0.8)]).reshape(-1, 1)
    train_y = np.array(y[:int(len(x) * 0.8)]).reshape(-1, 1)
    test_x = np.array(x[int(len(x) * 0.8):]).reshape(-1, 1)
    test_y = np.array(y[int(len(x) * 0.8):]).reshape(-1, 1)

    # Scale features and targets with scalers fitted on the training split only.
    x_scaler = preprocessing.StandardScaler().fit(train_x)
    train_x = x_scaler.transform(train_x)
    test_x = x_scaler.transform(test_x)

    y_scaler = preprocessing.StandardScaler().fit(train_y)
    train_y = y_scaler.transform(train_y).reshape(-1, )
    test_y = y_scaler.transform(test_y).reshape(-1, )

    lm = LinearRegression()

    lm.fit(train_x, train_y)
    joblib.dump(lm, "yancheng/datasets/results/linear_model.m")
    score = lm.score(test_x, test_y)
    mse = mean_squared_error(test_y, lm.predict(test_x))
    # log.info("{}".format(score, mse))

    log.info("{}, {}".format(score, mse))
Example #16
def modeltest():
    batch_size = 4
    run_config = tf.ConfigProto()
    run_config.gpu_options.allow_growth = True
    sess = tf.Session(config=run_config)

    is_training = tf.placeholder(tf.bool, [])
    x = tf.placeholder(tf.float32, [None, image_size, image_size, 3])
    downscaled = downscale(x)
    imitation, G_vars = generator(downscaled, is_training, False)
    real_output, _ = discriminator(x, is_training, False)
    fake_output, D_vars = discriminator(imitation, is_training, True)

    saver = tf.train.Saver(max_to_keep=100)
    ckpt = tf.train.get_checkpoint_state('./checkpoints/srgan_new/')
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        log.info("Model load success ... {}".format(
            ckpt.model_checkpoint_path))

    x_test = np.load('yaogantest.npy')
    log.info("{}".format(x_test.shape))
    k = 0
    epoch = 100
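    # For every test batch, run the generator and plot the downscaled input, the generated image and the original.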
    for x_batch_images in minibatches(inputs=x_test, batch_size=batch_size):
        raw = normalize(x_batch_images)
        mos, fake = sess.run([downscaled, imitation],
                             feed_dict={
                                 x: raw,
                                 is_training: False
                             })
        log.info("{},{},{}".format(mos.shape, fake.shape, raw.shape))
        imgs = [mos, fake, raw]
        for i in range(batch_size):
            fig = plt.figure(figsize=(290, 110))
            label = ['input', 'output', 'original']
            for j, img in enumerate(imgs):
                im = np.uint8((img[i] + 1) * 127.5)
                ax = fig.add_subplot(1, len(imgs), j + 1)
                plt.imshow(im)
                plt.tick_params(labelbottom='off')
                plt.tick_params(labelleft='off')
                plt.gca().get_xaxis().set_ticks_position('none')
                plt.gca().get_yaxis().set_ticks_position('none')
                ax.set_xlabel(label[j])
            epoch_ = "{0:09d}".format(epoch)
            path = os.path.join('result', '{}_{}_{}.jpg'.format(k, i, epoch_))
            if not os.path.exists('result'):
                os.makedirs('result')
            plt.savefig(path)
            plt.close()
        k += 1
Example #17
def sort_file_by_dict(data_dir, input_filename, output_filename, delete=True):
    """
    输出文件和输入文件保存在同一目录下
    :param data_dir: 数据根目录
    :param input_filename: 要排序文件的名字
    :param output_filename: 输出文件的名字
    :param delete: 是否删除标点符号
    :return: 0
    """
    locale.setlocale(locale.LC_ALL, locale='zh_CN.UTF-8')
    files = []
    line_number = 0
    inputs_dir = os.path.join(data_dir, input_filename)
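    # Read and optionally clean every line, then sort with zh_CN collation before saving.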
    with open(inputs_dir, 'r') as fr:
        lines = fr.readlines()
        for line in lines:
            if delete:
                line_new = replace_symbol(
                    line.replace("\n", '').lstrip().rstrip().strip())
                if len(line_new) > 1:
                    files.append(line_new)
                    line_number += 1
                    # TODO: alternatively, flush every 500000 lines to speed up saving.
                    if line_number % 10000 == 0:
                        log.info(
                            "=============== process : {} ===============".
                            format(line_number))
            else:
                line_new = line.replace("\n", '').lstrip().rstrip().strip()
                files.append(line_new)
                line_number += 1
                if line_number % 10000 == 0:
                    pass
    log.info(" Total lines : {}".format(line_number))
    b = sorted(files, key=cmp_to_key(locale.strcoll))
    df = pd.DataFrame(b)
    df.columns = ['message']
    output_dir = os.path.join(data_dir, output_filename)
    log.info("Save file : {}".format(output_dir))
    df.to_csv(output_dir, index=None)
Example #18
                                                   staircase=True)
        tf.summary.scalar("learning rate", learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        opt = optimizer.minimize(loss, global_step=global_step)
        saver = tf.train.Saver(weights, max_to_keep=0)

        config = tf.ConfigProto()

        with tf.Session(config=config) as sess:
            if not os.path.exists('logs'):
                os.mkdir('logs')
            merged = tf.summary.merge_all()
            file_writer = tf.summary.FileWriter('logs', sess.graph)
            tf.global_variables_initializer().run()
            for epoch in trange(0, max_epoch):
                log.info(
                    " ................... Start Training ...................")
                batch_count = train_list_length // batch_size
                log.info("{}".format(batch_count))
                for bc in range(batch_count):
                    offset = bc * batch_size
                    for hr, lr in get_image_batch_forpng(bc, batch_size):
                        input_data, gt_data = read_data2arr(lr), read_data2arr(
                            hr)
                        log.debug("{}, {}".format(input_data.shape,
                                                  gt_data.shape))
                        feed_dict = {
                            train_input: input_data,
                            train_gt: gt_data
                        }
                        run_obj = [
                            opt, loss, train_output, learning_rate, global_step
Example #19
File: main.py Project: logonmy/alvin_py
            prob = clf_weights.predict(batch_x.values)
        except Exception:
            prob = clf_weights.predict_proba(batch_x.values)[0][1]
            # log.info("{}, {:0.8f}".format(p[0], prob))
        df_push.loc[linenumber, 'userid'] = int(i)
        df_push.loc[linenumber, 'orderType'] = "{}".format(prob)
        linenumber += 1

    df_push.to_csv("Order_predicts/datasets/results_push.csv", index=None)


if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        exit(1)
    method = sys.argv[1]

    if method == 'select':
        randomforest()
    if method == 'train':
        m_names = [
            'svm', 'svr', 'lasso', 'mlpr', 'rf', 'adaboost', 'gbr', 'qda',
            'lda', 'n_n', 'gnb', 'bnb', 'dcc', 'RAN', 'SGDR'
        ]
        log.info("Total number models: {}".format(len(m_names)))

        train_models(model_name='logistic', epoch=50, batch_size=2000)
    if method == "test":
        modeltest(model_name='logistic')
Example #20
File: main.py Project: logonmy/alvin_py
def train_models(model_name, epoch=5, batch_size=100):
    log.info("current model:{}".format(model_name))
    pos = pd.read_csv(
        "Order_predicts/datasets/results/train/action_pos_features.csv")
    posfillna = pos.fillna(pos.median()).replace(np.inf, 100)
    neg = pd.read_csv(
        "Order_predicts/datasets/results/train/action_neg_features.csv")
    negfillna = neg.fillna(neg.median()).replace(np.inf, 100)
    data = pd.concat([posfillna, negfillna])
    data = shuffle(data)
    del data['id']
    y = data['label']
    del data['label']
    scaler = preprocessing.StandardScaler().fit(data)
    X = scaler.transform(data)
    pd.DataFrame(X).to_csv("Order_predicts/datasets/results/scale_x.csv",
                           index=None)
    data_scaled = preprocessing.scale(X)
    log.info("data shape: {}".format(data_scaled.shape))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=0)
    log.info("{}, {}".format(X_train.shape, X_test.shape))

    i = 0
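    # NOTE: a fresh estimator is instantiated and fit on every mini-batch, so the saved model reflects only the most recent batch.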
    for e in range(epoch):
        for train_x, train_y in minibatches(X_train,
                                            y_train,
                                            batch_size=batch_size,
                                            shuffle=False):
            if model_name == 'svc':
                clf_weights = svm.SVC(C=1.0,
                                      kernel='rbf',
                                      degree=3,
                                      gamma='auto',
                                      coef0=0.0,
                                      shrinking=True,
                                      probability=False,
                                      tol=1e-3,
                                      cache_size=200,
                                      class_weight={1: 10},
                                      verbose=False,
                                      max_iter=-1,
                                      decision_function_shape='ovr',
                                      random_state=0)
            elif model_name == 'svr':
                clf_weights = svm.SVR(kernel='rbf',
                                      degree=3,
                                      gamma='auto',
                                      coef0=0.0,
                                      tol=1e-3,
                                      C=1.0,
                                      epsilon=0.1,
                                      shrinking=True,
                                      cache_size=200,
                                      verbose=False,
                                      max_iter=-1)
            elif model_name == 'lasso':
                clf_weights = Lasso(alpha=1.0,
                                    fit_intercept=True,
                                    normalize=False,
                                    precompute=False,
                                    copy_X=True,
                                    max_iter=1000,
                                    tol=1e-4,
                                    warm_start=False,
                                    positive=False,
                                    random_state=0,
                                    selection='cyclic')
            elif model_name == 'logistic':
                clf_weights = LogisticRegression(penalty='l2',
                                                 dual=False,
                                                 tol=1e-4,
                                                 C=1.0,
                                                 fit_intercept=True,
                                                 intercept_scaling=1,
                                                 class_weight={
                                                     0: 0.1,
                                                     1: 0.9
                                                 },
                                                 random_state=0,
                                                 solver='newton-cg',
                                                 max_iter=100,
                                                 multi_class='ovr',
                                                 verbose=0,
                                                 warm_start=False,
                                                 n_jobs=1)
            elif model_name == 'mlpr':
                # learning_rate: {'constant', 'invscaling', 'adaptive'}
                clf_weights = MLPRegressor(hidden_layer_sizes=(100, ),
                                           activation="logistic",
                                           solver='adam',
                                           alpha=0.0001,
                                           batch_size='auto',
                                           learning_rate="constant",
                                           learning_rate_init=0.001,
                                           power_t=0.5,
                                           max_iter=200,
                                           shuffle=True,
                                           random_state=0,
                                           tol=1e-4,
                                           verbose=False,
                                           warm_start=False,
                                           momentum=0.9,
                                           nesterovs_momentum=True,
                                           early_stopping=False,
                                           validation_fraction=0.1,
                                           beta_1=0.9,
                                           beta_2=0.999,
                                           epsilon=1e-8)
            elif model_name == 'rf':
                clf_weights = RandomForestClassifier(
                    n_estimators=20,
                    criterion="entropy",
                    max_depth=None,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    min_weight_fraction_leaf=0.,
                    max_features="auto",
                    max_leaf_nodes=None,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    bootstrap=True,
                    oob_score=False,
                    n_jobs=1,
                    random_state=0,
                    verbose=0,
                    warm_start=False,
                    class_weight={
                        0: 0.1,
                        1: 0.9
                    })
            elif model_name == 'adaboost':
                base_estimator = RandomForestClassifier(
                    n_estimators=20,
                    criterion="entropy",
                    max_depth=None,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    min_weight_fraction_leaf=0.,
                    max_features="auto",
                    max_leaf_nodes=None,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    bootstrap=True,
                    oob_score=False,
                    n_jobs=1,
                    random_state=0,
                    verbose=0,
                    warm_start=False,
                    class_weight={
                        0: 0.1,
                        1: 0.9
                    })
                base_estimator1 = LogisticRegression(penalty='l2',
                                                     dual=False,
                                                     tol=1e-4,
                                                     C=1.0,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     class_weight={
                                                         0: 0.1,
                                                         1: 0.9
                                                     },
                                                     random_state=0,
                                                     solver='newton-cg',
                                                     max_iter=100,
                                                     multi_class='ovr',
                                                     verbose=0,
                                                     warm_start=False,
                                                     n_jobs=1)
                clf_weights = AdaBoostClassifier(base_estimator=base_estimator,
                                                 n_estimators=50,
                                                 learning_rate=0.6666,
                                                 algorithm='SAMME.R',
                                                 random_state=0)

            elif model_name == 'gbr':
                clf_weights = GradientBoostingRegressor(
                    loss='ls',
                    learning_rate=0.1,
                    n_estimators=100,
                    subsample=1.0,
                    criterion='friedman_mse',
                    min_samples_split=2,
                    min_samples_leaf=1,
                    min_weight_fraction_leaf=0.,
                    max_depth=3,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    init=None,
                    random_state=0,
                    max_features=None,
                    alpha=0.9,
                    verbose=0,
                    max_leaf_nodes=None,
                    warm_start=False,
                    presort='auto')
            elif model_name == 'qda':
                clf_weights = QuadraticDiscriminantAnalysis(
                    priors=None,
                    reg_param=0.,
                    store_covariance=False,
                    tol=1.0e-4,
                    store_covariances=None)
            elif model_name == 'lda':
                clf_weights = LinearDiscriminantAnalysis(
                    solver='svd',
                    shrinkage=None,
                    priors=None,
                    n_components=None,
                    store_covariance=False,
                    tol=1e-4)
            elif model_name == 'n_n':
                clf_weights = NearestNeighbors(n_neighbors=5,
                                               radius=1.0,
                                               algorithm='auto',
                                               leaf_size=30,
                                               metric='minkowski',
                                               p=2,
                                               metric_params=None,
                                               n_jobs=1)
            elif model_name == 'gnb':
                clf_weights = GaussianNB(priors=None)
            elif model_name == 'bnb':
                clf_weights = BernoulliNB(alpha=1.0,
                                          binarize=.0,
                                          fit_prior=True,
                                          class_prior=None)
            elif model_name == 'dcc':
                clf_weights = DecisionTreeClassifier(
                    criterion="gini",
                    splitter="best",
                    max_depth=None,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    min_weight_fraction_leaf=0.,
                    max_features=None,
                    random_state=0,
                    max_leaf_nodes=None,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    class_weight=None,
                    presort=False)
            elif model_name == 'dcr':
                clf_weights = DecisionTreeRegressor(
                    criterion="mse",
                    splitter="best",
                    max_depth=None,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    min_weight_fraction_leaf=0.,
                    max_features=None,
                    random_state=0,
                    max_leaf_nodes=None,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    presort=False)
            elif model_name == 'RAN':
                base_estimator = LinearRegression()
                clf_weights = RANSACRegressor(base_estimator=base_estimator,
                                              min_samples=None,
                                              residual_threshold=None,
                                              is_data_valid=None,
                                              is_model_valid=None,
                                              max_trials=100,
                                              max_skips=np.inf,
                                              stop_n_inliers=np.inf,
                                              stop_score=np.inf,
                                              stop_probability=0.99,
                                              residual_metric=None,
                                              loss='absolute_loss',
                                              random_state=0)
            elif model_name == 'adar':
                clf_weights = AdaBoostRegressor(base_estimator=None,
                                                n_estimators=50,
                                                learning_rate=1.,
                                                loss='linear',
                                                random_state=None)

            else:  # model_name == 'SGDR':
                clf_weights = SGDRegressor(loss="squared_loss",
                                           penalty="l2",
                                           alpha=0.0001,
                                           l1_ratio=0.15,
                                           fit_intercept=True,
                                           max_iter=None,
                                           tol=None,
                                           shuffle=True,
                                           verbose=0,
                                           epsilon=0.1,
                                           random_state=None,
                                           learning_rate="invscaling",
                                           eta0=0.01,
                                           power_t=0.25,
                                           warm_start=False,
                                           average=False,
                                           n_iter=None)

            # build
            clf_weights.fit(train_x, train_y)
            i += 1

            if i % 20 == 0:
                mse = mean_squared_error(y_test, clf_weights.predict(X_test))
                log.info("均方误差:{}".format(mse))
                avgscores = cross_val_score(clf_weights, train_x,
                                            train_y).mean()
                log.info("{}/{} 训练集得分平均值: {}".format(e, i, avgscores))
                model_path = os.path.join(
                    "Order_predicts/datasets/results/models",
                    '{}'.format(model_name))
                if not os.path.exists(model_path):
                    os.makedirs(model_path)
                joblib.dump(
                    clf_weights,
                    os.path.join(model_path, "{}_{}.model".format(e, i)))
                log.info(" Save ")

            if i % 50 == 0:
                scores = clf_weights.score(X_test, y_test)
                log.info("验证得分: {}".format(scores))
Example #21
        if x not in da2list:
            out.append(x)
    df = pd.Series(out)
    df.to_csv("word2vector_code/datasets/diff.csv", index=None)


if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        raise Exception("[!] You should put more args")
    method = sys.argv[1]

    if method == 'cut':
        cut_data()
        log.info(" ! Build Success ! ")

    if method == 'train':
        import logging

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        max_words = 10000
        data_name = 'word2vector_code/datasets/train.csv'
        sentences = TextBatch(fname=data_name, max_sentence_length=max_words)
        model = word2vec.Word2Vec(sentences,
                                  iter=5,
                                  workers=4,
                                  size=1000,
                                  min_count=1,
                                  alpha=0.025,
Example #22
def train():
    epoch = 100

    is_training = tf.placeholder(tf.bool, [])
    x = tf.placeholder(tf.float32, [None, image_size, image_size, 3])
    downscaled = downscale(x)
    imitation, G_vars = generator(downscaled, is_training, False)

    real_output, _ = discriminator(x, is_training, False)
    fake_output, D_vars = discriminator(imitation, is_training, True)

    g_loss, d_loss = inference_losses(x, imitation, real_output, fake_output)
    run_config = tf.ConfigProto()
    run_config.gpu_options.allow_growth = True
    sess = tf.Session(config=run_config)

    with tf.variable_scope('srgan'):
        global_step = tf.Variable(0, name='global_step', trainable=False)
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)

    g_train_op = opt.minimize(g_loss, global_step=global_step, var_list=G_vars)
    d_train_op = opt.minimize(d_loss, global_step=global_step, var_list=D_vars)
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=100)

    ckpt = tf.train.get_checkpoint_state('./checkpoints/srgan/')
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        log.info("ckpt:{}".format(ckpt.model_checkpoint_path))
        log.info("Model load success ... ")

    x_train = np.load('x_train.npy')
    total_batch_count = len(range(0,
                                  len(x_train) - batch_size + 1, batch_size))
    log.info("total bc: {}".format(total_batch_count))
    for i in range(epoch):
        bc = -1
        for x_batch_images in minibatches(inputs=x_train,
                                          batch_size=batch_size):
            bc += 1
            x_batch = normalize(x_batch_images)
            log.info("{}".format(x_batch.shape))
            sess.run([g_train_op, d_train_op],
                     feed_dict={
                         x: x_batch,
                         is_training: True
                     })
            g, d = sess.run([g_loss, d_loss],
                            feed_dict={
                                x: x_batch,
                                is_training: True
                            })

            if bc % 50 == 0:
                log.info("epoch:{}, bc:{}, gloss:{}, dloss:{}".format(
                    epoch, bc, g, d))

            if bc % 500 == 0:
                model_name = "srgan"
                checkpoint_dir = './checkpoints/srgan/'

                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                savename = os.path.join(checkpoint_dir, model_name)
                saver.save(sess, savename)

                log.info("Modle Save success :{} ".format(savename))
Example #23
import os
import time
from glob import glob
from six.moves import xrange
import pprint
import numpy as np
import tensorflow as tf

from pyduyp.utils.dl.ops.Convolution import conv2d as conv2d
from pyduyp.utils.dl.ops.Convolution import deconv2d as deconv2d
from pyduyp.utils.dl.ops.batchnorm import Contrib_batch_norm as batch_norm
from pyduyp.utils.image_utils import save_images, get_image
from pyduyp.utils.dl.ops.variable_ops import lrelu, conv_out_size_same, show_all_variables
from pyduyp.utils.dl.ops.Linear import linear as linear

from pyduyp.logger.log import log
log.info("================= DCGAN Runing =================")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

flags = tf.app.flags
flags.DEFINE_float("learning_rate", 0.0002,
                   "Learning rate of for adam [0.0002]")
flags.DEFINE_float("beta1", 0.5, "Momentum term of adam [0.5]")

flags.DEFINE_integer("epoch", 15, "Epoch to train [15]")
flags.DEFINE_integer("train_size", np.inf, "The size of train images [np.inf]")
flags.DEFINE_integer("batch_size", 64, "The size of batch images [64]")

flags.DEFINE_integer("sample_num", 64, "The size of batch images [64]")

flags.DEFINE_integer(
    "input_height", 108,
Example #24
def train_nnmodel(epoch,
                  learning_rate,
                  batch_size,
                  data_path='datasets/results',
                  data_name="train.csv",
                  class_number=2,
                  checkpoint_dir="datasets/results/models"):
    root = os.path.dirname(os.path.realpath(__file__))

    data_path = os.path.join(root, data_path, data_name)
    df_ohe = pd.read_csv(data_path)
    log.info("{}".format(df_ohe.shape))
    df_ohe = shuffle(df_ohe)
    train_y = df_ohe['label']
    train_y = pd.get_dummies(train_y)

    del df_ohe['label']
    train_x = df_ohe

    x_data_holder = tf.placeholder(tf.float32, [None, train_x.shape[1]],
                                   name='inputs_x')
    y_data_holder = tf.placeholder(tf.float32, [None, class_number],
                                   name='inputs_y')
    y_prediction = neural_networks(x_data_holder, train_x.shape[1],
                                   class_number)
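    # Softmax cross-entropy loss optimised with plain gradient descent.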
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_data_holder,
                                                logits=y_prediction))
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(
        loss)

    y_pre_max = tf.argmax(y_prediction, axis=1)  # index of the largest predicted value
    y_train_max = tf.argmax(y_data_holder, axis=1)  # index of the largest true value
    correct_prediction = tf.equal(y_pre_max, y_train_max)  # element-wise bool
    bool2float = tf.cast(correct_prediction, tf.float32)  # cast bool to float32
    accuracy = tf.reduce_mean(bool2float)  # accuracy

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=5)

        for e in range(epoch):
            counter = 0
            batch_count = len(train_x) // batch_size
            for batch_x, batch_y in minibatches(inputs=train_x,
                                                targets=train_y,
                                                batch_size=batch_size,
                                                shuffle=False):
                sess.run(train_step,
                         feed_dict={
                             x_data_holder: batch_x,
                             y_data_holder: batch_y
                         })
                train_loss = sess.run(loss,
                                      feed_dict={
                                          x_data_holder: batch_x,
                                          y_data_holder: batch_y
                                      })
                train_acc = sess.run(accuracy,
                                     feed_dict={
                                         x_data_holder: batch_x,
                                         y_data_holder: batch_y
                                     })
                if np.mod(counter, 10) == 1:
                    log_out = "Epoch:{} Batch Count: {}/{},  Train Accuracy: {:06f}; Loss: {:06f}"
                    log.info(
                        log_out.format(e, counter, batch_count, train_acc,
                                       train_loss))
                counter += 1
                if np.mod(counter, 10) == 1:
                    checkpoint_name = os.path.join(root, checkpoint_dir)
                    if not os.path.exists(checkpoint_name):
                        os.makedirs(checkpoint_name)
                    saver.save(sess,
                               save_path=os.path.join(
                                   checkpoint_name,
                                   "{}.model".format(counter)))
                    log.debug(" Model {} have save success ...".format(
                        checkpoint_name))
Example #25
import pandas as pd
import os
from tqdm import tqdm
from collections import OrderedDict
import numpy as np
from pyduyp.utils.utils import time2day, time2mouth, time2week
from pyduyp.utils.utils import compute_interval_of_day
from pyduyp.utils.utils import get_freq_of_day_and_month, get_week_freq, get_type_freq
from pyduyp.utils.utils import com_mode, pandas_quantile
from pyduyp.logger.log import log
from pyduyp.utils.large_datastruts import citysdict, continentdicts, countrydicts, provincedicts, agesdicts

log.info("Start runing ...")


def get_pos_action_by_id(step='train'):
    # From the order history, collect the user ids with premium orders (orderType == 1) and split their rows out of the action table.
    pos_root = 'Order_predicts/datasets/results/{}/action_pos/'.format(step)
    if not os.path.exists(pos_root):
        os.makedirs(pos_root)

    action = pd.read_csv("Order_predicts/datasets/{}/action_{}.csv".format(
        step, step))
    orderHistory = pd.read_csv(
        "Order_predicts/datasets/{}/orderHistory_{}.csv".format(step, step),
        usecols=['userid', 'orderTime', 'orderType'])
    data_pos = orderHistory[orderHistory['orderType'].isin(
        ['1'])]['userid'].values
    pos_ids = []
    for posid in tqdm(data_pos):
        pos_features = {}
Example #26
def get_action_features(step):
    pos_root = 'Order_predicts/datasets/results/{}/action_pos/'.format(step)
    neg_root = 'Order_predicts/datasets/results/{}/action_neg/'.format(step)
    if step == 'test':
        neg_root = 'Order_predicts/datasets/results/{}/action/'.format(step)
    if not os.path.exists(pos_root):
        os.makedirs(pos_root)
    if not os.path.exists(neg_root):
        os.makedirs(neg_root)
    for root in [pos_root, neg_root]:
        if 'pos' in root and step == 'test':
            continue
        base_name = root.split("/")[-2]
        history_path = "Order_predicts/datasets/results/{}/history_{}".format(
            step,
            base_name.split('_')[-1])
        actions = []
        for file in tqdm(os.listdir(root)):
            rows = OrderedDict()
            aid = file.split(".")[0]
            rows['id'] = aid
            if step == 'train':
                rows['label'] = 1 if base_name == 'action_pos' else 0
            else:
                rows['label'] = 0
            data = pd.read_csv(os.path.join(root, file))
            data_types = data['actionType'].values.tolist()
            data_copy = data.copy()
            data_copy['time2days'] = data['actionTime'].apply(time2day)
            data_copy['time2mouth'] = data['actionTime'].apply(time2mouth)
            data_copy['time2week'] = data['actionTime'].apply(time2week)
            data_copy_grouped_day = data_copy.groupby(by='time2days')
            data_copy_grouped_month = data_copy.groupby(by='time2mouth')
            time_counts, two_interval = compute_interval_of_day(data_copy)
            quantiles = pandas_quantile(time_counts)
            if len(time_counts) > 4:
                last_times = time_counts[-4:-1]
            else:
                last_times = [0, 0, 0]
            type_freq, types_sum = get_type_freq(data)  # count per action type, total number of actions
            if os.path.isfile(os.path.join(history_path, file)):
                history = pd.read_csv(os.path.join(history_path, file))
                historydata_copy = history.copy()
                historydata_copy['time2days'] = data['actionTime'].apply(
                    time2day)
                historyinterval, two_historyinterval = compute_interval_of_day(
                    historydata_copy)
                if len(historyinterval) <= 1:
                    historyinterval = [0, 0, 0, 0, 0]
            else:
                historyinterval = [0, 0, 0, 0, 0]

            rows['2_t1'] = type_freq['2_t1']  # click totals for action types 1-9
            rows['3_t2'] = type_freq['3_t2']
            rows['4_t3'] = type_freq['4_t3']
            rows['5_t4'] = type_freq['5_t4']
            rows['6_t5'] = type_freq['6_t5']
            rows['7_t6'] = type_freq['7_t6']
            rows['8_t7'] = type_freq['8_t7']
            rows['9_t8'] = type_freq['9_t8']
            rows['10_t9'] = type_freq['10_t9']
            rows['11_rate1'] = type_freq['2_t1'] / types_sum  # share of app-open actions
            rows['12_rate9'] = type_freq['10_t9'] / types_sum  # share of order actions
            rows['13_atmean'] = np.mean(time_counts)  # time-interval mean
            rows['14_atstd'] = np.std(time_counts)  # time-interval std
            rows['15_atmedian'] = np.median(time_counts)  # time-interval median
            rows['16_tmode'] = com_mode(time_counts)  # time-interval mode
            rows['17_atptp'] = np.max(time_counts) - np.min(
                time_counts) if len(time_counts) > 0 else 0  # time-interval range
            rows['18_atvar'] = np.var(time_counts)  # time-interval variance
            rows['19_xishu'] = np.mean(time_counts) / np.std(
                time_counts) if len(time_counts) > 1 else 0  # time-interval coefficient of variation
            rows['20_lastmean'] = np.mean(last_times)  # mean of the last three day intervals
            rows['21_laststd'] = np.std(last_times)  # std of the last three day intervals
            rows['22_dayrate'] = get_freq_of_day_and_month(
                data_copy_grouped_day)  # daily frequency
            rows['23_monthrate'] = get_freq_of_day_and_month(
                data_copy_grouped_month)  # monthly frequency
            rows['24_weekrate'] = get_week_freq(
                data_copy['time2week'].values)  # weekly frequency
            rows['25_atmean'] = np.mean(data_types)  # action-type mean
            rows['26_atstd'] = np.std(data_types)  # action-type std
            rows['27_atmedian'] = np.median(data_types)  # action-type median
            rows['28_tmode'] = com_mode(data_types)  # action-type mode
            rows['29_atptp'] = np.max(data_types) - np.min(data_types) if len(
                data_types) > 0 else 0  # action-type range
            rows['30_atvar'] = np.var(data_types)  # action-type variance
            rows['31_xishu'] = np.mean(data_types) / np.std(data_types) if len(
                data_types) > 1 else 0  # action-type coefficient of variation
            rows['32_rate2'] = type_freq['3_t2'] / types_sum  # share of type 2
            rows['33_rate3'] = type_freq['4_t3'] / types_sum  # share of type 3
            rows['34_rate4'] = type_freq['5_t4'] / types_sum  # share of type 4
            rows['35_rate5'] = type_freq['6_t5'] / types_sum  # share of type 5
            rows['36_rate6'] = type_freq['7_t6'] / types_sum  # share of type 6
            rows['37_rate7'] = type_freq['8_t7'] / types_sum  # share of type 7
            rows['38_rate8'] = type_freq['9_t8'] / types_sum  # share of type 8
            rows['39_htptp'] = np.max(historyinterval) - np.min(
                historyinterval)  # range of history order intervals
            rows['40_atmean'] = np.mean(two_interval)  # interval mean
            rows['41_atstd'] = np.std(two_interval)  # interval std
            rows['42_atmedian'] = np.median(two_interval)  # interval median
            rows['43_tmode'] = com_mode(two_interval)  # interval mode
            rows['44_atptp'] = np.max(two_interval) - np.min(
                two_interval) if len(two_interval) > 0 else 0  # interval range
            rows['45_atvar'] = np.var(two_interval)  # interval variance
            rows['46_xishu'] = np.mean(two_interval) / np.std(
                two_interval) if len(two_interval) > 1 else 0  # interval coefficient of variation
            rows['47_quantile2'] = quantiles[0]
            rows['48_quantile4'] = quantiles[1]

            actions.append(rows)

        df = pd.DataFrame(actions)
        df = df.replace(np.inf, 100)
        df = df.round(7)
        df = df.round({'label': 0, 'id': 0})
        save_name = "Order_predicts/datasets/results/{}/{}_features.csv".format(
            step, base_name)
        if step == 'test':
            del df['label']
        df.to_csv(save_name, index=None)
        log.info(" !!! {}".format(save_name))