Example #1
def rebuild_table(table_name):
    count = get_table_count(table_name)
    if not count:
        print("no rows in das.{0}".format(table_name))
        return

    definition = ",\n".join([
        "{0} {1}".format(d[0], d[1]) for d in get_table_definition(table_name)
    ])
    clusters = int(ceil(count / 1e7)) * 113
    time_stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    select_columns = ",".join([d[0] for d in get_table_definition(table_name)])
    print(datetime.now())
    sql = """
    DROP TABLE IF EXISTS {0}_{1};
    CREATE TABLE {0}_{1} (
      {2}
    ) CLUSTERED BY (id) INTO {3} BUCKETS
    STORED AS ORC;
    """.format(table_name, time_stamp, definition, clusters)
    Hive("local").execute(sql)
    print(datetime.now())
    sql = """
    INSERT INTO TABLE das.{0}_{1}
    SELECT {2}
    FROM das.{0};
    """.format(table_name, time_stamp, select_columns)
    Hive("local").execute(sql)
    print(datetime.now())
    sql = """
    DROP TABLE das.{0};
    ALTER TABLE das.{0}_{1} RENAME TO das.{0};
    """.format(table_name, time_stamp)
    Hive("local").execute(sql)
    print(datetime.now())
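Example #1 rebuilds a das table into a freshly bucketed ORC copy and swaps it in via RENAME. A minimal driver sketch, assuming rebuild_table() and its helpers (get_table_count, get_table_definition, Hive) live together in one importable module; the module name maintenance is hypothetical and the table list is illustrative (the name comes from Example #17):

# Hypothetical module name; rebuild_table is the function from Example #1.
from maintenance import rebuild_table

if __name__ == "__main__":
    # Rebuild each listed das table in turn.
    for table_name in ["ci_diaper2014to2016_0406_allplatform_distinct"]:
        rebuild_table(table_name)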
Example #2
def transform(**kwargs):
    sql = """
        USE elengjing_tj;
        DROP TABLE IF EXISTS shop_{shop_id};
        ALTER TABLE t_shop_{shop_id} RENAME TO shop_{shop_id};
        DROP TABLE IF EXISTS shop_{shop_id}_item;
        CREATE TABLE shop_{shop_id}_item AS
        SELECT
            category_id,
            customer_item_id AS item_id,
            count(*) as cnt
        FROM elengjing_tj.shop_{shop_id}
        GROUP BY
            category_id,
            customer_item_id
        UNION ALL
        SELECT
            category_id,
            target_item_id AS item_id,
            count(*) as cnt
        FROM elengjing_tj.shop_{shop_id}
        GROUP BY
            category_id,
            target_item_id;

        DROP TABLE IF EXISTS t_shop_{shop_id}_target;
        DROP TABLE IF EXISTS t_shop_{shop_id}_customer;
        DROP TABLE IF EXISTS t_shop_{shop_id};
        DROP TABLE IF EXISTS t_shop_{shop_id}_scope;
        """.format(**kwargs)
    if kwargs.get("debug"):
        print(sql)
    else:
        Hive("idc").execute(sql)
Example #3
def transform(**kwargs):
    sql = """
        USE elengjing_tj;
        DROP TABLE IF EXISTS shop_{shop_id}_case_{case};
        ALTER TABLE t_shop_{shop_id}_case_{case} RENAME TO shop_{shop_id}_case_{case};
        DROP TABLE IF EXISTS shop_{shop_id}_case_{case}_item;
        CREATE TABLE shop_{shop_id}_case_{case}_item AS (
        SELECT
            category_id,
            customer_item_id AS item_id,
            count(*) as cnt
        FROM elengjing_tj.shop_{shop_id}_case_{case}
        GROUP BY
            category_id,
            customer_item_id
        UNION ALL
        SELECT
            category_id,
            target_item_id AS item_id,
            count(*) as cnt
        FROM elengjing_tj.shop_{shop_id}_case_{case}
        GROUP BY
            category_id,
            target_item_id
        );
        DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_target;
        DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_customer;
        DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case};
        DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_scope;
        """.format(**kwargs)
    Hive("idc").execute(sql)
Example #4
def get_shop_category(**kwargs):
    sql = """
    SELECT
        DISTINCT categoryid
    FROM elengjing.women_clothing_item
    WHERE shopid = {shop_id}
    """.format(**kwargs)
    return list(map(lambda x: x[0], Hive("idc").query(sql, to_dict=False)))
Example #5
def drop(is_event, show_sql=False):
    sql = """
    USE elengjing_price;
    DROP TABLE IF EXISTS predict_{0};
    """.format("event" if is_event else "daily")
    if show_sql:
        print(sql)
    else:
        Hive("idc").execute(sql)
Example #6
def create_table_item_tagged():
    sql = """
    USE elengjing;
    DROP TABLE IF EXISTS women_clothing_item_attr_t;
    CREATE TABLE women_clothing_item_attr_t AS
    SELECT itemid, CONCAT_WS(',', COLLECT_SET(CONCAT(attrname, ':', attrvalue))) AS data
    FROM women_clothing_item_attr
    GROUP BY itemid;
    """
    Hive(env=ENV).execute(sql)
Example #7
def insert_attrvalue_to_pg():
    ret = Hive(env=ENV).query("SELECT attrvalue FROM mpintranet.attrvalue_result LIMIT 1")
    if len(ret) > 0:
        attrvalues = ret[0]["attrvalue"]
        rows = PostgreSQL().query("SELECT * FROM text_value WHERE name = 'attrname_attrvalue_columns'")
        if len(rows) > 0:
            sql = "UPDATE text_value SET value = '{0}' WHERE name = 'attrname_attrvalue_columns'".format(attrvalues)
        else:
            sql = "INSERT INTO text_value (name, value) VALUES('attrname_attrvalue_columns', '{0}')".format(attrvalues)
        PostgreSQL().execute(sql)
Example #8
def get_minerals_from_hive():
    sql = """
    SELECT
        customer_attr,
        target_attr,
        customer_item_id,
        target_item_id
    FROM t_elengjing.competitive_filtering_stage_1
    LIMIT 10000;
    """
    pickle_dump("raw_data", Hive(env="idc").query(sql=sql, to_dict=False))
Example #9
def create(is_event=True, show_sql=False):
    sql = """
    USE elengjing_price;
    CREATE TABLE predict_{0} (
        itemid BIGINT,
        categoryid BIGINT,
        price DECIMAL(20, 2)
    )
    STORED AS ORC;
    """.format("event" if is_event else "daily")
    if show_sql:
        print(sql)
    else:
        Hive("idc").execute(sql)
Example #10
def create_table(**kwargs):
    sql = """
        USE elengjing_tj;
        DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case};
        CREATE TABLE t_shop_{shop_id}_case_{case} (
            category_id BIGINT,
            customer_shop_id BIGINT,
            customer_item_id BIGINT,
            customer_platform_id BIGINT,
            target_shop_id BIGINT,
            target_item_id BIGINT,
            target_platform_id BIGINT
        )
        CLUSTERED BY (customer_item_id) INTO 113 BUCKETS
        STORED AS ORC;
        """.format(**kwargs)
    Hive("idc").execute(sql)
Example #11
def get_hive_train_data(category_id, is_event=False):
    from mplib.IO import Hive
    sql = """
    SELECT data AS data
    FROM elengjing_price.train_{category_id}_{model_type}
    ORDER BY RAND(1000000)
    LIMIT 100000
    """.format(category_id=category_id,
               model_type="event" if is_event else "daily")
    lines = [line.get("data") for line in Hive("idc").query(sql)]
    train_x = []
    train_y = []
    for line in lines:
        line = [float(num) for num in line.split(",")]
        train_x.append(line[1:])
        train_y.append(line[0])
    return numpy.nan_to_num(numpy.array(train_x)), vrm(
        numpy.nan_to_num(numpy.array(train_y)))
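The predict step in Example #16 loads pickles named sklearn_randomforest_{category_id}_{model_type}.pickle, so a plausible next step for these arrays is fitting and pickling a scikit-learn random forest. This is a hedged sketch only: the estimator, its parameters and the category id are assumptions, not taken from the source.

import pickle

from sklearn.ensemble import RandomForestRegressor

category_id = 162104  # placeholder category id
train_x, train_y = get_hive_train_data(category_id, is_event=False)

# Assumed model choice, matching only the pickle naming used in Example #16.
model = RandomForestRegressor(n_estimators=100, n_jobs=-1)
model.fit(train_x, train_y)

with open("sklearn_randomforest_{0}_daily.pickle".format(category_id), "wb") as f:
    pickle.dump(model, f)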
Example #12
def get_features(**kwargs):
    sql = """
    USE t_elengjing;
    ADD FILE /home/udflib/serverudf/elengjing/{udf};
    DROP TABLE IF EXISTS competitive_item_train_stage_1;
    CREATE TABLE competitive_item_train_stage_1(
        customer_item_id STRING,
        target_item_id STRING,
        similarity STRING,
        score STRING,
        category_id STRING
    ) STORED AS ORC;
    INSERT INTO competitive_item_train_stage_1
    SELECT TRANSFORM(ct.data, tt.data, ct.itemid, tt.itemid, l.score, l.categoryid)
    USING 'python {udf} {category_id}' AS (customer_item_id, target_item_id, similarity, score, category_id)
    FROM competitive_manual_label AS l
    JOIN elengjing.women_clothing_item_attr_t AS ct
    ON ct.itemid = l.sourceitemid
    JOIN elengjing.women_clothing_item_attr_t AS tt
    ON tt.itemid = l.targetitemid;

    DROP TABLE IF EXISTS competitive_item_train_stage_2;
    CREATE TABLE competitive_item_train_stage_2 AS
    SELECT
        a.customer_item_id,
        a.target_item_id,
        a.category_id,
        cv.tag AS customer_item_dummy,
        tv.tag AS target_item_dummy,
        cn.attrvalue AS customer_item_attr,
        tn.attrvalue AS target_item_attr,
        CASE WHEN a.score < 0.5 THEN 0 ELSE 1 END AS score
    FROM competitive_item_train_stage_1 AS a
    JOIN mpintranet.attrname_export AS cn
    ON a.customer_item_id = cn.itemid
    JOIN mpintranet.attrname_export AS tn
    ON a.target_item_id = tn.itemid
    JOIN mpintranet.attrvalue_export AS cv
    ON a.customer_item_id = cv.itemid
    JOIN mpintranet.attrvalue_export AS tv
    ON a.target_item_id = tv.itemid
    WHERE a.category_id = {category_id};
    """.format(**kwargs)
    Hive(env="idc").execute(sql)
Example #13
def create_table(**kwargs):
    sql = """
        USE elengjing_tj;
        DROP TABLE IF EXISTS t_shop_{shop_id};
        CREATE TABLE t_shop_{shop_id} (
            category_id BIGINT,
            customer_shop_id BIGINT,
            customer_item_id BIGINT,
            customer_platform_id BIGINT,
            target_shop_id BIGINT,
            target_item_id BIGINT,
            target_platform_id BIGINT
        )
        CLUSTERED BY (customer_item_id) INTO 791 BUCKETS
        STORED AS ORC;
        """.format(**kwargs)
    if kwargs.get("debug"):
        print(sql)
    else:
        Hive("idc").execute(sql)
Example #14
def insert(category_id, is_event, show_sql=False):
    raw_sql = """
    USE elengjing_price;
    INSERT INTO predict_{0}
    SELECT
        itemid,
        {1} AS categoryid,
        CASE
            WHEN price IS NULL OR price <= 0 THEN -1
            ELSE price
        END AS price
    FROM predict_{1}_{0};
    """
    if isinstance(category_id, list):
        sql = ";\n".join([raw_sql.format("event" if is_event else "daily", c) for c in category_id])
    else:
        sql = raw_sql.format("event" if is_event else "daily", category_id)
    if show_sql:
        print(sql)
    else:
        Hive("idc").execute(sql)
Example #15
def gen_attribute_dictionary(out=False):
    sql = """
    DROP TABLE IF EXISTS elengjing_price.attr_value_text;
    CREATE TABLE IF NOT EXISTS elengjing_price.attr_value_text(
        categoryid STRING,
        attr STRING
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
    LOAD DATA LOCAL INPATH '/home/data/attr_value.csv' OVERWRITE INTO TABLE elengjing_price.attr_value_text;


    DROP TABLE IF EXISTS elengjing_price.attr_value;
    CREATE TABLE elengjing_price.attr_value
    STORED AS ORC
    AS
    SELECT *
    FROM elengjing_price.attr_value_text;
    """
    if out:
        print(sql)
    else:
        Hive("idc").execute(sql)
Example #16
def execute_hive_predict_data(category_id,
                              is_event=False,
                              show_sql=False,
                              udf="udf_item_pricing13.py"):
    from mplib.IO import Hive
    sql = """
        USE elengjing_price;
        ADD FILE /home/script/normal_servers/serverudf/elengjing/{udf};
        ADD FILE /home/script/normal_servers/serverudf/elengjing/data/pricing_model/sklearn_randomforest_{category_id}_{model_type}.pickle;

        DROP TABLE IF EXISTS tmp_{category_id};
        CREATE TABLE tmp_{category_id} (
            itemid STRING,
            data STRING
        ) CLUSTERED BY (itemid) INTO 1017 BUCKETS
        STORED AS ORC;

        INSERT INTO tmp_{category_id}
        SELECT
            i.itemid,
            CONCAT_WS(
                ',',
                NVL(CAST(i.avg_price AS STRING), '0'),
                NVL(CAST((UNIX_TIMESTAMP() - TO_UNIX_TIMESTAMP(i.listeddate)) / 86400.0 AS STRING), '0'),
                NVL(CAST(shop.level AS STRING), '0'),
                NVL(CAST(shop.favor AS STRING), '0'),
                NVL(CAST(s.all_spu AS STRING), '0'),
                NVL(CAST(s.all_sales_rank AS STRING), '0'),
                NVL(CAST(sc.category_spu AS STRING), '0'),
                NVL(CAST(sc.category_sales_rank AS STRING), '0'),
                NVL(CAST(s.shop_avg_price AS STRING), '0'),
                NVL(CAST(s.shop_avg_price_rank AS STRING), '0'),
                NVL(CAST(sc.category_avg_price AS STRING), '0'),
                NVL(CAST(sc.category_avg_price_rank AS STRING), '0'),
                v.data
            ) AS data
        FROM (
            SELECT
                shopid, itemid,
                MAX(listeddate) AS listeddate,
                CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS avg_price
            FROM transforms.women_clothing_item_new_dict
            WHERE categoryid = {category_id}
            GROUP BY
                shopid, itemid
        ) AS i
        JOIN (
            SELECT
                shopid,
                COUNT(DISTINCT itemid) AS all_spu,
                CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS shop_avg_price,
                ROW_NUMBER() OVER(ORDER BY SUM(salesamt) DESC) AS all_sales_rank,
                ROW_NUMBER() OVER(ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS shop_avg_price_rank
            FROM transforms.women_clothing_item_new_dict
            GROUP BY
                shopid
        ) AS s
        ON i.shopid = s.shopid
        JOIN (
            SELECT
                shopid,
                COUNT(DISTINCT itemid) AS category_spu,
                ROW_NUMBER() OVER(ORDER BY SUM(salesamt) DESC) AS category_sales_rank,
                ROW_NUMBER() OVER(ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS category_avg_price_rank,
                CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS category_avg_price
            FROM transforms.women_clothing_item_new_dict
            WHERE categoryid = {category_id}
            GROUP BY
                shopid
            ) AS sc
        ON i.shopid = sc.shopid
        JOIN elengjing_base.shop AS shop
        ON i.shopid = shop.shopid
        JOIN elengjing_price.attr_itemid_value_matrix_{category_id} AS v
        ON i.itemid = v.itemid;

        DROP TABLE IF EXISTS predict_{category_id}_{model_type};
        CREATE TABLE predict_{category_id}_{model_type} (
            itemid STRING,
            price DECIMAL(20,2)
        ) STORED AS ORC;

        INSERT INTO predict_{category_id}_{model_type}
        SELECT TRANSFORM(itemid, data)
        USING 'python {udf} {category_id} {model_type}' AS (itemid, price)
        FROM tmp_{category_id};

        DROP TABLE IF EXISTS tmp_{category_id};
        """.format(category_id=category_id,
                   model_type="event" if is_event else "daily",
                   udf=udf)
    if show_sql:
        print(sql)
    else:
        Hive("idc").execute(sql)
Example #17
def get_data():
    sql = "SELECT COUNT(*) AS cnt FROM das.ci_diaper2014to2016_0406_allplatform_distinct AS d WHERE d.year = '2016'"
    data = Hive(env="local").query(sql)
    pickle_dump("raw", data)
Example #18
def get_gold_from_hive():
    sql = "SELECT * FROM t_elengjing.competitive_item_train_stage_3"
    pickle_dump("train_data", Hive(env="idc").query(sql, to_dict=False))
Example #19
def get_table_count(table_name):
    sql = "SELECT COUNT(*) AS cnt FROM das.{0}".format(table_name)
    return Hive("local").query(sql)[0].get("cnt")
Example #20
from mplib.IO import Hive
from mplib import *
import traceback
import sys

if __name__ == "__main__":
    reload(sys)
    sys.setdefaultencoding("utf8")

    try:
        a = SKAssess()
        a.category_id = sys.argv[1]
        a.interval = smart_decode(sys.argv[2])
        sql = "SELECT itemid AS itemid, data AS data FROM elengjing_price.tmp_{0} LIMIT 10".format(
            a.category_id)
        data = Hive("idc").query(sql)
        data = [
            "\t".join([str(line.get("itemid")),
                       str(line.get("data"))]) for line in data
        ]
        data = list(
            map(lambda x: smart_decode(x).replace("\n", "").replace("\r", "").split("\t"),
                data))
        items, data = split_id_feature(data)
        a.x_predict, a.y_predict = split_x_y(data)
        print(a.y_predict.shape)
        print(a.x_predict.shape)
        a.path = "/Users/panjunjun/PycharmProjects/Lab/IO/sklearn_randomforest_{0}_daily.pickle".format(
            a.category_id)
        a.load_model()
    except Exception:
        # Assumed handler: traceback is imported at the top of this example.
        traceback.print_exc()
Example #21
def show_tables():
    print(Hive().query("show tables"))
Example #22
def gen_attribute_matrix(category_id, keep_tmp=False, out=False):
    sql = """
    DROP TABLE IF EXISTS elengjing_price.attr_unique_itemid_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.attr_unique_itemid_{category_id}(
        itemid BIGINT
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE elengjing_price.attr_unique_itemid_{category_id}
    SELECT itemid
    FROM transforms.women_clothing_item_attr
    WHERE categoryid = {category_id}
    AND attrname != '品牌' GROUP BY itemid;

    DROP TABLE IF EXISTS elengjing_price.item_attr_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.item_attr_{category_id}(
        itemid BIGINT,
        attr STRING
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE elengjing_price.item_attr_{category_id}
    SELECT
        itemid,
        CONCAT(attrname, ":", attrvalue)
    FROM transforms.women_clothing_item_attr
    WHERE categoryid = {category_id}
    AND attrname != '品牌';

    DROP TABLE IF EXISTS elengjing_price.item_attr_1_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.item_attr_1_{category_id}(
        itemid BIGINT,
        attr STRING
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE elengjing_price.item_attr_1_{category_id}
    SELECT
        itemid,
        attr
    FROM elengjing_price.item_attr_{category_id}
    GROUP BY
        itemid,
        attr;

    DROP TABLE IF EXISTS elengjing_price.attr_itemid_value_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.attr_itemid_value_{category_id}(
        itemid BIGINT,
        attr STRING
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE elengjing_price.attr_itemid_value_{category_id}
    SELECT
        t1.itemid,
        t2.attr
    FROM elengjing_price.attr_unique_itemid_{category_id} AS t1
    CROSS JOIN elengjing_price.attr_value AS t2
    WHERE t2.categoryid = {category_id};

    DROP TABLE IF EXISTS elengjing_price.attr_itemid_value_data_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.attr_itemid_value_data_{category_id}(
        itemid BIGINT,
        attr STRING,
        data INT
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;

    SET mapred.reduce.tasks = 1000;

    INSERT INTO TABLE elengjing_price.attr_itemid_value_data_{category_id}
    SELECT
        t1.*,
        IF(NVL(t2.itemid, 0) = 0, 0, 1)
    FROM elengjing_price.attr_itemid_value_{category_id} AS t1
    LEFT JOIN elengjing_price.item_attr_1_{category_id} AS t2
    ON t1.itemid = t2.itemid
    AND t1.attr = t2.attr;

    DROP TABLE IF EXISTS elengjing_price.attr_itemid_value_matrix_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.attr_itemid_value_matrix_{category_id}(
        itemid BIGINT,
        attr STRING,
        data STRING
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE elengjing_price.attr_itemid_value_matrix_{category_id}
    SELECT
        itemid,
        CONCAT_WS(',', COLLECT_LIST(attr)) AS attr,
        CONCAT_WS(',', COLLECT_LIST(CAST(data AS STRING))) AS data
    FROM (
        SELECT
            itemid,
            attr,
            data,
            row_number() OVER (PARTITION BY itemid ORDER BY attr DESC) AS rank
        FROM elengjing_price.attr_itemid_value_data_{category_id}
    ) AS a
    GROUP BY itemid;
    """.format(category_id=category_id)
    if out:
        print(sql)
    else:
        Hive("idc").execute(sql)

    if not keep_tmp:
        sql = """
        USE elengjing_price;
        DROP TABLE attr_itemid_value_{category_id};
        DROP TABLE attr_itemid_value_data_{category_id};
        DROP TABLE attr_unique_itemid_{category_id};
        DROP TABLE item_attr_1_{category_id};
        DROP TABLE item_attr_{category_id};
        """.format(category_id=category_id)
        if out:
            print(sql)
        else:
            Hive("idc").execute(sql)
Example #23
def insert_by_category(**kwargs):
    sql = """
        USE elengjing_tj;
        DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_customer;
        CREATE TABLE t_shop_{shop_id}_case_{case}_customer (
            shopid BIGINT,
            categoryid BIGINT,
            itemid BIGINT,
            platformid BIGINT,
            avg_price FLOAT
        )
        CLUSTERED BY (itemid) INTO 113 BUCKETS
        STORED AS ORC;

        INSERT INTO TABLE t_shop_{shop_id}_case_{case}_customer
        SELECT
                shopid,
                categoryid,
                itemid,
                platformid,
                CASE
                    WHEN NVL(SUM(salesqty), 0) = 0
                    THEN 0
                    ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty)
                END AS avg_price
            FROM elengjing.women_clothing_item
            WHERE shopid = {shop_id}
            AND categoryid = {category_id}
            GROUP BY
                shopid,
                categoryid,
                itemid,
                platformid;

        DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_target;
        CREATE TABLE t_shop_{shop_id}_case_{case}_target (
            shopid BIGINT,
            categoryid BIGINT,
            itemid BIGINT,
            platformid BIGINT,
            avg_price FLOAT
        )
        CLUSTERED BY (itemid) INTO 113 BUCKETS
        STORED AS ORC;

        INSERT INTO t_shop_{shop_id}_case_{case}_target
        SELECT
                shopid,
                categoryid,
                itemid,
                platformid,
                CASE
                    WHEN NVL(SUM(salesqty), 0) = 0
                    THEN 0
                    ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty)
                END AS avg_price
            FROM elengjing.women_clothing_item
            WHERE shopid != {shop_id}
            AND categoryid = {category_id}
            GROUP BY
                shopid,
                categoryid,
                itemid,
                platformid;

        INSERT INTO elengjing_tj.t_shop_{shop_id}_case_{case}
        SELECT
            c.categoryid AS category_id,
            c.shopid AS customer_shop_id,
            c.itemid AS customer_item_id,
            c.platformid AS customer_platform_id,
            t.shopid AS target_shop_id,
            t.itemid AS target_item_id,
            t.platformid AS target_platform_id
        FROM t_shop_{shop_id}_case_{case}_customer AS c
        CROSS JOIN t_shop_{shop_id}_case_{case}_target AS t
        WHERE t.avg_price BETWEEN 0.9 * c.avg_price AND 1.1 * c.avg_price;
        """.format(**kwargs)
    Hive("idc").execute(sql)
Example #24
def insert_by_category(**kwargs):
    sql = """
        USE elengjing_tj;
        DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_customer;
        CREATE TABLE t_shop_{shop_id}_case_{case}_customer (
            shopid BIGINT,
            categoryid BIGINT,
            itemid BIGINT,
            platformid BIGINT,
            avg_price FLOAT
        )
        CLUSTERED BY (itemid) INTO 113 BUCKETS
        STORED AS ORC;


        INSERT INTO TABLE t_shop_{shop_id}_case_{case}_customer
        SELECT
                shopid,
                categoryid,
                itemid,
                platformid,
                CASE
                    WHEN NVL(SUM(salesqty), 0) = 0
                    THEN 0
                    ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty)
                END AS avg_price
            FROM elengjing.women_clothing_item
            WHERE shopid = {shop_id}
            AND categoryid = {category_id}
            GROUP BY
                shopid,
                categoryid,
                itemid,
                platformid;


        DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_target;
        CREATE TABLE t_shop_{shop_id}_case_{case}_target (
            shopid BIGINT,
            categoryid BIGINT,
            itemid BIGINT,
            platformid BIGINT,
            avg_price FLOAT
        )
        CLUSTERED BY (itemid) INTO 113 BUCKETS
        STORED AS ORC;

        INSERT INTO TABLE t_shop_{shop_id}_case_{case}_target
        SELECT
            b.shopid,
            b.categoryid,
            b.itemid,
            b.platformid,
            b.avg_price
        FROM (
            SELECT
                a.shopid,
                a.categoryid,
                a.itemid,
                a.platformid,
                a.avg_price,
                1.0 * SUM(a.qty) OVER (ORDER BY a.qty DESC, a.qty ROWS UNBOUNDED PRECEDING) / t.qty AS c_ratio
            FROM (
                SELECT
                    shopid,
                    categoryid,
                    itemid,
                    platformid,
                    CASE
                        WHEN NVL(SUM(salesqty), 0) = 0
                        THEN 0
                        ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty)
                    END AS avg_price,
                    SUM(salesqty) AS qty
                FROM elengjing.women_clothing_item
                WHERE shopid != {shop_id}
                AND categoryid = {category_id}
                GROUP BY
                    shopid,
                    categoryid,
                    itemid,
                    platformid
                ORDER BY qty DESC
            ) AS a
            CROSS JOIN (
                SELECT
                    SUM(salesqty) AS qty
                FROM elengjing.women_clothing_item
                WHERE shopid != {shop_id}
                AND categoryid = {category_id}
            ) AS t
        ) AS b
        WHERE b.c_ratio <= 0.8;


        INSERT INTO elengjing_tj.t_shop_{shop_id}_case_{case}
        SELECT
            c.categoryid AS category_id,
            c.shopid AS customer_shop_id,
            c.itemid AS customer_item_id,
            c.platformid AS customer_platform_id,
            t.shopid AS target_shop_id,
            t.itemid AS target_item_id,
            t.platformid AS target_platform_id
        FROM t_shop_{shop_id}_case_{case}_customer AS c
        CROSS JOIN t_shop_{shop_id}_case_{case}_target AS t
        WHERE t.avg_price BETWEEN 0.9 * c.avg_price AND 1.1 * c.avg_price;
        """.format(**kwargs)
    Hive("idc").execute(sql)
Example #25
def gen_train_features(
        category_id,
        is_event=False,
        date="WHERE daterange BETWEEN '2016-05-01' AND '2016-05-24'",
        out=False):
    event = "event" if is_event else "daily"
    sql = """
    USE elengjing_price;
    DROP TABLE IF EXISTS train_{category_id}_{event};
    CREATE TABLE train_{category_id}_{event} (
        itemid STRING,
        data STRING
    ) CLUSTERED BY (itemid) INTO 113 BUCKETS;

    INSERT INTO train_{category_id}_{event}
    SELECT
        i.itemid,
        CONCAT_WS(
            ',',
            NVL(CAST(i.avg_price AS STRING), '0'),
            NVL(CAST((UNIX_TIMESTAMP() - TO_UNIX_TIMESTAMP(i.listeddate)) / 86400.0 AS STRING), '0'),
            NVL(CAST(shop.level AS STRING), '0'),
            NVL(CAST(shop.favor AS STRING), '0'),
            NVL(CAST(s.all_spu AS STRING), '0'),
            NVL(CAST(s.all_sales_rank AS STRING), '0'),
            NVL(CAST(sc.category_spu AS STRING), '0'),
            NVL(CAST(sc.category_sales_rank AS STRING), '0'),
            NVL(CAST(s.shop_avg_price AS STRING), '0'),
            NVL(CAST(s.shop_avg_price_rank AS STRING), '0'),
            NVL(CAST(sc.category_avg_price AS STRING), '0'),
            NVL(CAST(sc.category_avg_price_rank AS STRING), '0'),
            v.data
      ) AS data
    FROM
      (SELECT
        shopid, itemid,
        MAX(listeddate) AS listeddate,
        CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS avg_price
      FROM transforms.women_clothing_item_new_dict
      {date_filter}
      AND categoryid = {category_id}
      GROUP BY
        shopid, itemid) AS i
    JOIN
      (SELECT
        shopid,
        COUNT(DISTINCT itemid) AS all_spu,
        CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS shop_avg_price,
        ROW_NUMBER() OVER(ORDER BY SUM(salesamt) DESC) AS all_sales_rank,
        ROW_NUMBER() OVER(ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS shop_avg_price_rank
      FROM transforms.women_clothing_item_new_dict
      {date_filter}
      GROUP BY
        shopid) AS s
    ON i.shopid = s.shopid
    JOIN
      (SELECT
        shopid,
        COUNT(DISTINCT itemid) AS category_spu,
        ROW_NUMBER() OVER(ORDER BY SUM(salesamt) DESC) AS category_sales_rank,
        ROW_NUMBER() OVER(ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS category_avg_price_rank,
        CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS category_avg_price
      FROM transforms.women_clothing_item_new_dict
      {date_filter}
      AND categoryid = {category_id}
      GROUP BY
        shopid) AS sc
    ON i.shopid = sc.shopid
    JOIN elengjing_base.shop AS shop
    ON i.shopid = shop.shopid
    JOIN elengjing_price.attr_itemid_value_matrix_{category_id} AS v
    ON i.itemid = v.itemid
    WHERE shop.level IS NOT NULL
    AND shop.favor IS NOT NULL
    AND i.avg_price BETWEEN 10 AND 10000;
    """.format(category_id=category_id, date_filter=date, event=event)
    if out:
        print(sql)
    else:
        Hive("idc").execute(sql)
Example #26
def create_attrname(category_id):
    sql = """
    DROP TABLE IF EXISTS mpintranet.attrname;
    CREATE TABLE IF NOT EXISTS mpintranet.attrname(attrname STRING)
    CLUSTERED BY (attrname) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrname
    SELECT attrname
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY attrname;

    DROP TABLE IF EXISTS mpintranet.itemid;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid(itemid BIGINT)
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.itemid
    SELECT itemid
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid;

    DROP TABLE IF EXISTS mpintranet.item_attr;
    CREATE TABLE IF NOT EXISTS mpintranet.item_attr(
        itemid BIGINT,
        attrvalue STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.item_attr
    SELECT
        itemid,
        CONCAT(attrname, '_', attrvalue) AS attrvalue
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid, CONCAT(attrname, '_', attrvalue);

    DROP TABLE IF EXISTS mpintranet.itemid_attrname;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrname(
        itemid  BIGINT,
        attrname  STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.itemid_attrname
    SELECT
        b.itemid,
        a.attrname
    FROM mpintranet.attrname a
    CROSS JOIN mpintranet.itemid b;

    DROP TABLE IF EXISTS mpintranet.itemid_attrname_attrvalue;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrname_attrvalue(
        itemid  BIGINT,
        attrname  STRING,
        attrvalue  STRING
    )
    CLUSTERED BY (itemid) INTO 783 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.itemid_attrname_attrvalue
    SELECT
        a.itemid,
        a.attrname,
        NVL(b.attrvalue, '') AS attrvalue
    FROM mpintranet.itemid_attrname a
    LEFT JOIN mpintranet.item_attr b
    ON a.itemid = b.itemid
    AND a.attrname = b.attrname;

    DROP TABLE IF EXISTS mpintranet.attrname_result;
    CREATE TABLE IF NOT EXISTS mpintranet.attrname_result(
        itemid  BIGINT,
        attrname  STRING,
        attrvalue  STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrname_result
    SELECT
        t.itemid,
        GROUP_CONCAT(t.attrname, ",") AS attrname,
        GROUP_CONCAT(t.attrvalue, ",") AS attrvalue
    FROM
    (
        SELECT
            itemid,
            attrname,
            attrvalue
        FROM mpintranet.itemid_attrname_attrvalue
        DISTRIBUTE BY itemid
        SORT BY attrname DESC
    ) AS t
    GROUP BY t.itemid;

    DROP TABLE IF EXISTS mpintranet.attrname_export;
    CREATE TABLE IF NOT EXISTS mpintranet.attrname_export(
        itemid  BIGINT,
        attrvalue  STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrname_export
    SELECT
        itemid,
        attrvalue
    FROM mpintranet.attrname_result;
    """
    Hive(env=ENV).execute(sql.format(category_id))
Example #27
def create_attrvalue(category_id):
    sql = """
    -------- 1. Build the intermediate tables
    DROP TABLE IF EXISTS mpintranet.attrvalue;
    CREATE TABLE IF NOT EXISTS mpintranet.attrvalue(attrvalue STRING)
    CLUSTERED BY (attrvalue) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrvalue
    SELECT CONCAT(attrname, '_', attrvalue) AS attrvalue
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY CONCAT(attrname, '_', attrvalue);

    DROP TABLE IF EXISTS mpintranet.itemid;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid(itemid  BIGINT)
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.itemid
    SELECT itemid
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid;

    DROP TABLE IF EXISTS mpintranet.item_attr;
    CREATE TABLE IF NOT EXISTS mpintranet.item_attr(
        itemid BIGINT,
        attrvalue STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.item_attr
    SELECT
        itemid,
        CONCAT(attrname, '_', attrvalue) AS attrvalue
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid, CONCAT(attrname, '_', attrvalue);

    ------- Build the full itemid-by-attrvalue attribute table
    DROP TABLE IF EXISTS mpintranet.itemid_attrvalue;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrvalue(
        itemid  BIGINT,
        attrvalue  STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.itemid_attrvalue
    SELECT
        b.itemid,
        a.attrvalue
    FROM mpintranet.attrvalue AS a
    CROSS JOIN mpintranet.itemid AS b;

    --------- JOIN against the full table to produce the tagged result
    DROP TABLE IF EXISTS mpintranet.itemid_attrvalue_tag;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrvalue_tag(
        itemid BIGINT,
        attrvalue STRING,
        tag STRING
    )
    CLUSTERED BY (itemid) INTO 783 BUCKETS
    STORED AS ORC;

    SET mapred.reduce.tasks=500;
    INSERT INTO TABLE mpintranet.itemid_attrvalue_tag
    SELECT
        a.itemid,
        a.attrvalue,
        IF(b.itemid IS NULL, '0', '1') AS tag
    FROM mpintranet.itemid_attrvalue a
    LEFT JOIN mpintranet.item_attr b
    ON a.itemid = b.itemid
    AND a.attrvalue = b.attrvalue;
    ---------- Aggregate into the result table
    DROP TABLE IF EXISTS mpintranet.attrvalue_result;
    CREATE TABLE IF NOT EXISTS mpintranet.attrvalue_result(
        itemid BIGINT,
        attrvalue STRING,
        tag STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrvalue_result
    SELECT
        t.itemid,
        GROUP_CONCAT(t.attrvalue, ",") AS attrvalue,
        GROUP_CONCAT(t.tag, ",") AS tag
    FROM
    (
        SELECT
            itemid,
            attrvalue,
            tag,
            SPLIT(attrvalue, ',')[0] AS attrname
        FROM mpintranet.itemid_attrvalue_tag
        DISTRIBUTE BY itemid
        SORT BY attrname DESC
    ) AS t
    GROUP BY t.itemid;
    ------------end-------------
    DROP TABLE IF EXISTS mpintranet.attrvalue_export;
    CREATE TABLE IF NOT EXISTS mpintranet.attrvalue_export(
        itemid BIGINT,
        tag STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrvalue_export
    SELECT
        itemid,
        tag
    FROM mpintranet.attrvalue_result;
    """
    # print(sql.format(category_id))
    Hive(env=ENV).execute(sql.format(category_id))
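Example #27 materialises mpintranet.attrvalue_result, and Example #7 copies its column list into the PostgreSQL text_value table. Chaining them per category like this is an assumption:

def refresh_attrvalue(category_id):
    create_attrvalue(category_id)    # Example #27: rebuild the tagged attrvalue matrix
    insert_attrvalue_to_pg()         # Example #7: publish the column list to PostgreSQL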
Example #28
# coding: utf-8
# __author__: u"John"
from __future__ import unicode_literals, absolute_import, print_function, division
from mplib.IO import Hive, pickle_dump, pickle_load
from mplib.common import smart_encode
import editdistance


def get_data():
    sql = "SELECT COUNT(*) AS cnt FROM das.ci_diaper2014to2016_0406_allplatform_distinct AS d WHERE d.year = '2016'"
    data = Hive(env="local").query(sql)
    pickle_dump("raw", data)


def show_tables():
    print(Hive().query("show tables"))


def transform():
    pickle_dump("id_content",
                {line[0]: line[1]
                 for line in pickle_load("raw")})


if __name__ == "__main__":
    Hive.show_env_dict()
    print(Hive().query("show tables"))