# Placeholder price used when neither this month's crawl nor last month's
# result has a price for a sku.
_DEFAULT_PRICE = 79.90


def _tables_after_sep(m, sep_regex, table_regex):
    """Yield "jingdong" tables matching table_regex that sort after the latest separator."""
    last_sep = m.get_lasted_collection("jingdong",
                                       filter={"name": {"$regex": sep_regex}})
    for table in m.list_tables(dbname="jingdong",
                               filter={"name": {"$regex": table_regex}}):
        if not last_sep or table > last_sep:
            yield table


def _load_average_prices(m):
    """Step 1: return {skuid: {"clean_price": mean cleaned price}} from the price tables."""
    pipeline = [{"$match": {"_status": 0}}]
    sums = {}  # skuid -> (number of price rows, sum of cleaned prices)
    for table in _tables_after_sep(m, r"^jdprice20201209_sep",
                                   r"^jdprice(20210129)$"):
        print("step 1: processing {}".format(table), flush=True)
        for item in m.read_from(db_collect=("jingdong", table),
                                pipeline=pipeline):
            sku = int(item["id"])
            price = clean_price(item)
            if sku in sums:
                count, total = sums[sku]
                sums[sku] = (count + 1, total + price)
            else:
                sums[sku] = (1, price)
    return {
        sku: {"clean_price": round(total / count, 2)}
        for sku, (count, total) in sums.items()
    }


def _load_last_month(m, last_result):
    """Step 2: return {skuid: record} for last month's rows that have a cate_id."""
    fields = ("skuid", "comments", "clean_price", "cate_id", "brand_id",
              "ziying")
    last_month_skuids = {}
    for skuid, comments, price, cate_id, brand_id, ziying in m.read_from(
            db_collect=("jingdong", last_result), out_field=fields):
        if cate_id:
            last_month_skuids[int(skuid)] = {
                "clean_price": price,
                "comments": comments,
                "cate_id": format_cat_id(cate_id),
                "brand_id": brand_id,
                "ziying": ziying,
            }
    return last_month_skuids


def _load_sku_attributes(m):
    """Step 3: return {skuid: cate_id/brand_id/ziying} from the skuid retry tables."""
    sku_attrs = {}
    for table in _tables_after_sep(m, r"^jdskuid20201214_sep",
                                   r"^jdskuid(20210108)retry\d*$"):
        print("step 3: processing {}".format(table), flush=True)
        pipeline = [
            {"$match": {"_status": 0}},
            {
                "$project": {
                    "skuid": "$skuid",
                    "cate_id": "$cate_id",
                    "brand_id": "$brand_id",
                    "ziying": "$ziying",
                }
            },
        ]
        for skuid, cate_id, brand_id, ziying in m.read_from(
                db_collect=("jingdong", table),
                out_field=("skuid", "cate_id", "brand_id", "ziying"),
                pipeline=pipeline):
            sku_attrs[int(skuid)] = {
                "cate_id": cate_id,
                "brand_id": "0" if brand_id is None else brand_id,
                "ziying": ziying,
            }
    return sku_attrs


def _merge_comments(m, result_dic, last_month_skuids, sku_attrs):
    """Step 4: attach comment counts and category attributes to result_dic in place."""
    for table in _tables_after_sep(m, r"^jdcomment20201218_sep",
                                   r"^jdcomment(20210302)retry\d*$"):
        print("step 4: processing {}".format(table), flush=True)
        pipeline = [
            {
                "$match": {
                    # "comment" is compared against the string "0", exactly
                    # as the original query did.
                    "$and": [{"_status": 0}, {"comment": {"$gt": "0"}}]
                }
            },
            {"$project": {"skuid": "$skuid", "comment": "$comment"}},
        ]
        for skuid, comments in m.read_from_yield(
                db_collect=("jingdong", table),
                out_field=("skuid", "comment"),
                pipeline=pipeline):
            sku = int(skuid)

            # Pick the attribute source; "type" codes are offset by it.
            if sku in sku_attrs:
                attrs, type_base = sku_attrs[sku], 0
            elif sku in last_month_skuids:
                attrs, type_base = last_month_skuids[sku], 3
            else:
                # Unknown sku: any existing entry is replaced, as before.
                # NOTE(review): this discards a price crawled this month for
                # such a sku - confirm that is intended.
                result_dic[sku] = {
                    "clean_price": _DEFAULT_PRICE,
                    "comments": int(comments),
                    "cate_id": "0,0,0",
                    "brand_id": "0",
                    "ziying": "-1",
                    "type": 6,
                }
                continue

            # Pick the price source. An entry already in result_dic keeps its
            # price; that covers skus priced in step 1 and skus already seen
            # in an earlier comment row.
            if sku in result_dic:
                price_item = result_dic[sku]
                price_rank = 0
            elif sku in last_month_skuids:
                price_item = result_dic[sku] = {
                    "clean_price": last_month_skuids[sku]["clean_price"]
                }
                price_rank = 1
            else:
                # Only reachable when attrs came from sku_attrs (type 2); the
                # former "type 5" branch could never execute.
                price_item = result_dic[sku] = {"clean_price": _DEFAULT_PRICE}
                price_rank = 2

            price_item["comments"] = int(comments)
            price_item["type"] = type_base + price_rank
            price_item["cate_id"] = attrs["cate_id"]
            price_item["brand_id"] = attrs["brand_id"]
            price_item["ziying"] = attrs["ziying"]


def _backfill_from_last_month(result_dic, last_month_skuids):
    """Step 5: carry over last-month skus that step 4 did not classify."""
    for sku, previous in last_month_skuids.items():
        price_item = result_dic.get(sku)
        if price_item is None:
            # NOTE(review): placeholder attributes are written here even
            # though last month's record has real ones - kept as in the
            # original; confirm intent.
            result_dic[sku] = {
                "clean_price": previous["clean_price"],
                "comments": previous["comments"],
                "cate_id": "0,0,0",
                "brand_id": "0",
                "ziying": "-1",
                "type": 7,
            }
        elif "type" not in price_item:
            # Priced in step 1 but never matched by a comment row.
            price_item["clean_price"] = previous["clean_price"]
            price_item["comments"] = previous["comments"]
            price_item["cate_id"] = previous["cate_id"]
            price_item["brand_id"] = previous["brand_id"]
            price_item["ziying"] = previous["ziying"]
            price_item["type"] = 8


def _write_result(m, result_dic, out_table, this_month, batch_size=5000):
    """Step 6: insert every record that has a cate_id into out_table in batches."""
    batch = []
    for sku, record in result_dic.items():
        record["skuid"] = sku
        record["month"] = this_month
        if "cate_id" not in record:
            # Price-only skus with no comment and no last-month row are
            # printed and skipped.
            print(record)
            continue
        batch.append(record)
        # Flush on the real batch length; the old index-based check wrote a
        # one-item batch on the very first iteration.
        if len(batch) >= batch_size:
            m.insert_many_dict(db_collect=("jingdong", out_table),
                               data_dict_list=batch)
            batch = []
    if batch:
        m.insert_many_dict(db_collect=("jingdong", out_table),
                           data_dict_list=batch)


def run_result():
    """Build the next "month<YYYYMM>" result collection in the "jingdong" database.

    Steps, in order:
      1. average the cleaned prices per sku from the price tables;
      2. load last month's result collection (the latest "month20dddd");
      3. load sku attributes from the skuid retry tables;
      4. merge comment counts, choosing price and attribute sources;
      5. backfill skus that only exist in last month's result;
      6. write the records to "month" + next month, then create the
         "jdprice<current_date>_sep" separator collection.

    The "type" field records where each record's data came from:
      0/1/2  attributes from the skuid tables; price already in the result /
             from last month / default price
      3/4    attributes from last month; price already in the result /
             from last month
      6      sku unknown everywhere: default price, placeholder attributes
      7      last-month sku absent from the result: carried over
      8      priced sku without a comment row: filled from last month

    NOTE(review): the table and separator patterns are pinned to specific
    crawl dates (e.g. "jdprice(20210129)"); they must be updated per run.
    """
    with op.DBManger() as m:
        # The price dict doubles as the result dict, as in the original code.
        result_dic = _load_average_prices(m)

        last_result = m.get_lasted_collection(
            "jingdong", filter={"name": {"$regex": r"^month20\d\d\d\d$"}})
        print("step 2: processing {}".format(last_result), flush=True)
        last_month = last_result[-6:]  # trailing six characters of the name
        last_month_skuids = _load_last_month(m, last_result)

        sku_attrs = _load_sku_attributes(m)
        _merge_comments(m, result_dic, last_month_skuids, sku_attrs)

        print(
            "step 5: processing skuid in last_month_skuids but not in result_dic",
            flush=True)
        _backfill_from_last_month(result_dic, last_month_skuids)

        this_month = timeUtil.get_month(deltamonth=1, current_month=last_month)
        out_table = "month" + this_month
        print("step 6: processing writing result to {}".format(out_table),
              flush=True)
        print("result_dic:{}".format(len(result_dic)), flush=True)
        _write_result(m, result_dic, out_table, this_month)

        m.create_db_collection(
            db_collection=("jingdong", "jdprice{0}_sep".format(current_date)))
enumerate(m.read_from_yield(db_collect=("jingdong", last_summary)))): if item["skuid"] in this_month: this_item = this_month.pop(item["skuid"]) item["comment_{}".format(month)] = this_item["comments"] item["price"] = this_item["clean_price"] item["ziying"] = this_item["ziying"] bid = this_item["brand_id"] if bid: item["brand_id"] = bid cate_id = this_item["cate_id"] if cate_id: item["cate_id"] = cate_id else: item["comment_{}".format(month)] = item["comment_{}".format( timeUtil.get_month(-1, current_month=month))] list.append(item) if i % buffer_size == 0: m.insert_many_dict(db_collect=("jingdong", "summary_201905_{}".format(month)), data_dict_list=list) list = [] if list: m.insert_many_dict(db_collect=("jingdong", "summary_201905_{}".format(month)), data_dict_list=list) list = [] print("step3...", flush=True) for i, skuid in tqdm(enumerate(this_month)): this_item = this_month[skuid] item = {}
#!/usr/bin/env python # -*- coding: utf-8 -*- from mongo import op from multiprocess.tools import timeUtil current_date = "20200821" current_month = current_date[:-2] last_1_month, last_2_month, last_3_month = timeUtil.get_month( -1, current_month), timeUtil.get_month(-2, current_month), timeUtil.get_month( -3, current_month), comment_table = "secoComment{}".format(current_date) with op.DBManger() as m: for month in [last_1_month]: # 合并属于一个月的List m.drop_db_collect(db_collect=("secoo", "List{}".format(month))) dic = {} for listday in m.list_tables( dbname="secoo", filter={"name": { "$regex": r"List{}\d\d$".format(month) }}): print(listday, "List{}".format(month)) for item in m.read_from(db_collect=("secoo", listday), out_field=("pid", "price", "self")): dic.update({item[0]: (item[1], item[2])}) date_tuple_list = [] for k, (p, s) in dic.items(): date_tuple_list.append((k, k, p, s)) m.insert_many_tupe(db_collect=("secoo", "List{}".format(month)), data_tupe_list=date_tuple_list,
def compute_result(self):
    """Rebuild the per-month "secoResult<month>" collections in the "secoo" database.

    For each of the three months before the month of ``self.current_date``:
      1. merge the daily "List<month>DD" collections into "List<month>"
         (a later row for the same pid overwrites an earlier one);
      2. aggregate "secoComment<current_date>" into "secoSales<month>"
         (products with sales) and "secoNosales<month>" (products without);
      3. merge both outputs into "secoResult<month>", keyed by pid, with
         the sales rows overriding the no-sales rows.
    """
    from mongo import op
    from multiprocess.tools import timeUtil

    current_date = self.current_date
    # Drop the last two characters of the date (presumably the day of a
    # YYYYMMDD string - confirm against the caller).
    current_month = current_date[:-2]
    months = [timeUtil.get_month(-delta, current_month) for delta in (1, 2, 3)]
    comment_table = "secoComment{}".format(current_date)

    # Maps the seller label to a flag: "1" when it equals "自营", else "0".
    self_flag = {
        "$cond": {
            "if": {"$ne": ["$self", "自营"]},
            "then": "0",
            "else": "1",
        }
    }

    with op.DBManger() as m:
        for month in months:
            list_table = "List{}".format(month)

            # Merge the daily List collections that belong to this month.
            m.drop_db_collect(db_collect=("secoo", list_table))
            merged = {}
            for listday in m.list_tables(
                    dbname="secoo",
                    filter={"name": {"$regex": r"List{}\d\d$".format(month)}}):
                print(listday, list_table)
                for item in m.read_from(db_collect=("secoo", listday),
                                        out_field=("pid", "price", "self")):
                    merged[item[0]] = (item[1], item[2])
            rows = [(pid, pid, price, seller)
                    for pid, (price, seller) in merged.items()]
            m.insert_many_tupe(db_collect=("secoo", list_table),
                               data_tupe_list=rows,
                               fields=("_id", "pid", "price", "self"))

            # Products with sales.
            # NOTE(review): this pipeline looks up "CleanListNew" while the
            # no-sales pipeline looks up the List<month> collection built
            # above - confirm the difference is intentional.
            pipeline1 = [
                {
                    "$match": {
                        "$and": [{"_status": 0}, {"pid": {"$ne": None}}]
                    }
                },
                {
                    "$project": {
                        "cid": "$cid",
                        "pid_rel": "$pid_rel",
                        "pid": "$pid",
                        "user": "******",
                        "device": "$device",
                        "price": "$price",
                        "date": "$date",
                        "month": {"$substr": ["$date", 0, 6]},
                        "self": "$self",
                    }
                },
                {"$match": {"month": "{}".format(month)}},
                {
                    "$lookup": {
                        "from": "CleanListNew",
                        "localField": "pid",
                        "foreignField": "_id",
                        "as": "tableb",
                    }
                },
                {
                    "$group": {
                        "_id": {
                            "month": "$month",
                            "cid": "$cid",
                            "pid": "$pid",
                            "pid_rel": "$pid_rel",
                        },
                        "user": {"$last": "$user"},
                        "device": {"$last": "$device"},
                        "price": {"$last": "$price"},
                        "tmp_price": {
                            "$last": {"$arrayElemAt": ["$tableb.price", 0]}
                        },
                        "tmp_self": {
                            "$last": {"$arrayElemAt": ["$tableb.self", 0]}
                        },
                    },
                },
                {
                    "$project": {
                        "_id": 0,
                        "month": "$_id.month",
                        "cid": "$_id.cid",
                        "pid_rel": "$_id.pid_rel",
                        "pid": "$_id.pid",
                        "user": "******",
                        "device": "$device",
                        # Prefer the looked-up price over the comment's own.
                        "price": {
                            "$cond": {
                                "if": {"$ne": ["$tmp_price", None]},
                                "then": "$tmp_price",
                                "else": "$price",
                            }
                        },
                        "tmp_self": "$tmp_self",
                    }
                },
                {
                    "$lookup": {
                        "from": "CleanListNew",
                        "localField": "pid_rel",
                        "foreignField": "_id",
                        "as": "tablec",
                    }
                },
                {
                    "$project": {
                        "_id": 0,
                        "month": "$month",
                        "cid": "$cid",
                        "pid_rel": "$pid_rel",
                        "pid": "$pid",
                        "user": "******",
                        "device": "$device",
                        "price": "$price",
                        "tmp_self": "$tmp_self",
                        "tmp_self1": {"$arrayElemAt": ["$tablec.self", 0]},
                    }
                },
                {
                    "$project": {
                        "_id": 0,
                        "month": "$month",
                        "cid": "$cid",
                        "pid_rel": "$pid_rel",
                        "pid": "$pid",
                        "user": "******",
                        "device": "$device",
                        "price": "$price",
                        # Seller label: from the pid lookup, else from the
                        # pid_rel lookup, else "其他". The inner fallback
                        # must itself be a $cond; a bare {"if", "then",
                        # "else"} document is returned as a sub-document
                        # rather than evaluated.
                        "self": {
                            "$cond": {
                                "if": {"$ne": ["$tmp_self", None]},
                                "then": "$tmp_self",
                                "else": {
                                    "$cond": {
                                        "if": {"$ne": ["$tmp_self1", None]},
                                        "then": "$tmp_self1",
                                        "else": "其他",
                                    }
                                },
                            }
                        },
                    }
                },
                {
                    "$group": {
                        "_id": {
                            "month": "$month",
                            "cid": "$cid",
                            "pid": "$pid",
                            "price": "$price",
                        },
                        "self": {"$last": "$self"},
                    },
                },
                {
                    # One row per (month, cid, pid, price) above, so this sum
                    # counts distinct cids per product and price.
                    "$group": {
                        "_id": {
                            "month": "$_id.month",
                            "pid": "$_id.pid",
                            "price": "$_id.price",
                        },
                        "sales": {"$sum": 1},
                        "self": {"$last": "$self"},
                    },
                },
                {
                    "$project": {
                        "_id": 0,
                        "month": "$_id.month",
                        "pid": "$_id.pid",
                        "sales": "$sales",
                        "price": "$_id.price",
                        "self": self_flag,
                    }
                },
                {"$out": "secoSales{}".format(month)},
            ]

            # Products without sales.
            pipeline2 = [
                {
                    "$match": {
                        "$and": [
                            {"_status": {"$ne": 0}},
                            {"_seed": {"$ne": None}},
                        ]
                    }
                },
                {
                    "$project": {
                        "pid_rel": {"$arrayElemAt": ["$_seed", 0]},
                        "price": {"$arrayElemAt": ["$_seed", 1]},
                    }
                },
                {
                    "$lookup": {
                        "from": "List{}".format(month),
                        "localField": "pid_rel",
                        "foreignField": "_id",
                        "as": "tableb",
                    }
                },
                {
                    "$project": {
                        "pid_rel": "$pid_rel",
                        "price": "$price",
                        "self": {"$arrayElemAt": ["$tableb.self", 0]},
                    }
                },
                # Keep only seeds that matched a row in this month's List.
                {"$match": {"self": {"$exists": True}}},
                {
                    "$group": {
                        "_id": {"pid_rel": "$pid_rel", "price": "$price"},
                        "self": {"$last": "$self"},
                    },
                },
                {
                    "$project": {
                        "_id": 0,
                        "month": "{}".format(month),
                        "pid": "$_id.pid_rel",
                        "sales": "0",
                        "price": "$_id.price",
                        "self": self_flag,
                    }
                },
                {"$out": "secoNosales{}".format(month)},
            ]

            m.aggregate(db_collect=("secoo", comment_table),
                        pipeline=pipeline1)
            m.aggregate(db_collect=("secoo", comment_table),
                        pipeline=pipeline2)

            # Merge both outputs by pid; sales rows are read last so they
            # override no-sales rows for the same pid.
            combined = {}
            for source in ("secoNosales{}".format(month),
                           "secoSales{}".format(month)):
                for item in m.read_from(
                        db_collect=("secoo", source),
                        out_field=("pid", "price", "sales", "self")):
                    combined[item[0]] = (item[1], item[2], item[3])

            # The seller flag gets its own name so it no longer rebinds the
            # method's `self`.
            result_rows = [
                (pid, pid, price, sales, self_value)
                for pid, (price, sales, self_value) in combined.items()
            ]
            result_table = "secoResult{}".format(month)
            m.drop_db_collect(db_collect=("secoo", result_table))
            m.insert_many_tupe(db_collect=("secoo", result_table),
                               data_tupe_list=result_rows,
                               fields=("_id", "pid", "price", "sales", "self"))