Example #1
import mitmproxy.http
from mitmproxy import ctx

# json_util, Cache, decryptCode and injected_javascript come from the
# surrounding project.


def response(flow: mitmproxy.http.HTTPFlow):
    # Only process 200 responses.
    if flow.response.status_code != 200:
        return

    # Defeat webdriver detection on the CSDN login page
    if "passport.csdn.net/login" in flow.request.url:
        html = flow.response.text
        html = html.replace('<title>', '<script>{}</script><title>'.format(injected_javascript))
        flow.response.text = html
        ctx.log.info('>>>> JS injected successfully <<<<')

    # Scrape the shop list
    if "i.waimai.meituan.com/openh5/homepage/poilist" in flow.request.url:
        html = flow.response.text
        data = json_util.to_python(html)
        for shop in data["data"]["shopList"]:
            try:
                shop["_id"] = shop["mtWmPoiId"]
                shop["shopName"] = decryptCode(shop["shopName"])
                shop["monthSalesTip"] = decryptCode(shop["monthSalesTip"])
                shop["deliveryTimeTip"] = decryptCode(shop["deliveryTimeTip"])
                shop["minPriceTip"] = decryptCode(shop["minPriceTip"])
                shop["shippingFeeTip"] = decryptCode(shop["shippingFeeTip"])
                shop["averagePriceTip"] = decryptCode(shop["averagePriceTip"])
                shop["distance"] = decryptCode(shop["distance"])
                shop["shipping_time"] = decryptCode(shop["shipping_time"])

                if "discounts2" in shop:
                    for discounts in shop["discounts2"]:
                        discounts["info"] = decryptCode(discounts["info"])
                Cache.meituanwaimai_shop_list.insert(shop)
            except Exception:
                # Skip shops with missing or malformed fields.
                continue

    # Shop detail scraping (currently disabled)
    # if "http://i.waimai.meituan.com/openh5/poi/food" in flow.request.url:
    #     html = flow.response.text
    #     data = json_util.to_python(html)
    #
    #     # for food in data["data"]["categoryList"]:
    #     #     shop["_id"] = shop["mtWmPoiId"]
    #     # Cache.meituanwaimai.update(shop)

    # Scrape the comment list
    if "i.waimai.meituan.com/openh5/poi/comments" in flow.request.url:
        html = flow.response.text
        data = json_util.to_python(html)

        mtWmPoiId = data["data"]["mtWmPoiId"]
        for comment in data["data"]["list"]:
            comment["mtWmPoiId"] = mtWmPoiId

        Cache.meituanwaimai_comment_list.insert_many(data["data"]["list"])
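
The hook assumes a module-level injected_javascript string and a decryptCode helper defined elsewhere in the project. A minimal, hypothetical sketch of the injected script (masking navigator.webdriver, which is what the webdriver-detection comment implies) and of how the addon is started:

# Hypothetical stand-in for the injected_javascript referenced above; it
# masks the navigator.webdriver flag that headless-browser checks read.
injected_javascript = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
"""

# Start the addon with mitmproxy's command-line tool:
#   mitmdump -s inject_addon.py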
Example #2
def test_get_python_from_file():
    rows = file_util.readRows(
        "/home/changshuai/Temp/5dcdd0a7-8842-4695-8fd8-9553b191d346.tmp")
    data = []
    for row in rows:
        datum = json_util.to_python(row)
        if datum["jobName"] == "yili_ecomm_v2_20190605131709_003_27":
            data.append(datum)
    print(data)
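
json_util.to_python and file_util.readRows are project utilities; from the way they are used across these examples they behave like thin wrappers over json.loads and line-by-line file reading. A stand-in sketch under that assumption:

import json


def to_python(text):
    # Assumed behavior: parse a JSON string into Python objects.
    return json.loads(text)


def read_rows(path):
    # Assumed behavior: return the file's lines without trailing newlines.
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]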
Example #3
def updateSourceCrawlId(taskId: int, mapping: dict):
    """
    更新
    mapping = {
       "redis-link": [1, 3]
    }
    """
    conn = getRhino()
    cur = conn.cursor()

    # Fetch the task config
    querySql = "SELECT t.* FROM t_rhino_task_config t WHERE id = {};".format(
        taskId)
    logger.info(querySql)
    cur.execute(querySql)
    result = cur.fetchone()

    # Process the distribute rules
    distributeParam = json_util.to_python(result["distribute_param"])
    for key, newCrawlerIds in mapping.items():
        if key in distributeParam:
            rules = distributeParam[key]["rules"]
            for ind, rule in enumerate(rules):
                if rule.startswith("{sourceCrawlerId}#in#"):
                    crawlerIds = set(
                        map(
                            int,
                            rule.replace("{sourceCrawlerId}#in#",
                                         "").split(",")))
                    crawlerIds = sorted(crawlerIds | set(newCrawlerIds))
                    rules[ind] = "{{sourceCrawlerId}}#in#{}".format(",".join(
                        map(str, crawlerIds)))

    # Merge into crawler_id_list
    crawlerIdList = result["crawler_id_list"]
    crawlerIdList = set(map(int, crawlerIdList.split(",")))
    for newCrawlerIds in mapping.values():
        crawlerIdList = crawlerIdList | set(newCrawlerIds)

    crawlerIdList = sorted(crawlerIdList)

    # Build and run the update SQL
    distributeParamUpdate = json_util.to_json(distributeParam)
    crawlerIdListUpdate = ",".join(map(str, crawlerIdList))
    updateSql = "UPDATE t_rhino_task_config SET distribute_param = '{}', crawler_id_list = '{}' WHERE id = {} ".format(
        distributeParamUpdate, crawlerIdListUpdate, taskId)
    logger.info(updateSql)
    cur.execute(updateSql)
    conn.commit()  # the connection does not autocommit by default

    cur.close()
    conn.close()
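
Both statements above interpolate values directly into the SQL text. Assuming getRhino() returns a pymysql connection (the dict-style row access suggests a DictCursor), the same statements can use parameter binding instead; a sketch:

cur.execute("SELECT t.* FROM t_rhino_task_config t WHERE id = %s", (taskId,))

cur.execute(
    "UPDATE t_rhino_task_config SET distribute_param = %s, "
    "crawler_id_list = %s WHERE id = %s",
    (distributeParamUpdate, crawlerIdListUpdate, taskId))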
Example #4
    def run(self):
        url = "https://item.taobao.com/item.htm?id=593167331763"
        # self.downloader.middleware_list.append(ProxyMiddleWare())
        res = self.downloader.get(url)

        itemId = text_util.get_first_match(res.text, r"itemId\s+:\s+'(\d+)',")
        sellerId = text_util.get_first_match(res.text,
                                             r"sellerId\s+:\s+'(\d+)'")
        shopName = text_util.get_first_match(res.text,
                                             r"shopName\s*:\s*'(.*?)'")
        skuMap = text_util.get_first_match(res.text, r"skuMap\s*:\s*({.*})")
        title = text_util.get_first_match(res.text, r"title\s*:\s*'(.*)'")
        propertyMemoMap = text_util.get_first_match(
            res.text, r"propertyMemoMap\s*:\s*({.*})")

        self.downloader.setting.headers.update({"Referer": url})
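        # Captured login-session cookie; it expires, so it must be refreshed
        # for the request below to succeed.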
        self.downloader.setting.headers.update({
            "Cookie":
            "t=192584b50433a81c5feae77e9e99411f; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; OUTFOX_SEARCH_USER_ID_NCOO=1427643707.8819768; enc=V2PIbfvRYC7hvhCHq8qkNaMekFaEJPNApT08%2FgVaEAQ2OC%2BI2X4ku9sCq5dBhGRyaf7sP3uWnXEnmirxNFKDhQ%3D%3D; cna=4vCbFAVQ8hgCAbc/WcslocCr; cookie2=1931f04989f237d225904534cc89e2a7; _tb_token_=4e1edb04afa8; v=0; miid=1429757782455434771; tk_trace=oTRxOWSBNwn9dPyorMJE%2FoPdY8zZPEr%2FCrvCMS%2BG3sTRRWrQ%2BVVTl09ME1KrXE7g7f5SykjbFjU2EbuQocCrCuXu%2BxnGiDUI4y7SiU8R5wYO2UYEEivSgzo9bmwuwMAMEhtH43hBt535uXkDsXTju7V5XRRxfiOYs5k5VhVmShunGRh%2FOIXRI5LD3ngB8VZblVPU62%2FNCVT0brygusVvRPUvgT3iMfNN3l4HrDoNlJ1N88B%2FsJExCyaSkUuHnRgisCCXwa6iP2ttiJOjfsdh9kgRqJM2cYKE5mdnN7YlWI7MtgU0YitBpzvFoYM9wDlxNIrehSt32D2awKXRliVeBIw%3D; uc3=id2=UUpnjMGWeTDxMA%3D%3D&vt3=F8dBy3MLoylZjTIKqDw%3D&lg2=W5iHLLyFOGW7aA%3D%3D&nk2=suEMAecR; csg=f0359cd1; lgc=%5Cu6768%5Cu7545%5Cu5E05; dnk=%5Cu6768%5Cu7545%5Cu5E05; skt=d1c02800fe0af2e7; existShop=MTU2Njg4NjA4OA%3D%3D; uc4=id4=0%40U2gtHRBkJk9a2SFfxwUCZdl9g6Mj&nk4=0%40sOlUtvsiedjt3d5KnKNpEJI%3D; tracknick=%5Cu6768%5Cu7545%5Cu5E05; _cc_=V32FPkk%2Fhw%3D%3D; tg=0; mt=ci=19_1; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; whl=-1%260%260%261566893100933; _m_h5_tk=ed82048ac357de15b1d9f408c5a87f3b_1567332023191; _m_h5_tk_enc=1ce5e64614f05ae7b3fe320776816210; uc1=cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&cookie21=U%2BGCWk%2F7p4mBoUyS4E9C&existShop=false&pas=0&cookie14=UoTaH0QlXL3bSQ%3D%3D&tag=8&lng=zh_CN; isg=BAUFdlAy9O_vYtC8ivmC0CRlFEiT0L5xhaOb-wdr0zw7niYQwREaJYs_qILN3tEM; l=cBNHS6EVqJJZw89-BOfNVQLf1P_OuIOf1sPP2doM4IB1951TMdIxHHwIzx_Bp3QQE95xUExySDo_2Rnp7yz3rAonhFSjOC0eQ"
        })
        self.downloader.middleware_list = []
        res = self.downloader.get(self.BASE_INFO_URL.format(itemId, sellerId))
        info = json_util.to_python(
            text_util.get_first_match(res.text,
                                      r"onSibRequestSuccess\((.*)\);"))

        print(
            {
                "itemId": itemId,
                "sellerId": sellerId,
                "shopName": shopName.encode('utf-8').decode("unicode-escape"),
                "title": title.encode('utf-8').decode("unicode-escape"),
                "skuMap": json_util.to_python(skuMap),
                "propertyMemoMap": propertyMemoMap,
                "soldTotalCount":
                info["data"]["soldQuantity"]["confirmGoodsCount"],
                "stock": info["data"]["dynStock"]["stock"]
            })
Example #5
from typing import List

import pymysql


def selectTable(appIds: List[int]):
    conn = pymysql.connect(host="127.0.0.1",
                           user="rhino",
                           password="rhino",
                           database="db_datatub_rhino_v3",
                           port=5555,
                           cursorclass=pymysql.cursors.DictCursor,
                           charset='utf8')

    querySql = "SELECT t.* FROM t_rhino_task_config t " \
               "WHERE id in (" \
               "SELECT task_id_list FROM t_rhino_app_config WHERE app_id in ({})" \
               ");".format(",".join(map(str, appIds)))
    cur = conn.cursor()
    cur.execute(querySql)

    result = cur.fetchall()

    path = set()

    # Inspect each task's writer paths
    for row in result:
        taskId = row["id"]
        print("任务Id: {}".format(taskId))

        distributeParam = json_util.to_python(row["distribute_param"])
        for name, rule in distributeParam.items():
            if isinstance(rule, list):
                for item in rule:
                    if "1" in item["writer"]:
                        path.add("{}".format(item["writer"]["1"]))
            else:
                if rule["id"] == 1 or rule["id"] == "1":
                    path.add("{}".format(rule["path"]))

    print(path)
Example #6
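    # Fragment: post_data, downloader, poiId and ttf_replace are set up
    # earlier in the enclosing function; the uuid and the fixed coordinates
    # come from a captured session.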
    post_data["source"] = "shoplist"
    post_data["skuId"] = ""
    post_data[
        "uuid"] = "16d14d36a98c8-0e53a36f17ba0e-1a201708-1fa400-16d14d36a98c8"
    post_data["platform"] = "3"
    post_data["partner"] = "4"
    post_data[
        "originUrl"] = "https://h5.waimai.meituan.com/waimai/mindex/menu?dpShopId=&mtShopId={}&utm_source=&source=shoplist&initialLat=23.129112&initialLng=113.264385&actualLat=&actualLng=".format(
            poiId)
    post_data["riskLevel"] = "71"
    post_data["optimusCode"] = "10"
    post_data["wm_latitude"] = "23129112"
    post_data["wm_longitude"] = "113264385"
    post_data["wm_actual_latitude"] = ""
    post_data["wm_actual_longitude"] = ""
    post_data[
        "openh5_uuid"] = "16d14d36a98c8-0e53a36f17ba0e-1a201708-1fa400-16d14d36a98c8"
    html = downloader.post("http://i.waimai.meituan.com/openh5/poi/food",
                           data=post_data).text

    for key, val in ttf_replace.items():
        html = html.replace(key, val)
    result = json_util.to_python(html)
    data = {
        "shopName": result["shopInfo"]["shopName"],
        "shopPic": result["shopInfo"]["shopPic"],
        "deliveryFee": result["shopInfo"]["deliveryFee"],
        "deliveryTime": result["data"]["shopInfo"]["deliveryTimeDecoded"],
    }
    time.sleep(3)
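
ttf_replace maps the private-use characters rendered by Meituan's per-request web font back to real characters. Building that mapping is not shown here; a hypothetical sketch with fontTools, assuming a hand-made reference table from glyph names to plain characters:

from fontTools.ttLib import TTFont

# Filled in once by inspecting the font manually: glyph name -> character.
REFERENCE = {"uniE8F2": "1", "uniE8F3": "2"}  # hypothetical values


def build_ttf_replace(font_path):
    # getBestCmap() maps unicode codepoints to glyph names; route it through
    # the reference table to get obfuscated-char -> real-char pairs.
    cmap = TTFont(font_path).getBestCmap()
    return {chr(code): REFERENCE[name]
            for code, name in cmap.items() if name in REFERENCE}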
Example #7
import jsonpath

from dio_core.network.downloader import Downloader
from dio_core.utils import json_util, url_util, time_util
from dio_core_test.utils import text_util

keyword = "女装"  # "women's clothing"

for i in range(100):
    html = Downloader.get(
        "https://shopsearch.taobao.com/browse/shop_search.htm?q={}&s={}".format(
            keyword, i * 20)).text
    data = json_util.to_python(
        text_util.get_first_match(html, "g_page_config = (.*);"))
    shops = jsonpath.jsonpath(data, "$.mods.shoplist.data.shopItems.*") or []
    for shop in shops:
        if "shopIcon" in shop and "title" in shop["shopIcon"] and "天猫" in shop[
                "shopIcon"]["title"]:
            print("天猫\t{}\t{}".format(url_util.patch_url(shop["shopUrl"]),
                                      shop["procnt"]))
        else:
            print("淘宝\t{}\t{}".format(url_util.patch_url(shop["shopUrl"]),
                                      shop["procnt"]))
    time_util.sleep(5)
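
url_util.patch_url is another project utility; shop URLs in Taobao's page config are typically protocol-relative ("//shop123.taobao.com"), so it presumably completes them into absolute URLs. A stand-in under that assumption:

def patch_url(url, scheme="https"):
    # Assumed behavior: complete protocol-relative or scheme-less URLs.
    if url.startswith("//"):
        return "{}:{}".format(scheme, url)
    if not url.startswith("http"):
        return "{}://{}".format(scheme, url)
    return url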