Exemplo n.º 1
0
def list_parser(task):
    """Collect (link, price, 1) tuples from a listing page and format them.

    Two sources are combined: JSON objects embedded in the raw page that
    mention ``sell_price``, and the nodes selected by ``task["rule"]["rule"]``.

    :param task: dict with "rule" (dict holding the node xpath under "rule"),
        "recv" (object whose ``getvalue()`` returns the page body) and "url".
    :return: the result of ``format_price`` applied to the collected tuples.
    """
    page_rule = task["rule"]
    body = task["recv"].getvalue()
    collected = []
    # Items embedded in the page as JSON objects.
    for raw in re.findall('({.*?sell_price.*?}),', body):
        try:
            parsed = json.loads(raw)
        except ValueError:
            # skip captures that do not parse as JSON
            continue
        collected.append((item_base % parsed["id"], parsed["sell_price"], 1))
    # Items rendered as HTML nodes.
    tree = etree.HTML(body)
    for element in tree.xpath(page_rule["rule"]):
        hrefs = element.xpath("div[@class = 'cat-item-pic']/a/@href")
        amounts = element.xpath(
            "figcaption[@class = 'cat-item-inf']/p/span[@class = 'cat-pire-nub']/text()"
        )
        if not (hrefs and amounts):
            log_with_time("rule error: %s" % task["url"])
            continue
        collected.append((hrefs[0], amounts[0], 1))
    return format_price(collected)
Exemplo n.º 2
0
def list_parser(task, rule):
    """Extract links and goods ids from the nodes of a listing page.

    :param task: dict with "text" (HTML) and "url".
    :param rule: dict with xpath expressions under "node", "link" and "gid".
    :return: None when no node matches; otherwise a dict with "dps_log"
        (gid -> timestamp), "dp" (list of (link, "")) and "price" (gid list).
    """
    tree = etree.HTML(task["text"])
    elements = tree.xpath(rule["node"])
    if not elements:
        log_with_time("node rule error: %s" % task["url"])
        return
    details = []
    seen_at = {}
    gids = []
    stamp = int(time.time())
    for element in elements:
        links = element.xpath(rule["link"])
        ids = element.xpath(rule["gid"])
        if not (links and ids):
            log_with_time("rule error: %s" % task["url"])
            continue
        first_gid = ids[0]
        details.append((links[0], ""))
        gids.append(first_gid)
        seen_at[first_gid] = stamp
    return {"dps_log": seen_at, "dp": details, "price": gids}
Exemplo n.º 3
0
def stock1_parser(task, rule):
    """Decode a stock response and route it to "spider" or "stock2".

    :param task: dict with "text", "url", "gid" and "price".
    :param rule: unused.
    :return: ``[]`` when the response cannot be decoded or the follow-up url
        cannot be built; otherwise a dict with "spider" and "stock2" lists,
        one of which is filled.
    """
    try:
        decoded = demjson.decode(task['text'])
    except:
        log_with_time("bad response: %r"%task['url'])
        return []
    status = decoded['code']
    note = decoded['message']

    result = {"spider":[], 'stock2':[]}
    next_url = ""

    # code 3 with a message: the message carries digits used as a sku id
    if status == 3 and note:
        try:
            sku = re.search("\d+", note).group()
            next_url = surl2(task['gid'], sku)
        except:
            return []

    if next_url != "":
        result['stock2'] = [(next_url, task['gid'], task['price'])]
    else:
        in_stock = 1 if decoded.get('totalAmount') else 0
        result['spider'] = format_price(
            [(itemurl+task['gid'], task['price'], in_stock)])
    return result
Exemplo n.º 4
0
def pager(task):
    """Build one task per result page from the total item count of a page.

    :param task: dict with "rule" (dict with the count xpath under "rule" and
        the page size under "page"), "recv" (object whose ``getvalue()``
        returns the page body) and "url".
    :return: None when the page reports zero results or the count rule finds
        nothing; otherwise a list of dicts with "url" and "old_url".
    """
    rule = task["rule"]
    content = task["recv"].getvalue()
    t = etree.HTML(content)
    total = t.xpath(rule["rule"])
    if "找到0个商品" in content:
        log_with_time("search result 0: %s" % task["url"])
        return
    if not total:
        log_with_time("rule error: %s" % rule["rule"])
        return
    total = int(total[0])
    page = rule["page"]
    # Floor division keeps the page count an int; plain "/" gives a float
    # under true division and range() would reject it.
    num = total // page
    if total % page:
        num += 1
    return [
        {
            "url":
            task["url"] + "&order=normal&sort=desc&per_page=%s" % (i * page),
            "old_url":
            task["url"],
        }
        for i in range(num)
    ]
Exemplo n.º 5
0
def rt_parser(items):
    """Fetch live prices and stock states for *items* and format them.

    Product ids come from ``get_pids(items)``; one price request and one
    stock request are issued through ``simple_http.get``.

    :param items: passed unchanged to ``get_pids``.
    :return: ``format_price`` output for (pid, price, stock) tuples, or None
        when there are no ids, a request does not come back with status 200,
        or a response cannot be parsed.
    """
    pids = get_pids(items)
    if not pids:
        # previously formatted the undefined name ``entries`` (NameError)
        log_with_time("got nothing: %s" % (items,))
        return
    purl = price_url % (",".join(["J_" + i for i in pids]),
            random.randint(1000000, 10000000), int(time.time() * 1000))
    surl = stock_url % (async_http.quote(",".join(pids)),
            random.randint(1000000, 10000000), int(time.time() * 1000))

    price_res = simple_http.get(purl)
    stock_res = simple_http.get(surl)
    if price_res["status"] != 200 or stock_res["status"] != 200:
        # previously formatted the undefined name ``price`` (NameError)
        log_with_time("not200: price=%s stock=%s"
                      % (price_res["status"], stock_res["status"]))
        return
    try:
        price_json = jsonp_json(price_res["text"])
        stock_json = jsonp_json(stock_res["text"].decode("gbk"))
    except Exception:
        traceback.print_exc()
        return
    prices = {}
    for i in price_json:
        # ids look like "<prefix>_<pid>"; keep the part after the underscore
        prices[i["id"].split("_")[1]] = i["p"]
    stocks = {}
    for k, v in stock_json.items():
        s = v["StockStateName"]
        stocks[k] = 1 if (u"有货" in s or u"现货" in s) else 0
    # NOTE(review): a pid that has a price but no stock entry raises KeyError
    # here, as in the original - confirm whether a default is wanted.
    ret = [(str(pid), str(prices[pid]), stocks[pid]) for pid in prices]
    return format_price(ret)
Exemplo n.º 6
0
def cats_parser(url, res, rule):
    """Turn category links found on a page into a set of category urls.

    :param url: unused.
    :param res: dict whose "text" holds the HTML.
    :param rule: xpath expression yielding link strings.
    :return: set of urls built from ``ctgurl`` or ``lsturl``.
    """
    tree = etree.HTML(res['text'])
    found = set()
    for href in tree.xpath(rule):
        if '/c0-0/' in href:
            continue
        is_ctg = '/ctg/s2/' in href
        if is_ctg:
            pattern = "(?<=/ctg/s2/).+"
        elif 'list.yhd.com' in href:
            # http://list.yhd.com/.../
            pattern = "(?<=yhd\.com\/).+"
        else:
            continue
        match = re.search(pattern, href)
        if not match:
            log_with_time("bad regex: %r %r" % (pattern, href))
            continue
        # the category id is everything before the first hyphen
        category = match.group().split('-')[0]
        if is_ctg:
            found.add(ctgurl % category)
        else:
            found.add(lsturl % category)
    return found
Exemplo n.º 7
0
def list_parser(task, rule):
    """Parse a gbk-encoded listing page with the parser matching its url.

    :param task: dict with "text" (encoded HTML) and "url".
    :param rule: forwarded to the selected sub-parser.
    :return: None when the page cannot be decoded or parsed; otherwise a dict
        with "result", "dps", "shop", "comment" and "dp".
    """
    try:
        tree = etree.HTML(task['text'].decode('gbk', 'replace'))
    except:
        log_with_time("bad response %s"%task['url'])
        return
    # pick the sub-parser by url shape
    if ebookurl in task['url']:
        parsed = __book_list_parser1(tree, task, rule)
    elif 'cp' in task['url']:
        parsed = __book_list_parser2(tree, task, rule)
    else:
        parsed = __norm_list_parser1(tree, task, rule)
    rows, comments, shop, dp = parsed
    formatted = format_price(rows)
    # the second field of every formatted row is stamped with the current time
    stamps = dict((row[1], int(time.time())) for row in formatted)
    return {
            "result": formatted,
            "dps": stamps,
            "shop": shop,
            "comment": comments,
            "dp": dp,
            }
Exemplo n.º 8
0
def dp_parser(task, rule):
    """Extract the description url embedded in a page.

    :param task: dict with "text" and "url".
    :param rule: unused.
    :return: None when no description url is found; otherwise a one-element
        list ``[(desc_url, crc_string, "")]``.
    """
    matches = re.findall("desc: '(http.*?desc/[0-9]+)'", task["text"])
    if matches:
        checksum = urlcrc.get_urlcrc(3, task["url"])
        return [(matches[0], str(checksum), "")]
    log_with_time("no desc: %s" % task["url"])
    return
Exemplo n.º 9
0
def pager(task, rule):
    """Build the two urls (part a and part b) for every page of a category.

    :param task: dict with "text" (HTML) and "url".
    :param rule: xpath expression yielding strings that contain "/<pages>".
    :return: None on a bad response or missing page count; otherwise a flat
        list of urls, two per page.
    """
    try:
        tree = etree.HTML(task["text"])
    except:
        log_with_time("bad response %s" % task['url'])
        return
    page = re.findall("/([0-9]+)", " ".join(tree.xpath(rule)))
    if not page:
        log_with_time("page rule error")
        return
    cat = re.findall("/(v?c[0-9]+-[0-9]+-[0-9]+)", task["url"])[0]
    urls = []
    for number in range(1, int(page[0]) + 1):
        # "vc" categories use their own pair of templates
        if cat.startswith("vc"):
            templates = (virta_base, virtb_base)
        else:
            templates = (parta_base, partb_base)
        for template in templates:
            urls.append(
                template.format(cat=cat,
                                page=number,
                                cb=int(time.time() * 1000)))
    return urls
Exemplo n.º 10
0
def list_parser(task, rule):
    """Build stock requests for every product node of a listing page.

    :param task: dict with "text" (HTML) and "url".
    :param rule: dict with xpath expressions under "nodes", "gid", "price".
    :return: dict with "stock" (list of request dicts) and "dps"
        (gid -> ``time.time()`` value).
    """
    tree = etree.HTML(task['text'])
    requests = []
    stamps = {}
    for element in tree.xpath(rule['nodes']):
        gid_hits = element.xpath(rule['gid'])
        price_hits = element.xpath(rule['price'])
        if not (gid_hits and price_hits):
            log_with_time("bad response: %r" % task['url'])
            continue
        goods_id = re_gid.search(gid_hits[0]).group()
        amount = re_price.search(price_hits[0].text).group()
        payload = {"id": goods_id, "type": "0", "count": "1"}
        requests.append({
            "url": surl1,
            "gid": goods_id,
            "price": amount,
            "payload": payload,
        })
        stamps[goods_id] = time.time()
    return {"stock": requests, "dps": stamps}
Exemplo n.º 11
0
def dp_parser(task, rule):
    """Find the description url of a product page.

    :param task: dict with "text" and "url".
    :param rule: unused.
    :return: ``[(desc_url, crc_string, "")]``, or None (after logging) when
        the page carries no description url.
    """
    page_url = task["url"]
    candidates = re.findall("desc: '(http.*?desc/[0-9]+)'", task["text"])
    if len(candidates) == 0:
        log_with_time("no desc: %s" % page_url)
        return
    crc_text = str(urlcrc.get_urlcrc(3, page_url))
    return [(candidates[0], crc_text, "")]
Exemplo n.º 12
0
def list_parser(task, rule):
    """Split listing nodes into priced entries and items still to be fetched.

    :param task: dict with "text" (HTML) and "url".
    :param rule: dict with xpath expressions under "nodes", "buyinfo",
        "buycart", "sellout" and "comment".
    :return: dict with "prices" (list of (gid, stock)), "items" (gids without
        buy info) and "dps" (gid -> timestamp).
    """
    t = etree.HTML(task['text'])
    prices = []
    items = []
    dps = {}
    for node in t.xpath(rule['nodes']):
        # .get() so a node without the attribute reaches the guard below
        # instead of raising KeyError
        gid = node.attrib.get('itemid')
        if not gid:
            log_with_time("bad response: %r"%task['url'])
            continue
        buyinfo = node.xpath(rule['buyinfo'])
        if buyinfo:
            buyinfo = buyinfo[0]
            stock = 1
            # no cart button: out of stock if marked sold out or the node has
            # no comment element
            if not buyinfo.xpath(rule['buycart']):
                if buyinfo.xpath(rule['sellout']) or not node.xpath(rule['comment']):
                    stock = 0
            prices.append((gid, stock))
        else:
            items.append(gid)
        dps[gid] = int(time.time())
    return {"prices": prices, "items": items, "dps": dps}
Exemplo n.º 13
0
def extract_book(url, tree, rule):
    """Extract book entries (link, gid, list id, stock flag) from a tree.

    :param url: page url; the digits in its last "-" separated part become
        the list id.
    :param tree: parsed document supporting ``xpath``.
    :param rule: dict with xpath expressions under "book_node", "book_title",
        "book_stock" and "book_comment".
    :return: dict with "book_price" (list of tuples), "dps_log"
        (gid -> timestamp) and "comment" (gid -> first comment match).
    """
    result = []
    now = int(time.time())
    dps_log = {}
    nodes = tree.xpath(rule["book_node"])
    comments = {}
    lid = re.search(r"\d+", url.split('-')[-1]).group()
    for node in nodes:
        link_node = node.xpath(rule["book_title"])
        stock = node.xpath(rule["book_stock"])
        comment = node.xpath(rule["book_comment"])
        if not link_node or not stock:
            log_with_time("rule error: %s" % url)
            continue
        link = link_node[0].attrib["href"]
        gid = re_gid.search(link).group()
        # comment is optional: indexing an empty match raised IndexError
        if comment:
            comments[gid] = comment[0]
        s = 1 if u"有货" in stock[0] else 0
        dps_log[gid] = now
        result.append((link, gid, lid, s))
    return {
        "book_price": result,
        "dps_log": dps_log,
        "comment": comments
    }
Exemplo n.º 14
0
def cats_parser(url, res,  rule):
    """Collect category urls from the links selected by *rule*.

    :param url: unused.
    :param res: dict whose "text" holds the HTML.
    :param rule: xpath expression yielding link strings.
    :return: set of urls formatted with ``ctgurl`` or ``lsturl``.
    """
    links = etree.HTML(res['text']).xpath(rule)
    categories = set()
    for link in links:
        if '/c0-0/' in link:
            continue
        from_ctg = '/ctg/s2/' in link
        from_list = (not from_ctg) and 'list.yhd.com' in link
        if not (from_ctg or from_list):
            continue
        if from_ctg:
            regex = "(?<=/ctg/s2/).+"
        else:
            # http://list.yhd.com/.../
            regex = "(?<=yhd\.com\/).+"
        hit = re.search(regex, link)
        if hit:
            # keep only the part before the first hyphen
            head = hit.group().split('-')[0]
            categories.add((ctgurl if from_ctg else lsturl) % head)
        else:
            log_with_time("bad regex: %r %r" % (regex, link))
    return categories
Exemplo n.º 15
0
def page_parser(task, rule):
    """Build one task per page from the page count shown on a page.

    :param task: dict with "text" (HTML) and "url".
    :param rule: sequence; ``rule[0]`` selects strings containing
        "/<total>", ``rule[1]`` selects a link containing "page".
    :return: None when parsing fails or a rule finds nothing; otherwise a
        list of ``{"url": ...}`` dicts.
    """
    try:
        t = etree.HTML(task["text"])
    except Exception:
        traceback.print_exc()
        return

    total = re.findall(r"/([0-9]+)", "".join(t.xpath(rule[0])))
    if not total:
        # (rule,) keeps the formatting valid when rule is a tuple
        log_with_time("rule error: %s" % (rule,))
        return
    total = int(total[0])
    if total == 1:
        return [{
            "url": task['url'],
        }]

    links = t.xpath(rule[1])
    # everything up to and including "page" / "page=" is the url prefix
    prefix = re.findall(r"(^.*page=?)\d*", links[0]) if links else []
    if not prefix:
        # the unguarded [0] lookups here used to raise IndexError
        log_with_time("rule error: %s" % (rule,))
        return
    base = prefix[0]
    return [
        {"url": "http://www.likeface.com" + base + str(i)}
        for i in range(1, total + 1)
    ]
Exemplo n.º 16
0
def extract_book(url, tree, rule):
    """Collect link, gid, list id and stock flag for every book node.

    :param url: page url; digits of its last "-" separated part are used as
        the list id.
    :param tree: parsed document supporting ``xpath``.
    :param rule: dict with xpath expressions under "book_node", "book_title",
        "book_stock" and "book_comment".
    :return: dict with "book_price", "dps_log" (gid -> timestamp) and
        "comment" (gid -> first comment match, when there is one).
    """
    result = []
    now = int(time.time())
    dps_log = {}
    nodes = tree.xpath(rule["book_node"])
    comments = {}
    lid = re.search(r"\d+", url.split('-')[-1]).group()
    for node in nodes:
        link_node = node.xpath(rule["book_title"])
        stock = node.xpath(rule["book_stock"])
        comment = node.xpath(rule["book_comment"])
        if not link_node or not stock:
            log_with_time("rule error: %s" % url)
            continue
        link = link_node[0].attrib["href"]
        gid = re_gid.search(link).group()
        if comment:
            # only index the comment match when it exists (was IndexError)
            comments[gid] = comment[0]
        s = 1 if u"有货" in stock[0] else 0
        dps_log[gid] = now
        result.append((link, gid, lid, s))
    return {
            "book_price": result,
            "dps_log": dps_log,
            "comment": comments
            }
Exemplo n.º 17
0
def list_parser(task, rule):
    """Gather goods ids from a listing page into one price request.

    :param task: dict with "text" (HTML) and "url".
    :param rule: dict with xpath expressions under "nodes" and "gid".
    :return: dict with "price" (a single request whose payload joins all
        gids with commas) and "dp" (list of (detail url, "")).
    """
    t = etree.HTML(task['text'])
    ret = []
    dp = []
    for node in t.xpath(rule['nodes']):
        gid = node.xpath(rule['gid'])
        if not gid:
            log_with_time("bad rules: %r" % task['url'])
            # without this the gid[0] below raised IndexError
            continue
        # the id follows "item/", or "com/" as a fallback
        match = (re.search("(?<=item/).+", gid[0])
                 or re.search("(?<=com/).+", gid[0]))
        if not match:
            # the original tested the builtin ``id`` here, which is always
            # truthy, so a failed match crashed on .group()
            log_with_time("bad regex: %r" % task['url'])
            continue
        gid = match.group()
        dp.append((dp_base % gid, ""))
        ret.append(gid)
    return {
            "price":  [
                {
                "url": priceurl,
                "payload": {
                    "itemid": ','.join(ret)
                }
                }],
            "dp": dp
            }
Exemplo n.º 18
0
def fix_url(url):
    """Rewrite a product url into ``base`` plus its comma-joined id parts.

    :param url: url containing "/<digits and hyphens>." somewhere.
    :return: None for urls containing "tuan" (logged) or without an id
        segment; otherwise ``base`` followed by the id parts joined by ",".
    """
    if "tuan" in url:
        log_with_time("skip url: %s" % url)
        return
    found = re.findall("/([0-9\-]+)\.", url)
    if found:
        id_parts = found[0].split("-")
        return base + ",".join(id_parts)
    return
Exemplo n.º 19
0
def book_price(task, rule):
    """Read the first ``proPrice`` from a JSON price response.

    :param task: dict with "text" (JSON), "qid", "stock" and "link".
    :param rule: unused.
    :return: None when the response cannot be read (logged); otherwise
        ``format_price`` output for one [qid, price, stock] row.
    """
    try:
        payload = json.loads(task['text'])
        amount = payload['price'][0]['proPrice']  #if j['price'] else 0
    except:
        log_with_time("bad response: %s" % task['link'])
        return
    row = [str(task['qid']), str(amount), task['stock']]
    return format_price([row])
Exemplo n.º 20
0
def fix_url(url):
    """Normalise a product url to ``base`` + "id1,id2,...".

    :param url: url expected to contain "/<digits and hyphens>.".
    :return: None when the url contains "tuan" (logged) or has no id
        segment; otherwise the rebuilt url string.
    """
    if "tuan" in url:
        log_with_time("skip url: %s" % url)
        return
    segments = re.findall("/([0-9\-]+)\.", url)
    if len(segments) == 0:
        return
    # hyphens in the first id segment become commas
    joined = ",".join(segments[0].split("-"))
    return base + joined
Exemplo n.º 21
0
def book_price(task, rule):
    """Format the price found in a JSON response for one book.

    :param task: dict with "text" (JSON), "qid", "stock" and "link".
    :param rule: unused.
    :return: ``format_price`` output for a single [qid, price, stock] row,
        or None (after logging) when the price cannot be extracted.
    """
    try:
        first_entry = json.loads(task['text'])['price'][0]
        price = first_entry['proPrice'] #if j['price'] else 0
    except:
        log_with_time("bad response: %s" % task['link'])
        return
    rows = [[str(task['qid']), str(price), task['stock']]]
    return format_price(rows)
Exemplo n.º 22
0
def price_parser(task, rule):
    """Pull the decimal price that follows "price:" out of the response.

    :param task: dict with "text", "url", "gid" and "stock".
    :param rule: unused.
    :return: ``[]`` when no price is found (logged); otherwise
        ``format_price`` output for one (gid, price, stock) tuple.
    """
    try:
        found = re.search("(?<=price\:)\d+\.\d+(?=\,)", task['text']).group()
    except:
        log_with_time("bad response: %r"%task['url'])
        return []
    return format_price([(task['gid'], found, task['stock'])])
Exemplo n.º 23
0
def list_parser(task, rule):
    """Turn the "glist" entries of a JSON response into (url, price) pairs.

    :param task: dict with "text" (JSON) and "url".
    :param rule: unused.
    :return: list of ``(surl % gcode, gprice)`` tuples; ``[]`` (after
        logging) when the response has no "glist" key.
    """
    data = json.loads(task['text'])
    if 'glist' in data:
        return [(surl%goods['gcode'], goods['gprice'])
                for goods in data['glist']]
    log_with_time("bad response %r"%task['url'])
    return []
Exemplo n.º 24
0
def item_parser(task, rule):
    """Derive a stock flag from the first element selected by *rule*.

    :param task: dict with "text" (HTML), "url" and "gid".
    :param rule: xpath expression selecting the element to inspect.
    :return: ``[(gid, stock)]`` where stock is 0 if the element has a truthy
        "disabled" attribute and 1 otherwise; None (after logging) on failure.
    """
    try:
        button = etree.HTML(task['text']).xpath(rule)[0]
        if button.attrib.get('disabled'):
            available = 0
        else:
            available = 1
    except:
        log_with_time("bad response: %s"%task['url'])
        return
    return [(task['gid'], available)]
Exemplo n.º 25
0
def list_parser(task, rule):
    """Map every "glist" entry of a JSON response to (url, price).

    :param task: dict with "text" (JSON) and "url".
    :param rule: unused.
    :return: ``[]`` (after logging) when "glist" is missing; otherwise a
        list of ``(surl % gcode, gprice)`` tuples.
    """
    response = json.loads(task['text'])
    goods_list = response.get('glist') if 'glist' in response else None
    if 'glist' not in response:
        log_with_time("bad response %r" % task['url'])
        return []
    pairs = []
    for entry in goods_list:
        pairs.append((surl % entry['gcode'], entry['gprice']))
    return pairs
Exemplo n.º 26
0
def pager(task, rule):
    """Build one url per page reported in a JSON response.

    :param task: dict with "text" (JSON) and "url" (containing "code=<n>&").
    :param rule: unused.
    :return: ``[]`` (after logging) when "gpagecount" is missing; otherwise
        ``gurl % (code, page)`` for pages 1..gpagecount.
    """
    data = json.loads(task['text'])
    if 'gpagecount' not in data:
        log_with_time("bad response %r"%task['url'])
        return []
    code = re.search("(?<=code=)\d+(?=&)", task['url']).group()
    return [gurl%(code,page_no)
            for page_no in range(1, data['gpagecount']+1)]
Exemplo n.º 27
0
def pager(task, rule):
    """Expand a category response into the urls of all its pages.

    :param task: dict with "text" (JSON) and "url" (containing "code=<n>&").
    :param rule: unused.
    :return: list of ``gurl % (code, page)`` for every page from 1 to
        "gpagecount"; ``[]`` (after logging) when that key is missing.
    """
    response = json.loads(task['text'])
    if not 'gpagecount' in response:
        log_with_time("bad response %r" % task['url'])
        return []
    category_code = re.search("(?<=code=)\d+(?=&)", task['url']).group()
    last_page = response['gpagecount']
    urls = []
    page_no = 1
    while page_no < last_page + 1:
        urls.append(gurl % (category_code, page_no))
        page_no += 1
    return urls
Exemplo n.º 28
0
def price_parser(task, rule):
    """Map product ids to prices from a JSONP price response.

    :param task: dict with "text" (JSONP payload).
    :param rule: unused.
    :return: None when ``jsonp_json`` raises ValueError (logged); otherwise a
        one-element list holding a dict of id suffix -> price.
    """
    try:
        entries = jsonp_json(task["text"])
    except ValueError:
        log_with_time("price_parser: jsonp_json: %s" % task["text"])
        return
    by_id = {}
    for entry in entries:
        amount = entry["p"]
        # ids look like "<prefix>_<id>"; keep the part after the underscore
        by_id[entry["id"].split("_")[1]] = amount
    return [by_id]
Exemplo n.º 29
0
def price_parser(task, rule):
    """Convert a JSONP price list into a single id -> price mapping.

    :param task: dict with "text" (JSONP payload).
    :param rule: unused.
    :return: ``[mapping]`` keyed by the part of each id after its first
        underscore, or None (after logging) when the payload does not parse.
    """
    raw = task["text"]
    try:
        records = jsonp_json(raw)
    except ValueError:
        log_with_time("price_parser: jsonp_json: %s" % task["text"])
        return
    mapping = {}
    for record in records:
        value = record["p"]
        key = record["id"].split("_")[1]
        mapping[key] = value
    return [mapping]
Exemplo n.º 30
0
def stock_parser(task, rule):
    """Combine a JSON stock reply with the price carried by the task.

    :param task: dict with "text" (JSON), "url" and "info" (indexable; item 0
        fills ``itemurl``, item 1 is the price).
    :param rule: unused.
    :return: None on a bad response (logged); otherwise ``format_price``
        output for one (url, price, stock) tuple.
    """
    try:
        reply = json.loads(task['text'])
        if reply['havestock'] in ("true", "realstock"):
            in_stock = 1
        else:
            in_stock = 0
    except:
        log_with_time("bad response %s"%task['url'])
        return

    row = (itemurl % task['info'][0], str(task['info'][1]), in_stock)
    return format_price([row])
Exemplo n.º 31
0
def cats(url, res, rule):
    """Prefix every link selected by *rule* with ``yougou``.

    :param url: unused.
    :param res: dict whose "text" holds the HTML.
    :param rule: xpath expression yielding link strings.
    :return: list of prefixed links, or None (after logging) when the HTML
        cannot be parsed.
    """
    body = res["text"]
    try:
        tree = etree.HTML(body)
    except:
        log_with_time("bad response %s" % body.decode("utf-8", "replace"))
        return
    return [yougou + href for href in tree.xpath(rule)]
Exemplo n.º 32
0
def cats(url, res, rule):
    """Build absolute category links by prepending ``yougou``.

    :param url: unused.
    :param res: dict whose "text" holds the HTML.
    :param rule: xpath expression yielding link strings.
    :return: None (after logging) when parsing fails; otherwise the list of
        ``yougou + link`` strings.
    """
    html = res['text']
    try:
        document = etree.HTML(html)
    except:
        log_with_time("bad response %s"%html.decode('utf-8', 'replace'))
        return
    links = []
    links.extend(yougou + part for part in document.xpath(rule))
    return links
Exemplo n.º 33
0
def pager(task, rule):
    """List the urls of all pages of a category, starting with the task url.

    :param task: dict with "text" (HTML) and "url" (containing "cateID=<n>").
    :param rule: xpath expression yielding strings that contain the page
        count.
    :return: None when the rule matches nothing (logged); otherwise the task
        url followed by ``page_base`` urls for pages 2..count.
    """
    tree = etree.HTML(task["text"])
    page_texts = tree.xpath(rule)
    if not page_texts:
        log_with_time("page rule error")
        return
    urls = [task["url"]]
    numbers = re.findall("\d+", " ".join(page_texts))
    category = re.findall("cateID=(\d+)", task["url"])[0]
    last_page = int(numbers[0])
    urls.extend(
        page_base.format(cat=category, page=number)
        for number in range(2, last_page + 1))
    return urls
Exemplo n.º 34
0
def checkoffline(task, rule):
    """Report every item the response marks as not found.

    :param task: dict with "text" (JSON with an "items" mapping) and "url".
    :param rule: unused.
    :return: None on a bad response (logged); otherwise ``format_price``
        output for ``(key, "-1", -1)`` tuples of the items not found.
    """
    try:
        items = json.loads(task['text'])['items']
    except:
        log_with_time("bad response %s"%task['url'])
        return
    missing = [
        (str(key), str(-1), -1)
        for key, info in items.items()
        if not info['is_found']
    ]
    return format_price(missing)
Exemplo n.º 35
0
def meizhuang_cats_parser(url, content, rule):
    """Extract (link, price, 1) tuples from a category page and format them.

    :param url: page url, used only in the log message.
    :param content: HTML of the page.
    :param rule: sequence of xpath expressions: node, link, price.
    :return: the result of ``format_price`` applied to the collected tuples.
    """
    t = etree.HTML(content)
    ret = []
    for node in t.xpath(rule[0]):
        link = node.xpath(rule[1])
        price = node.xpath(rule[2])
        if not link or not price:
            log_with_time("rule error: %s" % url)
            # skip the node; indexing an empty match below raised IndexError
            continue
        ret.append((link[0], price[0], 1))
    return format_price(ret)
Exemplo n.º 36
0
def price_parser(task, rule):
    """Read a price from the text returned by ``_tess.recognize``.

    :param task: dict with "text" (passed to ``_tess.recognize`` as PNG
        data), "url", "gid" and "stock".
    :param rule: unused.
    :return: None when no number is recognised (logged); otherwise a dict
        with "result" (``format_price`` output) and "dps" (second field of
        each formatted row -> timestamp).
    """
    recognized = _tess.recognize(task['text'], _tess.IMAGE_PNG, 32)
    try:
        amount = re.search("\d+\.\d+|\d+", recognized).group()
    except:
        log_with_time("bad price: %s" % task['url'])
        return
    formatted = format_price([(task['gid'], amount, task['stock'])])
    stamps = dict((row[1], int(time.time())) for row in formatted)
    return {"result":formatted, "dps": stamps}
Exemplo n.º 37
0
def list_parser(task, rule):
    """Extract (gid, price) pairs from the nodes of a listing page.

    :param task: dict with "text" (HTML) and "url".
    :param rule: dict with xpath expressions under "nodes", "gid", "price".
    :return: list of (gid, price) tuples; nodes with missing data or without
        an "id-<digits>" marker are logged and skipped.
    """
    t = etree.HTML(task['text'])
    ret = []
    for node in t.xpath(rule['nodes']):
        gid = node.xpath(rule['gid'])
        price = node.xpath(rule['price'])
        if not gid or not price:
            log_with_time("bad response: %r" % task['url'])
            continue
        gid = re.findall("id-([0-9]+)", gid[0])
        if not gid:
            # no id in the link: skip instead of raising IndexError
            log_with_time("bad response: %r" % task['url'])
            continue
        ret.append((gid[0], price[0]))
    return ret
Exemplo n.º 38
0
def list_parser(task, rule):
    """Extract (absolute link, qid, price) tuples from a listing page.

    :param task: dict with "text" (HTML) and "url".
    :param rule: dict with xpath expressions under "node", "link", "price".
    :return: list of tuples; nodes with missing link, price or an id without
        an underscore are logged and skipped.
    """
    t = etree.HTML(task["text"])
    ret = []
    for node in t.xpath(rule["node"]):
        # ids look like "<prefix>_<qid>"; split() never returns an empty list,
        # so the length is what has to be checked before using qid[1]
        qid = node.attrib.get("id", "").split("_")
        link = node.xpath(rule["link"])
        price = node.xpath(rule["price"])
        if not link or not price or len(qid) < 2:
            log_with_time("rule error: %s" % task["url"])
            continue
        ret.append(("http://www.miyabaobei.com" + link[0], qid[1], price[0]))
    return ret
Exemplo n.º 39
0
def meizhuang_cats_parser(url, content, rule):
    """Collect link/price pairs from a category page and format them.

    :param url: page url, used only in the log message.
    :param content: HTML of the page.
    :param rule: sequence of xpath expressions: node, link, price.
    :return: ``format_price`` output for the (link, price, 1) tuples found.
    """
    t = etree.HTML(content)
    ret = []
    for node in t.xpath(rule[0]):
        link = node.xpath(rule[1])
        price = node.xpath(rule[2])
        if not link or not price:
            log_with_time("rule error: %s" % url)
            # incomplete node: skip it rather than fail on link[0]/price[0]
            continue
        ret.append((link[0], price[0], 1))
    return format_price(ret)
Exemplo n.º 40
0
def pager(task, rule):
    """Create one request per page described by a JSONP "pageBar".

    :param task: dict with "text" (utf-8 encoded JSONP), "url" and "cat".
    :param rule: unused.
    :return: None when "pageBar" is missing (logged); ``[]`` when
        "totalCount" is falsy (logged); otherwise a list of
        ``{"url": payload(cat, page)}`` dicts for pages 1..totalPage.
    """
    item = jsonp_json(task["text"].decode("utf-8"))
    if "pageBar" not in item:
        log_with_time("no pageBar: %s" % task["url"])
        return
    bar = item["pageBar"]
    if not bar.get("totalCount", 0):
        log_with_time("empty category: %s" % task["url"])
        return []
    return [{"url": payload(task['cat'], number)}
            for number in range(1, bar["totalPage"] + 1)]
Exemplo n.º 41
0
def stock_parser(task, rule):
    """Turn the stock number shown on a page into a 0/1 flag.

    :param task: dict with "text" (HTML), "url" and "price".
    :param rule: xpath expression selecting the element holding the number.
    :return: ``[]`` when the rule matches nothing (logged); otherwise
        ``format_price`` output for one (url, price, stock) tuple.
    """
    hits = etree.HTML(task["text"]).xpath(rule)
    if not hits:
        log_with_time("bad response: %s" % task["url"])
        return []
    available = 1 if int(hits[0].text) else 0
    return format_price([(task['url'], task['price'], available)])
Exemplo n.º 42
0
def stock_parser(task, rule):
    """Report whether the quantity found on a page is non-zero.

    :param task: dict with "text" (HTML), "url" and "price".
    :param rule: xpath expression selecting the quantity element.
    :return: ``format_price`` output for one (url, price, 0 or 1) tuple, or
        ``[]`` (after logging) when the rule matches nothing.
    """
    document = etree.HTML(task["text"])
    matched = document.xpath(rule)
    rows = []
    if len(matched) == 0:
        log_with_time("bad response: %s" % task["url"])
        return rows
    quantity = int(matched[0].text)
    flag = 0
    if quantity:
        flag = 1
    rows.append((task['url'], task['price'], flag))
    return format_price(rows)
Exemplo n.º 43
0
def list_parser(task):
    """Extract (link, price, 1) tuples from a product listing and format them.

    :param task: dict with "recv" (object whose ``getvalue()`` returns the
        page body), "rule" (dict with the node xpath under "rule") and
        "old_url" (used in the log message).
    :return: the result of ``format_price`` applied to the collected tuples.
    """
    tree = etree.HTML(task["recv"].getvalue())
    rows = []
    for element in tree.xpath(task["rule"]["rule"]):
        hrefs = element.xpath("div/div[@class = 'proTit']/a/@href")
        amounts = element.xpath("div/div[@class = 'proPrice']/text()")
        if hrefs and amounts:
            rows.append((hrefs[0], fix_price(amounts[0]), 1))
        else:
            log_with_time("rule error: %s" % task["old_url"])
    return format_price(rows)
Exemplo n.º 44
0
def list_parser(task, rule):
    """Extract unique (url, price) pairs from a listing page.

    :param task: dict with "text" (HTML) and "url".
    :param rule: dict with xpath expressions under "node", "gidurl", "price".
    :return: list of distinct (url, price) tuples in no particular order;
        the price is the element text without its first character.
    """
    tree = etree.HTML(task["text"])
    ret = []
    for node in tree.xpath(rule["node"]):
        gidurl = node.xpath(rule['gidurl'])
        price = node.xpath(rule['price'])
        if not gidurl or not price:
            log_with_time("list parser err: %r" % task['url'])
            continue
        ret.append((gidurl[0], price[0].text[1:]))
    # de-duplicate once at the end instead of rebuilding the set per node
    return list(set(ret))
Exemplo n.º 45
0
def list_parser(task, rule):
    """Extract (absolute url, price) pairs from a listing page.

    :param task: dict with "text" (HTML) and "url".
    :param rule: dict with xpath expressions under "nodes", "gid", "price".
    :return: list of ``(burl + gid, price)`` tuples; the price is the part of
        the element text matched by ``re_price``.
    """
    tree = etree.HTML(task['text'])
    pairs = []
    for element in tree.xpath(rule['nodes']):
        ids = element.xpath(rule['gid'])
        amounts = element.xpath(rule['price'])
        if not (ids and amounts):
            log_with_time("bad response: %r" % task['url'])
            continue
        full_url = burl + ids[0]
        amount = re_price.search(amounts[0].text).group()
        pairs.append((full_url, amount))
    return pairs
Exemplo n.º 46
0
def list_parser(task, rule):
    """Extract (link, price, 1) tuples from a listing page and format them.

    :param task: dict with "text" (HTML) and "old_url" (used in the log
        message).
    :param rule: dict with xpath expressions under "node", "link", "price".
    :return: the result of ``format_price`` applied to the collected tuples.
    """
    # the unused function-level ``import pdb`` debugging leftover was removed
    t = etree.HTML(task["text"])
    ret = []
    for node in t.xpath(rule["node"]):
        link = node.xpath(rule["link"])
        price = node.xpath(rule["price"])
        if not link or not price:
            log_with_time("rule error: %s" % task["old_url"])
            continue
        p = fix_price(price[0])
        ret.append((str(link[0]), str(p), 1))
    return format_price(ret)
Exemplo n.º 47
0
def list_parser(task, rule):
    """Collect the distinct (url, price) pairs shown on a listing page.

    :param task: dict with "text" (HTML) and "url".
    :param rule: dict with xpath expressions under "node", "gidurl", "price".
    :return: list of distinct (url, price) tuples in no particular order;
        the price is the element text with its first character dropped.
    """
    tree = etree.HTML(task["text"])
    pairs = []
    for node in tree.xpath(rule["node"]):
        gidurl = node.xpath(rule['gidurl'])
        price = node.xpath(rule['price'])
        if not gidurl or not price:
            log_with_time("list parser err: %r" % task['url'])
            continue
        pairs.append((gidurl[0], price[0].text[1:]))
    # one set pass after the loop replaces the per-node list(set(...)) rebuild
    return list(set(pairs))
Exemplo n.º 48
0
def promo_filter(item):
    """Build the promotion request for one (url, sku) item.

    :param item: pair ``(url, sku)``; the url must contain
        "/<pid>-<sid>.h".
    :return: dict with "url" (formatted ``promo_url``) and "old" (the input
        url), or None (after logging) when the url does not have that shape.
    """
    url, sku = item
    parts = re.findall(r"/([A-Za-z-0-9]+)\.h", url)
    # the segment must split into exactly two parts; previously an empty
    # match hit parts[0] (IndexError) and any other shape failed the
    # two-value unpacking (ValueError)
    pieces = parts[0].split("-") if parts else []
    if len(pieces) != 2:
        log_with_time("url rule error: %s" % url)
        return
    pid, sid = pieces
    p = promo_url.format(time=int(time.time() * 1000), goodsNo=sku,
                         sid=sid, pid=pid)
    return {
            "url": p,
            "old": url
            }