Example #1
def level2(arr_oneurl, arr_onename):
    global KEEPDIR
    allfile = listfiles(KEEPDIR + "/2urls", ".md")
    emptyfile = listfiles(KEEPDIR + "/2urls", "-url.mdxx")
    for two in range(len(arr_oneurl)):
        # Skip categories that have already been fetched (or marked empty)
        prefix = str(two + 1)
        if prefix + '-url.mdxx' in emptyfile or (prefix + '-name.md' in allfile
                                                 and prefix + "-url.md"
                                                 in allfile):
            logger.info("已存在!第" + prefix + "个一级类目:" + arr_oneurl[two] +
                        "的二级类目...")
            continue
        twocontent = ratedownload(arr_oneurl[two])
        if twocontent is None or twocontent == 0:
            continue
        twocontent = twocontent.decode('utf-8', 'ignore')
        arr_twourl, arr_twoname = urlparse(twocontent, level=2)
        logger.warning("正抓取!第" + prefix + "个一级类目:" + arr_oneurl[two] +
                       "的二级类目...")
        logger.warning("还剩下" + str(len(arr_oneurl) - two + 1) + "个一级类目")
        logger.info(arr_twourl)
        savetofile("2urls/" + prefix + "-url.md", arr_twourl)
        savetofile("2urls/" + prefix + "-name.md", arr_twoname)

    logger.warning("已经抓取了二级类目下的所有url...")
Example #2
def level6(catchfiles=None):
    global KEEPDIR
    # 1-1-1-url.md 1-1-2-url.md
    # All level-4 URL files (each holds the level-5 urls of one level-4 category)
    # Concurrency: if a file list is passed in, run only on those files
    if not catchfiles:
        level4file = listfiles(KEEPDIR + "/5urls", "url.md")
    else:
        logger.warning("并发文件")
        level4file = catchfiles

    # All files already produced under the level-5 categories
    level5file = listfiles(KEEPDIR + "/6urls", "md")
    emptyfile = listfiles(KEEPDIR + "/6urls", "-url.mdxx")

    # Iterate over the level-4 files; position is the file index
    for position in range(len(level4file)):
        # File name and its category prefix (e.g. '1-1-1' from '1-1-1-url.md')
        filename = level4file[position]
        weizhi = filename.split("-url")[0]

        urls = readfile("5urls/" + filename)
        # urlposition is the link index
        for urlposition in range(len(urls)):
            # Already fetched? Output files look like 1-1-1-1-url.md
            prefix = str(urlposition + 1)
            if weizhi + '-' + prefix + '-url.mdxx' in emptyfile or (
                    weizhi + '-' + prefix + '-name.md' in level5file
                    and weizhi + '-' + prefix + '-url.md' in level5file):
                logger.info("已存在!第" + str(position + 1) + "个四级类目:" + filename +
                            ",第" + prefix + "个五级类目:" + urls[urlposition] +
                            "的六级类目...")
                continue
            fourcontent = ratedownload(urls[urlposition])
            if fourcontent is None or fourcontent == 0:
                continue
            fourcontent = fourcontent.decode('utf-8', 'ignore')
            arr_foururl, arr_fourname = urlparse(fourcontent, level=6)
            logger.warning("正抓取!第" + str(position + 1) + "个四级类目:" + filename +
                           ",第" + prefix + "个五级类目:" + urls[urlposition] +
                           "的六级类目...")
            logger.warning("本目录还剩" + str(len(urls) - urlposition + 1) +
                           "个五级类目,排队" + str(len(level4file) - position + 1) +
                           "个四级类目")
            logger.info(arr_foururl)
            savetofile("6urls/" + weizhi + '-' + prefix + '-url.md',
                       arr_foururl)
            savetofile("6urls/" + weizhi + '-' + prefix + '-name.md',
                       arr_fourname)
    logger.warning("已经抓取了六级类目下的所有url...")
    return "ok!!!----"
Example #3
def level3(catchfiles=None):
    global KEEPDIR
    # 1-url.md 2-url.md
    # All level-2 URL files (each holds the level-2 urls of one level-1 category)
    # Concurrency: if a file list is passed in, run only on those files
    if not catchfiles:
        level2file = listfiles(KEEPDIR + "/2urls", "url.md")
    else:
        logger.warning("并发文件")
        level2file = catchfiles

    # All files already produced under the level-3 categories
    level3file = listfiles(KEEPDIR + "/3urls", "md")
    emptyfile = listfiles(KEEPDIR + "/3urls", "-url.mdxx")

    # Iterate over the level-2 files; position is the file index
    for position in range(len(level2file)):
        # File name and its category prefix (e.g. '1' from '1-url.md')
        filename = level2file[position]
        weizhi = filename.split("-url")[0]
        urls = readfile("2urls/" + filename)
        # urlposition is the link index
        for urlposition in range(len(urls)):
            # Already fetched? Output files look like 1-2-url.md
            prefix = str(urlposition + 1)
            if weizhi + "-" + prefix + '-url.mdxx' in emptyfile or (
                    weizhi + '-' + prefix + '-name.md' in level3file
                    and weizhi + '-' + prefix + '-url.md' in level3file):
                logger.info("已存在!第" + weizhi + "个一级类目:" + filename + ",第" +
                            prefix + "个二级类目:" + urls[urlposition] + "的三级类目...")
                continue
            threecontent = ratedownload(urls[urlposition])
            if threecontent is None or threecontent == 0:
                continue
            threecontent = threecontent.decode('utf-8', 'ignore')

            arr_threeurl, arr_threename = urlparse(threecontent, level=3)
            logger.warning("正抓取!第" + weizhi + "个一级类目:" + filename + ",第" +
                           prefix + "个二级类目:" + urls[urlposition] + "的三级类目...")
            logger.warning("本目录还剩" + str(len(urls) - urlposition + 1) +
                           "个二级类目,排队" + str(len(level2file) - position + 1) +
                           "个一级类目")
            logger.info(arr_threeurl)
            savetofile("3urls/" + weizhi + '-' + prefix + '-url.md',
                       arr_threeurl)
            savetofile("3urls/" + weizhi + '-' + prefix + '-name.md',
                       arr_threename)
    logger.warning("已经抓取了三级类目下的所有url...")
Example #4
def level1():
    global KEEPDIR
    allfile = listfiles(KEEPDIR, ".md")
    if 'onename.md' in allfile and "oneurl.md" in allfile:
        arr_oneurl = readfile("oneurl.md")
        arr_onename = readfile("onename.md")
        logger.warning("一级类目已经存在,直接抓取二级类目的url...")
    else:
        # Fetch the level-1 categories
        # URL of the top-level best-sellers page
        firsturl = "https://www.amazon.com/Best-Sellers/zgbs"
        onecontent = ratedownload(firsturl)
        if onecontent is None or onecontent == 0:
            raise RuntimeError("could not download the level-1 page: " + firsturl)
        onecontent = onecontent.decode('utf-8', 'ignore')
        arr_oneurl, arr_onename = urlparse(onecontent)
        savetofile("oneurl.md", arr_oneurl)
        savetofile("onename.md", arr_onename)
        logger.warning("已经抓取了一级类目:" + firsturl + "的所有url...")
        logger.info(arr_oneurl)
    return arr_oneurl, arr_onename
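
The source does not show how the levels are wired together (levels 4 and 5 are also missing from this file), but a plausible driver would simply chain them; every level skips files already on disk, so it can be rerun after a crash or a ban:

def crawl_categories():
    # Hypothetical driver: each level reads the previous level's files
    # and skips anything already on disk, so rerunning is safe.
    arr_oneurl, arr_onename = level1()
    level2(arr_oneurl, arr_onename)
    level3()
    # level4() and level5() are not shown in this file but presumably
    # follow the same pattern before level6 reads KEEPDIR + "/5urls".
    level6()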
Example #5
def unitlogic(url, mysqlconfig):
    global DATA_DIR
    # Category URL to fetch
    catchurl = url[1]
    # Category name
    catchname = url[2]

    # Page count (default to 5 if missing or malformed)
    try:
        page = int(url[3])
    except (ValueError, TypeError, IndexError):
        page = 5

    # Category ID
    id = url[0]
    # Top-level category name
    bigpname = url[4]
    # Database name
    db = url[6]

    todays = tool.log.TODAYTIME
    year = todaystring(1)

    if getconfig()["ipinmysql"]:
        where = "mysql"
    else:
        where = "local"

    keepdir = createjia(DATA_DIR + "/data/items/" + year + "/" +
                        bigpname.replace(" ", "") + "/" + todays)
    detaildir = createjia(DATA_DIR + "/data/detail/" + year + "/" +
                          bigpname.replace(" ", "") + "/" + todays + "/" + id)

    listheader = {
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
        "Accept-Language": "en-US;q=0.8,en;q=0.5",
        "Upgrade-Insecure-Requests": "1",
        'Host': 'www.amazon.com'
    }
    #################
    parsecontent = {}
    if fileexsit(keepdir + "/" + id + ".md"):
        with open(keepdir + "/" + id + ".md", "rb") as f:
            parsecontent = stringToObject(f.read().decode("utf-8", "ignore"))
    else:
        listcontent = ratedownload(url=catchurl,
                                   where=where,
                                   config=mysqlconfig,
                                   header=listheader,
                                   isdetail=False)
        if listcontent:
            parsecontent, isphone = phonelistparse(
                listcontent.decode("utf-8", "ignore"))
            if isphone:
                if parsecontent:
                    if phoneinsertlist(parsecontent, url):
                        with open(keepdir + "/" + id + ".md", "wb") as f:
                            f.write(
                                objectToString(parsecontent).encode("utf-8"))
                else:
                    logger.error("手机列表页解析出错:" + catchurl)
            else:
                # Desktop (PC) page
                if getconfig()["force"]:
                    try:
                        page = int(getconfig()["forcenum"])
                    except (ValueError, TypeError):
                        page = 5
                for i in range(1, min(5, page)):
                    items3 = "/ref=zg_bs_apparel_pg_" + str(
                        i + 1) + "?_encoding=UTF8&ajax=1&pg=" + str(i + 1)
                    items17 = "/ref=zg_bs_apparel_pg_" + str(
                        i + 1
                    ) + "?_encoding=UTF8&&isAboveTheFold=0&ajax=1&pg=" + str(
                        i + 1)
                    listheader = {
                        "Accept":
                        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                        # "Accept-Encoding": "gzip, deflate, br",
                        "Connection": "keep-alive",
                        "Accept-Language": "en-US;q=0.8,en;q=0.5",
                        "Upgrade-Insecure-Requests": "1",
                        'Referer': catchurl,
                        'Host': 'www.amazon.com',
                        "X-Requested-With": "XMLHttpRequest"
                    }
                    try:
                        content3 = ratedownload(url=catchurl + items3,
                                                where=where,
                                                config=mysqlconfig,
                                                header=listheader)
                        content17 = ratedownload(url=catchurl + items17,
                                                 where=where,
                                                 config=mysqlconfig,
                                                 header=listheader)
                        if content3 == 0 and content17 == 0:
                            break
                        if content3 is None and content17 is None:
                            continue
                        if content3:
                            temp3 = phonetopclistparse(content3)
                            for key in temp3:
                                parsecontent[key] = temp3[key]
                        if content17:
                            temp17 = phonetopclistparse(content17)
                            for key in temp17:
                                parsecontent[key] = temp17[key]
                    except Exception as e:
                        logger.error("手机到PC列表页出错" + str(i + 1) + ",跳过")
                        logger.error(e, exc_info=1)

                if parsecontent:
                    if phoneinsertlist(parsecontent, url):
                        with open(keepdir + "/" + id + ".md", "wb") as f:
                            f.write(
                                objectToString(parsecontent).encode("utf-8"))
                else:
                    logger.error("最后列表页解析出错:" + catchurl)
    ##################
    for asin in parsecontent:
        try:
            # Detail files are named <smallrank>-<asin>
            smallrank = parsecontent[asin][0]
            detailname = str(smallrank) + "-" + asin
            rankeep = detaildir + "/" + detailname
            if fileexsit(rankeep + ".md"):
                loggers.warning("Look!存在详情页:" + rankeep)
                continue
            if fileexsit(rankeep + ".emd"):
                loggers.warning("存在(页面找不到))!" + rankeep)
                continue
            detailurl = "https://www.amazon.com/dp/" + asin
            # detailurl = "https://www.amazon.com/gp/product/" + asin
            if fileexsit(rankeep + ".html"):
                with open(rankeep + ".html", "rb") as ff:
                    detailpage = ff.read()
            else:
                detailheader = {
                    "Accept":
                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                    "Connection": "keep-alive",
                    "Accept-Language": "en-US;q=0.8,en;q=0.5",
                    "Upgrade-Insecure-Requests": "1",
                    'Host': 'www.amazon.com'
                }
                detailpage = ratedownload(url=detailurl,
                                          where=where,
                                          config=mysqlconfig,
                                          header=detailheader,
                                          isdetail=True)
                if detailpage is None:
                    continue
                if detailpage == 0:
                    with open(rankeep + ".emd", "wt") as f:
                        f.write("1")
                    continue
                else:
                    with open(rankeep + ".html", "wb") as f:
                        f.write(detailpage)
            try:
                pinfo = phonedetailparse(detailpage.decode("utf-8", "ignore"))
            except Exception:
                try:
                    # Not a mobile page; fall back to the desktop parser
                    pinfo = pinfoparse(detailpage.decode("utf-8", "ignore"))
                except Exception:
                    logger.error("Failed to parse desktop detail page: " + detailurl)
                    continue
            try:
                pinfo["smallrank"] = int(smallrank)
            except (ValueError, TypeError):
                pinfo["smallrank"] = -1
            pinfo["title"] = parsecontent[asin][3]
            pinfo["price"] = parsecontent[asin][4]
            pinfo["asin"] = asin
            pinfo["url"] = detailurl
            pinfo["img"] = parsecontent[asin][2]
            if len(pinfo["img"]) > 240:
                pinfo["img"] = ""
            pinfo["name"] = catchname
            pinfo["bigname"] = bigpname
            pinfo["id"] = todays + "-" + detailname

            # Insert into the database
            if phoneinsertexsitlist(pinfo, url):
                with open(rankeep + ".md", "wt") as f:
                    f.write("1")
            # Best effort: a failure here does not matter
            phoneinsertpmysql(pinfo, db, id)
        except Exception as err:
            logger.error("Error on item " + asin)
            logger.error(err, exc_info=1)
    # Done
    logger.warning(todays + "|" + bigpname + "|" + db + ":" + id +
                   " completed")
Example #6
def unitlogic(url, mysqlconfig):
    global DATA_DIR
    # url: ('1-1', 'https://www.amazon.com/Best-Sellers-Appliances-Cooktops/zgbs/appliances/3741261/ref=zg_bs_nav_la_1_la/161-2441050-2846244', 'Cooktops', 2, 5, '1', '1', 'Appliances')

    # Category URL to fetch
    catchurl = url[1]
    # Category name
    catchname = url[2]
    # Category ID
    id = url[0]
    # Top-level category name
    bigpname = url[4]
    # Page count
    page = url[3]
    # Crawl level
    level = url[5]

    # Database name
    db = url[6]

    # 2016/Appl/20160606/
    todays = tool.log.TODAYTIME
    year = todaystring(1)
    db = getconfig()["dbprefix"] + db
    if not dbexist(db, id, todays):
        return

    if getconfig()["ipinmysql"]:
        where = "mysql"
    else:
        where = "local"

    keepdir = createjia(DATA_DIR + "/data/items/" + year + "/" + bigpname.replace(" ", "") + "/" + todays + "/" + id)

    detaildir = createjia(DATA_DIR + "/data/detail/" + year + "/" + bigpname.replace(" ", "") + "/" + todays + "/" + id)

    detailall = {}

    # Are the list pages already finished?
    finish = listfiles(keepdir, ".jinhan")
    pagefinish = len(finish) >= 1

    # Did repeated retries still fail on some page?
    retryhappen = False
    if getconfig()["force"]:
        page = int(getconfig()["forcenum"])
    for i in range(min(page, 5)):
        itempath = keepdir + "/" + str(i + 1) + ".md"
        if fileexsit(itempath):
            logger.warning("已存在:" + id + "(" + str(i + 1) + ")-" + bigpname + ":" + catchname + "(" + str(
                    level) + ") --" + catchurl)
            temp = readfilelist(itempath)

            for line in temp:
                try:
                    temptemp = line.split(",")
                    insertlist(temptemp, url)
                    detailall[temptemp[0]] = temptemp[1]
                except Exception:
                    logger.error("Failed to read cached list-page line: " + line)
            continue
        else:
            # The file is missing but the category is marked finished, so there are fewer pages than requested
            if pagefinish:
                break
            logger.warning("抓取:" + id + "(" + str(i + 1) + ")-" + bigpname + ":" + catchname + "(" + str(
                    level) + ") --" + catchurl)
            # 构造页数
            # ?_encoding=UTF8&pg=1&ajax=1   3个商品
            # ?_encoding=UTF8&pg=1&ajax=1&isAboveTheFold=0 17个商品
            # https://www.amazon.com/Best-Sellers-Clothing/zgbs/apparel/ref=zg_bs_apparel_pg_5?_encoding=UTF8&pg=5&ajax=1
            # Referer:https://www.amazon.com/gp/bestsellers/apparel/ref=pd_zg_hrsr_a_1_1
            # Referer:https://www.amazon.com/gp/bestsellers/apparel/ref=pd_zg_hrsr_a_1_1
            # X-Requested-With:XMLHttpRequest
            items3 = "/ref=zg_bs_apparel_pg_" + str(i + 1) + "?_encoding=UTF8&ajax=1&pg=" + str(i + 1)
            items17 = "/ref=zg_bs_apparel_pg_" + str(i + 1) + "?_encoding=UTF8&&isAboveTheFold=0&ajax=1&pg=" + str(
                i + 1)
            listheader = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                # "Accept-Encoding": "gzip, deflate, br",
                "Connection": "keep-alive",
                "Accept-Language": "en-US;q=0.8,en;q=0.5",
                "Upgrade-Insecure-Requests": "1",
                # 'Referer': 'https://www.amazon.com/',
                'Host': 'www.amazon.com'
            }
            content3 = ratedownload(url=catchurl + items3, where=where, config=mysqlconfig, header=listheader)
            content17 = ratedownload(url=catchurl + items17, where=where, config=mysqlconfig, header=listheader)
            if content3 == 0 or content17 == 0:
                break
            if content3 is None:
                retryhappen = True
                continue
            if content17 is None:
                retryhappen = True
                continue
            try:
                # {'91':['91', 'https://www.amazon.com/dp/B003Z968T0', 'WhisperKOOL® Platinum Split System 80...']}
                temp3 = rateparse(content3)
                temp17 = rateparse(content17)
                if temp3 == {} and temp17 == {}:
                    continue
                else:
                    with open(itempath, "wb") as f:
                        for k in sorted(temp3.keys()):
                            if insertlist(temp3[k], url):
                                detailall[k] = temp3[k][1]
                                f.write((",".join(temp3[k]) + "\n").encode("utf-8"))
                        for j in sorted(temp17.keys()):
                            if insertlist(temp17[j], url):
                                detailall[j] = temp17[j][1]
                                f.write((",".join(temp17[j]) + "\n").encode("utf-8"))
            except Exception as err:
                logger.error("Failed to parse list page: " + catchurl + ":" + str(i + 1))
                logger.error(err, exc_info=1)
    if not retryhappen and not pagefinish:
        with open(keepdir + "/ko.jinhan", "wt") as f:
            f.write("1")

    for rank in detailall:
        detailname = rank + "-" + detailall[rank]
        rankeep = detaildir + "/" + detailname
        if fileexsit(rankeep + ".md"):
            loggers.warning("存在!" + rankeep)
            continue
        if fileexsit(rankeep + ".emd"):
            loggers.warning("存在(页面找不到))!" + rankeep)
            continue
        detailurl = "https://www.amazon.com/dp/" + detailall[rank]

        # TODO
        # Local copies are only written when localkeep is enabled below
        if fileexsit(rankeep + ".html"):
            with open(rankeep + ".html", "rb") as ff:
                detailpage = ff.read()
        else:
            detailheader = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                # "Accept-Encoding": "gzip, deflate, br",
                "Connection": "keep-alive",
                "Accept-Language": "en-US;q=0.8,en;q=0.5",
                "Upgrade-Insecure-Requests": "1",
                # "Cache-Control":"max-age=0",
                # 'Referer': 'https://www.amazon.com/',
                'Host': 'www.amazon.com'
            }
            detailpage = ratedownload(url=detailurl, where=where, config=mysqlconfig, header=detailheader)
            if detailpage is None:
                continue
            if detailpage == 0:
                with open(rankeep + ".emd", "wt") as f:
                    f.write("1")
                continue
            else:
                if getconfig()["localkeep"]:
                    with open(rankeep + ".html", "wb") as f:
                        f.write(detailpage)
        try:
            pinfo = pinfoparse(detailpage.decode("utf-8", "ignore"))
        except Exception:
            logger.error("Failed to parse detail page: " + detailurl)
            continue
        try:
            pinfo["smallrank"] = int(rank)
        except (ValueError, TypeError):
            pinfo["smallrank"] = -1
        pinfo["asin"] = detailall[rank]
        pinfo["url"] = detailurl
        pinfo["name"] = catchname
        pinfo["bigname"] = bigpname
        pinfo["id"] = todays + "-" + detailname

        # Insert into the database; a failure here does not matter
        insertexsitlist(pinfo, url)
        if insertpmysql(pinfo, db, id):
            with open(rankeep + ".md", "wb") as f:
                f.write(objectToString(pinfo).encode("utf-8"))

    # Done
    logger.warning(todays + "|" + bigpname + "|" + db + ":" + id + " completed")
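
One design caveat in the page cache above: each item is stored as a comma-joined line and re-read with split(","), so a product title containing a comma would corrupt the row. If that matters, the csv module handles the quoting at roughly the same cost (a sketch, not what the source does):

import csv

def write_page_cache(itempath, rows):
    # rows like [['91', 'B003Z968T0', 'WhisperKOOL, Platinum ...'], ...]
    with open(itempath, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)

def read_page_cache(itempath):
    with open(itempath, encoding="utf-8", newline="") as f:
        return list(csv.reader(f))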