def get_id():
    """Collect Best Buy sku ids from a paginated category search and append them to ./id.txt.

    Iterates pages 1-5 of a fixed Best Buy search URL, extracts every
    ``data-sku-id`` attribute from each result page and appends one id
    per line to ./id.txt.
    """
    # "[i]" is the page-number placeholder in the category search URL.
    base_url = '''http://www.bestbuy.com/site/searchpage.jsp?cp=[i]&searchType=search&_dyncharset=UTF-8&ks=960&sc=Global&list=y&usc=All%20Categories&type=page&id=pcat17071&iht=n&seeAll=&browsedCategory=pcmcat309300050015&st=pcmcat309300050015_categoryid%24abcat0502000&qp=features_facet%3DFeatures~2-in-1%20Design^condition_facet%3DCondition~New'''
    for i in range(1, 6):  # page numbers (5 pages)
        url = base_url.replace("[i]", str(i))
        print(url)
        html = get_html.get_html_src(url)
        id_list = re.findall(r'data-sku-id="(.*?)"', html, re.S)
        print(len(id_list))
        # FIX: "aw" is not a valid open() mode in Python 3 -> "a" (append).
        # Also open the file once per page instead of once per id.
        with open("./id.txt", "a") as f:
            for goods_id in id_list:
                f.write(goods_id + "\n")
                print(goods_id)
def get_asin(base_url, page):
    """Scrape Frys product URLs from `page` pages of a paginated search.

    :param base_url: search URL template containing "[page]" and "[start]"
                     placeholders.
    :param page: number of result pages to fetch (25 items per page).

    Appends each absolute product URL to ./Result/items_url.txt.
    """
    for i in range(0, page):  # page index
        start_num = i * 25  # 25 items per result page
        url = base_url.replace("[page]", str(i)).replace('[start]', str(start_num))
        time.sleep(0.5)  # throttle requests
        html = get_html.get_html_src(url)
        # Narrow to the result cells first, then pull the hrefs out of them.
        url_list_re = re.findall(r'<td colspan="2">(.*?)</td>', html, re.S)
        url_list = re.findall(r'<A HREF="(.*?)">', str(url_list_re), re.S)
        # FIX: "aw" is not a valid open() mode in Python 3 -> "a" (append).
        with open("./Result/items_url.txt", "a") as f:
            for goods_url in url_list:
                f.write('http://www.frys.com/' + goods_url + "\n")
                print(goods_url)
def get_count(price_range, url):
    """Return the Amazon search-result count for `url`, or None on failure.

    Failure cases are recorded in the module-level files: captcha/empty pages
    in captcha_url_file, 404s in not_list_file, fetch errors in
    not_crawl_file, zero-result searches in f_no_product, and urls whose
    count could not be parsed in get_count_fail.txt.
    """
    try:
        html = get_html.get_html_src(url)
        # Captcha / robot-check page (or an empty response).
        if html == '' or -1 != html.find('Sorry, we just need to make sure you'):
            with lock:  # FIX: with-statement releases the lock even if a write fails
                captcha_url_file.write(url + '\n')
                captcha_url_file.flush()
            return
        # Listing no longer exists.
        if html == '404 error':
            with lock:
                not_list_file.write(url + '\n')
                not_list_file.flush()
            return
        # Page was never fetched successfully.
        if html == 'time out or other errors':
            with lock:
                not_crawl_file.write(price_range + '\n')
                not_crawl_file.flush()
            return
        # Multi-page form "1-60 of N results" first, then single-page "N results".
        num = re.search('<h2 id="s-result-count".*?>1-60 of (.*?) result', html)
        if num is None:  # FIX: identity comparison for None
            num = re.search('<h2 id="s-result-count".*?>(.*?) result', html)
        if num is not None:
            return int(num.group(1).replace(',', ''))
        if 'did not match any products.' in html:
            with lock:
                f_no_product.write(url + '\n')
                f_no_product.flush()
        else:
            # FIX: "aw" is not a valid open() mode in Python 3 -> "a".
            with open('get_count_fail.txt', 'a') as f:
                f.write(price_range + '\n')
        return
    except Exception as e:
        print(str(e))
def handle(url):
    """Fetch an Amazon search page and append every data-asin found to f_asins.

    Failures (captcha/empty page, 404, fetch error, no asins) are logged to
    f_fail with a reason; successfully handled urls are appended to f_success.
    """
    try:
        html = get_html.get_html_src(url)
        # Captcha / robot-check page (or an empty response).
        if html == '' or -1 != html.find('Sorry, we just need to make sure you'):
            with lock:  # FIX: with-statement cannot leak the lock on a write error
                f_fail.write(url + '\t空或验证码\n')
                f_fail.flush()
            return
        if html == '404 error':
            with lock:
                f_fail.write(url + '\t404 error\n')
                f_fail.flush()
            return
        if html == 'time out or other errors':
            with lock:
                f_fail.write(url + '\ttime out or other errors\n')
                f_fail.flush()
            return
        tmp_asins = re.findall(r'data-asin="(.*?)"', html)
        print(tmp_asins)
        if tmp_asins:
            # FIX: one lock acquisition for the whole batch instead of
            # acquire/release per asin.
            with lock:
                for asin in tmp_asins:
                    f_asins.write(asin + '\n')
                f_asins.flush()
                f_success.write(url + '\n')
                f_success.flush()
        else:
            with lock:
                f_fail.write(url + '\ttmp_asins为空\n')
                f_fail.flush()
    except Exception as e:
        print(str(e))
def handle(asin):
    """Fetch the amazon.ca /dp/ page for `asin` and record the lowest "New from" price.

    Writes "asin<TAB>price" to result_file and the asin to success_asin_file.
    Robot-check pages (and any parsing/fetch exception) are recorded in
    captcha_url_file; non-existent pages in not_list_file.
    """
    asin = asin.strip()
    try:
        baseurl = "https://www.amazon.ca/dp/[asin]"
        url = baseurl.replace('[asin]', asin)
        print(asin)
        html = get_html.get_html_src(url)
        # Captcha page — record the asin for a retry.
        if re.findall('<title dir="ltr">Robot Check</title>', html):
            with lock:  # FIX: with-statement releases the lock even if a write fails
                captcha_url_file.write(asin + '\n')
                captcha_url_file.flush()
            print("robot_check")
            return
        # Dead product page.
        if re.findall("We're sorry. The Web address you entered is not a functioning page on our site", html):
            with lock:
                not_list_file.write(asin + '\n')
                not_list_file.flush()
            print("not exit")
            return
        buyboxinfo = [asin]
        # First "New from CDN$ ..." price; IndexError here falls through to
        # the except handler, which queues the asin for retry.
        price = re.findall(r'New <span class="olp-from">from</span> CDN\$ (.*?)\n', html)[0]
        buyboxinfo.append(str(price))
        with lock:
            result_file.write("\t".join(buyboxinfo) + "\n")
            result_file.flush()
            success_asin_file.write(asin + '\n')
            success_asin_file.flush()
        print("success")
    except Exception as e:
        print(str(e))
        with lock:
            captcha_url_file.write(asin + '\n')
            captcha_url_file.flush()
        print("error: not html")
def get_info(sku):
    """Look up one Best Buy sku via the products API and write an availability row.

    Reads store-availability JSON from ./test.json (first line), maps each
    watched store's availability to a numeric code, and writes the row through
    the module-level csv_writer.

    :param sku: sku string with a trailing character (newline) that is dropped.
    """
    import json  # local import: replaces eval() for parsing JSON
    # Legacy globals kept so any remaining eval()-based parsers in this
    # module still resolve JSON's true/false/null literals.
    global false, true, null
    false = False
    true = True
    null = None
    file_id = open("./test.json", "r")
    Ids = file_id.readlines()
    url = 'http://api.bestbuy.com/v1/products/' + sku[:-1] + '.json?apiKey=68zbtdy4wmac9dgvnbhwke4e'
    info = get_html.get_html_src(url)
    # FIX: json.loads instead of eval() on an external HTTP response
    # (eval executes arbitrary code; json.loads only parses data).
    items_info = json.loads(info)
    # FIX: dict.has_key() was removed in Python 3 -> "in".
    if 'salePrice' in items_info:
        price = items_info['salePrice']
    else:
        price = 'wrong'
    # FIX: json.loads instead of eval() on the file contents as well.
    store_json = json.loads(Ids[0])
    availabilities = store_json['storeAvailabilities']
    # Watched stores; "0" means "no data / not available" by default.
    myconf = {
        "ISSAQUAH WA": "0",
        "TUKWILA WA": "0",
        "FEDERAL WAY WA": "0",
        "BELLEVUE WA": "0",
        "SOUTH CENTER": "0",
    }
    mydata = {}
    for li in availabilities:
        store_name = li['store']["name"]
        if store_name in myconf:
            if "lowOnStock" in li['skuAvailabilities'][0]:
                mydata[store_name] = "Low"
            else:
                mydata[store_name] = li['skuAvailabilities'][0]["availabilityType"]
    realdata = myconf  # alias: merging scraped data over the defaults
    print(realdata)
    for (k, v) in mydata.items():
        realdata[k] = v
    realli = ["2016-11-24", sku[:-1]]
    realli.append(str(price))
    # Per-store codes: InStore->3, ShipToStore->1, Low->2, anything else->row value
    # unchanged; the summary flags pick the best availability seen.
    state = ''
    low = ''
    donot = ''
    for (k, v) in realdata.items():
        if v == "InStore":
            v = "3"
            state = "1"
        elif v == 'ShipToStore':
            v = '1'
            donot = "0"
        elif v == 'Low':
            v = "2"
            low = "2"
        realli.append(v)
    # Summary column: 1 = in store somewhere, 2 = low stock only,
    # 0 = ship-to-store only, 4 = nothing available.
    if state == "1":
        realli.append(state)
    elif state == '' and low == "2":
        realli.append(low)
    elif state == '' and low == '' and donot == '0':
        realli.append(donot)
    else:
        realli.append("4")
    print(realli)
    csv_writer.writerow(realli)
def get_info(file_name):
    """Build a tab-separated product row for every sku listed in ./Id_last.txt.

    For each id: fetch basic fields (price, shipping, stock, brand, title)
    from the Best Buy products API, scrape the product page for up to five
    carousel images, scrape the search page for up to five short-description
    bullet points, scrape the specifications tab for the "Key Specs" table,
    and append one row to the module-level result_file.

    :param file_name: output file passed to create_titles() for the header row.
    """
    import json  # local import: replaces eval() for parsing API responses
    # Legacy globals kept so any remaining eval()-based parsers in this
    # module still resolve JSON's true/false/null literals.
    global false, true, null
    false = False
    true = True
    null = None
    file_id = open("./Id_last.txt", "r")
    titles = ['itemsId', 'price', 'Original_price', 'ship', 'stock', 'brand',
              'title', 'img1', 'img2', 'img3', 'img4', 'img5', 'detail1',
              'detail2', 'detail3', 'detail4', 'detail5', 'Specification']
    create_titles(file_name, titles)  # write the header row
    Ids = file_id.readlines()
    for goods_id in Ids:
        goods_id = goods_id.split("\n")[0]  # strip the trailing newline
        url = 'http://api.bestbuy.com/v1/products/' + str(goods_id) + '.json?apiKey=68zbtdy4wmac9dgvnbhwke4e'
        info = get_html.get_html_src(url)
        # FIX: parse the API response with json.loads instead of eval()
        # (eval executes arbitrary code from the network).
        items_info = json.loads(info)
        itemsId = items_info['sku']
        goods_info = [str(itemsId)]
        # sale price
        price = items_info['salePrice']
        goods_info.append(str(price))
        # regular price
        Original_price = items_info['regularPrice']
        goods_info.append(str(Original_price))
        # shipping cost ("" from the API becomes the literal string "Null")
        ship = items_info['shippingCost']
        if ship == '':
            ship = "Null"
        goods_info.append(str(ship))
        # online stock flag
        stock_info = items_info['onlineAvailability']
        if stock_info:
            stock = 'in stock'
        else:
            stock = 'out of stock'
        goods_info.append(str(stock))
        # brand — FIX: dict.has_key() was removed in Python 3 -> "in".
        brand = ''
        if 'manufacturer' in items_info:
            brand = items_info['manufacturer']
        goods_info.append(str(brand))
        # title
        title = items_info['name']
        goods_info.append(str(title))
        # images: scrape the product page carousel, pad/trim to exactly 5
        url = 'http://www.bestbuy.com/site/products/' + str(goods_id) + '.p'
        html = get_html.get_html_src(url)
        img_list = re.findall(r'<li data-target="#carousel-main".*?src="(.*?);', html, re.S)
        while len(img_list) < 5:
            img_list.append("")
        goods_info += img_list[:5]
        # details: short-description bullet points from the search page
        detail_url = 'http://www.bestbuy.com/site/searchpage.jsp?st=' + goods_id + '&_dyncharset=UTF-8&id=pcat17071&type=page&sc=Global&cp=1&nrp=&sp=&qp=&list=n&af=true&iht=y&usc=All+Categories&ks=960&keys=keys'
        detail_html = get_html.get_html_src(detail_url)
        detail_list = []
        detail_list_src = re.findall(r'<div class="short-description">(.*?)</div>', detail_html, re.S)
        if detail_list_src == []:
            # Fallback detail when the page carries no short description.
            detail_list.append('Brand Type:' + brand)
        if detail_html:
            for detail_items in detail_list_src:
                detail_list = re.findall(r'<li>(.*?)</li>', detail_items, re.S)
                if detail_list == []:
                    # No <li> bullets: use the raw description blocks instead.
                    detail_list = detail_list_src
        while len(detail_list) < 5:
            detail_list.append("")
        goods_info += detail_list[:5]
        # specification: follow the specifications-tab fragment URL.
        # FIX: default to "" so every row keeps its Specification column even
        # when no fragmentUrl is found on the page.
        spct_dic = ""
        spct_info = re.findall(r'data-tabs=.*?fragmentUrl":"(.*?);', html, re.S)
        if spct_info:
            spct_url = 'http://www.bestbuy.com' + spct_info[0] + ";template=_specificationsTab"
            html1 = get_html.get_html_src(spct_url)
            # Limit the scan to the "Key Specs" group, then pull name/value pairs.
            spct_html_limit = re.findall(
                r'<div class="specification-group key-specs">(.*?)<div class="specification-group">',
                html1, re.S)
            spct_html = re.findall(
                r'<div class="specification-name">(.*?)</div>.*?<div class="specification-value">(.*?)</div>',
                str(spct_html_limit), re.S)
            if spct_html:
                spec_pairs = {}
                for spct_items in spct_html:
                    # FIX: the original passed re.S as re.sub's *count*
                    # argument (silently limiting substitutions to 16);
                    # the pattern has no '.', so no flag is needed.
                    spec_name = re.sub(r'<[^>]+>', '', spct_items[0])
                    spec_pairs[spec_name.replace(' ', '')] = spct_items[1]
                spct_dic = str(spec_pairs)
        goods_info.append(spct_dic)
        # write one tab-separated row
        result_file.write("\t".join(goods_info) + "\n")
        print('=============')
        print(goods_info)
        file_id.flush()
        result_file.flush()
def handle(asin):
    """Fetch the Amazon /dp/ page for `asin` and record its availability message.

    Writes "asin<TAB>availability" to result_file and the asin to
    success_asin_file; captcha pages, dead pages and fetch errors are routed
    to captcha_url_file, not_list_file and not_crawl_file respectively.
    """
    asin = asin.strip()
    try:
        baseurl = "https://www.amazon.com/dp/[asin]"
        url = baseurl.replace('[asin]', asin)
        html = get_html.get_html_src(url)
        print('handling...')
        # Captcha / robot-check page (or an empty response).
        if html == '' or -1 != html.find('Sorry, we just need to make sure you'):
            with lock:  # FIX: with-statement releases the lock even if a write fails
                captcha_url_file.write(asin + '\n')
                captcha_url_file.flush()
            return
        # Listing no longer exists.
        if html == '404 error':
            with lock:
                not_list_file.write(asin + '\n')
                not_list_file.flush()
            return
        # Page was never fetched successfully.
        if html == 'time out or other errors':
            with lock:
                not_crawl_file.write(asin + '\n')
                not_crawl_file.flush()
            return
        buyboxinfo = [asin]
        # The availability text appears under one of several span classes;
        # first match wins (replaces the original 4-level nested if/else).
        stock = ''
        for pattern in (
            r'<span class="a-size-medium a-color-success">\s+(.*?)\.',
            r'<span class="a-size-base a-color-state">\s+(.*?)\.',
            r'<span class="a-color-success a-text-bold">\s+(.*?)\.',
            r'<span class="a-size-medium a-color-price">\s+(.*?)\.',
        ):
            m = re.search(pattern, html, re.S)
            if m:
                stock = m.group(1)
                break
        # FIX: re.sub takes (repl, string); the original had the arguments
        # swapped and substituted into '' — discarding the scraped text.
        buyboxinfo.append(regex_sub_info.sub('', str(stock)))
        print(buyboxinfo)
        with lock:
            result_file.write("\t".join(buyboxinfo) + "\n")
            result_file.flush()
            success_asin_file.write(asin + '\n')
            success_asin_file.flush()
    except Exception as e:
        print(asin, e)
def handle(asin):
    """Scrape the new/Prime-eligible offer listing for `asin`.

    Records "asin<TAB>author<TAB>min_price<TAB>FBA|FBM" in result_file:
    author from the byline, the lowest offer price when any Prime-eligible
    new offer exists (status FBA), otherwise empty price and status FBM.
    Captcha pages, dead pages and fetch errors go to captcha_url_file,
    not_list_file and not_crawl_file respectively.
    """
    asin = asin.strip()
    try:
        baseurl = "http://www.amazon.com/gp/offer-listing/[asin]/ref=olp_f_primeEligible?ie=UTF8&f_new=true&f_primeEligible=true"
        url = baseurl.replace('[asin]', asin)
        html = get_html.get_html_src(url)
        print('handling...')
        # Captcha / robot-check page (or an empty response).
        if html == '' or -1 != html.find('Sorry, we just need to make sure you'):
            with lock:  # FIX: with-statement releases the lock even if a write fails
                captcha_url_file.write(asin + '\n')
                captcha_url_file.flush()
            return
        # Listing no longer exists.
        if html == '404 error':
            with lock:
                not_list_file.write(asin + '\n')
                not_list_file.flush()
            return
        # Page was never fetched successfully.
        if html == 'time out or other errors':
            with lock:
                not_crawl_file.write(asin + '\n')
                not_crawl_file.flush()
            return
        buyboxinfo = [asin]
        author = re.search(
            r'<div id="olpProductByline" class="a-section a-spacing-mini">\s+by (.*?)\s+</div>',
            html, re.S)
        author = author.group(1).strip() if author else ''
        # FIX: re.sub takes (repl, string); the original had the arguments
        # swapped and substituted into '' — discarding the scraped value.
        buyboxinfo.append(regex_sub_info.sub('', str(author)))
        price_list = re.findall(
            r'<span class="a-size-large a-color-price olpOfferPrice a-text-bold">\s+\$(.*?)\s+</span>',
            html)
        if price_list:
            prices = [float(p.strip().replace(',', '')) for p in price_list]
            min_price = min(prices)
            fbastatus = 'FBA'
        else:
            min_price = ''
            fbastatus = 'FBM'
        buyboxinfo.append(regex_sub_info.sub('', str(min_price)))
        buyboxinfo.append(regex_sub_info.sub('', str(fbastatus)))
        print(buyboxinfo)
        with lock:
            result_file.write("\t".join(buyboxinfo) + "\n")
            result_file.flush()
            success_asin_file.write(asin.strip() + '\n')
            success_asin_file.flush()
    except Exception as e:
        print(asin, e)