Example #1
    def goods(self, response):
        item = response.meta['item']
        sel = scrapy.Selector(response)
        url = response.url
        body = response.body
        ProductID = item['ProductID']
        PreferentialPrice = item['PreferentialPrice']
        price = item['price']

        if "error" in url or "2017?t" in url or "/?" in url:  #302重定向页面,写回原页面处理
            url = "https://item.jd.com/" + str(ProductID) + ".html"
            item = XiwanjiItem(ProductID=ProductID,
                               PreferentialPrice=PreferentialPrice,
                               price=price)
            yield scrapy.Request(url, callback=self.goods, meta={'item': item})
            return None

        # -------------------- JD Worldwide (global) product pages ---------------------------------------------
        elif "hk" in url:
            print("全球购:", url)

            # JD product-introduction section
            detail_info = sel.xpath(".//div[@class='p-parameter']")  # contains the product detail block
            detail = detail_info.xpath(".//li/text()").extract()
            if detail[0] == '品牌: ':  # the brand value lives in the first <li>'s title attribute
                detail_brand = detail_info.xpath(
                    ".//li[1]/@title").extract()[0]
                detail[0] = detail[0] + detail_brand
            product_detail = '\"' + ' '.join(detail).replace('\t', '').replace(
                '\n', '').replace('  ', '') + '\"'
            detail_1 = detail_info.extract()  # narrow the scope: pull the wanted fields from the introduction section

            # product name
            try:
                p_Name = sel.xpath(".//div[@class='sku-name']/text()").extract(
                )[-1].strip('\"').strip('\n').strip().replace('\t', '')
                print(p_Name)
            except:
                p_Name = None

            # detail_info=sel.xpath(".//div[@class='p-parameter']/text()").extract()

            # shop name
            try:
                shop_name = sel.xpath(
                    ".//div[@class='shopName']/strong/span/a/text()").extract(
                    )[0]  # shop name
            except:
                try:
                    shop = sel.xpath(
                        ".//div[@class='p-parameter']/ul[@class='parameter2']/li[3]/@title"
                    ).extract()[0]
                    if '店' in shop:  # titles containing '店' (shop) look like shop names
                        shop_name = shop
                    else:
                        shop_name = None
                except:
                    shop_name = None

            # JD "specifications and packaging" section (parsed into the dict x)
            try:
                s = BeautifulSoup(body, 'lxml')
                guige = s.find('div', id='specifications')
                x = {}
                guige2 = guige.find_all('td', class_='tdTitle')
                guige3 = guige.find_all('td', class_=None)
                for i in range(len(guige2)):
                    dt = re.findall(">(.*?)<", str(guige2[i]))
                    dd = re.findall(">(.*?)<", str(guige3[i]))
                    x.setdefault(dt[0], dd[0])
            except:
                x = None

            # product brand
            try:
                brand = x['品牌']
            except:
                brand = p_Name.split(" ")[0]

            if brand != p_Name:
                # strip parenthesised text (full-width first, then ASCII)
                if "（" in brand and "）" in brand:
                    dd = re.findall("（.*?）", brand)[0]
                    brand = brand.replace(dd, '').replace(' ', '')
                if "(" in brand and ")" in brand:
                    dd = re.findall(r"\(.*?\)", brand)[0]
                    brand = brand.replace(dd, '').replace(' ', '')
                # map English brand names to their Chinese equivalents
                brand_aliases = {
                    "Panasonic": "松下",
                    "CHEBLO": "樱花",
                    "MBO": "美博",
                    "YAIR": "扬子",
                    "PHLGCO": "飞歌",
                    "FZM": "方米",
                    "inyan": "迎燕",
                    "JENSANY": "金三洋",
                }
                brand = brand_aliases.get(brand, brand)

            # product name (model)
            try:
                try:
                    X_name = re.findall(">货号:(.*?)<",
                                        detail_1[0])[0].strip().replace(
                                            brand, '')
                    if p_Name == None:
                        p_Name = X_name
                except:
                    try:
                        X_name = x['型号'].replace(brand, '')
                        if p_Name == None:
                            p_Name = X_name
                    except:
                        X_name = re.findall(">商品名称:(.*?)<",
                                            detail_1[0])[0].strip().replace(
                                                '\t', '').replace(brand,
                                                                  '')  # 商品名称
                        if len(X_name) == 0:
                            X_name = p_Name
                        if p_Name == None:
                            p_Name = X_name
            except:
                X_name = p_Name

            if X_name == p_Name:
                if brand and brand != p_Name:
                    if brand in X_name:
                        X_name = X_name.replace(brand, '')
                X_name = re.sub(r'（.*?）', '', X_name)  # drop full-width parenthesised text
                X_name = re.sub(r'\(.*?\)', '', X_name)  # drop ASCII parenthesised text
                X_name = re.sub(r'[\u4e00-\u9fa5]+', '', X_name)  # drop Chinese characters
                X_name = X_name.replace('/', '').strip()

            try:
                open_method = re.findall(">开合方式:(.*?)<",
                                         detail_1[0])[0].strip()
            except:
                try:
                    open_method = x['开合方式']
                except:
                    open_method = None

            try:
                laundry = re.findall(">洗碗方式:(.*?)<", detail_1[0])[0].strip()
            except:
                try:
                    laundry = x['洗涤方式']
                except:
                    laundry = None

            try:
                capacity = re.findall(">总容积:(.*?)<", detail_1[0])[0].strip()
            except:
                try:
                    capacity = x['餐具容量(套)']
                except:
                    capacity = None

            try:
                control = re.findall(">控制方式:(.*?)<", detail_1[0])[0].strip()
            except:
                try:
                    control = x['控制方式']
                except:
                    control = None

            try:
                dry_method = x['干燥方式']
            except:
                try:
                    dry_method = re.findall(">干燥方式:(.*?)<",
                                            detail_1[0])[0].strip()
                except:
                    dry_method = None

            try:
                disinfection = x['消毒方式']
            except:
                try:
                    disinfection = re.findall(">消毒方式:(.*?)<",
                                              detail_1[0])[0].strip()
                except:
                    disinfection = None

            try:
                consump = x['耗水量(L)']
            except:
                try:
                    consump = re.findall(">耗水量:(.*?)<", detail_1[0])[0].strip()
                except:
                    consump = None

            try:
                color = x['颜色']
            except:
                try:
                    color = re.findall(">颜色:(.*?)<", detail_1[0])[0].strip()
                except:
                    color = None

            # price_web="https://p.3.cn/prices/mgets?pduid=15107253217849152442&skuIds=J_"+str(ProductID)
            comment_web = "https://sclub.jd.com/comment/productPageComments.action?productId=" + str(
                ProductID) + "&score=0&sortType=5&page=0&pageSize=10"
        # --------------------- Regular product pages -----------------------------------
        else:

            # product name (1. read from the name block; 2. read from the page-header title)
            try:
                p_Name = sel.xpath(".//div[@class='sku-name']/text()").extract(
                )[0].strip('\"').strip('\n').strip().replace('\t', '')  # 商品名称
                if len(p_Name) == 0:  # 如发生商品名称读取结果为空的情况
                    p_Name = sel.xpath(".//div[@class='item ellipsis']/@title"
                                       ).extract()[0].replace('\t', '')
            except:
                try:
                    p_Name = sel.xpath(".//div[@class='item ellipsis']/@title"
                                       ).extract()[0].replace('\t', '')
                except:
                    p_Name = None

            # JD product-introduction section
            detail_info = sel.xpath(".//div[@class='p-parameter']")  # contains the product detail block
            detail = detail_info.xpath(".//li/text()").extract()
            if detail[0] == '品牌: ':  # the brand value lives in the first <li>'s title attribute
                detail_brand = detail_info.xpath(
                    ".//li[1]/@title").extract()[0]
                detail[0] = detail[0] + detail_brand
            product_detail = '\"' + ' '.join(detail).replace('\t', '').replace(
                '\n', '').replace('  ', '') + '\"'
            detail_1 = detail_info.extract()

            # JD "specifications and packaging" section (parsed into the dict x)
            try:
                s = BeautifulSoup(body, 'lxml')
                # print(s)
                guige = s.find('div', class_='Ptable')
                # print (guige)
                guige1 = guige.find_all('div', class_='Ptable-item')
                # print (guige1)
                x = {}
                for gg in guige1:
                    guige2 = gg.find_all('dt', class_=None)
                    guige3 = gg.find_all('dd', class_=None)
                    for i in range(len(guige2)):
                        dt = re.findall(">(.*?)<", str(guige2[i]))
                        dd = re.findall(">(.*?)<", str(guige3[i]))
                        x.setdefault(dt[0], dd[0])
            except:
                x = None

            # shop name
            try:
                try:
                    shop_name = sel.xpath(
                        ".//div[@class='name']/a/text()").extract()[0]  # 店铺名称
                except:
                    shop_name = re.findall(">店铺:(.*?)<",
                                           detail_1[0])[0].strip()
            except:
                shop_name = "京东自营"

            # brand is not given in the "品牌: **" form here, so find() is unnecessary
            try:
                brand = detail_info.xpath(
                    ".//ul[@id='parameter-brand']/li/a/text()").extract(
                    )[0].strip()  # product brand
            except:
                try:
                    brand = x['品牌']
                except:
                    brand = None

            if brand:
                # strip parenthesised text (full-width first, then ASCII)
                if "（" in brand and "）" in brand:
                    dd = re.findall("（.*?）", brand)[0]
                    brand = brand.replace(dd, '').replace(' ', '')
                if "(" in brand and ")" in brand:
                    dd = re.findall(r"\(.*?\)", brand)[0]
                    brand = brand.replace(dd, '').replace(' ', '')
                # map English brand names to their Chinese equivalents
                brand_aliases = {
                    "Panasonic": "松下",
                    "CHEBLO": "樱花",
                    "MBO": "美博",
                    "YAIR": "扬子",
                    "PHLGCO": "飞歌",
                    "FZM": "方米",
                    "inyan": "迎燕",
                    "JENSANY": "金三洋",
                }
                brand = brand_aliases.get(brand, brand)

            # product name (model)
            try:
                try:
                    X_name = re.findall(">货号:(.*?)<",
                                        detail_1[0])[0].strip().replace(
                                            brand, '')
                except:
                    try:
                        X_name = x['型号'].replace(brand, '')
                    except:
                        X_name = re.findall(">商品名称:(.*?)<",
                                            detail_1[0])[0].strip().replace(
                                                '\t', '').replace(brand,
                                                                  '')  # 商品名称
                        if len(X_name) == 0:
                            X_name = p_Name
                        if p_Name == None:
                            p_Name = X_name
            except:
                X_name = p_Name
            if X_name == p_Name:
                if brand and brand != p_Name:
                    if brand in X_name:
                        X_name = X_name.replace(brand, '')
                X_name = re.sub(r'（.*?）', '', X_name)  # drop full-width parenthesised text
                X_name = re.sub(r'\(.*?\)', '', X_name)  # drop ASCII parenthesised text
                X_name = re.sub(r'[\u4e00-\u9fa5]+', '', X_name)  # drop Chinese characters
                X_name = X_name.replace('/', '').strip()

            try:
                open_method = re.findall(">开合方式:(.*?)<",
                                         detail_1[0])[0].strip()
            except:
                try:
                    open_method = x['开合方式']
                except:
                    open_method = None

            try:
                laundry = re.findall(">洗碗方式:(.*?)<", detail_1[0])[0].strip()
            except:
                try:
                    laundry = x['洗涤方式']
                except:
                    laundry = None

            try:
                capacity = re.findall(">总容积:(.*?)<", detail_1[0])[0].strip()
            except:
                try:
                    capacity = x['餐具容量(套)']
                except:
                    capacity = None

            try:
                control = re.findall(">控制方式:(.*?)<", detail_1[0])[0].strip()
            except:
                try:
                    control = x['控制方式']
                except:
                    control = None

            try:
                dry_method = x['干燥方式']
            except:
                try:
                    dry_method = re.findall(">干燥方式:(.*?)<",
                                            detail_1[0])[0].strip()
                except:
                    dry_method = None

            try:
                disinfection = x['消毒方式']
            except:
                try:
                    disinfection = re.findall(">消毒方式:(.*?)<",
                                              detail_1[0])[0].strip()
                except:
                    disinfection = None

            try:
                consump = x['耗水量(L)']
            except:
                try:
                    consump = re.findall(">耗水量:(.*?)<", detail_1[0])[0].strip()
                except:
                    consump = None

            try:
                color = x['颜色']
            except:
                try:
                    color = re.findall(">颜色:(.*?)<", detail_1[0])[0].strip()
                except:
                    color = None

            # price_web = "https://p.3.cn/prices/mgets?pduid=1508741337887922929012&skuIds=J_" + str(ProductID)
            comment_web = "https://sclub.jd.com/comment/productPageComments.action?productId=" + str(
                ProductID) + "&score=0&sortType=5&page=0&pageSize=10"
            # price_web = "https://p.3.cn/prices/mgets?pduid=1508741337887922929012&skuIds=J_" + str(ProductID)
            # price_web="https://p.3.cn/prices/mgets?ext=11000000&pin=&type=1&area=1_72_4137_0&skuIds=J_"+str(ProductID)+"&pdbp=0&pdtk=vJSo%2BcN%2B1Ot1ULpZg6kb4jfma6jcULJ1G2ulutvvlxgL3fj5JLFWweQbLYhUVX2E&pdpin=&pduid=1508741337887922929012&source=list_pc_front&_=1510210566056"

        # Product reviews (JSON)

        # comment_web = "https://sclub.jd.com/comment/productPageComments.action?productId=" + str(ProductID) + "&score=0&sortType=5&page=0&pageSize=10"
        # comment_web="https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds="+str(ProductID)

        # comment_webs = requests.get(comment_web,timeout=1000).text
        # urls = json.loads(comment_webs)
        urls = requests.get(comment_web, timeout=1000).json()
        try:
            comment = urls['hotCommentTagStatistics']
            keyword_list = []
            for i in range(len(comment)):
                keyword_list.append(comment[i]['name'])
            if len(keyword_list) == 0:
                keyword = None
            else:
                keyword = ' '.join(keyword_list)  # review keywords
        except:
            keyword = None

        rate = urls['productCommentSummary']
        try:
            CommentCount = rate['commentCount']  # total number of reviews
        except:
            CommentCount = None
            print("评价总数", CommentCount)
        try:
            GoodRateShow = rate['goodRateShow']  # positive-review rate
        except:
            GoodRateShow = None
        try:
            GoodCount = rate['goodCount']  # number of positive reviews
        except:
            GoodCount = None
        try:
            GeneralCount = rate['generalCount']  # number of neutral reviews
        except:
            GeneralCount = None
        try:
            PoorCount = rate['poorCount']  # number of negative reviews
        except:
            PoorCount = None
        # ---- Method 1 (price via the search page, kept commented out below) ----
        # search_web = "https://search.jd.com/Search?keyword=" + str(p_Name) + "&enc=utf-8&wq=" + str(p_Name)
        # # print ("search页面:",search_web)
        # search_webs = requests.get(search_web, timeout=1000).text
        # soup = BeautifulSoup(search_webs, 'lxml')
        # skuid = "J_" + str(ProductID)
        # try:
        #     price_info = soup('strong', class_=skuid)
        #     PreferentialPrice = re.findall("<em>￥</em><i>(.*?)</i>", str(price_info[0]))[0]
        #     # the price sometimes renders as <strong class="J_10108922808" data-done="1" data-price="639.00"><em>￥</em><i></i></strong>
        #     # e.g. id=10108922808, p_Name=柏翠(petrus) 38L电烤箱家用多功能 精准控温 PE7338 升级版
        #     if len(PreferentialPrice) == 0:
        #         PreferentialPrice = re.findall('data-price=\"(.*?)\"', str(price_info[0]))[0]
        #     price = PreferentialPrice
        # except:
        #     try:
        #         print("价格:",price_web)
        #         price_webs = requests.get(price_web, timeout=1000).text
        #         price_json = json.loads(price_webs)[0]
        #         PreferentialPrice = price_json['p']
        #         price = price_json['m']
        #     except:
        #         price=None
        #         PreferentialPrice=None
        # print(price,PreferentialPrice)
        if float(PreferentialPrice) > 0.00:
            item = XiwanjiItem()
            item['ProductID'] = ProductID
            item['p_Name'] = p_Name
            item['shop_name'] = shop_name
            item['price'] = price
            item['PreferentialPrice'] = PreferentialPrice
            item['CommentCount'] = CommentCount
            item['GoodRateShow'] = GoodRateShow
            item['GoodCount'] = GoodCount
            item['GeneralCount'] = GeneralCount
            item['PoorCount'] = PoorCount
            item['keyword'] = keyword
            item['type'] = product_detail
            item['brand'] = brand
            item['X_name'] = X_name
            item['open_method'] = open_method
            item['laundry'] = laundry
            item['capacity'] = capacity
            item['control'] = control
            item['dry_method'] = dry_method
            item['disinfection'] = disinfection
            item['consump'] = consump
            item['color'] = color
            item['product_url'] = url
            item['source'] = "京东"
            item['ProgramStarttime'] = self.ProgramStarttime
            yield item
        else:
            print('Ad or otherwise invalid page:', url)
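
Note on the block above: the eight try/except pairs (open_method through color) all perform the same two-way lookup between a regex over the introduction HTML and the spec dict x (some try the regex first, others the dict). They could share one helper; a minimal sketch, where extract_field and its parameters are hypothetical names rather than part of the original project:

    import re

    def extract_field(detail_html, spec_dict, label, spec_key):
        # Try the ">label:(...)<" regex over the detail HTML first,
        # then fall back to the spec dict; return None when both miss.
        try:
            return re.findall(">%s:(.*?)<" % label, detail_html)[0].strip()
        except (IndexError, TypeError):
            try:
                return spec_dict[spec_key]
            except (KeyError, TypeError):
                return None

    # e.g. open_method = extract_field(detail_1[0], x, '开合方式', '开合方式')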
Example #2
 def start_requests(self):
     yield scrapy.Request(self.url)
Example #3
 def start_requests(self):
     start_urls = ['https://www.radiotavisupleba.ge/a/31333277.html', ]
     for url in start_urls:
         yield scrapy.Request(url=url, callback=self.parse)
Example #4
 def parseCategory(self, response):
     for pageUrl in response.css('li.parent-cate a::attr(href)').extract():
         yield scrapy.Request(url=pageUrl, callback=self.parse)
Example #5
 def start_requests(self):
     url_list = ['https://www.matrimonio.com/forum/prova-trucco--t750941']
     for url in url_list:
         yield scrapy.Request(url=url, callback=self.parse)
Example #6
 def start_requests(self):
     urls = ['http://www.bjda.gov.cn/eportal/ui?pageId=331184']
     yield scrapy.Request(urls[0], dont_filter=True)
Example #7
    def start_requests(self):

        yield scrapy.Request(url='https://www.dhanak.com.pk/',
                             callback=self.link)
Example #8
 def start_requests(self):
     urls = ['http://www.codeforces.com/contests']
     for url in urls:
         yield scrapy.Request(url, self.parse)
Example #9
 def get_media_requests(self, item, info):
     image_link = item['imagelink']
     yield scrapy.Request(image_link)
Example #10
 def parse(self, response):
     info = response.body.decode('utf-8')
     info = json.loads(info)
     if 'items' not in info.keys():
         self.err_after(response.meta)
         return None
     item_info = info['items']
     flip = info['flip']
     keyword = response.meta['keyword']
      sort = response.meta['sort']  # rank of the last product on the previous page
     p_time = response.meta['p_time']
     item_list = []
     page = response.meta['page']
     proxy = response.meta['proxy']
     # print('parse_before', sort,len(item_info), keyword)
      # the response returned data: process it
     if len(item_info) > 0:
         for value in item_info:
             sort = sort + 1
              # check whether the listing is promoted (an ad)
             if 'ad' in value.keys():
                 mall_id = value['ad']['mall_id']
                 is_ad = 1
                 suggest_keyword = ''
             else:
                 mall_id = 0
                 is_ad = 0
                 suggest_keyword = ''
             goods_info = value
             goods_info['keyword'] = keyword
             goods_info['sort'] = sort
             goods_info['p_time'] = p_time
             goods_info['mall_id'] = mall_id
             goods_info['is_ad'] = is_ad
             goods_info['suggest_keyword'] = suggest_keyword
             item_list.append(goods_info)
          # record the ranking of every product under this keyword
         item = KeywordGoodsList()
         item['goods_list'] = item_list
         item['page'] = page
         item['keyword'] = keyword
         # print('parse_middle', sort,len(item_info), keyword)
         yield item
          page += 1  # data came back, so advance the page; an empty page is re-fetched
         # print('parse_after', sort,len(item_info), keyword)
         if page <= self.max_page:
             url = self.build_search_url(page, self.size, keyword, flip)
             headers = self.make_headers(keyword)
             meta = {
                 'flip': flip,
                 'proxy': proxy,
                 'page': page,
                 'keyword': keyword,
                 'sort': sort,
                 'p_time': p_time
             }
             yield scrapy.Request(url,
                                  meta=meta,
                                  callback=self.parse,
                                  headers=headers,
                                  dont_filter=True,
                                  errback=self.errback_httpbin)
Example #11
 def start_requests(self):
     urls = [
         'http://sz.to8to.com/zwj/']
     return [scrapy.Request(url=url, callback=self.parse) for url in urls]
Example #12
File: zqz.py  Project: jiangyg/ZWFproject
    def parse_list(self, response):
        json_text = json.loads(response.text[7:-1])  # strip the _jqjsp(...) JSONP wrapper
        print(json_text)
        for data in json_text['data']:
            item = Zqz510Item()

            if 'agS' in data:
                item['agS'] = data['agS']
            else:
                item['agS'] = empty_word

            if 'agidS' in data:
                item['agidS'] = data['agidS']
            else:
                item['agidS'] = empty_word

            if 'an' in data:
                item['an'] = data['an']
            else:
                item['an'] = empty_word

            if 'anDest' in data:
                item['anDest'] = data['anDest']
            else:
                item['anDest'] = empty_word

            if 'anList' in data:
                item['anList'] = str(data['anList'])
            else:
                item['anList'] = empty_word

            if 'apS' in data:
                item['apS'] = data['apS']
            else:
                item['apS'] = empty_word

            if 'apidS' in data:
                item['apidS'] = data['apidS']
            else:
                item['apidS'] = empty_word

            if 'cid' in data:
                item['cid'] = data['cid']
            else:
                item['cid'] = empty_word

            if 'docid' in data:
                item['docid'] = data['docid']
            else:
                item['docid'] = empty_word

            if 'law' in data:
                item['law'] = data['law']
            else:
                item['law'] = empty_word

            if 'link' in data:
                item['link'] = data['link']
            else:
                item['link'] = empty_word

            if 'litem' in data:
                item['litem'] = data['litem']
            else:
                item['litem'] = empty_word

            if 'ltid' in data:
                item['ltid'] = data['ltid']
            else:
                item['ltid'] = empty_word

            if 'pd' in data:
                item['pd'] = data['pd']
            else:
                item['pd'] = empty_word

            if 'psty' in data:
                item['psty'] = data['psty']
            else:
                item['psty'] = empty_word

            if 'rid' in data:
                item['rid'] = data['rid']
            else:
                item['rid'] = empty_word

            if 'ti' in data:
                item['ti'] = data['ti']
            else:
                item['ti'] = empty_word

            if 'ty' in data:
                item['ty'] = data['ty']
            else:
                item['ty'] = empty_word

            detail_url = 'http://api.zqz510.com/tmof/detail?docid={}&callback=_jqjsp&_{}='.format(
                item['docid'], str(int(time.time() * 1000)))
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item},
                                 cookies=self.cookie)
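
Every branch of the if/else ladder above falls back to the same empty_word default, so only dict membership varies; a sketch of the equivalent loop (FIELDS is an introduced name, and anList stays the one field the original wraps in str()):

    FIELDS = ['agS', 'agidS', 'an', 'anDest', 'anList', 'apS', 'apidS', 'cid',
              'docid', 'law', 'link', 'litem', 'ltid', 'pd', 'psty', 'rid', 'ti', 'ty']
    for field in FIELDS:
        if field in data:
            item[field] = str(data[field]) if field == 'anList' else data[field]
        else:
            item[field] = empty_word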
Example #13
 def parse_detail_info(self, response):
      detail_info = response.text
      cust = response.meta['cust']
      # parse the JSON once: json.loads handles null/false/true natively,
      # which the repeated eval-with-replace calls were emulating
      detail_json = json.loads(detail_info)
      if detail_json.get('msg'):
          self.logger.warning('%s', cust + ' returned an error!')
          self.logger.warning('%s', detail_json)
      results = detail_json['data']['results']
     credit_info_detail_url = "http://www.creditchina.gov.cn/api/credit_info_detail?"
     pub_permissions_name_url = 'http://www.creditchina.gov.cn/api/pub_permissions_name?'
     pub_penalty_name_url = 'http://www.creditchina.gov.cn/api/pub_penalty_name?'
     record_param_url = 'http://www.creditchina.gov.cn/api/record_param?'
     credit_info_detail_list = []
     pub_permissions_name_list = []
     pub_penalty_name_list = []
     record_param_list_2 = []
     record_param_list_4 = []
     record_param_list_8 = []
     for result in results:
          self.logger.info('Searching name is %s, result name is %s' % (cust, result['name']))
         if result['name'] == cust:
             self.logger.warning('%s', cust + ' has results.')
             # summary
             credit_info_detail_url_append = {'encryStr': result['encryStr'].replace('\n', '')}
             credit_info_detail_url_append = urllib.urlencode(credit_info_detail_url_append)
             credit_info_detail_list.append(credit_info_detail_url + credit_info_detail_url_append)
             credit_info_detail_list = list(set(credit_info_detail_list))
             # pub_permissions
             pub_permissions_name_url_append = {'name': cust, 'page': 1, 'pageSize': 50}
             pub_permissions_name_url_append = urllib.urlencode(pub_permissions_name_url_append)
             pub_permissions_name_list.append(pub_permissions_name_url + pub_permissions_name_url_append)
             pub_permissions_name_list = list(set(pub_permissions_name_list))
             # pub_penalty
             pub_penalty_name_url_append = {'name': cust, 'page': 1, 'pageSize': 50}
             pub_penalty_name_url_append = urllib.urlencode(pub_penalty_name_url_append)
             pub_penalty_name_list.append(pub_penalty_name_url + pub_penalty_name_url_append)
             pub_penalty_name_list = list(set(pub_penalty_name_list))
             # creditType=2 red,creditType=4 attention,creditType=8 black
             record_param_url_append_2 = {'encryStr': result['encryStr'].replace('\n', ''), 'creditType': 2,
                                          'dataSource': 0, 'pageNum': 1, 'pageSize': 50}
             record_param_url_append_2 = urllib.urlencode(record_param_url_append_2)
             record_param_list_2.append(record_param_url + record_param_url_append_2)
             record_param_list_2 = list(set(record_param_list_2))
             record_param_url_append_4 = {'encryStr': result['encryStr'].replace('\n', ''), 'creditType': 4,
                                          'dataSource': 0, 'pageNum': 1, 'pageSize': 50}
             record_param_url_append_4 = urllib.urlencode(record_param_url_append_4)
             record_param_list_4.append(record_param_url + record_param_url_append_4)
             record_param_list_4 = list(set(record_param_list_4))
             record_param_url_append_8 = {'encryStr': result['encryStr'].replace('\n', ''), 'creditType': 8,
                                          'dataSource': 0, 'pageNum': 1, 'pageSize': 50}
             record_param_url_append_8 = urllib.urlencode(record_param_url_append_8)
             record_param_list_8.append(record_param_url + record_param_url_append_8)
             record_param_list_8 = list(set(record_param_list_8))
     if pub_permissions_name_list != []:
         for url in pub_permissions_name_list:
             time.sleep(random.uniform(1, 3))
             yield scrapy.Request(url=url, callback=self.parse_pub_permissions_name,meta={'cust': cust}, dont_filter = True)
     if record_param_list_2 != []:
         for url in record_param_list_2:
             time.sleep(random.uniform(1, 3))
             yield scrapy.Request(url=url, callback=self.parse_record_param_url_append_2,meta={'cust': cust}, dont_filter = True)
     if record_param_list_4 != []:
         for url in record_param_list_4:
             time.sleep(random.uniform(1, 3))
             yield scrapy.Request(url=url, callback=self.parse_record_param_url_append_4,meta={'cust': cust}, dont_filter = True)
     if record_param_list_8 != []:
         for url in record_param_list_8:
             time.sleep(random.uniform(1, 3))
             yield scrapy.Request(url=url, callback=self.parse_record_param_url_append_8,meta={'cust': cust}, dont_filter = True)
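
Note that urllib.urlencode is the Python 2 spelling; on Python 3 the same query strings would be built with urllib.parse.urlencode (a drop-in sketch of one of the calls above, not the project's code):

    from urllib.parse import urlencode

    pub_permissions_name_url_append = urlencode({'name': cust, 'page': 1, 'pageSize': 50})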
Example #14
    def parse(self, response):
        sel = scrapy.Selector(response)
        # ---- Method 2 ----
        productid_list1 = sel.xpath(
            ".//div[@id='plist']/ul/li/div[contains(@class,'gl-i-wrap')]/@data-sku"
        ).extract()
        # single items, bundles, ...
        productid_list2 = sel.xpath(
            ".//div[@class='gl-i-tab-content']/div[@class='tab-content-item tab-cnt-i-selected j-sku-item']/@data-sku"
        ).extract()
        productid_list = productid_list1 + productid_list2
        print(productid_list)
        print(len(productid_list))
        productid_str = '%2CJ_'.join(productid_list)
        # time.sleep(random.randint(60,120))
        price_web = "https://p.3.cn/prices/mgets?ext=11000000&pin=&type=1&area=1_72_4137_0&skuIds=J_" + str(
            productid_str)
        price_webs = requests.get(price_web, timeout=1000).text
        price_jsons = json.loads(price_webs)
        if len(price_jsons) > 50:
            self.pagenum = self.pagenum + 1
            print("第" + str(self.pagenum) + "页")
        for price_json in price_jsons:
            try:
                id = price_json['id']
                ProductID = id[2:]
                PreferentialPrice = price_json['p']
                price = price_json['m']
            except:
                ProductID = None
                PreferentialPrice = None
                price = None  # product price
            if ProductID:
                item = XiwanjiItem()
                with open("price.csv", "a") as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow([ProductID, PreferentialPrice, price])
                item['ProductID'] = ProductID
                item['PreferentialPrice'] = PreferentialPrice
                item['price'] = price
                goods_web = "https://item.jd.com/" + str(ProductID) + ".html"
                request = scrapy.Request(url=goods_web,
                                         callback=self.goods,
                                         meta={'item': item},
                                         dont_filter=True)
                yield request
            else:
                print("ProductID not found")
                self.num = self.num + 1
            if self.num > 60:
                print("ProductID missing too many times; stopping")
                exit()
        # ---- Method 1 (kept commented out below) ----
        # # url="https://item.jd.hk/18739277759.html"    #京东全球购与普通网址不同,不同的地方为“https://item.jd.com/4251335.html”
        # goods_info=sel.xpath(".//div[@id='plist']/ul/li")
        # for goods in goods_info:
        #     ProductID_info=goods.xpath(".//div[@class='gl-i-wrap j-sku-item']/@data-sku").extract()       # product ID
        #     if len(ProductID_info)==0:
        #         ProductID_info=goods.xpath(".//div[@class='gl-i-tab-content']/div[@class='tab-content-item tab-cnt-i-selected j-sku-item']/@data-sku").extract()
        #         ProductID=ProductID_info[0]
        #     else:
        #         ProductID=ProductID_info[0]
        #     # print(ProductID)
        #     if len(ProductID)!=0:
        #         goods_web="https://item.jd.com/"+str(ProductID)+".html"         #商品链接   包含商品型号,店铺名称,类别,品牌,型号等
        #         item=JdItem(ProductID=ProductID)
        #         request=scrapy.Request(url=goods_web,callback=self.goods,meta={'item':item},dont_filter=True)
        #         yield request
        #     else:
        #         print("parse中ProductID为空  没有读到")

        # # for testing
        # productid_list1=sel.xpath(".//div[@id='plist']/ul/li/div[contains(@class,'gl-i-wrap')]/@data-sku").extract()
        # # single items, bundles, ...
        # productid_list2 = sel.xpath( ".//div[@class='gl-i-tab-content']/div[@class='tab-content-item tab-cnt-i-selected j-sku-item']/@data-sku").extract()
        # productid_list=productid_list1+productid_list2
        # print(productid_list)
        # print(len(productid_list))
        # for ProductID in productid_list:
        #     item = JinghuaqiItem(ProductID=ProductID,price=2.00,PreferentialPrice=2.00)
        #     # url="https://item.jd.hk/1971910764.html"
        #     url="https://item.jd.com/" + str(ProductID) + ".html"
        #     request = scrapy.Request(url=url, callback=self.goods,meta={'item':item}, dont_filter=True)
        #     yield request

        # pagination
        time.sleep(random.randint(60, 120))
        next_page = sel.xpath(
            ".//div[@class='p-wrap']/span[@class='p-num']/a[@class='pn-next']/@href"
        ).extract()
        if next_page:
            next = "https://list.jd.com/" + next_page[0]
            yield scrapy.Request(next, callback=self.parse)
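
A caution on the time.sleep(random.randint(60, 120)) call above: it blocks Scrapy's reactor, stalling every in-flight request rather than just this page. Scrapy's built-in throttling settings give similar pacing without blocking; a settings.py sketch whose values merely mirror the sleep range, not the project's actual config:

    # settings.py
    DOWNLOAD_DELAY = 90               # base delay between requests, in seconds
    RANDOMIZE_DOWNLOAD_DELAY = True   # jitter each delay to 0.5x-1.5x of the base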
Example #15
	def start_requests(self):
		urls = ['https://pittsburghpa.gov/mayor/mayor-contact',
		'https://pittsburghpa.gov/controller/controller-contact']
		urls += ["https://pittsburghpa.gov/council/d{}-contacts".format(i+1) for i in range(9)]
		for url in urls:
			yield scrapy.Request(url=url, callback=self.parse)
Example #16
 def parse(self, response):
     for url in urls:
         yield scrapy.Request("https://www.indeed.com" + url + "/reviews?fcountry=CN",callback = self.parse_single)
Example #17
    def parse(self, response):
        manager_response = response.css('.jl_intro')
        funds_response = response.css('.jl_office')

        company = response.css('.bs_gl').xpath(
            './p/label/a[@href]/text()').extract()[-1]

        num = len(manager_response)
        if isinstance(manager_response, SelectorList):
            assert num == len(funds_response)
        else:
            manager_response = [manager_response]
            funds_response = [funds_response]

        for i in range(num):
            manager = Manager()
            intro_list = manager_response[i].xpath('.//text()').extract()
            manager['name'] = intro_list[1]
            manager['appointment_date'] = intro_list[3]
            manager['introduction'] = intro_list[4]
            manager['url'] = 'http:' + manager_response[i].xpath(
                './a/@href').extract_first()
            manager['image_urls'] = manager_response[i].xpath(
                './a/img/@src').extract()
            manager['_id'] = manager['url'][-13:-5]

            try:
                funds_table_list = funds_response[i].xpath(
                    './/text()').extract()
                funds_table = numpy.array(funds_table_list[2:]).reshape(-1, 9)
                manager_name = funds_table_list[0]
            except Exception:

                def parse_line(tr):
                    return [
                        item.xpath('.//text()').extract_first()
                        for item in tr.xpath('./td')
                    ]

                funds_table = numpy.array([
                    parse_line(tr)
                    for tr in funds_response[i].xpath('./table/tbody/tr')
                ])
                manager_name = funds_response[0].xpath(
                    './div/label/a/text()').extract_first()

            manager['funds'] = funds_table[1:, 0].tolist()

            yield scrapy.Request(manager['url'],
                                 callback=self.parse_manager,
                                 meta={'manager': manager})

            for fund_list in funds_table[1:, ]:
                yield Fund(_id=manager['_id'] + '#' + fund_list[0],
                           code=fund_list[0],
                           name=fund_list[1],
                           type=fund_list[2],
                           start_date=fund_list[3],
                           end_date=fund_list[4],
                           duty_days=fund_list[5],
                           duty_return=fund_list[6],
                           average=fund_list[7],
                           rank=fund_list[8],
                           manager=manager_name,
                           company=company)
Example #18
    def parse_single(self, response):

        reviews = response.xpath('//div[@class = "cmp-Review"]')

        company = response.xpath('//div[@class = "cmp-CompactHeaderLayout-nameContainer"]//text()').extract_first()

        for review in reviews:

            item = IndeedItem()

            item['company'] = company

            item['rating'] = review.xpath(".//div[@class = 'cmp-ReviewRating-text']/text()").extract_first()

            item['date'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()]").extract_first()

            item['content'] = review.xpath(".//span[@itemprop = 'reviewBody']//span[@class = 'cmp-NewLineToBr-text']/text()").extract()

            item['position'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']//meta[@itemprop='name']/@content").extract_first()

            if len(review.xpath(".//div[@class = 'cmp-Review-title']/text()")):
                item['title'] = review.xpath(".//div[@class = 'cmp-Review-title']/text()").extract_first()
            else:
                item['title'] = review.xpath(".//a[@class = 'cmp-Review-titleLink']/text()").extract_first()


            if len(review.xpath(".//a[@class = 'cmp-ReviewAuthor-link']")) == 2 :
                item['location'] = review.xpath(".//a[@class = 'cmp-ReviewAuthor-link'][2]/text()").extract_first()
                item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[2]").extract_first()

            elif len(review.xpath(".//a[@class = 'cmp-ReviewAuthor-link']")) == 1:
                if review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-2]").extract_first() != ' - ':
                    item['location'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-2]").extract_first()
                    item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-4]").extract_first()
                else:
                    item['location'] = review.xpath(".//a[@class = 'cmp-ReviewAuthor-link'][1]/text()").extract_first()
                    item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[3]").extract_first()
            else:
                item['location'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[5]").extract_first()
                item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[3]").extract_first()


            subrating = review.xpath(".//div[@class = 'cmp-SubRating']//div[@class = 'cmp-RatingStars-starsFilled']/@style").extract()

            item['work_life_rating'] = subrating[0]

            item['benefits_rating'] = subrating[1]

            item['security_rating'] = subrating[2]

            item['management_rating'] = subrating[3]

            item['culture_rating'] = subrating[4]
            # 3px=0
            # 15px=1
            # 27px=2
            # 39px=3
            # 51px=4
            # 63px=5

            if len(review.xpath(".//div[@class = 'cmp-ReviewProsCons-prosText']//span[@class = 'cmp-NewLineToBr-text']/text()")):
                item['Pros'] = review.xpath(".//div[@class = 'cmp-ReviewProsCons-prosText']//span[@class = 'cmp-NewLineToBr-text']/text()").extract_first()
            else:
                item['Pros'] = 'NaN'

            if len(review.xpath(".//div[@class = 'cmp-ReviewProsCons-consText']//span[@class = 'cmp-NewLineToBr-text']/text()")):
                item['Cons'] = review.xpath(".//div[@class = 'cmp-ReviewProsCons-consText']//span[@class = 'cmp-NewLineToBr-text']/text()").extract_first()
            else:
                item['Cons'] = 'NaN'

            if len(review.xpath(".//span[text()='Yes']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()")):
                item['helpful'] = review.xpath(".//span[text()='Yes']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()").extract_first()
            else:
                item['helpful'] = 0
            
            if len(review.xpath(".//span[text()='No']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()")):
                item['helpless'] = review.xpath(".//span[text()='No']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()").extract_first()
            else:
                item['helpless'] = 0

            yield item

        if len(response.xpath("//a[@data-tn-element = 'next-page']/@href")):
            next_url = response.xpath("//a[@data-tn-element = 'next-page']/@href").extract_first()

            yield scrapy.Request("https://www.indeed.com" + next_url ,callback = self.parse_single)
Example #19
    def start_requests(self):

        data1 = [
            {
                "category":
                "m.图书.音像.游戏",
                "url":
                "http://list.jd.com/list.html?cat=mvd.jd.com/theme/4053-7.html&go=0"
            },
            {
                "category":
                "m.图书.少儿.0-2岁",
                "url":
                "http://list.jd.com/list.html?cat=book.jd.com/children0-2.html&go=0"
            },
            {
                "category":
                "m.图书.少儿.3-6岁",
                "url":
                "http://list.jd.com/list.html?cat=book.jd.com/children3-6.html&go=0"
            },
            {
                "category":
                "m.图书.少儿.7-10岁",
                "url":
                "http://list.jd.com/list.html?cat=book.jd.com/children7-10.html&go=0"
            },
            {
                "category":
                "m.图书.少儿.11-14岁",
                "url":
                "http://list.jd.com/list.html?cat=book.jd.com/children11-14.html&go=0"
            },
            {
                "category": "m.图书.少儿.儿童文学",
                "url": "http://list.jd.com/list.html?cat=1713-3263-3394&go=0"
            },
            {
                "category": "m.图书.少儿.绘本",
                "url": "http://list.jd.com/list.html?cat=1713-3263-4761&go=0"
            },
            {
                "category": "m.图书.少儿.科普",
                "url": "http://list.jd.com/list.html?cat=1713-3263-3399&go=0"
            },
            {
                "category": "m.图书.少儿.幼儿启蒙",
                "url": "http://list.jd.com/list.html?cat=1713-3263-3395&go=0"
            },
            {
                "category": "m.图书.少儿.手工游戏",
                "url": "http://list.jd.com/list.html?cat=1713-3263-3396&go=0"
            },
            {
                "category": "m.图书.少儿.智力开发",
                "url": "http://list.jd.com/list.html?cat=1713-3263-3398&go=0"
            },
            {
                "category": "m.图书.教育.教材",
                "url": "http://list.jd.com/list.html?cat=1713-11047&go=0"
            },
            {
                "category": "m.图书.教育.中小学教辅",
                "url": "http://list.jd.com/list.html?cat=1713-3289&go=0"
            },
            {
                "category": "m.图书.教育.考试",
                "url": "http://list.jd.com/list.html?cat=1713-3290&go=0"
            },
            {
                "category": "m.图书.教育.外语学习",
                "url": "http://list.jd.com/list.html?cat=1713-3291&go=0"
            },
            {
                "category": "m.图书.教育.字典词典",
                "url": "http://list.jd.com/list.html?cat=1713-3294&go=0"
            },
            {
                "category": "m.图书.文艺.小说",
                "url": "http://list.jd.com/list.html?cat=1713-3258&go=0"
            },
            {
                "category": "m.图书.文艺.文学",
                "url": "http://list.jd.com/list.html?cat=1713-3259&go=0"
            },
            {
                "category": "m.图书.文艺.青春文学",
                "url": "http://list.jd.com/list.html?cat=1713-3260&go=0"
            },
            {
                "category": "m.图书.文艺.传记",
                "url": "http://list.jd.com/list.html?cat=1713-3261&go=0"
            },
            {
                "category": "m.图书.文艺.动漫",
                "url": "http://list.jd.com/list.html?cat=1713-3272&go=0"
            },
            {
                "category": "m.图书.文艺.艺术",
                "url": "http://list.jd.com/list.html?cat=1713-3262&go=0"
            },
            {
                "category": "m.图书.文艺.摄影",
                "url": "http://list.jd.com/list.html?cat=1713-12776&go=0"
            },
            {
                "category": "m.图书.经管励志.管理",
                "url": "http://list.jd.com/list.html?cat=1713-3266&go=0"
            },
            {
                "category": "m.图书.经管励志.金融与投资",
                "url": "http://list.jd.com/list.html?cat=1713-3265&go=0"
            },
            {
                "category": "m.图书.经管励志.经济",
                "url": "http://list.jd.com/list.html?cat=1713-3264&go=0"
            },
            {
                "category": "m.图书.经管励志.励志与成功",
                "url": "http://list.jd.com/list.html?cat=1713-3267&go=0"
            },
            {
                "category": "m.图书.人文社科.历史",
                "url": "http://list.jd.com/list.html?cat=1713-3273&go=0"
            },
            {
                "category": "m.图书.人文社科.心理学",
                "url": "http://list.jd.com/list.html?cat=1713-3279&go=0"
            },
            {
                "category": "m.图书.人文社科.政治/军事",
                "url": "http://list.jd.com/list.html?cat=1713-3276&go=0"
            },
            {
                "category": "m.图书.人文社科.社会科学",
                "url": "http://list.jd.com/list.html?cat=1713-3281&go=0"
            },
            {
                "category": "m.图书.人文社科.法律",
                "url": "http://list.jd.com/list.html?cat=1713-3277&go=0"
            },
            {
                "category": "m.图书.人文社科.文化",
                "url": "http://list.jd.com/list.html?cat=1713-3280&go=0"
            },
            {
                "category": "m.图书.生活.家教与育儿",
                "url": "http://list.jd.com/list.html?cat=1713-3270&go=0"
            },
            {
                "category": "m.图书.生活.孕产",
                "url": "http://list.jd.com/list.html?cat=1713-3270-3509&go=0"
            },
            {
                "category": "m.图书.生活.健身保健",
                "url": "http://list.jd.com/list.html?cat=1713-3269&go=0"
            },
            {
                "category": "m.图书.生活.旅游/地图",
                "url": "http://list.jd.com/list.html?cat=1713-3271&go=0"
            },
            {
                "category": "m.图书.生活.美食",
                "url": "http://list.jd.com/list.html?cat=1713-9278&go=0"
            },
            {
                "category": "m.图书.生活.时尚美妆",
                "url": "http://list.jd.com/list.html?cat=1713-9291&go=0"
            },
            {
                "category": "m.图书.生活.家居",
                "url": "http://list.jd.com/list.html?cat=1713-9301&go=0"
            },
            {
                "category": "m.图书.生活.手工DIY",
                "url": "http://list.jd.com/list.html?cat=1713-9314-9315&go=0"
            },
            {
                "category": "m.图书.生活.两性",
                "url": "http://list.jd.com/list.html?cat=1713-9309&go=0"
            },
            {
                "category": "m.图书.生活.体育",
                "url": "http://list.jd.com/list.html?cat=1713-3288&go=0"
            },
            {
                "category": "m.图书.科技.计算机与互联网",
                "url": "http://list.jd.com/list.html?cat=1713-3287&go=0"
            },
            {
                "category": "m.图书.科技.建筑",
                "url": "http://list.jd.com/list.html?cat=1713-3284&go=0"
            },
            {
                "category": "m.图书.科技.工业技术",
                "url": "http://list.jd.com/list.html?cat=1713-3282&go=0"
            },
            {
                "category": "m.图书.科技.电子/通信",
                "url": "http://list.jd.com/list.html?cat=1713-9351&go=0"
            },
            {
                "category": "m.图书.科技.医学",
                "url": "http://list.jd.com/list.html?cat=1713-3285&go=0"
            },
            {
                "category": "m.图书.科技.科学与自然",
                "url": "http://list.jd.com/list.html?cat=1713-3286&go=0"
            },
            {
                "category": "m.图书.科技.农林",
                "url": "http://list.jd.com/list.html?cat=1713-9368&go=0"
            },
            {
                "category": "m.图书.刊/原版.杂志/期刊",
                "url": "http://list.jd.com/list.html?cat=1713-4758&go=0"
            },
            {
                "category": "m.图书.刊/原版.英文原版书",
                "url": "http://list.jd.com/list.html?cat=1713-4855&go=0"
            },
            {
                "category": "m.图书.刊/原版.港台图书",
                "url": "http://list.jd.com/list.html?cat=1713-6929&go=0"
            },
            {
                "category":
                "m.图书.电子书.小说",
                "url":
                "http://list.jd.com/list.html?cat=e.jd.com/products/5272-5278.html&go=0"
            },
            {
                "category":
                "m.图书.电子书.励志与成功",
                "url":
                "http://list.jd.com/list.html?cat=e.jd.com/products/5272-5287.html&go=0"
            },
            {
                "category":
                "m.图书.电子书.经济金融",
                "url":
                "http://list.jd.com/list.html?cat=e.jd.com/products/5272-12438.html&go=0"
            },
            {
                "category":
                "m.图书.电子书.文学",
                "url":
                "http://list.jd.com/list.html?cat=e.jd.com/products/5272-5279.html&go=0"
            },
            {
                "category":
                "m.图书.电子书.社科",
                "url":
                "http://list.jd.com/list.html?cat=e.jd.com/products/5272-5301.html&go=0"
            },
            {
                "category":
                "m.图书.电子书.婚恋两性",
                "url":
                "http://list.jd.com/list.html?cat=e.jd.com/products/5272-10884.html&go=0"
            },
            {
                "category":
                "m.图书.电子书.外文原版",
                "url":
                "http://list.jd.com/list.html?cat=e.jd.com/products/5272-6828.html&go=0"
            },
            {
                "category":
                "m.图书.电子书.免费",
                "url":
                "http://list.jd.com/list.html?cat=e.jd.com/products/5272-5276.html&go=0"
            },
        ]

        for data in data1:
            category = data['category']
            url = data['url']
            request = scrapy.Request(url, self.parse)
            request.meta['category'] = category
            yield request
Example #20
 def parse(self, response):
     print(response.text)
     yield scrapy.Request(self.start_urls[0], dont_filter=True)
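
Because dont_filter=True bypasses the dupefilter, the callback above re-requests the same start URL indefinitely. A capped variant that threads a counter through meta (the retries key and the limit of 3 are made up for illustration):

    def parse(self, response):
        print(response.text)
        retries = response.meta.get('retries', 0)
        if retries < 3:  # hypothetical cap so the self-request cannot loop forever
            yield scrapy.Request(self.start_urls[0], dont_filter=True,
                                 meta={'retries': retries + 1})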
Example #21
 def get_media_requests(self, item, info):
     for image_url in item['image_urls']:
         yield scrapy.Request(
             image_url, meta={'image_name': item["image_name"] + '.jpg'})
Example #22
File: l_12.py  Project: svvay/ITEA_AC
    def start_requests(self):
        """Start parsing from first page of questions"""

        yield scrapy.Request(url=f"{self.domain}/questions?tab=newest&page=1",
                             callback=self.parse_pages)
Example #23
 def start_requests(self):
     urls = [
         "http://www.wandoujia.com/apps",
     ]
     for url in urls:
         yield scrapy.Request(url=url, callback=self.parseCategory)
Example #24
 def start_requests(self):
     yield scrapy.Request(url=self.start_urls[0],callback=self.parse1)
Example #25
 def start_requests(self):
     urls = ['http://www.theguardian.com']
     for url in urls:
         yield scrapy.Request(url=url, callback = self.frontpage)
Example #26
 def start_requests(self):
     urls = ['https://en.wikipedia.org/wiki/Web_page',
             'https://en.wikipedia.org/wiki/Web_browser',
             'https://en.wikipedia.org/wiki/WorldWideWeb']
     for url in urls:
         yield scrapy.Request(url=url, callback=self.parse)
Example #27
 def get_media_requests(self, item, info):
     for image_url in item['image_urls']:
         yield scrapy.Request(image_url)
Example #28
File: renren.py  Project: yujunsen/python
 def parse_page(self, response):
     url = 'http://www.renren.com/880151247/profile'
     request = scrapy.Request(url=url, callback=self.parse_project)
     yield request
Example #29
 def start_requests(self):
     url = 'https://news.pts.org.tw/list/0'
     meta = {'iter_time': 0}
     yield scrapy.Request(url, callback=self.parse_news_list, meta=meta)
Example #30
 def start_requests(self):
     for page in range(1, 12):
         url = 'http://blog.eastmoney.com/5185188/bloglist_0_%d.html' % page
         self.logger.debug('parsing page: ' + url)
         yield scrapy.Request(url, callback=self.parse_outer_page)