def parseCupMatch(self, response):
    datas = Selector(response).xpath('//table[@class="lrace_bei"]//td/a').extract()
    for data in datas:
        try:
            lsName = Selector(text=data).xpath('//a/text()').extract()[0]
            if lsName == response.meta['lsName']:
                url = Selector(text=data).xpath('//a/@href').extract()[0]
                url = 'http://liansai.500.com{0}'.format(url)
                html = requests.get(url)
                ssName = Selector(text=html.text).xpath(
                    '//div[@class="ldrop_bd"]//li[@class="ldrop_list_on"]/a/text()').extract()[0]
                url = Selector(text=html.text).xpath(
                    '//div[@class="lcol_tit_r"]/a/@href').extract()[0]
                # URL of the first category page
                url = 'http://liansai.500.com{0}'.format(url)
                html = requests.get(url)
                classes = Selector(text=html.text).xpath(
                    '//div[@id="match_stage"]/a[@data-id]').extract()
                for cs in classes:
                    url = Selector(text=cs).xpath('//a/@href').extract()[0]
                    # the stage id sits between 'jifen-' and the trailing '/'
                    istid = url.find('jifen-')
                    stid = url[istid + 6:][:-1]
                    url = 'http://liansai.500.com{0}'.format(url)
                    mClass1 = Selector(text=cs).xpath('//a/text()').extract()[0]
                    yield Request(url=url, callback=self.parseBsData,
                                  meta={'lsName': response.meta['lsName'],
                                        'ssName': ssName, 'stid': stid,
                                        'mClass1': mClass1})
                return
        except Exception as e:
            print('parseCupMatch fail: {0}'.format(e))
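# A hedged sketch (not part of the original spider): the blocking requests.get
# calls in parseCupMatch above could instead be chained, non-blocking Scrapy
# requests. The callback name parse_season_page is hypothetical.
def parseCupMatchNonBlocking(self, response):
    for a in Selector(response).xpath('//table[@class="lrace_bei"]//td/a'):
        if a.xpath('text()').extract_first() == response.meta['lsName']:
            url = 'http://liansai.500.com{0}'.format(a.xpath('@href').extract_first())
            # let the Scrapy scheduler fetch the follow-up page instead of blocking
            yield Request(url, callback=self.parse_season_page, meta=response.meta)
            return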
def parse(self, response):
    rows = response.xpath('//*[@id="table-buildings"]/tbody/tr').extract()
    for row in rows:
        blgName = Selector(text=row).xpath('//td[4]/a/text()').extract()[0].strip()
        blgCity = Selector(text=row).xpath('//td[5]/a/text()').extract()[0]
        blgCountry = Selector(text=row).xpath('//td[5]/a/text()').extract()[1]
        blgFloor = Selector(text=row).xpath('//td[8]/text()').extract()[0]
        blgPurpose = Selector(text=row).xpath('//td[11]/text()').extract()[0].strip()
        blgUrl = "https://skyscrapercenter.com/" + Selector(
            text=row).xpath('//td[4]/a/@href').extract()[0].strip()
        hgtRank = Selector(text=row).xpath('//td[1]/text()').extract()[0]
        hgtFeet = Selector(text=row).xpath('//td[7]/text()').extract()[0].replace(",", "")
        isMultiPurpose = "Y" if blgPurpose.find("/") != -1 else "N"
        forOffice = "Y" if blgPurpose.find("office") != -1 else "N"
        forResidential = "Y" if blgPurpose.find("residential") != -1 else "N"
        forHotel = "Y" if blgPurpose.find("hotel") != -1 else "N"
        forRetail = "Y" if blgPurpose.find("retail") != -1 else "N"
        yrComplete = Selector(text=row).xpath('//td[9]/text()').extract()[0]
        item = SkyscraperItem()
        item['blgName'] = blgName
        item['blgCity'] = blgCity
        item['blgCountry'] = blgCountry
        item['blgFloor'] = blgFloor
        item['blgPurpose'] = blgPurpose
        item['blgUrl'] = blgUrl
        item['hgtRank'] = hgtRank
        item['hgtFeet'] = hgtFeet
        item['isMultiPurpose'] = isMultiPurpose
        item['forOffice'] = forOffice
        item['forResidential'] = forResidential
        item['forHotel'] = forHotel
        item['forRetail'] = forRetail
        item['yrComplete'] = yrComplete
        request = scrapy.Request(blgUrl, callback=self.parse_building_page)
        # use request.meta to pass the partially filled item to parse_building_page
        request.meta['item'] = item
        yield request
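# A hedged sketch (not in the original source) of the detail-page callback that
# receives the partially filled item via response.meta. The XPath and the
# 'hgtMeters' field are illustrative assumptions, not the real page layout.
def parse_building_page(self, response):
    item = response.meta['item']
    # hypothetical extra field scraped from the building's own page
    item['hgtMeters'] = response.xpath('//span[@class="height-m"]/text()').extract_first()
    yield item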
def parse(self, response):
    print(response.url)
    links_data = Selector(response).xpath(
        '//script[contains(., "var data = ")]/text()').extract()[0]
    l, r = links_data.find('{'), links_data.rfind('}')
    if l == -1 or r == -1:
        # extraction failed; retry the same URL (dont_filter bypasses the
        # duplicate filter so Scrapy does not drop the repeated request)
        yield Request(response.url, dont_filter=True)
    else:
        article_links = [item['url']
                         for item in json.loads(links_data[l:(r + 1)])['result']]
        # append in text mode so writing str succeeds on Python 3
        with open('root/csdn.%s.content' % (self.tag), 'a') as fp:
            fp.write('\n'.join(article_links + ['']))
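# A hedged sketch (assumption, not from the source): capping the self-retry
# above with a counter carried in request.meta so a permanently broken page
# cannot loop forever.
def retry_request(self, response, max_retries=3):
    retries = response.meta.get('retry_times', 0)
    if retries < max_retries:
        return Request(response.url, callback=self.parse, dont_filter=True,
                       meta={'retry_times': retries + 1})
    return None  # give up after max_retries attempts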
def parse(self, response): """TODO: Docstring for pass. :response: TODO :returns: TODO """ for item in self._parse_posts(response): if not self.should_stop(item): yield item else: return if len(Selector(response).css('#frs_list_pager .next')): #贴吧的分页有的不是完整的链接 next_page_url = Selector(response).css('#frs_list_pager .next::attr(href)').extract_first() logging.debug('next_page_url %s', next_page_url) if -1 != next_page_url.find('http://tieba.baidu.com'): yield Request(next_page_url, callback=self.parse) else: yield Request('http://tieba.baidu.com' + next_page_url, callback=self.parse)
def parse(self, response): """TODO: Docstring for pass. :response: TODO :returns: TODO """ for item in self._parse_posts(response): if not self.should_stop(item): yield item else: return if len(Selector(response).css('#frs_list_pager .next')): #贴吧的分页有的不是完整的链接 next_page_url = Selector(response).css( '#frs_list_pager .next::attr(href)').extract_first() logging.debug('next_page_url %s', next_page_url) if -1 != next_page_url.find('http://tieba.baidu.com'): yield Request(next_page_url, callback=self.parse) else: yield Request('http://tieba.baidu.com' + next_page_url, callback=self.parse)
def parse(self, response):
    # (URL marker, suggestion-link XPath, source label); order matches the
    # original if/elif chain, and only the first matching engine is handled
    rules = [
        ('www.baidu.com', '//div[@id="rs"]/table/tr/th/a', '百度pc'),
        ('m.baidu.com', '//div[@id="relativewords"]/div[@class="rw-list"]/a', '百度移动'),
        ('www.so.com', '//div[@id="rs"]/table/tr/th/a', '好搜'),
        ('sogou.com', '//table[@id="hint_container"]/tr/td/p/a', '搜狗'),
    ]
    for marker, xpath, label in rules:
        if response.url.find(marker) == -1:
            continue
        for target_a in response.xpath(xpath).extract():
            keyword = Selector(text=target_a).xpath('//a/text()').extract_first()
            href = Selector(text=target_a).xpath('//a/@href').extract_first()
            # keep laundry-related suggestions, excluding detergent/washer keywords
            if keyword.find('干洗') != -1 or keyword.find('洗衣') != -1:
                if (keyword.find('洗衣粉') == -1 and keyword.find('洗衣液') == -1
                        and keyword.find('洗衣机') == -1):
                    # build a fresh item per keyword so items already handed to
                    # the pipeline are not mutated by later iterations
                    item = BaidukeyItem()
                    item['keywords'] = keyword
                    item['types'] = label
                    item['status'] = 0
                    yield item
                    # follow the matched suggestion to harvest further keywords
                    fullhref = response.urljoin(href)
                    yield scrapy.Request(url=fullhref, callback=self.parse)
        break
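# A hedged sketch (not part of the original source): the keyword filter from
# parse above, factored out so the substring rules can be unit-tested in
# isolation. The rules themselves are unchanged.
def keyword_wanted(self, keyword):
    if '干洗' not in keyword and '洗衣' not in keyword:
        return False
    return not any(bad in keyword for bad in ('洗衣粉', '洗衣液', '洗衣机'))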
def parse_school(self, response):
    jiaoyubao_url = re.match(r'(http://\w+\.jiaoyubao\.cn)/.*', response.url).group(1)
    item = response.meta['item']
    s = Selector(response)
    course_urls = s.xpath('//div[@class="ZcTabSerP"]/div/a/@href').extract()
    pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=1)
    r = redis.Redis(connection_pool=pool)
    pipe = r.pipeline(transaction=True)
    for course in course_urls:
        courses_url = urljoin(jiaoyubao_url, course)
        pipe.lpush("jyb_course_urls", courses_url)
    pipe.execute()  # save the course links from the school page to Redis
    name = s.xpath('//div[1]/div[1]/div/div[1]/div[4]/text()').extract_first()
    features = s.xpath('//div[@class="item2"]').extract_first()
    feature = ''
    if features:
        feature = remove_tags(features)
    msg = s.xpath('//div[@class="content_description"]')
    intros2 = s.xpath('//div[@class="ZcTabC"]').extract()
    intros1 = s.xpath('//div[@class="ComTab"]/div[@class="ComTab_Item"][2]'
                      '/div[@class="ComTab_Item_body"]').extract()
    intro = ''
    if msg:
        intros = msg.xpath('p').extract()
        for text in intros:
            if text != ' ':
                intro += remove_tags(text.strip())
    elif intros1:
        for text in intros1:
            if text != ' ':
                intro += remove_tags(text.strip())
    elif intros2:
        for text in intros2:
            if text != ' ':
                intro += remove_tags(text.strip())
    else:
        intros3 = s.xpath('//a[@name="机构简介"]/../div').extract()
        for text3 in intros3:
            if text3 != ' ':
                intro += remove_tags(text3.strip())
    srcs = s.xpath('//div[@class="j j_Slide loading"]/div/ol/li/img/@src').extract()
    if not srcs:
        srcs = s.xpath('//div[@class="j j_Slide loading"]/ol/li/img/@src').extract()
    if not srcs:
        srcs = s.xpath('//li[@class="J_ECPM"]/img/@src').extract()
    pic = []
    # download at most five gallery images into a dated directory tree
    for src in srcs[:5]:
        year = time.strftime('%Y', time.localtime(time.time()))
        month = time.strftime('%Y.%m', time.localtime(time.time()))
        day = time.strftime('%Y.%m.%d', time.localtime(time.time()))
        t = int(time.time())
        num = random.randint(1000, 9999)
        filename = str(t) + str(num) + '.png'
        filepath2 = self.path2 + year + '/' + month + '/' + day + '/' + filename
        fileDay = self.path1 + year + '/' + month + '/' + day
        filepath1 = fileDay + '/' + filename
        os.makedirs(fileDay, exist_ok=True)
        try:
            urlretrieve(src, filepath1)
        except Exception:
            # some image URLs contain characters that need percent-encoding
            urlretrieve(urllib.parse.quote(src, safe=string.printable), filepath1)
        pic.append(filepath2)
    js = s.xpath('//div[@class="wangdian"]/span/@onclick').extract_first()
    if not js:
        js = s.xpath('//div[@class="ZcPoint"]/div[@class="pa"]'
                     '/div[@class="pa02"]/@onclick').extract_first()
    if not js:
        js = s.xpath('//div[@class="tl3_dd2"]/div[@class="tl3_dr2"]'
                     '/span/@onclick').extract_first()
    datas = ''
    maps = []
    address = []
    tel = ''
    city = ''
    if js:
        # strip the onclick wrapper and split the JS call's argument list
        datas = js[27:-1].split(',')
    if datas:
        try:
            arg = int(datas[3])
        except Exception:
            arg = int(datas[3][1:-1])
        cityid = int(datas[9][4:-1])
        # page tracing shows the location data is loaded dynamically from
        # http://api.jiaoyubao.cn/map/Ajax.aspx with these POST parameters
        data = {
            "os": 1,
            "arg": arg,
            "city": cityid,
            "page": 1,
            "pagesize": 10,
            "key": ''
        }
        map_url = "http://api.jiaoyubao.cn/map/Ajax.aspx"
        resp = requests.post(map_url, data=data)
        soup = BeautifulSoup(resp.text, "html.parser")
        points = soup.find_all('point')
        city = soup.points['cityname']
        p = soup.find('point')
        if not name:
            name = p.get('cp_name')
        tel1 = p.get('u400')
        tel2 = p.get('tel400')
        if tel2:
            tel = tel1 + '转' + tel2  # '转' marks the extension number
        else:
            tel = tel1
        for point in points:
            entry = []
            campus = point.get('name')
            campus_address = point.get('address')
            lng = point.get('lng')
            lat = point.get('lat')
            addr = point.get('name') + '/' + point.get('address')
            value = lng, lat
            entry.append(campus)
            entry.append(campus_address)
            entry.append(value)
            maps.append(entry)
            address.append(addr)
    else:
        citys = s.xpath('//div[@class="Item_ComTop1"]/div[1]/div[1]/a[2]/text()').extract_first()
        if citys:
            city = citys[:(len(citys) - 3)]
        tels = s.xpath('//div[@class="ZcTel"]/div[1]')
        tel1 = tels.xpath('span[2]/text()').extract_first()
        tel2 = tels.xpath('text()').extract_first()
        if tel2:
            tel = tel1 + '转' + tel2
        else:
            tel = tel1
    item['city'] = city
    item['tel'] = tel
    item['address'] = address
    item['average_price'] = None
    item['location'] = maps
    item['teacher_num'] = None
    item['pic'] = pic
    item['intro'] = ''.join(intro.split())
    item['feature'] = feature
    item['tags'] = None
    item['name'] = name
    yield item
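# A hedged sketch (not from the source): a consumer that drains the
# jyb_course_urls list populated by parse_school above. The callback name
# parse_course is hypothetical.
def start_requests(self):
    r = redis.Redis(host='127.0.0.1', port=6379, db=1)
    while True:
        url = r.rpop('jyb_course_urls')
        if url is None:
            break
        yield scrapy.Request(url.decode('utf-8'), callback=self.parse_course)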