def search_course_4(sess, *args: list):
    # API endpoint
    url = "http://www.92daikan.com/tiku.aspx"
    # Fetch the hidden ASP.NET form fields the endpoint requires
    try:
        res = sess.get(url, verify=False)
        res.raise_for_status()
        selector = etree.HTML(res.text)
        viewstate = selector.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
        viewstategenerator = selector.xpath(
            '//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
        eventvalidation = selector.xpath(
            '//*[@id="__EVENTVALIDATION"]/@value')[0]
    except requests.exceptions.RequestException as e:
        result = []
        for each in args:
            answer = []
            answer.append({'topic': str(e), 'correct': ''})
            result.append(answer)
        return result
    # Form parameters
    result = []
    data = {}
    data['__VIEWSTATE'] = viewstate
    data['__VIEWSTATEGENERATOR'] = viewstategenerator
    data['__EVENTVALIDATION'] = eventvalidation
    data['ctl00$ContentPlaceHolder1$gen'] = '查询'
    for i in range(len(args)):
        data['ctl00$ContentPlaceHolder1$timu'] = args[i]
        # POST the question
        logging.info("Post to 92daikan. Question %d" % i)
        try:
            res = sess.post(url, data=data, verify=False)
            res.raise_for_status()
        except requests.exceptions.RequestException as e:
            answer = []
            answer.append({'topic': str(e), 'correct': ''})
            result.append(answer)
            continue
        # Parse the response
        logging.info("Processing result")
        answer = []
        selector = etree.HTML(res.text)
        temp = {}
        temp['topic'] = args[i]
        temp['correct'] = selector.xpath('//*[@id="daan"]/text()')[0]
        if temp['correct'] != '未找到答案':
            answer.append(temp)
        result.append(answer)
    logging.info("Return result: %s" % result)
    return result
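A minimal usage sketch for search_course_4 (hypothetical; the session setup, sample question text, and logging config are not from the original source):

import logging
import requests
from lxml import etree  # search_course_4 relies on etree.HTML

logging.basicConfig(level=logging.INFO)
sess = requests.Session()
# The question string below is made up for illustration
for answers in search_course_4(sess, '马克思主义哲学的直接理论来源'):
    for item in answers:
        print(item['topic'], '->', item['correct'])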
def download_chart(self, image_dir, itemids, stime, etime):
    # This URL serves line graphs; note that pie charts use a
    # different URL, so check carefully!
    url = "http://company.monitor.com/chart.php"
    # Line-graph dimensions
    url_par = {"width": 1778, "height": 300, "itemids": itemids}
    # Convert the start/end dates from str to datetime
    stime = datetime.datetime.strptime(stime, "%Y-%m-%d")
    etime = datetime.datetime.strptime(etime, "%Y-%m-%d")
    # Compute the period (window length) in seconds
    diff = etime - stime
    period = diff.days * 24 * 3600 + diff.seconds
    url_par["period"] = period
    # Convert stime back to the string form chart.php expects
    stime = stime.strftime('%Y%m%d%H%M%S')
    url_par["stime"] = stime
    data = urllib.parse.urlencode(url_par).encode(encoding='UTF8')
    request = urllib.request.Request(url, data)
    response = self.urlOpener.open(request)
    image = response.read()
    # Look up host and item names to build a readable file name
    html = requests.get(
        'http://zabbix.uce.local/history.php?action=showgraph&itemids[]={}'.format(itemids)).text
    page = etree.HTML(html)
    hostname_itemname = page.xpath('//div[@class="header-title"]/h1/text()')[0].split(':')
    hostname = hostname_itemname[0]
    hostname_itemname.pop(0)
    itemname = '_'.join(hostname_itemname).replace('/', '_')
    imagename = os.path.join(  # requires `import os` at module level
        image_dir,
        "{}_{}_{}_({}).png".format(hostname, stime,
                                   etime.strftime('%Y%m%d%H%M%S'), itemname))
    with open(imagename, 'wb') as f:
        f.write(image)
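The period/stime arithmetic above maps a human-readable date range onto the chart.php query parameters. A standalone sketch of the same derivation (the helper name is mine; the parameter names follow the Zabbix chart URL convention used above):

import datetime

def chart_window(start, end, fmt='%Y-%m-%d'):
    """Turn a start/end date pair into chart.php's stime/period params."""
    stime = datetime.datetime.strptime(start, fmt)
    etime = datetime.datetime.strptime(end, fmt)
    diff = etime - stime
    period = diff.days * 24 * 3600 + diff.seconds  # window length in seconds
    return {'stime': stime.strftime('%Y%m%d%H%M%S'), 'period': period}

# e.g. chart_window('2020-01-01', '2020-01-08')
# -> {'stime': '20200101000000', 'period': 604800}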
def parse_html(self, url):
    html = self.get_html(url).decode()
    parse_obj = etree.HTML(html)
    href_list = parse_obj.xpath(
        '//div[@class="all"]/ul[@class="archives"]/li/p[@class="url"]/a/@href'
    )
    print("href_list:", href_list)
    self.write_html(href_list)
def content(self):
    while True:
        # Fetch HTML source from the response queue, one page at a time
        html = self.resQueue.get()
        parseHtml = etree.HTML(html)
        r_list = parseHtml.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
        for r in r_list:
            print(r + "\n")
        self.resQueue.task_done()
def parse(s, html, idx):
    result = {}
    tree = etree.HTML(html)
    try:
        result['lt'] = tree.xpath('//input[@name="lt"]/@value')[0]
        result['execution'] = tree.xpath('//input[@name="execution"]/@value')[0]
        result['path'] = tree.xpath('//form[@id="fm1"]/@action')[0]
    except IndexError:
        return None
    return result
def crawlerFeixiaohao(self):
    response = self.get_data('http://www.feixiaohao.com')
    html = etree.HTML(response.text)
    tbody = html.xpath('//*[@id="table"]/tbody/tr')
    for item in tbody:
        id = item.xpath('@id')[0]
        name = item.xpath('td[2]/a/img/@alt')[0]
        marketValue = item.xpath('td[3]/text()')[0]
        price = item.xpath('td[4]/a/text()')[0]
        circulation = item.xpath('td[5]/text()')[0]
        self.save_obj(id, name, name, price, circulation, marketValue,
                      '', '', '', 'FXH')
def getItemDetail(self, link, save_img_path):
    """Crawl the images on an item's detail page.

    Arguments:
        link {String} -- item detail URL
    """
    newDriver = webdriver.Chrome()
    newDriver.get(link)
    time.sleep(self.sleep_time)
    print(newDriver.title)
    img_dir_path = save_img_path + newDriver.title
    if self.mkdir(img_dir_path):
        print('Created item directory')
    html = newDriver.page_source.encode('utf-8')
    selector = etree.HTML(html)
    # Cover thumbnails
    J_ULThumb = selector.xpath("//div[@class='tb-gallery']/ul/li")
    index = 0
    for li in J_ULThumb:
        # Swap the thumbnail URL from the 50x50 to the 400x400 version
        if len(li.xpath("./div/a/img/@data-src")) < 1:
            continue
        small_pic = li.xpath("./div/a/img/@data-src")[0]
        common_pic = 'https:' + small_pic.replace('50x50', '400x400')
        thumb_title = 'cover' + str(index)
        print(thumb_title)
        # self.saveImg(img_dir_path, common_pic, thumb_title)
        index += 1
    # Crawl every image in the description section
    all_img = selector.xpath(
        "//div[@id='J_DivItemDesc']//descendant::img/@src")
    print(all_img)
    index = 0
    for img in all_img:
        if img.startswith('http'):
            imglink = img
        else:
            imglink = 'https:' + img
        self.saveImg(img_dir_path, imglink, str(index))
        index += 1
    newDriver.quit()
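`saveImg` and `mkdir` are defined elsewhere in the class. A plausible sketch of `saveImg` (assumed, not the original implementation) that downloads one image into the item directory:

import os
import requests

def saveImg(self, dir_path, img_url, name):
    """Download img_url and write it as <dir_path>/<name>.jpg (assumed layout)."""
    res = requests.get(img_url, timeout=10)
    res.raise_for_status()
    with open(os.path.join(dir_path, name + '.jpg'), 'wb') as f:
        f.write(res.content)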
def analysis_data(self, data):
    """Parse one page of HTML.

    :return: list of poster nicknames found on the page
    """
    html_data = etree.HTML(data)
    # Collect the post elements
    div_list = html_data.xpath('//div[@id="content-left"]/div')
    name_list = []
    # Walk every post on the page
    for div in div_list:
        nick_name = div.xpath('.//h2/text()')[0]
        print(nick_name.strip())
        name_list.append(nick_name.strip())
    return name_list
def search_course_2(sess, *args: list):
    if not isinstance(sess, requests.Session):
        args = list(args)
        args.insert(0, sess)
        args = tuple(args)
        sess = requests.Session()
    # API endpoint
    url = "https://cx.poxiaobbs.com/index.php"
    # Form parameters
    data = {}
    result = []
    for i in range(len(args)):
        data['tm'] = args[i]
        # POST the question
        logging.info("Post to poxiao bbs php. Question %d" % i)
        try:
            res = sess.post(url, data=data, verify=False)
            res.raise_for_status()
        except requests.exceptions.RequestException as e:
            answer = []
            answer.append({'topic': str(e), 'correct': ''})
            result.append(answer)
            continue
        # Parse the response
        logging.info("Processing result")
        answer = []
        selector = etree.HTML(res.text)
        answer_div = selector.xpath('/html/body/div[1]/div[@class="ans"]')
        for each in answer_div:
            temp = {}
            answer_text = each.xpath('string(.)')\
                .strip().replace(' ', '').replace('\n', '')
            if "答案:" in answer_text:
                temp['topic'] = answer_text.split("答案:")[0]
                temp['correct'] = answer_text.split("答案:")[1]
                answer.append(temp)
        result.append(answer)
    logging.info("Return result: %s" % result)
    return result
def login(usr, pwd, idx):
    s = requests.Session()
    r = s.get('https://passport.csdn.net/account/login',
              headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
                       'Host': 'passport.csdn.net'})
    while True:
        res = parse(s, r.text, idx)
        if res is None:
            return False
        url = 'https://passport.csdn.net' + res['path']
        form = {'username': usr,
                'password': pwd,
                '_eventId': 'submit',
                'execution': res['execution'],
                'lt': res['lt']}
        if 'validateCode' in res:
            form['validateCode'] = res['validateCode']
        s.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'passport.csdn.net',
            'Origin': 'https://passport.csdn.net',
            'Referer': 'https://passport.csdn.net/account/login',
            'Upgrade-Insecure-Requests': '1',
        })
        r = s.post(url, data=form)
        tree = etree.HTML(r.text)
        err_strs = tree.xpath('//span[@id="error-message"]/text()')
        if len(err_strs) == 0:
            return True
        err_str = err_strs[0]
        print(err_str)
        # Known error messages from the CSDN login form
        validate_code_err = '验证码错误'
        usr_pass_err = '帐户名或登录密码不正确,请重新输入'
        try_later_err = '登录失败连续超过5次,请10分钟后再试'
        if err_str[:5] == validate_code_err[:5]:
            pass  # wrong captcha: loop and try again
        elif err_str[:5] == usr_pass_err[:5]:
            return False
        elif err_str[:5] == try_later_err[:5]:
            return False
        else:
            return True
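A hypothetical call site (credentials are placeholders): login() loops on captcha errors and returns False on bad credentials or a lockout, so it works as a simple boolean gate:

if login('your_username', 'your_password', 0):
    print('CSDN login succeeded')
else:
    print('CSDN login failed')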
def downGubenFromEastmoney():
    """Download total-share-capital change data from Eastmoney."""
    ts_code = '600000.SH'
    bs.login()
    gubenURL = urlGubenEastmoney(ts_code)
    req = getreq(gubenURL)
    guben = urlopen(req).read()
    gubenTree = etree.HTML(guben)
    # //*[@id="lngbbd_Table"]/tbody/tr[1]/th[3]
    gubenData = gubenTree.xpath('''//html//body//div//div
                                //div//div//table//tr//td
                                //table//tr//td//table//tr//td''')
    # Date and total-share cells alternate, so step through them in pairs
    date = [gubenData[i][0].text for i in range(0, len(gubenData), 2)]
    date = [datetime.strptime(d, '%Y%m%d') for d in date]
    totalshares = [
        gubenData[i + 1][0].text for i in range(0, len(gubenData), 2)
    ]
    # Each value carries a two-character unit suffix; strip it and
    # scale by 10,000
    try:
        totalshares = [float(i[:-2]) * 10000 for i in totalshares]
    except ValueError as e:
        # logging.error('ts_code:%s, %s', ts_code, e)
        print('ts_code:%s, %s' % (ts_code, e))
    gubenDf = DataFrame({
        'ts_code': ts_code,
        'date': date,
        'totalshares': totalshares
    })
    return gubenDf
def get_data(text):
    html = etree.HTML(text)
    # Pull each job posting out with XPath
    divs = html.xpath('//*[@id="resultList"]/div[@class="el"]')
    for div in divs:
        job_title = div.xpath('./p/span/a/@title')
        job_company = div.xpath('./span[1]/a/@title')
        job_address = div.xpath('./span[2]/text()')
        job_salary = div.xpath('./span[3]/text()')
        job_href = div.xpath('./p/span/a/@href')
        job_title = job_title[0] if len(job_title) > 0 else ''
        job_company = job_company[0] if len(job_company) > 0 else ''
        job_address = job_address[0] if len(job_address) > 0 else ''
        job_salary = job_salary[0] if len(job_salary) > 0 else ''
        job_href = job_href[0] if len(job_href) > 0 else ''
        job_info = []  # row buffer
        job_info.append(job_title)
        job_info.append(job_company)
        job_info.append(job_address)
        job_info.append(job_salary)
        # Follow the posting's own page and parse the full description
        # job_info.append(job_href)
        job_body, job_name = getADetails(job_href)
        job_info.append(job_body)
        job_info.append(job_name)
        write_excel(job_info, 'Simple_spider_NJ2.xls')  # append the row
        job_info = []  # clear after every write
        time.sleep(1)
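`write_excel` and `getADetails` are defined elsewhere. One plausible append-style implementation of `write_excel` (assumed, not the original; it relies on the xlwt/xlrd/xlutils packages):

import os
import xlrd
import xlwt
from xlutils.copy import copy as xl_copy

def write_excel(row, filename):
    """Append one row of job fields to the first sheet of an .xls file."""
    if os.path.exists(filename):
        book = xlrd.open_workbook(filename)
        nrows = book.sheet_by_index(0).nrows  # next free row index
        wb = xl_copy(book)
        sheet = wb.get_sheet(0)
    else:
        wb = xlwt.Workbook()
        sheet = wb.add_sheet('jobs')
        nrows = 0
    for col, value in enumerate(row):
        sheet.write(nrows, col, value)
    wb.save(filename)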
def analysis_data(self):
    """Parse pages pulled from the queue."""
    while True:
        data = self.data_queue.get()
        html_data = etree.HTML(data)
        # Collect the post elements
        div_list = html_data.xpath('//div[@id="content-left"]/div')
        name_list = []
        # Walk every post on the page
        for div in div_list:
            nick_name = div.xpath('.//h2/text()')[0]
            print(nick_name.strip())
            name_list.append(nick_name.strip())
        # Push the parsed names back onto the queue (note: the same
        # queue the raw pages come from)
        self.data_queue.put(name_list)
        self.data_queue.task_done()
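The method above both consumes raw pages from `self.data_queue` and pushes its parsed name lists back onto the same queue, so a second worker can pull a result list back out and crash inside `etree.HTML`. A more conventional split (a sketch under assumed names; nothing else is known about the class) keeps input and output on separate queues:

import threading
from queue import Queue
from lxml import etree

class Spider:
    def __init__(self):
        self.data_queue = Queue()    # raw page HTML goes in
        self.result_queue = Queue()  # parsed name lists come out

    def analysis_worker(self):
        while True:
            data = self.data_queue.get()
            html_data = etree.HTML(data)
            names = [n.strip() for n in
                     html_data.xpath('//div[@id="content-left"]/div//h2/text()')]
            self.result_queue.put(names)
            self.data_queue.task_done()

    def run(self, pages):
        for _ in range(4):  # four daemon parser threads
            threading.Thread(target=self.analysis_worker, daemon=True).start()
        for page in pages:
            self.data_queue.put(page)
        self.data_queue.join()  # block until every page has been parsed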
def getItem(self):
    """Crawl every item on the current page, extracting its link,
    photo, and title."""
    html = self.driver.page_source.encode('utf-8')
    selector = etree.HTML(html)
    itemList = selector.xpath("//div[@class='item3line1']")
    # Walk every item block on the page
    for item3line1 in itemList:
        dl = item3line1.xpath("./dl")
        for item in dl:
            link = 'https:' + item.xpath("./dt/a/@href")[0]
            photo = 'https:' + item.xpath("./dt/a/img/@src")[0]
            title = item.xpath("./dd/a/text()")[0]
            res = {'link': link, 'photo': photo, 'title': title}
            # Enter the item detail page and crawl its images
            self.getItemDetail(link, '')
    # Pagination
    pagination = selector.xpath(
        "//div[@class='pagination']/a[contains(@class, 'J_SearchAsync') and contains(@class, 'next')]/@href"
    )
    print(pagination)
    print('Preparing to switch pages')
    if len(pagination) == 0:
        print('No more pages')
    else:
        print('Loading the next page')
        self.site_url = 'https:' + pagination[0]
        print(self.site_url)
        self.getPage()
def parse0(self, html_packed):
    """P2P loan news, platform updates, P2P loan columns."""
    seeds = []
    try:
        tree = etree.HTML(html_packed['html'].decode('gbk', 'ignore'))
        txt = 'listbox20' if html_packed['category'] == '网贷新闻' else (
            'listbox28' if html_packed['category'] == '平台动态' else 'listbox26')
        divs = tree.xpath(
            '//div[@id="%s"]//div[@class="mod-leftfixed mod-news clearfix"]' % txt)
        for div in divs:
            try:
                seed = {
                    'spider': 'Spider_bbsp2peye.crawl0',
                    'category': html_packed['category'],
                    'pubTime': time.strftime('%Y-%m-%d %H:%M', time.localtime())
                }
                # URL
                url = div.xpath('.//div[@class="main"]//div[@class="hd"]/a/@href')
                if len(url) == 0:
                    url = div.xpath('.//div[@class="sub"]/div[@class="inner"]/a/@href')
                if len(url) > 0:
                    seed['url'] = url[0]
                # Description
                description = ' '.join(
                    div.xpath('.//div[@class="main"]//div[@class="bd"]//text()')).strip()
                if len(description) > 0:
                    seed['description'] = description
                # Cover image
                coverImg = div.xpath(
                    './/div[@class="sub"]/div[@class="inner"]/a/img[@class="tn"]/@src')
                if len(coverImg) > 0:
                    seed['coverImg'] = coverImg[0]
                # Publication time
                pubTime = div.xpath('.//div[@class="main"]//div[@class="fd-left"]//span')
                if len(pubTime) > 0:
                    pubTime = pubTime[-1].xpath('./text()')
                    if len(pubTime) > 0 and pubTime[0].strip().startswith('20'):
                        seed['pubTime'] = pubTime[0].strip()
                        if seed['pubTime'] > html_packed['end'].replace('-0', '-'):
                            continue
                        elif seed['pubTime'] < html_packed['start'].replace('-0', '-'):
                            break
                seeds.append(seed)
            except Exception as e:
                self.logger.error(str(e))
        else:
            # for/else: only paginate when the date filter did not break out
            if len(divs) > 0:
                nextURL = tree.xpath(
                    '//div[@class="mod-page"]/div[@class="c-page"]/a[@title="下一页"]/@href')
                if len(nextURL) > 0:
                    seeds.append({
                        'url': ('http://news.p2peye.com' + nextURL[0]
                                if nextURL[0].startswith('/') else nextURL[0]),
                        'category': html_packed['category'],
                        'start': html_packed['start'],
                        'end': html_packed['end'],
                        'spider': 'Spider_bbsp2peye.crawl0',
                        'dont_filter': True,
                    })
    except Exception as e:
        self.logger.error(str(e))
    return [], seeds
def parse2(self, html_packed):
    """Exposure board."""
    seeds = []
    try:
        tree = etree.HTML(html_packed['html'].decode('gbk', 'ignore'))
        lis = tree.xpath('//ul[@role-parent="newloadmore"]/li')
        for li in lis:
            try:
                seed = {
                    'spider': 'Spider_bbsp2peye.crawl0',
                    'start': html_packed['start'],
                    'end': html_packed['end'],
                    'category': html_packed['category'],
                    'pubTime': time.strftime('%Y-%m-%d %H:%M', time.localtime())
                }
                # URL
                url = li.xpath('./a[@class="newlistbox"]/@href')
                if len(url) > 0:
                    seed['url'] = ('http://www.p2peye.com' + url[0]
                                   if url[0].startswith('/thread') else url[0])
                # Description
                description = ' '.join(
                    li.xpath('./a/div[@class="synopsis"]/text()')).strip()
                if len(description) > 0:
                    seed['description'] = description
                # Publication time
                pubTime = li.xpath('./a/div/span[@class="time"]/text()')
                if len(pubTime) > 0:
                    seed['pubTime'] = pubTime[0].strip()
                    if seed['pubTime'] > html_packed['end'].replace('-0', '-'):
                        continue
                    elif seed['pubTime'] < html_packed['start'].replace('-0', '-'):
                        break
                seeds.append(seed)
            except Exception as e:
                self.logger.error(str(e))
        else:
            # for/else: only paginate when the date filter did not break out
            if len(lis) > 0:
                pageNum = re.findall(r'forum-\d+-(\d+)\.html', html_packed['url'])
                if len(pageNum) > 0:
                    nextURL = html_packed['url'].replace(
                        '-%s.html' % pageNum[0],
                        '-%s.html' % (int(pageNum[0]) + 1))
                    seeds.append({
                        'url': nextURL,
                        'category': html_packed['category'],
                        'start': html_packed['start'],
                        'end': html_packed['end'],
                        'spider': 'Spider_bbsp2peye.crawl0',
                        'dont_filter': True,
                    })
    except Exception as e:
        self.logger.error(str(e))
    return [], seeds
def parse3(self, html_packed):
    """Exposure posts (threads from the exposure board)."""
    fields = []
    try:
        tree = etree.HTML(html_packed['html'].decode('gbk', 'ignore'))
        articleItem = ArticleItem()
        # Article ID
        aid = re.findall(r'p2peye\.com/(thread-.*?)\.html', html_packed['url'])
        if len(aid) == 0:
            aid = re.findall(r'p2peye\.com/(article-.*?)\.html', html_packed['url'])
        if len(aid) > 0:
            articleItem.aid = aid[0]
            articleItem._id = 'p2peye-%s' % aid[0]
        # Article URL
        articleItem.url = html_packed['url']
        # Title
        title = tree.xpath('//meta[@name="keywords"]/@content')
        if len(title) > 0:
            articleItem.title = title[0]
        # Body
        content = tree.xpath('//div[@class="typeoption"]/table[@summary]')
        if len(content) > 0:
            articleItem.content = etree.tounicode(content[0])
        # Description
        description = tree.xpath('//meta[@name="description"]/@content')
        if len(description) > 0:
            articleItem.description = description[0]
        elif 'description' in html_packed.keys():
            articleItem.description = html_packed['description']
        # Publication time
        pubTime = tree.xpath('//meta[@property="og:release_date"]/@content')
        if len(pubTime) > 0:
            try:
                articleItem.pubTime = int(
                    time.mktime(time.strptime(pubTime[0], '%Y-%m-%d %H:%M')))
            except Exception as e:
                pass
        else:
            pubTime = ' '.join(tree.xpath(
                '//div[@class="authi"]/em[contains(@id, "authorposton")]/text()'))
            pubTime = re.findall(r'20\d.*?\d*:\d*', pubTime)
            if len(pubTime) > 0:
                try:
                    articleItem.pubTime = int(
                        time.mktime(time.strptime(pubTime[0], '%Y-%m-%d %H:%M')))
                except Exception as e:
                    pass
        # Author nickname
        author = tree.xpath(
            '//div[@class="pls favatar"]/div[@class="pi"]/div[@class="authi"]/a/text()')
        if len(author) == 0:
            author = tree.xpath(
                '//div[@class="pls favatar"]//div/strong/a[@class="xi2"]/text()')
        if len(author) > 0:
            articleItem.authorNickname = author[0].strip()
        # Upvote count
        praise = tree.xpath(
            '//a[@id="recommend_add"]/i/span[@id="recommendv_add"]/text()')
        if len(praise) > 0:
            try:
                articleItem.praiseCount = int(praise[0])
            except Exception as e:
                pass
        # Downvote count
        refuseCount = tree.xpath(
            '//a[@id="recommend_subtract"]/i/span[@id="recommendv_subtract"]/text()')
        if len(refuseCount) > 0:
            try:
                articleItem.refuseCount = int(refuseCount[0])
            except Exception as e:
                pass
        spans = tree.xpath('//td/div[@class="hm ptn"]/span[@class="xi1"]')
        if len(spans) == 2:
            # Read count
            try:
                articleItem.readCount = int(spans[0].xpath('./text()')[0])
            except Exception as e:
                pass
            # Comment count
            try:
                articleItem.commentCount = int(spans[1].xpath('./text()')[0])
            except Exception as e:
                pass
        # Share count
        shareCount = tree.xpath(
            '//a[@class="sharep"]/i/span[@id="sharenumber"]/text()')
        if len(shareCount) > 0:
            try:
                articleItem.shareCount = int(shareCount[0])
            except Exception as e:
                pass
        # Favorite count
        collectCount = tree.xpath(
            '//a[@id="k_favorite"]/i/span[@id="favoritenumber"]/text()')
        if len(collectCount) > 0:
            try:
                articleItem.collectCount = int(collectCount[0])
            except Exception as e:
                pass
        # Classification
        if 'category' in html_packed.keys():
            articleItem.classification = html_packed['category']
        # Crawl source
        articleItem.crawlSource = '网贷天眼'
        # Crawl timestamp
        articleItem.crawlTimestamp = (html_packed['time_crawl']
                                      if 'time_crawl' in html_packed.keys()
                                      else int(time.time()))
        field = dict(articleItem.__dict__)
        field['pipeline_dbType'] = 'mongo'
        fields.append(field)
    except Exception as e:
        self.logger.error(str(e))
    return fields, []
def parse1(self, html_packed):
    """Articles (platform updates, P2P columns, P2P news threads)."""
    fields = []
    try:
        tree = etree.HTML(html_packed['html'].decode('gbk', 'ignore'))
        articleItem = ArticleItem()
        # Article ID
        aid = re.findall(r'p2peye\.com/(thread-.*?)\.html', html_packed['url'])
        if len(aid) == 0:
            aid = re.findall(r'p2peye\.com/(article-.*?)\.html', html_packed['url'])
        if len(aid) > 0:
            articleItem.aid = aid[0]
            articleItem._id = 'p2peye-%s' % aid[0]
        # Article URL
        articleItem.url = html_packed['url']
        # Title
        title = tree.xpath('//div[@id="ct"]//h1[@id="plat-title"]/text()')
        if len(title) == 0:
            title = tree.xpath('//meta[@name="keywords"]/@content')
        if len(title) > 0:
            articleItem.title = title[0].strip()
        # Body
        content = tree.xpath('//div[@id="ct"]//td[@id="article_content"]')
        if len(content) > 0:
            articleItem.content = etree.tounicode(content[0])
        # Description
        description = tree.xpath('//meta[@name="description"]/@content')
        if len(description) > 0:
            articleItem.description = ' '.join(description).strip()
        elif 'description' in html_packed.keys():
            articleItem.description = html_packed['description']
        # Cover thumbnail
        if 'coverImg' in html_packed.keys():
            articleItem.coverImg = html_packed['coverImg']
        txt = ' '.join(
            tree.xpath('//div[@id="ct"]//div[@class="c-a-inf"]//text()')
        ).replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '')
        # Publication time
        if 'pubTime' in html_packed.keys():
            try:
                articleItem.pubTime = int(
                    time.mktime(time.strptime(html_packed['pubTime'], '%Y-%m-%d %H:%M')))
            except Exception as e:
                pass
        else:
            pubTime = re.findall(r'发布时间: ?(20.*?\d+:\d+)', txt)
            if len(pubTime) > 0:
                articleItem.pubTime = int(
                    time.mktime(time.strptime(pubTime[0], '%Y-%m-%d %H:%M')))
        # Author
        authorNickname = re.findall(r'原作者:(.*?) ', txt)
        if len(authorNickname) == 0:
            authorNickname = re.findall(r'发布者:(.*?)\|', txt)
        if len(authorNickname) > 0:
            articleItem.authorNickname = authorNickname[0].split(
                '|')[0].split('来自')[0].strip()
        # Upvote count
        praiseCount = tree.xpath(
            '//div[@id="ct"]//div[@id="click_div"]//a[@title="给力"]/span/text()')
        if len(praiseCount) > 0:
            articleItem.praiseCount = int(praiseCount[0].strip())
        # Downvote count
        refuseCount = tree.xpath(
            '//div[@id="ct"]//div[@id="click_div"]//a[@title="没劲"]/span/text()')
        if len(refuseCount) > 0:
            articleItem.refuseCount = int(refuseCount[0].strip())
        # Read count
        readCount = re.findall(r'浏览量: ?(\d+)', txt)
        if len(readCount) > 0:
            articleItem.readCount = int(readCount[0])
        elif 'readCount' in html_packed.keys():
            articleItem.readCount = html_packed['readCount']
        # Comment count
        if 'commentCount' in html_packed.keys():
            articleItem.commentCount = html_packed['commentCount']
        # Classification
        if 'category' in html_packed.keys():
            articleItem.classification = html_packed['category']
        # Article source
        source = re.findall(r'来自: ?(.*?)[ \|]', txt)
        if len(source) > 0:
            articleItem.source = source[0]
        # Crawl source
        articleItem.crawlSource = '网贷天眼'
        # Crawl timestamp
        articleItem.crawlTimestamp = html_packed['time_crawl']
        field = dict(articleItem.__dict__)
        field['pipeline_dbType'] = 'mongo'
        fields.append(field)
    except Exception as e:
        self.logger.error(str(e))
    return fields, []
def parse4(self, html_packed):
    """Official platform updates."""
    seeds = []
    try:
        tree = etree.HTML(html_packed['html'])
        lis = tree.xpath('//div[@class="mod-list"]/ul/li[@class="item clearfix"]')
        for li in lis:
            try:
                seed = {
                    'spider': 'Spider_bbsp2peye.crawl0',
                    'pubTime': time.strftime('%Y-%m-%d %H:%M', time.localtime())
                }
                # URL
                url = li.xpath('./div[@class="mc-hd"]/a/@href')
                if len(url) > 0:
                    seed['url'] = url[0]
                # Description
                description = ' '.join(
                    li.xpath('./div[@class="mc-bd"]/span/text()')).strip()
                if len(description) > 0:
                    seed['description'] = description
                commentCount = li.xpath('./div//span[@class="ft-comment"]/text()')
                try:
                    seed['commentCount'] = int(commentCount[0])
                except Exception as e:
                    pass
                readCount = li.xpath('./div//span[@class="ft-see"]/text()')
                try:
                    seed['readCount'] = int(readCount[0])
                except Exception as e:
                    pass
                # Publication time
                pubTime = li.xpath('./div/span[contains(@class, "time")]/text()')
                if len(pubTime) > 0:
                    # The page gives e.g. 2018-01-09 14:29:00; keep only
                    # minute precision, i.e. 2018-01-09 14:29
                    seed['pubTime'] = ':'.join(pubTime[0].strip().split(':')[:-1])
                    if seed['pubTime'] > html_packed['end']:
                        continue
                    elif seed['pubTime'] < html_packed['start']:
                        break
                seed['category'] = '官方动态'
                seeds.append(seed)
            except Exception as e:
                self.logger.error(str(e))
        else:
            # for/else: only paginate when the date filter did not break out
            if len(lis) > 0:
                nextURL = tree.xpath(
                    '//div[contains(@class, "page")]/a[contains(text(), "下一页")]/@href')
                if len(nextURL) > 0:
                    nextURL = nextURL[0]
                    if nextURL.startswith('/gfdt'):
                        nextURL = html_packed['url'].split('/gfdt')[0] + nextURL
                    seeds.append({
                        'url': nextURL,
                        'start': html_packed['start'],
                        'end': html_packed['end'],
                        'spider': 'Spider_bbsp2peye.crawl0',
                        'dont_filter': True,
                    })
    except Exception as e:
        self.logger.error(str(e))
    return [], seeds
# Fragment: the leading except closes a try block that is not part of
# this excerpt, and the trailing try continues beyond it.
except Exception as e:
    print(e)
    driver.execute_script("window.stop()")
    time.sleep(3)
    driver.refresh()
try:
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, 'page')))
except:
    print('over')
    driver.execute_script("window.stop()")
html = driver.page_source
selector = etree.HTML(html)
list1 = selector.xpath('//div[contains(@class,"item-mod__trade-order")]')
try:
    for i in list1:
        order_id = i.xpath('table[1]/tbody/tr/td[1]/label/span[3]/text()')[0]    # order number
        order_time = i.xpath('table[1]/tbody/tr/td[1]/label/span[6]/text()')[0]  # order time
        price = i.xpath('table[2]/tbody/tr/td[2]/div/p/span[2]/text()')[0]       # unit price
        all_price = i.xpath('table[2]/tbody/tr/td[7]/div/div[1]/p/strong/span[2]/text()')[0]  # total price
        saler_title = i.xpath('table[2]/tbody/tr/td[5]/div/p[1]/a/text()')[0]    # product name
        name = i.xpath('table[2]/tbody/tr/td[5]/div/p[1]/a/text()')[0]           # buyer account name
        url = i.xpath('table[2]/tbody/tr/td[6]/div/div/p[1]/a/@href')[0]         # product detail URL
        url = 'https:' + url
        driver.get(url)
        time.sleep(3)
# Author: Tom_Fish
# -*- coding: utf-8 -*-
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'
}
res = requests.get('http://kaijiang.500.com/shtml/dlt/18001.shtml', headers=headers)
html = etree.HTML(res.text)
result = etree.tostring(html)
print(result)
print('hello world')