def testRemoteInfoDrilldownValues(self):
    header, body = getRequest(port=self.httpPort, path='/remote/info/drilldownvalues', arguments=dict(path='untokenized.field2', name='main'), parse=False)
    self.assertFalse('Traceback' in body, body)
    bodyLxml = HTML(body)
    self.assertEquals(
        set(['value1', 'value0', 'value9', 'value8', 'value7', 'value6', 'value5', 'value4', 'value3', 'othervalue2', 'value2']),
        set(bodyLxml.xpath('//ul/li/a/text()')))
import re

import requests
from lxml.etree import HTML

response = requests.get('http://www.debian.org/releases/stable/')
root = HTML(response.content)
title_text = root.find('head').find('title').text
release = re.search('\u201c(.*)\u201d', title_text).group(1)
p_text = root.xpath('//div[@id="content"]/p[1]')[0].text
version = p_text.split()[1]
print('Codename: {}\nVersion: {}'.format(release, version))
def parseData(urlList):
    urlW = open("/usr/caizhuang/jiajiemao/url.txt", 'a')
    for u in urlList:
        url = u.get("href").strip()
        print url
        urlW.write(url)
        urlW.write("\n")
        h = HTML(getHtml(url).decode('gbk'))
        try:
            dTxt = h.xpath('//h3')
            name = dTxt[0].text.strip().split()[0] + " " + dTxt[0].text.strip().split()[1]  # product name
            brand = dTxt[0].text.strip().split()[0]  # brand
        except Exception:
            errorTxt.write(url)
        # print brand
        # print name
        try:
            pCpgg = h.xpath('//p[@class="pCpgg"]')
            td = h.xpath('//td[@class="td2"]')
        except Exception:
            errorTxt.write(url)
        try:
            if td:
                price = list(td[0].itertext())[1].strip()
            else:
                price = list(pCpgg[0].itertext())[1].strip()  # price
            # print price
        except Exception:
            errorTxt.write(url)
        try:
            norms = list(pCpgg[-1].itertext())[1].strip()  # specification
            # print norms
        except Exception:
            errorTxt.write(url)
        try:
            spePs = h.xpath('//p[@class="speP"]/a')
            effect = ''
            for speP in spePs:
                effect += speP.text.strip() + " "  # effects
            # print effect
        except Exception:
            errorTxt.write(url)
        try:
            awrap = h.xpath('//div[@class="Awrap"]/ul/li/a')
            imgUrl = awrap[0].find("img").attrib.get("src")  # image URL
            # print imgUrl
        except Exception:
            errorTxt.write(url)
        try:
            troCon = h.xpath('//div[@class="troCon"]')
            des = list(troCon[0].itertext())
            description = ''
            for d in des:
                if len(d.strip()) > 20:
                    description += d.strip() + ""  # product description
            # print description
        except Exception:
            errorTxt.write(url)
        try:
            dTxt = h.xpath('//div[@class="dTxt"]/p/a')
            series = dTxt[1].text.strip()  # series
        except Exception:
            errorTxt.write(url)
        # print series
        insertData(name, brand, price, norms, effect, imgUrl, description, series)
import time

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from lxml.etree import HTML

START_URL = 'https://free-proxy-list.net/'
HEADERS = ['"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"']
PATH = 'ip_address/'

options = ChromeOptions()
options.add_argument(HEADERS[0])
browser = webdriver.Chrome(options=options)

if __name__ == '__main__':
    f = open(PATH + 'ip.csv', 'w')
    browser.get(START_URL)
    for i in range(15):
        content = browser.page_source
        html = HTML(content)
        ip_address = html.xpath('//tr//td[1]//text()')[:20]
        port = html.xpath('//tr//td[2]//text()')[:20]
        anonimity = html.xpath('//tr//td[5]//text()')[:20]
        https = html.xpath('//tr//td[@class="hx"]//text()')[:20]
        for i in range(len(port)):
            line = ','.join([ip_address[i], port[i], anonimity[i], https[i]])
            f.write(line)
            f.write('\n')
        time.sleep(2)
        button = browser.find_element_by_xpath('//*[@id="proxylisttable_next"]/a')
        button.click()
    f.close()
def start():
    num = 1
    for i in range(0, 10):
        print('当前页:' + str(i))
        pageToken = i * 25
        start_url = 'https://movie.douban.com/top250?start={pageToken}&filter='.format(pageToken=pageToken)
        print(start_url)
        response = requests.get(start_url, headers=headers)
        # print(response.text)
        html = HTML(response.text)
        urls = html.xpath('//ol[@class="grid_view"]/li//div[@class="hd"]/a/@href')
        for url in urls:
            print(url)
            movieId = re.search('https://movie.douban.com/subject/(\d+)/', url).group(1)
            response = requests.get(url, headers=headers)
            html = HTML(response.text)
            MovieNameStr = html.xpath('string(//h1/span/text())')
            MovieName = MovieNameStr.split(' ')[0].replace('\'', '"')
            EnglishName = ' '.join(MovieNameStr.split(' ')[1:]).replace('\'', '"')
            pattern_all_zh = r'([\u4e00-\u9fa5])'
            text_cn_split = re.findall(pattern_all_zh, EnglishName, re.S)
            if text_cn_split:
                EnglishName = ''
                MovieName = MovieNameStr.replace('\'', '"').strip()
            jsonStr = re.search('<script type="application.*?">(.*?)</script>', response.text, re.S).group(1).replace('\n', '').strip()
            # print(response.text)
            print(jsonStr)
            json_obj = json.loads(jsonStr)
            OtherName = ''
            OtherNameStr = re.search('又名:</span>(.*?)<br/>', response.text)
            if OtherNameStr:
                OtherName = OtherNameStr.group(1).strip().replace('\'', '"')
            DirectorList = []
            for dire in json_obj['director']:
                DirectorList.append(dire['name'])
            Director = '|'.join(DirectorList).replace('\'', '"')
            ActorsList = []
            for dire in json_obj['actor']:
                ActorsList.append(dire['name'])
            Actors = '|'.join(ActorsList).replace('\'', '"')
            Year = json_obj['datePublished']
            Country = re.search('制片国家/地区:</span>(.*?)<br/>', response.text).group(1).replace('\n', '').replace('\'', '"').strip()
            timeLong = re.search('片长:</span> <span property="v:runtime" content="(\d+)"', response.text).group(1).replace('\n', '').replace('\'', '"').strip()
            language = re.search('语言:</span>(.*?)<br/>', response.text).group(1).replace('\n', '').replace('\'', '"').strip()
            Grenre = '|'.join(json_obj['genre'])
            Rating = json_obj['aggregateRating']['ratingValue']
            RatingNum = json_obj['aggregateRating']['ratingCount']
            Description = json_obj['description']
            # print(movieId)
            # print(MovieName)
            # print(EnglishName)
            # print(Director)
            # print(Actors)
            # print(Year)
            # print(Country)
            # print(Grenre)
            # print(Rating)
            # print(RatingNum)
            # print(Description)
            sql = "insert into info(movieId,num,MovieName,EnglishName,OtherName,Director,Actors,Year,Country,Grenre,Rating,RatingNum,Description,timeLong,language) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') " % (
                movieId, num, MovieName, EnglishName, OtherName, Director, Actors, Year, Country, Grenre, Rating, RatingNum, Description, timeLong, language)
            num += 1
            print(sql)
            dbCli.save(sql)
def start():
    date_list = []
    with open('date.txt') as f:
        results = f.readlines()
        for res in results:
            date_list.append(res.strip())
    print(date_list)
    item_list = []
    with open('fujian_id1.txt') as f:
        results = f.readlines()
        for res in results:
            url = res.split(',')[0]
            title = res.split(',')[1].strip()
            obj = {
                'url': url,
                'title': title,
            }
            item_list.append(obj)
    print(len(item_list))
    print(item_list[:10])
    for itemObj in item_list:
        print(itemObj)
        try:
            for date in date_list:
                print('当前日期:' + date)
                start_url = itemObj['url']
                title = itemObj['title']
                body = '__VIEWSTATE=%2FwEPDwUJNDk2MTM2Mzc5ZGTd37nDAAZ8HMoQ9C6MjYnecXynQQ%3D%3D&__EVENTVALIDATION=%2FwEWAwKKm%2FSpBwKnpoOOCwKY7%2B%2FtCc29g5gXa%2BvZaoWCWvhGPER39rFI&right%24l_date={date}&right%24Button1=%CB%D1%CB%F7'
                data = body.format(date=date)
                try:
                    response = requests.post(start_url, data=data, headers=headers, timeout=10)
                except:
                    continue
                # print(response.text)
                html = HTML(response.text)
                tr_list = html.xpath('//form[@id="aspnetForm"]//div[@class="table3"]//tr')
                if len(tr_list) == 1:
                    print('无数据')
                for item in tr_list[1:]:
                    try:
                        td_list = item.xpath('./td')
                        if len(td_list) == 8:
                            jiancedianName = item.xpath('string(./td[1])')
                            jianceTime = item.xpath('string(./td[2])')
                            jianceProject = item.xpath('string(./td[3])')
                            jianceValue = item.xpath('string(./td[4])')
                            biaozhunValue = item.xpath('string(./td[5])')
                            shifoudabiao = item.xpath('string(./td[6])')
                            chaobiaobenshu = item.xpath('string(./td[7])')
                            shifoutingchan = item.xpath('string(./td[8])')
                        elif len(td_list) == 6:
                            jianceProject = item.xpath('string(./td[1])')
                            jianceValue = item.xpath('string(./td[2])')
                            biaozhunValue = item.xpath('string(./td[3])')
                            shifoudabiao = item.xpath('string(./td[4])')
                            chaobiaobenshu = item.xpath('string(./td[5])')
                            shifoutingchan = item.xpath('string(./td[6])')
                        # fields: company name, pollution source type (wastewater / waste gas), monitoring point name,
                        # monitoring time, monitored item, monitored value, standard value, whether compliant,
                        # exceedance multiple, whether production was halted
                        print(title, jiancedianName, jianceTime, jianceProject, jianceValue, biaozhunValue, shifoudabiao, chaobiaobenshu, shifoutingchan)
                        sql = "insert into fujian(title,jiancedianName,jianceTime,jianceProject,jianceValue,biaozhunValue,shifoudabiao,chaobiaobenshu,shifoutingchan)" \
                              " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                              % (title, jiancedianName, jianceTime, jianceProject, jianceValue, biaozhunValue, shifoudabiao, chaobiaobenshu, shifoutingchan)
                        dbclient.save(sql)
                    except:
                        continue
        except:
            continue
def start():
    # fetch page 1
    pageToken = 1
    start_url = 'https://forum.cyberctm.com/home.php?mod=space&uid=503430&do=thread&view=me&order=dateline&page=' + str(pageToken)
    if USE_PROXY:
        response = requests.get(start_url, headers=start_headers, proxies=proxies)
    else:
        response = requests.get(start_url, headers=start_headers)
    # print(response.text)
    html = HTML(response.text)
    url_list = html.xpath('//ul[@id="waterfall"]/li/div/h2/a/@href')
    title_list = html.xpath('//ul[@id="waterfall"]/li/div/h2/a/text()')
    num = 1
    item_list = []
    for url, title in zip(url_list, title_list):
        # request the detail page
        print(str(num) + '. ' + title)
        link = 'https://forum.cyberctm.com/' + url
        # print(link)
        obj = {'numkey': str(num), 'link': link, 'title': title}
        item_list.append(obj)
        num += 1
    # fetch page 2
    pageToken = 2
    start_url = 'https://forum.cyberctm.com/home.php?mod=space&uid=503430&do=thread&view=me&order=dateline&page=' + str(pageToken)
    if USE_PROXY:
        response = requests.get(start_url, headers=start_headers, proxies=proxies)
    else:
        response = requests.get(start_url, headers=start_headers)
    # print(response.text)
    html = HTML(response.text)
    url_list = html.xpath('//ul[@id="waterfall"]/li/div/h2/a/@href')
    title_list = html.xpath('//ul[@id="waterfall"]/li/div/h2/a/text()')
    for url, title in zip(url_list, title_list):
        # request the detail page
        print(str(num) + '. ' + title)
        link = 'https://forum.cyberctm.com/' + url
        # print(link)
        obj = {'numkey': str(num), 'link': link, 'title': title}
        item_list.append(obj)
        num += 1
    # start posting replies
    num_input_listStr = input('\n请输入要发布评论的帖子的序号:')
    sleepTime = input('\n请输入多少分钟后循环发布:')
    while True:
        num_input_list = num_input_listStr.split('.')
        with open('评论内容.txt') as f:
            mycomment = f.read().strip()
        print('当前评论内容是:' + mycomment)
        for num_input in num_input_list:
            for item in item_list:
                if item['numkey'] == num_input:
                    print('\n正在评论:' + str(item['numkey']))
                    try:
                        setRes = setComment(item['link'], mycomment)
                        if setRes:
                            pass
                        else:
                            setRes = setComment(item['link'], mycomment)
                    except:
                        print('未知错误')
                        break
        sleepTimeMin = 60 * int(sleepTime)
        print('\n当前时间:' + str(time.strftime('%Y-%m-%d %H:%M:%S')))
        print('等待下一轮:' + sleepTime + '分钟后重新启动。。。')
        time.sleep(sleepTimeMin)
import requests
from lxml.etree import HTML
import re

# site URL
url = 'https://www.whu.edu.cn/'
# send the request
response = requests.get(url)
# set the encoding
response.encoding = 'utf8'
# parse the response with lxml
html = HTML(response.text)
# pull the data out with xpath
lis = html.xpath('//a/@href')
titles = html.xpath('//a/text()')
for li, title in zip(lis, titles):
    # drop entries with no usable title
    title = title.strip()
    if title != '':
        # filter with a regular expression
        if re.match('http://news.*?|info', li):
            # only keep news content
            if li[:4] == 'info':
                link = 'https://www.whu.edu.cn/' + li
            else:
                link = li
            print(link, title)
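# Sketch (not part of the original snippet): the link handling above concatenates strings
# to build absolute URLs; urllib.parse.urljoin does the same job and also copes with
# relative paths. The URL and xpath mirror the example above; the filtering is simplified.
from urllib.parse import urljoin

import requests
from lxml.etree import HTML

base = 'https://www.whu.edu.cn/'
page = requests.get(base)
page.encoding = 'utf8'
doc = HTML(page.text)
for a in doc.xpath('//a[@href]'):
    title = (a.text or '').strip()
    if title:
        # urljoin leaves absolute links untouched and resolves relative ones against the base
        print(urljoin(base, a.get('href')), title)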
def parse_item(self, response):
    title = response.meta['title']
    describe = response.meta['describe']
    publishedDate = response.meta['publish']
    pic_url = response.meta['pic']
    app_name = '搜狐新闻'
    author = ''
    home_url = 'https://api.k.sohu.com/'
    crawlTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
    publishedDate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(publishedDate) / 1000))
    category = '要闻'
    data = json.loads(response.body)
    content = data['content']
    selector = HTML(content)
    content = selector.xpath('//text()')
    content = ''.join(content)
    content = content.replace('\t', '').replace('\n', '').replace('\r', '')
    pic_more_url = data['photos']
    pic = []
    for i in range(len(pic_more_url)):
        pic.append(str(pic_more_url[i]['pic']))
    pic_more_url = pic
    print "app名称", app_name
    print "主图片url", pic_url
    print "子图片url", pic_more_url
    print "作者", author
    print "详情页地址", response.url
    print "所属类型", category
    print "标题", title
    print "描述", describe
    print "内容", content
    print "主url", home_url
    print "发布时间", publishedDate
    print "爬取时间", crawlTime
    self.count += 1
    url = response.url
    item = NewsItem()
    item['app_name'] = app_name
    item['pic_url'] = pic_url
    item['pic_more_url'] = pic_more_url
    item['author'] = author
    item['url'] = url
    item['category'] = category
    item['title'] = title
    item['describe'] = describe
    item['content'] = content
    item['home_url'] = home_url
    item['publishedDate'] = publishedDate
    item['crawlTime'] = crawlTime
    item['count'] = self.count
    timeArray = time.strptime(publishedDate, "%Y-%m-%d %H:%M:%S")
    timeStamp = int(time.mktime(timeArray))
    if timeStamp >= self.timeStamp:
        numappName = self.readjson()
        if len(numappName) == 0:
            items = {'title': title}
            with open('souhuxinwen.json', 'a+') as fp:
                line = json.dumps(dict(items), ensure_ascii=False) + '\n'
                fp.write(line)
            yield item
        else:
            for i in range(len(numappName)):
                if numappName[i]['title'] == item['title']:
                    return
                else:
                    items = {'title': item['title']}
                    with open('souhuxinwen.json', 'a+') as fp:
                        line = json.dumps(dict(items), ensure_ascii=False) + '\n'
                        fp.write(line)
                    yield item
def xpath(self, html, *tags, exclude=None):
    xhtml = HTML(html)
    exclude = '[not(name()={})]'.format(exclude) if exclude else ''
    LogControl.info("//" + "//".join(tags) + exclude) if self.debug else ''
    for item in xhtml.xpath("//" + "//".join(tags) + exclude):
        yield item
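# A minimal, hypothetical harness for the generator method above: the Scraper class,
# its debug flag, the stub LogControl logger, and the sample markup are stand-ins,
# not part of the original project. The method body is copied verbatim from the snippet.
from lxml.etree import HTML


class LogControl:
    @staticmethod
    def info(msg):
        print('[xpath]', msg)


class Scraper:
    debug = True

    def xpath(self, html, *tags, exclude=None):
        xhtml = HTML(html)
        exclude = '[not(name()={})]'.format(exclude) if exclude else ''
        LogControl.info("//" + "//".join(tags) + exclude) if self.debug else ''
        for item in xhtml.xpath("//" + "//".join(tags) + exclude):
            yield item


doc = '<div><span>a</span><span>b</span><script>x()</script></div>'
# tags are joined with '//', so ('div', '*') builds //div//*; exclude needs its own quotes
for el in Scraper().xpath(doc, 'div', '*', exclude='"script"'):
    print(el.tag, el.text)  # prints the two <span> elements; the <script> is filtered out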
def _get_total_drug_page(self, url):
    response = requests.get(url)
    sel = HTML(response.content)
    total_pages = int(sel.xpath('//span[@class="p-skip"]/em/b/text()')[0])
    return total_pages
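# Sketch of a paging loop driven by a total-page lookup like _get_total_drug_page above.
# LIST_URL and parse_page are hypothetical placeholders, not part of the original code.
import requests
from lxml.etree import HTML

LIST_URL = 'https://example.com/drugs?page={page}'


def get_total_pages(url):
    response = requests.get(url)
    sel = HTML(response.content)
    # assumes the pager shows the last page number as <span class="p-skip"><em><b>N</b></em></span>
    return int(sel.xpath('//span[@class="p-skip"]/em/b/text()')[0])


def crawl_all(parse_page):
    total = get_total_pages(LIST_URL.format(page=1))
    for page in range(1, total + 1):
        parse_page(requests.get(LIST_URL.format(page=page)).text)

# usage (needs a real listing page behind LIST_URL):
# crawl_all(lambda body: print(len(body)))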
def parse(obj):
    detail_url = 'https://forum.app.autohome.com.cn/forum_v9.8.0/forum/club/topiccontent-a2-pm2-t{id}-o0-p1-s20-c1-nt0-fs0-sp0-al0-cw360-i0-ct0-mid0-abX-isar1.json'
    start_url = detail_url.format(id=obj['id'])
    print(start_url)
    headers = {
        'User-Agent': "Android6.0 autohome9.8.5 Android",
        'sample': "0",
        'reqid': "863100032895926/1547540160089/346",
        'apisign': "2|863100032895926|autohomebrush|1547540157|11D6CEBC22A1C9729C5C683E24F5D6AE",
        'Host': "forum.app.autohome.com.cn",
        'Connection': "Keep-Alive",
        'Accept-Encoding': "gzip",
        'cache-control': "no-cache",
        'Postman-Token': "8ca13309-a73f-4acb-8414-921c52aa0665"
    }
    response = requests.get(start_url, headers=headers, verify=False)
    # print(response.text)
    responseRrStr = response.text.replace('<span class="hs_kw0_mainpl"></span>', ',') \
        .replace('<span class="hs_kw1_mainpl"></span>', '了') \
        .replace('<span class="hs_kw2_mainpl"></span>', '是') \
        .replace('<span class="hs_kw3_mainpl"></span>', '的') \
        .replace('<span class="hs_kw4_mainpl"></span>', '不') \
        .replace('<span class="hs_kw6_mainpl"></span>', '?') \
        .replace('<span class="hs_kw5_mainpl"></span>', '。')
    html = HTML(responseRrStr)
    id = obj['id']
    url = start_url
    if obj['topictype'] == '精':
        jinghua = '1'
    else:
        jinghua = '0'
    userName = obj['userName']
    title = obj['title']
    publishDate = obj['publishDate']
    replyCount = obj['replyCount']
    clickCount = html.xpath('string(//span[@class="view"])').replace('浏览', '')
    content = html.xpath('string(//div[@class="tz-paragraph"])')
    save_res = id + '||' + url + '||' + userName + '||' + title + '||' + jinghua + '||' + clickCount + '||' + replyCount + '||' + content + '||' + publishDate
    save_res = save_res.replace(',', ',').replace(' ', '').replace('\n', ' ').replace('\r', ' ').replace('||', ',').strip() + '\n'
    print(save_res)
    with open('post.csv', 'a', encoding='utf8', errors='ignore') as f:
        f.write(save_res)
    commentEtreeList = html.xpath('//ul[@class="post-flow"]/li')
    for eachEtree in commentEtreeList:
        try:
            commentStr = etree.tostring(eachEtree)
            comment_html = HTML(commentStr)
            comment_list = comment_html.xpath('//div[@class="yy_reply_cont"]//text()')
            commentContent = ''.join(comment_list)
            if commentContent == '':
                continue
            with open('comment.csv', 'a') as f:
                commentRes = id + ',' + commentContent.replace(',', ',').replace('\n', ' ').strip() + '\n'
                f.write(commentRes)
        except:
            continue
def date_xpath(self, resp, id):
    items = {}
    etre = HTML(resp)
    items["_id"] = id
    items["name"] = "".join(etre.xpath('//div[@class="zbxq_name"]/text()'))
    items["company"] = "".join(etre.xpath('//div[@class="zbxq_time"]/a[1]/text()'))
    items["userID"] = "".join(etre.xpath('//div[@class="zbxq_time"]/span//label/text()'))
    cont = etre.xpath('//div[@class="see"]/table')
    if len(cont) > 1:
        # certificate name / practice seal number / registered major / registration number / validity period
        trlt = etre.xpath('//div[@class="see"]/table[1]//tr')
        first = []
        k = 0
        v = 3
        for x in range(len(trlt) + 1):
            if v == x:
                y = trlt[k:v]
                first.append(y)
                k += 3
                v += 3
        for _ in range(len(first)):
            items[f"certificate_name_{str(_)}"] = "".join(first[_][0].xpath('.//td[2]//text()'))
            items[f"practice_seal_{str(_)}"] = "".join(first[_][0].xpath('.//td[4]//text()'))
            items[f"reg_major_{str(_)}"] = "".join(first[_][1].xpath('.//td[2]//text()'))
            items[f"reg_number_{str(_)}"] = "".join(first[_][1].xpath('.//td[4]//text()'))
            items[f"validity_time_{str(_)}"] = "".join(first[_][2].xpath('.//td[2]/span/span[2]/text()'))
        tr_2_lt = etre.xpath('//div[@class="see"]/table[2]//tr')
        second = []
        k = 0
        v = 2
        for x in range(len(tr_2_lt) + 1):
            if v == x:
                y = tr_2_lt[k:v]
                second.append(y)
                k += 2
                v += 2
        for _ in range(len(second)):
            items[f"certificate_category_{str(_)}"] = "".join(second[_][0].xpath('.//td[2]//text()'))
            items[f"certificate_num_{str(_)}"] = "".join(second[_][1].xpath('.//td[2]//text()'))
            items[f"cer_validity_time_{str(_)}"] = "".join(second[_][1].xpath('.//td[4]/span/span[2]/text()'))
        # certificate type / certificate number / certificate name / practice seal number /
        # registered major / registration number / validity period
    else:
        trlt = etre.xpath('//div[@class="see"]/table[1]//tr')
        first = []
        k = 0
        v = 3
        for x in range(len(trlt) + 1):
            if v == x:
                y = trlt[k:v]
                first.append(y)
                k += 3
                v += 3
        for _ in range(len(first)):
            items[f"certificate_name_{str(_)}"] = "".join(first[_][0].xpath('.//td[2]//text()'))
            items[f"practice_seal_{str(_)}"] = "".join(first[_][0].xpath('.//td[4]//text()'))
            items[f"reg_major_{str(_)}"] = "".join(first[_][1].xpath('.//td[2]//text()'))
            items[f"reg_number_{str(_)}"] = "".join(first[_][1].xpath('.//td[4]//text()'))
            items[f"validity_time_{str(_)}"] = "".join(first[_][2].xpath('.//td[2]/span/span[2]/text()'))
    JSK_date.save(items)
    self.log.info(f"数据{id}存入成功")
    return None


def get_num(x):
    return int(re.search('[0-9]+$', x).group(0))


if __name__ == '__main__':
    args = apr.parse_args()
    h = HTML(open(args.file).read().replace('<br>', ''))
    key_var = None
    for key in get_keys(h):
        print '[*] testing key:', key
        stream = ''
        txt = None
        for el in h.xpath('//*[@id or @ui or @di]'):
            if el.text:
                txt = decode_page(el.text, key)
                # print txt
            if not txt:
                continue
            if 'cryptKey' in txt:
                key_var = re.findall('var cryptKey = ([_a-z0-9]+(\[\s*[0-9]+\s*\])?),', txt, re.I)[0][0]
                key_var = re.sub('\s+', '', key_var)
                print '[+] found key_var', key_var
        #txt = method_3(stream,key)
        #print txt
def get_objects(keyword, pageToken):
    # kw = keyword + '%20site:news.163.com'
    kw = keyword
    SEARCH_URL = 'https://www.baidu.com/s?wd={kw}&pn={pageToken}&rn=10&oq={kw}'
    url = SEARCH_URL.format(kw=kw, pageToken=pageToken)
    print(url)
    try:
        search_response = requests.get(url, headers=headers, verify=False)
    except:
        return
    # print(search_response.text)
    html = HTML(search_response.text)
    # with open('aa.txt') as f:
    #     aa = f.read()
    # html = HTML(aa)
    div_list = html.xpath('//div[@id="content_left"]/div')
    rank = 1
    for div in div_list:
        if re.search('广告', etree.tostring(div, encoding='utf8').decode('utf8')):
            if re.search('class="EC_newppim', etree.tostring(div, encoding='utf8').decode('utf8')):
                eachItem_list = div.xpath('./div')
                for eachItem in eachItem_list:
                    deal(eachItem, rank)
                    rank += 1
        else:
            deal(div, rank)
            rank += 1
    # # get the real URLs
    # for site in site_list:
    #     t = site.xpath('h3/a')[0]
    #     link = t.get("href")
    #
    #     title = site.xpath('h3/a//text()')
    #     title = ''.join(title)
    #
    #     publishDateStr = site.xpath('string(div//span[@class=" newTimeFactor_before_abs m"])').replace('-','').strip()
    #
    #     save_res = title+'||'+link+'||'+publishDateStr
    #     save_res = save_res.replace('\n','').replace('\r','').replace(',',',').replace('||',',') + '\n'
    #     print(save_res)
    #     with open('结果.csv','a',encoding='gbk',errors='ignore') as f:
    #         f.write(save_res)
    page = html.xpath('//div[@id="page"]')
    if page:
        if u"下一页" in etree.tostring(page[0], encoding="utf-8", method="text").decode("utf-8"):
            pageToken = int(pageToken) + 10
        else:
            pageToken = False
    return pageToken
async def main():
    async with aiohttp.ClientSession() as session:
        mysql_cli = db.MysqlClient()
        item_list = []
        with open('zhihu_id.txt') as f:
            results = f.readlines()
            for res in results:
                id = res.split(',')[0]
                question = res.split(',')[1]
                obj = {
                    'id': id,
                    'question': question,
                }
                item_list.append(obj)
        for obj in item_list:
            print(obj['id'])
            url = 'https://www.zhihu.com/question/' + obj['id']
            print(url)
            response = await fetch(session, url)
            jsonStr = re.search('<script id="js-initialData".*?>(.*?)</script>', response).group(1)
            json_obj = json.loads(jsonStr)
            print(json.dumps(json_obj))
            for data in json_obj['initialState']['entities']['questions']:
                questionAuthor = json_obj['initialState']['entities']['questions'][data]['author']['name']
                questionAuthorId = json_obj['initialState']['entities']['questions'][data]['author']['urlToken']
                questionAuthor_hashId = json_obj['initialState']['entities']['questions'][data]['author']['id']
                save_re = questionAuthor.replace(',', ',') + ',' + questionAuthorId + ',' + questionAuthor_hashId + '\n'
                with open('author.txt', 'a') as f:
                    f.write(save_re)
            for data in json_obj['initialState']['entities']['answers']:
                question = obj['question']
                answerId = str(json_obj['initialState']['entities']['answers'][data]['id'])
                answer = json_obj['initialState']['entities']['answers'][data]['content']
                html = HTML(answer)
                content_list = html.xpath('//text()')
                answer = ''.join(content_list)
                answerAuthor = json_obj['initialState']['entities']['answers'][data]['author']['name']
                answerAuthorId = json_obj['initialState']['entities']['answers'][data]['author']['urlToken']
                answerAuthor_hashId = json_obj['initialState']['entities']['answers'][data]['author']['id']
                commentCount = str(json_obj['initialState']['entities']['answers'][data]['commentCount'])
                likeCount = str(json_obj['initialState']['entities']['answers'][data]['voteupCount'])
                # print(question)
                # print(answerId)
                # print(answer)
                # print(answerAuthor)
                # print(answerAuthorId)
                # print(answerAuthor_hashId)
                # print(commentCount)
                # print(likeCount)
                save_re = answerAuthor.replace(',', ',') + ',' + answerAuthorId + ',' + answerAuthor_hashId + '\n'
                with open('author.txt', 'a') as f:
                    f.write(save_re)
                sql = "insert into questionDetail(question,answerId,answer,answerAuthor,answerAuthorId,answerAuthor_hashId,commentCount,likeCount)" \
                      " VALUES ('%s', '%s', '%s','%s', '%s', '%s','%s', '%s')" \
                      % (question, answerId, answer, answerAuthor, answerAuthorId, answerAuthor_hashId, commentCount, likeCount)
                print(sql)
                mysql_cli.save(sql)
def oldhome(self, response):
    # if response.status==
    sel = scrapy.Selector(response)
    # passed down from the previous callback
    item = response.meta
    province = item['province']
    city = item['city']
    city_href = item['city_href']
    county = item['county']
    county_href = item['county_href']
    oldhome_href = item['oldhome_href']
    url = response.url
    item = HouseItem()  # item filled in by this callback
    item['province'] = province
    item['city'] = city
    item['city_href'] = city_href
    item['county'] = county
    item['county_href'] = county_href
    item['oldhome_href'] = oldhome_href
    item['date_before'] = self.date_before
    item['building'] = '二手房'
    item['ProgramStarttime'] = self.ProgramStarttime
    # the block that holds the residential-complex info
    detail_table = sel.xpath(".//div[@class='l-c']/div[@class='gary-detail pdd-5']/table[@class='ha_detail_table mt']")
    # the first link passed in sometimes returns an incomplete page
    # when re-fetching with lxml's HTML, extract() is unavailable, so the two branches cannot be merged
    if detail_table == []:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        urls = requests.get(url, headers=headers).text
        html = HTML(urls)
        # the part of this page that holds the complex info
        detail_table = html.xpath(".//div[@class='l-c']/div[@class='gary-detail pdd-5']/table[@class='ha_detail_table mt']")[0]
        # list of all complexes in this county/district
        detail = detail_table.xpath(".//tr[@height='25px;']")
        # extract each complex
        for d in detail:
            # complex name
            item['house'] = d.xpath(".//a[@class='c_blue']/text()")[0]
            # last month's price (average unit price)
            item['price'] = d.xpath(".//td[4]/span/text()")[0]
            # month-on-month change
            rate = d.xpath(".//td[5]/span/text()")[0]
            if '--' not in rate:
                if rate[0] == '-':
                    item['rate_m_unit'] = '下降'
                    item['rate_m'] = rate[1:]
                elif rate[0] == '+':
                    item['rate_m_unit'] = '上升'
                    item['rate_m'] = rate[1:]
                else:
                    item['rate_m_unit'] = None
                    item['rate_m'] = rate
            else:
                item['rate_m_unit'] = None
                item['rate_m'] = rate
            yield item
    else:
        # list of all complexes in this county/district
        detail = detail_table.xpath(".//tr[@height='25px;']")
        # extract each complex
        for d in detail:
            # complex name
            item['house'] = d.xpath(".//a[@class='c_blue']/text()").extract()[0]
            # last month's price (average unit price)
            item['price'] = d.xpath(".//td[4]/span/text()").extract()[0]
            # month-on-month change
            rate = d.xpath(".//td[5]/span/text()").extract()[0]
            if '--' not in rate:
                if rate[0] == '-':
                    item['rate_m_unit'] = '下降'
                    item['rate_m'] = rate[1:]
                elif rate[0] == '+':
                    item['rate_m_unit'] = '上升'
                    item['rate_m'] = rate[1:]
                else:
                    item['rate_m_unit'] = None
                    item['rate_m'] = rate
            else:
                item['rate_m_unit'] = None
                item['rate_m'] = rate
            yield item
def _parse_detail(self, li):
    item = dict()
    item['_id'] = li.xpath('./div[@class="div05"]/h2/a/@rjs8').pop()
    item['url'] = urljoin(self.base_url, li.xpath('./div[@class="div05"]/h2/a[1]/@href').pop())
    item['title'] = li.xpath('./div[@class="div05"]/h2/a[1]/@title').pop()
    response = self._get_response(item['url'])
    text = self._get_text(response)
    detail_html = HTML(text)
    # regulation document number
    item['fgwh'] = detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[2]/p/text()') if len(
        detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[2]/p/text()')) > 0 else None
    # publication date
    item['fbrq'] = detail_html.xpath('//p[@id="tdat"]/text()') if len(
        detail_html.xpath('//p[@id="tdat"]/text()')) > 0 else None
    # effective date
    item['ssrq'] = detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[4]/p/text()') if len(
        detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[4]/p/text()')) > 0 else None
    # issuing department
    item['fbbm'] = detail_html.xpath('//p[@id="tdpt"]/text()') if len(
        detail_html.xpath('//p[@id="tdpt"]/text()')) > 0 else None
    # level of legal effect
    item['xldj'] = detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[6]/p/text()') if len(
        detail_html.xpath('/html/body/div[8]/div/div/div[3]/ul/li[6]/p/text()')) > 0 else None
    # main text
    item['maintext'] = detail_html.xpath('//div[@id="maintext"]/text()') if len(
        detail_html.xpath('//div[@id="maintext"]/text()')) > 0 else None
    return item
def newhome(self, response):
    # passed down from the previous callback
    sel = scrapy.Selector(response)
    item = response.meta
    province = item['province']
    city = item['city']
    city_href = item['city_href']
    county = item['county']
    # intermediate variable (not returned in the yielded item)
    cpage = item['cpage']  # current page
    county_href = item['county_href']
    newhome_href = item['newhome_href']
    # intermediate variable (not returned in the yielded item)
    newhome_fweb = item['newhome_fweb']  # first-page link (used later to build pagination links)
    url = response.url
    item = HouseItem()  # item filled in by this callback
    item['province'] = province
    item['city'] = city
    item['city_href'] = city_href
    item['county'] = county
    item['county_href'] = county_href
    item['newhome_href'] = newhome_href
    item['building'] = '新楼盘'
    item['date_before'] = self.date_before
    item['ProgramStarttime'] = self.ProgramStarttime
    boxs = sel.xpath(".//div[@id='content']/div[@class='halistbox']")
    # when re-fetching with lxml's HTML, extract() is unavailable, so the two branches cannot be merged
    if boxs == []:  # the page sometimes comes back incomplete
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        urls = requests.get(url, headers=headers).text
        html = HTML(urls)
        # the part of this page that holds all the complex info
        boxs = html.xpath(".//div[@id='content']/div[@class='halistbox']")[0]
        # list of complex entries
        box = boxs.xpath(".//div[@class='halist clearfix']")
        # each complex
        for b in box:
            # complex name
            item['house'] = b.xpath(".//div[@class='title mb5 clearfix']/h4[@class='tit fl mr']/a/text()")[0]
            # text = ['均价:', '元/㎡', '(2017-06-12)'] or []
            text = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/*/text()")
            if text:
                try:
                    # price type
                    item['price_type'] = text[0][:-1]
                except:
                    item['price_type'] = None
                try:
                    # price publish date
                    item['time'] = text[2][1:-1]
                except:
                    item['time'] = None
            # price_info = ['25,000'] or []
            price_info = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/span/*/text()")
            if price_info:
                # price
                item['price'] = price_info[0]
            yield item
        # "共**页" (total number of pages)
        try:
            pages = boxs.xpath(".//div[@class='page1 mb5 clearfix']/span[@class='page_p']/text()")[0]
            page = int(re.findall("共(.*?)页", pages)[0])
        except:
            page = None
    else:
        # list of complex entries
        box = boxs.xpath(".//div[@class='halist clearfix']")
        # each complex
        for b in box:
            # complex name
            item['house'] = b.xpath(".//div[@class='title mb5 clearfix']/h4[@class='tit fl mr']/a/text()").extract()[0]
            # text = ['均价:', '元/㎡', '(2017-06-12)'] or []
            text = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/*/text()").extract()
            if text:
                try:
                    # price type (average or starting price)
                    item['price_type'] = text[0][:-1]
                except:
                    item['price_type'] = None
                try:
                    # price update date
                    item['time'] = text[2][1:-1]
                except:
                    item['time'] = None
            # price_info = ['25,000'] or []
            price_info = b.xpath(".//div[@class='text']/ul[@class='mb15']/li[1]/span/*/text()").extract()
            if price_info:
                item['price'] = price_info[0]
            yield item
        # "共**页" (total number of pages)
        try:
            pages = boxs.xpath(".//div[@class='page1 mb5 clearfix']/span/text()").extract()[0]
            page = int(re.findall("共(.*?)页", pages)[0])
        except:
            page = None
    # pagination
    if page:
        if cpage < page:
            # build the next-page link from the county's first-page link
            newhome_href = newhome_fweb[:-1] + "-pg" + str(cpage + 1) + "/"
            item['cpage'] = cpage + 1
            item['newhome_fweb'] = newhome_fweb
            item['newhome_href'] = newhome_href
            yield scrapy.Request(url=newhome_href, callback=self.newhome, meta=item, dont_filter=True)
def get_profile(uid, deepNum, preName):
    # fields: user ID, user tag (maker or maker mentor), user location (e.g. Gaomi No.5 Middle School),
    # signature, following count, follower count, works count, likes received this month, total visits,
    # medal-wall count and names, featured-works count, public works count, excellent works count, all work tags
    # userId userType address
    url = 'http://www.i3done.com/u/{uid}'
    start_url = url.format(uid=uid)
    response = down.get_html(start_url, headers=headers)
    if response:
        # print(response.text)
        html = HTML(response.text)
        userName = html.xpath('string(//div[@class="zw-banner-user"]/span/text()|//strong[@class="hide_text"]/text())')
        userType = html.xpath('string(//a[@class="maker"]/@title|//a[@class="tutor"]/@title|//a[@class="teacher"]/@title)')
        address = html.xpath('string(//p[@class="school-name"]/text()|//p[@class="jsle"]/a/@title)')
        description = html.xpath('string(//div[@class="zw-banner-sign"]/span/text()|//p[@class="zxpent"]/text())').strip()
        followCount = html.xpath('string(//div[@class="zw-zone-box zw-right-data"]//a[@data-id="focus"]/text())').strip()
        fansCount = html.xpath('string(//div[@class="zw-zone-box zw-right-data"]//a[@data-id="fans"]/text()|//span[@id="cares"]/text())').strip()
        worksCount = html.xpath('string(//div[@class="zw-zone-box zw-right-data"]//a[@data-tabid="tuzhi"]/text())')
        likeCount = html.xpath('string(//b[@id="likes"]/text()|//font[@id="likes"])')
        createDate = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        clickCount = html.xpath('string(//div[@class="zw-right-fans"]/i/em/text()|//div[@class="zan_03"]/span/text())')
        clickCount = re.search('^(\d+)人访问', clickCount)
        if clickCount:
            clickCount = clickCount.group(1)
        else:
            clickCount = '0'
        honorCount = html.xpath('string(//i[@class="text_red"]/text())')
        level = html.xpath('string(//div[@class="zw-zone-des"]/span/text())').replace('\n', '').replace('\r', '').replace('\t', '').strip()
        try:
            level = re.search('等级:L(\d+)', level).group(1)
        except:
            level = ''
        rank = html.xpath('string(//div[@class="zw-user-rank"]/span/text())').replace('全网排名:', '')
        verifyTime = html.xpath('string(//img[@class="cp"]/@src | //div[@class="tips"]/text())')
        if '年' in verifyTime:
            try:
                verifyTime = re.search('(\d+年\d+月\d+日)', verifyTime).group(1)
            except:
                print('verifyTime error..')
        # fetch honors
        get_honor(uid)
        # fetch works
        publicWorkCount, excellentWorkCount, tags, category_list = get_works_first(uid)
        createDate = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        sql = "insert into user(userId,userName,userType,address,description,followCount,fansCount,worksCount,likeCount,clickCount,honorCount,publicWorkCount,excellentWorkCount,tags,level,rank,verifyTime,deepNum,preName,createDate) VALUES ('%s','%s','%s', '%s', '%s', '%s','%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s')" % (
            uid, userName, userType, address, description, followCount, fansCount, worksCount, likeCount, clickCount, honorCount, publicWorkCount, excellentWorkCount, tags, level, rank, verifyTime, deepNum, preName, createDate) \
            + "ON DUPLICATE KEY UPDATE followCount='%s', fansCount='%s',deepNum='%s'" % (followCount, fansCount, deepNum)
        print(sql)
        mysqlCli.save(sql)
        return category_list
# CSV header row
save = '序号,标题,链接,作者,时间,点赞数,评论数,转发数\n'
with open('res.csv', 'w') as f:
    f.write(save)

account = 1
# paginate
for page in range(1, 14):
    url = URL + str(page)
    response = requests.get(url, headers=headers)
    json_obj = json.loads(response.text)
    html_str = json_obj['data']
    html = HTML(html_str)
    # parse with lxml's xpath
    titles = html.xpath('//div[@class="UG_list_b"]//h3[@class="list_title_b"]/a/text()')
    hrefs = html.xpath('//div[@class="UG_list_b"]//h3[@class="list_title_b"]/a/@href')
    authors = html.xpath('//div[@class="UG_list_b"]//div[@class="subinfo_box clearfix"]/a[2]/span[1]/text()')
    times = html.xpath('//div[@class="UG_list_b"]//div[@class="subinfo_box clearfix"]/span[1]/text()')
    likes = html.xpath('//div[@class="UG_list_b"]//div[@class="subinfo_box clearfix"]/span[2]/em[2]/text()')
    comments = html.xpath('//div[@class="UG_list_b"]//div[@class="subinfo_box clearfix"]/span[4]/em[2]/text()')
    zhufas = html.xpath(
import requests
import re
from lxml.etree import HTML

url = 'https://product.suning.com/0000000000/694729819.html'
response_text = requests.get(url).text
html = HTML(response_text)
try:
    p_Name = re.findall('"itemDisplayName":"(.*?)"', response_text)[0]
except:
    p_Name = None
xxx = html.xpath(".//div[@class='imgzoom-main']/a[@id='bigImg']/img/@alt")[0]
print(xxx)
print(p_Name)
print(len(p_Name))
def lvmama_poi_detail(url):
    with requests.session() as sess:
        response = sess.get(url)
        html = HTML(response.text)
        item = LvmamaPoiDetailItem()
        item['raw'] = {'html': str(lzma.compress(response.content))}
        if 'sight' in url:
            item['head'] = get_text_by_xpath(html, './/span[@class="crumbs_nav"]/span//text()')
            item['title'] = get_text_by_xpath(html, './/div[@class="vtop-name-box"]/h2[@class="title"]/text()')
            item['title_en'] = get_text_by_xpath(html, './/div[@class="vtop-name-box"]/span[@class="title-eng"]/text()')
            item['vcomon'] = get_text_by_xpath(html, './/div[@class="vtop-name-box"]/i[@class="vcomon-icon"]/text()')
            # item['country'] = response.request.meta.get('country')
            dls = html.xpath('.//dl[@class="poi_bordernone"]')
            for dl in dls:
                dt = get_text_by_xpath(dl, './/dt//text()')
                dd = get_text_by_xpath(dl, './/dd//text()')
                if '简介' in dt:
                    item['poi_brief'] = dd
                elif '景点导览' in dt:
                    item['poi_detail'] = dd
                elif '交通信息' in dt:
                    item['traffic'] = dd
                elif '小贴士' in dt:
                    item['poi_tip_content'] = dd
            dts = html.xpath('.//div[@class="vtop-comment-box fl"]/dl/dt')
            dds = html.xpath('.//div[@class="vtop-comment-box fl"]/dl/dd')
            for dt, dd in zip(dts, dds):
                dt = get_text_by_xpath(dt, './/text()')
                dd = get_text_by_xpath(dd, './/text()')
                if '地 址' in dt:
                    item['address'] = dd
                elif '游玩时间' in dt:
                    item['playtime'] = dd
                elif '联系电话' in dt:
                    item['phone'] = dd
                elif '门票' in dt:
                    item['ticket'] = dd
                elif '开放时间' in dt:
                    item['open_time'] = dd
                elif '网址' in dt:
                    item['website'] = dd
        elif 'zone' in url:
            item['head'] = get_text_by_xpath(html, './/div[@class="nav clearfix"]/span[@class="crumbs_nav fl"]//text()')
            item['title'] = get_text_by_xpath(html, './/div[@class="nav_country clearfix"]/div[@class="countryBox fl"]/h1/text()')
            item['title_en'] = get_text_by_xpath(html, './/div[@class="nav_country clearfix"]/div[@class="countryBox fl"]/h1/span/text()')
            item['active'] = get_text_by_xpath(html, './/div[@class="nav_country clearfix"]/div[@class="countryBox fl"]/p[@class="active"]/text()')
            dls = html.xpath('.//div[@class="city_viewBox"]/div[@class="city_view_model"]/div/dl')
            for dl in dls:
                dt = get_text_by_xpath(dl, './/dt//text()')
                dd = get_text_by_xpath(dl, './/dd//text()')
                if '简介' in dt:
                    item['poi_brief'] = dd
                elif '景点导览' in dt:
                    item['poi_detail'] = dd
                elif '交通信息' in dt:
                    item['traffic'] = dd
                elif '小贴士' in dt:
                    item['poi_tip_content'] = dd
            divs = html.xpath('.//dl[@class="city_mapList clearfix"]/dd/div')
            for div in divs:
                dt = get_text_by_xpath(div, './/p[1]//text()')
                dd = get_text_by_xpath(div, './/p[2]//text()')
                if '地址' in dt.replace(' ', ''):
                    item['address'] = dd
                elif '游玩时间' in dt:
                    item['playtime'] = dd
                elif '联系电话' in dt:
                    item['phone'] = dd
                elif '门票' in dt:
                    item['ticket'] = dd
                elif '开放时间' in dt:
                    item['open_time'] = dd
                elif '网址' in dt:
                    item['website'] = dd
        # item['url'] = response.request.url
        return item
driver.switch_to.window(handles[1])
t1 = threading.Thread(target=deletAsin)
t1.start()
while True:
    for link in links:
        time.sleep(0.4)
        driver.get(link)
        wait = WebDriverWait(driver, 10, 0.2)
        wait.until(lambda driver: driver.find_element_by_xpath("//div[@data-index='9']"))
        text = driver.page_source
        html = HTML(text)
        now_asins = {}
        try:
            now_asins = set(html.xpath("//div/@data-asin")) - {'" data-index=', ''}
        except:
            pass
        for count in range(2):
            if links.index(link) == 0:
                countlist = countlist1
            else:
                countlist = countlist2
            while now_asins.__len__() not in countlist:
                # print(str(links.index(link)) + ': ' + str(now_asins.__len__()))
                time.sleep(0.2)
                text = driver.page_source
                html = HTML(text)
                try:
                    now_asins = set(html.xpath("//div/@data-asin")) - {
                        '" data-index=', ''
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# imports
import requests
from lxml.etree import HTML

# initial URL template
URL = 'http://sousuo.gov.cn/column/40123/{page}.htm'

# paginate
for i in range(0, 11):
    start_url = URL.format(page=i)
    print(start_url)
    # send the request
    response = requests.get(start_url)
    # parse the response with lxml
    html = HTML(response.text)
    # extract the URLs and titles
    urls = html.xpath('//ul[@class="listTxt"]/li/h4/a/@href')
    titles = html.xpath('//ul[@class="listTxt"]/li/h4/a/text()')
    # pair up and save the results
    for url, title in zip(urls, titles):
        print(url, title)
        with open('结果.txt', 'a') as f:
            f.write(url + ',' + title + '\n')
def trade_info(self):
    """
    Scrape the trademark detail page.
    :return:
    """
    self.s.headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
    })
    data = red_cli.srandmember("BKID")
    url = eval(data)["url"]
    # url = 'https://www.tmkoo.com/detail/24ff6eaa997007f2967541ee3bb13223/11/'
    resp = self.get_req(url)
    etre = HTML(resp.text)
    regist_num = "".join(etre.xpath('//td[contains(text(),"注册号")]/following-sibling::td[1]/font/text()'))
    international_class = "".join(etre.xpath('//td[contains(text(),"注册号")]/following-sibling::td[2]/font/text()'))
    regist_time = "".join(etre.xpath('//td[contains(text(),"申请日期")]/following-sibling::td[1]//text()'))
    registrant_chinese_name = "".join(etre.xpath('//td[contains(text(),"申请人名称(中文)")]/following-sibling::td[1]/div/text()'))
    registrant_foreign_name = "".join(etre.xpath('//td[contains(text(),"申请人名称(英文)")]/following-sibling::td[1]//text()'))
    registrant_foreign_address = "".join(etre.xpath('//td[contains(text(),"申请人地址(英文)")]/following-sibling::td[1]//text()'))
    image_url = "".join(etre.xpath('//td[@align="center"]/img/@src'))
    preliminary_notice_num = "".join(etre.xpath('//td[contains(text(),"初审公告期号")]/following-sibling::td[1]//text()'))
    regist_notice_num = "".join(etre.xpath('//td[contains(text(),"注册公告期号")]/following-sibling::td[1]//text()'))
    preliminary_notice_time = "".join(etre.xpath('//td[contains(text(),"初审公告日期")]/following-sibling::td[1]//text()'))
    regist_notice_time = "".join(etre.xpath('//td[contains(text(),"注册公告日期")]/following-sibling::td[1]//text()'))
    special_period_effective_time = "".join(etre.xpath('//td[contains(text(),"专用权期限")]/following-sibling::td[1]//text()'))
    is_co_regist = "".join(etre.xpath('//td[contains(text(),"是否共有商标")]/following-sibling::td[1]//text()'))
    international_later_time = "".join(etre.xpath('//td[contains(text(),"后期指定日期")]/following-sibling::td[1]//text()'))
    international_regist_time = "".join(etre.xpath('//td[contains(text(),"国际注册日期")]/following-sibling::td[1]//text()'))
    priority_date = "".join(etre.xpath('//td[contains(text(),"优先权日期")]/following-sibling::td[1]//text()'))
    agent_name = "".join(etre.xpath('//td[contains(text(),"代理人名称")]/following-sibling::td[1]//text()'))
    color_indication = "".join(etre.xpath('//td[contains(text(),"指定颜色")]/following-sibling::td[1]//text()'))
    trademark_type = "".join(etre.xpath('//td[contains(text(),"商标类型")]/following-sibling::td[1]//text()'))
    form = "".join(etre.xpath('//td[contains(text(),"商标状态")]/following-sibling::td[1]//text()'))
    commodity_num = etre.xpath('//a[contains(text(),"具体核准商品/服务以商标公告为准,点击查看!")]/ancestor::td//table//tr/td[@align="right"]/text()')
    commodity_chinese_name = etre.xpath('//a[contains(text(),"具体核准商品/服务以商标公告为准,点击查看!")]/ancestor::td//table//tr/td[3]/text()')
    lt = []
    for _ in range(len(commodity_num)):
        i = {}
        i["commodity_num"] = commodity_num[_]
        i["commodity_chinese_name"] = commodity_chinese_name[_]
        lt.append(i)
    item = {
        "_id": eval(data)["_id"],
        "regist_num": regist_num,
        "international_class": international_class,
        "regist_time": regist_time,
        "registrant_chinese_name": registrant_chinese_name,
        "image_url": image_url,
        "preliminary_notice_num": preliminary_notice_num,
        "regist_notice_num": regist_notice_num,
        "preliminary_notice_time": preliminary_notice_time,
        "regist_notice_time": regist_notice_time,
        "special_period_effective_time": special_period_effective_time,
        "international_later_time": international_later_time,
        "international_regist_time": international_regist_time,
        "priority_date": priority_date,
        "color_indication": color_indication,
        "trademark_type": trademark_type,
        "form": form,
        "is_co_regist": is_co_regist,
        "agent_name": agent_name,
        "trademark_commodity_server_info": lt,
        "registrant_foreign_name": registrant_foreign_name,
        "registrant_foreign_address": registrant_foreign_address,
    }
    item = BK.item_clear(item=item)
    BK_DATA_info.save(item)
    log.info("数据存入成功。。。。。。。。。。")
def start():
    for i in range(1, 400):
        try:
            print('当前页:' + str(i))
            start_url = 'https://z.jd.com/bigger/search.html'
            body = 'status=&sort=&categoryId=&parentCategoryId=&sceneEnd=&productEnd=&keyword=&page=' + str(i)
            response = requests.post(start_url, headers=headers, data=body, timeout=10)
            # print(response.text)
            html = HTML(response.text)
            urls = html.xpath('//div[@class="l-result"]//li/a/@href')
            print(len(urls))
            for url in urls:
                link = 'https://z.jd.com' + url
                print(link)
                try:
                    response = requests.get(link, headers=detail_headers, timeout=10)
                    id = re.search('https://z.jd.com/project/details/(\d+).html', link).group(1)
                    html = HTML(response.text)
                    title = html.xpath('string(//h1)').replace(',', ',').strip()
                    price = html.xpath('string(//p[@class="p-num"]/text())')
                    yu_price = html.xpath('string(//p[@id="projectMessage"]/span[2]/text())')
                    faqiNum = html.xpath('string(//div[@class="promoters-num"]/div[@class="fl start"]/span[@class="num"])')
                    address = html.xpath('string(//div[@class="box-content"]/ul[@class="contact-box"]/li[2]/div[@class="val"])')
                    dangciList = html.xpath('//div[@class="details-right-fixed-box"]/div[@class="box-grade"]//div[@class="t-price"]/span | //div[@class="details-right-fixed-box"]/div[@class="box-grade "]//div[@class="t-price"]/span | //div[@class="details-right-fixed-box"]/div[@class="box-grade "]//div[@class="t-price "]/span | //div[@class="details-right-fixed-box"]/div[@class="box-grade "]//div[@class="t-price"]/span')
                    dangciNum = str(len(dangciList))
                    if 'video' in response.text:
                        has_video = '是'
                    else:
                        has_video = '否'
                    imgXpath = html.xpath('//div[@class="tab-div tab-current"]//p/img')
                    img_len = str(len(imgXpath))
                    # like = html.xpath('string(//span[@id="praisCount"])').replace('(','').replace(')','')
                    # guanzhu = html.xpath('string(//span[@id="focusCount"])').replace('(','').replace(')','')
                    contentlist = html.xpath('//div[@id="proList"]//text()')
                    content = ''.join(contentlist)
                    content = content.replace(' ', '').replace('\t', '').replace('\n', ' ').replace('\r', ' ').replace(',', ',').strip()
                    count_url = 'https://sq.jr.jd.com/cm/getCount?key=1000&systemId=' + id
                    count_response = requests.get(count_url, headers=count_headers, timeout=10)
                    print(count_response.text)
                    json_obj = json.loads(count_response.text.replace('(', '').replace(')', ''))
                    like = str(json_obj['data']['praise'])
                    guanzhu = str(json_obj['data']['focus'])
                    save_res = id + ',' + link + ',' + title + ',' + price + ',' + yu_price + ',' + has_video + ',' + img_len + ',' + content + ',' + like + ',' + guanzhu + ',' + faqiNum + ',' + address + ',' + dangciNum + '\n'
                    print(save_res)
                    with open('结果.csv', 'a', encoding='gbk', errors='ignore') as f:
                        f.write(save_res)
                except:
                    continue
        except:
            continue
        cursor.execute('SET NAMES utf8;')
        cursor.execute('SET CHARACTER SET utf8;')
        cursor.execute('SET character_set_connection=utf8;')
        cursor.execute(sql)
        db.commit()
    except MySQLdb.Error, e:
        print "Mysql Error %d: %s" % (e.args[0], e.args[1])
    cursor.close()
    db.close()


urlHtml = getHtml("http://cosme.pclady.com.cn/products_list/br0_bs0_bi2_sm68_ef0_pb0_pe0_or0.html")
html = HTML(urlHtml.decode('gbk'))
urlList = html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
parseData(urlList)
for i in range(32, 40):
    if i < 10:
        i = "0" + str(i)
    else:
        i = str(i)
    print i
    htmls = "http://cosme.pclady.com.cn/products_list/br0_bs0_bi2_sm68_ef0_pb0_pe0_or0_p" + i + ".html#productList"
    urlHtml = getHtml(htmls)
    try:
        html = HTML(urlHtml.decode('gbk'))
        urlList = html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
        parseData(urlList)
    except Exception:
        errorTxt.write("\n")
def parse(self, item):
    selector = HTML(item.text)
    href = selector.xpath("//h4/a/@href")
    for url in href:
        self.article_queue.put(urljoin(BASE_URL, url))
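# A minimal, hypothetical harness for a queue-based link collector like parse() above.
# The ArticleCollector class, BASE_URL value, and FakeResponse object are illustrative
# stand-ins, not part of the original project.
import queue
from urllib.parse import urljoin

from lxml.etree import HTML

BASE_URL = 'https://example.com/'  # placeholder base URL


class ArticleCollector:
    def __init__(self):
        self.article_queue = queue.Queue()

    def parse(self, item):
        # item is any object with a .text attribute holding the HTML body
        selector = HTML(item.text)
        for url in selector.xpath("//h4/a/@href"):
            self.article_queue.put(urljoin(BASE_URL, url))


class FakeResponse:
    text = '<div><h4><a href="/post/1">one</a></h4><h4><a href="/post/2">two</a></h4></div>'


collector = ArticleCollector()
collector.parse(FakeResponse())
while not collector.article_queue.empty():
    # prints https://example.com/post/1 and https://example.com/post/2
    print(collector.article_queue.get())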