def gooo(self):
    """Scrape the UDN NBA index page and insert new articles into MySQL.

    Fetches the list page, then for each headline fetches the detail page,
    extracts the article body, and inserts (title, body, url) into the
    ``intime_news`` table unless it appears to be there already.

    NOTE(review): indentation reconstructed from a whitespace-mangled source;
    verify control flow against the original before relying on it.
    Relies on module-level ``requests``, ``time``, ``codecs``, ``pyquery``
    and a DB ``connection`` defined elsewhere in the file.
    """
    num = 1
    i = 50060  # running article counter; only incremented, never used for logic here
    # while True:
    responseS = "https://nba.udn.com/nba/index?gr=www"
    response = requests.get(responseS)
    time.sleep(5)  # throttle: be polite to the server
    # Only proceed when the HTTP status is 200 OK
    if response.status_code == 200:
        # Dump the raw list-page HTML to a UTF-8 text file
        f = codecs.open('nba.txt', 'w', encoding='utf-8')
        f.write(response.text)
        f.close()
        d = pyquery.PyQuery(response.text)
        # One <dt> per headline in the news body
        posts = d('div#mainbar>div#news>#news_body>dl>dt')
        for post in posts.items():
            txt1 = post('h3').text()  # headline text
            href = post('a').attr('href')
            # NOTE(review): returning here aborts the whole crawl on the first
            # link-less entry; `continue` was probably intended.
            if href is None:
                return
            fhref = "https://nba.udn.com" + href
            # grabPage(fhref,txt1,i)
            if fhref is None:  # NOTE(review): dead check — fhref can never be None here
                return
            print("詳細頁面-------------------------------------------------------------------------")
            response = requests.get(fhref)
            time.sleep(3)  # throttle between detail-page requests
            if response.status_code == 200:
                # Dump the raw detail-page HTML to a UTF-8 text file
                f = codecs.open('nda_det.txt', 'w', encoding='utf-8')
                f.write(response.text)
                f.close()
                # Parse the downloaded page with PyQuery
                e = pyquery.PyQuery(response.text)
                dt = e('div#story_body_content>span>p').text()
                # Split once on the first space: [0] is assumed metadata, [1] the body
                dtc = (dt.split(' ', 1))
                print(txt1)
                # NOTE(review): fetchone() with no WHERE/ORDER returns an
                # arbitrary row — this dedup check only compares one title.
                with connection.cursor() as cursor:
                    cursor.execute("select Title from intime_news")
                    data = cursor.fetchone()
                if data != None:
                    print("QQQQQQQQQQQQQQ", data, "QQQQQQQQQQQQ")
                if data == None:
                    # Table empty: insert unconditionally
                    with connection.cursor() as cursor:
                        sql = """insert into intime_news(Title,news,http) values(%s,%s,%s)"""
                        cursor.execute(sql, (txt1, dtc[1], fhref))
                elif data != None:
                    if data[0] != txt1:
                        with connection.cursor() as cursor:
                            sql = """insert into intime_news(Title,news,http) values(%s,%s,%s)"""
                            try:
                                cursor.execute(sql, (txt1, dtc[1], fhref))
                            except:  # NOTE(review): bare except hides real DB errors
                                connection.rollback()
            time.sleep(1)  # custom delay between articles
            i = i + 1
    else:
        print('搜尋結果回傳代碼並非 200')
    num += 1
def parse(self, response):
    """Parse a huangye88 mobile product page into a ``goods_data`` dict.

    Observed page variants (kept from the original notes):
        http://yongzhou.huangye88.com/xinxi/946_232725872.html
        http://shannan.huangye88.com/xinxi/9465_232727550.html
      mobile site:
        http://m.huangye88.com/yiqiyibiao/232631426.html      (deleted listing)
        http://m.huangye88.com/d-zunyi/13451-232631425.html   (normal)
        http://m.huangye88.com/d-nanjing/946-232631428.html   (normal, variant 2)
        http://m.huangye88.com/jiancai/232631442.html         (with images)

    NOTE(review): indentation reconstructed from a whitespace-mangled source.
    A large region of commented-out legacy PC-site parsing code from the
    original has been collapsed into the single marker comment below.
    """
    # Initialise every output field so goods_data can always be built.
    cate_name_3 = ''
    cate_name_2 = ''
    price_unit = ''
    update_time = ''
    price = ''
    send_time = ''
    keywords = ''
    com_username = ''
    send_money = ''
    thumb = ''
    title = ''
    offer_num = ''
    detail = ''
    seller = ''
    min_price = ''
    cate_name_1 = ''
    to_area = ''
    fax = ''
    thumb_2 = ''
    brand = ''
    thumb_1 = ''
    attrs_kv = []  # "key|value" attribute strings
    min_amount = ''
    auth = ''
    telephone = ''
    ww = ''
    wechat = ''
    source_url = ''
    com_addr = ''
    qq = ''
    mobile = ''
    com_url = ''
    from_area = ''
    max_price = ''
    com_name = ''
    source_url = response.url
    # (commented-out legacy PC-site parsing removed for readability: it
    #  extracted title/price/attributes/images via '.pro-text', '.big',
    #  'td.attribute' and '#picsUrl' selectors and is superseded below.)
    # ---------------------------- mobile site ----------------------------
    # The breadcrumb <section class="mianbaoxie"> only exists on live
    # (non-deleted) listings.
    if response.xpath('//section[@class="mianbaoxie"]/span'):
        # Listing is still online.
        try:
            title = response.xpath(
                '//div[@class="text-desc"]/div/h1/text()').extract()[0]
        except:
            pass
        try:
            price = response.xpath(
                '//ul[@class="no-price"]/li/span/text()').extract()[0]
        except:
            pass
        if not price:
            try:
                price = response.xpath(
                    '//span[@class="price left"]/text()').extract()[0]
            except:
                pass
        # Collect "label|value" attribute pairs.
        try:
            for i in response.xpath('//div[@class="list-desc h"]/ul/li'):
                k = i.xpath('label/text()').extract()[0]
                v = i.xpath('span/text()').extract()[0]
                str = k + '|' + v  # NOTE(review): shadows builtin `str`
                attrs_kv.append(str)
        except:
            pass
        imgs = []
        try:
            imgs = response.xpath(
                '//ul[@class="swiper-wrapper"]/li/img/@data-src').extract(
                )
        except:
            pass
        try:
            thumb = imgs[0]
        except:
            pass
        try:
            thumb_1 = imgs[1]
        except:
            pass
        try:
            thumb_2 = imgs[2]
        except:
            pass
        try:
            # Category levels come from an internal classification API keyed
            # on the title (the breadcrumb selectors are kept disabled below).
            # cate_name_1 = response.xpath('//section[@class="mianbaoxie"]/a[1]/text()').extract()[0]
            # cate_name_2 = response.xpath('//section[@class="mianbaoxie"]/a[2]/text()').extract()[0]
            # cate_name_3 = response.xpath('//section[@class="mianbaoxie"]/a[3]/text()').extract()[0]
            rsp = requests.post('http://192.168.14.1:8000/pre_api/',
                                data={'title': title})
            rsp = json.loads(rsp.text)["data"]
            cate_name_1 = rsp[0]
            cate_name_2 = rsp[1]
            cate_name_3 = rsp[2]
        except:
            pass
        try:
            com_name = response.xpath(
                '//li[@class="last"]/span/a/text()').extract()[0]
            com_url = response.xpath(
                '//li[@class="last"]/span/a/@href').extract()[0]
        except:
            pass
        # Company address and contact person / mobile number.
        try:
            for i in response.xpath('//div[@class="list-desc"]/ul/li'):
                if i.xpath('a'):
                    if u'地区' == i.xpath('a/label/text()').extract()[0]:
                        com_addr = i.xpath('a/span/text()').extract()[0]
                if i.xpath('label') and u'联系' == i.xpath(
                        'label/text()').extract()[0]:
                    seller = i.xpath('span/text()').extract()[0].replace(
                        u'\xa0', '')
                    mobile = i.xpath('span/a/text()').extract()[0]
        except:
            pass
        doc = pyquery.PyQuery(response.text)
        detail_doc = ''
        try:
            detail_doc = doc('.limit-height')
            # Re-host every embedded image, rewriting its src.
            for i in detail_doc('img').items():
                src = i.attr('src')
                # NOTE(review): when src is missing the node is removed but
                # shuffle_image_push is still called with src=None.
                if not src:
                    i.remove()
                upyun_pic = shuffle_image_push(response.url, src)
                i.attr('src', upyun_pic)
        except:
            pass
        detail = detail_doc.outer_html()
        # Append company/contact/address footer to the detail HTML.
        detail = detail + u'<p>%s</p><p>联系人:%s</p><p>企业地址:%s</p>' % (
            com_name, seller, com_addr)
        # Re-host the three thumbnails, best effort.
        if thumb:
            try:
                thumb = shuffle_image_push(response.url, thumb)
            except:
                pass
        if thumb_1:
            try:
                thumb_1 = shuffle_image_push(response.url, thumb_1)
            except:
                pass
        if thumb_2:
            try:
                thumb_2 = shuffle_image_push(response.url, thumb_2)
            except:
                pass
        goods_data = {
            'source_url': source_url,
            'title': title,
            'price': price,
            'min_price': min_price,
            'max_price': max_price,
            'price_unit': price_unit,
            'min_amount': min_amount,
            'keywords': keywords,
            'brand': brand,
            'to_area': to_area,
            'from_area': from_area,
            'attrs_kv': attrs_kv,
            'cate_name_1': cate_name_1,
            'cate_name_2': cate_name_2,
            'cate_name_3': cate_name_3,
            'thumb': thumb,
            'thumb_1': thumb_1,
            'thumb_2': thumb_2,
            'detail': detail,
            'com_name': com_name,
            'com_addr': com_addr,
            'seller': seller,
            'telephone': telephone,
            'mobile': mobile,
            'qq': qq,
            'ww': ww,
            'wechat': wechat,
            'fax': fax,
            'com_url': com_url,
            'update_time': datetime.datetime.now().strftime('%Y-%m-%d'),
            'sendtime': '',
            'com_username': '',
            'send_money': '',
            'offer_num': '',
            'auth': ''
        }
        # Chain to the company page, carrying the goods data along in meta.
        try:
            yield scrapy.Request(url=com_url,
                                 meta={"goods_data": goods_data},
                                 callback=self.parse2)
        except:
            pass
def get_article():
    """Fetch a fixed WeChat public-account article and return its body text.

    Downloads the page, parses it with pyquery, and extracts the text of
    the ``#js_content`` element (the article body container).
    """
    url = 'https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA'
    page = pyquery.PyQuery(requests.get(url).text)
    return page('#js_content').text()
def test1():
    """Return the body text of a fixed WeChat article (#js_content)."""
    html = requests.get(
        "https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA").text
    return pyquery.PyQuery(html)("#js_content").text()
def get_artical(url):
    """Download *url* and return the text of its ``#js_content`` element."""
    page = pyquery.PyQuery(requests.get(url).text)
    return page('#js_content').text()
def request_shiyanbin_schedule():
    """Fetch the schedule page and return its ``<table>`` selection.

    Returns None (implicitly, as in the original) when the HTTP request
    does not succeed.
    """
    resp = requests.get(SHIYANBIN_SCHEDULE_LINK)
    if not resp.ok:
        return None
    html = resp.content.decode('utf-8')
    return pyquery.PyQuery(html)('table')
def test_repo():
    """The r1 repo page shows both the packages and dependencies sections."""
    page = pyquery.PyQuery(server.repo('r1'))
    for heading in ('provided packages', 'dependencies'):
        assert page.find('h3:contains("%s")' % heading)
def parse_company3(self, response):
    """Fill ``com_data`` from an hc360 company-intro page and emit an Item.

    Walks the <li> rows of ``article.intro-list ul``; each row is a
    label (``.c-left``) / value (``.c-right``) pair mapped onto a field
    of ``com_data``. Fields guarded with ``and not com_data[...]`` are
    only filled when a previous parse stage left them empty.

    NOTE(review): indentation reconstructed from a whitespace-mangled
    source — verify against the original.
    """
    goods_data = response.meta["goods_data"]
    com_word = response.meta["com_word"]  # company subdomain keyword
    com_data = response.meta["com_data"]
    print(response.url)
    content_js = response.text
    doc = pyquery.PyQuery(content_js)
    aa = doc('article.intro-list ul')
    for i in aa('li').items():
        # Label -> com_data field mapping (labels are the site's Chinese
        # field names, e.g. main products, industry, company type, ...).
        if i('.c-left').text() == u'主营产品或服务' and not com_data["product"]:
            com_data["product"] = i('.c-right').text()
        if i('.c-left').text(
        ) == u'主营行业' and not com_data["main_industry"]:
            com_data["main_industry"] = i('.c-right').text()
        if i('.c-left').text() == u'企业类型':
            com_data["comtype"] = i('.c-right').text()
        if i('.c-left').text() == u'经营模式' and not com_data["busmode"]:
            com_data["busmode"] = i('.c-right').text()
        if i('.c-left').text() == u'注册地址':
            com_data["com_reg_addr"] = i('.c-right').text()
        if i('.c-left').text() == u'经营地址' and not com_data["address"]:
            com_data["address"] = i('.c-right').text()
        if i('.c-left').text() == u'公司成立时间' and not com_data["regyear"]:
            com_data["regyear"] = i('.c-right').text()
        if i('.c-left').text() == u'法定代表人/负责人' and not com_data["ceo"]:
            com_data["ceo"] = i('.c-right').text()
        if i('.c-left').text() == u'员工人数':
            com_data["employ"] = i('.c-right').text()
        if i('.c-left').text() == u'年营业额':
            com_data["annulsale"] = i('.c-right').text()
        if i('.c-left').text() == u'经营品牌':
            com_data["brand_name"] = i('.c-right').text()
        if i('.c-left').text() == u'注册资本' and not com_data["regcapital"]:
            com_data["regcapital"] = i('.c-right').text()
        if i('.c-left').text() == u'主要客户群':
            com_data["customer"] = i('.c-right').text()
        if i('.c-left').text() == u'主要市场':
            com_data["main_addr"] = i('.c-right').text()
        if i('.c-left').text() == u'是否提供OEM服务':
            com_data["OEM"] = i('.c-right').text()
        if i('.c-left').text() == u'研发部门人数':
            com_data["rdnum"] = i('.c-right').text()
        if i('.c-left').text() == u'厂房面积':
            com_data["com_area"] = i('.c-right').text()
        if i('.c-left').text() == u'质量控制':
            com_data["qc"] = i('.c-right').text()
        if i('.c-left').text() == u'管理体系认证':
            com_data["management_system"] = i('.c-right').text()
        if i('.c-left').text() == u'认证信息' and not com_data["com_auth"]:
            com_data["com_auth"] = i('.c-right').text()
        if i('.c-left').text() == u'开户银行':
            com_data["bank_type"] = i('.c-right').text()
    # Placeholder 'null' registered capital means capital verification waived.
    if 'null' in com_data["regcapital"]:
        com_data["regcapital"] = u'无需验资'
    com_data[
        "source_url"] = 'http://' + com_word + '.wx.hc360.com/shop/show.html'
    # Only emit an item when the goods detail was actually parsed.
    if goods_data["detail"]:
        Item = HuicongGoodsFenbuItem()
        Item["com_data"] = com_data
        Item["goods_data"] = goods_data
        yield Item
pass if WEB_SCRAPING: try: with urllib.request.urlopen(STANDINGS_URL) as url: with open(os.path.join(DATA_PATH, 'standings.html'), 'wb') as f: f.write(url.read()) except: print('Failed web scraping ' + STANDINGS_URL) exit() users = [] try: path = os.path.join(DATA_PATH, 'standings.html') with open(path) as f: standings = pyquery.PyQuery(f.read()) except: print('Not found ' + path) exit() PM = urllib.parse.parse_qs( urllib.parse.urlparse( standings.find('.bodySubtitle a').attr('href')).query)['pm'][0] for e in standings.find('.stat tr:nth-child(n+3)'): e = pyquery.PyQuery(e).find('td:nth-child(2)') user_name = e.text() user_id = urllib.parse.parse_qs( urllib.parse.urlparse(e.find('a').attr('href')).query)['cr'][0] users.append((user_name, user_id)) if WEB_SCRAPING: for user_name, user_id in users:
def parseHTML(collect):
    """Parse a saved PTT article ('ptt.html') into dicts appended to *collect*.

    Extracts author, title, publish time, article body and each push
    (comment), then pickles the accumulated *collect* list to
    'crawler.pickle'.

    NOTE(review): the filter lambdas reference the bare name ``this`` —
    pyquery injects ``this`` (the current element) into the callback's
    scope; confirm the installed pyquery version supports this pattern.
    NOTE(review): indentation reconstructed from a whitespace-mangled source.
    """
    with open('ptt.html', 'r', encoding='utf-8') as f:
        html = f.read()
    dom = pyquery.PyQuery(html)
    item = {}
    # Author line looks like "someid (Some Name)"; split id from name.
    author = dom('span.article-meta-tag').filter(lambda i: pyquery.PyQuery(
        this).text() == '作者').siblings('span.article-meta-value').text()
    authorID = author.split()[0]
    authorName = ''.join(author.split()[1:])
    authorName = authorName.replace(')', '').replace('(', '')
    item['authorID'] = authorID
    item['authorName'] = authorName
    # Title: drop the leading "[category] " prefix if present.
    title = dom('span.article-meta-tag').filter(lambda i: pyquery.PyQuery(
        this).text() == '標題').siblings('span.article-meta-value').text()
    title = title.split('] ')[-1]
    item['title'] = title
    publish_time = dom('span.article-meta-tag').filter(
        lambda i: pyquery.PyQuery(this).text() == '時間').siblings(
            'span.article-meta-value').text()
    publish_time = publish_time.split('] ')[-1]
    item['publish_time'] = publish_time
    print('content---------------')
    # Article body: find the child of #main-content containing the time
    # header and take everything after its first line.
    contents = dom('div#main-content').children().items()
    final_content = ''
    for content in contents:
        if '時間' in content.children().text():
            final_content = '\n'.join(content.__str__().split('\n')[1:])
            item['time'] = final_content
            break
    print('final content', final_content)
    # One dict per push (comment), merged with the article-level fields.
    pushes = dom('div.push').items()
    for push in pushes:
        item2 = {}
        item2['userID'] = push.children('span.push-userid').text()
        item2['userComment'] = push.children('span.push-content').text()
        # push-ipdatetime is "ip date time"; keep everything after the ip.
        item2['userTime'] = ''.join(
            push.children('span.push-ipdatetime').text().split(' ')[1:])
        item2.update(item)
        collect.append(item2)
    with open('crawler.pickle', 'wb') as f:
        pickle.dump(collect, f, protocol=pickle.HIGHEST_PROTOCOL)
def parse(self, response):
    """Parse an hc360 product page into ``goods_data`` (Python 2 code).

    Extracts title/price/attributes/images from the page, then executes
    the page's inline JS ``var`` assignments via ``exec`` to recover
    fields like keywords, brand and supplyInfoJson, sanitises the detail
    HTML (re-hosting images), and finally checks MySQL to decide whether
    to crawl the company page or emit the item directly.

    NOTE(review): this block is Python 2 (`print` statements,
    `exec ... in locals()`, `urllib.unquote`) and was reconstructed from
    a whitespace-mangled, partially redacted source — several spots are
    flagged inline. Executing page JS via exec is dangerous on untrusted
    input.
    """
    print(response.url)
    # Initialise all output fields.
    title = ""
    price = ""
    offer_num = ""
    send_time = ""
    send_money = ""
    com_name = ""
    buy_sell_num = ""
    com_addr = ""
    auth = ""
    com_url = ""
    mobile = ""
    telephone = ""
    seller = ""
    attrs_kv = []
    detail = ""
    thumb_1 = ""
    thumb_2 = ""
    thumb = ""
    cate_name_1 = ""
    cate_name_2 = ""
    cate_name_3 = ""
    min_price = max_price = 0
    price_unit = ''
    content = data = ''
    if response.xpath('//h1[@class="proTitle"]/text()'):
        try:
            try:
                title = response.xpath(
                    '//h1[@class="proTitle"]/text()').extract()[0]
            except:
                pass
            try:
                price = response.xpath(
                    '//div[@class="topPriceRig"]/text()').extract()[1]
            except:
                pass
            if not price:
                # Price range variant: "¥a - ¥b".
                try:
                    price = response.xpath(
                        '//div[@class="topPriceRig"]/text()').extract()[0]
                    mprice = price.replace('\r', '').replace(
                        '\n', '').replace('\t', '').replace(' ', '').split('-')
                    min_price = mprice[0].strip().replace(u'¥', '')
                    max_price = mprice[1].strip().replace(u'¥', '')
                except:
                    pass
            if not price:
                try:
                    price = response.xpath(
                        '//div[@class="topPriceRig telBra"]/text()'
                    ).extract()[0]
                except:
                    pass
            try:
                price = price.replace('\r', '').replace('\n', '').replace(
                    '\t', '').replace(' ', '')
            except:
                pass
            try:
                if u'¥' in price:
                    price = price.replace(u'¥', '')
            except:
                pass
            try:
                offer_num = response.xpath(
                    '//span[@class="supply-numb"]/text()').extract()[0]
            except:
                pass
            # Shipping deadline row.
            try:
                for i in response.xpath('//div[@class="item-row-w"]'):
                    row = i.xpath('string(.)')
                    if u'发货期限' in row[0].extract():
                        send_time = i.xpath('text()').extract()[1]
                        send_time = send_time.replace('\r', '').replace(
                            '\n', '').replace('\t', '').replace(' ', '')
            except:
                pass
            try:
                buy_sell_num = response.xpath(
                    '//li[@class="line-btm"]/div/a/text()').extract()[0]
            except:
                pass
            # Company name / address / auth info / url.
            try:
                com_name = response.xpath(
                    '//div[@class="comply-name"]/p/a/text()').extract()[0]
                for i in response.xpath(
                        '//div[@class="item-mmt-txt"]/ul/li'):
                    row = i.xpath('string(.)')
                    if u'所在地区' in row[0].extract():
                        com_addr = i.xpath('div/p/text()').extract()[0]
                    if u'认证信息' in row[0].extract():
                        try:
                            auth = i.xpath('div/a/text()').extract()[0]
                        except:
                            auth = i.xpath('div/text()').extract()[0]
                com_url = response.xpath(
                    '//p[@class="cName"]/a/@href').extract()[0]
            except:
                pass
            # Contact numbers; leading character stripped ([1:]).
            try:
                mobile = response.xpath(
                    '//em[@class="c-red"]/text()').extract()[0][1:]
                telephone = response.xpath(
                    '//div[@class="p tel1"]/em/text()').extract()[0]
                telephone = telephone[1:].split(' ')[0]
                if not seller:
                    seller = response.xpath(
                        '//div[@class="p name"]/em/text()').extract(
                        )[0][1:]
            except:
                pass
            # "key|value" product parameters.
            try:
                for i in response.xpath(
                        '//div[@class="d-vopy parameter "]/ul/li'):
                    key = i.xpath('span/text()').extract()[0].replace(
                        '\r', '').replace('\n', '').replace('\t', '').replace(' ', '')[:-1]
                    value = i.xpath('p/text()').extract()[0].replace(
                        '\r', '').replace('\n', '').replace('\t', '').replace(' ', '')
                    str = key + '|' + value  # NOTE(review): shadows builtin `str`
                    attrs_kv.append(str)
            except:
                pass
            # Thumbnails hide in the `rel` attribute as "largeimage: '...'".
            try:
                thumb = response.xpath(
                    '//ul[@id="thumblist"]/li[1]/div/a/@rel').extract()[0]
                thumb = re.findall(r"largeimage: '(.*?)'", thumb)[0]
                thumb_1 = response.xpath(
                    '//ul[@id="thumblist"]/li[2]/div/a/@rel').extract()[0]
                thumb_1 = re.findall(r"largeimage: '(.*?)'", thumb_1)[0]
                thumb_2 = response.xpath(
                    '//ul[@id="thumblist"]/li[3]/div/a/@rel').extract()[0]
                thumb_2 = re.findall(r"largeimage: '(.*?)'", thumb_2)[0]
            except:
                pass
            # Category names embedded in page JSON.
            try:
                json_data = re.findall(r'"supCatClass":(.*?),"supcatId"',
                                       response.text)[0]
                json_data = json.loads(json_data)
                cate_name_1 = json_data[0]["catName"]
                cate_name_2 = json_data[1]["catName"]
                cate_name_3 = json_data[2]["catName"]
            except:
                pass
        except:
            pass
    # Execute the page's inline JS `var` assignments in this frame's locals
    # to recover variables such as supplyInfoJson, productWord, contactor...
    ss = response.xpath('//script/text()').extract()
    update_time = ''
    keys = []
    for i in ss:
        text = i
        for j in text.split('var'):
            keys.append(j.strip())
    for i in keys:
        i = i.replace('null', 'None').replace('false',
                                              'False').replace('true', 'True')
        if i:
            try:
                exec i in locals()
            except Exception as e:
                pass
    try:
        com_username = company_username.decode('utf-8')
    except:
        com_username = ''
    # Keywords: try several JS variable names in order of preference.
    try:
        keywords = productWord
    except:
        try:
            keywords = searchVal
        except:
            try:
                keywords = urllib.unquote(keywordencode).decode('gbk')
            except:
                keywords = ''
    try:
        keywords = keywords.decode('utf-8')
    except:
        pass
    try:
        update_time = supplyInfoJson['pubDate'].split(' ')[0]
    except:
        # Fallback: pretend it was published 30 days ago.
        update_time = (datetime.datetime.now() -
                       datetime.timedelta(30)).strftime('%Y-%m-%d')
    try:
        brand = supplyInfoJson['brandName']
    except:
        brand = ''
    try:
        brand = brand.decode('utf-8')
    except:
        pass
    try:
        businAttList = supplyInfoJson['businAttList']
    except:
        businAttList = []
    from_area = ''
    if businAttList:
        for i in businAttList:
            if i['attname'] == '产地':
                from_area = i['attvalue']
            if not brand:
                if i['attname'] == '品牌':
                    brand = i['attvalue']
    try:
        from_area = from_area.decode('utf-8')
    except:
        pass
    try:
        seller = companyContactor
    except:
        try:
            seller = contactor
        except:
            pass
    try:
        fax = companyJson['fax']
    except:
        fax = ''
    to_area = qq = ww = wechat = ''
    try:
        detail = supplyInfoJson['introduce']
        detail = detail.decode("utf-8")
    except:
        pass
    # Boilerplate detail text -> re-extract from the rendered #introduce div.
    if u'质量保证,欢迎咨询洽谈' in detail or not detail:
        my_doc = pyquery.PyQuery(response.text)
        my_doc = my_doc("#introduce")
        detail = my_doc.outer_html()
    if detail:
        try:
            doc = pyquery.PyQuery(detail)
        except:
            pass
        print "start up upyun detail"
        # Strip hc360 self-links.
        try:
            for i in doc('a').items():
                if i.attr('href') and 'hc360' in i.attr('href'):
                    i.remove()
        except:
            pass
        # Re-host detail images; drop placeholders and hc360-hosted ones.
        try:
            for i in doc('img').items():
                try:
                    if i.attr('data-ke-src'):
                        i.attr('data-ke-src', '')
                except:
                    pass
                src = i.attr('src')
                try:
                    if 'hc360' not in src or 'no_pic' in src or 'nopic' in src:
                        i.remove()
                        continue
                except:
                    pass
                try:
                    # Replace placeholder thumbs with a real detail image.
                    if thumb and 'no_pic' in thumb:
                        thumb = src
                    if thumb and 'nopic' in thumb:
                        thumb = src
                except:
                    pass
                upyun_pic = ''
                try:
                    upyun_pic = image_push(response.url, src)
                except:
                    pass
                if 'hc360' in upyun_pic:
                    i.remove()
                    continue
                i.attr('src', upyun_pic)
        except:
            pass
        # Second pass: scrub leftover hc360 images and editor attributes.
        try:
            for i in doc('img').items():
                if i.attr('src'):
                    src = i.attr('src')
                    if 'hc360' in src or '//' == src:
                        i.remove()
                if i.attr('data-ke-src'):
                    i.remove_attr('data-ke-src')
                if i.attr('data-mce-src'):
                    i.remove_attr('data-mce-src')
                if i.attr('data-cke-saved-src'):
                    i.remove_attr('data-cke-saved-src')
        except:
            pass
        # Final scrub of any element still pointing at hc360.
        try:
            for i in doc('*').items():
                if i.attr('src') and 'hc360' in i.attr('src'):
                    i.attr('src', '')
                if i.attr('data-tfs-url'):
                    i.attr('data-tfs-url', '')
                if i.attr('data-url'):
                    i.attr('data-url', '')
        except:
            pass
        detail = doc.outer_html()
        try:
            detail = detail.replace('<div style="overflow:hidden;">',
                                    '<div>')
        except:
            pass
    if detail and u'正在加载' in detail:
        detail = ''  # "loading" placeholder means no real detail
    try:
        min_amount = int(
            response.xpath('//tr[@class="item-cur-tab"]/td/text()').
            extract()[0].split('-')[0].strip())
    except:
        min_amount = 1
    try:
        price = re.search(r'\d+\.?\d+', price).group()
    except:
        price = 0
    if not min_price:
        min_price = price
    if not max_price:
        max_price = price
    if offer_num:
        # Split "123件" into count + unit.
        try:
            res = re.search(r'(\d+)(.+)',
                            offer_num.replace(' ', '')).groups()
            offer_num = res[0]
            if len(res) > 1:
                price_unit = res[1]
        except:
            pass
    print "start up upyun thumb"
    if thumb:
        thumb = image_push(response.url, thumb)
        if 'hc360' in thumb:
            thumb = ''
    if thumb_1:
        thumb_1 = image_push(response.url, thumb_1)
        if 'hc360' in thumb_1:
            thumb_1 = ''
    if thumb_2:
        thumb_2 = image_push(response.url, thumb_2)
        if 'hc360' in thumb_2:
            thumb_2 = ''
    goods_data = {
        'source_url': response.url,
        'title': title,
        'price': price,
        'offer_num': offer_num,
        'send_time': send_time,
        'send_money': send_money,
        'com_name': com_name,
        'com_addr': com_addr,
        'auth': auth,
        'com_url': com_url,
        'mobile': mobile,
        'telephone': telephone,
        'seller': seller,
        'attrs_kv': attrs_kv,
        'detail': detail,
        'thumb_1': thumb_1,
        'thumb_2': thumb_2,
        'thumb': thumb,
        'cate_name_1': cate_name_1,
        'cate_name_2': cate_name_2,
        'cate_name_3': cate_name_3,
        'update_time': datetime.datetime.now().strftime('%Y-%m-%d'),
        'com_username': com_username,
        'keywords': keywords,
        'min_amount': min_amount,
        'min_price': min_price,
        'max_price': max_price,
        'price_unit': price_unit,
        'brand': brand,
        'to_area': to_area,
        'from_area': from_area,
        'qq': qq,
        'ww': ww,
        'fax': fax,
        'wechat': wechat,
    }
    # Get the company url and check whether the company was already crawled.
    com_url = ""
    try:
        com_url = response.xpath(
            '//p[@class="cName"]/a/@href').extract()[0]
    except:
        pass
    if not com_url:
        try:
            com_url = response.xpath(
                '//div[@class="goods-tit goods-tit-blue"]/a/@href'
            ).extract()[0]
        except:
            pass
    # Extract the company's subdomain keyword.
    reg = 'http://(.*?).b2b.hc360.com'
    com_word = re.findall(reg, com_url)[0]
    print(" ")
    test_com_url = 'http://' + com_word + '.wx.hc360.com/shop/show.html'
    # NOTE(review): credentials below were redacted ('******') in the source.
    conn = pymysql.connect(host='192.168.14.90',
                           port=3306,
                           user='******',
                           passwd='123456',
                           db='hc360',
                           charset='utf8')
    cursor = conn.cursor()
    # NOTE(review): string-formatted SQL — injectable; use parameters.
    cursor.execute(
        "select * from com_tmp where url = '{}'".format(test_com_url))
    conn.commit()
    result = cursor.fetchone()
    if not result:
        # Company not crawled yet: record it.
        # NOTE(review): the SQL below is corrupted in the source
        # ("in sert ... v alues"); it was almost certainly
        # "insert into com_tmp (url) values ('{}')".
        try:
            cursor.execute(
                "in sert into com_tmp (url) v alues ('{}')".format(
                    test_com_url))
            conn.commit()
        except:
            pass
        cursor.close()
        conn.close()
        # Crawl the company contact page; com_data joins goods_data in the
        # Item downstream (handled by the mongo pipeline).
        # NOTE(review): the username query value was redacted ('******') in
        # the source — restore it (likely com_word) before running.
        url_1 = "http://detail.b2b.hc360.com/detail/turbine/template/moblie,vmoblie,getcontact_us.html?username="
        try:
            yield scrapy.Request(url_1,
                                 meta={
                                     "goods_data": goods_data,
                                     "com_word": com_word
                                 },
                                 callback=self.parse_company)
        except:
            pass
    else:
        cursor.close()
        conn.close()
        # Company already crawled: emit the item with empty com_data.
        if goods_data["detail"]:
            Item = HuicongGoodsFenbuItem()
            Item["goods_data"] = goods_data
            Item["com_data"] = ""
            yield Item
def test_pkg_external():
    """The page for an external package carries the expected heading."""
    page = pyquery.PyQuery(server.pkg('six'))
    assert page.find('h2:contains("external package: six")')
def test_pkg_disambituate():
    """A package in two repos lists both repo links, in order."""
    page = pyquery.PyQuery(server.pkg('pkg2'))
    repo_links = page.find('a[href^="/repo/"]')
    assert len(repo_links) == 2
    expected = ('/repo/r2.htm', '/repo/r5.htm')
    for idx, href in enumerate(expected):
        assert repo_links.eq(idx).attr('href') == href
def test_pkg_redirect():
    """A single-repo package page immediately meta-refreshes to that repo."""
    page = pyquery.PyQuery(server.pkg('pkg1'))
    refresh = page.find('meta[http-equiv="refresh"]')
    assert refresh.attr('content') == '0;/repo/r1.htm'
'''这是一个通过网络请求获得网页内容,使用分词工具对中文字符串 进行分词,统计词频,得出结果,并发送到指定邮箱的程序'''
# Script: fetch a WeChat article, compute the top-100 Chinese word
# frequencies with the project's stats_word helper, and e-mail the result.
import requests
import pyquery
from pyquery import PyQuery
from mymodule import stats_word
# Target article URL.
'''访问网址'''
image_url = "https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA"
# Download the page.
'''将网络中的内容全部赋值给response'''
response = requests.get(image_url)
# Extract the article body text (#js_content).
'''提取网址中的正文内容'''
document = pyquery.PyQuery(response.text)
content = document('#js_content').text()
# Top-100 word/count pairs, concatenated into one string for the mail body.
statList = stats_word.stats_text_cn(content, 100)
statstring = ''.join(str(i) for i in statList)
import getpass
# Prompt for sender / password / recipient (password input is hidden).
sender = input("输入发件人邮箱")
password = getpass.getpass("输入发件人邮箱密码(可复制粘贴):")
recipients = input("输入收件人邮箱")
import yagmail
# Send the result through the 163.com SMTP server.
yag = yagmail.SMTP(user=sender, password=password, host='smtp.163.com')
yag.send(recipients, '19100305 yxying1992主题:张小龙微信公开课演讲稿中文词频前100名统计', statstring)
def monitoring(self): """执行""" #是否有权限 self.local.focuson = True self.local.sayhello = True self.local.addbuddy = True self.local.sendmsg = True #开始监控 count = 0 while True: if count > 0: myqueue.get() self.showMsg(self.local.url + "\t第 %s 次执行完毕,睡眠中等待下一次执行\r" % count) time.sleep(int(self.monitoring_['apart'])) myqueue.put(self.local.url) #获取需要执行的用户 page = 1 userlist = [] try: while True: try: url = self.local.url + "/home.php?mod=space&do=friend&view=online&type=member&page=" + str( page) html = self.local.conn.get(url, headers=self.headers) html.encodeing = self.local.encodeing q = pyquery.PyQuery(html.text.encode().decode('utf8')) # formhashhref = q("a[href*='action=logout']").attr('href')#验证字符串 # if formhashhref: # self.local.formhash = formhashhref.split('=')[-1] li = q('#friend_ul li') for row in li: rowq = pyquery.PyQuery(row) liid = rowq.attr('id') li_uid = liid.split('_')[1] #得到用户uid usertext = rowq('a').eq(2).text() #取出a标签中文本 if li_uid != '1' and usertext == '收听TA': userlist.append(int(li_uid)) #查询页数 try: pageCountstrs = q( 'input[name=custompage]').next().attr('title') if pageCountstrs: pageCount = re.match(".*?([0-9]+).*?", pageCountstrs, re.S) if pageCount: if page < int(pageCount[1]): page += 1 else: break else: break except Exception as e: break except Exception as e: break for uid in userlist: isAdmin = self.isAdmin(uid) if not isAdmin: #排除管理员 # #关注 if self.local.focuson: self.focusOn(uid) #打招呼 if self.local.sayhello: self.sayHello(uid) # 发消息 if self.local.sendmsg: self.sendMsg(uid) #加好友 if self.local.addbuddy: self.addBuddy(uid) if self.local.focuson == False and self.local.sayhello == False and self.local.sendmsg == False and self.local.addbuddy == False: self.showErrorMsg( error, self.local.url + "\t由于此站点没有任何权限,将关闭此站点\r") except Exception as e: pass # print(e,11111111111111111111111111111111) count += 1
def get_article():
    """Return the body text (#js_content) of a fixed WeChat article."""
    response = requests.get(
        'https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA')
    parsed = pyquery.PyQuery(response.text)
    return parsed('#js_content').text()
def login(self, url, loginnum=1):
    """Initialise a session for *url* using saved cookies, then start monitoring.

    Loads cookies from ``cookies/<host>.json``, probes the homepage
    (retrying up to 3 times via recursive self-call on network errors),
    detects the page encoding, extracts the Discuz ``formhash`` token,
    verifies admin-group access, and finally enters ``self.monitoring()``.

    NOTE(review): indentation reconstructed from a whitespace-mangled
    source — in particular, whether the blocks after cookie loading sit
    inside the ``os.path.exists`` branch is an inference; verify.
    """
    # Initialise per-site (thread-local) state.
    print(url, '登录...')
    self.local.url = url
    self.local.formhash = ''
    self.local.conn = requests.Session()
    filejson = self.path + 'cookies/' + self.local.url.replace(
        'http://', '').replace('https://', '') + '.json'
    if os.path.exists(filejson):
        try:
            with open(filejson, 'r', encoding='utf8') as f:
                listCookies = json.loads(f.read())
        except Exception as e:
            self.showErrorMsg(cookielog, self.local.url + "\tcookie文件格式错误\r")
        # Install the cookies into the session (this is the "login").
        try:
            for cookie in listCookies:
                self.local.conn.cookies.set(cookie.get('name'),
                                            cookie.get('value'))
        except Exception as e:
            self.showErrorMsg(
                cookielog, self.local.url + "\tcookie设置错误,请更换cookie文件\r")
    # Probe the homepage once.
    try:
        getlogin = self.local.conn.get(self.local.url, headers=self.headers)
        time.sleep(1)
        # Follow any domain redirect: keep scheme + host only.
        urllist = getlogin.url.split('/')
        self.local.url = urllist[0] + '/' + urllist[1] + '/' + urllist[2]
        if getlogin.status_code != 200:
            self.showErrorMsg(
                loginerror, self.local.url + "\t此网站无法正常访问,返回错误:" +
                str(getlogin.status_code) + "\r")
    except Exception as e:
        # Retry up to 3 times by recursing; then give up with an error.
        if loginnum > 3:
            self.showErrorMsg(
                loginerror,
                self.local.url + "\t此网站无法正常访问,返回错误:" + str(e) + "\r")
        else:
            self.login(self.local.url, loginnum + 1)
    # Detect the site's declared content encoding (default utf8).
    encodeing = 'utf8'
    try:
        encodeing = requests.utils.get_encodings_from_content(
            getlogin.text)[0]
    except Exception as e:
        pass
    # Extract the Discuz anti-CSRF formhash token.
    try:
        getlogin.encodeing = encodeing
        q = pyquery.PyQuery(getlogin.text.encode().decode('utf8'))
        self.local.formhash = q("input[name='formhash']").val()
    except Exception as e:
        pass
    self.local.encodeing = encodeing
    # Prepare the outgoing message in the site's encoding.
    try:
        # NOTE(review): `is 'utf8'` is identity comparison on a string
        # literal — unreliable; should be `== 'utf8'`.
        if encodeing is 'utf8':
            self.local.message = self.monitoring_['msg']
        else:
            self.local.message = self.monitoring_['msg'].encode(
                'utf8').decode('utf-8').encode('gbk')
    except Exception as e:
        self.local.message = self.monitoring_['msg']
    try:
        # (disabled in source: log-file opening and usergroup-based
        #  login verification used to live here)
        pass
    except Exception as e:
        print(e)
    # Verify admin-group access; without it the site is shut down for safety.
    try:
        adminGroupList = self.getAdminGroup()
        if not adminGroupList:
            # Quarantine the cookie file and report.
            shutil.copyfile(
                filejson, self.path + 'cookie2/' + self.local.url.replace(
                    'http://', '').replace('https://', '') + '.json')
            self.showErrorMsg(
                loginerror,
                self.local.url + '\t无法获取管理组,为了安全将关闭此站点的一切操作!\r')
        else:
            # Open the per-site log and record this session start.
            self.local.log = open(
                self.path + 'log/' + self.local.url.replace(
                    'http://', '').replace('https://', '') + '.log',
                'a',
                encoding='utf8')
            self.local.log.write("\r\r\r" +
                                 time.strftime('%Y-%m-%d %H:%M:%S') + '\t' +
                                 self.local.url + "\r")
            self.showSuccessMsg(self.local.url + '\t管理组:' +
                                str(adminGroupList) + '\r')
            time.sleep(1)
    except Exception as e:
        print(e)
    myqueue.put(self.local.url)
    # Hand off to the monitoring loop.
    self.monitoring()
def parse_page(data: bytes):
    """Decode raw page bytes and return a ``(title, content)`` pair.

    Decodes with the module-level ``encoding``, then pulls the title from
    'strong font' and the content from all '<p>' elements via pyquery.
    """
    markup = pyquery.PyQuery(data.decode(encoding))
    return (markup('strong font').text(), markup('p').text())
def test_index():
    """The index page exposes the repos, packages and external sections."""
    page = pyquery.PyQuery(server.index())
    for selector in ('#repos', '#packages', '#external'):
        assert page.find(selector)