def GetDouBanMovie():
    a = 1
    # Append every entry to blog.txt; open the file once instead of once per page.
    with open('blog.txt', mode='a+') as f:
        for i in range(0, 250, 25):
            url = "https://blog.csdn.net/nav/blockchain"
            r = requests.get(url)
            for blog in PyQuery(r.content)(".list_con"):
                title = PyQuery(blog).find(".csdn-tracking-statistics").find('a').html()
                title = title.replace('\n', '')  # str.replace returns a new string, so reassign
                num = PyQuery(blog).find(".num").html()
                s = "%s: 博客:%s 阅读量:%s \n" % (a, title, num)  # "博客" = blog title, "阅读量" = view count
                f.write(s)
                a += 1

async def search(self, search_term):
    # First try finding crates with the search term as brand name
    webpage = await self.get_search({'zoeken': 'true', 'merk': search_term.replace(" ", "-"),
                                     'kratten': 'krat-alle', 'sorteer': 'prijs-oplopend'})
    url = PyQuery(webpage)('a.merkenUrl').attr('href')
    if url is None:
        # Try finding crates with the search term as search term
        webpage = await self.get_search({'zoeken': 'true', 'zoek': search_term.replace(" ", "+"),
                                         'kratten': 'krat-alle', 'sorteer': 'prijs-oplopend'})
        url = PyQuery(webpage)('a.merkenUrl').attr('href')
    if url is None:
        # Try finding other offers (not crates) with the search term as brand name
        webpage = await self.get_search({'zoeken': 'true', 'merk': search_term.replace(" ", "-"),
                                         'sorteer': 'prijs-oplopend'})
        url = PyQuery(webpage)('a.merkenUrl').attr('href')
    if url is None:
        # Try finding other offers with the search term as search term
        webpage = await self.get_search({'zoeken': 'true', 'zoek': search_term.replace(" ", "+"),
                                         'sorteer': 'prijs-oplopend'})
        url = PyQuery(webpage)('a.merkenUrl').attr('href')
    if url is None:
        # If nothing is found, we throw an exception
        raise ValueError(search_term + ' not found, or not on sale')

    host = "https://www.biernet.nl"
    first_result = PyQuery(PyQuery(webpage)('li.cardStyle')[0])

    # Extract the various offer details from the HTML page
    biernet_url = host + first_result('div.item_image')('a').attr('href')
    image = host + first_result('div.item_image')('a')('img').attr('data-src')
    brand = first_result('h3.merkenH3')('a')[0].text
    product = first_result('p.artikel')('a')[0].text
    product_name = first_result('div.item_image')('a')('img').attr('title')
    original_price = first_result('p.prijs')('span.van_prijs')[0].text
    sale_price = first_result('p.prijs')('span.voor_prijs')[0].text
    sale = PyQuery(first_result('div.informatie')('li.item')[0]).text()
    sale = sale.replace('korting', 'off')
    sale_price_liter = PyQuery(first_result('div.informatie')('li.item')[1]).text()
    end_date = first_result('div.footer-item')('span')[0].text
    end_date = end_date.replace("t/m ", "").strip()
    biernet_shop_url = host + first_result('div.logo_image')('a').attr('href')
    shop_name = biernet_shop_url.split('winkel:')[-1]
    shop_name = shop_name.replace('-', ' ').title()
    shop_image = host + first_result('div.logo_image')('a')('img').attr('data-src')
    shop_url = first_result('a.bestelknop').attr('href')
    if shop_url is None:
        shop_url = biernet_shop_url

    # Percent-encode the URLs so they are safe to pass along elsewhere
    biernet_url = urllib.parse.quote(biernet_url, safe=':/%')
    image = urllib.parse.quote(image, safe=':/%')
    shop_url = urllib.parse.quote(shop_url, safe=':/%')
    shop_image = urllib.parse.quote(shop_image, safe=':/%')

    return {'url': biernet_url,
            'brand': brand,
            'name': product_name,
            'img': image,
            'product': product,
            'shop_name': shop_name,
            'shop_url': shop_url,
            'biernet_shop_url': biernet_shop_url,
            'shop_img': shop_image,
            'original_price': original_price,
            'sale_price': sale_price,
            'sale': sale,
            'PPL': sale_price_liter,
            'end_date': end_date}

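# A small, self-contained check of the shop-name derivation used above; the
# biernet.nl shop URL below is made up for illustration.
biernet_shop_url = 'https://www.biernet.nl/winkel:albert-heijn'
shop_name = biernet_shop_url.split('winkel:')[-1].replace('-', ' ').title()
print(shop_name)  # Albert Heijn
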
def get_img_urls(content):
    if not content:
        return []
    url_list = []
    doc = PyQuery(content)
    nodeList = doc('li.tab-trigger > div.vertical-img > a.box-img > img')
    for node in nodeList:
        url = PyQuery(node).attr('src')
        if not url:
            continue
        if url.find('60x60') > 0:
            url = url.replace('60x60', '400x400')  # swap the thumbnail for the full-size image
        url_list.append(url)
    needDescImg = True
    if needDescImg:
        link_url = doc('div#desc-lazyload-container').attr('data-tfs-url')
        if not link_url:
            return url_list
        desc_content = fetchPageWithUrl(link_url)
        # lazy (non-greedy) matching
        imgNodes = re.findall('<img[^<>]*>.*?', desc_content)
        # desc_content = re.sub('var[\s]*offer_details[\s]*=[\s]*', '', desc_content)
        for node in imgNodes:
            nodeQ = PyQuery(node)
            desc_url = nodeQ('img').attr('src')
            if desc_url:
                desc_url = desc_url.replace('\\"', '')
            if not desc_url:
                continue
            if 'gif' in desc_url:
                # skip GIF images
                continue
            # if '//gd' in desc_url or '/2015/' in desc_url:
            url_list.append(desc_url)
    return url_list

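# A minimal, self-contained check of get_img_urls(); the HTML below is a made-up
# fragment mimicking the gallery markup the function expects. No description page
# is fetched because the fragment has no div#desc-lazyload-container element.
from pyquery import PyQuery

sample_html = '''
<ul>
  <li class="tab-trigger"><div class="vertical-img"><a class="box-img">
    <img src="https://img.example.com/item.60x60.jpg"/>
  </a></div></li>
</ul>
'''
print(get_img_urls(sample_html))  # ['https://img.example.com/item.400x400.jpg']
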
def analysisPage(response):
    if response.status_code:
        global photos
        photos = re.findall(' zoomfile="(.*?)" ', response.text)  # image URLs
        folderName = PyQuery(response.text)("span#thread_subject").text()  # thread title
        for ch in r'\/:|<>?*"':
            folderName = folderName.replace(ch, ' ⁂ ')  # replace characters that are illegal in file names
        downHtml(response, folderName)  # save the single page locally for easier viewing
        text = PyQuery(response.text)
        # print(text)
        formhash = text("input[name='formhash']").attr('value')
        # print(formhash)
        try:
            urlPay = text("td[class='t_f'] ignore_js_op span a").attr('href')
            # print(urlPay)
            aid, tid = re.findall(r'(\d+)', urlPay)
        except TypeError:
            urlPay = text("ignore_js_op .attnm a").attr('href')
            # print(urlPay)
            aid, tid = re.findall(r'(\d+)', urlPay)
        except ValueError:
            print('该资源已经解析过了'.center(72, '-'))  # this resource has already been parsed
            aid, tid = None, None
        except Exception:
            aid, tid = None, None
        return {
            'folderName': folderName,
            'formhash': formhash,
            'aid': aid,
            'tid': tid
        }
    print('当前网络不可用')  # the network is currently unavailable
    return None

def get_bounds(scene_name):
    """Use Earth Explorer metadata to get bounds of a Scene"""
    url_code = get_metadata_code(scene_name)
    metadata = PyQuery(
        'http://earthexplorer.usgs.gov/fgdc/%s/%s/' % (url_code, scene_name)
    )
    metadata = metadata.text()[
        metadata.text().find('G-Ring_Latitude:'):
        metadata.text().find('\n Keywords:')
    ]
    coords = (
        metadata.replace(' ', '')
        .replace('G-Ring_Latitude:', '')
        .replace('G-Ring_Longitude:', '')
        .split('\n')
    )
    coords = [float(coord) for coord in coords if coord != '']
    # create a list of lists with the coordinates
    coords = [coords[i:i + 2] for i in range(0, len(coords), 2)]
    # use reverse() to change [lat, lon] to [lon, lat]
    [coord.reverse() for coord in coords]
    # repeat the first coordinate on the end of the list
    if coords[0] != coords[-1]:
        coords.append(coords[0])
    return coords

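# Self-contained illustration of the list post-processing in get_bounds(), on
# made-up numbers: flat lat/lon values are paired, flipped to [lon, lat], and
# the ring is closed by repeating the first coordinate.
flat = [10.0, -55.0, 10.0, -53.0, 8.0, -53.0, 8.0, -55.0]
pairs = [flat[i:i + 2] for i in range(0, len(flat), 2)]
[pair.reverse() for pair in pairs]
if pairs[0] != pairs[-1]:
    pairs.append(pairs[0])
print(pairs)  # [[-55.0, 10.0], [-53.0, 10.0], [-53.0, 8.0], [-55.0, 8.0], [-55.0, 10.0]]
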
def get_content_text(content):
    # content = [s.extract() for s in content('style')]
    content_text = PyQuery(str(content)).text()
    content_text = content_text.replace('\r\n', '\n').replace('\r', '\n')
    final_content_text = ''
    for each_text in content_text.split('\n'):
        each_final_text = remove_special_char(each_text).strip()
        if each_final_text != '':
            final_content_text += each_final_text + '\n'
    return final_content_text.strip()

def __init__(self, elem, trims, should_cleanup):
    text = PyQuery(elem).text()
    for trim in (trims or []):
        text = text.replace(trim, '')
    self.rx = re.compile(r'\W+')
    self.text = text.strip()
    self.trimmed_text = non_trimmed.sub(' ', self.text)
    self.html = PyQuery(elem).html()
    if should_cleanup:
        self.html = self.cleanup_html()
    self.normalized_text = nonword.sub('', text.lower())

def get_key_person_info(self, key_person_info):
    key_person_info_dict = {}
    lst_key_person = []
    page = self.get_crawl_page(key_person_info)
    if page is None or page == u'':
        # a failed or aborted crawl yields None; it must not break the parsing below
        return key_person_info_dict
    json_data = util.json_loads(page)
    if json_data is None:
        return key_person_info_dict
    data_arr = json_data.get('data', [])
    if data_arr is None:
        return key_person_info_dict
    for data in data_arr:
        key_person_name_html = data.get('name', '')
        key_person_name = ''
        # strip the HTML markup from the name
        if key_person_name_html != '':
            key_person_name = PyQuery(
                key_person_name_html, parser='html').remove('span').remove('div').text()
        # the position field needs extra handling: it may be rendered as an image
        key_position_temp = data.get('position_CN', '')
        if key_position_temp.find('img') != -1:
            pic_md5 = util.get_match_value('"', '"', key_position_temp)
            m = hashlib.md5()
            m.update(pic_md5.strip().replace('\n', ''))
            psw = m.hexdigest()
            key_position = GsModel.get_md5_key_position(psw)
        else:
            key_position = key_position_temp
        if key_position is None:
            key_position = ''
        key_person = {
            GsModel.KeyPerson.KEY_PERSON_NAME: key_person_name.replace(" ", ""),
            GsModel.KeyPerson.KEY_PERSON_POSITION: key_position,
        }
        lst_key_person.append(key_person)
    if len(lst_key_person) > 0:
        key_person_info_dict[GsModel.KEY_PERSON] = lst_key_person
    return key_person_info_dict

def detail_chapter(test_url, host_url):
    detail_dicts = []
    header = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1 Edg/85.0.4183.83',
        'referer': test_url,
    }
    text = requests.get(test_url, headers=header).text
    comic_title = PyQuery(text)('.detail-main-info-title').text()
    for ch in r'\/:|<.・>?*"':
        comic_title = comic_title.replace(ch, '㇑')  # replace characters that are unsafe in file names
    for li_tag in PyQuery(text)('.detail-list-select li'):
        chapter = PyQuery(li_tag)('a').text()
        for ch in r'\/:|<.・>?*"':
            chapter = chapter.replace(ch, '㇑')  # same file-name sanitizing for the chapter title
        a_href = PyQuery(li_tag)('a').attr('href')
        detail_dict = {
            'chapter': convert(chapter, 'zh-hans'),
            'a_href': host_url + a_href,
            'comic_title': convert(comic_title, 'zh-hans'),
        }
        detail_dicts.append(detail_dict)
    return detail_dicts

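# A self-contained illustration of the sanitizing loop above, using a made-up
# chapter title: every character that is unsafe in a file name becomes '㇑'.
sample_title = 'One-Punch Man: Chapter 12 <HQ>?'
for ch in r'\/:|<.・>?*"':
    sample_title = sample_title.replace(ch, '㇑')
print(sample_title)  # One-Punch Man㇑ Chapter 12 ㇑HQ㇑㇑
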
def save_result(self, paste_id, paste_txt, file, directory):
    paste_url = self.PASTESRAW_URL + (paste_id if paste_id[0] == '/' else '/' + paste_id)
    fn, ext = os.path.splitext(os.path.split(file)[1])
    timestamp = get_timestamp()
    if paste_txt == '':
        content = urllib.request.urlopen(paste_url).read().strip()
        paste_txt = PyQuery(content)('#paste_code').text()
        # paste_txt = PyQuery(url=paste_url)('#paste_code').text()
    if fn == 'base64' and len(paste_txt) > 20:
        codes = ''
        r = re.findall(r'[\w\d+/=]{30,}', paste_txt)
        if r:
            # keep the longest base64-looking run
            for c in r:
                if len(c) > len(codes):
                    codes = c
            try:
                # re-add missing '=' padding before decoding
                i = (4 - len(codes) % 4) % 4
                if 0 < i < 2:
                    codes += "=" * i
                decodes = base64.b64decode(codes).strip().decode('utf-8', errors='replace')  # decode to text so str.replace works
                paste_txt = paste_txt.replace(codes, decodes).strip()
                if not re.search(r'\w+', paste_txt, flags=re.I):
                    paste_txt = ''
            except:
                pass
    else:
        paste_txt = paste_txt + os.linesep
    if paste_txt != '':
        self.validpastes += 1
        with open(file, 'a') as matching:
            matching.write(fn + '-' + timestamp + '-' + paste_url + os.linesep)
        try:
            os.mkdir(directory)
        except KeyboardInterrupt:
            raise
        except:
            pass
        with open(directory + '/' + fn + '_' + timestamp.replace('/', '_').replace(':', '_').replace(' ', '__')
                  + '_' + paste_id.replace('/', '') + '.txt', mode='w') as paste:
            paste.write(paste_txt)

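# A hedged, self-contained sketch of the base64-recovery branch above: a long
# unpadded base64 run is extracted, re-padded, and decoded. The sample text is
# made up; only the standard base64 and re modules are used.
import base64
import re

secret = base64.b64encode(b'some secret exfiltrated data here!!').decode().rstrip('=')
sample_paste = 'leaked blob -> %s <- end' % secret
codes = max(re.findall(r'[\w\d+/=]{30,}', sample_paste), key=len)
pad = (4 - len(codes) % 4) % 4
if 0 < pad < 2:
    codes += '=' * pad
print(base64.b64decode(codes))  # b'some secret exfiltrated data here!!'
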
def verify_token(self):
    for x in range(5):
        try:
            WebDriverWait(self.driver, wait).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-setting > div > div.card-body.d-flex.flex-column > div.overflow-container.flex-grow-1 > ul > li")))
            HTML = self.driver.find_elements_by_css_selector(".card-control")[0].get_attribute("innerHTML")
            Doc = PQ(HTML)
            Doc = Doc('.list-group-item-action').text()
            Doc = Doc.replace(" ", "\n")
            Doc = Doc.split("\n")
            # print(Doc)
            path = Doc.index("表名")  # "表名" = the "table name" label in the sidebar
            pathh = '//*[@id="dp_ads.' + Doc[path + 1] + '"]'
            self.driver.find_element_by_xpath(pathh).click()
            break
        except:
            time.sleep(3)
    return "dp_ads." + Doc[path + 1]

def get_content(url, file_path=''):
    content_html = Pq(url=url)
    # grab the first card on the page
    content = content_html('.content')[0]
    # grab the card body
    text_html = Pq(content).children('.txt')
    # if there are two bodies, the post has a "show full text" version; take the second, full-text one
    text = Pq(text_html[0]).text() if text_html.length == 1 else Pq(text_html[1]).text()
    # drop trailing UI text, newlines and spaces
    text = text.replace('收起全文d', '').replace('O抽奖详情', '').replace('0网页链接', '').replace('\n', '').replace(' ', '')
    # for video posts, drop the trailing video-link text
    text = re.sub(r'L.*?的微博视频|L.*?的秒拍视频', '', text)
    txt = ' '.join(jieba.cut(text, cut_all=False, HMM=True))
    print(text)
    print(txt)
    print('\n')
    # if the second argument is given, save the segmented text to that file
    if file_path != '':
        with open(file_path, "wb") as fp:
            fp.write(txt.encode())

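# Self-contained illustration of the cleanup chain above on a made-up Weibo card
# text ("retweet giveaway today!" plus UI residue); no network access or jieba
# is needed for this part.
import re

sample = '今天转发抽奖!O抽奖详情 L某人的微博视频 收起全文d\n'
sample = sample.replace('收起全文d', '').replace('O抽奖详情', '').replace('0网页链接', '').replace('\n', '').replace(' ', '')
sample = re.sub(r'L.*?的微博视频|L.*?的秒拍视频', '', sample)
print(sample)  # 今天转发抽奖!
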
def click_dataset(self, lan):
    # --- PyQuery → XPath ---
    for x in range(5):
        try:
            HTML = self.driver.find_elements_by_css_selector(".card-control")[0].get_attribute("innerHTML")
            Doc = PQ(HTML)
            Doc = Doc('.list-group-item-action').text()
            Doc = Doc.replace(" ", "\n")
            Doc = Doc.split("\n")
            # print(Doc)
            path = Doc.index(lan)
            pathh = '//*[@id="dp_ads.' + Doc[path + 1] + '"]'
            self.driver.find_element_by_xpath(pathh).click()
            break
        except:
            time.sleep(3)
    # compare against the dimension conditions shown on the page
    WebDriverWait(self.driver, wait).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-fitler > div > div.card-body > h3:nth-child(3)")))
    check = self.driver.find_element_by_css_selector("body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-fitler > div > div.card-body > h3:nth-child(3)").get_attribute("innerText")
    return check

def analysisPage(response):
    if response.status_code:
        global photos
        photos = re.findall(' zoomfile="(.*?)" ', response.text)  # image URLs
        folderName = PyQuery(response.text)("span#thread_subject").text()  # thread title
        for ch in r'\/:|<>?*"':
            folderName = folderName.replace(ch, ' ⁂ ')  # replace characters that are illegal in file names
        formhash = PyQuery(response.text)("input[name='formhash']").attr('value')
        print(folderName)
        print(formhash)
        urlPay = PyQuery(response.text)("td[class='t_f'] ignore_js_op span a").attr('href')
        # print(urlPay, type(urlPay))
        aid, tid = re.findall(r'(\d+)', urlPay)
        downHtml(response, folderName)  # save the single page locally for easier viewing
        return {
            'folderName': folderName,
            'formhash': formhash,
            'aid': aid,
            'tid': tid
        }
    print('当前网络不可用')  # the network is currently unavailable
    return None

def run(self):
    '''Parse the page source.'''
    time.sleep(random.uniform(1.0, 3.6))
    try:
        pq = helper.get(self.url, myHeaders=self.headers)
        # model name
        name = pq('div.product-brand').text().strip() + ' ' + pq('h1.product-name').text().strip()
        # colors and sizes: collect every size option
        size_span_list = pq('div.product-sizes__options span.product-sizes__detail')
        size_price_list = []
        for size_span in size_span_list:
            size = PyQuery(size_span).find('span.product-sizes__size').text().strip()
            if 'K' in size or 'k' in size or '-' in size or 'XS' in size:
                continue
            size = re.sub(r'[WwYyCc\*]', '', size)
            # some sizes are not numeric at all; skip them
            if size in ('S', 'M', 'L', 'XL', 'XXL', 'XXXL', '', 'OS'):
                continue
            price = PyQuery(size_span).find('span.product-sizes__price').text().strip()
            if price.startswith('$'):
                price = price.replace('$', '').replace(',', '')
                size_price_list.append({
                    'size': size,
                    'price': float(price),
                    'isInStock': True
                })
            else:
                size_price_list.append({
                    'size': size,
                    'price': 0.0,
                    'isInStock': False
                })
        if len(size_price_list) < 1:
            return
        # colorway SKU
        number = ''
        # gender
        gender = 0
        # color
        color_value = ''
        tr_list = pq('table#product-attribute-specs-table tr')
        for tr in tr_list:
            key = PyQuery(tr).find('th').text().strip()
            if key == 'Gender':
                gender_txt = PyQuery(tr).find('td').text().strip()
                if gender_txt == 'Mens':
                    gender = 1
                elif gender_txt == 'Womens':
                    gender = 2
            elif key == 'Colorway':
                color_value = PyQuery(tr).find('td').text().strip()
            elif key == 'Manufacturer Sku':
                number = PyQuery(tr).find('td').text().strip()
        # print(name, number, self.url, size_price_list, gender, color_value)
        img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
        if not img_downloaded:
            img_url = pq('div.product-gallery-image > img')[0].get('src')
            # download the product image
            result = helper.downloadImg(img_url, os.path.join('.', 'imgs', 'stadiumgoods', '%s.jpg' % number))
            if result == 1:
                # upload to Qiniu
                qiniuUploader.upload_2_qiniu('stadiumgoods', '%s.jpg' % number, './imgs/stadiumgoods/%s.jpg' % number)
                img_downloaded = True
        mongo.insert_pending_goods(name, number, self.url, size_price_list, ['%s.jpg' % number], gender, color_value,
                                   'stadiumgoods', '5b8f484b299207efc1fb0904', self.crawl_counter,
                                   img_downloaded=img_downloaded)
    except:
        global error_detail_url
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), 'stadiumgoods')
        if error_counter < 3:
            self.q.put(self.url)

def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100):
    refreshCursor = ''
    results = []
    resultsAux = []
    cookieJar = http.cookiejar.CookieJar()
    active = True

    while active:
        json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar)
        if len(json['items_html'].strip()) == 0:
            break
        refreshCursor = json['min_position']
        tweets = PyQuery(json['items_html'])('div.js-stream-tweet')
        if len(tweets) == 0:
            break

        for tweetHTML in tweets:
            tweetPQ = PyQuery(tweetHTML)
            tweet = models.Tweet()

            usernameTweet = tweetPQ("span.username.js-action-profile-name b").text()
            # txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'));
            temp_txt = PyQuery(
                re.sub(
                    r'</span><span class="js-display-url">|</span><span class="invisible">',
                    '',
                    str(tweetPQ("p.js-tweet-text")))).text()
            txt = re.sub(
                r"\s+", " ",
                temp_txt.replace('# ', '#').replace('@ ', '@').replace('…', '...'))
            retweets = int(
                tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount")
                .attr("data-tweet-stat-count").replace(",", ""))
            favorites = int(
                tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount")
                .attr("data-tweet-stat-count").replace(",", ""))
            dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
            id = tweetPQ.attr("data-tweet-id")
            permalink = tweetPQ.attr("data-permalink-path")
            user_id = int(tweetPQ("a.js-user-profile-link").attr("data-user-id"))

            geo = ''
            geoSpan = tweetPQ('span.Tweet-geo')
            if len(geoSpan) > 0:
                geo = geoSpan.attr('title')
            urls = []
            for link in tweetPQ("a"):
                try:
                    urls.append((link.attrib["data-expanded-url"]))
                except KeyError:
                    pass

            tweet.id = id
            tweet.permalink = 'https://twitter.com' + permalink
            tweet.username = usernameTweet
            tweet.text = txt
            tweet.date = datetime.datetime.fromtimestamp(dateSec)
            tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec).strftime("%a %b %d %X +0000 %Y")
            tweet.retweets = retweets
            tweet.favorites = favorites
            tweet.mentions = re.compile('(@\\w*)').findall(tweet.text)
            tweet.hashtags = re.compile('(#\\w*)').findall(tweet.text)
            tweet.geo = geo
            tweet.urls = urls
            tweet.pics = re.compile('(pic.twitter.com/\\S+)').findall(tweet.text)
            tweet.author_id = user_id

            # links = tweet.urls + tweet.pics
            # for link in links:
            #     txt = txt.replace(link, '')
            # tweet.text = txt
            #
            # temp_mentions = []
            # for m in tweet.mentions:
            #     temp_mentions.append(m[1:len(m)])
            # tweet.mentions = temp_mentions
            # temp_hashtags = []
            # for h in tweet.hashtags:
            #     temp_hashtags.append(h[1:len(h)].lower())
            # tweet.hashtags = temp_hashtags

            results.append(tweet)
            resultsAux.append(tweet)

            if receiveBuffer and len(resultsAux) >= bufferLength:
                receiveBuffer(resultsAux)
                resultsAux = []

            if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets:
                active = False
                break

    if receiveBuffer and len(resultsAux) > 0:
        receiveBuffer(resultsAux)

    return results

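# Standalone check of the mention/hashtag/picture extraction used above, run on
# a made-up tweet text; only the re module is needed.
import re

sample_text = 'Big news from @nasa and @esa today #space #Artemis pic.twitter.com/abc123'
print(re.compile('(@\\w*)').findall(sample_text))                 # ['@nasa', '@esa']
print(re.compile('(#\\w*)').findall(sample_text))                 # ['#space', '#Artemis']
print(re.compile('(pic.twitter.com/\\S+)').findall(sample_text))  # ['pic.twitter.com/abc123']
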
def scan_proxy_qiaodm():
    """
    Scan for proxy resources
    :return:
    """
    import requests
    from pyquery import PyQuery as Pq

    source_site = 'http://ip.qiaodm.com/'
    header = {
        'Host': 'ip.qiaodm.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
    }
    s = requests.session()
    # fetch the page
    file_html = s.get(source_site).content
    # save to a file
    # with open('test.html', 'a') as f:
    #     f.write(file_html.encode('utf-8'))
    #
    # # read the saved page back in
    # with open('test.html', 'r') as f:
    #     file_html = f.read()

    text_pq = Pq(file_html)
    tr_list = text_pq('tbody').find('tr[style="text-align: center;"]')
    print '单页共 %s 条记录' % len(tr_list)  # "%s records on this page"
    for tr_item in tr_list:
        # print Pq(tr_item).html()
        # print('---------------------')
        td_list = Pq(tr_item).find('td')
        # print '单条共 %s 列字段' % len(td_list)
        field_list = []
        for td_item in Pq(td_list):
            field = Pq(td_item).text()
            field_list.append(field)
            # print field
            # print('++++++++++++++++++')
        # the IP address cell needs special handling
        ip = Pq(td_list).eq(0).html()
        # remove the hidden decoy elements
        ip = html.replace_html(ip, r'<p style="display:none;"/>')
        ip = html.replace_html(ip, r'<p style="display: none;"/>')
        ip = html.replace_html(ip, r'<p style=.*?display:.*?none;.*?>.*?</p>')
        # strip the remaining tags
        ip = html.strip_html(ip)
        # print ip
        # skip malformed IP addresses
        if len(ip.split('.')) != 4:
            continue
        # the port cell needs special handling
        port_key = Pq(td_list).eq(1).attr('class').split()[1]
        if port_key not in PortDict:
            print '发现新端口: %s' % port_key  # "new port class discovered"
            continue
        port = PortDict.get(port_key, '')

        ProsyItem['Ip'] = ip.replace(' ', '')
        ProsyItem['Port'] = port
        ProsyItem['Type'] = field_list[2].strip()
        ProsyItem['AnonymousDegree'] = field_list[3].strip()
        ProsyItem['Area'] = field_list[4].strip()
        ProsyItem['Speed'] = field_list[5].strip()
        ProsyItem['ScanTime'] = field_list[6].strip()
        # print ProsyItem
        proxy_item = json.dumps(ProsyItem, ensure_ascii=False)
        html.save_file('proxy.json', proxy_item + '\n', 'a')

    # tail of send_mail(), whose definition is truncated in this excerpt
    server.sendmail(mailFrom, rcptToList, message.as_string())
    server.quit()


if '__main__' == __name__:
    configFile = 'config.cfg'
    novels = PyQuery(filename=configFile)
    message = ''
    for novel in novels('novel'):
        name = PyQuery(novel)('name').text()
        url = PyQuery(novel)('url').text()
        prefix = PyQuery(novel)('prefix').text()
        next = int(PyQuery(novel)('next').text())
        rcptToList = []
        for addr in PyQuery(novel)('emails>email'):
            rcptToList.append(PyQuery(addr).text())
        print rcptToList
        html = PyQuery(url=url)
        nextUrl = None
        for i in html('div.threadlist_title.pull_left.j_th_tit.member_thread_title_frs > a.j_th_tit'):
            if i.text.find(number2chinese(next)) != -1:
                nextUrl = prefix + PyQuery(i).attr('href')
                break
        if nextUrl:
            next += 1
            PyQuery(novel)('next').text(str(next))
            text = PyQuery(url=nextUrl)('cc:first > div:first').html()
            text = text.replace(u'<br/>', '\n').strip()
            subject = name + u' ' + u'第' + unicode(str(next)) + u'章'  # subject reads "Chapter <next>"
            send_mail('*****@*****.**', rcptToList, subject.encode('utf8'), text.encode('utf8'))
            open(configFile, 'wt').write(str(novels))
