def diy(body, pg_connect):
    cloudpan_url = 'null'
    cloudpan_pass = '******'
    # body is expected to be the string form of a one-item dict
    # ({name: detail_url}); eval() trusts whoever produced that message.
    msg = eval(body)
    print('{} {}'.format(msg.keys(), msg.values()))
    fkey = list(msg.keys())[0]
    ourl = OpenUrl(msg[fkey])
    code, content = ourl.run()
    if code == 200:
        selecter = etree.HTML(content)
        try:
            cloudpan_url = selecter.xpath('//div[@class="part"]/a/@href')
            if len(cloudpan_url) == 1:
                cloudpan_url = cloudpan_url[0]
                cloudpan_pass = selecter.xpath(
                    '//div[@class="part"]/text()')[2]
            elif len(cloudpan_url) == 2:
                cloudpan_url = '|'.join(cloudpan_url)
                cloudpan_pass = '******'.join(
                    selecter.xpath('//div[@class="part"]/text()')[2:4])
            else:
                cloudpan_url = cloudpan_url[0]
                cloudpan_pass = cloudpan_pass[0]
        except Exception:
            pass
        send_pg(pg_connect, [fkey, cloudpan_url, cloudpan_pass])
    time.sleep(0.5)
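# All of these snippets rely on the project-internal helper
# ``misc.openurl.OpenUrl``, whose implementation is not shown here. The class
# below is only a minimal sketch of the interface the callers assume (a URL
# plus optional encoding, and run() returning (status_code, text)); it is an
# assumption, not the real OpenUrl, and is built on ``requests``.
import requests


class OpenUrlSketch:
    """Hypothetical stand-in for misc.openurl.OpenUrl (assumed interface)."""

    def __init__(self, url, encoding=None):
        self.url = url
        self.encoding = encoding

    def run(self):
        # Return (status_code, body); on a network error, return a non-200
        # code so callers fall into their existing error branches.
        try:
            resp = requests.get(self.url, timeout=10)
            if self.encoding:
                resp.encoding = self.encoding
            return resp.status_code, resp.text
        except requests.RequestException:
            return 0, ''

    # One of the dytt8 snippets calls .openurl() instead of .run(); treat it
    # as an alias here.
    openurl = run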
def get_download_url(self):
    # redis-py returns keys and values as bytes by default, hence the
    # fkey.decode() calls in the log messages below.
    redis_keys = self.__redis_link.keys()
    for fkey in redis_keys:
        url = self.__redis_link.get(fkey)
        ourl = OpenUrl(url)
        code, content = ourl.run()
        if code == 200:
            selecter = etree.HTML(content)
            try:
                cloudpan_url = selecter.xpath(
                    '//div[@class="part"]/a/@href')
                if len(cloudpan_url) == 1:
                    cloudpan_url = cloudpan_url[0]
                    cloudpan_pass = selecter.xpath(
                        '//div[@class="part"]/text()')[2]
                elif len(cloudpan_url) == 2:
                    cloudpan_url = '|'.join(cloudpan_url)
                    cloudpan_pass = '******'.join(
                        selecter.xpath('//div[@class="part"]/text()')[2:4])
                else:
                    logger.error(
                        '[{}] has no cloudpan download link...'.format(
                            fkey.decode()))
                    continue
            except Exception:
                logger.error('[{}] is missing expected fields..'.format(
                    fkey.decode()))
                continue
            self.send_pg([fkey, cloudpan_url, cloudpan_pass])
        else:
            logger.error('[%s] cannot open the download page..' %
                         fkey.decode())
            continue
        time.sleep(0.5)
def _get_content(self, url):
    ourl = OpenUrl(url)
    code, content = ourl.run()
    if code == 200:
        return content
    else:
        return None
def get_pages(start_url):
    url = start_url + '.html'
    ourl = OpenUrl(url)
    code, html = ourl.run()
    if code == 200:
        selecter = etree.HTML(html)
        pages_url = selecter.xpath('//div[@class="page"]/a/@href')[-1]
        pages = int(re.split('[/|.|-]', pages_url)[3])
    else:
        logger.error('get [{0}] failed: [{1}]'.format(url, code))
        pages = None
    return pages
def gethtml(self, url):
    '''Fetch the page at `url`; return its HTML text, or None on failure.'''
    ob_openurl = OpenUrl(url)
    code, html = ob_openurl.run()
    if code == 200:
        return html
    else:
        print('open [{}] failed..'.format(url))
        return None
def get_price():
    ourl = OpenUrl('https://training.linuxfoundation.cn/certificate/details/1')
    code, html = ourl.run()
    if code == 200:
        selecter = etree.HTML(html)
        try:
            # The scraped text may contain a thousands separator; strip it
            # before converting to a number.
            tmp = str(selecter.xpath(
                '//span[@class="text-red mr-2 text-sm"]/text()')[0])
            return int(float(tmp.replace(',', '')))
        except Exception:
            return None
    # Falls through to an implicit None when the page cannot be fetched.
def get_useful_url(start_url, redis_conn):
    # get_pages() returns None when the index page cannot be fetched; the
    # range() below would then raise, so callers should ensure it succeeded.
    all_page = get_pages(start_url)
    for page in range(1, all_page):
        if page == 1:
            url = start_url + '.html'
        else:
            url = start_url + '-' + str(page) + '.html'
        ourl = OpenUrl(url)
        code, html = ourl.run()
        if code == 200:
            selecter = etree.HTML(html)
            for urls in selecter.xpath('//a/@href'):
                if urls.startswith('/html'):
                    print(urls)
                    redis_conn.lpush('simi', urls)
        else:
            logger.error('get [{0}] failed: [{1}]'.format(url, code))
def get_img(redis_conn, url):
    ourl = OpenUrl('https://se.haodd92.com/' + url.decode('utf-8'))
    code, html = ourl.run()
    if code == 200:
        selecter = etree.HTML(html)
        img_url_list = selecter.xpath(
            '//div[@class="center margintop border clear main"]/img/@src')
        for img_url in img_url_list:
            time.sleep(0.5)
            img_name = img_url.split('/')[-1]
            local = 'image/{}'.format(img_name)
            try:
                r = requests.get(img_url, stream=True)
                with open(local, 'wb') as f:
                    f.write(r.content)
                logger.info('download [{0}] to [{1}] successfully'.format(
                    img_url, img_name))
            except Exception as e:
                logger.error('download [{0}] to [{1}] failed: [{2}]'.format(
                    img_url, img_name, e))
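# get_img() passes stream=True but then reads r.content, which still buffers
# the whole image in memory. Below is a hedged alternative (not part of the
# original code) that actually streams the download to disk in chunks; the
# helper name and chunk size are assumptions.
import requests


def download_streamed(img_url, local_path, chunk_size=8192):
    """Write img_url to local_path in chunks; return True on success."""
    try:
        with requests.get(img_url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(local_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
        return True
    except requests.RequestException:
        return False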
def get_url(ftype, sender):
    main_url = 'https://www.hanfan.cc/'
    ourl = OpenUrl(main_url + ftype)
    code, main_content = ourl.run()
    if code == 200:
        selecter = etree.HTML(main_content)
        pages = int(
            selecter.xpath(
                '/html/body/section/div[1]/div/div[2]/ul/li[8]/span/text()')
            [0].split(' ')[1])
    else:
        print("bad url: {}".format(main_url))
        sys.exit()
    for page in range(1, pages):
        page_url = main_url + ftype + '/page/%s/' % page
        sub_ourl = OpenUrl(page_url)
        sub_code, sub_content = sub_ourl.run()
        if sub_code == 200:
            selecter = etree.HTML(sub_content)
            selecter_list = selecter.xpath('//article/header/h2/a')
            for link in selecter_list:
                name = link.text
                sub_url = link.attrib['href'] + '#prettyPhoto/0/'
                sender.send_date(str({name: sub_url}))
        else:
            continue
        time.sleep(1)
def get_url(self, ftype):
    ourl = OpenUrl(self.main_url + ftype)
    code, main_content = ourl.run()
    if code == 200:
        selecter = etree.HTML(main_content)
        pages = int(
            selecter.xpath(
                '/html/body/section/div[1]/div/div[2]/ul/li[8]/span/text()')
            [0].split(' ')[1])
    else:
        logger.error("bad url: %s" % self.main_url)
        sys.exit()
    for page in range(1, pages):
        page_url = self.main_url + ftype + '/page/%s/' % page
        sub_ourl = OpenUrl(page_url)
        sub_code, sub_content = sub_ourl.run()
        if sub_code == 200:
            selecter = etree.HTML(sub_content)
            selecter_list = selecter.xpath('//article/header/h2/a')
            for link in selecter_list:
                name = link.text
                sub_url = link.attrib['href'] + '#prettyPhoto/0/'
                # Cache the link in redis for 6 hours (21600 seconds).
                self.__redis_link.set(name, sub_url, ex=21600)
        else:
            logger.error('cannot open [%s]...' % page_url)
            continue
        time.sleep(1)
@Author : Kellan Fan
@Version : 1.0
@Contact : [email protected]
@Desc : None
'''
# here put the import lib
from lxml import etree
from misc.openurl import OpenUrl
from misc.pg_client import Mypostgres
from log.create_logger import create_logger

logger = create_logger()
url = 'http://www.310win.com/shuangseqiu/tubiao_lshm.html'
ourl = OpenUrl(url)
code, doc = ourl.run()
pg_conn = Mypostgres()
s_sql = 'select opendate from shuang_se_qiu order by opendate desc limit 1'
last_time = pg_conn.execute(s_sql)
if code == 200:
    selecter = etree.HTML(doc)
    info_list = selecter.xpath("//span[@id='spnHidValue']/text()")
    # Draw records are '#'-separated; fields within a record are
    # '+'-separated: the first field carries the draw date after '&',
    # then the issue number, then 'red numbers|blue number'.
    for item in info_list[0].split('#'):
        item_info = item.split('+')
        if len(item_info) > 1:
            opendate = item_info[0].split('&')[1]
            issue_num = item_info[1]
            r_nunber, b_number = item_info[2].split('|')
            if opendate > last_time[0][0]:
                sql = "insert into shuang_se_qiu(opendate, issue_num, r_number, b_number) values (%s,%s,%s,%s)"
from log.create_logger import create_logger

logger = create_logger()
base_url = 'http://wufazhuce.com/article/'
mongo_client = pymongo.MongoClient("mongodb://mongodb:27017/")
db = mongo_client["spider"]
coll = db["one"]
cur_last_id = list(coll.find().sort('article_id'))[-1]['article_id']
fail_time = 0
while True:
    cur_last_id += 1
    data = {}
    ourl = OpenUrl(base_url + str(cur_last_id))
    code, doc = ourl.run()
    if code == 200:
        soup = BeautifulSoup(doc, 'lxml')
        data['article_id'] = cur_last_id
        data["title"] = soup.find('h2', class_='articulo-titulo').text.strip()
        data["autor"] = soup.find('p', class_='articulo-autor').text.strip()
        data["content"] = soup.find(
            'div', class_='articulo-contenido').text.strip()
        try:
            coll.insert_one(data)
            logger.info("insert [{}] successful".format(data["title"]))
        except Exception as e:
            logger.error("insert [{0}] failed: [{1}]".format(data["title"], e))
        continue
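# Note: soup.find() returns None when a node is missing, so the .text calls
# above raise AttributeError on a malformed page. A hedged helper (an
# assumption, not part of the original spider) that degrades to an empty
# string instead:
def text_or_empty(soup, name, css_class):
    """Return the stripped text of the first matching tag, or '' if absent."""
    tag = soup.find(name, class_=css_class)
    return tag.text.strip() if tag else ''


# e.g. data["title"] = text_or_empty(soup, 'h2', 'articulo-titulo')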
    movie_url.remove('/html/gndy/dyzz/index.html')
    return movie_url


def getMovieInfo(url):
    full_url = 'https://www.dytt8.net/' + url
    ourl = OpenUrl(full_url, 'gb2312')
    code, html = ourl.openurl()
    info = {}
    if code == 200:
        selecter = etree.HTML(html)
        try:
            info['name'] = selecter.xpath(
                "//div[@class='title_all']/h1/font/text()")[0]
            info['public_time'] = selecter.xpath(
                "//div[@class='co_content8']/ul/text()")[0].strip().split(':')[1]
            info['downlink'] = selecter.xpath("//tbody/tr/td/a/text()")[0]
            return info
        except Exception:
            return None


if __name__ == "__main__":
    start_url = 'https://www.dytt8.net/'
    ourl = OpenUrl(start_url)
    code, html = ourl.openurl()
    if code == 200:
        info_list = []
        movie_list = getMovieUrl(html)
        for url in movie_list:
            tmp = getMovieInfo(url)
            if tmp:
                info_list.append(tmp)
        print(info_list)
            info['name'] = selecter.xpath(
                "//div[@class='title_all']/h1/font/text()")[0]
            info['public_time'] = selecter.xpath(
                "//div[@class='co_content8']/ul/text()")[0].strip().split(':')[1]
            info['downlink'] = selecter.xpath("//tbody/tr/td/a/text()")[0]
            return info
        except Exception:
            return None
    else:
        return html


if __name__ == "__main__":
    start_url = 'https://www.dytt8.net/'
    ourl = OpenUrl(start_url + 'index.htm')
    code, html = ourl.run()
    info_list = []
    if code == 200:
        movie_list = getMovieUrl(html)
        for url in movie_list:
            tmp = getMovieInfo(url)
            time.sleep(1)
            if tmp:
                info_list.append(tmp)
    else:
        print(html)
        exit()
    postgresql = Mypostgres()
    select_cmd = 'select public_time from dian_ying_tian_tang order by public_time desc limit 1'
    last_time = postgresql.execute(select_cmd)[0][0].strip()
def gethtml(url):
    ourl = OpenUrl(url)
    code, html = ourl.run()
    if code != 200:
        html = None
    return html
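# A minimal usage sketch for gethtml(), assuming the lxml-based parsing used
# elsewhere in these snippets; the URL below is only a placeholder.
if __name__ == "__main__":
    from lxml import etree

    page = gethtml('https://example.com/')
    if page is not None:
        # Extract the page title as a quick sanity check.
        print(etree.HTML(page).xpath('//title/text()'))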