def getBrowser(self):
    browser = webdriver.Chrome()
    try:
        browser.get(self.startUrl)
    except Exception:
        mylog.error('open %s failed' % self.startUrl)
    browser.implicitly_wait(30)
    return browser
def getBrowser(self):
    browser = webdriver.PhantomJS()
    try:
        browser.get(self.startUrl)
    except Exception:
        mylog.error('open the %s failed' % self.startUrl)
    browser.implicitly_wait(20)
    return browser
def getBrowser(self):
    browser = webdriver.Firefox()
    try:
        browser.get(self.startUrl)
    except Exception:
        mylog.info('open the %s failed' % self.startUrl)
    browser.implicitly_wait(20)
    return browser
class TestTime(object):
    def __init__(self):
        self.log = MyLog()
        self.testTime()
        self.testLocaltime()
        self.testSleep()
        self.testStrftime()

    def testTime(self):
        self.log.info(u'start testing the time.time() function')
        print(u'current timestamp: time.time()=%f' % time.time())
        print(u'this returns a float: the number of seconds elapsed since the 1970 epoch')
        print('\n')

    def testLocaltime(self):
        self.log.info(u'start testing the time.localtime() function')
        print(u'current local time: time.localtime()=%s' % time.localtime())
        print(u'this returns a tuple in struct_time form')
        print('\n')

    def testSleep(self):
        self.log.info(u'start testing the time.sleep() function')
        print(u'this is a timer: time.sleep(5)')
        print(u'just close your eyes and count to 5')
        time.sleep(5)
        print('\n')

    def testStrftime(self):
        self.log.info(u'start testing the time.strftime() function')
        print(u'this function returns a formatted time string')
        print(u'time.strftime("%%Y-%%m-%%d %%X",time.localtime())=%s' % time.strftime("%Y-%m-%d %X", time.localtime()))
        print('\n')
def __init__(self):
    self.r = redis.Redis(host=redis_host, port=redis_port, db=redis_name, password=redis_pwd)
    # Kafka configuration
    self.producer = KafkaProducer(bootstrap_servers=bootstrap_servers)
    self.topic = kfk_topic
    # OSS configuration
    self.auth = oss2.Auth(AccessKeyID, AccessKeySecret)
    self.endpoint = EndPoint
    self.bucket = oss2.Bucket(self.auth, self.endpoint, Bucket)
    # project metadata
    self.spec_info = spec_info
    self.brand_name = '广州女装批发'
    self.brand_id = '6'
    self.store_name = '拼拼侠'
    self.store_id = '1'
    self.type_name = "服装"
    self.type_id = "6"
    self.transport_title = "拼拼侠通用运费模板"
    self.transport_id = "11"
    self.mylog = MyLog()
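# A hedged sketch (not from the source) of how the handles initialised above might be
# used by a method on the same pipeline class; the item fields, the Redis key and the
# OSS object name are assumptions for illustration only.
import json

def push_item(self, item):
    # remember the item key in Redis so reruns can skip it (hypothetical key layout)
    self.r.sadd('fix:seen_keys', item['onlyKey'])
    # publish the item to Kafka as UTF-8 encoded JSON
    self.producer.send(self.topic, json.dumps(item, ensure_ascii=False).encode('utf-8'))
    # upload an attached image to OSS when one is present
    if item.get('image_bytes'):
        self.bucket.put_object('images/%s.jpg' % item['onlyKey'], item['image_bytes'])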
class GoldNews(SpiderConfig):
    log = MyLog()

    def __init__(self, data_name):
        self.data_name = data_name
        self.DICT_START_URL = {
            '名家点金': 'http://gold.cnfol.com/mingjiadianjin/',
            '机构论金': 'http://gold.cnfol.com/jigoulunjin/'
        }
        self.dict_url_re_pattern = {
            '名家点金': r'http://gold\.cnfol\.com/mingjiadianjin/[0-9]+/[0-9]+\.shtml',
            '机构论金': r'http://gold\.cnfol\.com/jigoulunjin/[0-9]+/[0-9]+\.shtml'
        }
        SpiderConfig.__init__(self, self.DICT_START_URL[self.data_name], self.dict_url_re_pattern[data_name])

    @log.deco_log(sys.argv[0][0:-3] + '.log', "get_data", False)
    def get_data(self):
        list_url = SpiderConfig.get_urls(self)
        assert isinstance(list_url, list)
        list_html = SpiderConfig.get_htmls(self, list_url)
        list_headline = []
        list_date = []
        list_content = []
        check_data = lambda html: html.get_text() if html else "null"
        for soup in list_html:
            headline = soup.find('div', class_='EDArt').find('h1')
            list_headline.append(check_data(headline))
            date = soup.find('div', class_="GSTitsL Cf").find('span')
            list_date.append(date.get_text())
            content = soup.find('div', class_="pageBd")
            if content:
                content = SpiderConfig.clean_str(self, str(content))
            else:
                raise ValueError
            list_content.append(content)
        return list_headline, list_date, list_content

    @log.deco_log(sys.argv[0][0:-3] + '.log', "update_data", False)
    def update_data(self, *tuple_data):
        list_headline, list_date, list_content = tuple_data[0][0], tuple_data[0][1], tuple_data[0][2]
        db = SpiderConfig.db
        select_sql = "select headling from [zy_tbNews] WHERE id=(select MAX(id) from [zy_tbNews] WHERE type='%s')" % self.data_name
        update_num = SpiderConfig.check_newest_data(self, select_sql, list_headline)
        if update_num != 0:
            for i in range(update_num)[::-1]:
                insert_sql = "INSERT INTO [zy_tbNews] VALUES ('{}','{}','{}','{}')".format(
                    self.data_name, list_headline[i], list_date[i], list_content[i]
                )
                db.ExecNonQuery(insert_sql.encode('utf-8'))
                print(list_headline[i])
                print(list_date[i])
                print(list_content[i])
            print("%s page data uploaded and updated" % self.data_name)
        else:
            print("no new data from the %s source" % self.data_name)
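# A hedged usage sketch: judging from the signatures above, update_data() expects the
# whole tuple returned by get_data() as its single positional argument. The section
# names come from DICT_START_URL; everything else here is illustrative.
if __name__ == '__main__':
    for section in ('名家点金', '机构论金'):
        spider = GoldNews(section)
        spider.update_data(spider.get_data())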
class GetData(object):
    def __init__(self):
        self.url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
        self.log = MyLog()
        self.items = self.spider()
        self.pipelines()

    def get_response(self):
        # fetch the page through a random https proxy with a random User-Agent
        flag = True
        ua = UserAgent()
        while flag:
            with open('new3proxy.txt', 'r') as fp:
                lines = fp.readlines()
                index = random.randint(1, len(lines))
                proxys = 'https://' + lines[index - 1].strip()
            fakeHeaders = {'User-Agent': ua.random}
            request = urllib.request.Request(self.url, headers=fakeHeaders)
            proxy = urllib.request.ProxyHandler({'https': proxys})
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            try:
                response = urllib.request.urlopen(request)
                flag = False
                self.log.info(u'load URL: success')
                return response
            except Exception:
                flag = True
                self.log.error(u'load URL: failed')

    def spider(self):
        # extract ranking, name, region and score from the table rows
        items = []
        response = self.get_response()
        soup = BeautifulSoup(response.read(), 'html.parser')
        datas = soup.find('div', {'class': 'news-text'}).find_all('tr')
        for data in datas[1:5]:
            item = Item()
            item.paihang = data.find_all('td')[0].text
            item.name = data.find_all('td')[1].text
            item.address = data.find_all('td')[2].text
            item.score = data.find_all('td')[3].text
            items.append(item)
            self.log.info(u'fetch %s info: success' % item.name)
        return items

    def pipelines(self):
        # clean the data and save it to a text file
        filename = 'daxuedata.txt'
        with codecs.open(filename, 'w', 'utf8') as fp:
            for item in self.items:
                fp.write('%d \t %s \t %s \t %.f \n' % (int(item.paihang), item.name, item.address, float(item.score)))
                self.log.info(u'%s saved to %s: success' % (item.name, filename))
def getBrowser(self):
    # PhantomJS was used for dynamic scraping at first, but since PhantomJS support
    # was deprecated, headless Chrome is used instead.
    # browser = webdriver.PhantomJS(executable_path='E:/Learning/03-Programme/Python/script-libs/
    # phantomjs-2.1.1-windows/bin/phantomjs.exe')
    chrome_options = Options()
    # set_headless() is deprecated, so headless mode is enabled on the Options instance
    chrome_options.headless = True
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)
    try:
        browser.get(self.startUrl)
    except Exception as e:
        mylog.error('open the %s failed, exit the script ...' % self.startUrl)
        sys.exit(-1)
    browser.implicitly_wait(20)
    return browser
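# Note: in recent Selenium releases the chrome_options= keyword and the headless
# property are themselves deprecated; a sketch of the current style (assuming
# Selenium 4 and a chromedriver on PATH) would look like this:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_headless_chrome():
    opts = Options()
    opts.add_argument('--headless')        # replaces chrome_options.headless = True
    opts.add_argument('--disable-gpu')
    return webdriver.Chrome(options=opts)  # options= replaces chrome_options=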
class GetData(object):
    def __init__(self):
        self.url = 'https://www.toutiao.com/search/?keyword=\xe8\xa1\x97\xe5\xa4\xb4\xe7\xaf\xae\xe7\x90\x83'
        self.log = MyLog()
        # self.urls = self.get_urls()
        self.items = self.spider()
        self.pipelines()

    # def get_urls(self):
    #     pass

    def get_html(self):
        # render the search page with PhantomJS and click over to the image tab
        driver = webdriver.PhantomJS()
        driver.get(self.url)
        driver.implicitly_wait(10)
        submitelement = driver.find_element_by_xpath('//div[@class="tabBar"]//li[@class="y-left tab-item "]')
        submitelement.click()
        time.sleep(5)
        pageSource = driver.page_source
        self.log.info(u'successful')
        return pageSource

    def spider(self):
        # collect the image URLs from every article card on the page
        items = []
        pageSource = self.get_html()
        try:
            soup = BeautifulSoup(pageSource, 'html.parser')
            datas = soup.find_all('div', {'class': 'articleCard'})
            for data in datas:
                item = Item()
                try:
                    item.image_url = data.find('a', {'class': 'img-wrap'}).find('img', {'alt': ''})['src']
                    items.append(item)
                except KeyError:
                    pass
            self.log.info(u'fetch info: success')
        except AttributeError:
            self.log.info(u'url None')
        return items

    def pipelines(self):
        # download every collected image into the target folder
        filename = '街头篮球1'
        if os.path.exists(filename):
            os.chdir(filename)
        else:
            os.mkdir(filename)
            os.chdir(filename)
        i = 1
        for item in self.items:
            with open(str(i) + '.jpg', 'wb') as fp:
                i += 1
                pic = requests.get(item.image_url)
                fp.write(pic.content)
""" MyMongo """ from __future__ import print_function from __future__ import absolute_import from __future__ import division #### import bson from time import sleep ##### # $ sudo pip install pymongo ##### import pymongo ##### from mylog import MyLog l = MyLog('MyMongo') __all__ = ['MyMongo'] class MyMongo(object): """ mongodb functions """ # pylint: disable=bare-except # pylint: disable=no-self-use # pool = None # dbase = None # connected = False def __init__(self, mongo_hosts, son=True): """ constructor """ self.connected = False
""" api """ from __future__ import print_function from __future__ import absolute_import from __future__ import division #### from password_strength import PasswordPolicy import pyotp #### from mymongo import MyMongo from mylog import MyLog from myconfig import MyConfig # pylint: disable=fixme c = MyConfig() l = MyLog(c.cfg['virtualenv']['dir'] + '_api') appname = c.cfg['virtualenv']['dir'] mongohost = c.cfg['dbs']['mongo']['host'] mongodb = c.cfg['dbs']['mongo']['db'] sessionHashSecret = c.cfg['session']['hash_secret'] passpolicylength = c.cfg['password_strength']['length'] passpolicyuppercase = c.cfg['password_strength']['uppercase'] passpolicynumbers = c.cfg['password_strength']['numbers'] passpolicyspecial = c.cfg['password_strength']['special'] passpolicy = PasswordPolicy.from_names( length=passpolicylength, # min length uppercase=passpolicyuppercase, # need min. uppercase letters numbers=passpolicynumbers, # need min. digits
#--------------------------------------------------------------------------------------------------------------------------------
from flask import Flask
from flask import request, jsonify
import requests
import time
from general_ocr_recog import general_ocr_client
from paddle_serving_client import Client
import random
import json
import datetime
import numpy as np
import cv2
from mylog import MyLog

logger = MyLog('service').getlog()

from service_config import *
#--------------------------------------------------------------------------------------------------------------------------------
app = Flask(__name__)
app.config['JSON_AS_ASCII'] = False

medical_image_folder = '/data/images/'
medical_json_folder = '/data/json/'

# initialize the detection and recognition clients
det_client = Client()
det_client.load_client_config(
    "./general_ocr_config/det_infer_client/serving_client_conf.prototxt")
det_client.connect(det_ip_port)
# start rec Client
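# A minimal, hypothetical sketch of how the Flask app above might expose the OCR
# service; the route name, the 'image' form field, the response shape and the
# general_ocr_client call signature are all assumptions, not taken from the source.
import os

@app.route('/ocr', methods=['POST'])
def ocr():
    # save the uploaded image alongside the other medical images
    f = request.files['image']
    path = os.path.join(medical_image_folder, f.filename)
    f.save(path)
    # run detection + recognition through the helper shipped with this service
    result = general_ocr_client(path)  # assumed signature
    return jsonify({'status': 'ok', 'result': result})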
class GetData(object):
    def __init__(self):
        self.url = 'https://movie.douban.com/subject/26266893/reviews?start='
        self.log = MyLog()
        self.urls = self.get_urls()
        self.items = self.spider()
        self.pipelines()

    def get_urls(self):
        # build the paginated review URLs, 20 reviews per page
        pages = 60
        urls = []
        for i in range(0, pages, 20):
            url = self.url + str(i)
            urls.append(url)
        self.log.info(u'build URL list: success')
        return urls

    def get_response(self, url):
        # fetch one page through a random https proxy with a random User-Agent
        flag = True
        ua = UserAgent()
        while flag:
            with open('new4proxy.txt', 'r') as fp:
                lines = fp.readlines()
                index = random.randint(1, len(lines))
                proxys = 'https://' + lines[index - 1].strip()
            fakeHeaders = {'User-Agent': ua.random}
            request = urllib.request.Request(url, headers=fakeHeaders)
            proxy = urllib.request.ProxyHandler({'https': proxys})
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            try:
                response = urllib.request.urlopen(request)
                flag = False
                self.log.info(u'load URL: success')
                return response
            except (HTTPError, URLError):
                flag = True
                self.log.error(u'load URL: failed')

    def spider(self):
        # grab the reviewer name and short review text from every listing page
        items = []
        for url in self.urls:
            response = self.get_response(url)
            try:
                item = Item()
                soup = BeautifulSoup(response.read(), 'html.parser')
                item.name = soup.find('a', {'class': 'name'}).text
                item.content = soup.find('div', {'class': 'short-content'}).text
                items.append(item)
                self.log.info(u'fetch %s info: success' % item.name)
            except AttributeError:
                self.log.info(u'url None')
        return items

    def pipelines(self):
        # save the collected reviews to a text file
        filename = 'newdata.txt'
        with codecs.open(filename, 'w', 'utf8') as fp:
            for item in self.items:
                fp.write('%s \t %s \n' % (item.name, item.content))
                self.log.info(u'%s saved to %s: success' % (item.name, filename))
import pymysql
import re
import redis
import time
import phpserialize
import itertools
import os
import hashlib
import sys

sys.path.append("../")
from Fix.settings import mysql_host, mysql_port, mysql_db_user, mysql_db_pwd, mysql_db_name, mysql_db_charset
from Fix.settings import redis_host, redis_port, redis_pwd, redis_name
from Fix.settings import image_path, store_id
from mylog import MyLog

mylog = MyLog()


class FixPipeline(object):
    brand_info = dict()
    cat_info = dict()
    img_url = ""

    def __init__(self):
        self.client = pymysql.connect(
            host=mysql_host,
            port=mysql_port,
            user=mysql_db_user,      # use your own username
            passwd=mysql_db_pwd,     # use your own password
            db=mysql_db_name,        # database name
            charset=mysql_db_charset)
""" mysignal """ from __future__ import print_function from __future__ import absolute_import from __future__ import division #### import signal #### from mylog import MyLog l = MyLog('MySignal') __all__ = ['MySignal'] class MySignal(object): def __init__(self): """ constructor """ self.exitFlag = 0 def signalhandler(self, signum, stack): """ handle ctrl-c signal """ l.log('Exiting gracefully! ', 'info') self.exitFlag = 1 def set_signalhandler(self): """ set signalhandler """ signal.signal(signal.SIGTERM, self.signalhandler) signal.signal(signal.SIGINT, self.signalhandler)
"step": "86400", "tunes": i, "count": count, } response = json.loads( requests.post(url_, headers=headers, data=data).text) if response: for data in response: item = {} item['onlyKey'] = mid[0].upper( ) + mid[1:] + "_" + bName.upper() + "_" + unit.upper() item['type'] = "Alcoin" item['Measurement'] = "kline" item['Timestamp'] = int(data[0]) * 1000 item['Open'] = data[1] item['High'] = data[2] item['Low'] = data[3] item['Close'] = data[4] item['Volume'] = data[5] content.append(item) self.kfk.process_item(content) except Exception as e: log.info("this data an error {}".format(e)) if __name__ == '__main__': log = MyLog() log.debug("程序正在运行·····") a = get_requests() a.get_head()