def get_collection(self):
    """Return the five most recent `collection` rows for the current user."""
    user = config.USERNAME
    query = "select * from collection where(name='%s') order by id DESC limit 5" % (user)
    client = MysqlClient()
    rows = client.find_all(query)
    return rows
def __init__(self, master=None): self.root = master # 定义内部变量root self.root.geometry('%dx%d' % (300, 180)) # 设置窗口大小 self.username = StringVar() self.password = StringVar() self.createPage() self.mysqlClient = MysqlClient()
class Getter():
    """Runs every registered crawl function and stores any proxies that are
    not already present in the MySQL pool."""

    def __init__(self):
        self.Mysql = MysqlClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Determine whether the agent pool limit has been reached."""
        return self.Mysql.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        print('Get the execution')
        if self.is_over_threshold():
            return
        for idx in range(self.crawler.__CrawlFuncCount__):
            fetch = self.crawler.__CrawlFunc__[idx]
            # pull a batch of candidate proxies from this crawl function
            candidates = self.crawler.get_proxies(fetch)
            sys.stdout.flush()
            for candidate in candidates:
                if not self.Mysql.exists(candidate):
                    print(candidate)
                    self.Mysql.add(candidate)
def get_history(self):
    """Return the ten most recent `content` rows for the current user."""
    user = config.USERNAME
    stmt = "select * from content where(name='%s') order by id DESC limit 10" % (user)
    db = MysqlClient()
    return db.find_all(stmt)
class RegPage(object):
    """Tkinter registration window backed by the `user` table."""

    def __init__(self, master=None):
        """
        :param master: Tk root window hosting this page
        """
        self.root = master  # handle to the enclosing Tk root
        self.root.geometry('%dx%d' % (300, 200))  # fixed window size
        self.username = StringVar()
        self.password = StringVar()
        self.repassword = StringVar()
        self.createPage()
        # created after createPage(); only used from button callbacks
        self.mysqlClient = MysqlClient()

    def createPage(self):
        """Build the registration form widgets."""
        self.page = Frame(self.root)
        self.page.pack()
        Label(self.page).grid(row=0, stick=W)
        Label(self.page, text='账户: ').grid(row=1, stick=W, pady=10)
        Entry(self.page, textvariable=self.username).grid(row=1, column=1, stick=E)
        Label(self.page, text='密码: ').grid(row=2, stick=W, pady=10)
        Entry(self.page, textvariable=self.password, show='*').grid(row=2, column=1, stick=E)
        Label(self.page, text='确认密码: ').grid(row=3, stick=W, pady=10)
        Entry(self.page, textvariable=self.repassword, show='*').grid(row=3, column=1, stick=E)
        Button(self.page, text='注册', command=self.register).grid(row=4, column=1, stick=E)

    def register(self):
        """Validate the form, create the user row, then return to the login page.

        NOTE(review): passwords are stored in plain text and the SQL is built
        by string interpolation (injection-prone) — both should be fixed at
        the MysqlClient layer.
        """
        name = self.username.get()
        password = self.password.get()
        repassword = self.repassword.get()
        # reject blank username/password
        if name.strip() == '' or password.strip() == '' or repassword.strip(
        ) == '':
            showinfo(title='注册失败', message='账户名或者密码不能为空')
            return
        if password == repassword:
            sql = "select * from user where(name='%s')" % (name)
            find_res = self.mysqlClient.find_one(sql)
            if find_res:
                showinfo(title='错误', message='该用户已存在')
            else:
                sql = "insert into user(name,password) values ('%s','%s')" % (
                    name, password)
                add_res = self.mysqlClient.save(sql)
                if add_res:
                    showinfo(title='注册成功', message='注册成功')
                    self.page.destroy()
                    # swap back to the login page on success
                    login = LoginPage.LoginPage(self.root)
                else:
                    showinfo(title='注册失败', message='注册失败')
        else:
            showinfo(title='错误', message='两次输入的密码不一致')
def __init__(self, master=None):
    """Initialise this frame under *master* and its DB/spider helpers."""
    Frame.__init__(self, master)
    self.root = master  # handle to the enclosing root window
    self.word = StringVar()
    self.mean = StringVar()
    self.createPage()
    # created after createPage(); only used from button callbacks
    self.mysqlClient = MysqlClient()
    self.spider = spider.Spider()
class CookiesGenerator(object):
    """Base class that walks every stored account and generates fresh
    cookies for the ones not yet marked valid."""

    def __init__(self, website='default'):
        """
        :param website: site key used to select the backing MySQL store
        """
        self.website = website
        self.mysql_client = MysqlClient(website)

    def new_cookies(self, username, password):
        """
        Generate new cookies; subclasses must override.
        :param username: account name
        :param password: account password
        :return: dict with 'status' (1 = success, 2 = bad password) and 'content'
        """
        raise NotImplementedError

    def process_cookies(self, cookies):
        """
        Copy the cookie mapping into a plain dict.
        :param cookies: any mapping exposing .items()
        :return: plain dict of the same key/value pairs
        """
        # bugfix: the local used to be named `dict`, shadowing the builtin
        result = {}
        for k, v in cookies.items():
            result[k] = v
        print(result)
        return result

    def run(self):
        """
        Iterate all accounts and log each not-yet-valid one in, in turn.
        :return: None
        """
        account_list = self.mysql_client.get_all()
        for account in account_list:
            if account['valid'] == 0:
                print('正在生成Cookies...', '账号', account['username'], '密码',
                      account['password'])
                result = self.new_cookies(account['username'], account['password'])
                # success: persist the cookies for this account
                if result.get('status') == 1:
                    cookies = self.process_cookies(result.get('content'))
                    print('成功获取到Cookies', cookies)
                    self.mysql_client.update_cookies_by_username(
                        account['username'], json.dumps(cookies))
                # wrong password: remove the account entirely
                elif result.get('status') == 2:
                    print(result.get('content'))
                    if self.mysql_client.delete_account(account['username']):
                        print('成功删除账号')
                else:
                    print(result.get('content'))
        print('所有账号都已经成功获取Cookies')
def add_detail_infos(self):
    """Download the cover image for each randomly selected row, then fill
    in its detail record."""
    db = MysqlClient()
    rows = db.random()
    for row in rows:
        resp = requests.get(row[2])
        if resp.status_code == 200:
            # the cover is written to a scratch file that is overwritten
            # on every iteration
            with open('1' + '.' + 'jpg', 'wb') as fh:
                print('正在封面图片%s' % row[2])
                fh.write(resp.content)
            self.fill_infos(row)
class CountFrame(Frame):  # subclass of tkinter Frame
    """Form for publishing a short message into the `content` table."""

    def __init__(self, master=None):
        """
        :param master: parent Tk widget
        """
        Frame.__init__(self, master)
        self.root = master  # handle to the enclosing root window
        self.mysqlClient = MysqlClient()
        self.message = StringVar()
        # self.createPage()

    def createPage(self):
        """Lay out the entry field and the publish button."""
        Label(self).grid(row=0, stick=W, pady=10)
        Label(self, text='请输入: ').grid(row=1, stick=W, pady=10)
        Entry(self, textvariable=self.message, width=40).grid(row=2, stick=W)
        Button(self, text='发布', command=self.set_message).grid(row=10,
                                                               column=2,
                                                               stick=E,
                                                               pady=10)

    def set_message(self):
        """Insert the entered message with author and timestamps, show a
        success/failure dialog, and return MysqlClient.save's result.

        NOTE(review): the SQL is built by string interpolation from user
        input — injection-prone; parameterize if MysqlClient supports it.
        """
        name = config.USERNAME
        message = self.message.get()
        publishDateStr = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        publishDate = int(time.time())
        sql = "insert into content(name,message,publishDateStr,publishDate) values ('%s','%s','%s','%s')" % (
            name, message, publishDateStr, publishDate)
        save_res = self.mysqlClient.save(sql)
        if save_res:
            showinfo(title='成功', message='发布成功')
        else:
            showinfo(title='失败', message='发布失败')
        return save_res
class Tester(object):
    """Checks every proxy in the MySQL pool against TEST_URL and adjusts
    its score (max on success, decrease on failure)."""

    def __init__(self):
        self.mysql = MysqlClient()

    async def test_single_proxy(self, proxy):
        """
        Probe one proxy; bump its score on a valid response, lower it on error.
        :param proxy: 'host:port' string (or utf-8 bytes)
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL, proxy=real_proxy,
                                       timeout=15) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.mysql.max(proxy)
                        print('代理可用', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.mysql.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Drive the whole pool through the event loop in batches, pausing
        between batches, then close the DB connection.
        :return: None
        """
        print('开始测试')
        try:
            proxies = self.mysql.all()
            loop = asyncio.get_event_loop()
            for i in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[i:i + BATCH_TEST_SIZE]
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
            # bugfix: a stray `asyncio.open_connection()` call used to sit
            # here; it only created a never-awaited coroutine (RuntimeWarning)
            # and did nothing, so it was removed.
        except Exception as e:
            print('测试发生错误', e.args)
        self.mysql.close()
class Tester(object):
    """Asynchronously probes stored proxies against TEST_URL and raises or
    lowers their pool score accordingly."""

    def __init__(self):
        self.Mysql = MysqlClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy record.
        :param proxy: indexable record whose [0]/[1] are host and port
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = "http://{0}:{1}".format(proxy[0], proxy[1])
                print(real_proxy)
                print('正在测试', proxy)
                # redirects disabled so a proxy that answers with 30x is
                # treated as invalid rather than silently followed
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.Mysql.max_(proxy)
                        print('代理可用', proxy)
                    else:
                        self.Mysql.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.Mysql.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Main test loop: slice the pool into batches and test each batch
        concurrently on the event loop, pausing between batches.
        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.Mysql.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = list(self.Mysql.batch(start, stop))
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
def add_detail_infos(self):
    """Download cover images for the 'big money' rows, fill in their
    details, and report how many were added."""
    db = MysqlClient()
    rows = db.randombigmoney()
    added = 0
    for row in rows:
        resp = requests.get(row[2])
        if resp.status_code == 200:
            # cover goes to a scratch file overwritten each iteration
            with open('1' + '.' + 'jpg', 'wb') as fh:
                print('下载封面图片%s' % row[2])
                fh.write(resp.content)
            self.fill_infos(row)
            added += 1
    print('*' * 50)
    print('*' * 50)
    print('添加最新口子%s' % added)
    print('*' * 50)
    print('*' * 50)
class Getter():
    """Runs every registered spider callback and pushes the resulting
    proxies into the MySQL pool."""

    def __init__(self):
        self.mysql = MysqlClient()
        self.spider = Spider()

    def is_over_max(self):
        """True once the pool already holds MAX_POOL_COUNT proxies or more."""
        return self.mysql.count() >= MAX_POOL_COUNT

    def run(self):
        print('爬虫程序开始执行')
        if not self.is_over_max():
            for idx in range(self.spider.__SpiderFuncCount__):
                fetch = self.spider.__SpiderFunc__[idx]
                for proxy in self.spider.get_proxies(fetch):
                    self.mysql.add(proxy)
        # connection is closed after a single pass
        self.mysql.close()
def run(self):
    """Start each enabled scheduler component in its own child process."""
    print('代理池开始运行')
    # NOTE(review): this client is never used below — presumably kept for a
    # connection/initialisation side effect of MysqlClient(); confirm before
    # removing.
    mysql=MysqlClient()
    if TESTER_ENABLED:
        tester_process = Process(target=self.schedule_tester)
        tester_process.start()
    if GETTER_ENABLED:
        getter_process = Process(target=self.schedule_getter)
        getter_process.start()
    if API_ENABLED:
        api_process = Process(target=self.schedule_api)
        api_process.start()
class ValidTester(object):
    """Walks every stored account and runs a site-specific cookie check."""

    def __init__(self, website='default'):
        self.website = website
        self.mysql_client = MysqlClient(website)

    def test(self, username, cookies):
        """Check one account's cookies; subclasses must override."""
        raise NotImplementedError

    def run(self):
        """Validate the cookies of every account returned by the store."""
        for account in self.mysql_client.get_all():
            self.test(account['username'], account['cookies'])
def run(self):
    """Start each enabled scheduler component in its own child process."""
    print('Agent pool starts running')
    logger.log('INFOR', 'Scheduler starts running...')
    # NOTE(review): this client is never used below — presumably kept for a
    # connection/initialisation side effect of MysqlClient(); confirm before
    # removing.
    mysql=MysqlClient()
    if TESTER_ENABLED:
        tester_process = Process(target=self.schedule_tester)
        tester_process.start()
    if GETTER_ENABLED:
        getter_process = Process(target=self.schedule_getter)
        getter_process.start()
    if API_ENABLED:
        api_process = Process(target=self.schedule_api)
        api_process.start()
class LoginPage(object):
    """Tkinter login window: checks credentials against the `user` table."""

    def __init__(self, master=None):
        """
        :param master: Tk root window hosting this page
        """
        self.root = master  # handle to the enclosing Tk root
        self.root.geometry('%dx%d' % (300, 180))  # fixed window size
        self.username = StringVar()
        self.password = StringVar()
        self.createPage()
        # created after createPage(); only used from button callbacks
        self.mysqlClient = MysqlClient()

    def createPage(self):
        """Build the login form widgets."""
        self.page = Frame(self.root)
        self.page.pack()
        Label(self.page).grid(row=0, stick=W)
        Label(self.page, text='账户: ').grid(row=1, stick=W, pady=10)
        Entry(self.page, textvariable=self.username).grid(row=1, column=1, stick=E)
        Label(self.page, text='密码: ').grid(row=2, stick=W, pady=10)
        Entry(self.page, textvariable=self.password, show='*').grid(row=2, column=1, stick=E)
        Button(self.page, text='登陆', command=self.loginCheck).grid(row=3, stick=W, pady=10)
        Button(self.page, text='注册', command=self.register).grid(row=3, column=1, stick=E)

    def loginCheck(self):
        """Look the user up by name and password; open MainPage on success.

        NOTE(review): SQL is still built by string interpolation — switch to
        a parameterized query if MysqlClient supports it.
        """
        name = self.username.get()
        secret = self.password.get()
        # bugfix: the template used to contain a literal masked password
        # ('******') with only one %s placeholder, so the two-argument `%`
        # raised TypeError before the query ever ran.
        sql = "select * from user where(name='%s' and password='%s')" % (
            name, secret)
        find_res = self.mysqlClient.find_one(sql)
        if find_res:
            self.page.destroy()
            MainPage(self.root)
            config.USERNAME = find_res[1]
        else:
            showinfo(title='错误', message='账号或密码错误!')

    def register(self):
        """Swap this page for the registration page."""
        self.page.destroy()
        RegPage(self.root)
def __init__(self):
    # single MySQL client used by this instance
    self.Mysql = MysqlClient()
def save_to_mysql(self, query, title, url):
    """Persist one search hit (query, title, url) via a fresh client."""
    client = MysqlClient()
    client.add(query, title, url)
def save_to_mysql(self, query, title, url):
    """Store the (query, title, url) record in MySQL."""
    MysqlClient().add(query, title, url)
def __init__(self):
    # downloader, MySQL store and Redis queue used by this component
    self.download = Download()
    self.db = MysqlClient()
    self.redisClient = RedisClient()
class Scheduler(object):
    """Job-listing crawler: one-off region bootstraps (shi -> qu -> zhen),
    a Redis-fed list-page crawl, and per-job detail pages upserted into the
    `positions` table."""

    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()
        self.redisClient = RedisClient()

    def run(self):
        """Entry point; only the position crawl is active — the region and
        seeding steps are one-off bootstraps kept commented out."""
        #self.get_qu()
        #self.get_zhen()
        # self.push_url_to_redis()
        self.get_position()

    def get_qu(self):
        """For every city ('shi') row, scrape its district ('qu') list and
        save each district into the `qu` table."""
        sql = 'select * from shi'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[2]
            url = SHI_URL.format(shi_id='c' + shi_id)
            print(url)
            html = self.download.get_html(url)
            # NOTE(review): the None check comes after .status_code is read,
            # so a None response would raise before reaching it
            if html.status_code == 200 and html is not None:
                html = HTML(html.text)
                qu_id_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/@href'
                )
                qu_name_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/text()'
                )
                # skip the first entry (the "all districts" pseudo-item)
                for qu_id, name in zip(qu_id_list[1:], qu_name_list[1:]):
                    qu_id = qu_id.split('/')
                    qu_id = qu_id[2]
                    sql = '''insert into qu(pid,qu_id,name) VALUES ('{pid}','{qu_id}','{name}')'''\
                        .format(pid=shi_id,qu_id=qu_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('该url无数据')

    def get_zhen(self):
        """For every district ('qu') row, scrape its township ('zhen') list
        and save each into the `zhen` table."""
        sql = 'select * from qu'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[1]
            qu_id = res[2]
            url = QU_URL.format(shi_id='c' + shi_id, qu_id=qu_id)
            print(url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                zhen_id_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/@href'
                )
                zhen_name_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/text()'
                )
                # skip the first entry (the "all areas" pseudo-item)
                for zhen_id, name in zip(zhen_id_list[1:], zhen_name_list[1:]):
                    zhen_id = zhen_id.split('/')
                    zhen_id = zhen_id[2]
                    sql = '''insert into zhen(pid,qu_id, zhen_id,name) VALUES ('{pid}','{qu_id}','{zhen_id}','{name}')'''\
                        .format(pid=shi_id,qu_id=qu_id,zhen_id=zhen_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('该url无数据')

    def get_position(self):
        """Pop one seed URL from the Redis 'employment' queue and walk its
        list pages (at most 10), queueing same-day, not-yet-stored jobs for
        detail crawling."""
        redis_results = self.redisClient.pop('employment')
        try:
            json_obj = json.loads(redis_results[1].decode('utf8'))
        except:
            # nothing (or garbage) on the queue — give up silently
            return None
        if json_obj:
            flag = True
            pageToken = 1
            # paging loop over the result list
            while flag:
                detail_url_list = []
                url = json_obj['url']
                pre_page = re.search('\/\?page=(.*?)&', url).group(1)
                # hard page cap
                if int(pageToken) > 10:
                    break
                # rewrite the page number inside the seed URL
                url = url.replace(
                    'page=' + pre_page + '&sort=2&ka=page-' + pre_page,
                    'page=' + str(pageToken) + '&sort=2&ka=page-' + str(pageToken))
                cityId = json_obj['cityId']
                zhiweiId = json_obj['zhiweiId']
                print(url)
                html = self.download.get_html(url)
                if html is not None and html.status_code == 200:
                    html = HTML(html.text)
                    # only fetch detail pages for jobs posted today that are
                    # not already in the database
                    li_xpath = html.xpath('//div[@class="job-list"]/ul/li')
                    for li in li_xpath:
                        content = etree.tostring(li)
                        content = HT.unescape(content.decode())
                        content = HTML(content)
                        li_time = content.xpath(
                            'string(//div[@class="info-publis"]/p)')
                        href_url = content.xpath(
                            'string(//div[@class="info-primary"]//h3/a/@href)')
                        try:
                            last_str = li_time.split('发布于')[1]
                            # an HH:MM timestamp means "posted today"
                            minute = last_str.split(':')[1]
                            if minute:
                                # does the database already have this job?
                                try:
                                    cid = re.match('^/job_detail/(.*?)\.html',
                                                   href_url).group(1)
                                    sql = "select * from positions where cid='%s'" % (
                                        cid)
                                    find_one_res = self.db.find_one(sql)
                                    if find_one_res is None:
                                        # insert the cid first so concurrent
                                        # runs don't re-crawl it
                                        sql = "insert into positions(cid) values ('%s')" % (
                                            cid)
                                        self.db.save(sql)
                                        detail_url_list.append(
                                            config.HOST_URL + href_url)
                                    elif find_one_res[2] is None:
                                        # cid placeholder exists but details
                                        # were never filled in — retry it
                                        detail_url_list.append(
                                            config.HOST_URL + href_url)
                                    else:
                                        print('数据库存在该记录:' + str(cid))
                                except:
                                    print('查询数据库出错:' + str(cid))
                        except:
                            print('该URL发布日期小于当天:' + config.HOST_URL + href_url)
                    results = self.get_detail(detail_url_list, cityId, zhiweiId)
                    # turn the page only while the last item is still same-day
                    try:
                        last_li = html.xpath(
                            'string(//div[@class="job-list"]/ul/li[last()]//div[@class="info-publis"]/p)'
                        )
                        last_str = last_li.split('发布于')[1]
                        minute = last_str.split(':')[1]
                        if minute:
                            pageToken = str(int(pageToken) + 1)
                    except:
                        flag = False
                else:
                    print('该url无数据')

    def get_detail(self, detail_url_list, cityId, zhiweiId):
        """Fetch each queued detail page, parse the job fields and upsert
        the full row into `positions`.

        :param detail_url_list: absolute detail-page URLs to fetch
        :param cityId: city id carried through from the seed
        :param zhiweiId: job-type id carried through from the seed
        """
        for url in detail_url_list:
            print('下载该详情页:' + url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                try:
                    cid = re.match(
                        '^https://www.zhipin.com/job_detail/(.*?)\.html',
                        url).group(1)
                except:
                    print('获取cid失败')
                    continue
                title = html.xpath('string(//h1)')
                url = url
                try:
                    publishDateStr = html.xpath(
                        'string(//span[@class="time"])').split('发布于')[1]
                    publishDate = int(
                        time.mktime(
                            time.strptime(publishDateStr, "%Y-%m-%d %H:%M")))
                except:
                    publishDateStr = None
                    publishDate = None
                try:
                    # banner line holds "city:xx 经验:yy 学历:zz"
                    info = html.xpath(
                        'string(//div[@class="job-banner"]//div[@class="info-primary"]/p)'
                    )
                    info = info.split(':')
                    city = info[1][:-2]
                    jingyan = info[2][:-2]
                    xueli = info[3]
                except:
                    city = None
                    jingyan = None
                    xueli = None
                price = html.xpath(
                    'string(//div[@class="info-primary"]//span[@class="badge"])'
                )
                posterName = html.xpath('string(//h2)')
                posterId = None
                posterUrl = html.xpath(
                    'string(//div[@class="detail-figure"]/img/@src)')
                content = html.xpath(
                    'string(//div[@class="job-sec"]/div[@class="text"])'
                ).strip()
                try:
                    company_text = html.xpath(
                        'string(//a[@ka="job-cominfo"]/@href)')
                    companyID = re.match('/gongsi/(.*?)\.html',
                                         company_text).group(1)
                except:
                    companyID = None
                createDate = int(time.time())
                # keep only jobs whose publish time falls inside today
                temp_time = time.localtime(int(time.time()))
                now_DateStr = time.strftime("%Y-%m-%d", temp_time)
                lt = time.strptime(now_DateStr, "%Y-%m-%d")
                now_timestamp = int(time.mktime(lt))
                if publishDate == None or publishDate < now_timestamp or publishDate >= (
                        now_timestamp + 86400):
                    print('特例.该url不是当天发布:' + str(url))
                    continue
                res_obj = {
                    'cid': cid,
                    'title': title,
                    'url': url,
                    'publishDateStr': publishDateStr,
                    'publishDate': publishDate,
                    'city': city,
                    'jingyan': jingyan,
                    'xueli': xueli,
                    'price': price,
                    'posterName': posterName,
                    'posterId': posterId,
                    'posterUrl': posterUrl,
                    'content': content,
                    'companyID': companyID,
                    'createDate': createDate,
                    'cityId': cityId,
                    'zhiweiId': zhiweiId
                }
                print(res_obj)
                # NOTE(review): SQL built by interpolation from scraped text —
                # injection/quoting hazard; parameterize if the client allows
                sql = "insert into positions(cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)" \
                    " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                    % (cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)\
                    + "ON DUPLICATE KEY UPDATE title='%s', url='%s', publishDate='%s', publishDateStr='%s', city='%s', jingyan='%s', xueli='%s', price='%s', posterName='%s', posterId='%s', posterUrl='%s', content='%s', companyID='%s', createDate='%s',cityId='%s', zhiweiId='%s'" \
                    %(title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)
                self.db.save(sql)
            else:
                print('请求详情页失败:' + str(url))

    def push_url_to_redis(self):
        """Seed the Redis 'employment' queue with one list-page URL per
        (city, job-type) pair."""
        # earlier zhen-level seeding, kept for reference:
        # zhiwei_list = []
        # zhiwei_sql = 'select * from zhiwei'
        # zhiwei_results = self.db.find_all(zhiwei_sql)
        # for zhiwei in zhiwei_results:
        #     zhiwei_list.append(zhiwei[2])
        # zhen_sql = 'select * from zhen'
        # zhen_results = self.db.find_all(zhen_sql)
        # for res in zhen_results:
        #     pid = res[1]
        #     zhen_id = res[2]
        #     for zhiwei_id in zhiwei_list:
        #         url = POSITION_URL.format(pid=pid, zhen_id=zhen_id, zhiwei_id=zhiwei_id, pageToken='1')
        #         self.redisClient.push('employment',url)
        zhiwei_list = []
        zhiwei_sql = 'select * from zhiwei'
        zhiwei_results = self.db.find_all(zhiwei_sql)
        for zhiwei in zhiwei_results:
            zhiwei_list.append(zhiwei[2])
        shi_sql = 'select * from shi'
        shi_results = self.db.find_all(shi_sql)
        for res in shi_results:
            pid = res[2]
            for zhiwei_id in zhiwei_list:
                url = NEW_POSITION_URL.format(pid=pid, zhiwei_id=zhiwei_id,
                                              pageToken='1')
                url_obj = {"url": url, "cityId": pid, "zhiweiId": zhiwei_id}
                self.redisClient.push('employment', json.dumps(url_obj))
class Scheduler(object):
    """Amazon keyword crawler: walks search-result pages, fetches each
    product detail page, and upserts product, image and follow-sale rows
    into MySQL."""

    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()
        # self.redisClient = RedisClient()

    def run(self):
        """Entry point; currently runs the bestseller crawl only."""
        bestseller = get_bestseller.Bestseller()
        bestseller.start()
        # keyword crawl, disabled:
        # for i in range(1,11):
        #     self.get_kw('apple',str(i))

    def get_kw(self, kw, page):
        """Crawl one search-result page for *kw* and ingest every product
        linked from it.

        :param kw: search keyword
        :param page: result page number (string)
        """
        url = 'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords={kw}&page={page}'.format(
            kw=kw, page=page)
        print(url)
        response = self.download.get_html(url)
        if response is not None:
            html = HTML(response.text)
            urls = html.xpath('//div[@class="a-row a-spacing-small"]//a/@href')
            for url in urls:
                # only product links (they start with /gp)
                if url[:3] == '/gp':
                    url = 'https://www.amazon.com' + url
                    detail_response = self.download.get_html(url)
                    # prefer the canonical URL when the page declares one
                    try:
                        url = re.search('<link rel="canonical" href="(.*?)"',
                                        detail_response.text).group(1)
                    except:
                        url = url
                    detail_html = HTML(detail_response.text)
                    # product key is the md5 of its canonical URL
                    product_id = hashlib.md5(url.encode()).hexdigest()
                    title = detail_html.xpath('string(//h1[@id="title"])').strip()
                    price = detail_html.xpath(
                        'string(//span[@id="priceblock_ourprice"])').replace(
                            ',', '').replace('$', '')
                    if price == '':
                        price = 0
                    color = detail_html.xpath(
                        'string(//div[@id="variation_color_name"]//span)').strip()
                    size = detail_html.xpath(
                        'string(//div[@id="variation_size_name"]//span)').strip()
                    commentCount = detail_html.xpath(
                        'string(//span[@id="acrCustomerReviewText"])').split(
                            ' ')[0].replace(',', '')
                    if commentCount == '':
                        commentCount = 0
                    commentRating = detail_html.xpath(
                        'string(//a[@class="a-popover-trigger a-declarative"]/i/span)'
                    ).split(' ')[0]
                    if commentRating == '':
                        commentRating = 0
                    crawled_timestamp = int(time.time())
                    crawled_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                                 time.localtime())
                    crawled_date = time.strftime("%Y-%m-%d", time.localtime())
                    keywordtype = kw
                    # ASIN from the detail-bullets table
                    try:
                        asin = re.search(
                            '.*?productDetails_detailBullets_sections1.*?ASIN.*?<td class="a-size-base">(.*?)</td>',
                            detail_response.text,
                            re.S).group(1).strip()
                    except:
                        asin = None
                    # best-sellers category ranks (two levels)
                    # NOTE(review): if only category_res1 matches, rank2 is
                    # never assigned and the later insert raises NameError on
                    # the first product of the run
                    try:
                        category_res1 = re.search(
                            '.*?productDetails_detailBullets_sections1.*?Best Sellers Rank.*?<span>.*?(<span>.*?</span>)',
                            detail_response.text, re.S)
                        category_res2 = re.search(
                            '.*?productDetails_detailBullets_sections1.*?Best Sellers Rank.*?<span>.*?<span>.*?</span>.*?(<span>.*?</span>).*?</span>',
                            detail_response.text, re.S)
                        if category_res1:
                            html = HTML(category_res1.group(1))
                            list_res = html.xpath('//text()')
                            rank1 = ''.join(list_res)
                        if category_res2:
                            html = HTML(category_res2.group(1))
                            list_res = html.xpath('//text()')
                            rank2 = ''.join(list_res)
                    except:
                        rank1 = None
                        rank2 = None
                    # image rows: parse the embedded colorImages JSON blob
                    try:
                        imageUrls = []
                        img_res = re.search(
                            "var data = {};.*?var obj = jQuery.parseJSON\('(.*?)'\);",
                            detail_response.text, re.S)
                        img_obj = json.loads(img_res.group(1))
                        key_one = list(img_obj['colorImages'].keys())[0]
                        for data in img_obj['colorImages'][key_one]:
                            imageUrls.append(data['large'])
                        for img in imageUrls:
                            img_id = hashlib.md5(img.encode()).hexdigest()
                            img_url = img
                            sql = "insert into image(product_id,img_id,img_url,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s')" \
                                % (product_id,img_id,img_url,crawled_timestamp,crawled_time) \
                                + "ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                            print(sql)
                            self.db.save(sql)
                    except:
                        pass
                    # follow-sale (other sellers) rows
                    have_follow_sale = '0'
                    follow_sale_num = 0
                    follow_sale_str = detail_html.xpath(
                        'string(//div[@id="olp_feature_div"]/div/span)')
                    if follow_sale_str != '':
                        have_follow_sale = '1'
                        follow_sale_num = re.search('\((\d+)\)',
                                                    follow_sale_str).group(1)
                        follow_sale_url = detail_html.xpath(
                            'string(//div[@id="olp_feature_div"]/div/span/a/@href)')
                        if follow_sale_url[0:4] == 'http':
                            follow_sale_url = follow_sale_url
                        else:
                            follow_sale_url = 'https://www.amazon.com' + follow_sale_url + '&startIndex={startIndex}'
                        follow_response = self.get_follow_sale(follow_sale_url,
                                                               follow_sale_num)
                        for item in follow_response:
                            follow_sale_id = item['follow_sale_id']
                            price = item['price']
                            seller = item['seller']
                            type = item['type']
                            sql = "insert into follow_sale(product_id,follow_sale_id,price,seller,type,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s','%s','%s')" \
                                % (product_id,follow_sale_id,price,seller,type,crawled_timestamp,crawled_time) \
                                + "ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                            print(sql)
                            self.db.save(sql)
                    # product row
                    obj = {
                        'product_id': product_id,
                        'title': title,
                        'url': url,
                        'price': price,
                        'color': color,
                        'size': size,
                        'commentCount': commentCount,
                        'commentRating': commentRating,
                        # 'imageUrls': imageUrls,
                        'crawled_timestamp': crawled_timestamp,
                        'crawled_time': crawled_time,
                        'have_follow_sale': have_follow_sale,
                        'follow_sale_num': follow_sale_num,
                    }
                    print(obj)
                    sql = "insert into keyword_res(product_id,title,url,price,color,size,commentCount,commentRating,have_follow_sale,follow_sale_num,asin,rank1,rank2,crawled_timestamp,crawled_time,crawled_date,keywordtype) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')"\
                        % (product_id, title, url, price, color, size, commentCount, commentRating,have_follow_sale,follow_sale_num,asin,rank1,rank2, crawled_timestamp, crawled_time,crawled_date,keywordtype)\
                        + "ON DUPLICATE KEY UPDATE title='%s', url='%s', price='%s',commentCount='%s',crawled_timestamp='%s',crawled_time='%s',crawled_date='%s'"%(title,url,price,commentCount,crawled_timestamp,crawled_time,crawled_date)
                    print(sql)
                    self.db.save(sql)

    def get_follow_sale(self, url, follow_sale_num):
        """Page through the offer-listing URL and collect one record per
        competing seller.

        :param url: offer-listing URL with a {startIndex} placeholder
        :param follow_sale_num: total number of offers (string or 0)
        :return: list of dicts with follow_sale_id/price/seller/type
        """
        if follow_sale_num == 0:
            return []
        # ten offers per page
        if int(follow_sale_num) > 10:
            pageNum = math.ceil(int(follow_sale_num) / 10)
        else:
            pageNum = 1
        item_list = []
        for page in range(0, pageNum):
            startIndex = page * 10
            url = url.format(startIndex=startIndex)
            print(url)
            follow_response = self.download.get_html(url)
            if follow_response is None:
                return []
            follow_html = HTML(follow_response.text)
            html_list = follow_html.xpath(
                '//div[@class="a-row a-spacing-mini olpOffer"]')
            for html in html_list:
                html = etree.tostring(html).decode()
                html = HTML(html)
                price = html.xpath(
                    'string(//div[@class="a-column a-span2 olpPriceColumn"]/span)'
                ).strip().replace('$', '')
                seller = html.xpath('string(//h3/span)').strip()
                # an olpBadge marks fulfilment by Amazon (FBA)
                FBA = html.xpath('string(//div[@class="olpBadge"])')
                type = 'FBM'
                if FBA != '':
                    type = 'FBA'
                follow_sale_id = hashlib.md5(
                    (seller + price + type).encode()).hexdigest()
                obj = {
                    'follow_sale_id': follow_sale_id,
                    'price': price,
                    'seller': seller,
                    'type': type
                }
                print(obj)
                item_list.append(obj)
        return item_list
def __init__(self, master=None):
    """Initialise this frame under *master* and open a MySQL connection."""
    Frame.__init__(self, master)
    self.root = master  # handle to the enclosing root window
    self.mysqlClient = MysqlClient()
class Scheduler(object):
    """Dangdang book-search crawler: scrapes up to nine result pages for a
    keyword and rewrites the `books` table."""

    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()

    def run(self):
        """Entry point: run the interactive book crawl."""
        self.get_books()

    def get_books(self):
        """Prompt for a keyword, wipe `books`, then crawl result pages 1-9
        and insert one row per listed book."""
        kw = input('请输入要查找的书籍(例如:python编程):')
        host_url = 'http://search.dangdang.com/?key={kw}&act=input&page_index={page}'
        # drop the previous search's rows
        sql = 'delete from books'
        self.db.save(sql)
        for i in range(1, 10):
            print('当前页:'+str(i))
            start_url = host_url.format(kw=kw, page=i)
            print(start_url)
            response = self.download.get_html(start_url)
            response.encoding = 'gbk'  # the site serves GBK-encoded pages
            html = HTML(response.text)
            item_xpath_list = html.xpath('//div[@id="search_nature_rg"]/ul/li')
            for item in item_xpath_list:
                url = item.xpath('string(.//a[@name="itemlist-title"]/@href)')
                # numeric product id from the detail URL, '' when absent
                bookId = re.search('http://product.dangdang.com/(\d+).html',url)
                if bookId:
                    bookId = bookId.group(1)
                else:
                    bookId = ''
                title = item.xpath('string(.//a[@name="itemlist-title"]/@title)').strip()
                now_price = item.xpath('string(.//span[@class="search_now_price"]/text())').replace('¥','')
                old_price = item.xpath('string(.//span[@class="search_pre_price"]/text())').replace('¥','')
                discount = item.xpath('string(.//span[@class="search_discount"]/text())').replace('(','').replace(')','').replace('折','').strip()
                commentCount = item.xpath('string(.//a[@class="search_comment_num"]/text())').replace('条评论','')
                author = item.xpath('string(.//p[@class="search_book_author"]/span[1]/a/@title)')
                publishDateStr = item.xpath('string(.//p[@class="search_book_author"]/span[2]/text())').replace('/','').strip()
                publishing = item.xpath('string(.//p[@class="search_book_author"]/span[3]/a/text())')
                # NOTE(review): SQL built by interpolation from scraped text —
                # quoting/injection hazard; parameterize if the client allows
                sql = "insert into books(bookId,url,title,now_price,old_price,discount,commentCount,publishDateStr,author,publishing)" \
                      " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                      % (bookId,url,title,now_price,old_price,discount,commentCount,publishDateStr,author,publishing) \
                      + "ON DUPLICATE KEY UPDATE title='%s'" % (title)
                print(sql)
                self.db.save(sql)
class InputFrame(Frame):  # subclass of tkinter Frame
    """Dictionary lookup form backed by the spider's online translation;
    logs every query to `history` and can save results to `collection`."""

    def __init__(self, master=None):
        """
        :param master: parent Tk widget
        """
        Frame.__init__(self, master)
        self.root = master  # handle to the enclosing root window
        self.word = StringVar()
        self.mean = StringVar()
        self.createPage()
        # created after createPage(); only used from button callbacks
        self.mysqlClient = MysqlClient()
        self.spider = spider.Spider()

    def createPage(self, query_res={
            'fanyi': '',
            'phonetic': '',
            'translation': ''
    }):
        """Render the form plus the given lookup result.

        NOTE(review): the mutable default dict is shared across calls; it is
        never mutated here, but a None default would be safer.
        """
        Label(self).grid(row=0, stick=W, pady=10)
        Label(self, text='请输入: ').grid(row=1, stick=W, pady=10)
        Entry(self, textvariable=self.word, width=40).grid(row=2, stick=W)
        Label(self, text='结果如下: ').grid(row=4, stick=W, pady=10)
        Label(self, text=query_res['fanyi'], height=2, width=40,
              justify='left').grid(row=5, stick=W, pady=10)
        Label(self, text=query_res['phonetic'], height=2, width=40,
              justify='left').grid(row=6, stick=W, pady=10)
        Label(self, text=query_res['translation'], height=2, width=40,
              justify='left').grid(row=7, stick=W, pady=10)
        Button(self, text='查询', command=self.query_word).grid(row=10, column=1,
                                                              stick=E, pady=10)
        Button(self, text='收藏', command=self.collect_word).grid(row=10, column=2,
                                                                stick=E, pady=10)

    def query_word(self):
        """Look the entered word up online and append it to `history`."""
        word = self.word.get()
        # '1' = English input
        eng = '1'
        # first char in the CJK range means Chinese input
        if '\u4e00' <= word[0] <= '\u9fff':
            eng = '0'
        query_res = self.spider.get_dic(word, eng=eng)
        self.createPage(query_res)
        # append to the history log
        name = config.USERNAME
        mean = json.dumps(query_res['fanyi'])
        mytime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        timestamp = int(time.time())
        sql = "insert into history(word,mean,name,time,timestamp) values('%s','%s','%s','%s','%s')" % (
            word, mean, name, mytime, timestamp)
        print(sql)
        self.mysqlClient.save(sql)

    def collect_word(self):
        """Look the entered word up online and save it to the collection."""
        word = self.word.get()
        # '1' = English input
        eng = '1'
        # first char in the CJK range means Chinese input
        if '\u4e00' <= word[0] <= '\u9fff':
            eng = '0'
        query_res = self.spider.get_dic(word, eng=eng)
        self.createPage(query_res)
        # save into the user's collection
        name = config.USERNAME
        mean = json.dumps(query_res)
        mytime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        timestamp = int(time.time())
        sql = "insert into collection(word,mean,name,time,timestamp) values('%s','%s','%s','%s','%s')" % (
            word, mean, name, mytime, timestamp)
        save_res = self.mysqlClient.save(sql)
        if save_res:
            showinfo(title='成功', message='收藏成功')
        else:
            showinfo(title='失败', message='收藏失败!')
def get_conn():
    """Return a fresh MysqlClient connection."""
    return MysqlClient()
def __init__(self):
    # MySQL store and spider used by this component
    self.mysql = MysqlClient()
    self.spider = Spider()
class InputFrame(Frame):  # subclass of tkinter Frame
    """Dictionary lookup form: tries the local `words` table first and falls
    back to the spider's online translation; logs queries to `history` and
    can save results to `collection`."""

    def __init__(self, master=None):
        """
        :param master: parent Tk widget
        """
        Frame.__init__(self, master)
        self.root = master  # handle to the enclosing root window
        self.word = StringVar()
        self.mean = StringVar()
        self.createPage()
        # created after createPage(); only used from button callbacks
        self.mysqlClient = MysqlClient()
        self.spider = spider.Spider()

    def createPage(self, query_res={
            'fanyi': '',
            'phonetic': '',
            'translation': ''
    }):
        """Render the form plus the given lookup result.

        NOTE(review): the mutable default dict is shared across calls; it is
        never mutated here, but a None default would be safer.
        """
        Label(self).grid(row=0, stick=W, pady=10)
        Label(self, text='请输入: ').grid(row=1, stick=W, pady=10)
        Entry(self, textvariable=self.word, width=40).grid(row=2, stick=W)
        Label(self, text='结果如下: ').grid(row=4, stick=W, pady=10)
        Label(self, text='翻译:' + query_res['fanyi'], height=2, width=40,
              justify='left').grid(row=5, stick=W, pady=10)
        Label(self, text='发音:' + query_res['phonetic'], height=2, width=40,
              justify='left').grid(row=6, stick=W, pady=10)
        Label(self, text='其他:' + query_res['translation'], height=2, width=40,
              justify='left').grid(row=7, stick=W, pady=10)
        Button(self, text='查询', command=self.query_word).grid(row=10, column=1,
                                                              stick=E, pady=10)
        Button(self, text='收藏', command=self.collect_word).grid(row=10, column=2,
                                                                stick=E, pady=10)

    def query_word(self):
        """Look the word up (local DB first, then online) and log it to
        `history`."""
        word = self.word.get()
        # '1' = English input
        eng = '1'
        # first char in the CJK range means Chinese input
        if '\u4e00' <= word[0] <= '\u9fff':
            eng = '0'
        # check the local dictionary before hitting the 360 translate API
        sql1 = "select * from words where en_word like '%s'" % (word)
        find_res1 = self.mysqlClient.find_one(sql1)
        sql2 = "select * from words where cn_word like '%s'" % (word)
        find_res2 = self.mysqlClient.find_one(sql2)
        if find_res1:
            query_res = {
                'fanyi': find_res1[2],
                'phonetic': '',
                'translation': '',
            }
            self.createPage(query_res)
        elif find_res2:
            query_res = {
                'fanyi': find_res2[1],
                'phonetic': '',
                'translation': '',
            }
            self.createPage(query_res)
        else:
            query_res = self.spider.get_dic(word, eng=eng)
            if query_res:
                self.createPage(query_res)
            else:
                showinfo(title='查询失败', message='查询失败,请检查您的网络')
        # append to the history log
        # NOTE(review): runs even when the online lookup failed, in which
        # case query_res may be falsy and the subscript below raises
        name = config.USERNAME
        mean = json.dumps(query_res['fanyi'])
        mytime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        timestamp = int(time.time())
        sql = "insert into history(word,mean,name,time,timestamp) values('%s','%s','%s','%s','%s')" % (
            word, mean, name, mytime, timestamp)
        print(sql)
        self.mysqlClient.save(sql)

    def collect_word(self):
        """Same lookup as query_word, but saves into `collection`."""
        word = self.word.get()
        # '1' = English input
        eng = '1'
        # first char in the CJK range means Chinese input
        if '\u4e00' <= word[0] <= '\u9fff':
            eng = '0'
        # check the local dictionary before hitting the 360 translate API
        sql1 = "select * from words where en_word like '%s'" % (word)
        find_res1 = self.mysqlClient.find_one(sql1)
        sql2 = "select * from words where cn_word like '%s'" % (word)
        find_res2 = self.mysqlClient.find_one(sql2)
        if find_res1:
            query_res = {
                'fanyi': find_res1[2],
                'phonetic': '',
                'translation': '',
            }
            self.createPage(query_res)
        elif find_res2:
            query_res = {
                'fanyi': find_res2[1],
                'phonetic': '',
                'translation': '',
            }
            self.createPage(query_res)
        else:
            query_res = self.spider.get_dic(word, eng=eng)
            if query_res:
                self.createPage(query_res)
            else:
                showinfo(title='查询失败', message='查询失败,请检查您的网络')
        # save into the user's collection
        name = config.USERNAME
        mean = str(query_res['fanyi'])
        mytime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        timestamp = int(time.time())
        sql = "insert into collection(word,mean,name,time,timestamp) values('%s','%s','%s','%s','%s')" % (
            word, mean, name, mytime, timestamp)
        print(sql)
        save_res = self.mysqlClient.save(sql)
        if save_res:
            showinfo(title='成功', message='收藏成功')
        else:
            showinfo(title='失败', message='收藏失败!')
from mitmproxy import ctx import json from db import MysqlClient from config import * import re import time import smtplib from email.mime.text import MIMEText conn = MysqlClient() import time def response(flow): url = 'https://api.huafer.cc/api/v1/schizo' if flow.request.url.startswith(url): text = flow.response.text result = json.loads(text) if result.get('obj') and result.get('obj').get('items'): items = result.get('obj').get('items') second = 0 for x in items: second += 1 if second <= 3: if x.get('item') and x.get('counts') and x.get('user'): info1 = x.get('item') info2 = x.get('counts') info3 = x.get('user') sellers = {} stuff_data = {} sellers['STUFFID'] = info1.get('goodsId') # 宝贝id sellers['ADDRESS'] = None # 卖家地址
def __init__(self, website='default'):
    """
    :param website: site key passed through to MysqlClient to select the
        backing store
    """
    self.website = website
    self.mysql_client = MysqlClient(website)