# stocks.py
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

from nasdaq.items import NasdaqItem

# instantiate the mail sender
from scrapy.mail import MailSender
mailer = MailSender()


# nasdaq spider class
class nasdaq(CrawlSpider):
    name = "nasdaq"
    allowed_domains = ["nasdaq.com"]
    start_urls = ["http://nasdaq.com/symbol/intc/after-hours"]

    # Extract a page of trade detail; the first rule follows the exact page address
    rules = [
        Rule(SgmlLinkExtractor(allow=("http://nasdaq.com/symbol/intc/after-hours")),
             follow=True),
        Rule(SgmlLinkExtractor(allow=()), callback='parse_item')
    ]

    # Parse callback for extracting the stock ticker name, current price and date stamp,
    # where stn = stock ticker name, cp = current price, dstmp = date stamp;
    # XPath is used to get the exact data wanted (stn, cp, dstmp).
    def parse_item(self, response):
        # log message to show Scrapy is extracting
        self.log('Hi! I am going to start extracting now')
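The scrapy.contrib package and SgmlLinkExtractor imported above were deprecated in Scrapy 1.0 and later removed. Below is a minimal sketch of the same spider against the current API; the XPath selectors in parse_item are illustrative placeholders (the original callback never got past the log call), so the real page structure would need inspecting:

# modern_stocks.py -- sketch, assumes Scrapy >= 1.8 (for Selector.get())
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class NasdaqSpider(CrawlSpider):
    name = "nasdaq"
    allowed_domains = ["nasdaq.com"]
    start_urls = ["http://nasdaq.com/symbol/intc/after-hours"]

    rules = [
        # keep crawling the after-hours quote page
        Rule(LinkExtractor(allow=(r"/symbol/intc/after-hours",)), follow=True),
        # hand every other extracted link to parse_item
        Rule(LinkExtractor(allow=()), callback="parse_item"),
    ]

    def parse_item(self, response):
        self.logger.info("starting extraction for %s", response.url)
        # hypothetical selectors -- substitute XPaths found by inspecting the page
        yield {
            "stn": response.xpath("//h1/text()").get(),
            "cp": response.xpath("//*[@id='qwidget_lastsale']/text()").get(),
            "dstmp": response.headers.get("Date", b"").decode(),
        }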
def process_item(self, item, spider):
    # requires: from random import choice; from scrapy.mail import MailSender
    if isinstance(item, HiltonAvailabilityItem):
        table_name = 'hotel_availability'
        sql = self.make_sql_from_item(item, table_name)
        self.cursor.execute(sql)  # run the INSERT
        print('-------------------' + str(self.cursor.rowcount) + '----------------')
        self.connection.commit()  # persist the write

        # --------------- push notice start ----------
        sql = ('SELECT openid,email,notice_times,max_times FROM weixinuser '
               'WHERE openid IN (SELECT openid FROM noticeset '
               'WHERE date={} AND ctyhocn={} AND points={} '
               'AND (latest_notice_time is NULL '
               'OR HOUR(TIMEDIFF(Now(),latest_notice_time))>3))').format(
                   '"' + item['date'] + '"',
                   '"' + item['ctyhocn'] + '"',
                   '"' + item['points'] + '"')
        self.cursor.execute(sql)
        result_tuple = self.cursor.fetchall()
        print('matching noticeset rows:')
        print(result_tuple)
        if result_tuple:
            mailsender0 = MailSender('smtp.qq.com', '*****@*****.**', '95605319', 'lhytrhunlcahg', 465, smtpssl=True)
            mailsender1 = MailSender('smtp.exmail.qq.com', '*****@*****.**', '*****@*****.**', 'hytrhy', 465, smtpssl=True)
            mailsender2 = MailSender('smtp.exmail.qq.com', '*****@*****.**', '*****@*****.**', 'yhytrhyt', 465, smtpssl=True)
            mailsender3 = MailSender('smtp.exmail.qq.com', '*****@*****.**', '*****@*****.**', 'y6y65', 465, smtpssl=True)
            mailsender4 = MailSender('smtp.exmail.qq.com', '*****@*****.**', '*****@*****.**', 'y6htrhy', 465, smtpssl=True)
            mailsender5 = MailSender('smtp.exmail.qq.com', '*****@*****.**', '*****@*****.**', 'y6545y6', 465, smtpssl=True)
            mailsender6 = MailSender('smtp.exmail.qq.com', '*****@*****.**', '*****@*****.**', 'y64y6', 465, smtpssl=True)
            mailsender7 = MailSender('smtp.exmail.qq.com', '*****@*****.**', '*****@*****.**', 'y65yy6', 465, smtpssl=True)
            # rotate across senders to spread the SMTP load
            mailsender = choice([mailsender2, mailsender3, mailsender4,
                                 mailsender5, mailsender6, mailsender7])
            maillist = []
            openid_list = []
            for tup in result_tuple:
                openid, email, notice_times, max_times = tup
                print(email, notice_times, max_times)
                if notice_times < max_times:  # stop notifying once the max send count is reached
                    maillist.append(email)
                    openid_list.append(openid)
            if maillist:
                to = maillist
                # subject/body: "<hotel> has a base-points room available ... please go book it"
                subject = (self.hilton_codes[item['ctyhocn']] + u'有空余基础积分房').encode('utf-8')
                body = (self.hilton_codes[item['ctyhocn']] + '---' + item['date'] + '---' +
                        item['points'] + u'---请去预订吧').encode('utf-8')
                mailsender.send(to, subject, body)
                openids = '(' + ','.join('"' + openid + '"' for openid in openid_list) + ')'
                # use a MySQL self-join to bump each notified user's total send count
                sql = ('UPDATE weixinuser AS a,weixinuser AS b '
                       'SET a.notice_times=b.notice_times+1 '
                       'WHERE a.openid=b.openid AND a.openid IN {}').format(openids)
                # record the latest notification time for the triggered rule in noticeset
                sql2 = ('UPDATE noticeset set latest_notice_time=Now() '
                        'where date={} AND ctyhocn={} AND points={}').format(
                            '"' + item['date'] + '"',
                            '"' + item['ctyhocn'] + '"',
                            '"' + item['points'] + '"')
                self.cursor.execute(sql)
                self.cursor.execute(sql2)
                self.connection.commit()  # persist the writes
        # --------------- push notice end ------------

    if isinstance(item, HiltonDetailItem):
        table_name = 'hotel_detail'
        sql = self.make_sql_from_item(item, table_name)
        self.cursor.execute(sql)  # run the INSERT
        self.connection.commit()  # persist the write

    return item  # pass the item on to any later pipeline stage
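Interpolating item fields straight into the SQL text (the '"' + item['date'] + '"' pattern above) is fragile and injection-prone. Below is a sketch of the same noticeset lookup using driver-side parameter binding, which both MySQLdb and pymysql support; the table and column names are taken from the snippet above, and self.cursor is the pipeline's existing cursor:

# same SELECT, but the driver escapes the values bound to each %s placeholder
sql = ("SELECT openid, email, notice_times, max_times FROM weixinuser "
       "WHERE openid IN (SELECT openid FROM noticeset "
       "WHERE date = %s AND ctyhocn = %s AND points = %s "
       "AND (latest_notice_time IS NULL "
       "OR HOUR(TIMEDIFF(Now(), latest_notice_time)) > 3))")
self.cursor.execute(sql, (item['date'], item['ctyhocn'], item['points']))
result_tuple = self.cursor.fetchall()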
# -*- coding: utf-8 -*-
from scrapy.mail import MailSender

mailer = MailSender(smtphost='smtp.exmail.qq.com',
                    mailfrom='*****@*****.**',
                    smtpuser='******',
                    smtppass='******',
                    smtpport=25)
# body: "go take a look at" plus the remote path of the delisted-resources dump
body = '快去看\[email protected]:/home/ubuntu/ruyi-scrapy/xmly/xmly_to_delete.json'
mailer.send(to=["*****@*****.**"],
            subject="XMLY 下架资源列表",  # "XMLY delisted resources list"
            body=body)
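Rather than hard-coding SMTP credentials at the call site, the sender can be built from the project settings via MailSender.from_settings(). A minimal sketch, assuming the standard MAIL_* settings are defined and that a crawler object is in scope (for example, received through a from_crawler hook):

# settings.py -- placeholder values mirroring the hard-coded ones above
MAIL_HOST = 'smtp.exmail.qq.com'
MAIL_PORT = 25
MAIL_FROM = '*****@*****.**'
MAIL_USER = '******'
MAIL_PASS = '******'

# elsewhere, wherever `crawler` is available:
from scrapy.mail import MailSender

mailer = MailSender.from_settings(crawler.settings)
mailer.send(to=["*****@*****.**"], subject="XMLY 下架资源列表", body=body)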
class test_create_gov_scriptSpider(Spider):
    name = 'test_create_gov_script'
    allowed_domains = ['wzkj.wenzhou.gov.cn']
    start_urls = ['http://wzkj.wenzhou.gov.cn/']
    # (regex pattern, correct phrase) pairs for the error-correction extractor
    rules = [
        ('关于下达温州市201[\s\S]{1}年公益性科技计划项目',
         '关于下达温州市2017年公益性科技计划项目'),
    ]
    html_link_extractor = HtmlLinkExtractor()
    error_correction_extractor = ErrorCorrectionExtractor(
        rules, domain='wzkj.wenzhou.gov.cn')
    blank_html_extractor = BlankHtmlExtractor()
    mailer = MailSender(smtphost='smtp.qq.com',
                        mailfrom='*****@*****.**',
                        smtpport=465,
                        smtpssl=True,
                        smtpuser='******',
                        smtppass='******')
    custom_settings = {
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 4,
        'LOG_LEVEL': 'INFO'
        # 'DOWNLOAD_DELAY': 0.3,
    }

    def parse(self, response: TextResponse):
        is_blank = self.blank_html_extractor.is_blank(response)
        if is_blank:
            blank_result = {
                'type': 'gov',
                'reason': '网页内容为空',  # "page content is empty"
                'url': response.url
            }
            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'url': response.url,
                'tablehead': ['错误原因'],  # "error reason"
                'table_data': blank_result['reason']
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(PyScraper发送)网站纠错情况',
                             body=body,
                             mimetype='text/html')
            yield blank_result

        error_correction_result = self.error_correction_extractor.find_error(response)
        if error_correction_result:
            print("error_correction_result", error_correction_result)
            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'url': response.url,
                'tablehead': ['正确词', '错误词'],  # "correct word", "wrong word"
                'table_data': error_correction_result
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(PyScraper发送)网站纠错情况',
                             body=body,
                             mimetype='text/html')
            yield {
                'type': 'gov',
                'reason': '网页无法访问状态{}'.format(response.status),  # "page unreachable, status ..."
                'url': response.url
            }

        links: List[Link] = [
            lnk for lnk in self.html_link_extractor.extract_links(response)
        ]
        for link in links:
            yield Request(link.url, callback=self.parse, errback=self.errorback)

        # collect links to the dataproxy API
        data_proxy_extractor = DataProxyXmlLinkExtractor()
        if data_proxy_extractor.has_dataproxy_link(response):
            yield data_proxy_extractor.gen_dataproxy_links()

    def errorback(self, failure):
        if isinstance(failure.value, HttpError):
            response = failure.value.response
            result = {
                'type': 'gov',
                'reason': '网页无法访问状态{}'.format(response.status),
                'url': response.url
            }
            yield result
            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'url': response.url,
                'tablehead': ['错误原因'],
                'table_data': result['reason']
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(PyScraper发送)网站纠错情况',
                             body=body,
                             mimetype='text/html')
        print('response is error in response.url:', failure)
class a2018611wzskjjSpider(Spider):
    name = 'a2018611wzskjj'
    allowed_domains = ['wzkj.wenzhou.gov.cn']
    start_urls = ['http://wzkj.wenzhou.gov.cn/']
    # (pattern, correct phrase) pair for the error-correction extractor
    rules = [['温州市201*', '温州市2017']]
    html_link_extractor = HtmlLinkExtractor()
    error_correction_extractor = ErrorCorrectionExtractor(
        rules, domain='wzkj.wenzhou.gov.cn')
    blank_html_extractor = BlankHtmlExtractor()
    mailer = MailSender(smtphost='smtp.qq.com',
                        mailfrom='*****@*****.**',
                        smtpport=465,
                        smtpssl=True,
                        smtpuser='******',
                        smtppass='******')
    custom_settings = {
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': ,
        'LOG_LEVEL': 'DEBUG'
        # 'DOWNLOAD_DELAY': 0.3,
    }

    def parse(self, response: TextResponse):
        request_url = response.meta.get("url")
        first_url = response.meta.get("first_url")
        response_is_blank = self.blank_html_extractor.is_blank(response)
        if response_is_blank:
            blank_result = {
                'type': 'gov',
                'reason': '网页内容为空',  # "page content is empty"
                'url': request_url or response.url,
                'first_url': first_url
            }
            yield blank_result
            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'url': request_url or response.url,
                'table_head': ['错误原因'],  # "error reason"
                'table_data': blank_result['reason']
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(PyScraper发送)网站纠错情况',
                             body=body,
                             mimetype='text/html')

        error_correction_result = self.error_correction_extractor.find_error(response)
        if error_correction_result:
            print("error_correction_result", error_correction_result)
            # each line reads "correct word: ... wrong word: ..."
            message = "\n".join([
                "正确词:{} 错误词: {}".format(error['correct'], error['error'])
                for error in error_correction_result
            ])
            yield {
                'type': 'gov',
                'reason': '网页中有错误词:\n' + message,  # "page contains wrong words"
                'url': request_url or response.url
            }
            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'url': request_url or response.url,
                'table_head': ['正确词', '错误词'],  # "correct word", "wrong word"
                'table_data': error_correction_result
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(PyScraper发送)网站纠错情况',
                             body=body,
                             mimetype='text/html')

        links: List[Link] = [
            lnk for lnk in self.html_link_extractor.extract_links(response)
        ]
        for link in links:
            yield Request(link.url,
                          callback=self.parse,
                          errback=self.errorback,
                          meta={
                              "url": link.url,
                              "first_url": request_url or response.url
                          })

        # collect links to the dataproxy API
        data_proxy_extractor = DataProxyXmlLinkExtractor()
        if data_proxy_extractor.has_dataproxy_link(response):
            yield data_proxy_extractor.gen_dataproxy_links()

    def errorback(self, failure):
        if isinstance(failure.value, HttpError):
            response = failure.value.response
            request_url = response.meta.get("url")
            first_url = response.meta.get("first_url")
            result = {
                'type': 'gov',
                'reason': '网页无法访问状态{}'.format(response.status),  # "page unreachable, status ..."
                'url': request_url or response.url,
                'first_url': first_url
            }
            yield result
            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'url': first_url or response.url,
                'table_head': ['错误原因'],
                'table_data': result['reason']
            }
            body = render_error_correction_result_mail(**render_dict)
            # self.mailer.send(to=["*****@*****.**"], subject='(PyScraper发送)网站纠错情况',
            #                  body=body, mimetype='text/html')
        print('response is error in response.url:', failure)
def send_mail(self, subject, content):
    mailer = MailSender()
    mailer.send(to=["*****@*****.**"],
                subject=subject,
                body=content,
                cc=["*****@*****.**"])
def spider_closed(self, spider):
    mailer = MailSender()
    mailer.send(to=["*****@*****.**"],
                subject="Scraping",
                body="The data has been scraped successfully",
                cc=["*****@*****.**"])
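A handler like this only runs if the spider_closed signal is actually connected; the usual place is the spider's from_crawler hook. A minimal sketch (the spider name and recipient are placeholders); note that send() returns a twisted Deferred, and returning it from the handler lets Scrapy wait for delivery before shutting the reactor down:

import scrapy
from scrapy import signals
from scrapy.mail import MailSender


class NotifyingSpider(scrapy.Spider):  # hypothetical spider
    name = "notifying"

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # run spider_closed once the crawl finishes
        crawler.signals.connect(spider.spider_closed,
                                signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        mailer = MailSender.from_settings(self.crawler.settings)
        return mailer.send(to=["*****@*****.**"],
                           subject="Scraping",
                           body="The data has been scraped successfully")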