Example #1
#stocks.py
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from nasdaq.items import NasdaqItem

#instantiates the mail sender
from scrapy.mail import MailSender
mailer = MailSender()


#nasdaq class
class nasdaq(CrawlSpider):
    name = "nasdaq"
    allowed_domains = ["nasdaq.com"]
    start_urls = ["http://nasdaq.com/symbol/intc/after-hours"]

    # Extracts a page of trade detail; the first rule follows links to the exact page address
    rules = [
        Rule(LinkExtractor(
            allow=("http://nasdaq.com/symbol/intc/after-hours")),
             follow=True),
        Rule(LinkExtractor(allow=()), callback='parse_item')
    ]

    # Parse function for extracting the stock ticker name, current price, and date stamp,
    # where stn = stock ticker name, cp = current price, dstmp = date stamp.
    # XPath is used to get the exact data wanted: stn, cp, dstmp.
    def parse_item(self, response):
        # log message to show scrapy is extracting
        self.log('Hi! I am going to start extracting now')
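
A minimal sketch of how parse_item might continue, assuming the NasdaqItem fields stn, cp, and dstmp named in the comments above; the XPath selectors are hypothetical placeholders, not the real nasdaq.com markup:

        item = NasdaqItem()
        # hypothetical selectors -- inspect the live page for the real ones
        item['stn'] = response.xpath('//div[@id="qwidget_pageheader"]/h1/text()').get()
        item['cp'] = response.xpath('//div[@id="qwidget_lastsale"]/text()').get()
        item['dstmp'] = response.xpath('//span[@id="qwidget_markettime"]/text()').get()
        return item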
Example #2
    def process_item(self, item, spider):
        if isinstance(item, HiltonAvailabilityItem):
            table_name = 'hotel_availability'
            sql = self.make_sql_from_item(item, table_name)
            self.cursor.execute(sql)  # execute the SQL
            print '-------------------' + str(
                self.cursor.rowcount) + '----------------'
            self.connection.commit()  # commit the write

            #---------------push notice start----------
            sql = 'SELECT openid,email,notice_times,max_times FROM weixinuser WHERE openid IN (SELECT openid FROM noticeset WHERE date={} AND ctyhocn={} AND points={} AND (latest_notice_time is NULL OR HOUR(TIMEDIFF(Now(),latest_notice_time))>3))'.format(
                '"' + item['date'] + '"', '"' + item['ctyhocn'] + '"',
                '"' + item['points'] + '"')
            self.cursor.execute(sql)
            result_tuple = self.cursor.fetchall()
            print 'noticeset match query result:'
            print result_tuple
            if result_tuple:
                mailsender0 = MailSender('smtp.qq.com',
                                         '*****@*****.**',
                                         '95605319',
                                         'lhytrhunlcahg',
                                         465,
                                         smtpssl=True)
                mailsender1 = MailSender('smtp.exmail.qq.com',
                                         '*****@*****.**',
                                         '*****@*****.**',
                                         'hytrhy',
                                         465,
                                         smtpssl=True)
                mailsender2 = MailSender('smtp.exmail.qq.com',
                                         '*****@*****.**',
                                         '*****@*****.**',
                                         'yhytrhyt',
                                         465,
                                         smtpssl=True)
                mailsender3 = MailSender('smtp.exmail.qq.com',
                                         '*****@*****.**',
                                         '*****@*****.**',
                                         'y6y65',
                                         465,
                                         smtpssl=True)
                mailsender4 = MailSender('smtp.exmail.qq.com',
                                         '*****@*****.**',
                                         '*****@*****.**',
                                         'y6htrhy',
                                         465,
                                         smtpssl=True)
                mailsender5 = MailSender('smtp.exmail.qq.com',
                                         '*****@*****.**',
                                         '*****@*****.**',
                                         'y6545y6',
                                         465,
                                         smtpssl=True)
                mailsender6 = MailSender('smtp.exmail.qq.com',
                                         '*****@*****.**',
                                         '*****@*****.**',
                                         'y64y6',
                                         465,
                                         smtpssl=True)
                mailsender7 = MailSender('smtp.exmail.qq.com',
                                         '*****@*****.**',
                                         '*****@*****.**',
                                         'y65yy6',
                                         465,
                                         smtpssl=True)
                # pick a sender at random (choice comes from the random module);
                # mailsender0 and mailsender1 are defined above but left out of the rotation
                mailsender = choice([
                    mailsender2, mailsender3, mailsender4, mailsender5,
                    mailsender6, mailsender7
                ])
                maillist = []
                openid_list = []
                for tup in result_tuple:
                    openid = tup[0]
                    email = tup[1]
                    notice_times = tup[2]
                    max_times = tup[3]
                    print email, notice_times, max_times
                    if notice_times < max_times:  # skip users who already hit their max send count
                        maillist.append(email)
                        openid_list.append(openid)
                if maillist:
                    to = maillist
                    subject = (self.hilton_codes[item['ctyhocn']] +
                               u' has standard reward rooms available').encode('utf-8')
                    body = (self.hilton_codes[item['ctyhocn']] + '---' +
                            item['date'] + '---' + item['points'] +
                            u'---go book it').encode('utf-8')
                    mailsender.send(to, subject, body)
                    openids = '(' + ','.join('"' + openid + '"'
                                             for openid in openid_list) + ')'
                    # use a MySQL self-join to bump the user's total email send count
                    sql = 'UPDATE weixinuser AS a,weixinuser AS b SET a.notice_times=b.notice_times+1 WHERE a.openid=b.openid AND a.openid IN {}'.format(
                        openids)
                    # update the triggered rule's latest notice time in the noticeset table
                    sql2 = 'UPDATE noticeset set latest_notice_time=Now() where date={} AND ctyhocn={} AND points={}'.format(
                        '"' + item['date'] + '"', '"' + item['ctyhocn'] + '"',
                        '"' + item['points'] + '"')
                    self.cursor.execute(sql)
                    self.cursor.execute(sql2)
                    self.connection.commit()  # commit the writes
            #---------------push notice end------------

        if isinstance(item, HiltonDetailItem):
            table_name = 'hotel_detail'
            sql = self.make_sql_from_item(item, table_name)
            self.cursor.execute(sql)  # execute the SQL
            self.connection.commit()  # commit the write
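
The queries above splice values into SQL with string formatting, which is injection-prone; a minimal sketch of the same SELECT with DB-API parameter binding, assuming a MySQLdb/pymysql-style cursor (placeholder %s):

def fetch_notice_targets(cursor, item):
    # the driver quotes and escapes the bound values itself
    sql = ('SELECT openid, email, notice_times, max_times FROM weixinuser '
           'WHERE openid IN (SELECT openid FROM noticeset '
           'WHERE date=%s AND ctyhocn=%s AND points=%s '
           'AND (latest_notice_time IS NULL '
           'OR HOUR(TIMEDIFF(Now(), latest_notice_time)) > 3))')
    cursor.execute(sql, (item['date'], item['ctyhocn'], item['points']))
    return cursor.fetchall()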
Example #3
# -*- coding: utf-8 -*-
from scrapy.mail import MailSender

# a bare MailSender() picks up the MAIL_* defaults from the project settings
mailer = MailSender()
# or configure one explicitly:
mailer = MailSender(smtphost='smtp.exmail.qq.com', mailfrom='*****@*****.**', smtpuser='******', smtppass='******', smtpport=25)
body = 'Go check \[email protected]:/home/ubuntu/ruyi-scrapy/xmly/xmly_to_delete.json'
mailer.send(to=["*****@*****.**"], subject="XMLY delisted resource list", body=body)
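
Rather than hard-coding SMTP credentials, a MailSender can be built from the project settings (MAIL_HOST, MAIL_FROM, MAIL_USER, MAIL_PASS, MAIL_PORT); a minimal sketch using the documented from_settings classmethod:

from scrapy import Spider
from scrapy.mail import MailSender

class ReportSpider(Spider):
    name = 'report'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # reads the MAIL_* values from the crawler's settings
        spider.mailer = MailSender.from_settings(crawler.settings)
        return spider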
Example #4
class test_create_gov_scriptSpider(Spider):
    name = 'test_create_gov_script'
    allowed_domains = ['wzkj.wenzhou.gov.cn']
    start_urls = ['http://wzkj.wenzhou.gov.cn/']
    # content-matching rules, kept in the source language: (pattern, correct phrase)
    rules = [
        ('关于下达温州市201[\s\S]{1}年公益性科技计划项目', '关于下达温州市2017年公益性科技计划项目'),
    ]
    html_link_extractor = HtmlLinkExtractor()
    error_correction_extractor = ErrorCorrectionExtractor(
        rules, domain='wzkj.wenzhou.gov.cn')
    blank_html_extractor = BlankHtmlExtractor()
    mailer = MailSender(smtphost='smtp.qq.com',
                        mailfrom='*****@*****.**',
                        smtpport=465,
                        smtpssl=True,
                        smtpuser='******',
                        smtppass='******')
    custom_settings = {
        # 'CONCURRENT_REQUESTS_PER_DOMAIN' : 4,
        'LOG_LEVEL': 'INFO'
        # 'DOWNLOAD_DELAY': 0.3,
    }

    def parse(self, response: TextResponse):
        is_blank = self.blank_html_extractor.is_blank(response)
        if is_blank:
            blank_result = {
                'type': 'gov',
                'reason': 'page content is empty',
                'url': response.url
            }
            render_dict = {
                'title': '(sent by PyScraper) erroneous website',
                'url': response.url,
                'tablehead': ['error reason'],
                'table_data': blank_result['reason']
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(sent by PyScraper) website error-correction report',
                             body=body,
                             mimetype='text/html')
            yield blank_result

        error_correction_result = self.error_correction_extractor.find_error(
            response)
        if error_correction_result:
            print("error_correction_result", error_correction_result)
            render_dict = {
                'title': '(sent by PyScraper) erroneous website',
                'url': response.url,
                'tablehead': ['correct word', 'wrong word'],
                'table_data': error_correction_result
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(sent by PyScraper) website error-correction report',
                             body=body,
                             mimetype='text/html')

            yield {
                'type': 'gov',
                'reason': 'page contains wrong words',
                'url': response.url
            }

        links: List[Link] = list(self.html_link_extractor.extract_links(response))
        for link in links:
            yield Request(link.url,
                          callback=self.parse,
                          errback=self.errorback)
        """
        获取dataproxy接口的链接
        """
        data_proxy_extractor = DataProxyXmlLinkExtractor()
        if data_proxy_extractor.has_dataproxy_link(response):
            yield data_proxy_extractor.gen_dataproxy_links()

    def errorback(self, failure):
        if isinstance(failure.value, HttpError):
            response = failure.value.response
            result = {
                'type': 'gov',
                'reason': 'page inaccessible, status {}'.format(response.status),
                'url': response.url
            }
            yield result

            render_dict = {
                'title': '(sent by PyScraper) erroneous website',
                'url': response.url,
                'tablehead': ['error reason'],
                'table_data': result['reason']
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(sent by PyScraper) website error-correction report',
                             body=body,
                             mimetype='text/html')

        print('request failed:', failure)
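
The errback above only recognizes HttpError; the errback pattern in the Scrapy docs uses failure.check to branch on the other common failure types too; a minimal sketch:

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError

def errorback(self, failure):
    # failure.check returns the matching exception class, or None
    if failure.check(HttpError):
        response = failure.value.response
        self.logger.error('HTTP %s on %s', response.status, response.url)
    elif failure.check(DNSLookupError):
        self.logger.error('DNS lookup failed for %s', failure.request.url)
    elif failure.check(TimeoutError, TCPTimedOutError):
        self.logger.error('request timed out for %s', failure.request.url)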
Example #5
class a2018611wzskjjSpider(Spider):
    name = 'a2018611wzskjj'
    allowed_domains = ['wzkj.wenzhou.gov.cn']
    start_urls = ['http://wzkj.wenzhou.gov.cn/']
    # content-matching rule, kept in the source language: [pattern, correct phrase]
    rules = [['温州市201*', '温州市2017']]
    html_link_extractor = HtmlLinkExtractor()
    error_correction_extractor = ErrorCorrectionExtractor(
        rules, domain='wzkj.wenzhou.gov.cn')
    blank_html_extractor = BlankHtmlExtractor()
    mailer = MailSender(smtphost='smtp.qq.com',
                        mailfrom='*****@*****.**',
                        smtpport=465,
                        smtpssl=True,
                        smtpuser='******',
                        smtppass='******')
    custom_settings = {
        # 'CONCURRENT_REQUESTS_PER_DOMAIN' : ,
        'LOG_LEVEL': 'DEBUG'
        # 'DOWNLOAD_DELAY': 0.3,
    }

    def parse(self, response: TextResponse):
        request_url = response.meta.get("url")
        first_url = response.meta.get("first_url")
        response_is_blank = self.blank_html_extractor.is_blank(response)
        if response_is_blank:
            blank_result = {
                'type': 'gov',
                'reason': 'page content is empty',
                'url': request_url or response.url,
                'first_url': first_url
            }
            yield blank_result

            render_dict = {
                'title': '(sent by PyScraper) erroneous website',
                'url': request_url or response.url,
                'table_head': ['error reason'],
                'table_data': blank_result['reason']
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(sent by PyScraper) website error-correction report',
                             body=body,
                             mimetype='text/html')

        error_correction_result = self.error_correction_extractor.find_error(
            response)
        if error_correction_result:
            print("error_correction_result", error_correction_result)

            message = "\n".join([
                "correct word: {} wrong word: {}".format(error['correct'], error['error'])
                for error in error_correction_result
            ])
            yield {
                'type': 'gov',
                'reason': 'page contains wrong words:\n' + message,
                'url': request_url or response.url
            }

            render_dict = {
                'title': '(sent by PyScraper) erroneous website',
                'url': request_url or response.url,
                'table_head': ['correct word', 'wrong word'],
                'table_data': error_correction_result
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(sent by PyScraper) website error-correction report',
                             body=body,
                             mimetype='text/html')

        links: List[Link] = list(self.html_link_extractor.extract_links(response))
        for link in links:
            yield Request(link.url,
                          callback=self.parse,
                          errback=self.errorback,
                          meta={
                              "url": link.url,
                              "first_url": request_url or response.url
                          })
        """
        获取dataproxy接口的链接
        """
        data_proxy_extractor = DataProxyXmlLinkExtractor()
        if data_proxy_extractor.has_dataproxy_link(response):
            yield data_proxy_extractor.gen_dataproxy_links()

    def errorback(self, failure):

        if isinstance(failure.value, HttpError):
            response = failure.value.response
            request_url = response.meta.get("url")
            first_url = response.meta.get("first_url")

            result = {
                'type': 'gov',
                'reason': 'page inaccessible, status {}'.format(response.status),
                'url': request_url or response.url,
                'first_url': first_url
            }
            yield result

            render_dict = {
                'title': '(sent by PyScraper) erroneous website',
                'url': first_url or response.url,
                'table_head': ['error reason'],
                'table_data': result['reason']
            }
            body = render_error_correction_result_mail(**render_dict)
            # self.mailer.send(to=["*****@*****.**"], subject='(sent by PyScraper) website error-correction report', body=body, mimetype='text/html')

        print('response error:', failure)
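
Example #5 threads url and first_url through response.meta; since Scrapy 1.7 the same bookkeeping can be done more explicitly with cb_kwargs; a minimal sketch of the link-following loop rewritten that way:

    def parse(self, response, first_url=None):
        # cb_kwargs come back into the callback as keyword arguments
        for link in self.html_link_extractor.extract_links(response):
            yield Request(link.url,
                          callback=self.parse,
                          errback=self.errorback,
                          cb_kwargs={'first_url': first_url or response.url})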
Example #6
def send_mail(self, subject, content):
    mailer = MailSender()
    mailer.send(to=["*****@*****.**"],
                subject=subject,
                body=content,
                cc=["*****@*****.**"])
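
MailSender.send also accepts attachments; per the documented signature, attachs is an iterable of (attach_name, mimetype, file_object) tuples; a minimal sketch with a made-up CSV payload and a placeholder recipient:

from io import BytesIO
from scrapy.mail import MailSender

mailer = MailSender()
report = BytesIO(b'symbol,price\nINTC,34.56\n')  # made-up payload
mailer.send(to=['[email protected]'],
            subject='Daily report',
            body='Report attached.',
            attachs=[('report.csv', 'text/csv', report)])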
Example #7
def spider_closed(self, spider):
    mailer = MailSender()
    mailer.send(to=["*****@*****.**"],
                subject="Scraping",
                body="The data has been scraped successfully",
                cc=["*****@*****.**"])
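
spider_closed only runs if it is connected to the matching signal; a minimal sketch of the standard wiring in from_crawler:

from scrapy import Spider, signals

class NotifyingSpider(Spider):
    name = 'notifying'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # connect the callback to the spider_closed signal
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        self.logger.info('spider closed: %s', spider.name)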