Example #1
 def from_crawler(cls, crawler):
     recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
     mail = MailSender.from_settings(crawler.settings)
     o = cls(crawler.stats, recipients, mail)
     crawler.signals.connect(o.engine_stopped, signal=signals.engine_stopped)
     crawler.signals.connect(o.engine_started, signal=signals.engine_started)
     return o
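For orientation, a minimal sketch of the engine_stopped handler this from_crawler connects; it is not taken from the example's project and assumes the object was built as cls(crawler.stats, recipients, mail) and stores those as self.stats, self.recipients and self.mail. It simply mails a plain-text dump of the collected stats.

 def engine_stopped(self):
     # hypothetical handler: mail a plain-text dump of the crawl stats
     if not self.recipients:
         return
     body = "\n".join("%-50s : %s" % (k, v) for k, v in self.stats.get_stats().items())
     return self.mail.send(self.recipients, "Scrapy stats", body)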
Example #2
    def __init__(self, **kwargs):
        # problem report
        super(CarSpider, self).__init__(**kwargs)
        self.mailer = MailSender.from_settings(settings)
        self.counts = 0
        self.carnum = 1010000
        # Mongo
        settings.set('DOWNLOAD_DELAY', '0', priority='cmdline')
        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'newcar', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')
        self.nationp = dict()
        self.npcounts = 0
        # nation select
        self.browser = webdriver.PhantomJS(
            executable_path=settings['PHANTOMJS_PATH'])

        # desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        # proxy = webdriver.Proxy()
        # proxy.proxy_type = ProxyType.MANUAL
        # proxy.http_proxy = self.getProxy()
        # proxy.add_to_capabilities(desired_capabilities)

        # self.browser.start_session(desired_capabilities)
        # self.browser.set_page_load_timeout(12)
        # self.browser = webdriver.PhantomJS(executable_path=settings['PHANTOMJS_PATH'])
        # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
        # self.browser = webdriver.PhantomJS(executable_path="/root/home/phantomjs")
        super(CarSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #3
    def __init__(self, **kwargs):
        super(CarSpider, self).__init__(**kwargs)
        self.mailer = MailSender.from_settings(self.settings)
        self.counts = 0
        self.carnum = 800000
        self.headers = {
            "User-Agent":
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }
        self.category = {
            "7": "国家补贴",
            "2": "地方补贴",
            "1": "推广政策",
            "3": "充电桩补贴政策",
            "4": "充电价格政策",
            "9": "路权政策",
            "8": "充电设施建设规划",
            "10": "网约车政策",
            "5": "国外政策",
            "6": "其它政策",
        }

        self.settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        self.settings.set('MONGODB_DB', 'koubei', priority='cmdline')
        self.settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #4
    def __init__(self, **kwargs):
        super(CarSpider, self).__init__(**kwargs)
        self.mailer = MailSender.from_settings(settings)
        self.counts = 0
        self.carnum = 800000

        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'carbusiness', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')

        with codecs.open("D:/county.txt", "r", "utf-8") as f:
            filecontent = f.read()
            # print(filecontent)
            indexlist = re.findall("\d+\_\d+\_\d+|\d+\_\d+", filecontent)
            indexlist.append("0")
            # print(indexlist)
            datalist = re.findall("\[(.*?)\]", filecontent, re.S)
            # print(datalist)
        self.datadict = {}
        for index in indexlist:
            self.datadict[index] = datalist[indexlist.index(index)]

        # print(self.datadict)

        self.browser = webdriver.PhantomJS(
            executable_path=settings['PHANTOMJS_PATH'])
        # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
        # self.browser = webdriver.PhantomJS(executable_path="/root/home/phantomjs")
        super(CarSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #5
    def __init__(self, *args, **kwargs):
        self.running = True
        kwargs['settings'] = self.configure_image_store(kwargs['settings'])

        super(BaseSpider, self).__init__(*args, **kwargs)
        self.settings = kwargs[
            'settings']  # Without this, the settings object only exists after __init__()

        enable_profiler = False if 'ENABLE_PROFILER' not in self.settings else self.settings[
            'ENABLE_PROFILER']
        profiler.enable_all(enable_profiler)

        if 'MODE' in self.settings:
            self.replay = True if self.settings['MODE'] == 'replay' else False
        else:
            self.replay = False

        self.load_spider_settings()
        self.initlogs()
        self.configure_login()
        self.configure_proxy()
        self.add_log_filter()
        self.mailer = MailSender.from_settings(self.settings)

        if not hasattr(BaseSpider, '_allspiders'):
            BaseSpider._allspiders = {}

        if self.__class__ not in BaseSpider._allspiders:
            BaseSpider._allspiders[self.__class__] = []

        BaseSpider._allspiders[self.__class__].append(self)

        self.start_interrupt_polling()
Example #6
 def close_spider(self, spider):
     print('爬虫结束')
     # mail_list = []
     # mail_list.append(item['file_name'])
     self.fp.close()
     settings = scrapy.settings.Settings(
         {
             'MAIL_FROM': '*****@*****.**',
             'MAIL_HOST': 'smtp.sina.com',
             'MAIL_PORT': '465',
             'MAIL_USER': '******',
             'MAIL_PASS': '******',
             'MAIL_SSL': 'True'
         },
         priority='project')
     mailer = MailSender.from_settings(settings)
     print(mail_list)
     print('start mail')
     for i in mail_list:
         attach_name = i + '.txt'
         mimetype = 'text/plain'
         file_object = open('files/' + i + '.txt', 'r')
         print(i)
         mailer.send(to=['*****@*****.**'],
                     subject='convert',
                     body='',
                     cc=[''],
                     attachs=[(attach_name, mimetype, file_object)],
                     mimetype='text/plain')
Example #7
    def closed(self, reason):
        str = ''
        #conn = MySQLdb.connect(host='127.0.0.1',user='******',passwd='spider_user!@#',port=3306,db='db_spider',charset='utf8')
        #cur = conn.cursor()

        #mydict = {"name":"Lucy", "sex":"female","job":"nurse"}

        for index, item in enumerate(self.web_data_list):
            tmp = 'index:%d, userid:%s, author:%s,head_img:%s \n,age:%s,sex:%s, vote:%s,contentid:%s\n[%s]\n\n' % (
                index, item['userid'], item['author'], item['head_img'],
                item['age'], item['sex'], item['stats_vote'],
                item['contentid'], item['content'])
            str = str + tmp
            author = item['author']
            content = item['content']
            stats_vote = item['stats_vote']
            contentid = item['contentid']

        #sql="insert ignore into t_qiushi(author,content,vote,content_id) values('%s','%s','%s','%s')" % (author,content,stats_vote,contentid)
        #cur.execute(sql)
        #print str
        #conn.commit()
        #cur.close()
        #conn.close()

        # email the scraped data
        settings = get_project_settings()
        mailer = MailSender.from_settings(settings)
Example #8
File: a58.py  Project: zhuzhenping/City58
 def close(self, reason):
     """
     爬虫邮件报告状态
     """
     # 结束时间
     fnished = time.strftime('%Y-%m-%d %H:%M:%S',
                             time.localtime(time.time()))
     # 创建邮件发送对象
     mail = MailSender.from_settings(self.settings)
     # 邮件内容
     spider_name = self.settings.get('BOT_NAME')
     start_time = self.start
     success_request = self.crawler.stats.get_value("Success_Reqeust")
     failed_request = self.crawler.stats.get_value("Failed_Reqeust")
     # default to 0 when no failed requests were recorded
     if failed_request is None:
         failed_request = 0
     insert_into_success = self.crawler.stats.get_value(
         "Success_Inserted_DB")
     failed_db = self.crawler.stats.get_value("Failed_Insert_DB")
     # default to 0 when no failed inserts were recorded
     if failed_db is None:
         failed_db = 0
     finished_time = finished
     body = "爬虫名称: {}\n\n 开始时间: {}\n\n 请求成功总量:{}\n 请求失败总量:{} \n\n 数据库存储总量:{}\n 数据库存储失败总量:{}\n\n 结束时间  : {}\n".format(
         spider_name, start_time, success_request, failed_request,
         insert_into_success, failed_db, fnished_time)
     try:
         # send the email
         mail.send(to=self.settings.get('RECEIVE_LIST'),
                   subject=self.settings.get('SUBJECT'),
                   body=body)
     except Exception as e:
         self.logger.error("Send Email Existing Error, Reason: {}".format(
             e.args))
Example #9
    def close_spider(self, spider):
        mailer = MailSender.from_settings(get_project_settings())

        msg = ""
        for count, item in enumerate(self.offers):
            msg += "oferta {}:\ncompany: {}\nsalary: {}\ntitle: {}\nrequirements: {}\n\n".format(count, item["company"], item["salary"], item["title"], item["requirements"])
        mailer.send(to=[os.environ.get('MAIL_USERNAME')], subject="test", body=msg)
Example #10
 def send_mail(self, month_year):
     subject = 'Bonn: Neuer Termin frei im ' + month_year
     body = self.start_urls[0]
     # you have to set up the mail settings in your own settings.py
     # http://doc.scrapy.org/en/latest/topics/email.html#topics-email-settings
     mailer = MailSender.from_settings(self.settings)
     mailer.send(to=[self.notification_email], subject=subject, body=body)
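The comment above points at Scrapy's mail settings; a minimal settings.py sketch with placeholder values (host, port and credentials are assumptions, not taken from any of these projects):

# settings.py -- placeholder SMTP configuration read by MailSender.from_settings()
MAIL_FROM = 'scrapy@example.com'
MAIL_HOST = 'smtp.example.com'
MAIL_PORT = 465
MAIL_USER = 'user'
MAIL_PASS = 'password'
MAIL_SSL = True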
Example #11
    def __init__(self, **kwargs):
        super(CarSpider, self).__init__(**kwargs)
        self.mailer = MailSender.from_settings(self.settings)
        self.counts = 0
        self.carnum = 800000

        self.cate_dict = {
            "https://coll.jd.com/list.html?sub=46994":"京东保养",
            "https://list.jd.com/list.html?cat=6728,6742,11849":"汽机油",
            "https://list.jd.com/list.html?cat=6728,6742,9248":"轮胎",
            "https://list.jd.com/list.html?cat=6728,6742,11850":"添加剂",
            "https://list.jd.com/list.html?cat=6728,6742,6756":"防冻液",
            "https://coll.jd.com/list.html?sub=23851":"滤清器",
            "https://list.jd.com/list.html?cat=6728,6742,9971":"蓄电池",
            "https://list.jd.com/list.html?cat=6728,6742,13992":"变速箱油/滤",
            "https://list.jd.com/list.html?cat=6728,6742,6766":"雨刷",
            "https://coll.jd.com/list.html?sub=23867":"刹车片/盘",
            "https://list.jd.com/list.html?cat=6728,6742,6767":"火花塞",
            "https://coll.jd.com/list.html?sub=23843":"车灯",
            "https://list.jd.com/list.html?cat=6728,6742,11951":"轮毂",
            "https://list.jd.com/list.html?cat=6728,6742,6769":"维修配件",
            "https://list.jd.com/list.html?cat=6728,6742,13246":"汽车玻璃",
            "https://list.jd.com/list.html?cat=6728,6742,13243":"减震器",
            "https://list.jd.com/list.html?cat=6728,6742,13244":"正时皮带",
            "https://list.jd.com/list.html?cat=6728,6742,13245":"汽车喇叭",
            "https://list.jd.com/list.html?cat=6728,6742,6795":"汽修工具",
            "https://list.jd.com/list.html?cat=6728,6742,12406":"改装配件",
            "https://coll.jd.com/list.html?sub=42052":"原厂件",
        }

        self.settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        self.settings.set('MONGODB_DB', 'koubei', priority='cmdline')
        self.settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #12
 def __init__(self):
     #mail
     self.mailer = MailSender.from_settings(settings)
     #mongo
     self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                           settings['MONGODB_PORT'])
     db = self.connection[settings['MONGODB_DB']]
     self.collection = db[settings['MONGODB_COLLECTION']]
     self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]
     #bloom file
     filename = 'blm/' + settings['MONGODB_DB'] + '/' + settings[
         'MONGODB_COLLECTION'] + '.blm'
     #pybloom
     num = (int(settings['CrawlCar_Num']) + self.collection.count()) * 1.5
     self.df = BloomFilter(capacity=num, error_rate=0.001)
     #read
     isexists = os.path.exists(filename)
     self.fa = open(filename, "a")
     if isexists:
         fr = open(filename, "r")
         lines = fr.readlines()
         for line in lines:
             line = line.strip('\n')
             self.df.add(line)
         fr.close()
     else:
         for i in self.collection.find():
             if "status" in i.keys():
                 item = i["status"]
                 item = md5(item).hexdigest()
                 self.df.add(item)
                 self.fa.writelines(item + '\n')
     #count
     self.counts = 0
Example #13
 def spider_closed(self, spider, reason):
     jira_id = spider.custom_settings['JIRA_ID']
     self.finish_time = datetime.datetime.now()
     self.used_time = self.finish_time - self.start_time
     files = []
     for name, compressed in self.files.items():
         compressed.fileobj.write(compressed.compress.flush())
         gzip.write32u(compressed.fileobj, compressed.crc)
         gzip.write32u(compressed.fileobj, compressed.size & 0xffffffff)
         files.append((name + compressed.extension, compressed.mimetype, compressed))
     try:
         size = self.files[spider.name + '-items.json'].size
     except KeyError:
         size = 0
     stats = spider.crawler.stats.get_stats()
     dqr_status = stats.pop('columns_stats_information', {})
     if ('downloader/exception_count' in stats and stats['downloader/exception_count'] > 0) \
         or ('log_count/ERROR' in stats and stats['log_count/ERROR'] > 0):
         subject = "failed"
     else:
         subject = "succeed"
     mailsender = MailSender.from_settings(self.settings)
     mailsender.send(to=self.settings.getlist('JOB_NOTIFICATION_EMAILS'),
                     subject='JIRA ID:{}  job ends with {}'.format(jira_id, subject),
                     # attachs=files,
                     body=Environment().from_string(config.HTML).render({'stats':stats,
                                                                         'dqr_status':dqr_status,
                                                                         'jira':jira_id,
                                                                         'size':format_size(size)}),
                     mimetype='text/html', _callback=self._catch_mail_sent)
Example #14
    def __init__(self, **kwargs):
        super(CarSpider, self).__init__(**kwargs)

        self.mailer = MailSender.from_settings(settings)
        self.counts = 0
        self.carnum = 2000000
        # Mongo
        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'newcar', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')

        with open("blm/" + settings['MONGODB_DB'] + "/yiche_city.txt") as f:
            content = f.read()
            f.close()
        obj = json.loads(content)
        self.city_id_list = []
        for city in obj:
            self.city_id_list.append(city['cityId'])

        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        desired_capabilities[
            "phantomjs.page.settings.userAgent"] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'

        self.browser = webdriver.PhantomJS(
            executable_path="/home/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
            desired_capabilities=desired_capabilities)
        # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
        # self.browser = webdriver.PhantomJS(executable_path="D:/phantomjs", desired_capabilities=desired_capabilities)
        self.browser.set_page_load_timeout(10)
        super(CarSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #15
    def __init__(self, **kwargs):
        super(CarSpider, self).__init__(**kwargs)
        self.mailer = MailSender.from_settings(settings)
        self.counts = 0
        self.carnum = 800000

        self.DEVICE_ID = "08b3e7995356e97d8f61dc171048c05a"
        self._uab_collina = "156698635367094109337198"
        self.soucheAnalytics_usertag = "WDuGdR0f7I"
        self.U = "1497797_d51aecb5183ede2691a53a97a963906c"
        self.UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"

        self.cookies = {
            'DEVICE_ID': '{}'.format(self.DEVICE_ID),
            '_uab_collina': '{}'.format(self._uab_collina),
            'soucheAnalytics_usertag': '{}'.format(self.soucheAnalytics_usertag),
            'U': '{}'.format(self.U),
        }

        self.headers = {
            'User-agent': "{}".format(self.UserAgent),
            'cookies': "DEVICE_ID={}; _uab_collina={}; soucheAnalytics_usertag={}; U={}".format(self.DEVICE_ID, self._uab_collina, self.soucheAnalytics_usertag, self.U)
        }
        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'koubei', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #16
	def closed(self, reason):
		# absolute GDP values, in units of 100 million yuan
		gdpList={'gdp2016':744127,'gdp2015':676708,'gdp2014':635910,'gdp2013':588018.76,'gdp2012':534123.04,'gdp2011':473104.05,'gdp2010':401512.8,'gdp2009':340902.81,'gdp2008':314045.4,'gdp2007':265810.3,'gdp2006':216314.4,'gdp2005':184937.4,'gdp2004':159878.3,'gdp2003':135822.8,'gdp2002':120332.7,'gdp2001':109655.2,'gdp2000':99214.6,'gdp1999':89677.1,'gdp1998':84402.3,'gdp1997':78973,'gdp1996':71176.6,'gdp1995':60793.7,'gdp1994':48197.9,'gdp1993':35333.9,'gdp1992':26923.5,'gdp1991':21781.5}

		self.myCursor.execute("SELECT * FROM stock_gdp_ratios WHERE `date`='"+self.todayDate+"'")
		resultStockRecord=self.myCursor.fetchone()
		
		self.myCursor.execute("SELECT `date`,sum(`total_value`) AS total_value2 FROM index_day_historical_data WHERE `date`='"+self.todayDate+"' group by `date`")
		resultStockDay=self.myCursor.fetchone()

		gdpListKey='gdp'+str(self.now.year-1)
		dayStockTotal=int(resultStockDay[1])

		GDPratios=dayStockTotal/(gdpList[gdpListKey]*100000000)
		valueee=[self.todayDate,GDPratios]

		if resultStockRecord is None:
			result=self.myCursor.execute("INSERT INTO `stock_gdp_ratios`(`date`,`ratios`) VALUES (%s,%s)",valueee)
		# print '--------------------------------------------------'
		# print resultStockDay
		# print type(resultStockDay[1])
		mailer = MailSender.from_settings(self.settings)
		title="今日A股证券化率:"+str(GDPratios)
		body="证券化率:"+str(GDPratios)+"<br/>"+"总市值:"+str(resultStockDay[1])+"元"

		mailer.send(to=self.settings['SEND_TO_EMAIL'], subject=title, body=body,mimetype="text/html")
Example #17
    def __init__(self, **kwargs):
        super(Chehang168TestSpider, self).__init__(**kwargs)
        self.mailer = MailSender.from_settings(settings)
        self.counts = 0
        self.carnum = 800000

        self.DEVICE_ID = "5539468a883db5093a916df82dfeac8e"
        self._uab_collina = "156707458541401104823761"
        self.soucheAnalytics_usertag = "DhwfDpdHfx"
        self.U = "1497797_d51aecb5183ede2691a53a97a963906c"
        self.UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"

        self.cookies = {
            'DEVICE_ID': f'{self.DEVICE_ID}',
            '_uab_collina': f'{self._uab_collina}',
            'soucheAnalytics_usertag': f'{self.soucheAnalytics_usertag}',
            'U': f'{self.U}',
        }

        self.headers = {
            # 'User-agent': "{}".format(self.UserAgent),
            'cookies': f"DEVICE_ID={self.DEVICE_ID}; _uab_collina={self._uab_collina}; soucheAnalytics_usertag={self.soucheAnalytics_usertag}; U={self.U}"
            # 'cookies': "DEVICE_ID={}; soucheAnalytics_usertag={}; U={}".format(self.DEVICE_ID, self.soucheAnalytics_usertag, self.U)
        }
        settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
        settings.set('MONGODB_DB', 'koubei', priority='cmdline')
        settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #18
    def closed(self, reason):
        str = ''
        #conn = MySQLdb.connect(host='127.0.0.1',user='******',passwd='spider_user!@#',port=3306,db='db_spider',charset='utf8')
        #cur = conn.cursor()

        #mydict = {"name":"Lucy", "sex":"female","job":"nurse"}

        for index, item in enumerate(self.web_data_list):
            tmp = 'index:%d, userid:%s, author:%s,head_img:%s \n,age:%s,sex:%s, vote:%s,contentid:%s\n[%s]\n\n' % (
                index, item['userid'], item['author'], item['head_img'],
                item['age'], item['sex'], item['stats_vote'],
                item['contentid'], item['content'])
            str = str + tmp
            author = item['author']
            content = item['content']
            stats_vote = item['stats_vote']
            contentid = item['contentid']

            #sql="insert ignore into t_qiushi(author,content,vote,content_id) values('%s','%s','%s','%s')" % (author,content,stats_vote,contentid)
            #cur.execute(sql)
        #print str
        #conn.commit()
        #cur.close()
        #conn.close()

        # email the scraped data
        settings = get_project_settings()
        mailer = MailSender.from_settings(settings)
Example #19
    def close_spider(self, spider):
        #self.cursor.commit()
        self.conn.commit()
        items_enviados = list()
        try:
            crs = self.cursor.execute(
                """select id, title, link from news where enviado=0"""
            ).fetchall()
            news = list()
            for item in crs:
                news.append('<p><a href="{link}">{title}</a></p>'.format(
                    link=item[2], title=item[1]))
                items_enviados.append(str(item[0]))
            if len(news):
                mailer = MailSender.from_settings(self.settings)
                body = "<h1>Novidades NF-e!</h1><br><div>{body}</div>".format(
                    body="".join(news))
                # TODO: add the destination email addresses here
                send_to = list()
                if len(send_to) > 0:
                    print('mail enviado paraaaaaaaa ' + " ".join(send_to))
                    #mailer.send(to=send_to, subject='Novidades NF-e', body=body, mimetype="text/html")
        except Exception as e:
            print(e)
            pass
        else:
            if len(items_enviados) > 0:
                self.cursor.execute(
                    """update news set enviado=1 where id in ({})""".format(
                        ", ".join(items_enviados)))
                self.conn.commit()

        self.cursor.close()
        self.conn.close()
Example #20
 def closed(self,reason):
     # send an email notification when the crawl finishes
     mailer = MailSender.from_settings(self.settings)
     body = '''本次爬取状态:{}\r\n本次爬取电影数量:{}\r\n本次爬取电影列表:{}'''.format(reason,self.crawler.stats.get_value('movie_count'),
                                                                 self.crawler.stats.get_value('movie_list'))
     subject = '天堂网电影爬取通知'
     mailer.send(to=["*****@*****.**"], subject=subject, body=body)
Example #21
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            raise NotConfigured

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)

        crawler.signals.connect(instance.item_scraped,
                                signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error,
                                signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed,
                                signal=signals.spider_closed)
        crawler.signals.connect(instance.request_received,
                                signal=signals.request_received)

        return instance
Example #22
 def __init__(self, **kwargs):
     # args
     super(CarSpider, self).__init__(**kwargs)
     #problem report
     self.mailer = MailSender.from_settings(settings)
     self.counts=0
     # Mongo
     settings.set('CrawlCar_Num', carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'usedcar', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
     # mysql
     mysqldb = MySQLdb.connect("192.168.1.94", "root", "Datauser@2017", "usedcar", port=3306)
     mysqldbc = mysqldb.cursor()
     # read
     mysqldbc.execute("select newcarurl from che58")
     items = mysqldbc.fetchall()
     self.urllist = []
     df = pybloom.BloomFilter(carnum, 0.01)
     for i in items:
         j = i[0]
         md5i = hashlib.md5(j).hexdigest()
         rf = df.add(md5i)
         if not rf:
             self.urllist.append(j)
Example #23
 def from_crawler(cls, crawler):
     recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
     if not recipients:
         raise NotConfigured
     mail = MailSender.from_settings(crawler.settings)
     o = cls(crawler.stats, recipients, mail)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example #24
 def from_crawler(cls, crawler):
     recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
     if not recipients:
         raise NotConfigured
     mail = MailSender.from_settings(crawler.settings)  # build the sender from the project's mail settings
     o = cls(crawler.stats, recipients, mail)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example #25
 def __init__(self, **kwargs):
     super(CarSpider, self).__init__(**kwargs)
     self.mailer = MailSender.from_settings(settings)
     self.counts = 0
     self.carnum = 800000
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'koubei', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #26
    def content_parse(self, response):
        selector = Selector(response=response)

        self.loopGet(self.savefont, response)
        contents = selector.xpath("//div[contains(@class,'koubei-final')]")[0]
        publishTime = contents.xpath(
            ".//div[contains(@class,'title-name')]/b/text()").extract_first()
        publishTitle = contents.xpath(
            ".//div[contains(@class,'kou-tit')]//text()").extract()
        publishTitle = ''.join(publishTitle)
        publishTitle = publishTitle.strip('\n').strip('\t').strip()
        contentsText = contents.xpath(
            ".//div[contains(@class,'text-con')]/text()|.//div[contains(@class,'text-con')]//span/text()"
        ).extract()
        contentsText = ''.join(contentsText)
        contentsText = contentsText.strip('\n').strip('\t').strip()
        contentsDic = dict()
        contentsDic['publishTime'] = publishTime
        contentsDic['publishTitle'] = publishTitle
        imageRecognizer = ImageRecognizer(orignText=contentsText,
                                          orignFont='temp.ttf')
        contentsDic['contentsText'] = ' '.join(
            imageRecognizer.outterCall().replace('\n', '').split())
        entireContentsItem = EntireContentsItem()
        entireContentsItem['url'] = response.url
        entireContentsItem['specId'] = response.meta['specId']
        entireContentsItem['commentsId'] = response.meta['commentsId']
        entireContentsItem['leftBar'] = response.meta['nameValueDic']
        ifRenzhen = selector.xpath("//i[@class='renzhen']").extract_first()
        if ifRenzhen:
            entireContentsItem['leftBar']['renzhen'] = True
        else:
            entireContentsItem['leftBar']['renzhen'] = False
        modelId = selector.xpath(
            "//dl[@class='choose-dl'][1]//a[1]/@href").extract_first().replace(
                '/', '')
        entireContentsItem['modelId'] = modelId
        entireContentsItem['contents'] = contentsDic
        allCommentsNum = selector.xpath(
            '//span[@id="Comment_{commentsId}"]//text()'.format(
                commentsId=entireContentsItem['commentsId'])).extract_first()
        if allCommentsNum is None:
            # dump the raw response body for debugging; open for writing, not reading
            with open('temp.txt', 'wb') as writer:
                writer.write(response.body)
            mailer = MailSender.from_settings(settings)
            mailer.send(['*****@*****.**'],
                        'scrapy',
                        'allCommentsNum is None'.encode('utf-8'),
                        charset='utf-8')
        try:
            allPagesNum = math.ceil(int(allCommentsNum) / 10)
        except Exception as e:
            logging.warning(e)
        #entireContentsItem['comments'] = self.getComments(entireContentsItem['commentsId'], allPagesNum)
        entireContentsItem['comments'] = list()
        entireContentsItem['scrapyTime'] = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime())
        yield entireContentsItem
Example #27
 def closed(self,reason):
     self.logger.info("Spider closed: %s"%str(reason))
     mailer = MailSender.from_settings(self.settings)
     mailer.send(
         to=["*****@*****.**"],
         subject="Spider closed",
         body=str(self.crawler.stats.get_stats()),
         cc=["*****@*****.**"]
         )
Example #28
 def from_crawler(cls, crawler):
     # This method is used by Scrapy to create your spiders.
     s = cls()
     # crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
     crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
     crawler.signals.connect(s.item_scraped, signal=signals.item_scraped)
     crawler.signals.connect(s.spider_error, signal=signals.spider_error)
     s.mail = MailSender.from_settings(crawler.settings)
     return s
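A sketch of what the connected spider_error handler could do with the stored MailSender; this is an assumption, not code from the example's project: it mails the traceback of the failure.

 def spider_error(self, failure, response, spider):
     # hypothetical handler: mail the traceback when a spider callback raises
     body = "URL: %s\n\n%s" % (response.url, failure.getTraceback())
     return self.mail.send(["*****@*****.**"], "Spider error in %s" % spider.name, body)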
Example #29
    def from_crawler(cls, crawler):
        mail_list = crawler.settings.getlist("ERRMAIL_LIST")
        if not mail_list:
            raise NotConfigured
        mail = MailSender.from_settings(crawler.settings)
        o = cls(crawler.stats, mail_list, mail)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)

        return o
Example #30
 def closed(self, reason):
     pdb.set_trace()
     self.logger.info("Spider closed: {}".format(reason))
     mailer = MailSender.from_settings(self.settings)
     mailer.send(
         to=settings.ADMINS,
         subject="Spider closed",
         body=str(self.crawler.stats.get_stats()),
     )
Example #31
 def closed(self, reason):
     import pdb
     pdb.set_trace()
     self.logger.info("Spider closed: %s" % str(reason))
     mailer = MailSender.from_settings(self.settings)
     mailer.send(to=["******@qq.com"],
                 subject="Spider closed",
                 body=str(self.crawler.stats.get_stats()),
                 cc=["**********@xxxxxxxx.com"])
Example #32
 def __init__(self, **kwargs):
     # report bug session
     self.mailer = MailSender.from_settings(settings)
     self.counts = 0
     self.carnum = 50000
     # Mongo setting
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'newcar', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #33
 def from_crawler(cls, crawler):
     recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
     # add a PROJECT_NAME field to the Scrapy settings to set the project name
     project_name = crawler.settings.get('PROJECT_NAME')
     if not recipients:
         raise NotConfigured
     mail = MailSender.from_settings(crawler.settings)
     o = cls(crawler.stats, recipients, mail, project_name)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example #34
    def from_crawler(cls, crawler):
        mail = MailSender.from_settings(crawler.settings)

        instance = cls(mail)

        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)

        return instance
Example #35
    def parse(self, response):
        mailer = MailSender.from_settings(settings)
        try:
            mailer.send(to=["*****@*****.**"], subject="scrapy spider", body="test message", cc=['*****@*****.**'], charset="utf-8")
        except Exception as e:
            msg = "Error occurred...{0}".format(str(e))
            print(msg)

        print('mail sending')
Example #36
 def from_crawler(cls, crawler):
     recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
     if not recipients:
         raise NotConfigured
     mail = MailSender.from_settings(crawler.settings)
     test_server = crawler.settings.getbool("TEST_SERVER")
     use_feed_export = crawler.settings.getbool("USE_FEED_EXPORT")
     o = cls(crawler.stats, recipients, mail, test_server, use_feed_export)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example #37
 def __init__(self, **kwargs):
     super(TtpaiSpider, self).__init__(**kwargs)
     self.mailer = MailSender.from_settings(settings)
     self.counts = 0
     self.carnum = 3000000
     self.page = 1
     # Mongo
     settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
     settings.set('MONGODB_DB', 'carbusiness', priority='cmdline')
     settings.set('MONGODB_COLLECTION', website, priority='cmdline')
Example #38
 def closed(self,reason):
     import pdb;pdb.set_trace()
     self.logger.info("Spider closed: %s"%str(reason))
     mailer = MailSender.from_settings(self.settings)
     mailer.send(
         to=["******@qq.com"], 
         subject="Spider closed", 
         body=str(self.crawler.stats.get_stats()), 
         cc=["**********@xxxxxxxx.com"]
         )
Example #39
 def close_spider(self, spider):
     self.persist_dict.close()
     if not self.email_list:
         return
     email_str = "\n\n".join(self.email_list)
     mailer = MailSender.from_settings(spider.settings)
     with open('list.csv', 'r') as csv_file:
         mailer.send(
             to = ["*****@*****.**"],
             subject = "Scrapy Info",
             body = email_str,
             attachs = [('scrapy_info.csv', 'text/csv', csv_file)],
         )
Example #40
    def parse(self, response):
        items = [ ]
        mailer = MailSender.from_settings(self.settings)
        sel = scrapy.Selector(response)
        posts = sel.xpath('//div [@class="wall_item"]')
        for post in posts:
            item = HonlineItem()
            #AUTHOR = post.xpath('.//div[1]//div[1]//div[1]//a[1]/text()').extract() #wi_head/wi_cont/wi_author/a
            item['post_link'] = str(post.xpath('.//div[1]//div[1]//div[2]//a[1]/@href').extract()[0])
            item['post_time'] = str(post.xpath('.//div[1]//div[1]//div[2]//a[1]/text()').extract()[0])
            item['key'] = (post.re('\d\d\d\d\d\d\d\d\d\d\d\d\d\d\d')) #" 289276165354594 "

            if len(item['key']) > 0:
                item['key'] = str(item['key'][0])
                items.append(item)
        return items
Example #41
    def __init__(self, crawler):
        if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
            raise NotConfigured
        try:
            self.resource = __import__('resource')
        except ImportError:
            raise NotConfigured

        self.crawler = crawler
        self.warned = False
        self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
        self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB')*1024*1024
        self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB')*1024*1024
        self.report = crawler.settings.getbool('MEMUSAGE_REPORT')
        self.mail = MailSender.from_settings(crawler.settings)
        crawler.signals.connect(self.engine_started, signal=signals.engine_started)
        crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
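The extension above only activates when the corresponding memory-usage settings are configured; a minimal settings.py sketch with placeholder values:

# settings.py -- placeholder values for the memory-usage extension above
MEMUSAGE_ENABLED = True
MEMUSAGE_LIMIT_MB = 2048
MEMUSAGE_WARNING_MB = 1536
MEMUSAGE_NOTIFY_MAIL = ['*****@*****.**']
MEMUSAGE_REPORT = True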
Example #42
    def __init__(self, crawler):
        if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
            raise NotConfigured
        try:
            # stdlib's resource module is only available on unix platforms.
            self.resource = import_module('resource')
        except ImportError:
            raise NotConfigured

        self.crawler = crawler
        self.warned = False
        self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
        self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB')*1024*1024
        self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB')*1024*1024
        self.check_interval = crawler.settings.getfloat('MEMUSAGE_CHECK_INTERVAL_SECONDS')
        self.mail = MailSender.from_settings(crawler.settings)
        crawler.signals.connect(self.engine_started, signal=signals.engine_started)
        crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
Example #43
	def close_spider(self, spider):
		self.logfile.write("stock pipeline finish \n")
		pipelog = open("stockpipeline.txt")
		if spider.name == "nasdaq":
			# mail body
			mail_body = "please consider the following {count} stocks: \n".format(count=len(self.emailContent))
			for name, content in self.emailContent.items():
				mail_body += "{name}	{currentprice}	{yearlowprice}	{yearhighprice}	{sharevolume} \n".format(
					name=name, currentprice=content[0], yearlowprice=content[1], yearhighprice=content[2], sharevolume=content[3])

			nasdaqlog = open("nasdaqcrawl.txt")
			attachment = [('nasdaqlog.txt', 'text/plain', nasdaqlog), ('pipelog.txt', 'text/plain', pipelog)]
			mailer = MailSender.from_settings(emailSettings())
			mailer.send(to=["*****@*****.**"],
						subject='nasdaq spider finish', body=mail_body, cc=["*****@*****.**"],
						attachs=attachment)
			nasdaqlog.close()
		pipelog.close()
		self.logfile.close()
		self.session.close()
Example #44
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            raise NotConfigured

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(instance.request_received, signal=signals.request_received)

        return instance
Example #45
 def from_crawler(cls,crawler): 
     mailer = MailSender.from_settings(crawler.settings)
     spider = cls(mailer)
     crawler.signals.connect(spider.spider_closed,signals.spider_closed)
     crawler.signals.connect(spider.spider_error,signals.spider_error)
     return spider
Example #46
    def spider_closed(self, spider):
        spider.log("Generating status report", log.INFO)

        now = datetime.datetime.today()

        context = {
            'spider': spider.name,
            'date': now.strftime("%d %b %Y"),
            'time': now.strftime("%H:%M"),
            'checklists': 'No checklists downloaded',
            'errors': 'No errors reported',
            'warnings': 'No warnings reported',
        }

        checklists = getattr(spider, 'checklists', [])
        spider.log("%d checklists downloaded" % len(checklists), log.INFO)

        if checklists:
            summary = []
            for checklist in checklists:
                if 'protocol' in checklist and 'time' in checklist['protocol']:
                    time = checklist['protocol']['time']
                else:
                    time = '--:--'
                summary.append("%s %s, %s (%s)" % (
                    checklist['date'],
                    time,
                    unidecode(checklist['location']['name']),
                    unidecode(checklist['source']['submitted_by'])
                ))
            context['checklists'] = '\n'.join(summary).encode('utf-8')

        errors = getattr(spider, 'errors', [])
        spider.log("%d errors reported" % len(errors), log.INFO)

        if errors:
            summary = []
            for url, failure in errors:
                summary.append("URL: %s\n%s\n\n" % (
                    url,
                    failure.getTraceback()
                ))
            context['errors'] = '\n'.join(summary).encode('utf-8')

        warnings = getattr(spider, 'warnings', [])
        spider.log("%d warnings reported" % len(warnings), log.INFO)

        if warnings:
            summary = []

            for checklist, messages in warnings:
                if 'protocol' in checklist and 'time' in checklist['protocol']:
                    time = checklist['protocol']['time']
                else:
                    time = '--:--'
                summary.append("%s %s, %s (%s)" % (
                    checklist['date'],
                    time,
                    unidecode(checklist['location']['name']),
                    unidecode(checklist['source']['submitted_by'])
                ))
                summary.append("API: %s" % checklist['source']['api'])
                summary.append("URL: %s" % checklist['source']['url'])
                summary.extend(messages)
                summary.append('')

            context['warnings'] = '\n'.join(summary).encode('utf-8')

        report = self.template % context

        if spider.settings['LOG_LEVEL'] == 'DEBUG':
            directory = spider.settings['DOWNLOAD_DIR']
            filename = os.path.join(directory, 'checklists_scrapers_status.txt')
            with open(filename, 'wb') as fp:
                fp.write(report)

        recipients = spider.settings['REPORT_RECIPIENTS'].strip()

        if recipients:
            mailer = MailSender.from_settings(spider.settings)
            addrs = [recipient.strip() for recipient in recipients.split(',')]
            mailer.send(
                to=addrs,
                subject="%s Status Report" % spider.name,
                body=report
            )
        else:
            spider.log("No recipients listed to receive status report",
                       log.INFO)