def from_crawler(cls, crawler):
    recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
    mail = MailSender.from_settings(crawler.settings)
    o = cls(crawler.stats, recipients, mail)
    crawler.signals.connect(o.engine_stopped, signal=signals.engine_stopped)
    crawler.signals.connect(o.engine_started, signal=signals.engine_started)
    return o
def __init__(self, **kwargs):
    # problem report
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 1010000
    # Mongo
    settings.set('DOWNLOAD_DELAY', '0', priority='cmdline')
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'newcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    self.nationp = dict()
    self.npcounts = 0
    # nation select
    self.browser = webdriver.PhantomJS(executable_path=settings['PHANTOMJS_PATH'])
    # desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    # proxy = webdriver.Proxy()
    # proxy.proxy_type = ProxyType.MANUAL
    # proxy.http_proxy = self.getProxy()
    # proxy.add_to_capabilities(desired_capabilities)
    # self.browser.start_session(desired_capabilities)
    # self.browser.set_page_load_timeout(12)
    # self.browser = webdriver.PhantomJS(executable_path=settings['PHANTOMJS_PATH'])
    # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
    # self.browser = webdriver.PhantomJS(executable_path="/root/home/phantomjs")
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(self.settings)
    self.counts = 0
    self.carnum = 800000
    self.headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }
    self.category = {
        "7": "国家补贴",
        "2": "地方补贴",
        "1": "推广政策",
        "3": "充电桩补贴政策",
        "4": "充电价格政策",
        "9": "路权政策",
        "8": "充电设施建设规划",
        "10": "网约车政策",
        "5": "国外政策",
        "6": "其它政策",
    }
    self.settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    self.settings.set('MONGODB_DB', 'koubei', priority='cmdline')
    self.settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 800000
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'carbusiness', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    with codecs.open("D:/county.txt", "r", "utf-8") as f:
        filecontent = f.read()
    # print(filecontent)
    indexlist = re.findall(r"\d+\_\d+\_\d+|\d+\_\d+", filecontent)
    indexlist.append("0")
    # print(indexlist)
    datalist = re.findall(r"\[(.*?)\]", filecontent, re.S)
    # print(datalist)
    self.datadict = {}
    for index in indexlist:
        self.datadict[index] = datalist[indexlist.index(index)]
    # print(self.datadict)
    self.browser = webdriver.PhantomJS(executable_path=settings['PHANTOMJS_PATH'])
    # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
    # self.browser = webdriver.PhantomJS(executable_path="/root/home/phantomjs")
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, *args, **kwargs):
    self.running = True
    kwargs['settings'] = self.configure_image_store(kwargs['settings'])
    super(BaseSpider, self).__init__(*args, **kwargs)
    # If we don't do this, the settings object only exists after __init__()
    self.settings = kwargs['settings']
    enable_profiler = False if 'ENABLE_PROFILER' not in self.settings else self.settings['ENABLE_PROFILER']
    profiler.enable_all(enable_profiler)
    if 'MODE' in self.settings:
        self.replay = True if self.settings['MODE'] == 'replay' else False
    else:
        self.replay = False
    self.load_spider_settings()
    self.initlogs()
    self.configure_login()
    self.configure_proxy()
    self.add_log_filter()
    self.mailer = MailSender.from_settings(self.settings)
    if not hasattr(BaseSpider, '_allspiders'):
        BaseSpider._allspiders = {}
    if self.__class__ not in BaseSpider._allspiders:
        BaseSpider._allspiders[self.__class__] = []
    BaseSpider._allspiders[self.__class__].append(self)
    self.start_interrupt_polling()
def close_spider(self, spider):
    print('爬虫结束')
    # mail_list = []
    # mail_list.append(item['file_name'])
    self.fp.close()
    settings = scrapy.settings.Settings(
        {
            'MAIL_FROM': '*****@*****.**',
            'MAIL_HOST': 'smtp.sina.com',
            'MAIL_PORT': '465',
            'MAIL_USER': '******',
            'MAIL_PASS': '******',
            'MAIL_SSL': 'True'
        },
        priority='project')
    mailer = MailSender.from_settings(settings)
    print(mail_list)
    print('start mail')
    for i in mail_list:
        attach_name = i + '.txt'
        mimetype = 'text/plain'
        file_object = open('files/' + i + '.txt', 'r')
        print(i)
        mailer.send(to=['*****@*****.**'],
                    subject='convert',
                    body='',
                    cc=[''],
                    attachs=[(attach_name, mimetype, file_object)],
                    mimetype='text/plain')
def closed(self, reason):
    str = ''
    # conn = MySQLdb.connect(host='127.0.0.1', user='******', passwd='spider_user!@#', port=3306, db='db_spider', charset='utf8')
    # cur = conn.cursor()
    # mydict = {"name": "Lucy", "sex": "female", "job": "nurse"}
    for index, item in enumerate(self.web_data_list):
        tmp = 'index:%d, userid:%s, author:%s,head_img:%s \n,age:%s,sex:%s, vote:%s,contentid:%s\n[%s]\n\n' % (
            index, item['userid'], item['author'], item['head_img'],
            item['age'], item['sex'], item['stats_vote'],
            item['contentid'], item['content'])
        str = str + tmp
        author = item['author']
        content = item['content']
        stats_vote = item['stats_vote']
        contentid = item['contentid']
        # sql = "insert ignore into t_qiushi(author,content,vote,content_id) values('%s','%s','%s','%s')" % (author, content, stats_vote, contentid)
        # cur.execute(sql)
        # print str
    # conn.commit()
    # cur.close()
    # conn.close()
    # Send the scraped data by email
    settings = get_project_settings()
    mailer = MailSender.from_settings(settings)
def close(self, reason): """ 爬虫邮件报告状态 """ # 结束时间 fnished = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 创建邮件发送对象 mail = MailSender.from_settings(self.settings) # 邮件内容 spider_name = self.settings.get('BOT_NAME') start_time = self.start success_request = self.crawler.stats.get_value("Success_Reqeust") failed_request = self.crawler.stats.get_value("Failed_Reqeust") # 若请求成功, 则默认为0 if failed_request == None: failed_request = 0 insert_into_success = self.crawler.stats.get_value( "Success_Inserted_DB") failed_db = self.crawler.stats.get_value("Failed_Insert_DB") # 若插入成功, 则默认为0 if failed_db == None: failed_db = 0 fnished_time = fnished body = "爬虫名称: {}\n\n 开始时间: {}\n\n 请求成功总量:{}\n 请求失败总量:{} \n\n 数据库存储总量:{}\n 数据库存储失败总量:{}\n\n 结束时间 : {}\n".format( spider_name, start_time, success_request, failed_request, insert_into_success, failed_db, fnished_time) try: # 发送邮件 mail.send(to=self.settings.get('RECEIVE_LIST'), subject=self.settings.get('SUBJECT'), body=body) except Exception as e: self.logger.error("Send Email Existing Error, Reason: {}".format( e.args))
def close_spider(self, spider):
    mailer = MailSender.from_settings(get_project_settings())
    msg = ""
    for count, item in enumerate(self.offers):
        msg += "oferta {}:\ncompany: {}\nsalary: {}\ntitle: {}\nrequirements: {}\n\n".format(
            count, item["company"], item["salary"], item["title"], item["requirements"])
    mailer.send(to=os.environ.get('MAIL_USERNAME'), subject="test", body=msg)
def send_mail(self, month_year):
    subject = 'Bonn: Neuer Termin frei im ' + month_year
    body = self.start_urls[0]
    # you have to set up the mail settings in your own settings.py
    # http://doc.scrapy.org/en/latest/topics/email.html#topics-email-settings
    mailer = MailSender.from_settings(self.settings)
    mailer.send(to=[self.notification_email], subject=subject, body=body)
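As the comment in the example above notes, MailSender.from_settings(self.settings) only works if the standard Scrapy mail settings are configured in the project. A minimal settings.py sketch is shown below; the host, credentials, and addresses are placeholders, not values from any of the projects listed here.

# settings.py -- Scrapy mail settings read by MailSender.from_settings()
# (all values below are placeholders)
MAIL_FROM = 'scrapy@example.com'   # sender address
MAIL_HOST = 'smtp.example.com'     # SMTP server
MAIL_PORT = 587
MAIL_USER = 'scrapy'               # leave unset to skip SMTP authentication
MAIL_PASS = 'secret'
MAIL_TLS = True                    # upgrade the connection with STARTTLS
MAIL_SSL = False                   # or connect over SSL directly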
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(self.settings)
    self.counts = 0
    self.carnum = 800000
    self.cate_dict = {
        "https://coll.jd.com/list.html?sub=46994": "京东保养",
        "https://list.jd.com/list.html?cat=6728,6742,11849": "汽机油",
        "https://list.jd.com/list.html?cat=6728,6742,9248": "轮胎",
        "https://list.jd.com/list.html?cat=6728,6742,11850": "添加剂",
        "https://list.jd.com/list.html?cat=6728,6742,6756": "防冻液",
        "https://coll.jd.com/list.html?sub=23851": "滤清器",
        "https://list.jd.com/list.html?cat=6728,6742,9971": "蓄电池",
        "https://list.jd.com/list.html?cat=6728,6742,13992": "变速箱油/滤",
        "https://list.jd.com/list.html?cat=6728,6742,6766": "雨刷",
        "https://coll.jd.com/list.html?sub=23867": "刹车片/盘",
        "https://list.jd.com/list.html?cat=6728,6742,6767": "火花塞",
        "https://coll.jd.com/list.html?sub=23843": "车灯",
        "https://list.jd.com/list.html?cat=6728,6742,11951": "轮毂",
        "https://list.jd.com/list.html?cat=6728,6742,6769": "维修配件",
        "https://list.jd.com/list.html?cat=6728,6742,13246": "汽车玻璃",
        "https://list.jd.com/list.html?cat=6728,6742,13243": "减震器",
        "https://list.jd.com/list.html?cat=6728,6742,13244": "正时皮带",
        "https://list.jd.com/list.html?cat=6728,6742,13245": "汽车喇叭",
        "https://list.jd.com/list.html?cat=6728,6742,6795": "汽修工具",
        "https://list.jd.com/list.html?cat=6728,6742,12406": "改装配件",
        "https://coll.jd.com/list.html?sub=42052": "原厂件",
    }
    self.settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    self.settings.set('MONGODB_DB', 'koubei', priority='cmdline')
    self.settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def __init__(self):
    # mail
    self.mailer = MailSender.from_settings(settings)
    # mongo
    self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
    db = self.connection[settings['MONGODB_DB']]
    self.collection = db[settings['MONGODB_COLLECTION']]
    self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]
    # bloom file
    filename = 'blm/' + settings['MONGODB_DB'] + '/' + settings['MONGODB_COLLECTION'] + '.blm'
    # pybloom
    num = (int(settings['CrawlCar_Num']) + self.collection.count()) * 1.5
    self.df = BloomFilter(capacity=num, error_rate=0.001)
    # read
    isexists = os.path.exists(filename)
    self.fa = open(filename, "a")
    if isexists:
        fr = open(filename, "r")
        lines = fr.readlines()
        for line in lines:
            line = line.strip('\n')
            self.df.add(line)
        fr.close()
    else:
        for i in self.collection.find():
            if "status" in i.keys():
                item = i["status"]
                item = md5(item).hexdigest()
                self.df.add(item)
                self.fa.writelines(item + '\n')
    # count
    self.counts = 0
def spider_closed(self, spider, reason):
    jira_id = spider.custom_settings['JIRA_ID']
    self.finish_time = datetime.datetime.now()
    self.used_time = self.finish_time - self.start_time
    files = []
    for name, compressed in self.files.items():
        compressed.fileobj.write(compressed.compress.flush())
        gzip.write32u(compressed.fileobj, compressed.crc)
        gzip.write32u(compressed.fileobj, compressed.size & 0xffffffff)
        files.append((name + compressed.extension, compressed.mimetype, compressed))
    try:
        size = self.files[spider.name + '-items.json'].size
    except KeyError:
        size = 0
    stats = spider.crawler.stats.get_stats()
    dqr_status = stats.pop('columns_stats_information', {})
    if ('downloader/exception_count' in stats and stats['downloader/exception_count'] > 0) \
            or ('log_count/ERROR' in stats and stats['log_count/ERROR'] > 0):
        subject = "failed"
    else:
        subject = "succeed"
    mailsender = MailSender.from_settings(self.settings)
    mailsender.send(to=self.settings.getlist('JOB_NOTIFICATION_EMAILS'),
                    subject='JIRA ID:{} job ends with {}'.format(jira_id, subject),
                    # attachs=files,
                    body=Environment().from_string(config.HTML).render({
                        'stats': stats,
                        'dqr_status': dqr_status,
                        'jira': jira_id,
                        'size': format_size(size)
                    }),
                    mimetype='text/html',
                    _callback=self._catch_mail_sent)
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 2000000
    # Mongo
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'newcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    with open("blm/" + settings['MONGODB_DB'] + "/yiche_city.txt") as f:
        content = f.read()
    obj = json.loads(content)
    self.city_id_list = []
    for city in obj:
        self.city_id_list.append(city['cityId'])
    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    desired_capabilities["phantomjs.page.settings.userAgent"] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    self.browser = webdriver.PhantomJS(
        executable_path="/home/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
        desired_capabilities=desired_capabilities)
    # self.browser = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
    # self.browser = webdriver.PhantomJS(executable_path="D:/phantomjs", desired_capabilities=desired_capabilities)
    self.browser.set_page_load_timeout(10)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 800000
    self.DEVICE_ID = "08b3e7995356e97d8f61dc171048c05a"
    self._uab_collina = "156698635367094109337198"
    self.soucheAnalytics_usertag = "WDuGdR0f7I"
    self.U = "1497797_d51aecb5183ede2691a53a97a963906c"
    self.UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    self.cookies = {
        'DEVICE_ID': '{}'.format(self.DEVICE_ID),
        '_uab_collina': '{}'.format(self._uab_collina),
        'soucheAnalytics_usertag': '{}'.format(self.soucheAnalytics_usertag),
        'U': '{}'.format(self.U),
    }
    self.headers = {
        'User-agent': "{}".format(self.UserAgent),
        'cookies': "DEVICE_ID={}; _uab_collina={}; soucheAnalytics_usertag={}; U={}".format(
            self.DEVICE_ID, self._uab_collina, self.soucheAnalytics_usertag, self.U)
    }
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'koubei', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def closed(self, reason):
    # absolute GDP values, unit: 100 million yuan (亿元)
    gdpList = {
        'gdp2016': 744127, 'gdp2015': 676708, 'gdp2014': 635910, 'gdp2013': 588018.76,
        'gdp2012': 534123.04, 'gdp2011': 473104.05, 'gdp2010': 401512.8, 'gdp2009': 340902.81,
        'gdp2008': 314045.4, 'gdp2007': 265810.3, 'gdp2006': 216314.4, 'gdp2005': 184937.4,
        'gdp2004': 159878.3, 'gdp2003': 135822.8, 'gdp2002': 120332.7, 'gdp2001': 109655.2,
        'gdp2000': 99214.6, 'gdp1999': 89677.1, 'gdp1998': 84402.3, 'gdp1997': 78973,
        'gdp1996': 71176.6, 'gdp1995': 60793.7, 'gdp1994': 48197.9, 'gdp1993': 35333.9,
        'gdp1992': 26923.5, 'gdp1991': 21781.5}
    self.myCursor.execute("SELECT * FROM stock_gdp_ratios WHERE `date`='" + self.todayDate + "'")
    resultStockRecord = self.myCursor.fetchone()
    self.myCursor.execute("SELECT `date`,sum(`total_value`) AS total_value2 FROM index_day_historical_data WHERE `date`='" + self.todayDate + "' group by `date`")
    resultStockDay = self.myCursor.fetchone()
    gdpListKey = 'gdp' + str(self.now.year - 1)
    dayStockTotal = int(resultStockDay[1])
    GDPratios = dayStockTotal / (gdpList[gdpListKey] * 100000000)
    valueee = [self.todayDate, GDPratios]
    if resultStockRecord is None:
        result = self.myCursor.execute("INSERT INTO `stock_gdp_ratios`(`date`,`ratios`) VALUES (%s,%s)", valueee)
    # print '--------------------------------------------------'
    # print resultStockDay
    # print type(resultStockDay[1])
    mailer = MailSender.from_settings(self.settings)
    title = "今日A股证券化率:" + str(GDPratios)
    body = "证券化率:" + str(GDPratios) + "<br/>" + "总市值:" + str(resultStockDay[1]) + "元"
    mailer.send(to=self.settings['SEND_TO_EMAIL'], subject=title, body=body, mimetype="text/html")
def __init__(self, **kwargs):
    super(Chehang168TestSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 800000
    self.DEVICE_ID = "5539468a883db5093a916df82dfeac8e"
    self._uab_collina = "156707458541401104823761"
    self.soucheAnalytics_usertag = "DhwfDpdHfx"
    self.U = "1497797_d51aecb5183ede2691a53a97a963906c"
    self.UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    self.cookies = {
        'DEVICE_ID': f'{self.DEVICE_ID}',
        '_uab_collina': f'{self._uab_collina}',
        'soucheAnalytics_usertag': f'{self.soucheAnalytics_usertag}',
        'U': f'{self.U}',
    }
    self.headers = {
        # 'User-agent': "{}".format(self.UserAgent),
        'cookies': f"DEVICE_ID={self.DEVICE_ID}; _uab_collina={self._uab_collina}; soucheAnalytics_usertag={self.soucheAnalytics_usertag}; U={self.U}"
        # 'cookies': "DEVICE_ID={}; soucheAnalytics_usertag={}; U={}".format(self.DEVICE_ID, self.soucheAnalytics_usertag, self.U)
    }
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'koubei', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def closed(self, reason):
    str = ''
    # conn = MySQLdb.connect(host='127.0.0.1', user='******', passwd='spider_user!@#', port=3306, db='db_spider', charset='utf8')
    # cur = conn.cursor()
    # mydict = {"name": "Lucy", "sex": "female", "job": "nurse"}
    for index, item in enumerate(self.web_data_list):
        tmp = 'index:%d, userid:%s, author:%s,head_img:%s \n,age:%s,sex:%s, vote:%s,contentid:%s\n[%s]\n\n' % (
            index, item['userid'], item['author'], item['head_img'], item['age'],
            item['sex'], item['stats_vote'], item['contentid'], item['content'])
        str = str + tmp
        author = item['author']
        content = item['content']
        stats_vote = item['stats_vote']
        contentid = item['contentid']
        # sql = "insert ignore into t_qiushi(author,content,vote,content_id) values('%s','%s','%s','%s')" % (author, content, stats_vote, contentid)
        # cur.execute(sql)
        # print str
    # conn.commit()
    # cur.close()
    # conn.close()
    # Send the scraped data by email
    settings = get_project_settings()
    mailer = MailSender.from_settings(settings)
def close_spider(self, spider):
    # self.cursor.commit()
    self.conn.commit()
    items_enviados = list()
    try:
        crs = self.cursor.execute(
            """select id, title, link from news where enviado=0""").fetchall()
        news = list()
        for item in crs:
            news.append('<p><a href="{link}">{title}</a></p>'.format(link=item[2], title=item[1]))
            items_enviados.append(str(item[0]))
        if len(news):
            mailer = MailSender.from_settings(self.settings)
            body = "<h1>Novidades NF-e!</h1><br><div>{body}</div>".format(body="".join(news))
            # TODO: add the destination email addresses here
            send_to = list()
            if len(send_to) > 0:
                print('mail enviado paraaaaaaaa ' + " ".join(send_to))
                # mailer.send(to=send_to, subject='Novidades NF-e', body=body, mimetype="text/html")
    except Exception as e:
        print(e)
        pass
    else:
        if len(items_enviados) > 0:
            self.cursor.execute(
                """update news set enviado=1 where id in ({})""".format(", ".join(items_enviados)))
            self.conn.commit()
    self.cursor.close()
    self.conn.close()
def closed(self, reason):
    # send an email notification after the crawl finishes
    mailer = MailSender.from_settings(self.settings)
    body = '''本次爬取状态:{}\r\n本次爬取电影数量:{}\r\n本次爬取电影列表:{}'''.format(
        reason,
        self.crawler.stats.get_value('movie_count'),
        self.crawler.stats.get_value('movie_list'))
    subject = '天堂网电影爬取通知'
    mailer.send(to=["*****@*****.**"], subject=subject, body=body)
def from_crawler(cls, crawler):
    recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
    compression = crawler.settings.get('STATUSMAILER_COMPRESSION')
    if not compression:
        compressor = PlainCompressor
    elif compression.lower().startswith('gz'):
        compressor = GzipCompressor
    else:
        raise NotConfigured
    if not recipients:
        raise NotConfigured
    mail = MailSender.from_settings(crawler.settings)
    instance = cls(recipients, mail, compressor, crawler)
    crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
    crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
    crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(instance.request_received, signal=signals.request_received)
    return instance
def __init__(self, **kwargs):
    # args
    super(CarSpider, self).__init__(**kwargs)
    # problem report
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    # Mongo
    settings.set('CrawlCar_Num', carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'usedcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    # mysql
    mysqldb = MySQLdb.connect("192.168.1.94", "root", "Datauser@2017", "usedcar", port=3306)
    mysqldbc = mysqldb.cursor()
    # read
    mysqldbc.execute("select newcarurl from che58")
    items = mysqldbc.fetchall()
    self.urllist = []
    df = pybloom.BloomFilter(carnum, 0.01)
    for i in items:
        j = i[0]
        md5i = hashlib.md5(j)
        rf = df.add(md5i)
        if not rf:
            self.urllist.append(j)
def from_crawler(cls, crawler):
    recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
    if not recipients:
        raise NotConfigured
    mail = MailSender.from_settings(crawler.settings)
    o = cls(crawler.stats, recipients, mail)
    crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
    return o
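Several of the from_crawler factories in this listing follow Scrapy's built-in StatsMailer extension, which only wires up spider_closed. For context, the handler that such a factory connects typically just formats the collected stats and mails them to the STATSMAILER_RCPTS recipients. A minimal sketch, closely modelled on the stock extension and assuming __init__ stored stats, recipients, and mail as attributes:

def spider_closed(self, spider):
    # Format the global and per-spider stats and mail them to the configured recipients.
    spider_stats = self.stats.get_stats(spider)
    body = "Global stats\n\n"
    body += "\n".join("%-50s : %s" % i for i in self.stats.get_stats().items())
    body += "\n\n%s stats\n\n" % spider.name
    body += "\n".join("%-50s : %s" % i for i in spider_stats.items())
    return self.mail.send(self.recipients, "Scrapy stats for: %s" % spider.name, body)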
def from_crawler(cls, crawler):
    recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
    if not recipients:
        raise NotConfigured
    mail = MailSender.from_settings(crawler.settings)  # basic mail configuration from the settings
    o = cls(crawler.stats, recipients, mail)
    crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
    return o
def __init__(self, **kwargs):
    super(CarSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 800000
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'koubei', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def content_parse(self, response):
    selector = Selector(response=response)
    self.loopGet(self.savefont, response)
    contents = selector.xpath("//div[contains(@class,'koubei-final')]")[0]
    publishTime = contents.xpath(".//div[contains(@class,'title-name')]/b/text()").extract_first()
    publishTitle = contents.xpath(".//div[contains(@class,'kou-tit')]//text()").extract()
    publishTitle = ''.join(publishTitle)
    publishTitle = publishTitle.strip('\n').strip('\t').strip()
    contentsText = contents.xpath(
        ".//div[contains(@class,'text-con')]/text()|.//div[contains(@class,'text-con')]//span/text()"
    ).extract()
    contentsText = ''.join(contentsText)
    contentsText = contentsText.strip('\n').strip('\t').strip()
    contentsDic = dict()
    contentsDic['publishTime'] = publishTime
    contentsDic['publishTitle'] = publishTitle
    imageRecognizer = ImageRecognizer(orignText=contentsText, orignFont='temp.ttf')
    contentsDic['contentsText'] = ' '.join(imageRecognizer.outterCall().replace('\n', '').split())
    entireContentsItem = EntireContentsItem()
    entireContentsItem['url'] = response.url
    entireContentsItem['specId'] = response.meta['specId']
    entireContentsItem['commentsId'] = response.meta['commentsId']
    entireContentsItem['leftBar'] = response.meta['nameValueDic']
    ifRenzhen = selector.xpath("//i[@class='renzhen']").extract_first()
    if ifRenzhen:
        entireContentsItem['leftBar']['renzhen'] = True
    else:
        entireContentsItem['leftBar']['renzhen'] = False
    modelId = selector.xpath("//dl[@class='choose-dl'][1]//a[1]/@href").extract_first().replace('/', '')
    entireContentsItem['modelId'] = modelId
    entireContentsItem['contents'] = contentsDic
    allCommentsNum = selector.xpath(
        '//span[@id="Comment_{commentsId}"]//text()'.format(
            commentsId=entireContentsItem['commentsId'])).extract_first()
    if allCommentsNum is None:
        # dump the raw response body for debugging and report the problem by email
        with open('temp.txt', 'wb') as writer:
            writer.write(response.body)
        mailer = MailSender.from_settings(settings)
        mailer.send('*****@*****.**', 'scrapy',
                    'allCommentsNum is None'.encode('utf-8'), charset='utf-8')
    try:
        allPagesNum = math.ceil(int(allCommentsNum) / 10)
    except Exception as e:
        logging.warning(e)
    # entireContentsItem['comments'] = self.getComments(entireContentsItem['commentsId'], allPagesNum)
    entireContentsItem['comments'] = list()
    entireContentsItem['scrapyTime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    yield entireContentsItem
def closed(self, reason):
    self.logger.info("Spider closed: %s" % str(reason))
    mailer = MailSender.from_settings(self.settings)
    mailer.send(
        to=["*****@*****.**"],
        subject="Spider closed",
        body=str(self.crawler.stats.get_stats()),
        cc=["*****@*****.**"]
    )
def from_crawler(cls, crawler):
    # This method is used by Scrapy to create your spiders.
    s = cls()
    # crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(s.item_scraped, signal=signals.item_scraped)
    crawler.signals.connect(s.spider_error, signal=signals.spider_error)
    s.mail = MailSender.from_settings(crawler.settings)
    return s
def from_crawler(cls, crawler):
    mail_list = crawler.settings.getlist("ERRMAIL_LIST")
    if not mail_list:
        raise NotConfigured
    mail = MailSender.from_settings(crawler.settings)
    o = cls(crawler.stats, mail_list, mail)
    crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
    return o
def closed(self, reason):
    pdb.set_trace()
    self.logger.info("Spider closed: {}".format(reason))
    mailer = MailSender.from_settings(self.settings)
    mailer.send(
        to=settings.ADMINS,
        subject="Spider closed",
        body=str(self.crawler.stats.get_stats()),
    )
def closed(self, reason):
    import pdb
    pdb.set_trace()
    self.logger.info("Spider closed: %s" % str(reason))
    mailer = MailSender.from_settings(self.settings)
    mailer.send(to=["******@qq.com"],
                subject="Spider closed",
                body=str(self.crawler.stats.get_stats()),
                cc=["**********@xxxxxxxx.com"])
def __init__(self, **kwargs):
    # report bug session
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 50000
    # Mongo setting
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'newcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def from_crawler(cls, crawler):
    recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
    # add a PROJECT_NAME field to the Scrapy settings to configure the project name
    project_name = crawler.settings.get('PROJECT_NAME')
    if not recipients:
        raise NotConfigured
    mail = MailSender.from_settings(crawler.settings)
    o = cls(crawler.stats, recipients, mail, project_name)
    crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
    return o
def from_crawler(cls, crawler):
    mail = MailSender.from_settings(crawler.settings)
    instance = cls(mail)
    crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
    return instance
def parse(self, response):
    mailer = MailSender.from_settings(settings)
    try:
        mailer.send(to=["*****@*****.**"],
                    subject="scrapy spider",
                    body="test message",
                    cc=['*****@*****.**'],
                    charset="utf-8")
    except Exception as e:
        msg = "Error occurred...{0}".format(str(e))
        print(msg)
    print('mail sending')
def from_crawler(cls, crawler):
    recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
    if not recipients:
        raise NotConfigured
    mail = MailSender.from_settings(crawler.settings)
    test_server = crawler.settings.getbool("TEST_SERVER")
    use_feed_export = crawler.settings.getbool("USE_FEED_EXPORT")
    o = cls(crawler.stats, recipients, mail, test_server, use_feed_export)
    crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
    return o
def __init__(self, **kwargs):
    super(TtpaiSpider, self).__init__(**kwargs)
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    self.carnum = 3000000
    self.page = 1
    # Mongo
    settings.set('CrawlCar_Num', self.carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'carbusiness', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
def closed(self, reason):
    import pdb
    pdb.set_trace()
    self.logger.info("Spider closed: %s" % str(reason))
    mailer = MailSender.from_settings(self.settings)
    mailer.send(
        to=["******@qq.com"],
        subject="Spider closed",
        body=str(self.crawler.stats.get_stats()),
        cc=["**********@xxxxxxxx.com"]
    )
def close_spider(self, spider):
    self.persist_dict.close()
    if not self.email_list:
        return
    email_str = "\n\n".join(self.email_list)
    mailer = MailSender.from_settings(spider.settings)
    with open('list.csv', 'r') as csv_file:
        mailer.send(
            to=["*****@*****.**"],
            subject="Scrapy Info",
            body=email_str,
            attachs=[('scrapy_info.csv', 'text/csv', csv_file)],
        )
def parse(self, response):
    items = []
    mailer = MailSender.from_settings(self.settings)
    sel = scrapy.Selector(response)
    posts = sel.xpath('//div [@class="wall_item"]')
    for post in posts:
        item = HonlineItem()
        # AUTHOR = post.xpath('.//div[1]//div[1]//div[1]//a[1]/text()').extract()  # wi_head/wi_cont/wi_author/a
        item['post_link'] = str(post.xpath('.//div[1]//div[1]//div[2]//a[1]/@href').extract()[0])
        item['post_time'] = str(post.xpath('.//div[1]//div[1]//div[2]//a[1]/text()').extract()[0])
        item['key'] = post.re('\d\d\d\d\d\d\d\d\d\d\d\d\d\d\d')  # e.g. " 289276165354594 "
        if len(item['key']) > 0:
            item['key'] = str(item['key'][0])
        items.append(item)
    return items
def __init__(self, crawler):
    if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
        raise NotConfigured
    try:
        self.resource = __import__('resource')
    except ImportError:
        raise NotConfigured
    self.crawler = crawler
    self.warned = False
    self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
    self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB') * 1024 * 1024
    self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB') * 1024 * 1024
    self.report = crawler.settings.getbool('MEMUSAGE_REPORT')
    self.mail = MailSender.from_settings(crawler.settings)
    crawler.signals.connect(self.engine_started, signal=signals.engine_started)
    crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
def __init__(self, crawler):
    if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
        raise NotConfigured
    try:
        # stdlib's resource module is only available on unix platforms.
        self.resource = import_module('resource')
    except ImportError:
        raise NotConfigured
    self.crawler = crawler
    self.warned = False
    self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
    self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB') * 1024 * 1024
    self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB') * 1024 * 1024
    self.check_interval = crawler.settings.getfloat('MEMUSAGE_CHECK_INTERVAL_SECONDS')
    self.mail = MailSender.from_settings(crawler.settings)
    crawler.signals.connect(self.engine_started, signal=signals.engine_started)
    crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
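The two constructors above read Scrapy's memory-usage extension settings and use MailSender to notify MEMUSAGE_NOTIFY_MAIL when a threshold is hit. A sketch of the settings that would activate them is below; the thresholds and address are placeholders, and MEMUSAGE_CHECK_INTERVAL_SECONDS only applies to the second, newer variant.

# settings.py -- memory-usage extension settings (placeholder values)
MEMUSAGE_ENABLED = True
MEMUSAGE_LIMIT_MB = 2048                    # close the spider above this memory usage
MEMUSAGE_WARNING_MB = 1536                  # send a warning email above this memory usage
MEMUSAGE_NOTIFY_MAIL = ['ops@example.com']  # recipients for the notification emails
MEMUSAGE_CHECK_INTERVAL_SECONDS = 60.0      # polling interval (second variant only)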
def close_spider(self, spider):
    self.logfile.write("stock pipeline finish \n")
    pipelog = open("stockpipeline.txt")
    if spider.name == "nasdaq":
        # mail body
        mail_body = "please consider the following {count} stocks: \n".format(count=len(self.emailContent))
        for name, content in self.emailContent.items():
            mail_body += "{name} {currentprice} {yearlowprice} {yearhighprice} {sharevolume} \n".format(
                name=name,
                currentprice=content[0],
                yearlowprice=content[1],
                yearhighprice=content[2],
                sharevolume=content[3])
        nasdaqlog = open("nasdaqcrawl.txt")
        attachment = [('nasdaqlog.txt', 'text/plain', nasdaqlog),
                      ('pipelog.txt', 'text/plain', pipelog)]
        mailer = MailSender.from_settings(emailSettings())
        mailer.send(to=["*****@*****.**"],
                    subject='nasdaq spider finish',
                    body=mail_body,
                    cc=["*****@*****.**"],
                    attachs=attachment)
        nasdaqlog.close()
    pipelog.close()
    self.logfile.close()
    self.session.close()
def from_crawler(cls, crawler):
    mailer = MailSender.from_settings(crawler.settings)
    spider = cls(mailer)
    crawler.signals.connect(spider.spider_closed, signals.spider_closed)
    crawler.signals.connect(spider.spider_error, signals.spider_error)
    return spider
def spider_closed(self, spider):
    spider.log("Generating status report", log.INFO)
    now = datetime.datetime.today()
    context = {
        'spider': spider.name,
        'date': now.strftime("%d %b %Y"),
        'time': now.strftime("%H:%M"),
        'checklists': 'No checklists downloaded',
        'errors': 'No errors reported',
        'warnings': 'No warnings reported',
    }
    checklists = getattr(spider, 'checklists', [])
    spider.log("%d checklists downloaded" % len(checklists), log.INFO)
    if checklists:
        summary = []
        for checklist in checklists:
            if 'protocol' in checklist and 'time' in checklist['protocol']:
                time = checklist['protocol']['time']
            else:
                time = '--:--'
            summary.append("%s %s, %s (%s)" % (
                checklist['date'],
                time,
                unidecode(checklist['location']['name']),
                unidecode(checklist['source']['submitted_by'])
            ))
        context['checklists'] = '\n'.join(summary).encode('utf-8')
    errors = getattr(spider, 'errors', [])
    spider.log("%d errors reported" % len(errors), log.INFO)
    if errors:
        summary = []
        for url, failure in errors:
            summary.append("URL: %s\n%s\n\n" % (url, failure.getTraceback()))
        context['errors'] = '\n'.join(summary).encode('utf-8')
    warnings = getattr(spider, 'warnings', [])
    spider.log("%d warnings reported" % len(warnings), log.INFO)
    if warnings:
        summary = []
        for checklist, messages in warnings:
            if 'protocol' in checklist and 'time' in checklist['protocol']:
                time = checklist['protocol']['time']
            else:
                time = '--:--'
            summary.append("%s %s, %s (%s)" % (
                checklist['date'],
                time,
                unidecode(checklist['location']['name']),
                unidecode(checklist['source']['submitted_by'])
            ))
            summary.append("API: %s" % checklist['source']['api'])
            summary.append("URL: %s" % checklist['source']['url'])
            summary.extend(messages)
            summary.append('')
        context['warnings'] = '\n'.join(summary).encode('utf-8')
    report = self.template % context
    if spider.settings['LOG_LEVEL'] == 'DEBUG':
        directory = spider.settings['DOWNLOAD_DIR']
        filename = os.path.join(directory, 'checklists_scrapers_status.txt')
        with open(filename, 'wb') as fp:
            fp.write(report)
    recipients = spider.settings['REPORT_RECIPIENTS'].strip()
    if recipients:
        mailer = MailSender.from_settings(spider.settings)
        addrs = [recipient.strip() for recipient in recipients.split(',')]
        mailer.send(
            to=addrs,
            subject="%s Status Report" % spider.name,
            body=report
        )
    else:
        spider.log("No recipients listed to receive status report", log.INFO)