def _build_upload_text(header, upload):
    """Compose the e-mail body for *upload*, starting with *header*.

    Lists the account (or 'New account' when none is linked yet), the
    spider name, and optional free-form notes.
    """
    text = header
    text += 'Account: %s\n' % (upload.account.name if upload.account else 'New account')
    text += 'Spider: %s\n' % upload.spider_name
    if upload.notes:
        text += 'Notes: %s' % upload.notes
    return text


def main():
    """Send e-mail notifications for spider uploads.

    For every SpiderUpload whose user has an e-mail address, the
    scheduler decides which (if any) notification is due:
      - initial assignment  -> user + TO list
      - deployed (final)    -> user + TO list
      - reminder            -> user only
    After a notification is sent, last_notification is stamped so the
    scheduler does not fire again immediately.
    """
    db_session = Session()
    scheduler = SpiderUploadNotificationScheduler()
    notifier = EmailNotifier(SMTP_USER, SMTP_PASS, SMTP_FROM, SMTP_HOST, SMTP_PORT)

    for upload in db_session.query(SpiderUpload).all():
        if not upload.user.email:
            continue

        if scheduler.should_send_initial(upload):
            subject = 'Spider upload request %s' % upload.spider_name
            text = _build_upload_text('A spider upload has been assigned to you:\n', upload)
            recipients = [upload.user.email] + TO
            # Log the recipient of the initial request (debug/audit trail).
            print(upload.user.email)
        elif scheduler.should_send_final(upload):
            subject = 'Spider deployed %s' % upload.spider_name
            text = _build_upload_text('The following spider has been deployed:\n', upload)
            recipients = [upload.user.email] + TO
        elif scheduler.should_send_reminder(upload):
            subject = 'Spider upload reminder %s' % upload.spider_name
            text = _build_upload_text('The following spider has been assigned to you:\n', upload)
            # Reminders go to the assignee only, not the TO list.
            recipients = [upload.user.email]
        else:
            continue

        notifier.send_notification(recipients, subject, text)
        upload.last_notification = datetime.now()
        db_session.add(upload)

    db_session.commit()
    db_session.close()
def main():
    """Persist one resource-usage snapshot per running spider.

    Reads current CPU/memory figures from get_spiders_usage() and
    stores a SpiderResourcesUsage row for each, all stamped with the
    same collection time.
    """
    session = Session()
    snapshot_time = datetime.datetime.now()

    for entry in get_spiders_usage():
        spider = entry['spider']

        record = SpiderResourcesUsage()
        record.spider_id = spider.id
        record.worker_server_id = spider.worker_server_id
        record.time = snapshot_time
        record.cpu_usage = entry['cpu_usage']
        record.mem_usage = entry['mem_usage']
        session.add(record)

    session.commit()
    session.close()
def export_delisted_duplicate_errors(self):
    """Export collected delisted/duplicate errors to CSV and register the file.

    Writes self.errors to '<website_id>_<crawl_id>_delisted_duplicate_errors.csv'
    under DATA_DIR, then upserts a DelistedDuplicateError row pointing at it.
    """
    website_id = self.current_crawl.spider.website_id
    crawl_id = self.current_crawl.id
    filename = '%s_%s_delisted_duplicate_errors.csv' % (website_id, crawl_id)
    filename_full = os.path.join(DATA_DIR, filename)

    # pd.np / np.str are deprecated and removed in modern pandas/numpy;
    # the builtin str gives the same all-strings DataFrame.
    errors_df = pd.DataFrame(self.errors, dtype=str)
    try:
        errors_df.to_csv(filename_full, index=False, encoding='utf-8')
    except UnicodeError:
        # Fall back to the platform default encoding if utf-8 fails.
        # (Was a bare except; only encoding problems are retryable here.)
        errors_df.to_csv(filename_full, index=False)

    db_session = Session()
    try:
        dd_error = db_session.query(DelistedDuplicateError)\
            .filter(DelistedDuplicateError.website_id == website_id,
                    DelistedDuplicateError.crawl_id == crawl_id)\
            .first()
        if not dd_error:
            dd_error = DelistedDuplicateError()
            dd_error.website_id = website_id
            dd_error.crawl_id = crawl_id
        # Update the filename even for an existing row so it always
        # points at the latest export.
        dd_error.filename = filename
        db_session.add(dd_error)
        db_session.commit()
    finally:
        # Ensure the session is released even if the query/commit raises.
        db_session.close()
# Make the project root and the product_spiders package importable
# before the project imports below — order matters here.
HERE = os.path.dirname(os.path.abspath(__file__))
product_spiders_root = os.path.dirname(HERE)
project_root = os.path.dirname(product_spiders_root)
sys.path.append(project_root)
sys.path.append(os.path.join(project_root, 'product_spiders'))

from product_spiders.db import Session
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes
from productspidersweb.models import Spider

# Debug aid: show the effective import path.
print(sys.path)

db_session = Session()
spider_modules = ['product_spiders.spiders']

# Walk every spider class under product_spiders.spiders and record the
# defining module path on the matching Spider DB row.
for name in spider_modules:
    for module in walk_modules(name):
        for spider in iter_spider_classes(module):
            sp = db_session.query(Spider).filter(
                Spider.name == spider.name).first()
            if sp:
                sp.module = str(spider.__module__)
                db_session.add(sp)

db_session.commit()
db_session.close()