class ScrapersRunInterface(object):
    """Run registered scraper tasks and persist their run statistics.

    ``tasks`` maps task names to scraper module paths; ``db`` is a shared
    database handle created once at class-definition time.
    """

    tasks = TASKS
    db = db_connection()

    @classmethod
    def crawl(cls, task_name):
        """Run the scraper registered under *task_name*.

        Saves the run's statistics and returns the scraper instance.
        """
        module_path = cls.tasks[task_name]
        instance = ScrapersFactory.run_instance(module_path)
        cls.saveStats(task_name, instance)
        return instance

    @classmethod
    def saveStats(cls, task_name, scraper_instance):
        """Store counters/timers of a finished run; return the stored document."""
        # MongoDB forbids '.' inside document keys, otherwise:
        # bson.errors.InvalidDocument: key 'response_handler.task_repo' must not contain '.'
        sanitized_timers = {}
        for key, value in scraper_instance.stat.timers.items():
            sanitized_timers[key.replace('.', '_')] = value

        record = {
            'task_name': task_name,
            'crawler': cls.tasks[task_name],
            'counters': scraper_instance.stat.counters,
            'timers': sanitized_timers,
            '_created': datetime.utcnow(),
        }
        cls.db['scrapersStats'].insert(record)
        return record
# Database wiring: SQLAlchemy engine, session factory and declarative base.
import uuid  # FIX: was missing — next_uuid() below calls uuid.uuid4()
from contextlib import contextmanager
from datetime import datetime

from sqlalchemy import (
    Column,
    DateTime,
    Float,
    ForeignKey,
    Integer,
    String,
    Text,
    create_engine,
)
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import (
    ColumnProperty,
    backref,
    class_mapper,
    relationship,
    scoped_session,
    sessionmaker,
)

from core.errors import DBError
from core.models.artifact import Artifact
import settings

# Engine/session wiring happens at import time; settings.db_connection()
# presumably returns a SQLAlchemy connection URL — confirm against settings.
engine = create_engine(settings.db_connection())
Session = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Base = declarative_base()
Base.metadata.bind = engine
metadata = Base.metadata


def next_uuid():
    """Return a fresh random UUID4 as a string."""
    return str(uuid.uuid4())


def create_session():
    """Create and return a thread-local (scoped) session for the db."""
    return scoped_session(Session)
def db(self):
    """Return a database handle.

    NOTE(review): thin accessor delegating to the project-level
    db_connection() helper; presumably a method on the owning class —
    confirm whether callers expect a cached handle instead of a
    fresh connection per call.
    """
    connection = db_connection()
    return connection
    # (fragment: the opening parser.add_argument( is outside this chunk)
    '-S', '--scrape-blogs', type=str,
    help='scrape blogs from provided *.opml file')
# Flag to print collection statistics instead of scraping.
parser.add_argument(
    '--stats', action="store_true", default=False, help='print some stats'
)

args = parser.parse_args()

if args.scrape_blogs:
    # Run the blog spider against the supplied OPML list of feeds.
    spider = DataScienceBlogsSpider.get_instance(
        data_science_blogs_list=args.scrape_blogs)
    spider.run()
    # logger.info(spider.render_stats())
elif args.stats:
    db = db_connection()
    print("Blogs in database: {}".format(
        db['blogs'].count()
    ))
    # calculate top 10 tags
    print("Top 10 tags:")
    # Aggregation: keep blogs whose tag list is non-empty, unwind tags,
    # count occurrences per tag, sort descending, take the top 10.
    pipeline = [
        {"$match": {"content.tags": {"$not": {"$size": 0}}}},
        {"$project": {"content.tags": 1}},
        {"$unwind": "$content.tags"},
        {"$group": {"_id": "$content.tags", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 10}
    ]
    results = db.command("aggregate", "blogs", pipeline=pipeline)