Example #1
from datetime import datetime


class ScrapersRunInterface(object):
    # TASKS, db_connection and ScrapersFactory come from the surrounding project.
    tasks = TASKS
    db = db_connection()

    @classmethod
    def crawl(cls, task_name):
        """Run the scraper registered under task_name and persist its stats."""
        scraper_module = cls.tasks[task_name]
        scraper_instance = ScrapersFactory.run_instance(scraper_module)
        cls.saveStats(task_name, scraper_instance)
        return scraper_instance

    @classmethod
    def saveStats(cls, task_name, scraper_instance):
        """Store the scraper's counters and timers in MongoDB."""
        # Replace dots in timer keys, otherwise the driver raises
        # bson.errors.InvalidDocument: key 'response_handler.task_repo' must not contain '.'
        timers = {
            k.replace('.', '_'): v
            for k, v in scraper_instance.stat.timers.items()
        }
        stats = {
            'task_name': task_name,
            'crawler': cls.tasks[task_name],
            'counters': scraper_instance.stat.counters,
            'timers': timers,
            '_created': datetime.utcnow(),
        }
        # insert() is deprecated in pymongo >= 3; newer code uses insert_one().
        cls.db['scrapersStats'].insert(stats)
        return stats
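
A minimal usage sketch for the class above; the 'hacker_news' task name is a made-up example and must be a key of TASKS:

# Run one registered scraper and inspect its counters afterwards.
scraper = ScrapersRunInterface.crawl('hacker_news')
print(scraper.stat.counters)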
Example #2
import uuid
from datetime import datetime
from contextlib import contextmanager

from sqlalchemy import Column, DateTime, ForeignKey, Float
from sqlalchemy import Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker, backref
from sqlalchemy.orm import relationship, class_mapper, ColumnProperty
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy import create_engine

from core.errors import DBError
from core.models.artifact import Artifact
import settings

engine = create_engine(settings.db_connection())
Session = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()
Base.metadata.bind = engine
metadata = Base.metadata


def next_uuid():
    """Return a new random UUID4 as a string."""
    return str(uuid.uuid4())


def create_session():
    """Create and return session for db."""
    return scoped_session(Session)
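
The contextmanager, SQLAlchemyError and DBError imports above are unused in this excerpt, which suggests a session-scope helper was cut off; a minimal sketch of what such a helper typically looks like (the body is an assumption, not the original code):

@contextmanager
def session_scope():
    # Hypothetical helper: commit on success, roll back and wrap errors otherwise.
    session = create_session()
    try:
        yield session
        session.commit()
    except SQLAlchemyError as exc:
        session.rollback()
        raise DBError(str(exc))
    finally:
        session.close()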

Example #3
def db(self):
    # db_connection() comes from the surrounding project.
    return db_connection()
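
A two-line method like this is often exposed as a property so callers can write self.db; one plausible surrounding class, where the class name is an assumption:

class TaskRepository(object):
    # Hypothetical context; the original class is not shown in the excerpt.
    @property
    def db(self):
        return db_connection()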
Example #4
        '-S', '--scrape-blogs', type=str,
        help='scrape blogs from the provided *.opml file')
    parser.add_argument(
        '--stats', action='store_true', default=False,
        help='print some stats'
    )

    args = parser.parse_args()

    if args.scrape_blogs:
        spider = DataScienceBlogsSpider.get_instance(
            data_science_blogs_list=args.scrape_blogs)
        spider.run()
        # logger.info(spider.render_stats())
    elif args.stats:
        db = db_connection()
        print("Blogs in database: {}".format(
            db['blogs'].count()
        ))

        # calculate top 10 tags
        print("Top 10 tags:")
        pipeline = [
            # keep only documents whose tag list is non-empty
            {"$match": {"content.tags": {"$not": {"$size": 0}}}},
            {"$project": {"content.tags": 1}},
            # emit one document per tag, then count occurrences of each tag
            {"$unwind": "$content.tags"},
            {"$group": {"_id": "$content.tags", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}},
            {"$limit": 10}
        ]
        results = db.command("aggregate", "blogs", pipeline=pipeline)
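
On MongoDB >= 3.6 the raw "aggregate" command also requires a cursor argument, so the collection-level API is the more common way to run the same pipeline; a minimal sketch with the pipeline above:

# Equivalent collection-level call; yields one result document per tag.
for doc in db['blogs'].aggregate(pipeline):
    print("{_id}: {count}".format(**doc))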