Example #1
    def __init__(self, spec: CrawlSpecification, settings: dict):
        super().__init__(spec)
        self.log = shared.simple_logger("RemoteCrawlFinalizer",
                                        file_path=os.path.join(
                                            self.crawl_specification.logs,
                                            self.crawl_specification.name,
                                            "scrapy.log"))
    def __init__(self, logname="scrapy_wrapper", spec=None, **kwargs):
        super().__init__(**kwargs)
        if spec and spec.logs:
            self.logger = shared.simple_logger(loger_name="linkextractor",
                                               file_path=os.path.join(
                                                   spec.logs,
                                                   logname + ".log"))
    def __init__(self):
        super().__init__()
        # setup an individual logger for every spider
        if self.crawl_specification.logs:
            self.s_log = shared.simple_logger(
                loger_name="crawlspider",
                file_path=os.path.join(self.crawl_specification.logs,
                                       self.name + ".log"))
        else:
            self.s_log = shared.simple_logger(loger_name="crawlspider")

        # register this spider with its parser
        self.parser.spider = self

        # forward the spider's log handlers to scrapy's own spider logger
        for hand in self.s_log.handlers:
            self.logger.logger.addHandler(hand)
        self.s_log.info("[__init__] - Crawlspider logger setup finished.")
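# The __init__ snippets above all rely on shared.simple_logger to build a
# console logger and optionally attach a file handler. The sketch below is a
# hypothetical illustration of such a factory, assuming the loger_name,
# file_path and file_level keywords seen in the calls above; it is not the
# project's actual helper.
import logging
import os


def simple_logger(loger_name="root", file_path=None, file_level=logging.INFO):
    """Hypothetical sketch: console logger with an optional file handler."""
    logger = logging.getLogger(loger_name)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "%(asctime)s [%(name)s] %(levelname)s: %(message)s")

    # always log to the console
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)

    # attach a file handler only when a log file path is given
    if file_path:
        directory = os.path.dirname(file_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        file_handler = logging.FileHandler(file_path)
        file_handler.setLevel(file_level)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger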
def run_crawl(call_parameter, worker_flag=False):
    """Run a crawl from a specification file path or a JSON specification string."""
    global MLOG
    # setup consistent language detection
    DetectorFactory.seed = 0

    if os.path.exists(call_parameter):
        crawl_specification = load_settings(call_parameter)
    else:
        # otherwise assume the parameter itself is the JSON specification string
        crawl_specification = CrawlSpecification()
        crawl_specification.deserialize(call_parameter)

    if not crawl_specification:
        MLOG.error(
            "Crawl settings could not be loaded. Exiting scrapy_wrapper.")
        sys.exit(1)

    scrapy_settings = GenericScrapySettings()
    if crawl_specification.logs:
        if not os.path.exists(crawl_specification.logs):
            os.makedirs(crawl_specification.logs, exist_ok=True)
        # reset the master log for the wrapper to include file logging
        MLOG = shared.simple_logger(loger_name="scrapy_wrapper",
                                    file_path=os.path.join(
                                        crawl_specification.logs,
                                        "scrapy_wrapper.log"),
                                    file_level=log_level)
        # specifically assign a log file for scrapy
        scrapy_settings.set(
            "LOG_FILE", os.path.join(crawl_specification.logs, "scrapy.log"))

    scrapy_settings.set("ITEM_PIPELINES", crawl_specification.pipelines)

    MLOG.info("Initiating scrapy crawler process")
    process = CrawlerProcess(settings=scrapy_settings)
    start_urls = list(set(crawl_specification.urls))
    allowed_domains = list(map(lambda x: urlparse(x).netloc, start_urls))
    for url in start_urls:
        name = shared.url2filename(url)
        MLOG.info("Creating spider {0}".format(name))
        process.crawl(create_spider(crawl_specification, url, name))
    try:
        process.start()
    except Exception as exc:
        MLOG.exception("{0}: {1}".format(type(exc).__name__, exc))

    # every spider finished, finalize crawl
    for finalizer_path in crawl_specification.finalizers:
        finalizer = shared.get_class(finalizer_path)
        if finalizer:
            # somehow pass the collected language statistics from parser
            finalizer(crawl_specification,
                      crawl_specification.finalizers[finalizer_path]).finalize_crawl()

    if worker_flag:
        return True
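# A plausible command line entry point for run_crawl (an assumption for
# illustration; the actual wrapper may wire up its __main__ differently): the
# first argument is either a path to a crawl specification file or the JSON
# specification string itself, and an optional second argument "DEBUG" raises
# the log level as handled by the module-level code further below.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        MLOG.error("Usage: scrapy_wrapper.py <specification path or JSON> [DEBUG]")
        sys.exit(1)
    run_crawl(sys.argv[1])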
def test_finalize_paragraphs():
    """Finalization should succeed for a crawl with CSV data."""

    logger = shared.simple_logger("RemoteCrawlFinalizer",
                                  file_path=os.path.join(
                                      'result_logs', 'my_crawl', "scrapy.log"))

    finalize_paragraphs('my_crawl', 'result_data', 'result_logs', logger)

    assert False
LANGSTATS = pandas.DataFrame(columns=ACCEPTED_LANG)
LANGSTATS.index.name = "url"

DEBUG = False
if len(sys.argv) >= 3 and sys.argv[2] == "DEBUG":
    DEBUG = True

if DEBUG:
    log_level = logging.DEBUG
else:
    log_level = logging.INFO

VERSION = "0.4.2"

# Prepare logging; until the specification has been read, log to the console only
MLOG = shared.simple_logger(loger_name="scrapy_wrapper")
MLOG.info("Running scrapy_wrapper on version {}".format(VERSION))


def load_settings(settings_path) -> CrawlSpecification:
    """
    Loads the JSON string from settings_path and deserializes it into a CrawlSpecification object.
    Determines the required behaviour with respect to CrawlSpecification.mode.
    :param settings_path: file path to the JSON crawl specification file
    :return: parsed and semantically updated CrawlSpecification object
    """
    try:
        settings_file = open(settings_path, "r")
        settings = CrawlSpecification()
        json_str = settings_file.read()
        settings.deserialize(json_str)