from argparse import ArgumentParser

# NOTE(review): Config, Loader, DocFactory, FileLog and datetime are used below
# but are not imported in the visible part of this file - confirm they are in
# scope before running.

config = Config('./config/config.yml')

# --- command line ---------------------------------------------------------
arg_parser = ArgumentParser(
    description='Download data from wiki by link or search request')
# ``-f`` is documented as a switch, so allow it to be given bare
# (``-f`` -> True).  ``nargs='?'`` still accepts the legacy ``-f VALUE`` form
# and the value stays None when the option is absent, so every invocation
# that worked before yields the same ``opts.f``.
arg_parser.add_argument('-f', nargs='?', const=True,
                        help='turn on the force mode')
arg_parser.add_argument('-l', help='custom link to page with result(s)')
opts = arg_parser.parse_args()

# Positional indexes (presumably columns of a parsed table row - TODO confirm
# against the code that consumes them).
insee_index = 0
name_index = 1
population_index = 2

force_update = opts.f
headers = {'User-Agent': 'Mozilla/5.0'}
loader = Loader.loader_with_mongodb(config.get('mongodb'))
document_factory = DocFactory(config.get('mongodb'))

# --- logging --------------------------------------------------------------
# One log file per calendar day.
log = FileLog('./log/wiki_page_italy_{date}.log'.format(
    date=datetime.datetime.now().strftime('%Y-%m-%d')))
log.add('Start', log.INFO)
# Format the textual repr directly.  Encoding it first produces a bytes
# object on Python 3, which str.format() renders as "b'...'" in the log line.
log.add('Params: [{0}]'.format(repr(opts)), log.INFO)

message_format = 'Parsing request:[{0}]'
# A custom link, when supplied with ``-l``, takes the place of a search.
use_link = bool(opts.l)
custom_link = opts.l if use_link else ''


def update_meta(url, request, document):
    actual_doc = document.get_document()
    # Stamp the source URL onto the underlying document.
    actual_doc.update(url=url)
from lib.factory.StorageLocation import StorageLocation as DocFactory
from lib.factory.Loader import Loader as LoaderFactory
from lib.config.Yaml import Yaml as Config
from lib.parser.wiki.France import France as WikiParser


def _mark(passed):
    """Print one progress character: '.' if *passed* is truthy, else 'E'."""
    print('.' if passed else 'E', end='')


# Fixed inputs for this check.
url = 'https://fr.wikipedia.org/wiki/Paris'
headers = {'User-Agent': 'Mozilla/5.0'}

config = Config('./config/config.yml')
document_factory = DocFactory(config.get('mongodb'))
loader = LoaderFactory.loader_with_mongodb(config.get('mongodb'))

# Fetch the page and hand its content to the parser.
content, code = loader.load(url, headers=headers)
parser = WikiParser(content)

# Check 1: the factory reports the wiki document for this URL as new.
doc = document_factory.wiki(url)
_mark(doc.is_new())

# Check 2: the document already carries a 'code' key.
document = doc.get_document()
_mark('code' in document)

# Check 3: after merging the parsed data, the name is 'Paris'.
doc.update(parser.as_dictionary())
dic = doc.get_document()
_mark(dic.get('name') == 'Paris')
from lib.config.Yaml import Yaml as Config from lib.factory.Loader import Loader as LoaderFactory from lib.factory.StorageLocation import StorageLocation as DocFactory from lib.parser.wiki.Italy import Italy from lib.logger.MongoDB import MongoDB as Log from time import sleep force = True config = Config('./config/config.yml') country = 'Italia' options = {} loader = LoaderFactory.loader_with_mongodb( storage_config=config.get('mongodb')) options.update(loader=loader) doc_factory = DocFactory(config.get('mongodb')) options.update(doc_factory=doc_factory) options.update(force_update=force) options.update(parser=Italy) options.update(headers={'User-Agent': 'Mozilla/5.0'}) storage = Storage(job_name=PageTask.get_name(country), storage_config=config.get('mongodb')) log = Log(log_name=PageTask.get_name(country), config=config.get('mongodb')) task_list = TaskListMongoDB(task_type=PageTask.get_name(country),