def _get_common_data(self, list_name, force=False):
    """Assemble the task options and the job storage for ``list_name``.

    The configuration is read from ``./config/config.yml`` on every call.

    Returns a 2-tuple ``(options, storage)``:

    * ``options`` -- dict with the keys ``loader`` (the result of
      ``LoaderFactory.loader_gmaps_with_cache``), ``doc_factory``
      (a ``DocFactory`` built from the ``mongodb`` config section),
      ``parser`` (the ``MapFactory`` attribute named after
      ``self._country``) and ``force_update`` (the ``force`` argument,
      passed through unchanged).
    * ``storage`` -- a ``MongoDB`` instance named
      ``"<list_name>_<self._country>"``.
    """
    settings = Config('./config/config.yml')

    # Dict values are evaluated top to bottom, so the loader is created
    # before the document factory and the parser lookup.
    task_options = {
        'loader': LoaderFactory.loader_gmaps_with_cache(
            gmaps_config=settings.get('googlemaps'),
            storage_config=settings.get('mongodb'),
        ),
        'doc_factory': DocFactory(settings.get('mongodb')),
        'parser': getattr(MapFactory, self._country),
        'force_update': force,
    }

    storage_name = '{}_{}'.format(list_name, self._country)
    job_storage = MongoDB(storage_name, settings.get('mongodb'))
    return task_options, job_storage
from lib.job.storage.MongoDB import MongoDB as Storage from lib.job.map.google.AddressTask import AddressTask from lib.config.Yaml import Yaml as Config import pandas as pd country = 'Italy' lst_address = [] region_index = 1 provincia_index = 3 comune_index = 5 localita_index = 9 config = Config('./config/config.yml').get('mongodb') job_list = Storage(AddressTask.get_name(country), config) df = pd.read_csv('./data/italy/indicatori_2011_localita.csv', delimiter=";", skiprows=[1], encoding='ISO-8859-1') for index, row in df.iterrows(): print(index) try: new_address = 'Italia, ' if row[region_index]: new_address += row[region_index] if new_address not in lst_address: lst_address.append(new_address) job_list.add(new_address)
# Seeds the job list named by RequestTask.get_name('France'): one
# "insee+<value>" entry is added for every row of the tab-separated file
# ./WorkBaseFile/BaseCommuneInInseeFR, using the value of its first column.
from lib.job.storage.MongoDB import MongoDB as Storage
from lib.job.wiki.RequestTask import RequestTask
from lib.config.Yaml import Yaml as Config
import pandas as pd

config = Config('./config/config.yml')
country = 'France'
job_list = Storage(RequestTask.get_name(country), config.get('mongodb'))

df = pd.read_csv('./WorkBaseFile/BaseCommuneInInseeFR', delimiter="\t")
for _, row in df.iterrows():
    # Each row is indexed by the CSV's column labels, so a bare row[0]
    # depends on the deprecated integer-key positional fallback of
    # Series.__getitem__; .iloc[0] is explicitly "first column".
    insee = row.iloc[0]
    job_list.add("insee+{insee}".format(insee=insee))
from lib.job.storage.MongoDB import MongoDB as Storage from lib.job.map.google.PositionTask import PositionTask from lib.config.Yaml import Yaml as Config from lib.factory.StorageLocation import StorageLocation as DocFactory country = 'Italia' config = Config('./config/config.yml').get('mongodb') job_list = Storage(PositionTask.get_name(country), config) factory = DocFactory(config) wiki = factory.wiki_collection() filter = { 'name': { '$exists': True, '$not': { '$size': 0 } }, 'admin_hierarchy': { '$elemMatch': { 'name': country } } } objects = wiki.find(filter) for obj in objects: try:
from lib.job.wiki.RequestTask import RequestTask from lib.config.Yaml import Yaml as Config import pandas as pd import urllib.parse country = 'Italy' lst_address = [] region_index = 1 provincia_index = 3 comune_index = 5 localita_index = 9 config = Config('./config/config.yml').get('mongodb') job_list = Storage(RequestTask.get_name(country), config) df = pd.read_csv('./data/italy/indicatori_2011_localita.csv', delimiter=";", skiprows=[1], encoding='ISO-8859-1') for index, row in df.iterrows(): print(index) try: new_address = 'Italia,' if row[1]: new_address += row[region_index] if new_address not in lst_address: lst_address.append(new_address) job_list.add(urllib.parse.quote(new_address))
from lib.config.Yaml import Yaml as Config from lib.factory.StorageLocation import StorageLocation as DocFactory import pandas as pd import urllib.parse country = 'Italia' lst_address = [] region_index = 1 provincia_index = 3 comune_index = 5 localita_index = 9 config = Config('./config/config.yml').get('mongodb') job_list = Storage(PageTask.get_name(country), config) factory = DocFactory(config) wiki = factory.wiki_collection() filter = { 'name': { '$exists': True, '$not': { '$size': 0 } }, 'admin_hierarchy': { '$elemMatch': { 'name': country }
# Seeds the job list named by PageRecursiveTask.get_name('Italy'): for
# every row of the tab-separated file ./WorkBaseFile/ItalyUrlMainList, a
# {'link': <first column value>, 'level': max_dig_level} entry is added.
from lib.job.storage.MongoDB import MongoDB as Storage
from lib.job.wiki.RequestTask import RequestTask
from lib.config.Yaml import Yaml as Config
import pandas as pd
from lib.job.wiki.PageRecursiveTask import PageRecursiveTask

config = Config('./config/config.yml')
country = 'Italy'
max_dig_level = 4
job_list = Storage(PageRecursiveTask.get_name(country), config.get('mongodb'))

df = pd.read_csv('./WorkBaseFile/ItalyUrlMainList', delimiter="\t")
for _, row in df.iterrows():
    # Each row is indexed by the CSV's column labels, so a bare row[0]
    # depends on the deprecated integer-key positional fallback of
    # Series.__getitem__; .iloc[0] is explicitly "first column".
    link = row.iloc[0]
    job_list.add({'link': link, 'level': max_dig_level})