示例#1
0
def matching_france(region=None):
    """Render the France matching page.

    With a ``region`` the per-region list template is rendered directly.
    Without one, the internal collection is aggregated into a count of
    named French units per ADMIN_LEVEL_2 name and the region overview
    template is rendered with that result.

    The ``mode`` query-string argument (default ``'none'``) is forwarded
    to the template in both cases.
    """
    display_mode = request.args.get('mode', 'none')

    if region is not None:
        return render_template('admin/matching-france/list.html',
                               region=region,
                               mode=display_mode)

    settings = Config('./config/config.yml')
    internal_docs = DocFactory(settings.get('mongodb')).internal_collection()

    # Documents that have a non-empty ``name`` and sit under France.
    named_french_units = {
        'name': {'$exists': True, '$not': {'$size': 0}},
        '$and': [{'admin_hierarchy.ADMIN_LEVEL_1.name': 'France'}],
    }
    # One output row per ADMIN_LEVEL_2 name with the number of documents.
    count_per_region = {
        '_id': '$admin_hierarchy.ADMIN_LEVEL_2.name',
        'count': {'$sum': 1},
    }
    region_counts = internal_docs.aggregate([
        {'$match': named_french_units},
        {'$group': count_per_region},
    ])

    return render_template('admin/matching-france/region-list.html',
                           data=region_counts,
                           mode=display_mode)
from lib.factory.StorageLocation import StorageLocation as DocFactory
from lib.factory.Loader import Loader as LoaderFactory
from lib.config.Yaml import Yaml as Config
from lib.parser.wiki.France import France as WikiParser


def _report(passed):
    """Print ``.`` for a passed check and ``E`` for a failed one (no newline)."""
    print('.' if passed else 'E', end='')


config = Config('./config/config.yml')
document_factory = DocFactory(config.get('mongodb'))

url = 'https://fr.wikipedia.org/wiki/Paris'
headers = {'User-Agent': 'Mozilla/5.0'}

# Fetch and parse the page.
loader = LoaderFactory.loader_with_mongodb(config.get('mongodb'))
content, code = loader.load(url, headers=headers)
parser = WikiParser(content)

# Check 1: the storage document for this URL is reported as new.
doc = document_factory.wiki(url)
_report(doc.is_new())

# Check 2: the document data already carries a 'code' key.
document = doc.get_document()
_report('code' in document)

# Check 3: after applying the parsed data the name is 'Paris'.
doc.update(parser.as_dictionary())
dic = doc.get_document()
_report(dic.get('name') == 'Paris')
示例#3
0
config = Config('./config/config.yml')

# Both options are optional and each takes a value (no ``action`` is set).
arg_parser = ArgumentParser(
    description='Download data from wiki by link or search request')
arg_parser.add_argument('-f', help='turn on the force mode')
arg_parser.add_argument('-l', help='custom link to page with result(s)')
opts = arg_parser.parse_args()

# NOTE(review): presumably column positions in the parsed rows — confirm
# against the code that reads them.
insee_index = 0
name_index = 1
population_index = 2

force_update = opts.f
headers = {'User-Agent': 'Mozilla/5.0'}
loader = Loader.loader_with_mongodb(config.get('mongodb'))
document_factory = DocFactory(config.get('mongodb'))

# The log file name carries the current local date.
log = FileLog('./log/wiki_page_italy_{date}.log'.format(
    date=datetime.datetime.now().strftime('%Y-%m-%d')))
log.add('Start', log.INFO)
# Log the text repr directly: encoding it to bytes first makes Python 3
# render the value as "b'...'" inside the formatted message.
log.add('Params: [{0}]'.format(repr(opts)), log.INFO)

message_format = 'Parsing request:[{0}]'

# A missing or empty ``-l`` value means "no custom link".
use_link = bool(opts.l)
custom_link = opts.l if use_link else ''

def update_meta(url, request, document):
    """Set ``url`` on the document's data and list its stored requests.

    NOTE(review): only the beginning of this function is visible here;
    ``request`` is unused and ``added_requests`` is never read in the
    visible part, and nothing is returned.
    """
    actual_doc = document.get_document()
    actual_doc.update(url=url)
    # Convert every stored request to a tuple (missing key -> empty list).
    added_requests = [tuple(x) for x in actual_doc.get('requests', ())]
# Starts empty. NOTE(review): presumably filled with addresses later — confirm.
lst_address = []

# Field positions, listed in ascending order.
# NOTE(review): presumably column indexes of the source rows — confirm
# against the code that reads them.
codreg_index = 0
region_index = 1
codpro_index = 2
provincia_index = 3
codcom_index = 4
comune_index = 5
procom_index = 6
loc2011_index = 7
codloc_index = 8
localita_index = 9
altitude_index = 13

config = Config('./config/config.yml')
doc_factory = DocFactory(config.get('mongodb'))
language = 'it'

# Spider wired with the Italian gmap and wiki parsers, cache enabled.
spider = Spider(loader_factory=LoaderFactory,
                gmap_parser=MapFactory.italy,
                wiki_parser=ParserItalyWiki,
                doc_factory=doc_factory,
                language=language,
                config=config,
                use_cache=True)


def gmap_by_address(address):
    """Query the module-level ``spider`` for ``address``.

    NOTE(review): only the first statement is visible here; the result is
    stored in ``objects`` but not used or returned in the visible part.
    """
    objects = spider.get_gmap_address(address)
示例#5
0
from pymongo import MongoClient
from lib.config.Yaml import Yaml as Config
from lib.factory.StorageLocation import StorageLocation as DocFactory


config = Config('./config/config.yml').get('mongodb')
connection = MongoClient(config['host'], config['port'])

factory = DocFactory(config)

# Fields that each get their own ascending single-field index.
_INDEXED_FIELDS = ('_id', 'code', 'name', 'admin_hierarchy')


def _rebuild_indexes(collection):
    """Drop the collection's indexes, then create one per indexed field."""
    collection.drop_indexes()
    for field_name in _INDEXED_FIELDS:
        collection.create_index([(field_name, 1)])


wiki = factory.wiki_collection()
_rebuild_indexes(wiki)

gmaps = factory.gmaps_collection()
_rebuild_indexes(gmaps)

示例#6
0
from lib.config.Yaml import Yaml as Config
from lib.logger.File import File as FileLog
from lib.factory.StorageLocation import StorageLocation as DocFactory
from argparse import ArgumentParser

# Both options are optional and each takes a value (no ``action`` is set).
arg_parser = ArgumentParser(description='Download data from gmaps by address')
arg_parser.add_argument('-f', help='turn on the force mode')
arg_parser.add_argument('-a', help='address')
opts = arg_parser.parse_args()

config = Config('./config/config.yml')

loader = LoaderFactory.loader_gmaps_with_cache(
    gmaps_config=config.get('googlemaps'),
    storage_config=config.get('mongodb'))
document_factory = DocFactory(config.get('mongodb'))

# The log file name carries the current local date.
log = FileLog('./log/gmaps_address_france_{date}.log'.format(
    date=datetime.datetime.now().strftime('%Y-%m-%d')))
log.add('Start', log.INFO)
# Log the text repr directly: encoding it to bytes first makes Python 3
# render the value as "b'...'" inside the formatted message.
log.add('Params: [{0}]'.format(repr(opts)), log.INFO)

# A missing or empty ``-a`` value means "no address given".
use_address = bool(opts.a)
address = opts.a if use_address else ''
force_update = opts.f


def update_meta(request, document):
    """Build the document's request list with ``request`` appended.

    NOTE(review): only the beginning of this function is visible here;
    the list is not written back or returned in the visible part.
    """
    actual_doc = document.get_document()
    # Stored list entries become tuples; any other entry is kept as is.
    added_requests = [(tuple(x) if isinstance(x, list) else x)
                      for x in actual_doc.get('requests', ())]
    added_requests.append(request)
from lib.job.storage.MongoDB import MongoDB as Storage
from lib.job.map.google.PositionTask import PositionTask
from lib.config.Yaml import Yaml as Config
from lib.factory.StorageLocation import StorageLocation as DocFactory

# Country name used for the task name and for the query below.
country = 'Italia'

config = Config('./config/config.yml').get('mongodb')

# Storage is keyed by the name PositionTask gives for this country.
job_list = Storage(PositionTask.get_name(country), config)

factory = DocFactory(config)
wiki = factory.wiki_collection()

# Query: 'name' exists and is not an empty array, and 'admin_hierarchy'
# has at least one element whose 'name' equals ``country``.
# NOTE(review): this name shadows the built-in ``filter`` from here on.
filter = {
    'name': {
        '$exists': True,
        '$not': {
            '$size': 0
        }
    },
    'admin_hierarchy': {
        '$elemMatch': {
            'name': country
        }
    }
}

objects = wiki.find(filter)
for obj in objects:
    try:
示例#8
0
from lib.parser.wiki.France import France as ParserFranceWiki
from lib.factory.Loader import Loader as LoaderFactory
import csv
from lib.parser.map.google.GMapFactory import GMapFactory as MapFactory
from lib.spider.Spider import Spider

# CSV data file paths; the first two entries are currently commented out.
files = [
    #'data/france/Departements_28_08_17_cards.csv',
    #'data/france/arrondissements_25_08_17_cards.csv',
    'data/france/20_08_17_canton_google_3.csv',
    'data/france/communes_17_09_17.csv'
]

config = Config('./config/config.yml')

doc_factory = DocFactory(config.get('mongodb'))
language = 'fr'

# Spider wired with the French gmap and wiki parsers, cache enabled.
spider = Spider(loader_factory=LoaderFactory,
                gmap_parser=MapFactory.france,
                wiki_parser=ParserFranceWiki,
                doc_factory=doc_factory,
                language=language,
                config=config,
                use_cache=True)

internal_collection = doc_factory.internal_collection()
i = 0
# NOTE(review): if ``hash`` is the built-in here, calling it without an
# argument raises TypeError; presumably an import not visible in this
# excerpt rebinds the name — confirm.
hash_lib = hash()
for csv_file in files:
    with open(csv_file, encoding='utf-8') as admin_div_CSV:
示例#9
0
from lib.spider.Spider import Spider
from lib.parser.map.google.GMapFactory import GMapFactory as MapFactory
from lib.factory.Loader import Loader
import math
import wikipedia
import datetime
import sys
from lib.parser.wiki.Spain import Spain as WikiES
from lib.logger.File import File as FileLog
from argparse import ArgumentParser

# from lib.parser.wiki.Spain import Spain as ParserSpain
country = 'Spain'

config = Config('./config/config.yml')
document_factory = DocFactory(config.get('mongodb'))
# NOTE(review): ``pd`` is presumably pandas; its import is not among the
# imports directly above — confirm.
df = pd.read_csv('./data/spain/Spain_notDublicate.csv', skiprows=[1])
# print(config)
language = 'es'
# Disabled Spider setup, kept as found:
# spider = Spider(
#     loader_factory=LoaderFactory,
#     gmap_parser=MapFactory.spain,
#     wiki_parser=True,
#     doc_factory=doc_factory,
#     language=language,
#     config=config,
#     use_cache=True
# )
loader = Loader.loader_with_mongodb(config.get('mongodb'))

示例#10
0
def insee_code_unit(id):
    """Render the unit page for the INSEE document whose ``code`` is ``id``.

    The template receives the matching document as ``data`` (whatever
    ``find_one`` returns for the query).
    """
    mongo_settings = Config('./config/config.yml').get('mongodb')
    insee_docs = DocFactory(mongo_settings).insee_collection()
    record = insee_docs.find_one({'code': id})
    return render_template('admin/other/unit.html', data=record)
示例#11
0
def matching_france_js(region):
    """Render the JS data file comparing sources for one French region.

    For every internal document that has a non-empty ``name`` and belongs
    to France / ``region`` (ADMIN_LEVEL_1 / ADMIN_LEVEL_2 names), the
    linked wiki, gmap and insee documents are looked up by the codes in
    the item's ``source`` mapping and a set of 0/1 mismatch flags is
    computed.

    Args:
        region: URL-quoted ADMIN_LEVEL_2 name; it is unquoted with
            ``unquote_plus`` before being used in the query.

    Returns:
        The rendered ``admin/matching-france/list.js`` template. Each
        entry of ``items`` is a dict with the keys ``internal``, ``wiki``,
        ``gmap``, ``insee`` and ``compare``.
    """
    region = unquote_plus(region)
    config = Config('./config/config.yml')

    factory = DocFactory(config.get('mongodb'))
    internal = factory.internal_collection()
    wiki = factory.wiki_collection()
    gmap = factory.gmaps_collection()
    insee = factory.insee_collection()
    objects = internal.find({
        'name': {
            '$exists': True,
            '$not': {
                '$size': 0
            }
        },
        '$and': [{
            'admin_hierarchy.ADMIN_LEVEL_1.name': 'France'
        }, {
            'admin_hierarchy.ADMIN_LEVEL_2.name': region
        }],
    })

    max_meters_in_distance = 5000
    result = []
    for item in objects:
        sources = item.get('source', {})
        wiki_res = _find_by_code(wiki, sources.get('wiki'))
        gmap_res = _find_by_code(gmap, sources.get('gmap'))
        insee_res = _find_by_code(insee, sources.get('insee'))

        # Best effort: if the distance cannot be computed (for example a
        # missing centre), the positions are reported as mismatching.
        try:
            too_far = Comparison.by_distance(
                wiki_res.get('center'),
                gmap_res.get('center')) > max_meters_in_distance
        except Exception:
            too_far = True

        # The keys are read outside this function, so they are kept
        # exactly as they were (including the 'posinion' spelling).
        compare_res = {
            'insee_code!=wiki_code': _flag(
                insee_res.get('InseeXls_CodeCommune')
                != wiki_res.get('commune_codes')),
            'insee_name!=wiki_name': _flag(
                insee_res.get('InseeXls_NameCommune')
                != wiki_res.get('name')),
            'wiki_name!=gmaps_name': _flag(
                wiki_res.get('true_name', wiki_res.get('name'))
                != gmap_res.get('true_name', gmap_res.get('name'))),
            'wiki_post!=gmaps_post': _flag(
                str(wiki_res.get('postal_codes'))
                != str(gmap_res.get('postal_code'))),
            'wiki_admin!=gmaps_admin': _flag(
                str(wiki_res.get('admin_hierarchy'))
                != str(gmap_res.get('admin_hierarchy'))),
            'wiki_posinion>gmaps_position': _flag(too_far),
        }

        # Every item is returned; no filtering by the ``mode`` query
        # argument is applied here.
        result.append({
            'internal': item,
            'wiki': wiki_res,
            'gmap': gmap_res,
            'insee': insee_res,
            'compare': compare_res,
        })

    return render_template('admin/matching-france/list.js',
                           e=escape,
                           items=result)


def _find_by_code(collection, code):
    """Return the document in ``collection`` with this ``code``, or ``{}``."""
    if not code:
        return {}
    # find_one() returns None when nothing matches; fall back to an empty
    # dict so callers can always use ``.get()`` on the result.
    return collection.find_one({'code': code}) or {}


def _flag(differs):
    """Map a truthy/falsy comparison result to the 1/0 flag used in the output."""
    return 1 if differs else 0
示例#12
0
from lib.parser.wiki.France import France as ParserFranceWiki
from lib.factory.Loader import Loader as LoaderFactory
import csv
from lib.parser.map.google.GMapFactory import GMapFactory as MapFactory
from lib.spider.Spider import Spider

# CSV data file paths.
files = [
    'data/france/Departements_28_08_17_cards.csv',
    'data/france/arrondissements_25_08_17_cards.csv',
    'data/france/20_08_17_canton_google_3.csv',
    'data/france/communes_17_09_17.csv',
]

language = 'fr'

config = Config('./config/config.yml')
doc_factory = DocFactory(config.get('mongodb'))

# Spider wired with the French gmap and wiki parsers, cache enabled.
spider = Spider(
    loader_factory=LoaderFactory,
    gmap_parser=MapFactory.france,
    wiki_parser=ParserFranceWiki,
    doc_factory=doc_factory,
    language=language,
    config=config,
    use_cache=True,
)


def gmap_by_address(wiki):

    address = []
    for name, value in wiki.get('admin_hierarchy', {}).items():
示例#13
0
from lib.factory.StorageLocation import StorageLocation as DocFactory
from lib.config.Yaml import Yaml as Config
import re
subject = "replacing the leftmost non-overlapping"
# NOTE(review): the result of this call is discarded, so it has no effect;
# it looks like a leftover experiment.
re.sub('e', 'E', subject)

config = Config('./config/config.yml')

doc_factory = DocFactory(config.get('mongodb'))

gmap_docs = doc_factory.gmaps_collection()
wiki_docs = doc_factory.wiki_collection()

# Whitespace followed by a parenthesised group at the very end of the
# string. A raw string keeps the backslashes intact without relying on
# Python passing unknown escape sequences through (which emits a warning).
_TRAILING_PARENTHESES = re.compile(r'\s+\([^\)]+\)$')

# gmaps: ``true_name`` is a plain copy of ``name``.
for gmap_doc in gmap_docs.find():
    print(gmap_doc.get('name'), "\n")
    if gmap_doc.get('name'):
        gmap_doc.update(true_name=gmap_doc.get('name'))
        # Address the exact document by ``_id`` and write only the new
        # field: sending the whole document in ``$set`` also sends ``_id``,
        # and filtering on ``code`` can hit a different document when the
        # code is missing or shared.
        gmap_docs.update_one({'_id': gmap_doc['_id']},
                             {'$set': {'true_name': gmap_doc['true_name']}})

# wiki: ``true_name`` is ``name`` without the trailing parenthesised part.
for wiki_doc in wiki_docs.find():
    print(wiki_doc.get('name'), "\n")
    if wiki_doc.get('name'):
        true_name = _TRAILING_PARENTHESES.sub('', wiki_doc.get('name'))
        wiki_doc.update(true_name=true_name)
        wiki_docs.update_one({'_id': wiki_doc['_id']},
                             {'$set': {'true_name': true_name}})
from lib.factory.StorageLocation import StorageLocation as DocFactory
from lib.factory.Loader import Loader
from lib.config.Yaml import Yaml as Config
from lib.parser.map.google.GMapFactory import GMapFactory as MapFactory
from lib.factory.Loader import Loader as LoaderFactory
from lib.parser.wiki.France import France as ParserFranceWiki
from lib.spider.Spider import Spider

config = Config('./config/config.yml')

doc_factory = DocFactory(config.get('mongodb'))

# Handles to the three collections used by this script.
internal_docs = doc_factory.internal_collection()
gmap_docs = doc_factory.gmaps_collection()
wiki_docs = doc_factory.wiki_collection()

language = 'fr'

# The 'googlemaps' settings get the language added before the loader is
# built. NOTE(review): ``update`` mutates whatever object ``config.get``
# returned — confirm that sharing this change is intended.
gmap_config = config.get('googlemaps')
gmap_config.update(language=language)

gmap_loader = Loader.loader_gmaps_with_cache(
    gmaps_config=gmap_config, storage_config=config.get('mongodb'))

document_filter = {
    'name': {
        '$exists': True,
        '$not': {
            '$size': 0
        }
    },