Exemplo n.º 1
0
def main_routine():
    """Run the whole pipeline: scrape + parse, node processing, edge processing.

    Reads ``config.yaml`` from the current working directory, configures
    logging from it, creates a ``storage.MongoDatabase`` and then runs, in a
    fixed order:

    1. every scraper (``store_all``) immediately followed by the HTML parser
       of the same name (``parse_all``),
    2. every ``processing.Nodes*`` step,
    3. every ``processing.Edges*`` step.

    The total elapsed time since the start is logged after each phase.
    """
    t = time()
    with open("config.yaml", "r") as f:
        # yaml.load() without an explicit Loader is deprecated since
        # PyYAML 5.1 and raises TypeError on PyYAML 6+; safe_load also
        # refuses to build arbitrary Python objects from the file.
        conf = yaml.safe_load(f)
    set_up_logging(conf[const.CONF_LOGGING])
    logger = logging.getLogger("Main")

    db = storage.MongoDatabase(conf=conf[const.CONF_MONGO])

    # (scraper, parser) pairs; order is preserved from the original code.
    scrape_parse_pairs = (
        (scrape.Hlasovanie, html_parser.Hlasovanie),
        (scrape.Poslanec, html_parser.Poslanec),
        (scrape.Zakon, html_parser.Zakon),
        (scrape.LegislativnaIniciativa, html_parser.LegislativnaIniciativa),
        (scrape.HlasovanieTlace, html_parser.HlasovanieTlace),
        (scrape.Zmena, html_parser.Zmena),
        (scrape.Rozprava, html_parser.Rozprava),
    )
    for scraper_cls, parser_cls in scrape_parse_pairs:
        scraper_cls(db, conf).store_all()
        parser_cls(db, conf).parse_all()

    logger.info("Total elapsed time after scrape + parse: %f", time() - t)

    node_processors = (
        processing.NodesHlasovanie,
        processing.NodesPoslanec,
        processing.NodesKlub,
        processing.NodesVybor,
        processing.NodesDelegacia,
        processing.NodesZakon,
        processing.NodesSpektrum,
        processing.NodesZmena,
        processing.NodesRozprava,
    )
    for processor_cls in node_processors:
        processor_cls(db, conf).process_and_store_all()

    logger.info("Total elapsed time after nodes insert: %f", time() - t)

    # Edge steps run only after all node steps, as in the original ordering.
    edge_processors = (
        processing.EdgesPoslanecKlubClen,
        processing.EdgesPoslanecKlubBolClenom,
        processing.EdgesPoslanecVyborClen,
        processing.EdgesPoslanecDelegaciaClen,
        processing.EdgesPoslanecHlasovanieHlasoval,
        processing.EdgesVyborZakonNavrhnuty,
        processing.EdgesVyborZakonGestorsky,
        processing.EdgesPoslanecZakonNavrhol,
        processing.EdgesKlubSpektrumClen,
        processing.EdgesSpektrumZakonNavrhol,
        processing.EdgesHlasovanieZakonHlasovaloO,
        processing.EdgesPoslanecZmenaNavrhol,
        processing.EdgesPoslanecZmenaPodpisal,
        processing.EdgesZmenaZakonNavrhnuta,
        processing.EdgesHlasovanieZmenaHlasovaloO,
        processing.EdgesPoslanecRozpravaVystupil,
        processing.EdgesRozpravaZakonTykalaSa,
    )
    for processor_cls in edge_processors:
        processor_cls(db, conf).process_and_store_all()

    logger.info("Total elapsed time after edges insert: %f", time() - t)
from multiprocessing import Pool as ThreadPool

import validictory
from validictory import SchemaValidator

from schema.validate.sopr_html import transformed_ld1_schema,\
    transformed_ld2_schema
from utils.validate import validate_uuid, validate_url, validate_email
from utils import set_up_logging
from settings import TRANS_DIR

# Custom "format" checkers, keyed by the format name they handle; passed to
# SchemaValidator below.
format_validators = {"uuid_hex": validate_uuid,
                     "url_http": validate_url,
                     "email": validate_email}

# Module-level logger for the validation step.
log = set_up_logging('validate', loglevel=logging.DEBUG)

# The four flags below are handed positionally to SchemaValidator; they are
# kept as named module-level values so the call site stays readable.
required_by_default = True

blank_by_default = False

disallow_unknown_properties = True

apply_default_to_data = False

# NOTE(review): arguments are positional, so their order must match the
# SchemaValidator constructor signature of the installed validictory
# version -- confirm when upgrading.
validator = SchemaValidator(format_validators, required_by_default,
                            blank_by_default, disallow_unknown_properties,
                            apply_default_to_data)


def log_result(result):
import os
import logging
import time

from multiprocessing.dummy import Pool as ThreadPool

from utils import set_up_logging

# Module-level logger for the download step, created at DEBUG level.
log = set_up_logging('download', loglevel=logging.DEBUG)


# GENERAL DOWNLOAD FUNCTIONS
def response_download(response, output_loc):
    """Write the body of *response* to the file at *output_loc*.

    Args:
        response: object exposing ``ok``, ``reason``, ``headers`` and
            ``iter_content()`` (presumably a ``requests.Response`` --
            confirm against callers).
        output_loc: path of the destination file; opened in ``'wb'`` mode,
            so an existing file is overwritten.

    Returns:
        The value of the ``content-length`` response header, or ``'N/A'``
        when the header is absent. ``None`` when writing failed; the error
        is logged rather than propagated.

    Raises:
        Exception: when ``response.ok`` is false.
    """
    if not response.ok:
        # format() instead of '+' so a missing (None) reason cannot turn the
        # log call itself into a TypeError.
        log.error('response not okay: {}'.format(response.reason))
        raise Exception("didn't work, trying again")

    # iter_content() defaults to 1-byte chunks; ask for larger ones so the
    # download is not written byte by byte.
    chunk_size = 64 * 1024
    try:
        with open(output_loc, 'wb') as output_file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                output_file.write(chunk)
        return response.headers.get('content-length', 'N/A')
    except Exception as e:
        # Best-effort: a failed write is reported, not raised.
        log.error(e)
        return None


def log_result(result):
    if result[0] == 'success':
        url, loc, content_length = result[1:]
        log.info(
            'success: {source} => {dest}({size})'.format(
Exemplo n.º 4
0
def configure_worker(sender=None, **extra):
    """Set up logging with the project's default settings.

    ``sender`` and any extra keyword arguments are accepted and ignored
    (presumably so this can be connected as a signal handler -- confirm).
    """
    # Imported lazily, inside the function, as in the original.
    import utils
    utils.set_up_logging()
Exemplo n.º 5
0
import viewer


def create_app():
    """Build and return the Flask application.

    Configuration is read from ``settings.py`` in the instance folder; a
    missing file is tolerated (``silent=True``). The ``data`` and ``viewer``
    modules are then initialized against the app, in that order.
    """
    application = flask.Flask(__name__, instance_relative_config=True)
    application.config.from_pyfile('settings.py', silent=True)
    for component in (data, viewer):
        component.initialize(application)
    return application


# Command manager built around the app factory (the factory itself is passed,
# not an app instance).
manager = Manager(create_app)

# Let the data module add its own commands to the manager.
data.register_commands(manager)


@manager.option('-s', '--socket')
def runfcgi(socket):
    """Serve the application over FastCGI, bound to *socket*.

    Registered as a manager command; the address comes from ``-s/--socket``.
    """
    # flup is imported here so it is only needed when this command runs.
    from flup.server.fcgi import WSGIServer
    application = create_app()
    # NOTE(review): umask=0 presumably leaves the created socket accessible
    # to other users (e.g. the web server) -- confirm.
    server = WSGIServer(application,
                        debug=application.debug,
                        bindAddress=socket,
                        umask=0)
    server.run()


# Executed as a script: configure logging, then hand control to the manager.
if __name__ == '__main__':
    from utils import set_up_logging
    set_up_logging()
    manager.run()

# Imported as a module: expose a ready-made application object named `app`.
else:
    app = create_app()
import os
import re
import sys
import logging
import json

from glob import iglob
from collections import defaultdict

import numpy as np

from settings import CACHE_DIR, REF_DIR
from utils import set_up_logging
from utils import mkdir_p

# Module-level logger for the describe step, created at DEBUG level.
log = set_up_logging('describe', loglevel=logging.DEBUG)


def describe_dos(options):
    if options.get('loglevel', None):
        log.setLevel(options['loglevel'])

    OUT_DIR = os.path.join(REF_DIR, 'dos')
    if not os.path.exists(OUT_DIR):
        mkdir_p(OUT_DIR)

    sql_to_dtype = {
        'VARCHAR': 'object',
        'INT': 'int64',
        'MONEY': 'float64'
    }
Exemplo n.º 7
0
def configure_worker(sender=None, **extra):
    """Set up logging with the project's default settings.

    ``sender`` and any extra keyword arguments are accepted and ignored
    (presumably so this can be connected as a signal handler -- confirm).
    """
    # Imported lazily, inside the function, as in the original.
    import utils
    utils.set_up_logging()
import logging
import json
from collections import defaultdict
from glob import iglob

try:
    import pandas as pd
except ImportError:
    sys.stderr.write("python-pandas not installed.")

from settings import ORIG_DIR, TRANS_DIR
from utils import mkdir_p
from utils import set_up_logging
from utils import sqlize_colname

# Module-level logger for the transform step, created at DEBUG level.
log = set_up_logging('transform', loglevel=logging.DEBUG)

# Loaded once at import time. The path is relative, so it is resolved
# against the current working directory of the process.
with open('ref/field_codes.json', 'r') as fc_ref:
    FIELD_CODES = json.load(fc_ref)


def transform_cfo(options):
    if options.get('loglevel', None):
        log.setLevel(options['loglevel'])

    OUT_DIR = os.path.join(TRANS_DIR, 'cfo')
    if not os.path.exists(OUT_DIR):
        mkdir_p(OUT_DIR)

    CFO_ORIG = os.path.join(ORIG_DIR, 'cfo')
import logging
import zipfile
import json
from collections import defaultdict
from glob import glob

from multiprocessing import Pool as ThreadPool

from lxml import etree

from settings import CACHE_DIR, ORIG_DIR, TEST_CACHE_DIR, TEST_ORIG_DIR
from utils import mkdir_p, translate_dir
from utils import set_up_logging
from schema.scrape.sopr_html import ld1_schema, ld2_schema

# Module-level logger for the extract step, created at DEBUG level.
log = set_up_logging('extract', loglevel=logging.DEBUG)

# Single lxml HTML parser instance created at import time.
html_parser = etree.HTMLParser()


def log_result(result):
    if result[0] == 'success':
        src_dir, dest_dir, num_files = result[1:]
        log.info("successfully extracted " +
                 "{src_dir} => {dest_dir} ({num} files)".format(
                     src_dir=src_dir, dest_dir=dest_dir, num=num_files))
    elif result[0] == 'failure':
        loc, e = result[1:]
        log.error("extracting from {loc} failed: {exception}".format(
            loc=loc, exception=str(e)))
    elif result[0] == 'no_update':
import logging
from datetime import datetime
import locale

from pytz import timezone

from utils import set_up_logging

# Module-level logger for the schema helpers, created at DEBUG level.
log = set_up_logging('schema', loglevel=logging.DEBUG)

REPLACE_MAP = {u' ': u'',
               u'\xa0':  u'',
               u'\u200b':  u'',
               u' ': u''}

# Import-time side effect: switches the locale for the whole process.
# setlocale raises locale.Error if 'en_US.UTF-8' is not available on the host.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# pytz timezone object for US Eastern time.
us_eastern = timezone('US/Eastern')

# strftime/strptime-style date patterns.
# NOTE(review): presumably tried in this order when parsing -- confirm at
# the point of use.
DATE_FORMATS = ['%m/%d/%Y',
                '%m/%d/%Y %I:%M:%S %p',
                '%m/%d/%y',
                '%Y/%m/%d',
                '%m-%d-%Y',
                '%m-%d-%y']


def checkbox_boolean(e):
    """Return True when element *e* has a ``checked`` attribute.

    Only the presence of the attribute matters; its value is ignored.
    """
    attributes = e.attrib
    return 'checked' in attributes