def db_connect():
    # CONNECTION is a module-level list used as a one-element engine cache.
    if not len(CONNECTION):
        import sqlaload as sl
        sqlalchemy_url = config_get('sqlalchemy.url')
        log.info('Using database: %s', sqlalchemy_url)
        CONNECTION.append(sl.connect(sqlalchemy_url))
    return CONNECTION[0]

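db_connect relies on module-level state that the snippet does not show; a minimal sketch of the assumed surroundings (CONNECTION and log), with config_get presumed to read 'sqlalchemy.url' from the application's configuration:

# assumed module-level context for db_connect (not part of the original snippet)
import logging

CONNECTION = []                      # one-element cache holding the shared engine
log = logging.getLogger(__name__)
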
def etl_engine():
    from sqlaload import connect
    return connect(app.config['ETL_URL'])

import logging
import sys

import sqlaload as sl

import SETTINGS
from extract import extract
from entities import create_entities, update_entities
from load import load
from setup import setup, make_grano
from transform import transform
from network_entities import update_network_entities

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    assert len(sys.argv) == 3, \
        "Usage: %s [ir_source_file] [ap_source_file]" % sys.argv[0]
    ir_source_file = sys.argv[1]
    ap_source_file = sys.argv[2]
    engine = sl.connect(SETTINGS.ETL_URL)
    extract(engine, ir_source_file, ap_source_file)
    update_network_entities(engine, 'network_entities.csv')
    create_entities(engine)
    update_entities(engine, 'entities.csv')
    transform(engine)
    grano = make_grano()
    setup(engine, grano)
    load(engine, grano)

import sys
from datetime import datetime

import sqlaload as sl


def dump_table(engine, table):
    file_name = '%s-%s.csv' % (table.name, datetime.utcnow().strftime("%Y-%m-%d"))
    with open(file_name, 'wb') as fh:
        sl.dump_csv(sl.all(engine, table), fh)


if __name__ == '__main__':
    assert len(sys.argv) == 2, "Usage: %s [engine-url]" % sys.argv[0]
    engine = sl.connect(sys.argv[1])
    table = sl.get_table(engine, 'bund')
    dump_table(engine, table)

from lxml import etree
from pprint import pprint

import sqlaload as sl

engine = sl.connect('sqlite:///budget.db')
table = sl.get_table(engine, 'budget')

year = 2010

FIGURE_FIELDS = {
    'total': ['amount_total', 'amount_reserve_total_total',
              'amount_reserve_figure_total'],
    'comm': ['amount_comm', 'amount_reserve_comm_comm',
             'amount_reserve_figure_comm'],
    'pay': ['amount_pay', 'amount_reserve_pay_pay',
            'amount_reserve_figure_pay'],
}


def xml_dict(file_name, depth=2):
    doc = etree.parse(file_name)

    def _node(node, depth):
        data = {'!name': node.tag, '!e': node}
        if node.tail is not None and len(node.tail.strip()):
            data[':tail'] = node.tail
        if node.text is not None and len(node.text.strip()):
            data[':text'] = node.text
        for a, v in node.attrib.items():
            data['@' + a] = v
        if depth > 0:
            for child in node:
                cd = _node(child, depth - 1)
                if child.tag not in data:
                    data[child.tag] = cd
                elif isinstance(data[child.tag], list):
                    data[child.tag].append(cd)
                else:
                    # The original snippet is cut off here; a plausible completion
                    # collects repeated child tags into a list.
                    data[child.tag] = [data[child.tag], cd]
        return data

    return _node(doc.getroot(), depth)

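A minimal usage sketch for xml_dict; the 'budget.xml' file name below is an illustrative assumption, not taken from the snippet:

# hypothetical invocation; 'budget.xml' stands in for the real source document
data = xml_dict('budget.xml', depth=3)
pprint(data)
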
def db_connect():
    sqlalchemy_url = config_get('sqlalchemy.url')
    log.info('Using database: %s', sqlalchemy_url)
    return sl.connect(sqlalchemy_url)

def etl_engine():
    return sl.connect(app.config.get('ETL_URL'))

def make_engine():
    db_url = os.environ.get('FTS_URL')
    assert db_url is not None, \
        "Set FTS_URL in the environment!"
    return sl.connect(db_url)

def db_connect(): return sl.connect("postgresql:///uk25k")
from datetime import datetime

import sqlaload as sl

# DE_MONTHS (defined elsewhere in the original module) maps German month
# names to the numeric strings expected by parse_date below.


def articles(engine):
    a_table = sl.get_table(engine, 'article')
    for data in sl.find(engine, a_table):
        up = {'number': data['number']}
        slug_parts = data['canonical_url'].split('/')[3:]
        if len(slug_parts) > 3:
            print slug_parts
        if len(slug_parts) == 3:
            up['ressort'], up['subressort'], _ = slug_parts
        elif len(slug_parts) == 2:
            up['ressort'], _ = slug_parts
        up['date'] = parse_date(data['date_text'])
        sl.upsert(engine, a_table, up, ['number'])


def parse_date(date_text):
    for name, num in DE_MONTHS.items():
        date_text = date_text.replace(name, num)
    date_text = date_text.replace(u"\xa0Uhr", "")
    # Work around an invalid source date (September has only 30 days).
    date_text = date_text.replace("31. 09", "30. 09")
    try:
        return datetime.strptime(date_text, "%d. %m %Y, %H:%M")
    except ValueError:
        return datetime.strptime(date_text, "%m/%d/%Y %H:%M %p")


if __name__ == "__main__":
    engine = sl.connect('postgresql://localhost/spon_scrape')
    articles(engine)

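parse_date depends on a DE_MONTHS mapping that the snippet never defines; a plausible sketch, assuming German month names mapped to the numeric strings that the "%d. %m %Y, %H:%M" format expects:

# assumed shape of DE_MONTHS; the real module may differ
DE_MONTHS = {
    u'Januar': '01', u'Februar': '02', u'M\xe4rz': '03', u'April': '04',
    u'Mai': '05', u'Juni': '06', u'Juli': '07', u'August': '08',
    u'September': '09', u'Oktober': '10', u'November': '11', u'Dezember': '12',
}
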