def load(self):
    # Flag the cube as running before starting the extraction
    self.cube['run'] = 'run'
    self.mongo['cube'].update({'slug': self.slug}, self.cube)

    self.cube['start_process'] = datetime.now()

    # Wrap the cube's SQL in a subquery, stripping any trailing semicolon
    _sql = self.cube['sql']
    if _sql[-1] == ';':
        _sql = _sql[:-1]
    self.sql = u"""SELECT * FROM ({}) AS CUBE;""".format(_sql)

    self.connection = self.mongo['connection'].find_one(
        {'slug': self.cube['connection']})['connection']

    log_it("CONNECT IN RELATIONAL DATABASE: {}".format(self.slug),
           "bin-mining")
    # SQLite engines are created without the extra connection parameters
    if 'sqlite' in self.connection:
        e = create_engine(self.connection)
    else:
        e = create_engine(self.connection,
                          **conf('openmining')['sql_conn_params'])
    Session = sessionmaker(bind=e)
    session = Session()

    resoverall = session.execute(text(self.sql))
    self.data = resoverall.fetchall()
    self.keys = resoverall.keys()
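# Illustrative only: the MongoDB documents load() expects. The field names
# come from the reads above (cube['sql'], cube['connection'], and the
# 'connection' collection); the slugs and DSN below are hypothetical.
example_connection = {
    'slug': 'my-postgres',
    'connection': 'postgresql://user:pass@localhost/warehouse',
}
example_cube = {
    'slug': 'sales-by-region',
    'sql': 'SELECT region, SUM(total) AS total FROM sales GROUP BY region;',
    'connection': 'my-postgres',  # slug of the connection document above
    'type': 'relational',
}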
def clean(self):
    log_it("CLEAN DATA (JSON) ON RIAK: {}".format(self.slug), "bin-mining")
    self.MyBucket.new(self.slug, data='').store()
    self.MyBucket.new(u'{}-columns'.format(self.slug), data='').store()
    self.MyBucket.new(u'{}-connect'.format(self.slug), data='').store()
    self.MyBucket.new(u'{}-sql'.format(self.slug), data='').store()
def save(self):
    self.clean()

    log_it("SAVE DATA (JSON) ON RIAK: {}".format(self.slug), "bin-mining")
    self.MyBucket.new(self.slug, data=self.pdict,
                      content_type="application/json").store()

    log_it("SAVE COLUMNS ON RIAK: {}".format(self.slug), "bin-mining")
    self.MyBucket.new(u'{}-columns'.format(self.slug),
                      data=json.dumps(self.keys)).store()

    log_it("SAVE CONNECT ON RIAK: {}".format(self.slug), "bin-mining")
    self.MyBucket.new(u'{}-connect'.format(self.slug),
                      data=self.connection).store()

    log_it("SAVE SQL ON RIAK: {}".format(self.slug), "bin-mining")
    self.MyBucket.new(u'{}-sql'.format(self.slug), data=self.sql).store()

    self.cube['status'] = True
    self.cube['lastupdate'] = datetime.now()
    self.cube['run'] = True
    self.mongo['cube'].update({'slug': self.cube['slug']}, self.cube)

    log_it("CLEAN MEMORY: {}".format(self.slug), "bin-mining")
    gc.collect()
def frame(self, data_type=None):
    log_it("LOAD DATA ON DATAWAREHOUSE via {}: {}".format(
        data_type or 'dict', self.slug), "bin-mining")

    if data_type:
        self.df = getattr(pandas, "read_{}".format(data_type))(self.data)
    else:
        self.df = DataFrame(self.data)

    if self.df.empty:
        self.pdict = {}
        log_it('[warning]Empty cube: {}!!'.format(self.cube), "bin-mining")
        return

    try:
        self.df.columns = self.keys
    except AttributeError:
        self._keys(self.df.columns.tolist())

    # If OML is enabled, run the cube's OML script over the frame
    if conf("oml").get("on") and self.cube.get("oml"):
        from oml import RunTime
        self.df.columns = self.keys
        df = RunTime(conf("oml").get("language", "lua"),
                     self.df.to_dict(orient='records'),
                     self.cube.get("oml"),
                     conf("oml").get("class", {"OML": "oml.base.OMLBase"}))
        self.df = DataFrame(df)
        self._keys(self.df.columns.tolist())

    self.df.head()
    self.pdict = map(fix_render, self.df.to_dict(orient='records'))
def frame(self):
    log_it("LOAD DATA ON DATAWAREHOUSE: {}".format(self.slug), "bin-mining")
    self.df = DataFrame(self.data)
    if self.df.empty:
        log_it("[warning]Empty cube: {}!!".format(self.cube), "bin-mining")
        return
    self.df.columns = self.keys
    self.df.head()
    # 'outtype' is the old pandas keyword that was later renamed 'orient'
    self.pdict = map(fix_render, self.df.to_dict(outtype="records"))
def __init__(self, _cube):
    log_it("START: {}".format(_cube['slug']), "bin-mining")
    self.mongo = MongoPlugin(uri=conf("mongodb")["uri"],
                             db=conf("mongodb")["db"],
                             json_mongo=True).get_mongo()
    del _cube['_id']
    self.cube = _cube
    self.slug = self.cube['slug']
def save(self):
    log_it("SAVE DATA (JSON) ON DATA WAREHOUSE: {}".format(self.slug),
           "bin-mining")
    data = {'data': self.pdict, 'columns': self.keys}
    DW = DataWarehouse()
    DW.save(self.slug, data)

    self.cube['status'] = True
    self.cube['lastupdate'] = datetime.now()
    self.cube['run'] = True
    self.mongo['cube'].update({'slug': self.cube['slug']}, self.cube)

    log_it("CLEAN MEMORY: {}".format(self.slug), "bin-mining")
    gc.collect()
def __init__(self, _cube):
    log_it("START: {}".format(_cube["slug"]), "bin-mining")
    self.mongo = MongoPlugin(uri=conf("mongodb")["uri"],
                             db=conf("mongodb")["db"],
                             json_mongo=True).get_mongo()
    MyClient = riak.RiakClient(protocol=conf("riak")["protocol"],
                               http_port=conf("riak")["http_port"],
                               host=conf("riak")["host"])
    self.MyBucket = MyClient.bucket(conf("riak")["bucket"])
    self.MyBucket.enable_search()
    del _cube["_id"]
    self.cube = _cube
    self.slug = self.cube["slug"]
def frame(self, data_type=None):
    log_it("LOAD DATA ON DATAWAREHOUSE via {}: {}".format(
        data_type or 'dict', self.slug), "bin-mining")

    if data_type:
        self.df = getattr(pandas, "read_{}".format(data_type))(self.data)
    else:
        self.df = DataFrame(self.data)

    if self.df.empty:
        self.pdict = {}
        log_it('[warning]Empty cube: {}!!'.format(self.cube), "bin-mining")
        return

    try:
        self.df.columns = self.keys
    except AttributeError:
        self._keys(self.df.columns.tolist())

    self.df.head()
    self.pdict = map(fix_render, self.df.to_dict(orient='records'))
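# Illustrative only: when frame() receives a data_type it dispatches to the
# matching pandas reader, e.g. data_type='csv' -> pandas.read_csv, so
# self.data must be something that reader accepts. The 'url' branch of
# process() feeds it raw text fetched earlier via _data(); the URL below is
# hypothetical.
#
#   cube._data(requests.get('http://example.com/data.csv').text)
#   cube.frame(data_type='csv')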
def load(self): self.cube["run"] = "run" self.mongo["cube"].update({"slug": self.slug}, self.cube) self.cube["start_process"] = datetime.now() _sql = self.cube["sql"] if _sql[-1] == ";": _sql = _sql[:-1] self.sql = u"""SELECT * FROM ({}) AS CUBE;""".format(_sql) self.connection = self.mongo["connection"].find_one({"slug": self.cube["connection"]})["connection"] log_it("CONNECT IN RELATION DATA BASE: {}".format(self.slug), "bin-mining") e = create_engine(self.connection, **conf("openmining")["sql_conn_params"]) Session = sessionmaker(bind=e) session = Session() resoverall = session.execute(text(self.sql)) self.data = resoverall.fetchall() self.keys = resoverall.keys()
def process(_cube):
    log_it("START: {}".format(_cube["slug"]), "bin-mining")
    # Create the Mongo handle before the try block so the except clause can
    # always flag the cube as not running
    mongo = MongoPlugin(uri=conf("mongodb")["uri"],
                        db=conf("mongodb")["db"],
                        json_mongo=True).get_mongo()
    try:
        c = Cube(_cube)
        if _cube.get("type") == "relational":
            c.load()
            c.frame()
            c.save()
        elif _cube.get("type") == "cube_join":
            c.environment(_cube.get("type"))
            cube_join = CubeJoin(_cube)
            c._data(cube_join.none())
            c._keys(cube_join.none().columns.values)
            c.frame()
            c.save()
        elif _cube.get("type") == "url":
            c._data(requests.get(_cube.get("connection")).text)
            c.frame(data_type=_cube.get("url_type"))
            c.save()
    except Exception, e:
        log_it(e, "bin-mining")
        log_it(traceback.format_exc(), "bin-mining")
        _cube["run"] = False
        mongo["cube"].update({"slug": _cube["slug"]}, _cube)
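# Illustrative only: a 'url' cube document that would exercise the branch
# above; the slug and URL are hypothetical, and 'url_type' selects the
# pandas.read_<url_type> reader inside frame().
sample_url_cube = {
    '_id': None,  # Cube.__init__ deletes this key
    'slug': 'remote-csv',
    'type': 'url',
    'connection': 'http://example.com/data.csv',
    'url_type': 'csv',
}
# process(sample_url_cube)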
# -*- coding: utf-8 -*-
from gevent import monkey
monkey.patch_all()

from os import sys, path

import schedule
from time import sleep
from bottle.ext.mongo import MongoPlugin

sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))

from mining.bin.cube import run
from mining.utils import conf, log_it

log_it("START", "bin-scheduler")

onrun = {}
register = []


def job(slug):
    log_it("START JOB: {}".format(slug), "bin-scheduler")
    run(slug)
    log_it("END JOB: {}".format(slug), "bin-scheduler")


def rules(cube, scheduler_type='minutes', scheduler_interval=59,
          dashboard=None):
    if scheduler_type:
        scheduler_type = cube.get('scheduler_type', 'minutes')
    if scheduler_interval:
        scheduler_interval = cube.get('scheduler_interval', 59)
def rules(cube, scheduler_type='minutes', scheduler_interval=59,
          dashboard=None):
    if scheduler_type:
        scheduler_type = cube.get('scheduler_type', 'minutes')
    if scheduler_interval:
        scheduler_interval = cube.get('scheduler_interval', 59)

    log_it("START REGISTER", "bin-scheduler")
    log_it("cube: {}".format(cube.get('slug')), "bin-scheduler")
    log_it("type: {}".format(scheduler_type), "bin-scheduler")
    log_it("interval: {}".format(scheduler_interval), "bin-scheduler")
    log_it("END REGISTER", "bin-scheduler")

    t = {}
    if scheduler_type == 'minutes':
        env = schedule.every(int(scheduler_interval))
        t = env.minutes
    elif scheduler_type == 'hour':
        env = schedule.every()
        t = env.hour
    elif scheduler_type == 'day':
        env = schedule.every()
        t = env.day
    else:
        return False

    jobn = cube.get("slug")
    try:
        t.do(job, cube=cube)
        if dashboard:
            jobn = u"{}-{}".format(cube.get("slug"), dashboard)
        onrun[jobn] = env
        register.append(jobn)
        if cube.get('run') != 'run':
            process.delay(cube)
    except Exception, e:
        if jobn in register:
            register.remove(jobn)
        if onrun.get(jobn):
            del onrun[jobn]
        log_it("ERROR {}: {}".format(cube.get('slug'), e))
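# The scheduler's driving loop is not part of this excerpt; a minimal
# sketch of what it presumably looks like, using the schedule library's
# standard run_pending() API and the sleep already imported above:
def main_loop(interval=1):
    while True:
        schedule.run_pending()  # fire any jobs registered via rules()
        sleep(interval)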
# -*- coding: utf-8 -*-
from os import sys, path

import schedule
from time import sleep
from bottle.ext.mongo import MongoPlugin

sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))

from mining.utils import conf, log_it
from mining.tasks import process

log_it("START", "bin-scheduler")

onrun = {}
register = []


def job(cube):
    log_it("START JOB: {}".format(cube.get('slug')), "bin-scheduler")
    process.delay(cube)
    log_it("END JOB: {}".format(cube.get('slug')), "bin-scheduler")


def rules(cube, scheduler_type='minutes', scheduler_interval=59,
          dashboard=None):
    if scheduler_type:
        scheduler_type = cube.get('scheduler_type', 'minutes')
    if scheduler_interval:
        scheduler_interval = cube.get('scheduler_interval', 59)
def process(_cube):
    log_it("START: {}".format(_cube["slug"]), "bin-mining")
    # Create the Mongo handle before the try block so the except clause can
    # always flag the cube as not running
    mongo = MongoPlugin(uri=conf("mongodb")["uri"],
                        db=conf("mongodb")["db"],
                        json_mongo=True).get_mongo()
    try:
        c = CubeProcess(_cube)
        if _cube.get("type") == "relational":
            c.load()
            c.frame()
            c.save()
        elif _cube.get("type") == "cube_join":
            c.environment(_cube.get("type"))
            cube_join = CubeJoin(_cube)
            c._data(cube_join.none())
            c._keys(cube_join.none().columns.values)
            c.frame()
            c.save()
    except Exception, e:
        log_it(e, "bin-mining")
        log_it(traceback.format_exc(), "bin-mining")
        _cube["run"] = False
        mongo["cube"].update({"slug": _cube["slug"]}, _cube)
    log_it("END: {}".format(_cube["slug"]), "bin-mining")


if __name__ == "__main__":
    run()
log_it("START: {}".format(_cube['slug']), "bin-mining") mongo = MongoPlugin(uri=conf("mongodb")["uri"], db=conf("mongodb")["db"], json_mongo=True).get_mongo() c = CubeProcess(_cube) if _cube.get('type') == 'relational': c.load() c.frame() c.save() elif _cube.get('type') == 'cube_join': c.environment(_cube.get('type')) cube_join = CubeJoin(_cube) c._data(cube_join.none()) c._keys(cube_join.none().columns.values) c.frame() c.save() except Exception, e: log_it(e, "bin-mining") log_it(traceback.format_exc(), "bin-mining") _cube['run'] = False mongo['cube'].update({'slug': _cube['slug']}, _cube) log_it("END: {}".format(_cube['slug']), "bin-mining") if __name__ == "__main__": run()
mongo = MongoPlugin( uri=conf("mongodb")["uri"], db=conf("mongodb")["db"], json_mongo=True).get_mongo() c = CubeProcess(_cube) if _cube.get('type') == 'relational': c.load() c.frame() c.save() elif _cube.get('type') == 'cube_join': c.environment(_cube.get('type')) cube_join = CubeJoin(_cube) c._data(cube_join.none()) c._keys(cube_join.none().columns.values) c.frame() c.save() except Exception, e: log_it(e, "bin-mining") log_it(traceback.format_exc(), "bin-mining") _cube['run'] = False mongo['cube'].update({'slug': _cube['slug']}, _cube) log_it("END: {}".format(_cube['slug']), "bin-mining") if __name__ == "__main__": run()
def rules(cube, scheduler_type='minutes', scheduler_interval=59,
          dashboard=None):
    if scheduler_type:
        scheduler_type = cube.get('scheduler_type', 'minutes')
    if scheduler_interval:
        scheduler_interval = cube.get('scheduler_interval', 59)

    log_it("START REGISTER", "bin-scheduler")
    log_it("cube: {}".format(cube.get('slug')), "bin-scheduler")
    log_it("type: {}".format(scheduler_type), "bin-scheduler")
    log_it("interval: {}".format(scheduler_interval), "bin-scheduler")
    log_it("END REGISTER", "bin-scheduler")

    t = {}
    if scheduler_type == 'minutes':
        env = schedule.every(int(scheduler_interval))
        t = env.minutes
    elif scheduler_type == 'hour':
        env = schedule.every()
        t = env.hour
    elif scheduler_type == 'day':
        env = schedule.every()
        t = env.day

    try:
        t.do(job, slug=cube.get('slug'))
        jobn = cube.get("slug")
        if dashboard:
            jobn = u"{}-{}".format(cube.get("slug"), dashboard)
        onrun[jobn] = env
        register.append(jobn)
    except Exception, e:
        log_it("ERROR {}: {}".format(cube.get('slug'), e))