def test_schema(self):
    """Schema generation for sqlite: datetime column typing and PRIMARY KEY."""
    frame = tm.makeTimeDataFrame()
    create_sql = sql.get_schema(frame, 'test', 'sqlite')
    # Column 'A' holds datetimes; its DDL line must type it as DATETIME.
    for line in create_sql.splitlines():
        tokens = line.split(' ')
        if len(tokens) == 2 and tokens[0] == 'A':
            self.assert_(tokens[1] == 'DATETIME')

    # With explicit keys, the generated DDL must declare a composite PK.
    frame = tm.makeTimeDataFrame()
    create_sql = sql.get_schema(frame, 'test', 'sqlite', keys=['A', 'B'],)
    self.assert_('PRIMARY KEY (A,B)' in create_sql)
    cur = self.db.cursor()
    cur.execute(create_sql)
def test_schema(self):
    """Schema generation for MySQL: datetime column typing and PRIMARY KEY."""
    _skip_if_no_MySQLdb()
    frame = tm.makeTimeDataFrame()
    create_sql = sql.get_schema(frame, 'test', 'mysql')
    # Column 'A' holds datetimes; its DDL line must type it as DATETIME.
    for line in create_sql.splitlines():
        tokens = line.split(' ')
        if len(tokens) == 2 and tokens[0] == 'A':
            self.assert_(tokens[1] == 'DATETIME')

    # With explicit keys, the generated DDL must declare a composite PK,
    # and the statement must actually execute against the live server.
    frame = tm.makeTimeDataFrame()
    drop_sql = "DROP TABLE IF EXISTS test"
    create_sql = sql.get_schema(frame, 'test', 'mysql', keys=['A', 'B'],)
    self.assert_('PRIMARY KEY (A,B)' in create_sql)
    cur = self.db.cursor()
    cur.execute(drop_sql)
    cur.execute(create_sql)
def create_ingestion_table(data, engine, schema: str, table: str, **kwargs):
    """Drop and recreate ``schema.table`` with a DDL derived from *data*.

    Uses sqlalchemy (via pandas' get_schema) to map dataframe dtypes to
    column types. When the frame's index levels are named, they become the
    table's primary key; a lone unnamed index (the default RangeIndex pandas
    creates) yields a table with no PK.
    """
    engine.delete_table(schema, table)

    sa_engine = _get_sqlalchemy_engine(engine)
    if data.index.names == [None]:
        # Unnamed pandas-generated index: no primary key columns.
        ddl = get_schema(data, name=table, con=sa_engine)
    else:
        # Promote the index levels to regular columns and declare them as keys.
        ddl = get_schema(
            data.reset_index(),
            name=table,
            keys=data.index.names,
            con=sa_engine,
        )
    engine.run_sql_in(schema, ddl)
def test_get_pandas_schema(conn_str, sample_dataframe):
    """
    Test the behaviour of get_schema in Pandas.

    It is not accurate as per schema in the table
    """
    from pandas.io.sql import get_schema
    # Log the CREATE TABLE statement pandas derives for the fixture frame.
    with DBConnector(conn_str) as conn:
        logging.info(get_schema(sample_dataframe, "userdata", con=conn.engine))
def _make_table(self, dataframe, schema_name, table_name, cur, dtype=None):
    """ Drop & create table """
    drop_table_sql = 'DROP TABLE IF EXISTS {}.{};'.format(
        schema_name, table_name)

    # get_schema emits an unqualified, quoted table name; rewrite it to the
    # fully qualified "schema"."table" form before executing.
    create_sql = get_schema(dataframe, table_name, dtype=dtype)
    qualified = '"{}"."{}"'.format(schema_name, table_name)
    create_sql = create_sql.replace(r'"{}"'.format(table_name), qualified)

    print(drop_table_sql)
    print(create_sql)
    cur.execute(drop_table_sql)
    cur.execute(create_sql)
def test_execute(self):
    """Round-trip one row through an sqlite table created via get_schema."""
    frame = tm.makeTimeDataFrame()
    create_sql = sql.get_schema(frame, 'test', 'sqlite')
    cur = self.db.cursor()
    cur.execute(create_sql)

    # Insert the first row with qmark-style parameters, then read it back.
    ins = "INSERT INTO test VALUES (?, ?, ?, ?)"
    first_row = frame.ix[0]
    sql.execute(ins, self.db, params=tuple(first_row))
    self.db.commit()

    result = sql.read_frame("select * from test", self.db)
    # Restore the datetime index lost on the SQL round trip before comparing.
    result.index = frame.index[:1]
    tm.assert_frame_equal(result, frame[:1])
def make_schema(df, name, database_settings): user = database_settings[USER] pw = os.environ.get(database_settings[PW_ENV_VAR], None) host = database_settings[HOST] dbname = database_settings[DB_NAME] conn = psycopg2.connect("dbname='{}' user='******' host='{}' password='******'".format(dbname, user, host, pw)) cur = conn.cursor() sql_str = get_schema(df, name).replace(r'"{}"'.format(name), name) try: cur.execute(sql_str) conn.commit() except psycopg2.ProgrammingError: print "[WARNING] Table already exists..." finally: conn.close()
def test_execute(self):
    """Round-trip one row through a MySQL table created via get_schema."""
    _skip_if_no_MySQLdb()
    frame = tm.makeTimeDataFrame()
    cur = self.db.cursor()
    cur.execute("DROP TABLE IF EXISTS test")
    cur.execute(sql.get_schema(frame, 'test', 'mysql'))

    # Insert the first row with format-style parameters, then read it back.
    ins = "INSERT INTO test VALUES (%s, %s, %s, %s)"
    first_row = frame.ix[0]
    sql.execute(ins, self.db, params=tuple(first_row))
    self.db.commit()

    result = sql.read_frame("select * from test", self.db)
    # Restore the datetime index lost on the SQL round trip before comparing.
    result.index = frame.index[:1]
    tm.assert_frame_equal(result, frame[:1])
def test_write_row_by_row(self):
    """Insert a frame one row at a time via formatted SQL, then read it back."""
    frame = tm.makeTimeDataFrame()
    frame.ix[0, 0] = np.nan  # exercise NULL handling on the first cell
    cur = self.db.cursor()
    cur.execute(sql.get_schema(frame, 'test', 'sqlite'))
    cur = self.db.cursor()

    ins = "INSERT INTO test VALUES (%s, %s, %s, %s)"
    for _, row in frame.iterrows():
        sql.tquery(format_query(ins, *row), cur=cur)
    self.db.commit()

    result = sql.read_frame("select * from test", con=self.db)
    # Restore the datetime index lost on the SQL round trip before comparing.
    result.index = frame.index
    tm.assert_frame_equal(result, frame)
def test_get_schema(load_data, conn_str):
    """Log the CREATE TABLE DDL pandas derives for the fixture frame."""
    with DBConnector(conn_str) as conn:
        ddl = get_schema(load_data, "kaggle_data", con=conn.engine)
        logging.info(ddl)
def _create_table(self):
    """Create the SQL table whose schema mirrors ``self.dataframe``."""
    ddl = get_schema(self.dataframe, self.sql_table)
    # Connection is scoped to this statement; the engine pools it afterwards.
    with self.engine.connect() as conn:
        conn.execute(ddl)
def load_file(self, fname, table=None, sep="\t", bins=False, indexes=None):
    """
    use some of the machinery in pandas to load a file into a table

    Parameters
    ----------
    fname : str
        filename or filehandle to load
    table : str
        table to load the file to
    sep : str
        CSV separator
    bins : bool
        add a "bin" column for efficient spatial queries.
    indexes : list[str]
        list of columns to index
    """
    # Map common genomics column aliases onto the UCSC-style names used below.
    convs = {"#chr": "chrom", "start": "txStart", "end": "txEnd",
             "chr": "chrom", "pos": "start", "POS": "start",
             "chromStart": "txStart", "chromEnd": "txEnd"}
    if table is None:
        import os.path as op
        # Derive the table name from the file name, e.g. "genes.hg19" -> "genes_hg19".
        table = op.basename(op.splitext(fname)[0]).replace(".", "_")
    print("writing to:", table, file=sys.stderr)
    from pandas.io import sql
    import pandas as pa
    from toolshed import nopen

    needs_name = False
    # Stream the file in 100k-row chunks so arbitrarily large files fit in memory.
    for i, chunk in enumerate(pa.read_csv(nopen(fname), iterator=True,
                                          chunksize=100000, sep=sep,
                                          encoding="latin-1")):
        chunk.columns = [convs.get(k, k) for k in chunk.columns]
        if not "name" in chunk.columns:
            # Synthesize a "name" column (chrom if present, else first column).
            needs_name = True
            chunk['name'] = chunk.get('chrom', chunk[chunk.columns[0]])
        if bins:
            chunk['bin'] = 1  # placeholder; real bin computed per row below
        if i == 0 and not table in self.tables:
            # First chunk and no existing table: create it from the chunk dtypes.
            flavor = self.url.split(":")[0]
            schema = sql.get_schema(chunk, table, flavor)
            print(schema)
            self.engine.execute(schema)
        elif i == 0:
            # BUG FIX: this function mixed print() calls with Python 2
            # "print >>sys.stderr" statements, which is a syntax error under
            # both interpreters; all prints now use the py3 function form.
            print("""adding to existing table, you may want to drop first""",
                  file=sys.stderr)

        tbl = getattr(self, table)._table
        cols = chunk.columns
        data = list(dict(zip(cols, x)) for x in chunk.values)
        if needs_name:
            for d in data:
                d['name'] = "%s:%s" % (d.get("chrom"),
                                       d.get("txStart", d.get("chromStart")))
        if bins:
            for d in data:
                # Store the smallest UCSC bin that contains the interval.
                d['bin'] = max(Genome.bins(int(d["txStart"]), int(d["txEnd"])))
        self.engine.execute(tbl.insert(), data)
        self.session.commit()
        if i > 0:
            print("writing row:", i * 100000, file=sys.stderr)

    # Index the start coordinate (with chrom when available) for range queries.
    if "txStart" in chunk.columns:
        if "chrom" in chunk.columns:
            ssql = """CREATE INDEX "%s.chrom_txStart" ON "%s" (chrom, txStart)""" % (table, table)
        else:
            ssql = """CREATE INDEX "%s.txStart" ON "%s" (txStart)""" % (table, table)
        self.engine.execute(ssql)
    for index in (indexes or []):
        ssql = """CREATE INDEX "%s.%s" ON "%s" (%s)""" % (table, index, table, index)
        self.engine.execute(ssql)
    if bins:
        ssql = """CREATE INDEX "%s.chrom_bin" ON "%s" (chrom, bin)""" % (table, table)
        self.engine.execute(ssql)
    self.session.commit()
"I_porc_len", "II_porc_len", "III_porc_len", "IV_porc_len", "I_similar_len", "II_similar_len", "III_similar_len", "IV_similar_len", "I_total_mat", "II_total_mat", "III_total_mat", "IV_total_mat", "I_porc_mat", "II_porc_mat", "III_porc_mat", "IV_porc_mat", "I_similar_mat", "II_similar_mat", "III_similar_mat", "IV_similar_mat" ] df = pd.read_excel("/data/raw/planea_sec_2019.xlsx", skiprows=4, header=None, engine="openpyxl") df.columns = colnames df.ent = df.ent.map(int).map(lambda x: str(x).zfill(2)) df.to_csv("/data/clean/planea.csv",sep="|",index=False) schema = get_schema(df,'planea') with open("/data/clean/planea.sql", "w") as text_file: text_file.write(schema)
class MrData(object):
    """MrData: basic utility for creating and maintaining sqlite files of bb
    data, and loading them into pandas dataframes or panels.

    Multiple tickers / single fields or single tickers / multiple fields are
    stored in single tables. If multiple tickers and fields are required
    (e.g. OHLC) then each ticker will be given its own table.
    """

    def __init__(self, datadir):
        # Root directory that holds all sqlite data files.
        self.DATA = datadir
        # Bloomberg ticker asset-class suffixes.
        self.suffix_re = re.compile(r'(_)(Comdty|Index|Equity|Curncy)$')
        # Intraday periodicities, e.g. "5T" (minutes) or "1H" (hours).
        self.intraday_re = re.compile(r'^(\d+)([HT])$')
        # Monthly/quarterly/annual offset aliases, e.g. "BMS-JAN".
        self.monthly_re = re.compile(r'^B?([MQA])S?(-[A-Z]{3})?$')
        #self.qtr_re = re.compile('^B?QS?(-[A-Z]3})$')

    def create(self, datafile, start, tickers, fields=['PX_LAST'], end=None,
               period=None):
        """Create new datafile based on the given tickers, fields, and start
        date.

        Optionally pass periodicity (defaults to 'DAILY') and an end date
        (defaults to today). If an intraday period is passed the fields will
        be set to ['TRADE'].
        """
        # Refuse to clobber an existing datafile; IOError means it's absent.
        try:
            open(self.DATA + datafile, 'r')
            raise Exception(self.DATA + datafile +
                            ' already exists, please use update')
        except IOError:
            pass
        # BUG FIX: original read "if type(period) is int:df fields = ...",
        # a syntax error caused by a stray "df" token after the colon.
        if type(period) is int:
            # An integer period means intraday bars (minutes); only the
            # 'TRADE' field is meaningful for those requests.
            fields = ['TRADE']
        df = bbget(tickers, fields, start, period=period)
        dims = df.shape
        conn = sqlite3.connect(self.DATA + datafile)
        cur = conn.cursor()
        # Sqlite doesn't have a seperate TIMESTAMP type, but instead lets you
        # store them as TEXT, REAL (julian date), or INTEGER (unix epoch) and
        # provides converter functions. Pandas uses TEXT cos everything goes
        # into the insert statement via %s. Could slim things down by using
        # REAL but doesn't seem necessary.
        if len(dims) == 3:
            # Panel result: one table per item (ticker), keyed on timestamp.
            for ii in df.items:
                safe_table = ii.replace(' ', '_').strip()
                iframe = df[ii].dropna(how='all')
                iframe['timestamp'] = iframe.index
                cur.execute(sql.get_schema(iframe, safe_table, 'sqlite',
                                           keys='timestamp'))
                safe_cols = [s.replace(' ', '_').strip()
                             for s in iframe.columns]
                sql._write_sqlite(iframe, safe_table, safe_cols, cur)
        else:
            # Only going to be a dataframe if there is a single field, which
            # becomes the name of the table.
            df['timestamp'] = df.index
            cur.execute(sql.get_schema(df, fields[0], 'sqlite',
                                       keys='timestamp'))
            cur.execute('CREATE INDEX ts_idx ON %s (timestamp);' % fields[0])
            safe_cols = [s.replace(' ', '_').strip() for s in df.columns]
            sql._write_sqlite(df.dropna(how='all'), fields[0], safe_cols, cur)
        conn.commit()
        conn.close()
def load_file(self, fname, table=None, sep="\t", bins=False, indexes=None):
    """
    use some of the machinery in pandas to load a file into a table

    Parameters
    ----------
    fname : str
        filename or filehandle to load
    table : str
        table to load the file to
    sep : str
        CSV separator
    bins : bool
        add a "bin" column for efficient spatial queries.
    indexes : list[str]
        list of columns to index
    """
    # Map common genomics column aliases onto the UCSC-style names used below.
    convs = {"#chr": "chrom", "start": "txStart", "end": "txEnd",
             "chr": "chrom", "pos": "start", "POS": "start",
             "chromStart": "txStart", "chromEnd": "txEnd"}
    if table is None:
        import os.path as op
        # Derive the table name from the file name, e.g. "genes.hg19" -> "genes_hg19".
        table = op.basename(op.splitext(fname)[0]).replace(".", "_")
    # NOTE(review): prints a tuple to stdout — likely a 2to3 artifact of
    # "print >>sys.stderr, ..."; confirm whether stderr output was intended.
    print(("writing to:", table))
    from pandas.io import sql
    import pandas as pa
    from toolshed import nopen

    needs_name = False
    # Stream the file in 100k-row chunks so arbitrarily large files fit in memory.
    for i, chunk in enumerate(pa.read_csv(nopen(fname), iterator=True,
                                          chunksize=100000, sep=sep,
                                          encoding="latin-1")):
        chunk.columns = [convs.get(k, k) for k in chunk.columns]
        if not "name" in chunk.columns:
            # Synthesize a "name" column (chrom if present, else first column).
            needs_name = True
            chunk['name'] = chunk.get('chrom', chunk[chunk.columns[0]])
        if bins:
            chunk['bin'] = 1  # placeholder; real bin computed per row below
        if i == 0 and not table in self.tables:
            # First chunk and no existing table: create it from the chunk dtypes.
            flavor = self.url.split(":")[0]
            schema = sql.get_schema(chunk, table, flavor)
            print(schema)
            self.engine.execute(schema)
        elif i == 0:
            print("""adding to existing table, you may want to drop first""",
                  file=sys.stderr)

        tbl = getattr(self, table)._table
        cols = chunk.columns
        data = list(dict(list(zip(cols, x))) for x in chunk.values)
        if needs_name:
            for d in data:
                d['name'] = "%s:%s" % (d.get("chrom"),
                                       d.get("txStart", d.get("chromStart")))
        if bins:
            for d in data:
                # Store the smallest UCSC bin that contains the interval.
                d['bin'] = max(Genome.bins(int(d["txStart"]), int(d["txEnd"])))
        self.engine.execute(tbl.insert(), data)
        self.session.commit()
        if i > 0:
            print("writing row:", i * 100000, file=sys.stderr)

    # Index the start coordinate (with chrom when available) for range queries.
    if "txStart" in chunk.columns:
        if "chrom" in chunk.columns:
            ssql = """CREATE INDEX "%s.chrom_txStart" ON "%s" (chrom, txStart)""" % (table, table)
        else:
            ssql = """CREATE INDEX "%s.txStart" ON "%s" (txStart)""" % (table, table)
        self.engine.execute(ssql)
    for index in (indexes or []):
        ssql = """CREATE INDEX "%s.%s" ON "%s" (%s)""" % (table, index, table, index)
        self.engine.execute(ssql)
    if bins:
        ssql = """CREATE INDEX "%s.chrom_bin" ON "%s" (chrom, bin)""" % (table, table)
        self.engine.execute(ssql)
    self.session.commit()
#!/usr/bin/env python3
"""Download, clean, and emit CSV + SQL schema for INPI indigenous-language data."""
import pandas as pd
import urllib.request
from pandas.io.sql import get_schema

url = 'http://www.inpi.gob.mx/cedulas/poblacion-indigena-municipal-2010.xls'
urllib.request.urlretrieve(url, '/data/raw/indigenous_language.xlsx')

df = pd.read_excel("/data/raw/indigenous_language.xlsx", skiprows=0,
                   sheet_name="COMPARATIVO 2010")

# Keep only real municipality rows: drop the national and state totals and
# any row with no municipality code.
totals = ["Estados Unidos Mexicanos", "Total Estatal"]
keep_mask = (~df.NOMMUN.isin(totals)) & (~df.MPO.isnull())
df = df.loc[keep_mask]

# Zero-pad municipality codes to three digits (e.g. 7 -> "007").
df.MPO = df.MPO.map(int).map(lambda x: str(x).zfill(3))

df.to_csv("/data/clean/indigenous_language.csv", sep="|", index=False)

schema = get_schema(df, 'indigenous_language')
with open("/data/sql/indigenous_language.sql", "w") as text_file:
    text_file.write(schema)