Example #1
    def test_schema(self):
        frame = tm.makeTimeDataFrame()
        create_sql = sql.get_schema(frame, 'test', 'sqlite')
        lines = create_sql.splitlines()
        for l in lines:
            tokens = l.split(' ')
            if len(tokens) == 2 and tokens[0] == 'A':
                self.assert_(tokens[1] == 'DATETIME')

        frame = tm.makeTimeDataFrame()
        create_sql = sql.get_schema(frame, 'test', 'sqlite', keys=['A', 'B'],)
        lines = create_sql.splitlines()
        self.assert_('PRIMARY KEY (A,B)' in create_sql)
        cur = self.db.cursor()
        cur.execute(create_sql)
Example #2
    def test_schema(self):
        _skip_if_no_MySQLdb()
        frame = tm.makeTimeDataFrame()
        create_sql = sql.get_schema(frame, 'test', 'mysql')
        lines = create_sql.splitlines()
        for l in lines:
            tokens = l.split(' ')
            if len(tokens) == 2 and tokens[0] == 'A':
                self.assert_(tokens[1] == 'DATETIME')

        frame = tm.makeTimeDataFrame()
        drop_sql = "DROP TABLE IF EXISTS test"
        create_sql = sql.get_schema(frame, 'test', 'mysql', keys=['A', 'B'],)
        lines = create_sql.splitlines()
        self.assert_('PRIMARY KEY (A,B)' in create_sql)
        cur = self.db.cursor()
        cur.execute(drop_sql)
        cur.execute(create_sql)
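
Note: both tests above pass a string flavor ('sqlite' / 'mysql') to get_schema. Later pandas releases removed that argument, and the dialect of the generated DDL now comes from a con= connectable instead. A minimal sketch of the equivalent call, assuming a modern pandas with SQLAlchemy installed:

import pandas as pd
from pandas.io.sql import get_schema
from sqlalchemy import create_engine

frame = pd.DataFrame({"A": pd.date_range("2024-01-01", periods=3),
                      "B": [1.0, 2.0, 3.0]})
engine = create_engine("sqlite://")  # in-memory SQLite; the engine's dialect shapes the DDL
# keys= still emits a composite primary key, as the assertions above expect.
print(get_schema(frame, "test", keys=["A", "B"], con=engine))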
Example #3
    def create_ingestion_table(data, engine, schema: str, table: str,
                               **kwargs):
        engine.delete_table(schema, table)
        # Use SQLAlchemy's engine to convert types and create a DDL statement for the table.

        # If there's an unnamed index (created by Pandas), we don't add PKs to the table.
        if data.index.names == [None]:
            ddl = get_schema(data,
                             name=table,
                             con=_get_sqlalchemy_engine(engine))
        else:
            ddl = get_schema(
                data.reset_index(),
                name=table,
                keys=data.index.names,
                con=_get_sqlalchemy_engine(engine),
            )
        engine.run_sql_in(schema, ddl)
Example #4
def test_get_pandas_schema(conn_str, sample_dataframe):
    """
    Test the behaviour of get_schema in pandas.
    The DDL it generates does not necessarily match the table's actual schema in the database.
    """
    from pandas.io.sql import get_schema

    with DBConnector(conn_str) as conn:
        logging.info(get_schema(sample_dataframe, "userdata", con=conn.engine))
Example #5
    def _make_table(self, dataframe, schema_name, table_name, cur, dtype=None):
        """ Drop & create table """
        drop_table_sql = 'DROP TABLE IF EXISTS {}.{};'.format(
            schema_name, table_name)

        sql_str = get_schema(dataframe, table_name, dtype=dtype)
        tmp_name = '"{}"."{}"'.format(schema_name, table_name)
        sql_str = sql_str.replace(r'"{}"'.format(table_name), tmp_name)

        print(drop_table_sql)
        print(sql_str)
        cur.execute(drop_table_sql)
        cur.execute(sql_str)
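
Note: the string replacement above works around older get_schema signatures that had no notion of a schema qualifier. Newer pandas versions accept a schema= keyword when a SQLAlchemy connectable is supplied, which avoids the string surgery; a sketch under that assumption:

import pandas as pd
from pandas.io.sql import get_schema
from sqlalchemy import create_engine

df = pd.DataFrame({"x": [1.5, 2.5]})
engine = create_engine("sqlite://")  # any SQLAlchemy engine; its dialect shapes the DDL
# The schema name is emitted as a qualifier on the table name in the CREATE TABLE statement.
print(get_schema(df, "my_table", con=engine, schema="staging"))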
Example #6
    def test_execute(self):
        frame = tm.makeTimeDataFrame()
        create_sql = sql.get_schema(frame, 'test', 'sqlite')
        cur = self.db.cursor()
        cur.execute(create_sql)
        ins = "INSERT INTO test VALUES (?, ?, ?, ?)"

        row = frame.ix[0]
        sql.execute(ins, self.db, params=tuple(row))
        self.db.commit()

        result = sql.read_frame("select * from test", self.db)
        result.index = frame.index[:1]
        tm.assert_frame_equal(result, frame[:1])
Example #7
def make_schema(df, name, database_settings):
    user = database_settings[USER]
    pw = os.environ.get(database_settings[PW_ENV_VAR], None)
    host = database_settings[HOST]
    dbname = database_settings[DB_NAME]
    conn = psycopg2.connect("dbname='{}' user='{}' host='{}' password='{}'".format(dbname, user, host, pw))
    cur = conn.cursor()
    sql_str = get_schema(df, name).replace(r'"{}"'.format(name), name)
    try:
        cur.execute(sql_str)
        conn.commit()
    except psycopg2.ProgrammingError:
        print "[WARNING] Table already exists..."
    finally:
        conn.close()
Example #8
    def test_execute(self):
        _skip_if_no_MySQLdb()
        frame = tm.makeTimeDataFrame()
        drop_sql = "DROP TABLE IF EXISTS test"
        create_sql = sql.get_schema(frame, 'test', 'mysql')
        cur = self.db.cursor()
        cur.execute(drop_sql)
        cur.execute(create_sql)
        ins = "INSERT INTO test VALUES (%s, %s, %s, %s)"

        row = frame.ix[0]
        sql.execute(ins, self.db, params=tuple(row))
        self.db.commit()

        result = sql.read_frame("select * from test", self.db)
        result.index = frame.index[:1]
        tm.assert_frame_equal(result, frame[:1])
Example #10
    def test_write_row_by_row(self):
        frame = tm.makeTimeDataFrame()
        frame.ix[0, 0] = np.nan
        create_sql = sql.get_schema(frame, 'test', 'sqlite')
        cur = self.db.cursor()
        cur.execute(create_sql)

        cur = self.db.cursor()

        ins = "INSERT INTO test VALUES (%s, %s, %s, %s)"
        for idx, row in frame.iterrows():
            fmt_sql = format_query(ins, *row)
            sql.tquery(fmt_sql, cur=cur)

        self.db.commit()

        result = sql.read_frame("select * from test", con=self.db)
        result.index = frame.index
        tm.assert_frame_equal(result, frame)
Example #11
def test_get_schema(load_data, conn_str):
    with DBConnector(conn_str) as conn:
        logging.info(get_schema(load_data, "kaggle_data", con=conn.engine))
Example #12
    def _create_table(self):
        """Creates the SQL table using the schema of the current DataFrame object."""
        dataframe_schema = get_schema(self.dataframe, self.sql_table)
        with self.engine.connect() as conn:
            conn.execute(dataframe_schema)
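
Note: passing a raw DDL string to conn.execute() as above works under SQLAlchemy 1.x, but SQLAlchemy 2.x rejects plain strings. A sketch of the same pattern for 2.x, wrapping the statement in text():

import pandas as pd
from pandas.io.sql import get_schema
from sqlalchemy import create_engine, text

engine = create_engine("sqlite://")
df = pd.DataFrame({"x": [1, 2]})
with engine.begin() as conn:  # begin() commits the DDL when the block exits
    conn.execute(text(get_schema(df, "demo", con=engine)))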
Example #13
File: __init__.py  Project: gilesc/cruzdb
    def load_file(self, fname, table=None, sep="\t", bins=False, indexes=None):
        """
        use some of the machinery in pandas to load a file into a table

        Parameters
        ----------

        fname : str
            filename or filehandle to load

        table : str
            table to load the file to

        sep : str
            CSV separator

        bins : bool
            add a "bin" column for efficient spatial queries.

        indexes : list[str]
            list of columns to index

        """
        convs = {"#chr": "chrom", "start": "txStart", "end": "txEnd", "chr":
                "chrom", "pos": "start", "POS": "start", "chromStart": "txStart",
                "chromEnd": "txEnd"}
        if table is None:
            import os.path as op
            table = op.basename(op.splitext(fname)[0]).replace(".", "_")
            print("writing to:", table, file=sys.stderr)

        from pandas.io import sql
        import pandas as pa
        from toolshed import nopen

        needs_name = False
        for i, chunk in enumerate(pa.read_csv(nopen(fname), iterator=True,
            chunksize=100000, sep=sep, encoding="latin-1")):
            chunk.columns = [convs.get(k, k) for k in chunk.columns]
            if not "name" in chunk.columns:
                needs_name = True
                chunk['name'] = chunk.get('chrom', chunk[chunk.columns[0]])
            if bins:
                chunk['bin'] = 1
            if i == 0 and not table in self.tables:
                flavor = self.url.split(":")[0]
                schema = sql.get_schema(chunk, table, flavor)
                print(schema)
                self.engine.execute(schema)
            elif i == 0:
                print("adding to existing table, you may want to drop first", file=sys.stderr)

            tbl = getattr(self, table)._table
            cols = chunk.columns
            data = list(dict(zip(cols, x)) for x in chunk.values)
            if needs_name:
                for d in data:
                    d['name'] = "%s:%s" % (d.get("chrom"), d.get("txStart", d.get("chromStart")))
            if bins:
                for d in data:
                    d['bin'] = max(Genome.bins(int(d["txStart"]), int(d["txEnd"])))
            self.engine.execute(tbl.insert(), data)
            self.session.commit()
            if i > 0:
                print >>sys.stderr, "writing row:", i * 100000
        if "txStart" in chunk.columns:
            if "chrom" in chunk.columns:
                ssql = """CREATE INDEX "%s.chrom_txStart" ON "%s" (chrom, txStart)""" % (table, table)
            else:
                ssql = """CREATE INDEX "%s.txStart" ON "%s" (txStart)""" % (table, table)

            self.engine.execute(ssql)
        for index in (indexes or []):
            ssql = """CREATE INDEX "%s.%s" ON "%s" (%s)""" % (table,
                                index, table, index)
            self.engine.execute(ssql)

        if bins:
            ssql = """CREATE INDEX "%s.chrom_bin" ON "%s" (chrom, bin)""" % (table, table)
            self.engine.execute(ssql)

        self.session.commit()
Example #14
File: planea.py  Project: ollin18/planea
        "I_porc_len",
        "II_porc_len",
        "III_porc_len",
        "IV_porc_len",
        "I_similar_len",
        "II_similar_len",
        "III_similar_len",
        "IV_similar_len",
        "I_total_mat",
        "II_total_mat",
        "III_total_mat",
        "IV_total_mat",
        "I_porc_mat",
        "II_porc_mat",
        "III_porc_mat",
        "IV_porc_mat",
        "I_similar_mat",
        "II_similar_mat",
        "III_similar_mat",
        "IV_similar_mat"
        ]

df = pd.read_excel("/data/raw/planea_sec_2019.xlsx", skiprows=4, header=None, engine="openpyxl")
df.columns = colnames
df.ent = df.ent.map(int).map(lambda x: str(x).zfill(2))
df.to_csv("/data/clean/planea.csv",sep="|",index=False)

schema = get_schema(df,'planea')
with open("/data/clean/planea.sql", "w") as text_file:
    text_file.write(schema)
Example #15
class MrData(object):
	"""MrData: basic utility for creating and maintaining sqlite files of bb data, and 
	loading them into pandas dataframes or panels. Multiple tickers / single fields or
	single tickers / multiple fields are stored in single tables. If multiple tickers
	and fields are required (e.g. OHLC) then each ticker will be given its own table.
	
	"""
	def __init__(self, datadir):
		self.DATA = datadir
		self.suffix_re = re.compile('(_)(Comdty|Index|Equity|Curncy)$')
		
		self.intraday_re = re.compile(r'^(\d+)([HT])$')
		self.monthly_re = re.compile(r'^B?([MQA])S?(-[A-Z]{3})?$')
		#self.qtr_re = re.compile('^B?QS?(-[A-Z]3})$')
	

	def create(self, datafile, start, tickers, fields=['PX_LAST'], end=None, period=None):
		"""Create new datafile based on the given tickers, fields, and start date. Optionally pass
		periodicity (defaults to 'DAILY'), an end date (defaults to today). If an intraday period
		is passed the fields will be set to ['TRADE']."""
		try:
			open(self.DATA + datafile, 'r')
			raise Exception(self.DATA + datafile + ' already exists, please use update')
		except IOError:
			pass
		
		if type(period) is int:
			fields = ['TRADE']
			
		# Note if period is an integer (minutes) fields needs to be ['TRADE'] or something
		dnow = datetime.now()
		df = bbget(tickers, fields, start, period=period)
		dims = df.shape
		
		conn = sqlite3.connect(self.DATA + datafile)
		cur = conn.cursor()
		
		# Sqlite doesn't have a separate TIMESTAMP type, but instead lets you store them as TEXT,
		# REAL (julian date), or INTEGER (unix epoch) and provides converter functions. Pandas
		# uses TEXT because everything goes into the insert statement via %s. Could slim things
		# down by using REAL, but that doesn't seem necessary.
		if len(dims) == 3:
			for ii in df.items:
				safe_table = ii.replace(' ', '_').strip()
				iframe = df[ii].dropna(how='all')
				iframe['timestamp'] = iframe.index
	
				cur.execute(sql.get_schema(iframe, safe_table, 'sqlite', keys='timestamp'))
				safe_cols = [s.replace(' ', '_').strip() for s in iframe.columns]
				sql._write_sqlite(iframe, safe_table, safe_cols, cur)
				
		else:
			# Only going to be a dataframe if there is a single field, which becomes
			# the name of the table
			df['timestamp'] = df.index
			
			cur.execute(sql.get_schema(df, fields[0], 'sqlite', keys='timestamp'))
			cur.execute('CREATE INDEX ts_idx ON %s (timestamp);' % fields[0])

			safe_cols = [s.replace(' ', '_').strip() for s in df.columns]
			sql._write_sqlite(df.dropna(how='all'), fields[0], safe_cols, cur)

				
		conn.commit()
		conn.close()
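
Note: this example leans on sql._write_sqlite, a private pandas helper that later releases removed. A sketch of the public-API route that covers the same create-and-insert step, assuming a modern pandas:

import sqlite3
import pandas as pd

df = pd.DataFrame({"PX_LAST": [100.0, 101.5]},
                  index=pd.date_range("2024-01-01", periods=2))
conn = sqlite3.connect(":memory:")
# to_sql creates the table (replacing any existing one) and inserts the rows,
# writing the index out as a "timestamp" column.
df.to_sql("PX_LAST", conn, index_label="timestamp", if_exists="replace")
conn.commit()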
Example #16
    def load_file(self, fname, table=None, sep="\t", bins=False, indexes=None):
        """
        use some of the machinery in pandas to load a file into a table

        Parameters
        ----------

        fname : str
            filename or filehandle to load

        table : str
            table to load the file to

        sep : str
            CSV separator

        bins : bool
            add a "bin" column for efficient spatial queries.

        indexes : list[str]
            list of columns to index

        """
        convs = {"#chr": "chrom", "start": "txStart", "end": "txEnd", "chr":
                "chrom", "pos": "start", "POS": "start", "chromStart": "txStart",
                "chromEnd": "txEnd"}
        if table is None:
            import os.path as op
            table = op.basename(op.splitext(fname)[0]).replace(".", "_")
            print(("writing to:", table))

        from pandas.io import sql
        import pandas as pa
        from toolshed import nopen

        needs_name = False
        for i, chunk in enumerate(pa.read_csv(nopen(fname), iterator=True,
            chunksize=100000, sep=sep, encoding="latin-1")):
            chunk.columns = [convs.get(k, k) for k in chunk.columns]
            if not "name" in chunk.columns:
                needs_name = True
                chunk['name'] = chunk.get('chrom', chunk[chunk.columns[0]])
            if bins:
                chunk['bin'] = 1
            if i == 0 and not table in self.tables:
                flavor = self.url.split(":")[0]
                schema = sql.get_schema(chunk, table, flavor)
                print(schema)
                self.engine.execute(schema)
            elif i == 0:
                print("""adding to existing table, you may want to drop first""", file=sys.stderr)

            tbl = getattr(self, table)._table
            cols = chunk.columns
            data = list(dict(list(zip(cols, x))) for x in chunk.values)
            if needs_name:
                for d in data:
                    d['name'] = "%s:%s" % (d.get("chrom"), d.get("txStart", d.get("chromStart")))
            if bins:
                for d in data:
                    d['bin'] = max(Genome.bins(int(d["txStart"]), int(d["txEnd"])))
            self.engine.execute(tbl.insert(), data)
            self.session.commit()
            if i > 0:
                print("writing row:", i * 100000, file=sys.stderr)
        if "txStart" in chunk.columns:
            if "chrom" in chunk.columns:
                ssql = """CREATE INDEX "%s.chrom_txStart" ON "%s" (chrom, txStart)""" % (table, table)
            else:
                ssql = """CREATE INDEX "%s.txStart" ON "%s" (txStart)""" % (table, table)

            self.engine.execute(ssql)
        for index in (indexes or []):
            ssql = """CREATE INDEX "%s.%s" ON "%s" (%s)""" % (table,
                                index, table, index)
            self.engine.execute(ssql)

        if bins:
            ssql = """CREATE INDEX "%s.chrom_bin" ON "%s" (chrom, bin)""" % (table, table)
            self.engine.execute(ssql)

        self.session.commit()
Example #17
#!/usr/bin/env python3
import pandas as pd
import urllib.request
from pandas.io.sql import get_schema

url = 'http://www.inpi.gob.mx/cedulas/poblacion-indigena-municipal-2010.xls'
urllib.request.urlretrieve(url, '/data/raw/indigenous_language.xlsx')

df = pd.read_excel("/data/raw/indigenous_language.xlsx",
                   skiprows=0,
                   sheet_name="COMPARATIVO 2010")
df = df.loc[(~df.NOMMUN.isin(["Estados Unidos Mexicanos", "Total Estatal"]))
            & (~df.MPO.isnull())]
df.MPO = df.MPO.map(int).map(lambda x: str(x).zfill(3))

df.to_csv("/data/clean/indigenous_language.csv", sep="|", index=False)

schema = get_schema(df, 'indigenous_language')
with open("/data/sql/indigenous_language.sql", "w") as text_file:
    text_file.write(schema)
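
Note: most examples above accept whatever column types get_schema infers. Example #5 already threads a dtype= mapping through; a minimal sketch of such an override, assuming a SQLAlchemy engine so SQLAlchemy types can be used:

import pandas as pd
from pandas.io.sql import get_schema
from sqlalchemy import create_engine
from sqlalchemy.types import Numeric, String

df = pd.DataFrame({"code": ["01", "02"], "amount": [1.25, 2.50]})
engine = create_engine("sqlite://")
# Keep leading zeros as fixed-width text and force exact decimals for the amounts.
print(get_schema(df, "ledger", con=engine,
                 dtype={"code": String(2), "amount": Numeric(10, 2)}))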