Example #1
    def test_reserved_words(self, engine, connection):
        """Presto uses double quotes, not backticks"""
        fake_table = Table('select', MetaData(bind=engine), Column('current_timestamp', STRINGTYPE))
        query = str(fake_table.select(fake_table.c.current_timestamp == 'a'))
        self.assertIn('"select"', query)
        self.assertIn('"current_timestamp"', query)
        self.assertNotIn('`select`', query)
        self.assertNotIn('`current_timestamp`', query)
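For comparison, a standalone sketch of how the same statement compiles under different identifier-quoting rules. It assumes SQLAlchemy 1.4+ and uses PostgreSQL as a stand-in for the ANSI double-quote behaviour that Presto follows, since SQLAlchemy core ships no Presto dialect:

from sqlalchemy import Column, MetaData, String, Table, select
from sqlalchemy.dialects import mysql, postgresql

fake_table = Table("select", MetaData(), Column("current_timestamp", String))
stmt = select(fake_table).where(fake_table.c.current_timestamp == "a")

# ANSI-style dialects wrap reserved words in double quotes.
print(stmt.compile(dialect=postgresql.dialect()))
# MySQL-style dialects use backticks instead.
print(stmt.compile(dialect=mysql.dialect()))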
Example #2
    def test_to_sql(self, engine, conn):
        # TODO pyathena.error.OperationalError: SYNTAX_ERROR: line 1:305:
        #      Column 'foobar' cannot be resolved.
        #      def _format_bytes(formatter, escaper, val):
        #          return val.decode()
        table_name = "to_sql_{0}".format(str(uuid.uuid4()).replace("-", ""))
        df = pd.DataFrame({
            "col_int": np.int32([1]),
            "col_bigint": np.int64([12345]),
            "col_float": np.float32([1.0]),
            "col_double": np.float64([1.2345]),
            "col_string": ["a"],
            "col_boolean": np.bool_([True]),
            "col_timestamp": [datetime(2020, 1, 1, 0, 0, 0)],
            "col_date": [date(2020, 12, 31)],
            # "col_binary": "foobar".encode(),
        })
        # Explicitly specify column order
        df = df[[
            "col_int",
            "col_bigint",
            "col_float",
            "col_double",
            "col_string",
            "col_boolean",
            "col_timestamp",
            "col_date",
            # "col_binary",
        ]]
        df.to_sql(
            table_name,
            engine,
            schema=SCHEMA,
            index=False,
            if_exists="replace",
            method="multi",
        )

        table = Table(table_name, MetaData(bind=engine), autoload=True)
        self.assertEqual(
            table.select().execute().fetchall(),
            [(
                1,
                12345,
                1.0,
                1.2345,
                "a",
                True,
                datetime(2020, 1, 1, 0, 0, 0),
                date(2020, 12, 31),
                # "foobar".encode(),
            )],
        )
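If the dialect needs different column types than pandas infers, to_sql also accepts an explicit dtype mapping. A minimal sketch, reusing the engine, SCHEMA and DataFrame from the test above (the chosen types are illustrative, not necessarily what pyathena requires):

from sqlalchemy.types import BIGINT, DATE, FLOAT, INTEGER, TIMESTAMP

# Sketch: pass SQLAlchemy types explicitly instead of relying on dtype inference.
df.to_sql(
    table_name,
    engine,
    schema=SCHEMA,
    index=False,
    if_exists="replace",
    method="multi",
    dtype={
        "col_int": INTEGER(),
        "col_bigint": BIGINT(),
        "col_double": FLOAT(),
        "col_timestamp": TIMESTAMP(),
        "col_date": DATE(),
    },
)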
Example #3
    def test_to_sql(self, engine, conn):
        # TODO Add binary column (After dropping support for Python 2.7)
        table_name = "to_sql_{0}".format(str(uuid.uuid4()).replace("-", ""))
        df = pd.DataFrame(
            {
                "col_int": np.int32([1]),
                "col_bigint": np.int64([12345]),
                "col_float": np.float32([1.0]),
                "col_double": np.float64([1.2345]),
                "col_string": ["a"],
                "col_boolean": np.bool_([True]),
                "col_timestamp": [datetime(2020, 1, 1, 0, 0, 0)],
                "col_date": [date(2020, 12, 31)],
            }
        )
        # Explicitly specify column order
        df = df[
            [
                "col_int",
                "col_bigint",
                "col_float",
                "col_double",
                "col_string",
                "col_boolean",
                "col_timestamp",
                "col_date",
            ]
        ]
        df.to_sql(
            table_name,
            engine,
            schema=SCHEMA,
            index=False,
            if_exists="replace",
            method="multi",
        )

        table = Table(table_name, MetaData(bind=engine), autoload=True)
        self.assertEqual(
            table.select().execute().fetchall(),
            [
                (
                    1,
                    12345,
                    1.0,
                    1.2345,
                    "a",
                    True,
                    datetime(2020, 1, 1, 0, 0, 0),
                    date(2020, 12, 31),
                )
            ],
        )
Example #4
    def test_to_sql(self, engine, conn):
        table_name = 'to_sql_{0}'.format(str(uuid.uuid4()).replace('-', ''))
        df = pd.DataFrame({'a': [1, 2, 3, 4, 5]})
        df.to_sql(table_name,
                  engine,
                  schema=SCHEMA,
                  index=False,
                  if_exists='replace',
                  method='multi')

        table = Table(table_name, MetaData(bind=engine), autoload=True)
        rows = table.select().execute().fetchall()
        self.assertEqual(sorted(rows), [(1, ), (2, ), (3, ), (4, ), (5, )])
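The same round trip can also be checked through pandas instead of a reflected Table; a small sketch assuming the engine and SCHEMA fixtures from the test above:

import pandas as pd

# Sketch: read the freshly written table back into a DataFrame and compare.
df_back = pd.read_sql(f"SELECT * FROM {SCHEMA}.{table_name}", engine)
assert sorted(df_back["a"].tolist()) == [1, 2, 3, 4, 5]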
Example #5
    def test_reflect_select(self, engine, conn):
        one_row_complex = Table("one_row_complex",
                                MetaData(bind=engine),
                                autoload=True)
        self.assertEqual(len(one_row_complex.c), 15)
        self.assertIsInstance(one_row_complex.c.col_string, Column)
        rows = one_row_complex.select().execute().fetchall()
        self.assertEqual(len(rows), 1)
        self.assertEqual(
            list(rows[0]),
            [
                True,
                127,
                32767,
                2147483647,
                9223372036854775807,
                0.5,
                0.25,
                "a string",
                datetime(2017, 1, 1, 0, 0, 0),
                date(2017, 1, 2),
                b"123",
                "[1, 2]",
                "{1=2, 3=4}",
                "{a=1, b=2}",
                Decimal("0.1"),
            ],
        )
        self.assertIsInstance(one_row_complex.c.col_boolean.type, BOOLEAN)
        self.assertIsInstance(one_row_complex.c.col_tinyint.type, INTEGER)
        self.assertIsInstance(one_row_complex.c.col_smallint.type, INTEGER)
        self.assertIsInstance(one_row_complex.c.col_int.type, INTEGER)
        self.assertIsInstance(one_row_complex.c.col_bigint.type, BIGINT)
        self.assertIsInstance(one_row_complex.c.col_float.type, FLOAT)
        self.assertIsInstance(one_row_complex.c.col_double.type, FLOAT)
        self.assertIsInstance(one_row_complex.c.col_string.type,
                              type(STRINGTYPE))
        self.assertIsInstance(one_row_complex.c.col_timestamp.type, TIMESTAMP)
        self.assertIsInstance(one_row_complex.c.col_date.type, DATE)
        self.assertIsInstance(one_row_complex.c.col_binary.type, BINARY)
        self.assertIsInstance(one_row_complex.c.col_array.type,
                              type(STRINGTYPE))
        self.assertIsInstance(one_row_complex.c.col_map.type, type(STRINGTYPE))
        self.assertIsInstance(one_row_complex.c.col_struct.type,
                              type(STRINGTYPE))
        self.assertIsInstance(one_row_complex.c.col_decimal.type, DECIMAL)
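The reflected column types can also be listed without building a Table; a minimal sketch using SQLAlchemy's inspector, assuming the engine fixture from the test above:

from sqlalchemy import inspect

# Sketch: print name and reflected type for every column of the table.
inspector = inspect(engine)
for col in inspector.get_columns("one_row_complex"):
    print(col["name"], col["type"])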
def save_from_text_to_database(engine: Engine, df: pd.DataFrame):
    """ Saving these fields
            Column('language', String),
            Column('chamber', String),
            Column('date', Date),
            Column('file_name', String),
            Column('file_number', String),
            Column('file_number_additional', String),
            Column('html_url', String),
            Column('html_raw', String),
            Column('pdf_url', String),
            Column('pdf_raw', String),
    """
    def save_to_db(df: pd.DataFrame, table: str):
        # If a Series is passed instead of a DataFrame (an edge case for some
        # courts), convert it to a single-row DataFrame before inserting.
        if not isinstance(df, pd.DataFrame):
            df = df.to_frame().T
        df.to_sql(table, engine, if_exists="append", index=False)

    def add_ids_to_df_for_decision(series: pd.Series) -> pd.Series:
        query = f"SELECT file_id FROM file WHERE file_name = '{series['file_name']}'"
        series['file_id'] = pd.read_sql(query, engine.connect())["file_id"][0]
        series['language_id'] = -1
        query = f"SELECT chamber_id FROM chamber WHERE chamber_string = '{series['chamber']}'"
        chamber_id = pd.read_sql(query, engine.connect())['chamber_id']
        if len(chamber_id) == 0:
            print(
                f"The chamber {series['chamber']} was not found in the database. "
                f"Add it with the respective court and spider")
            raise ValueError
        else:
            series['chamber_id'] = chamber_id[0]

        series['decision_id'] = uuid.uuid5(uuid.UUID(int=0),
                                           series['file_name'])
        # TODO: Add topic recognition, similar to the title of the court decision
        series['topic'] = ''
        return series

    def save_the_file_numbers(series: pd.Series) -> pd.Series:
        """
        Saves the file_number (and any additional file number) for each decision id
        :param series: one row of the scraped DataFrame
        :return: the same row, with the file-number text saved to the database
        """
        query = f"SELECT decision_id FROM decision WHERE file_id = '{series['file_id']}'"
        series['decision_id'] = pd.read_sql(query,
                                            engine.connect())["decision_id"][0]
        with engine.connect() as conn:
            t = Table('file_number', MetaData(), autoload_with=engine)
            # Delete and reinsert as no upsert command is available
            stmt = t.delete().where(delete_stmt_decisions_with_df(series))
            conn.execute(stmt)
        series['text'] = series['file_number'].strip()  # .map(lambda x: x.strip())
        save_to_db(series[['decision_id', 'text']], 'file_number')
        if ('file_number_additional' in series
                and series['file_number_additional'] is not None
                and len(series['file_number_additional']) > 0):
            series['text'] = series['file_number_additional'].strip()  # .map(lambda x: x.strip())
            save_to_db(series[['decision_id', 'text']], 'file_number')
        return series

    if df.empty:
        return

    # Delete old decision and file entries
    with engine.connect() as conn:
        t_fil = Table('file', MetaData(), autoload_with=engine)
        t_dec = Table('decision', MetaData(), autoload_with=engine)
        file_name_list = ','.join(
            ["'" + str(item) + "'" for item in df['file_name'].tolist()])
        stmt = t_fil.select().where(text(f"file_name in ({file_name_list})"))
        file_ids = [item['file_id'] for item in conn.execute(stmt).all()]
        if len(file_ids) > 0:
            file_ids_list = ','.join(
                ["'" + str(item) + "'" for item in file_ids])
            # decision_ids = [item['decision_id'] for item in conn.execute(t_dec.select().where(text(f"file_id in ({file_ids_list})"))).all()]

            stmt = t_dec.delete().where(text(f"file_id in ({file_ids_list})"))
            conn.execute(stmt)
            stmt = t_fil.delete().where(text(f"file_id in ({file_ids_list})"))
            conn.execute(stmt)

    save_to_db(df[['file_name', 'html_url', 'pdf_url', 'html_raw', 'pdf_raw']],
               'file')

    df = df.apply(add_ids_to_df_for_decision, 1)

    # Convert pandas NaT (Not a Time) and NaN values to None; np.nan matches these missing-value markers
    df = df.replace({np.nan: None})
    df['date'] = df['date'].replace(r'^\s*$', None, regex=True)
    df['date'] = df['date'].astype('datetime64[ns]')
    save_to_db(df[['language_id', 'chamber_id', 'file_id', 'date', 'topic']],
               'decision')
    df.apply(save_the_file_numbers, 1)
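A hypothetical usage sketch for the function above; the engine URL and the single scraped row are placeholders, with columns following the layout listed in the docstring:

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql://user:password@localhost:5432/scrc")  # placeholder URL
row = {
    "language": "de",
    "chamber": "CH_BGer_001",  # hypothetical; must already exist in the chamber table
    "date": "2020-01-01",
    "file_name": "example_decision.html",
    "file_number": "1C_123/2020",  # hypothetical file number
    "file_number_additional": None,
    "html_url": "https://example.com/decision.html",
    "html_raw": "<html>...</html>",
    "pdf_url": "",
    "pdf_raw": "",
}
save_from_text_to_database(engine, pd.DataFrame([row]))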
#-*- encoding: utf-8 -*-
'''
Created on 2014-11-5

@author: [email protected]
'''
from sqlalchemy.sql.schema import MetaData, Table


metadata = MetaData('mysql://%s:%s@%s/%s?charset=utf8' % ('root', 'root', '172.16.109.105:3306', 'itgfz2014'))

if __name__ == '__main__':

    mem_tab = Table('itgfz_member', metadata, autoload=True)

    stat = mem_tab.select()
    print(stat)
    r = stat.execute()
    print([v for v in r.fetchall()])
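For reference, a hedged sketch of the same lookup written for SQLAlchemy 1.4+, where MetaData no longer accepts a bind URL and statements are executed through an explicit connection (connection values are reused from the snippet above):

from sqlalchemy import MetaData, Table, create_engine, select

engine = create_engine('mysql://%s:%s@%s/%s?charset=utf8' % ('root', 'root', '172.16.109.105:3306', 'itgfz2014'))
metadata = MetaData()

with engine.connect() as conn:
    # Reflect the table and run the SELECT through the connection.
    mem_tab = Table('itgfz_member', metadata, autoload_with=engine)
    result = conn.execute(select(mem_tab))
    print(list(result.fetchall()))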