def process_data(cur: extensions.cursor, conn: connection, filepath: str, func: Callable):
    """
    Reads all *.json files in a directory, resolving each file's absolute path and
    appending it to a list. Prints the total number of files found and finally
    processes each of them by calling the passed-in callback.

    :param cur: connection cursor
    :param conn: connection object
    :param filepath: filepath to the directory with songs or logs data
    :param func: callback function to process a single data file: songs or logs
    """
    all_files: List[str] = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root, '*.json'))
        for f in files:
            all_files.append(os.path.abspath(f))

    num_files: int = len(all_files)
    print('{} files found in {}'.format(num_files, filepath))

    for i, datafile in enumerate(all_files, 1):
        func(cur, datafile)
        conn.commit()
        print('{}/{} files processed.'.format(i, num_files))
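# Usage sketch for process_data (a minimal example, not part of the original module):
# it assumes an already-open psycopg2 connection/cursor and a 'data/song_data'
# directory of JSON files; the callback below is a placeholder, not the real loader.
def _example_process_data(cur: extensions.cursor, conn: connection) -> None:
    def noop_loader(cur, datafile):
        # A real callback would parse the JSON file and INSERT its rows via `cur`.
        pass

    process_data(cur, conn, filepath='data/song_data', func=noop_loader)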
def get_df(query: str, conn: connection, dispose_conn=False) -> pd.DataFrame:
    """
    Executes the query and fetches the results.

    Args:
        query: The SQL query
        conn: The connection object to the Postgres database
        dispose_conn: Whether to close the connection after the query

    Returns:
        The results of the query as a DataFrame
    """
    res = sqlio.read_sql_query(query, conn)
    if dispose_conn:
        conn.close()
    return res
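# Usage sketch for get_df (illustrative only): the 'songplays' table is an assumption.
# Passing dispose_conn=True lets get_df close the connection after fetching.
def _example_get_df(conn: connection) -> pd.DataFrame:
    return get_df("SELECT * FROM songplays LIMIT 10;", conn, dispose_conn=True)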
def execute_and_get_rows(db: connection, query: str, args: tuple, keys: list) -> list:
    """Execute the query and return all result rows as dicts keyed by `keys`."""
    rows = list()
    with db, db.cursor() as cur:
        cur.execute(query, args)
        reply = cur.fetchone()
        while reply:
            row = {key: value for key, value in zip(keys, reply)}
            rows.append(row)
            reply = cur.fetchone()
    return rows
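# Usage sketch for execute_and_get_rows; the 'users' table and its columns are
# hypothetical and only illustrate how `keys` labels the positional result columns.
def _example_get_rows(db: connection) -> list:
    return execute_and_get_rows(
        db,
        "SELECT user_id, name, level FROM users WHERE level = %s",
        ("paid",),
        ["user_id", "name", "level"],
    )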
def insert_bulk(conn: Conn, logs: List[Dict[str, Any]]) -> None:
    """Insert a batch of log records into the `logs` table via execute_batch."""
    with conn:
        with conn.cursor() as c:
            values = ((log["timestamp"], log["raw_log"], log["stream"],
                       json.dumps(log["log"]), json.dumps(log["metadata"]))
                      for log in logs)
            psycopg2.extras.execute_batch(c, """
                INSERT INTO logs (timestamp, raw_log, stream, log, metadata)
                VALUES (%s, %s, %s, %s, %s)
            """, values, page_size=1000)
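# Usage sketch for insert_bulk, showing the record shape it expects; the field values
# are made up, the 'logs' table is assumed to exist, and `Conn` is the same connection
# alias used by insert_bulk above.
def _example_insert_bulk(conn: Conn) -> None:
    sample_logs = [{
        "timestamp": "2021-01-01T00:00:00Z",
        "raw_log": '{"level": "info", "msg": "started"}',
        "stream": "stdout",
        "log": {"level": "info", "msg": "started"},
        "metadata": {"host": "worker-1"},
    }]
    insert_bulk(conn, sample_logs)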
def build_references(config: Config, conn: connection = None) -> Dict[str, dict]:
    """
    Build a tables dependency graph.

    Algorithm:
        1) Get all table names
        2) Get all foreign keys
        3) Build a tables dependency graph (references dict):
           for each table, for each child table, build the dependency graph recursively

    Result:
    {
        'references': {
            'table_a': {
                'table_b': {
                    'references': [{'pk_ref': 'id', 'fk_ref': 'table_b_id'}],
                    'ref_tables': {
                        'table_c': {
                            'table_a': {},
                            'table_b': {}
                        },
                        ...
                    }
                },
                'table_c': {...}
            },
            'table_b': {...}
        },
        'primary_keys': {
            'table_a': 'id',
            'table_b': 'id',
            'table_c': 'id'
        }
    }
    """
    references = {}
    primary_keys = {}
    if not conn:
        conn = get_db_conn(config)
    try:
        tables = get_all_tables(conn, config.db_config)
        foreign_keys = get_all_fk(conn, config.db_config)
        for table in tables:
            references[table['table_name']] = {}

        for fk in foreign_keys:
            if fk['main_table'] not in references:
                references[fk['main_table']] = {}
            if fk['ref_table'] not in references[fk['main_table']]:
                references[fk['main_table']][fk['ref_table']] = {
                    'ref_tables': {},
                    'references': []
                }
            table_references = references[fk['main_table']][fk['ref_table']]['references']
            table_references.append(ForeignKey(
                pk_main=fk['main_table_column'],
                pk_ref=fk['ref_pk_columns'],
                fk_ref=fk['ref_fk_column'],
            ))
            primary_keys[fk['main_table']] = fk['main_table_column']

        if references:
            references = OrderedDict(
                sorted(references.items(), key=lambda row: len(row[1]), reverse=True))
            for parent, refs in references.items():
                for ref, ref_data in refs.items():
                    visited = {parent, ref}
                    ref_childs = ref_data['ref_tables']
                    recursive_build(ref, ref_childs, references, visited)
    finally:
        conn.close()

    result = {
        'references': references,
        'primary_keys': primary_keys
    }
    return result
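# Usage sketch for build_references; building a `Config` is project-specific, so a
# pre-built config and open connection are assumed. The loop just prints each table's
# direct foreign-key targets from the returned structure.
def _example_build_references(config: Config, conn: connection) -> None:
    graph = build_references(config, conn)
    for table, refs in graph['references'].items():
        print(table, '->', list(refs.keys()))
    print('primary keys:', graph['primary_keys'])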
def execute_and_get_row(db: connection, query: str, args: tuple, keys: list) -> Dict[str, Any]:
    """Execute the query and return the first result row as a dict keyed by `keys`."""
    with db, db.cursor() as cur:
        cur.execute(query, args)
        reply = cur.fetchone()
        return {key: value for key, value in zip(keys, reply)}
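# Usage sketch for execute_and_get_row; the 'songs' table and columns are hypothetical,
# chosen only to show how `keys` labels the single fetched row.
def _example_get_row(db: connection) -> Dict[str, Any]:
    return execute_and_get_row(
        db,
        "SELECT song_id, title FROM songs WHERE song_id = %s",
        ("some_song_id",),
        ["song_id", "title"],
    )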
def execute(db: connection, query: str, args: tuple) -> None:
    """Execute a statement that returns no rows; the `with db` block commits it."""
    with db, db.cursor() as cur:
        cur.execute(query, args)
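# Usage sketch for execute; the UPDATE statement and 'users' table are assumptions,
# illustrating a write that needs no fetched result.
def _example_execute(db: connection) -> None:
    execute(db, "UPDATE users SET level = %s WHERE user_id = %s", ("paid", 42))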