def generate_partitioned_dataframes(self, df_msgs_and_meta_data):
    """
    Generates the batched dataframes to upload to s3
    Args:
        df_msgs_and_meta_data (DataFrame): dataframe to be partitioned
    Returns: List[(DataFrame, str, str)]: list of triples (df, s3_dir, file_name)
    """
    dataframes_and_fld_locs = []
    for cur_interval_np_datetime in \
            pd.unique(df_msgs_and_meta_data[self.partition_key_nm]):
        df_partition = df_msgs_and_meta_data[
            df_msgs_and_meta_data[self.partition_key_nm] == cur_interval_np_datetime]
        cur_interval_ts = pd.to_datetime(cur_interval_np_datetime)
        batch_file_path = "{}/date_of_batch={}/time_of_batch={}".format(
            self.root_path,
            cur_interval_ts.strftime('%Y%m%d'),
            cur_interval_ts.strftime('%H%M%S'))
        file_nm = generate_snapshot_file_name_with_timestamp()
        LOG.debug("Data path : %s", batch_file_path)
        dataframes_and_fld_locs.append((df_partition, batch_file_path, file_nm))
    return dataframes_and_fld_locs
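# Illustrative sketch (not part of the class above): the same partition-then-path pattern
# demonstrated on a throwaway dataframe. The column name, root path and timestamps below
# are made up for the example.
import pandas as pd

_example_df = pd.DataFrame({
    "batch_ts": pd.to_datetime(["2021-03-01 10:15:00", "2021-03-01 10:15:00",
                                "2021-03-01 10:30:00"]),
    "payload": ["a", "b", "c"],
})
for _np_ts in pd.unique(_example_df["batch_ts"]):
    _partition = _example_df[_example_df["batch_ts"] == _np_ts]
    _ts = pd.to_datetime(_np_ts)
    _path = "example-root/date_of_batch={}/time_of_batch={}".format(
        _ts.strftime('%Y%m%d'), _ts.strftime('%H%M%S'))
    # first iteration: "example-root/date_of_batch=20210301/time_of_batch=101500" (2 rows)
    # second iteration: "example-root/date_of_batch=20210301/time_of_batch=103000" (1 row)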
def _dataframe_to_cassandra_ddl(self,
                                data_frame: DataFrame,
                                primary_key_column_list: List[str],
                                partition_key_column_list: List[str],
                                table_name: str,
                                table_options_statement: str = ""):
    """
    Generates a 'create table' cql statement with primary keys (using compound keys
    for the partition)
    Args:
        data_frame (DataFrame): dataframe whose columns define the table schema
        primary_key_column_list (List[str]): columns to be used as the primary key
        partition_key_column_list (List[str]): columns to be used to partition the data
        table_name (str): name of the table to create
        table_options_statement (str): optional CQL table options appended to the statement
    Returns: str
    """
    column_list = _cql_manage_column_lists(data_frame, primary_key_column_list,
                                           partition_key_column_list)

    # use the first column of the primary key as the partition key if none is specified
    partition_key_column_list = partition_key_column_list \
        if partition_key_column_list is not None and len(partition_key_column_list) > 0 \
        else [primary_key_column_list[0]]
    partition_key = ["(" + ", ".join(partition_key_column_list) + ")"]

    # the remaining primary key columns become the clustering keys
    clustering_key_column_list = [x for x in primary_key_column_list
                                  if x not in partition_key_column_list]
    cluster_keys = [", ".join(clustering_key_column_list)] \
        if len(clustering_key_column_list) > 0 else []

    cql = f"""
    CREATE TABLE IF NOT EXISTS {self.keyspace}.{table_name} (
        {", ".join(column_list)},
        PRIMARY KEY ({", ".join(partition_key + cluster_keys)})
    ) {table_options_statement};
    """
    LOG.debug(cql)
    return cql
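# Illustrative sketch: the primary-key clause the method above derives when no explicit
# partition key is given. The column names are hypothetical and _cql_manage_column_lists
# is not reproduced here.
_primary_key_columns = ["id", "event_ts"]
_partition_part = ["(" + ", ".join([_primary_key_columns[0]]) + ")"]
_cluster_part = [", ".join([c for c in _primary_key_columns if c != _primary_key_columns[0]])]
_pk_clause = "PRIMARY KEY ({})".format(", ".join(_partition_part + _cluster_part))
# _pk_clause == "PRIMARY KEY ((id), event_ts)"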
def _execute_batches(self, batches: List):
    """Executes the given cassandra batches and collects their results"""
    results = []
    LOG.info("Executing cassandra batches")
    for batch in batches:
        results.append(self._execute_batch(batch))
    LOG.info("Finished %s batches", len(results))
    return results
def sync_etl_state_table():
    """
    Utility method to sync (create) the table as per the ORM model
    Returns: None
    """
    LOG.debug("Syncing Cassandra table using the ORM model")
    sync_table(EtlSinkRecordState)
def _show_result(self, execution_id, max_result_size=1000):
    """Fetches the query result for the given execution id and writes it to stdout as CSV"""
    results = self._get_query_result(execution_id, max_result_size)
    column_info = results['ResultSet']['ResultSetMetadata']['ColumnInfo']
    headers = [h['Name'].encode('utf-8') for h in column_info]
    LOG.info(headers)
    csv_writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)
    csv_writer.writerows(
        [[val['VarCharValue'] for val in row['Data']]
         for row in results['ResultSet']['Rows']])
def _cql_upsert_from_dataframe(self, dataframe: DataFrame, table: str):
    """Builds a CQL INSERT statement with one '?' placeholder per dataframe column"""
    upsert_sql = f"""
    INSERT INTO {self.keyspace}.{table}
    ({", ".join(list(dataframe.columns.values))})
    VALUES ({", ".join(['?' for key in dataframe.columns.values])});
    """
    LOG.debug(upsert_sql)
    return upsert_sql
def _cql_upsert_from_dict(self, data: dict, table: str):
    """Builds a CQL INSERT statement with one '?' placeholder per key of the given dict"""
    upsert_sql = f"""
    INSERT INTO {self.keyspace}.{table}
    ({", ".join(data)})
    VALUES ({", ".join(['?' for key in data])});
    """
    LOG.debug(upsert_sql)
    return upsert_sql
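# Illustrative sketch: the statement shape produced by the two upsert builders above, for
# an assumed keyspace "ks", table "events" and columns id and value. Both the dataframe
# columns and the dict keys reduce to a comma-joined column list with one '?' placeholder
# per column, ready to be prepared by the Cassandra driver.
_columns = ["id", "value"]
_upsert = "INSERT INTO ks.events ({}) VALUES ({});".format(
    ", ".join(_columns), ", ".join(['?' for _ in _columns]))
# _upsert == "INSERT INTO ks.events (id, value) VALUES (?, ?);"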
def _check_msg_is_not_error(msg):
    """Returns the message if it is error free, otherwise logs the error and returns None"""
    if msg.error():
        LOG.error("Consumer error: %s", msg.error())
        return None
    LOG.debug("Message from topic: %s", msg.value().decode('utf-8'))
    return msg
def get_fields_types(self, field_types_row_number: int) -> List[str]:
    """
    Get the field types as a list
    Args:
        field_types_row_number (int): row number of the field types
    Returns: field types list
    """
    field_types = self._get_worksheet().row_values(field_types_row_number)
    LOG.debug("Field types:\n%s", field_types)
    return field_types
def poll_topic_and_upload_to_s3(self):
    """
    Polls the Kafka topic at the configured interval, then parses the messages and
    exports them to S3
    Returns: None
    """
    while True:
        LOG.debug("Polling Kafka for messages")
        self.create_events_snapshot()
        LOG.debug("Upload complete, sleeping")
        time.sleep(self._polling_interval)
def _validate_partition_key_list(column_dict, primary_key_column_list, partition_key_column_list):
    _validate_primary_key_list(column_dict, primary_key_column_list)
    if partition_key_column_list is None or not partition_key_column_list:
        LOG.debug(
            "partition_key_column_list : %s\nNo partition key specified. Reverting to "
            "using the first column of the primary key for partitioning.",
            str(partition_key_column_list))
        return
    for key in partition_key_column_list:
        if key not in primary_key_column_list:
            raise ValidationError(
                f"The column {key} is not in the primary key list. It cannot be "
                f"specified as part of the partition key")
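# Illustrative usage sketch: `column_dict` below stands for a hypothetical column mapping
# that already satisfies _validate_primary_key_list; the key lists are made up.
#   _validate_partition_key_list(column_dict, ["id", "event_ts"], ["id"])      # passes
#   _validate_partition_key_list(column_dict, ["id", "event_ts"], [])          # falls back to ["id"]
#   _validate_partition_key_list(column_dict, ["id", "event_ts"], ["region"])  # raises ValidationError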
def create_table(self, table_settings):
    """
    Create a table from given settings
    Args:
        table_settings (dict): dictionary of settings to create the table
    Returns: None
    """
    table_sql = self._build_create_table_sql(table_settings)
    LOG.info(table_sql)
    self.run_query(table_sql)
def _state_manager_connect(self):
    LOG.info("Connecting to Cassandra")
    conn = CassandraConnectionManager(self.__settings.etl_state_manager_connection)
    conn.setup_connection(self.__settings.etl_state_manager_keyspace)
    LOG.info("Cassandra connection established")
    sync_etl_state_table()
def _verify_data_before_upsert(self, data: List[dict]) -> (List[dict], List[dict]):
    """Sanitises every record and splits the outcome into upsertable data and issues"""
    data, issues = map(list, zip(*[self._sanitise_data(dat) for dat in data]))
    # Remove the None placeholders left by _sanitise_data from both lists
    data, issues = [i for i in data if i], [i for i in issues if i]
    if len(issues) > 0:
        LOG.warning("Issues found in verification, number of issues: %i", len(issues))
    return data, issues
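# Illustrative sketch of the unzip-then-filter pattern used above, with a stand-in
# sanitiser that rejects even numbers instead of calling a sink manager.
_pairs = [(n, None) if n % 2 else (None, f"{n} rejected") for n in [1, 2, 3]]
_ok, _issues = map(list, zip(*_pairs))
_ok, _issues = [i for i in _ok if i], [i for i in _issues if i]
# _ok == [1, 3] and _issues == ["2 rejected"]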
def get_msgs(self):
    """
    Get the latest messages from the Kafka topic
    Returns: list(Message): list of Kafka messages
    """
    LOG.debug("Getting messages from topic %s", self._topic)
    if not self._subscribed_to_topic:
        self._subscribe_consumer()
    return self.poll_kafka_for_messages()
def get_all_keys(self, key_prefix: str) -> List[str]:
    """
    List all keys under a given key prefix
    Args:
        key_prefix (str): the key prefix under which all files will be listed
    Returns: List[str]
    """
    LOG.info("Sensing files from s3://%s/%s", self.bucket, key_prefix)
    metadata = self.get_object_metadata(key_prefix)
    lines = [file.key for file in metadata]
    LOG.info("Found %s s3 keys", len(lines))
    return lines
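# Illustrative usage sketch, assuming `s3_util` is an already constructed instance of the
# surrounding s3 utility class pointed at the right bucket; prefix and keys are made up.
#   keys = s3_util.get_all_keys(key_prefix="data/date_of_batch=20210301")
#   # -> ["data/date_of_batch=20210301/time_of_batch=101500/part-000.parquet", ...]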
def _partition_data_and_upload_to_s3(self, data_list, interval):
    """Partitions the messages by time into dataframes, then uploads them to s3"""
    if data_list:
        for msg_df, key, file_name in self.partition_msgs_by_kafka_ts(data_list, interval):
            LOG.debug("data path : %s/%s", key, file_name)
            self._s3_client.upload_dataframe_as_parquet(
                dataframe=msg_df, key=key, file_name=file_name)
def _upsert_data_frame(self, data_frame):
    if self.__settings.destination_batch_size > 1:
        LOG.info("Going to upsert batches of size %s",
                 self.__settings.destination_batch_size)
        result = self._get_cassandra_util().upsert_dataframe_in_batches(
            dataframe=data_frame,
            table=self.__settings.destination_table,
            batch_size=self.__settings.destination_batch_size)
    else:
        LOG.info("Going to upsert one row at a time")
        result = self._get_cassandra_util().upsert_dataframe(
            dataframe=data_frame,
            table=self.__settings.destination_table)
    return result
def create_events_snapshot(self):
    """
    Get Kafka messages from a topic and export to s3
    Returns: None
    """
    msgs = self._kafka_poller.get_msgs()
    LOG.debug("Json messages : %s", msgs)
    self._kafka_s3_exporter.parse_and_export_msgs(msgs, self._polling_interval)
def execute(self, query: str, row_factory: callable, **kwargs) -> Result:
    """
    Execute a cql command and retrieve data with the given row factory
    Args:
        query (str): the cql query to execute
        row_factory (callable): row factory used to shape the returned rows, if any
        **kwargs: kwargs to match the session.execute command in cassandra
    Returns: ResultSet
    """
    LOG.debug("Executing query: %s", query)
    if row_factory is not None:
        self._session.row_factory = row_factory
    return self._session.execute(query, **kwargs)
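# Illustrative usage sketch, assuming `cassandra_util` is an instance of the surrounding
# class and rows are wanted as plain dicts via the DataStax driver's dict_factory;
# keyspace and table names are hypothetical.
#   from cassandra.query import dict_factory
#   rows = cassandra_util.execute(
#       "SELECT * FROM my_keyspace.my_table LIMIT 10;", row_factory=dict_factory)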
def delete_recursive(self, key_prefix: str) -> None:
    """
    Recursively delete all keys with the given prefix from the named bucket
    Args:
        key_prefix (str): key prefix under which all files will be deleted
    Returns: None
    """
    if not key_prefix.endswith("/"):
        key_prefix = f"{key_prefix}/"
    LOG.info("Recursively deleting s3://%s/%s", self.bucket, key_prefix)
    response = self.get_resource().Bucket(self.bucket).objects.filter(
        Prefix=key_prefix).delete()
    LOG.info(response)
def _sanitise_data(self, dat):
    try:
        current_state = self._get_sink_manager(dat).current_state()
        LOG.debug("Current state of sink manager %s", current_state)
        if current_state == EtlStates.Ready:
            LOG.debug("Record in ready state with data: %s", dat)
            return dat, None
        LOG.debug("Sink state found to be not ready, state is %s, the data is: %s",
                  current_state, dat)
        return None, _get_structured_issue(f"Current state is {current_state}", dat)
    except ValidationError as e:
        LOG.warning("Issue while trying to ready a record for the upload \n %s \n %s",
                    e, dat)
        return None, _get_structured_issue(str(e), dat)
def rename_file(self, key: str, new_file_name: str) -> None:
    """
    Rename a file on s3
    Args:
        key (str): current key of the file
        new_file_name (str): target file name
    Returns: None
    """
    s3 = self.get_resource()
    full_new_file_path = key.rpartition('/')[0] + '/' + new_file_name
    LOG.info("Renaming source: %s to %s", key, full_new_file_path)
    s3.Object(self.bucket, full_new_file_path).copy_from(
        CopySource={'Bucket': self.bucket, 'Key': key})
    s3.Object(self.bucket, key).delete()
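# Illustrative sketch: how the target key is derived above; rpartition keeps everything
# before the last "/" so only the file name changes. The key below is hypothetical.
_key = "landing/2021/03/01/part-000.parquet"
_renamed = _key.rpartition('/')[0] + '/' + "renamed.parquet"
# _renamed == "landing/2021/03/01/renamed.parquet"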
def add_versions_from_json_file(self, version_file_location):
    """
    Load a json file from local storage and append these versions to the version
    tracking dictionary
    Args:
        version_file_location (str): path to the json file
    Returns: None
    """
    try:
        with open(version_file_location, "r") as content:
            file_content = content.read()
        version_dict_from_file = json.loads(file_content)
        self.add_dictionary_to_versions(version_dict_from_file)
    except FileNotFoundError as fnf:
        log.error("No versioning file found at : %s", version_file_location)
        raise fnf
    except json.decoder.JSONDecodeError as decode_error:
        log.error("JSON file failed to decode, check version dict is "
                  "correctly formatted")
        raise decode_error
    except Exception as exception:
        log.error("unknown error")
        log.error(traceback.format_exc())
        raise exception
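# Illustrative sketch: the kind of JSON document the loader above expects, i.e. an object
# that json.loads turns into the dict handed to add_dictionary_to_versions. The keys and
# values here are hypothetical.
#   {
#       "schema_version": "1.2.0",
#       "transform_version": "0.4.1"
#   }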
def _multi_process_upload_file(settings: AwsConnectionSettings,
                               filename: str,
                               bucket: str,
                               key: str) -> None:
    """
    A standalone copy of the upload method, making it simple to pickle inside a
    multiprocessing pool
    Args:
        settings (AwsConnectionSettings): the s3 connection settings to use for the upload
        filename (str): local file name of the file to be uploaded
        bucket (str): the s3 bucket to upload the file to
        key (str): the s3 key to use while uploading the file
    Returns: None
    """
    LOG.info("Uploading File %s to s3://%s/%s", filename, bucket, key)
    S3Util(
        conn=AwsConnectionManager(settings),
        bucket=bucket
    ).upload_file(local_file_path=filename, key=key)
def get_page_as_list_of_dict(page: dict) -> List[OrderedDict]:
    """
    Converts a list of entries from a Google AdWords API response page into a list of
    ordered dictionaries
    Args:
        page (dict): the response page from the Google AdWords API
    Returns: List[OrderedDict]
    """
    result = []
    if 'entries' in page:
        entries = page['entries']
        # These entries are a list of zeep objects that need conversion to dict
        result = [zeep_object_to_dict(entry) for entry in entries]
        LOG.debug("The result from the AdWords API: %s", result)
    else:
        LOG.info('No entries were found.')
    return result
def add_partitions(self):
    """
    Add the current Data Transfer's partition to Athena's Metadata
    Returns: None
    """
    if self.__settings.is_partitioned_table:
        athena_util = self._get_athena_util()
        athena_util.add_partitions(
            table=self.__settings.target_table,
            partition_keys=[key for (key, value) in self.__settings.partition_values],
            partition_values=[value for (key, value) in self.__settings.partition_values])
    else:
        LOG.warning("The table is not partitioned, this is a NOOP")
def parse_and_export_msgs(self, list_of_msgs, interval):
    """
    Converts messages to a pandas dataframe and then exports to s3
    Args:
        list_of_msgs (list(Kafka Message Object)): list of msg objects
        interval (int): rounding interval for the temporal partitioning
    Returns: None
    """
    good_data, bad_data = convert_msgs_to_dictionary(list_of_msgs)
    self._partition_data_and_upload_to_s3(good_data, interval)
    self._partition_data_and_upload_to_s3(bad_data, interval)
    LOG.info("Data Upload Complete")
def download_directory(self, source_key: str, file_suffix: str, local_directory: str) -> None:
    """
    Download an entire directory from s3 onto the local file system
    Args:
        source_key (str): key prefix of the directory to be downloaded from s3
        file_suffix (str): suffix to filter a subset under the source_key to be downloaded
        local_directory (str): local absolute path to store all the files
    Returns: None
    """
    s3 = self.get_resource()
    LOG.info("Downloading s3://%s/%s to %s", self.bucket, source_key, local_directory)
    for obj in s3.Bucket(self.bucket).objects.filter(Prefix=source_key):
        if obj.key.endswith(file_suffix):
            key_path = obj.key.split("/")
            filename = f"{local_directory}/{key_path[-1]}"
            self.download_file(local_file_path=filename, key=obj.key)
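# Illustrative sketch: the local file name derived above keeps only the last segment of
# the s3 key. The key and directory below are hypothetical.
_obj_key = "exports/date_of_batch=20210301/part-000.parquet"
_local_file = "{}/{}".format("/tmp/downloads", _obj_key.split("/")[-1])
# _local_file == "/tmp/downloads/part-000.parquet"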
def delete_recursive_match_suffix(self, key_prefix: str, suffix: str) -> None:
    """
    Recursively delete all keys with the given key prefix and suffix from the bucket
    Args:
        key_prefix (str): key prefix under which all files will be deleted
        suffix (str): suffix of the subset of files under the prefix to be deleted
    Returns: None
    """
    if not key_prefix:
        raise ValueError("key_prefix must not be empty")
    if not suffix:
        raise ValueError("suffix must not be empty")
    s3 = self.get_resource()
    for obj in s3.Bucket(self.bucket).objects.filter(Prefix=key_prefix):
        if obj.key.endswith(suffix):
            LOG.info("deleting s3://%s/%s", self.bucket, obj.key)
            response = obj.delete()
            LOG.info("Response: %s", response)