def read_and_validate_csv(file_handle, delimiter=',', mandatory_fields=None):
    """Generator for reading a CSV file.

    Args:
        file_handle: a file-like object containing the CSV content.
        delimiter: character used as a field separator, default: ','
        mandatory_fields: list of fields that must be present in the CSV
            header.

    Raises:
        RuntimeError: when there are missing fields.
        DataIngestionError: when there are issues with the data ingestion.

    Yields:
        A pandas Series for each validated row.
    """
    if not mandatory_fields:
        mandatory_fields = TIMESKETCH_FIELDS

    # Ensures delimiter is a string.
    if not isinstance(delimiter, six.text_type):
        delimiter = codecs.decode(delimiter, 'utf8')

    header_reader = pandas.read_csv(file_handle, sep=delimiter, nrows=0)
    _validate_csv_fields(mandatory_fields, header_reader)

    # Rewind so the chunked read below starts at the beginning of the file
    # instead of wherever the header pass left the handle.
    if hasattr(file_handle, 'seek'):
        file_handle.seek(0)

    try:
        reader = pandas.read_csv(
            file_handle, sep=delimiter, chunksize=DEFAULT_CHUNK_SIZE)
        for idx, chunk in enumerate(reader):
            skipped_rows = chunk[chunk['datetime'].isnull()]
            if not skipped_rows.empty:
                logger.warning(
                    '{0} rows skipped since they were missing a datetime '
                    'field or it was empty'.format(len(skipped_rows)))

            # Normalize datetime to ISO 8601 format if it's not the case.
            try:
                chunk['datetime'] = pandas.to_datetime(chunk['datetime'])
                chunk['timestamp'] = chunk['datetime'].dt.strftime(
                    '%s%f').astype(int)
                chunk['datetime'] = chunk['datetime'].apply(
                    Timestamp.isoformat).astype(str)
            except ValueError:
                logger.warning(
                    'Rows {0} to {1} skipped due to malformed datetime '
                    'values'.format(
                        idx * reader.chunksize,
                        idx * reader.chunksize + chunk.shape[0]))
                continue

            if 'tag' in chunk:
                chunk['tag'] = chunk['tag'].apply(_parse_tag_field)

            for _, row in chunk.iterrows():
                _scrub_special_tags(row)
                yield row
    except pandas.errors.ParserError as e:
        error_string = 'Unable to read file, with error: {0!s}'.format(e)
        logger.error(error_string)
        raise errors.DataIngestionError(error_string) from e
def read_and_validate_csv(file_handle, delimiter=','):
    """Generator for reading a CSV file.

    Args:
        file_handle: a file-like object containing the CSV content.
        delimiter: character used as a field separator, default: ','

    Raises:
        RuntimeError: when there are missing fields.
        DataIngestionError: when there are issues with the data ingestion.
    """
    # Columns that must be present in the CSV file.
    mandatory_fields = ['message', 'datetime', 'timestamp_desc']

    # Ensures delimiter is a string.
    if not isinstance(delimiter, six.text_type):
        delimiter = codecs.decode(delimiter, 'utf8')

    # Due to issues with python2.
    if six.PY2:
        delimiter = str(delimiter)

    reader = csv.DictReader(file_handle, delimiter=delimiter)
    csv_header = reader.fieldnames
    missing_fields = []

    # Validate the CSV header.
    for field in mandatory_fields:
        if field not in csv_header:
            missing_fields.append(field)
    if missing_fields:
        raise RuntimeError('Missing fields in CSV header: {0:s}'.format(
            ','.join(missing_fields)))

    try:
        for row in reader:
            try:
                # Normalize datetime to ISO 8601 format if it's not the case.
                parsed_datetime = parser.parse(row['datetime'])
                row['datetime'] = parsed_datetime.isoformat()

                normalized_timestamp = int(
                    time.mktime(parsed_datetime.utctimetuple()) * 1000000)
                normalized_timestamp += parsed_datetime.microsecond
                row['timestamp'] = str(normalized_timestamp)

                if 'tag' in row:
                    row['tag'] = [
                        x for x in _parse_tag_field(row['tag']) if x]
                _scrub_special_tags(row)
            except ValueError:
                continue
            yield row
    except csv.Error as e:
        error_string = 'Unable to read file, with error: {0!s}'.format(e)
        logger.error(error_string)
        raise errors.DataIngestionError(error_string)
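# Usage sketch (added for illustration, not part of the original module):
# drives the csv.DictReader-based reader above from an in-memory buffer.
# The sample rows are hypothetical and contain only the mandatory columns.
def _demo_read_csv_dictreader():
    import io

    csv_data = io.StringIO(
        'message,datetime,timestamp_desc\n'
        'Logon event,2021-06-12T14:32:00,Event Logged\n'
        'Logoff event,2021-06-12T15:01:30,Event Logged\n')
    for event in read_and_validate_csv(csv_data):
        # 'datetime' comes back normalized to ISO 8601 and 'timestamp'
        # holds microseconds since the epoch.
        print(event['datetime'], event['timestamp'], event['message'])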
def read_and_validate_jsonl(file_handle):
    """Generator for reading a JSONL (json lines) file.

    Args:
        file_handle: a file-like object containing the JSONL content.

    Raises:
        RuntimeError: if there are missing fields.
        DataIngestionError: If the ingestion fails.

    Yields:
        A dict that's ready to add to the datastore.
    """
    # Fields that must be present in each entry of the JSONL file.
    mandatory_fields = ["message", "datetime", "timestamp_desc"]
    lineno = 0
    for line in file_handle:
        lineno += 1
        try:
            linedict = json.loads(line)
            ld_keys = linedict.keys()
            if "datetime" not in ld_keys and "timestamp" in ld_keys:
                epoch = int(str(linedict["timestamp"])[:10])
                dt = datetime.datetime.fromtimestamp(epoch)
                linedict["datetime"] = dt.isoformat()
            if "timestamp" not in ld_keys and "datetime" in ld_keys:
                try:
                    linedict["timestamp"] = int(
                        parser.parse(linedict["datetime"]).timestamp()
                        * 1000000)
                except parser.ParserError:
                    logger.error(
                        "Unable to parse timestamp, skipping line "
                        "{0:d}".format(lineno),
                        exc_info=True)
                    continue

            missing_fields = [
                x for x in mandatory_fields if x not in linedict]
            if missing_fields:
                raise RuntimeError(
                    "Missing field(s) at line {0:n}: {1:s}".format(
                        lineno, ",".join(missing_fields)))

            if "tag" in linedict:
                linedict["tag"] = [
                    x for x in _parse_tag_field(linedict["tag"]) if x]
            _scrub_special_tags(linedict)
            yield linedict
        except ValueError as e:
            raise errors.DataIngestionError(
                "Error parsing JSON at line {0:n}: {1:s}".format(
                    lineno, str(e))) from e
def read_and_validate_jsonl(file_handle):
    """Generator for reading a JSONL (json lines) file.

    Args:
        file_handle: a file-like object containing the JSONL content.

    Raises:
        RuntimeError: if there are missing fields.
        DataIngestionError: If the ingestion fails.

    Yields:
        A dict that's ready to add to the datastore.
    """
    # Fields that must be present in each entry of the JSONL file.
    mandatory_fields = ['message', 'datetime', 'timestamp_desc']
    lineno = 0
    for line in file_handle:
        lineno += 1
        try:
            linedict = json.loads(line)
            ld_keys = linedict.keys()
            if 'datetime' not in ld_keys and 'timestamp' in ld_keys:
                epoch = int(str(linedict['timestamp'])[:10])
                dt = datetime.datetime.fromtimestamp(epoch)
                linedict['datetime'] = dt.isoformat()
            if 'timestamp' not in ld_keys and 'datetime' in ld_keys:
                # Store microseconds since the epoch, matching the other
                # ingestion paths, instead of a raw datetime object.
                linedict['timestamp'] = int(
                    parser.parse(linedict['datetime']).timestamp() * 1000000)

            missing_fields = [
                x for x in mandatory_fields if x not in linedict]
            if missing_fields:
                raise RuntimeError(
                    'Missing field(s) at line {0:n}: {1:s}'.format(
                        lineno, ','.join(missing_fields)))

            if 'tag' in linedict:
                linedict['tag'] = [
                    x for x in _parse_tag_field(linedict['tag']) if x]
            _scrub_special_tags(linedict)
            yield linedict
        except ValueError as e:
            raise errors.DataIngestionError(
                'Error parsing JSON at line {0:n}: {1:s}'.format(
                    lineno, str(e))) from e
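# Usage sketch (added for illustration, not part of the original module):
# feeds the JSONL reader above from an in-memory buffer. The two events are
# hypothetical; the first gets a 'timestamp' backfilled from 'datetime',
# the second gets a 'datetime' backfilled from its epoch 'timestamp'.
def _demo_read_jsonl():
    import io

    jsonl_data = io.StringIO(
        '{"message": "Logon event", "datetime": "2021-06-12T14:32:00", '
        '"timestamp_desc": "Event Logged"}\n'
        '{"message": "Logoff event", "timestamp": 1623509490, '
        '"timestamp_desc": "Event Logged"}\n')
    for event in read_and_validate_jsonl(jsonl_data):
        print(event['datetime'], event['timestamp'], event['message'])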
def read_and_validate_csv(file_handle, delimiter=",", mandatory_fields=None): """Generator for reading a CSV file. Args: file_handle: a file-like object containing the CSV content. delimiter: character used as a field separator, default: ',' mandatory_fields: list of fields that must be present in the CSV header. Raises: RuntimeError: when there are missing fields. DataIngestionError: when there are issues with the data ingestion. """ if not mandatory_fields: mandatory_fields = TIMESKETCH_FIELDS # Ensures delimiter is a string. if not isinstance(delimiter, six.text_type): delimiter = codecs.decode(delimiter, "utf8") header_reader = pandas.read_csv(file_handle, sep=delimiter, nrows=0) _validate_csv_fields(mandatory_fields, header_reader) if hasattr(file_handle, "seek"): file_handle.seek(0) try: reader = pandas.read_csv(file_handle, sep=delimiter, chunksize=DEFAULT_CHUNK_SIZE) for idx, chunk in enumerate(reader): skipped_rows = chunk[chunk["datetime"].isnull()] if not skipped_rows.empty: logger.warning( "{0} rows skipped since they were missing a datetime field " "or it was empty ".format(len(skipped_rows))) # Normalize datetime to ISO 8601 format if it's not the case. try: # Lines with unrecognized datetime format will result in "NaT" # (not available) as its value and the event row will be # dropped in the next line chunk["datetime"] = pandas.to_datetime(chunk["datetime"], errors="coerce") num_chunk_rows = chunk.shape[0] chunk.dropna(subset=["datetime"], inplace=True) if len(chunk) < num_chunk_rows: logger.warning( "{0} rows dropped from Rows {1} to {2} due to invalid " "datetime values".format( num_chunk_rows - len(chunk), idx * reader.chunksize, idx * reader.chunksize + num_chunk_rows, )) chunk["timestamp"] = chunk["datetime"].dt.strftime( "%s%f").astype(int) chunk["datetime"] = (chunk["datetime"].apply( Timestamp.isoformat).astype(str)) except ValueError: logger.warning("Rows {0} to {1} skipped due to malformed " "datetime values ".format( idx * reader.chunksize, idx * reader.chunksize + chunk.shape[0])) continue if "tag" in chunk: chunk["tag"] = chunk["tag"].apply(_parse_tag_field) for _, row in chunk.iterrows(): _scrub_special_tags(row) # Remove all NAN values from the pandas.Series. row.dropna(inplace=True) yield row.to_dict() except (pandas.errors.EmptyDataError, pandas.errors.ParserError) as e: error_string = "Unable to read file, with error: {0!s}".format(e) logger.error(error_string) raise errors.DataIngestionError(error_string) from e