Example #1
def read_and_validate_csv(file_handle, delimiter=',', mandatory_fields=None):
    """Generator for reading a CSV file.

    Args:
        file_handle: a file-like object containing the CSV content.
        delimiter: character used as a field separator, default: ','
        mandatory_fields: list of fields that must be present in the CSV header.

    Raises:
        RuntimeError: when there are missing fields.
        DataIngestionError: when there are issues with the data ingestion.
    """
    if not mandatory_fields:
        mandatory_fields = TIMESKETCH_FIELDS
    # Ensures delimiter is a string.
    if not isinstance(delimiter, six.text_type):
        delimiter = codecs.decode(delimiter, 'utf8')

    header_reader = pandas.read_csv(file_handle, sep=delimiter, nrows=0)
    _validate_csv_fields(mandatory_fields, header_reader)

    # Rewind so the chunked read below starts from the header again;
    # the nrows=0 header read above already consumed part of the stream.
    if hasattr(file_handle, 'seek'):
        file_handle.seek(0)

    try:
        reader = pandas.read_csv(file_handle,
                                 sep=delimiter,
                                 chunksize=DEFAULT_CHUNK_SIZE)
        for idx, chunk in enumerate(reader):
            skipped_rows = chunk[chunk['datetime'].isnull()]
            if not skipped_rows.empty:
                logger.warning(
                    '{0} rows skipped because the datetime field was '
                    'missing or empty'.format(len(skipped_rows)))

            # Normalize datetime to ISO 8601 format if it's not the case.
            try:
                chunk['datetime'] = pandas.to_datetime(chunk['datetime'])

                chunk['timestamp'] = chunk['datetime'].dt.strftime(
                    '%s%f').astype(int)
                chunk['datetime'] = chunk['datetime'].apply(
                    Timestamp.isoformat).astype(str)
            except ValueError:
                warning_string = ('Rows {0} to {1} skipped due to malformed '
                                  'datetime values')
                logger.warning(
                    warning_string.format(
                        idx * reader.chunksize,
                        idx * reader.chunksize + chunk.shape[0]))
                continue
            if 'tag' in chunk:
                chunk['tag'] = chunk['tag'].apply(_parse_tag_field)
            for _, row in chunk.iterrows():
                _scrub_special_tags(row)
                yield row
    except pandas.errors.ParserError as e:
        error_string = 'Unable to read file, with error: {0!s}'.format(e)
        logger.error(error_string)
        raise errors.DataIngestionError(error_string) from e
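
A minimal usage sketch for this generator, assuming it runs inside the Timesketch utils module so that TIMESKETCH_FIELDS, DEFAULT_CHUNK_SIZE, logger, and the underscore-prefixed helpers resolve; the file name events.csv is a placeholder:

import codecs

with codecs.open('events.csv', 'r', encoding='utf-8') as fh:
    for row in read_and_validate_csv(fh):
        # Each yielded row is a pandas.Series keyed by the CSV columns,
        # with 'datetime' normalized to ISO 8601 and 'timestamp' holding
        # microseconds since the epoch.
        print(row['datetime'], row['message'])
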
Example #2
File: utils.py  Project: roo1989/timesketch
def read_and_validate_csv(file_handle, delimiter=','):
    """Generator for reading a CSV file.

    Args:
        file_handle: a file-like object containing the CSV content.
        delimiter: character used as a field separator, default: ','

    Raises:
        RuntimeError: when there are missing fields.
        DataIngestionError: when there are issues with the data ingestion.
    """
    # Columns that must be present in the CSV file.
    mandatory_fields = ['message', 'datetime', 'timestamp_desc']

    # Ensures delimiter is a string.
    if not isinstance(delimiter, six.text_type):
        delimiter = codecs.decode(delimiter, 'utf8')

    # Due to issues with python2.
    if six.PY2:
        delimiter = str(delimiter)

    reader = csv.DictReader(file_handle, delimiter=delimiter)
    csv_header = reader.fieldnames
    missing_fields = []
    # Validate the CSV header
    for field in mandatory_fields:
        if field not in csv_header:
            missing_fields.append(field)
    if missing_fields:
        raise RuntimeError('Missing fields in CSV header: {0:s}'.format(
            ','.join(missing_fields)))
    try:
        for row in reader:
            try:
                # normalize datetime to ISO 8601 format if it's not the case.
                parsed_datetime = parser.parse(row['datetime'])
                row['datetime'] = parsed_datetime.isoformat()

                normalized_timestamp = int(
                    time.mktime(parsed_datetime.utctimetuple()) * 1000000)
                normalized_timestamp += parsed_datetime.microsecond
                row['timestamp'] = str(normalized_timestamp)
                if 'tag' in row:
                    row['tag'] = [x for x in _parse_tag_field(row['tag']) if x]

                _scrub_special_tags(row)
            except ValueError:
                continue

            yield row
    except csv.Error as e:
        error_string = 'Unable to read file, with error: {0!s}'.format(e)
        logger.error(error_string)
        raise errors.DataIngestionError(error_string)
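
A short sketch of how the header validation surfaces to the caller; io.StringIO and the sample header are illustrative only. Because the function is a generator, the RuntimeError is raised on first iteration rather than on the call itself:

import io

bad_csv = io.StringIO('message,timestamp_desc\nhello,Test\n')
try:
    list(read_and_validate_csv(bad_csv))
except RuntimeError as e:
    print(e)  # Missing fields in CSV header: datetime
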
Example #3
def read_and_validate_jsonl(file_handle):
    """Generator for reading a JSONL (json lines) file.

    Args:
        file_handle: a file-like object containing the JSONL content.

    Raises:
        RuntimeError: if there are missing fields.
        DataIngestionError: If the ingestion fails.

    Yields:
        A dict that's ready to add to the datastore.
    """
    # Fields that must be present in each entry of the JSONL file.
    mandatory_fields = ["message", "datetime", "timestamp_desc"]
    lineno = 0
    for line in file_handle:
        lineno += 1
        try:
            linedict = json.loads(line)
            ld_keys = linedict.keys()
            if "datetime" not in ld_keys and "timestamp" in ld_keys:
                epoch = int(str(linedict["timestamp"])[:10])
                dt = datetime.datetime.fromtimestamp(epoch)
                linedict["datetime"] = dt.isoformat()
            if "timestamp" not in ld_keys and "datetime" in ld_keys:
                try:
                    linedict["timestamp"] = int(
                        parser.parse(linedict["datetime"]).timestamp() *
                        1000000)
                except parser.ParserError:
                    logger.error(
                        "Unable to parse timestamp, skipping line "
                        "{0:d}".format(lineno),
                        exc_info=True,
                    )
                    continue

            missing_fields = [x for x in mandatory_fields if x not in linedict]
            if missing_fields:
                raise RuntimeError(
                    "Missing field(s) at line {0:n}: {1:s}".format(
                        lineno, ",".join(missing_fields)))

            if "tag" in linedict:
                linedict["tag"] = [
                    x for x in _parse_tag_field(linedict["tag"]) if x
                ]
            _scrub_special_tags(linedict)
            yield linedict

        except ValueError as e:
            raise errors.DataIngestionError(
                "Error parsing JSON at line {0:n}: {1:s}".format(
                    lineno, str(e))) from e
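
A small driver for the 'datetime'-to-'timestamp' back-fill, again assuming the surrounding module imports (json, datetime, and dateutil's parser); the sample line is illustrative:

import io

jsonl = io.StringIO(
    '{"message": "login", "timestamp_desc": "Event Time", '
    '"datetime": "2021-06-12T12:00:00+00:00"}\n')
for event in read_and_validate_jsonl(jsonl):
    # 'timestamp' was derived from 'datetime' in microseconds.
    print(event['timestamp'], event['message'])
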
Example #4
def read_and_validate_jsonl(file_handle):
    """Generator for reading a JSONL (json lines) file.

    Args:
        file_handle: a file-like object containing the JSONL content.

    Raises:
        RuntimeError: if there are missing fields.
        DataIngestionError: If the ingestion fails.

    Yields:
        A dict that's ready to add to the datastore.
    """
    # Fields that must be present in each entry of the JSONL file.
    mandatory_fields = ['message', 'datetime', 'timestamp_desc']
    lineno = 0
    for line in file_handle:
        lineno += 1
        try:
            linedict = json.loads(line)
            ld_keys = linedict.keys()
            if 'datetime' not in ld_keys and 'timestamp' in ld_keys:
                epoch = int(str(linedict['timestamp'])[:10])
                dt = datetime.datetime.fromtimestamp(epoch)
                linedict['datetime'] = dt.isoformat()
            if 'timestamp' not in ld_keys and 'datetime' in ld_keys:
                linedict['timestamp'] = int(
                    parser.parse(linedict['datetime']).timestamp() * 1000000)

            missing_fields = [x for x in mandatory_fields if x not in linedict]
            if missing_fields:
                raise RuntimeError(
                    'Missing field(s) at line {0:n}: {1:s}'.format(
                        lineno, ','.join(missing_fields)))

            if 'tag' in linedict:
                linedict['tag'] = [
                    x for x in _parse_tag_field(linedict['tag']) if x
                ]
            _scrub_special_tags(linedict)
            yield linedict

        except ValueError as e:
            raise errors.DataIngestionError(
                'Error parsing JSON at line {0:n}: {1:s}'.format(
                    lineno, str(e)))
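
The reverse back-fill treats the first ten digits of a numeric 'timestamp' as epoch seconds and reconstructs 'datetime' from them. A sketch with an illustrative line:

import io

jsonl = io.StringIO(
    '{"message": "logout", "timestamp_desc": "Event Time", '
    '"timestamp": 1623499200000000}\n')
for event in read_and_validate_jsonl(jsonl):
    # 'datetime' was rebuilt from the epoch-seconds prefix of 'timestamp'.
    print(event['datetime'], event['message'])
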
Example #5
def read_and_validate_csv(file_handle, delimiter=",", mandatory_fields=None):
    """Generator for reading a CSV file.

    Args:
        file_handle: a file-like object containing the CSV content.
        delimiter: character used as a field separator, default: ','
        mandatory_fields: list of fields that must be present in the CSV header.

    Raises:
        RuntimeError: when there are missing fields.
        DataIngestionError: when there are issues with the data ingestion.
    """
    if not mandatory_fields:
        mandatory_fields = TIMESKETCH_FIELDS
    # Ensures delimiter is a string.
    if not isinstance(delimiter, six.text_type):
        delimiter = codecs.decode(delimiter, "utf8")

    header_reader = pandas.read_csv(file_handle, sep=delimiter, nrows=0)
    _validate_csv_fields(mandatory_fields, header_reader)

    if hasattr(file_handle, "seek"):
        file_handle.seek(0)

    try:
        reader = pandas.read_csv(file_handle,
                                 sep=delimiter,
                                 chunksize=DEFAULT_CHUNK_SIZE)
        for idx, chunk in enumerate(reader):
            skipped_rows = chunk[chunk["datetime"].isnull()]
            if not skipped_rows.empty:
                logger.warning(
                    "{0} rows skipped because the datetime field was "
                    "missing or empty".format(len(skipped_rows)))

            # Normalize datetime to ISO 8601 format if it's not the case.
            try:
                # Rows with an unrecognized datetime format are coerced to
                # "NaT" (not a time) and dropped by the dropna() call below.
                chunk["datetime"] = pandas.to_datetime(chunk["datetime"],
                                                       errors="coerce")
                num_chunk_rows = chunk.shape[0]
                chunk.dropna(subset=["datetime"], inplace=True)
                if len(chunk) < num_chunk_rows:
                    logger.warning(
                        "{0} rows dropped from Rows {1} to {2} due to invalid "
                        "datetime values".format(
                            num_chunk_rows - len(chunk),
                            idx * reader.chunksize,
                            idx * reader.chunksize + num_chunk_rows,
                        ))
                chunk["timestamp"] = chunk["datetime"].dt.strftime(
                    "%s%f").astype(int)
                chunk["datetime"] = (chunk["datetime"].apply(
                    Timestamp.isoformat).astype(str))
            except ValueError:
                logger.warning("Rows {0} to {1} skipped due to malformed "
                               "datetime values ".format(
                                   idx * reader.chunksize,
                                   idx * reader.chunksize + chunk.shape[0]))
                continue
            if "tag" in chunk:
                chunk["tag"] = chunk["tag"].apply(_parse_tag_field)

            for _, row in chunk.iterrows():
                _scrub_special_tags(row)
                # Remove all NAN values from the pandas.Series.
                row.dropna(inplace=True)
                yield row.to_dict()
    except (pandas.errors.EmptyDataError, pandas.errors.ParserError) as e:
        error_string = "Unable to read file, with error: {0!s}".format(e)
        logger.error(error_string)
        raise errors.DataIngestionError(error_string) from e
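
A minimal driver for this variant, which yields plain dicts rather than pandas.Series; the path is a placeholder and index_event is a hypothetical downstream sink:

with open('timeline.csv', 'r', encoding='utf-8') as fh:
    for event in read_and_validate_csv(fh):
        # Unparseable datetimes were coerced to NaT and dropped, and
        # NaN-valued columns were removed before the dict conversion.
        index_event(event)  # index_event is a hypothetical sink
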