Example No. 1
class TestTextParser(text_parser.SlowLexicalTextParser):
    """Implement a text parser object that can successfully parse a text file.

  To achieve that, a single callback function has to be implemented:
  SetDate.
  """
    NAME = 'test_text'

    tokens = [
        lexer.Token('INITIAL', r'^([\d\/]+) ', 'SetDate', 'TIME'),
        lexer.Token('TIME', r'([0-9:\.]+) ', 'SetTime', 'STRING_HOST'),
        lexer.Token('STRING_HOST', r'([^\-]+)- ', 'ParseStringHost', 'STRING'),
        lexer.Token('STRING', '([^\n]+)', 'ParseString', ''),
        lexer.Token('STRING', '\n', 'ParseMessage', 'INITIAL')
    ]

    def ParseStringHost(self, match, **_):
        user, host = match.group(1).split(':')
        self.attributes['hostname'] = host
        self.attributes['username'] = user

    def SetDate(self, match, **_):
        month, day, year = match.group(1).split('/')
        self.attributes['imonth'] = int(month)
        self.attributes['iyear'] = int(year)
        self.attributes['iday'] = int(day)

    def Scan(self, unused_file_entry):
        pass

    def CreateEvent(self, timestamp, offset, attributes):
        event_object = TestTextEvent(timestamp, 0, attributes)
        event_object.offset = offset
        return event_object
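
For reference, a standalone sketch of what the SetDate and ParseStringHost callbacks above compute for a hypothetical input line such as 06/24/2012 13:14:15 user:host- this is a message (the remaining callbacks, SetTime, ParseString and ParseMessage, are inherited from SlowLexicalTextParser, shown in later examples):

attributes = {}

date_field = u'06/24/2012'  # captured by the INITIAL token
month, day, year = date_field.split(u'/')
attributes[u'imonth'] = int(month)
attributes[u'iyear'] = int(year)
attributes[u'iday'] = int(day)

host_field = u'user:host'  # captured by the STRING_HOST token
user, host = host_field.split(u':')
attributes[u'hostname'] = host
attributes[u'username'] = user

print(attributes)
# {u'imonth': 6, u'iyear': 2012, u'iday': 24,
#  u'hostname': u'host', u'username': u'user'}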
Example No. 2
class TestTextParser(text_parser.SlowLexicalTextParser):
    """Implement a text parser object that can successfully parse a text file.

  To achieve that, a single callback function has to be implemented:
  SetDate.
  """
    NAME = u'test_text'

    tokens = [
        lexer.Token(u'INITIAL', r'^([\d\/]+) ', u'SetDate', u'TIME'),
        lexer.Token(u'TIME', r'([0-9:\.]+) ', u'SetTime', u'STRING_HOST'),
        lexer.Token(u'STRING_HOST', r'([^\-]+)- ', u'ParseStringHost',
                    u'STRING'),
        lexer.Token(u'STRING', r'([^\n]+)', u'ParseString', u''),
        lexer.Token(u'STRING', r'\n', u'ParseMessage', u'INITIAL')
    ]

    def CreateEvent(self, timestamp, offset, attributes):
        """Creates an event.

    Args:
      timestamp: the timestamp which is an integer containing the number
                 of micro seconds since January 1, 1970, 00:00:00 UTC.
      offset: an integer containing the offset.
      attributes: a dictionary containing the event attributes.

    Returns:
      An event object (instance of EventObject).
    """
        event_object = TestTextEvent(timestamp, 0, attributes)
        event_object.offset = offset
        return event_object

    def ParseStringHost(self, match, **_):
        """Parses a string containing an username and hostname.

    Args:
      match: a regular expression match.
    """
        user, host = match.group(1).split(u':')
        self.attributes[u'hostname'] = host
        self.attributes[u'username'] = user

    def SetDate(self, match, **_):
        """Parses a date string.

    Args:
      match: a regular expression match.
    """
        month, day, year = match.group(1).split(u'/')
        self.attributes[u'imonth'] = int(month)
        self.attributes[u'iyear'] = int(year)
        self.attributes[u'iday'] = int(day)
Example No. 3
class PathReplacer(lexer.Lexer):
    """Replace path variables with values gathered from earlier preprocessing."""

    tokens = [
        lexer.Token('.', '{{([^}]+)}}', 'ReplaceVariable', ''),
        lexer.Token('.', '{([^}]+)}', 'ReplaceString', ''),
        lexer.Token('.', '([^{])', 'ParseString', ''),
    ]

    def __init__(self, pre_obj, data=''):
        """Constructor for a path replacer."""
        super(PathReplacer, self).__init__(data)
        self._path = []
        self._pre_obj = pre_obj

    def GetPath(self):
        """Run the lexer and replace path."""
        while True:
            _ = self.NextToken()
            if self.Empty():
                break

        return u''.join(self._path)

    def ParseString(self, match, **_):
        """Append a string to the path."""
        self._path.append(match.group(1))

    def ReplaceVariable(self, match, **_):
        """Replace a string that should not be a variable."""
        self._path.append(u'{{{0:s}}}'.format(match.group(1)))

    def ReplaceString(self, match, **_):
        """Replace a variable with a given attribute."""
        replace = getattr(self._pre_obj, match.group(1), None)

        if replace:
            self._path.append(replace)
        else:
            raise errors.PathNotFound(
                u'Path variable: {} not discovered yet.'.format(
                    match.group(1)))
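
A minimal usage sketch, assuming the surrounding lexer and errors modules are importable; the preprocessing object and its sysregistry attribute are hypothetical:

class FakePreObj(object):
  """Hypothetical preprocessing object for the sketch."""
  sysregistry = u'/Windows/System32/config'

replacer = PathReplacer(FakePreObj(), data=u'{sysregistry}/SOFTWARE')
print(replacer.GetPath())  # /Windows/System32/config/SOFTWARE

# An attribute that has not been discovered yet raises errors.PathNotFound.
replacer = PathReplacer(FakePreObj(), data=u'{unknown}/SOFTWARE')
replacer.GetPath()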
Example No. 4
class SyslogParser(text_parser.SlowLexicalTextParser):
  """Parse text based syslog files."""

  NAME = u'syslog'
  DESCRIPTION = u'Parser for syslog files.'

  # TODO: can we change this, similar to SQLite, where we create an
  # event-specific object for different lines using a callback function?
  # Define the tokens that make up the structure of a syslog file.
  tokens = [
      lexer.Token(
          u'INITIAL', u'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) ',
          u'SetMonth', u'DAY'),
      lexer.Token(u'DAY', r'\s?(\d{1,2})\s+', u'SetDay', u'TIME'),
      lexer.Token(u'TIME', r'([0-9:\.]+) ', u'SetTime', u'STRING_HOST'),
      lexer.Token(u'STRING_HOST', r'^--(-)', u'ParseHostname', u'STRING'),
      lexer.Token(
          u'STRING_HOST', r'([^\s]+) ', u'ParseHostname', u'STRING_PID'),
      lexer.Token(u'STRING_PID', r'([^\:\n]+)', u'ParsePid', u'STRING'),
      lexer.Token(u'STRING', r'([^\n]+)', u'ParseString', u''),
      lexer.Token(u'STRING', r'\n\t', None, u''),
      lexer.Token(u'STRING', r'\t', None, u''),
      lexer.Token(u'STRING', r'\n', u'ParseMessage', u'INITIAL'),
      lexer.Token(u'.', r'([^\n]+)\n', u'ParseIncomplete', u'INITIAL'),
      lexer.Token(u'.', r'\n[^\t]', u'ParseIncomplete', u'INITIAL'),
      lexer.Token(u'S[.]+', r'(.+)', u'ParseString', u''),
      ]

  def __init__(self):
    """Initializes a syslog parser object."""
    super(SyslogParser, self).__init__(local_zone=True)
    # Set the initial year to 0 (fixed in the actual Parse method)
    self._year_use = 0
    self._last_month = 0

    # Set some additional attributes.
    self.attributes[u'reporter'] = u''
    self.attributes[u'pid'] = u''

  def ParseLine(self, parser_mediator):
    """Parse a single line from the syslog file.

    This method extends the one from TextParser slightly, adding
    the context of the reporter and pid values found inside syslog
    files.

    Args:
      parser_mediator: A parser mediator object (instance of ParserMediator).
    """
    if not self._year_use:
      self._year_use = parser_mediator.GetEstimatedYear()

    month_compare = int(self.attributes[u'imonth'])
    if month_compare and self._last_month > month_compare:
      self._year_use += 1

    self._last_month = int(self.attributes[u'imonth'])

    self.attributes[u'iyear'] = self._year_use

    super(SyslogParser, self).ParseLine(parser_mediator)

  def ParseHostname(self, match=None, **unused_kwargs):
    """Parses the hostname.

       This is a callback function for the text parser (lexer) and is
       called by the STRING_HOST lexer state.

    Args:
      match: The regular expression match object.
    """
    self.attributes[u'hostname'] = match.group(1)

  def ParsePid(self, match=None, **unused_kwargs):
    """Parses the process identifier (PID).

       This is a callback function for the text parser (lexer) and is
       called by the STRING_PID lexer state.

    Args:
      match: The regular expression match object.
    """
    # TODO: Change this logic and rather add more Tokens that
    # fully cover all variations of the various PID stages.
    line = match.group(1)
    if line[-1] == ']':
      splits = line.split(u'[')
      if len(splits) == 2:
        self.attributes[u'reporter'], pid = splits
      else:
        pid = splits[-1]
        self.attributes[u'reporter'] = u'['.join(splits[:-1])
      try:
        self.attributes[u'pid'] = int(pid[:-1])
      except ValueError:
        self.attributes[u'pid'] = 0
    else:
      self.attributes[u'reporter'] = line

  def ParseString(self, match=None, **unused_kwargs):
    """Parses a (body text) string.

       This is a callback function for the text parser (lexer) and is
       called by the STRING lexer state.

    Args:
      match: The regular expression match object.
    """
    self.attributes[u'body'] += utils.GetUnicodeString(match.group(1))

  def PrintLine(self):
    """Prints a log line."""
    self.attributes[u'iyear'] = 2012
    return super(SyslogParser, self).PrintLine()

  # TODO: this is a rough initial implementation to get this working.
  def CreateEvent(self, timestamp, offset, attributes):
    """Creates a syslog line event.

       This overrides the default function in TextParser to create
       syslog line events instead of text events.

    Args:
      timestamp: The timestamp time value. The timestamp contains the
                 number of microseconds since Jan 1, 1970 00:00:00 UTC.
      offset: The offset of the event.
      attributes: A dict that contains the events attributes.

    Returns:
      A text event (SyslogLineEvent).
    """
    return SyslogLineEvent(timestamp, offset, attributes)
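
The month comparison in ParseLine implements a year rollover heuristic: syslog lines carry no year, so when the month value decreases between consecutive lines (December followed by January) the estimated year is incremented. A standalone sketch of that check:

def UpdateYear(year, last_month, month):
  """Sketch of the rollover check performed in ParseLine above."""
  if month and last_month > month:
    year += 1
  return year

print(UpdateYear(2016, last_month=12, month=1))  # 2017
print(UpdateYear(2016, last_month=3, month=4))   # 2016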
Example No. 5
class SELinuxParser(text_parser.SlowLexicalTextParser):
    """Parse SELinux audit log files."""

    NAME = 'selinux'
    DESCRIPTION = u'Parser for SELinux audit log files.'

    PID_RE = re.compile(r'pid=([0-9]+)[\s]+', re.DOTALL)

    tokens = [
        # Skipping empty lines, both EOLs are considered here and in other states.
        lexer.Token('INITIAL', r'^\r?\n', '', ''),
        # FSM entry point ('type=anything msg=audit'), critical to recognize a
        # SELinux audit file and used to retrieve the audit type. From there
        # two outcomes are possible: the TIMESTAMP state or failure, since the
        # TIMESTAMP state is required. An empty type is not accepted and will
        # cause a failure.
        # Examples:
        #   type=SYSCALL msg=audit(...): ...
        #   type=UNKNOWN[1323] msg=audit(...): ...
        lexer.Token('INITIAL', r'^type=([\w]+(\[[0-9]+\])?)[ \t]+msg=audit',
                    'ParseType', 'TIMESTAMP'),
        lexer.Token('TIMESTAMP', r'\(([0-9]+)\.([0-9]+):([0-9]*)\):',
                    'ParseTime', 'STRING'),
        # Get the log entry description and stay in the same state.
        lexer.Token('STRING', r'[ \t]*([^\r\n]+)', 'ParseString', ''),
        # Entry parsed. Note that an empty description is handled and will not
        # raise a parsing failure.
        lexer.Token('STRING', r'[ \t]*\r?\n', 'ParseMessage', 'INITIAL'),
        # The entry is not formatted as expected, so the parsing failed.
        lexer.Token('.', '([^\r\n]+)\r?\n', 'ParseFailed', 'INITIAL')
    ]

    def __init__(self):
        """Initializes a parser object."""
        # Set local_zone to false, since timestamps are UTC.
        super(SELinuxParser, self).__init__(local_zone=False)
        self.attributes = {u'audit_type': '', u'pid': '', u'body': ''}
        self.timestamp = 0

    def ParseType(self, match=None, **unused_kwargs):
        """Parse the audit event type.

    Args:
      match: The regular expression match object.
    """
        self.attributes[u'audit_type'] = match.group(1)

    def ParseTime(self, match=None, **unused_kwargs):
        """Parse the log timestamp.

    Args:
      match: The regular expression match object.
    """
        # TODO: do something with match.group(3) ?
        try:
            number_of_seconds = int(match.group(1), 10)
            timestamp = timelib.Timestamp.FromPosixTime(number_of_seconds)
            timestamp += int(match.group(2), 10) * 1000
            self.timestamp = timestamp
        except ValueError as exception:
            logging.error(
                u'Unable to retrieve timestamp with error: {0:s}'.format(
                    exception))
            self.timestamp = 0
            raise lexer.ParseError(u'Not a valid timestamp.')

    def ParseString(self, match=None, **unused_kwargs):
        """Add a string to the body attribute.

    This method extends the one from TextParser slightly,
    searching for the 'pid=[0-9]+' value inside the message body.

    Args:
      match: The regular expression match object.
    """
        try:
            self.attributes[u'body'] += match.group(1)
            # TODO: fix it using lexer or remove pid parsing.
            # This is something the lexer could manage, but the 'pid' field is
            # not positional, so handling it here keeps the FSM simpler. The
            # TODO is left as a reminder of possible refactoring.
            pid_search = self.PID_RE.search(self.attributes[u'body'])
            if pid_search:
                self.attributes[u'pid'] = pid_search.group(1)
        except IndexError:
            self.attributes[u'body'] += match.group(0).strip(u'\n')

    def ParseFailed(self, **unused_kwargs):
        """Entry parsing failed callback."""
        raise lexer.ParseError(u'Unable to parse SELinux log line.')

    def ParseLine(self, parser_mediator):
        """Parse a single line from the SELinux audit file.

    This method extends the one from TextParser slightly, creating a
    SELinux event with the timestamp (UTC) taken from log entries.

    Args:
      parser_mediator: A parser mediator object (instance of ParserMediator).
    """
        if not self.timestamp:
            raise errors.TimestampNotCorrectlyFormed(
                u'Unable to parse entry, timestamp not defined.')

        offset = getattr(self, u'entry_offset', 0)
        event_object = SELinuxLineEvent(self.timestamp, offset,
                                        self.attributes)
        parser_mediator.ProduceEvent(event_object)
        self.timestamp = 0
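
A standalone sketch of the PID extraction and the microsecond arithmetic in ParseTime, using a hypothetical audit line; FromPosixTime is assumed to return microseconds, as the msec * 1000 addition implies:

import re

PID_RE = re.compile(r'pid=([0-9]+)[\s]+', re.DOTALL)

body = u'audit(1105758604.519:420): arch=40000003 syscall=188 pid=5962 comm="less"'
pid_search = PID_RE.search(body)
print(pid_search.group(1))  # 5962

# (1105758604.519:420) -> 1105758604 seconds and 519 milliseconds; ParseTime
# stores the total in microseconds.
timestamp = 1105758604 * 1000000 + 519 * 1000
print(timestamp)  # 1105758604519000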
Example No. 6
class Parser(lexer.SearchParser):
    """Parses and generates an AST for a query written in the described language.

  Examples of valid syntax:
    size is 40
    (name contains "Program Files" AND hash.md5 is "123abc")
    @imported_modules (num_symbols = 14 AND symbol.name is "FindWindow")
  """
    expression_cls = BasicExpression
    binary_expression_cls = BinaryExpression
    context_cls = ContextExpression

    tokens = [
        # Operators and related tokens
        lexer.Token('INITIAL', r'\@[\w._0-9]+', 'ContextOperator,PushState',
                    'CONTEXTOPEN'),
        lexer.Token('INITIAL', r'[^\s\(\)]', 'PushState,PushBack',
                    'ATTRIBUTE'),
        lexer.Token('INITIAL', r'\(', 'PushState,BracketOpen', None),
        lexer.Token('INITIAL', r'\)', 'BracketClose', 'BINARY'),

        # Context
        lexer.Token('CONTEXTOPEN', r'\(', 'BracketOpen', 'INITIAL'),

        # Double quoted string
        lexer.Token('STRING', '"', 'PopState,StringFinish', None),
        lexer.Token('STRING', r'\\x(..)', 'HexEscape', None),
        lexer.Token('STRING', r'\\(.)', 'StringEscape', None),
        lexer.Token('STRING', r'[^\\"]+', 'StringInsert', None),

        # Single quoted string
        lexer.Token('SQ_STRING', '\'', 'PopState,StringFinish', None),
        lexer.Token('SQ_STRING', r'\\x(..)', 'HexEscape', None),
        lexer.Token('SQ_STRING', r'\\(.)', 'StringEscape', None),
        lexer.Token('SQ_STRING', r'[^\\\']+', 'StringInsert', None),

        # Basic expression
        lexer.Token('ATTRIBUTE', r'[\w._0-9]+', 'StoreAttribute', 'OPERATOR'),
        lexer.Token('OPERATOR', r'not ', 'FlipLogic', None),
        lexer.Token('OPERATOR', r'(\w+|[<>!=]=?)', 'StoreOperator',
                    'CHECKNOT'),
        lexer.Token('CHECKNOT', r'not', 'FlipLogic', 'ARG'),
        lexer.Token('CHECKNOT', r'\s+', None, None),
        lexer.Token('CHECKNOT', r'([^not])', 'PushBack', 'ARG'),
        lexer.Token('ARG', r'(\d+\.\d+)', 'InsertFloatArg', 'ARG'),
        lexer.Token('ARG', r'(0x\d+)', 'InsertInt16Arg', 'ARG'),
        lexer.Token('ARG', r'(\d+)', 'InsertIntArg', 'ARG'),
        lexer.Token('ARG', '"', 'PushState,StringStart', 'STRING'),
        lexer.Token('ARG', '\'', 'PushState,StringStart', 'SQ_STRING'),
        # When the last parameter from arg_list has been pushed

        # State where binary operators are supported (AND, OR)
        lexer.Token('BINARY', r'(?i)(and|or|\&\&|\|\|)', 'BinaryOperator',
                    'INITIAL'),
        # - We can also skip spaces
        lexer.Token('BINARY', r'\s+', None, None),
        # - But if it's not "and" or just spaces we have to go back
        lexer.Token('BINARY', '.', 'PushBack,PopState', None),

        # Skip whitespace.
        lexer.Token('.', r'\s+', None, None),
    ]

    def StoreAttribute(self, string='', **kwargs):
        """Stores the attribute name and resets the 'not' flag."""
        self.flipped = False
        super(Parser, self).StoreAttribute(string, **kwargs)

    def FlipAllowed(self):
        """Raise an error if the not keyword is used where it is not allowed."""
        if not hasattr(self, 'flipped'):
            raise errors.ParseError(u'Not defined.')

        if not self.flipped:
            return

        if self.current_expression.operator:
            if not self.current_expression.operator.lower() in (
                    'is', 'contains', 'inset', 'equals'):
                raise errors.ParseError(
                    u'Keyword \'not\' does not work against operator: {0:s}'.
                    format(self.current_expression.operator))

    def FlipLogic(self, **unused_kwargs):
        """Flip the boolean logic of the expression.

    If an expression is configured to return True when the condition
    is met this logic will flip that to False, and vice versa.
    """
        if hasattr(self, 'flipped') and self.flipped:
            raise errors.ParseError(
                u'The operator \'not\' can only be expressed once.')

        if self.current_expression.args:
            raise errors.ParseError(
                u'Unable to place the keyword \'not\' after an argument.')

        self.flipped = True

        # Check if this flip operation should be allowed.
        self.FlipAllowed()

        if hasattr(self.current_expression, 'FlipBool'):
            self.current_expression.FlipBool()
            logging.debug(u'Negative matching [flipping boolean logic].')
        else:
            logging.warning(
                u'Unable to perform a negative match, issuing a positive one.')

    def InsertArg(self, string='', **unused_kwargs):
        """Insert an arg to the current expression."""
        # Note that "string" is not necessarily of type string.
        logging.debug(u'Storing argument: {0!s}'.format(string))

        # Check if this flip operation should be allowed.
        self.FlipAllowed()

        # This expression is complete
        if self.current_expression.AddArg(string):
            self.stack.append(self.current_expression)
            self.current_expression = self.expression_cls()
            # We go to the BINARY state, to find if there's an AND or OR operator
            return 'BINARY'

    def InsertFloatArg(self, string='', **unused_kwargs):
        """Inserts a Float argument."""
        try:
            float_value = float(string)
        except (TypeError, ValueError):
            raise errors.ParseError(
                u'{0:s} is not a valid float.'.format(string))
        return self.InsertArg(float_value)

    def InsertIntArg(self, string='', **unused_kwargs):
        """Inserts an Integer argument."""
        try:
            int_value = int(string)
        except (TypeError, ValueError):
            raise errors.ParseError(
                u'{0:s} is not a valid integer.'.format(string))
        return self.InsertArg(int_value)

    def InsertInt16Arg(self, string='', **unused_kwargs):
        """Inserts an Integer in base16 argument."""
        try:
            int_value = int(string, 16)
        except (TypeError, ValueError):
            raise errors.ParseError(
                u'{0:s} is not a valid base16 integer.'.format(string))
        return self.InsertArg(int_value)

    def StringFinish(self, **unused_kwargs):
        """Dispatches a completed string based on the current state."""
        if self.state == 'ATTRIBUTE':
            return self.StoreAttribute(string=self.string)

        elif self.state == 'ARG':
            return self.InsertArg(string=self.string)

    def StringEscape(self, string, match, **unused_kwargs):
        """Escape backslashes found inside a string quote.

    Backslashes followed by anything other than [\'"rnbt.ws] will raise
    an Error.

    Args:
      string: The string that matched.
      match: the match object (instance of re.MatchObject).
             Where match.group(1) contains the escaped code.

    Raises:
      ParseError: When the escaped character is not one of [\'"rnbt.ws].
    """
        if match.group(1) in '\\\'"rnbt\\.ws':
            self.string += string.decode('string_escape')
        else:
            raise errors.ParseError(
                u'Invalid escape character {0:s}.'.format(string))

    def HexEscape(self, string, match, **unused_kwargs):
        """Converts a hex escaped string."""
        logging.debug(u'HexEscape matched {0:s}.'.format(string))
        hex_string = match.group(1)
        try:
            self.string += binascii.unhexlify(hex_string)
        except TypeError:
            raise errors.ParseError(
                u'Invalid hex escape {0:s}.'.format(string))

    def ContextOperator(self, string='', **unused_kwargs):
        """Pushes a context expression onto the stack."""
        self.stack.append(self.context_cls(string[1:]))

    def Reduce(self):
        """Reduce the token stack into an AST."""
        # Check for sanity
        if self.state != 'INITIAL' and self.state != 'BINARY':
            self.Error(u'Premature end of expression')

        length = len(self.stack)
        while length > 1:
            # Precedence order
            self._CombineParenthesis()
            self._CombineBinaryExpressions('and')
            self._CombineBinaryExpressions('or')
            self._CombineContext()

            # No change
            if len(self.stack) == length:
                break
            length = len(self.stack)

        if length != 1:
            self.Error(u'Illegal query expression')

        return self.stack[0]

    def Error(self, message=None, _=None):
        """Raises a ParseError with the message and the parser position."""
        # Note that none of the values are necessarily strings.
        raise errors.ParseError(
            u'{0!s} in position {1!s}: {2!s} <----> {3!s} )'.format(
                message, len(self.processed_buffer), self.processed_buffer,
                self.buffer))

    def _CombineBinaryExpressions(self, operator):
        """Combines adjacent expressions around the given binary operator."""
        for i in range(1, len(self.stack) - 1):
            item = self.stack[i]
            if (isinstance(item, lexer.BinaryExpression)
                    and item.operator.lower() == operator.lower()
                    and isinstance(self.stack[i - 1], lexer.Expression)
                    and isinstance(self.stack[i + 1], lexer.Expression)):
                lhs = self.stack[i - 1]
                rhs = self.stack[i + 1]

                self.stack[i].AddOperands(lhs, rhs)
                self.stack[i - 1] = None
                self.stack[i + 1] = None

        self.stack = filter(None, self.stack)

    def _CombineContext(self):
        """Merges context expressions with the expression that follows them."""
        # Context can merge from item 0.
        for i in range(len(self.stack) - 1, 0, -1):
            item = self.stack[i - 1]
            if (isinstance(item, ContextExpression)
                    and isinstance(self.stack[i], lexer.Expression)):
                expression = self.stack[i]
                self.stack[i - 1].SetExpression(expression)
                self.stack[i] = None

        self.stack = filter(None, self.stack)
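
The escape handlers above rely on Python 2 semantics: StringEscape decodes via the string_escape codec and HexEscape expands the two hex digits captured by r'\\x(..)' with binascii.unhexlify, reporting the TypeError raised on invalid digits as a ParseError. A minimal sketch of the hex path:

import binascii

print(binascii.unhexlify('41'))  # A

try:
  binascii.unhexlify('4z')       # not a valid hex digit pair
except TypeError as exception:
  print(u'Invalid hex escape: {0!s}'.format(exception))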
Example No. 7
class SelectiveLexer(lexer.Lexer):
  """Selective filter lexer implementation.

  The selective (or dynamic) filter allows the construction of filter
  expressions like:
    SELECT field_a, field_b WHERE attribute contains 'text'
  """

  tokens = [
      lexer.Token('INITIAL', r'SELECT', '', 'FIELDS'),
      lexer.Token('FIELDS', r'(.+) WHERE ', 'SetFields', 'FILTER'),
      lexer.Token('FIELDS', r'(.+) LIMIT', 'SetFields', 'LIMIT_END'),
      lexer.Token('FIELDS', r'(.+) SEPARATED BY', 'SetFields', 'SEPARATE'),
      lexer.Token('FIELDS', r'(.+)$', 'SetFields', 'END'),
      lexer.Token('FILTER', r'(.+) SEPARATED BY', 'SetFilter', 'SEPARATE'),
      lexer.Token('FILTER', r'(.+) LIMIT', 'SetFilter', 'LIMIT_END'),
      lexer.Token('FILTER', r'(.+)$', 'SetFilter', 'END'),
      lexer.Token('SEPARATE', r' ', '', ''),  # Ignore white space here.
      lexer.Token('SEPARATE', r'LIMIT', '', 'LIMIT_END'),
      lexer.Token(
          'SEPARATE', r'[\'"]([^ \'"]+)[\'"] LIMIT', 'SetSeparator',
          'LIMIT_END'),
      lexer.Token(
          'SEPARATE', r'[\'"]([^ \'"]+)[\'"]$', 'SetSeparator', 'END'),
      lexer.Token(
          'SEPARATE', r'(.+)$', 'SetSeparator', 'END'),
      lexer.Token(
          'LIMIT_END', r'SEPARATED BY [\'"]([^\'"]+)[\'"]', 'SetSeparator', ''),
      lexer.Token('LIMIT_END', r'(.+) SEPARATED BY', 'SetLimit', 'SEPARATE'),
      lexer.Token('LIMIT_END', r'(.+)$', 'SetLimit', 'END')]

  def __init__(self, data=''):
    """Initializes a selective lexer object.

    Args:
      data: optional initial data to be processed by the lexer.
    """
    super(SelectiveLexer, self).__init__(data=data)
    self.fields = []
    self.limit = 0
    self.lex_filter = None
    self.separator = u','

  def SetFields(self, match, **unused_kwargs):
    """Sets the output fields.

    The output fields is the part of the filter expression directly following
    the SELECT statement.

    Args:
      match: the match object (instance of re.MatchObject) that contains the
             output field names.
    """
    text = match.group(1).lower()
    field_text, _, _ = text.partition(' from ')

    use_field_text = field_text.replace(' ', '')
    if ',' in use_field_text:
      self.fields = use_field_text.split(',')
    else:
      self.fields = [use_field_text]

  def SetFilter(self, match, **unused_kwargs):
    """Set the filter query.

    The filter query is the part of the filter expression directly following
    the WHERE statement.

    Args:
      match: the match object (instance of re.MatchObject) that contains the
             filter query.
    """
    filter_match = match.group(1)
    if 'LIMIT' in filter_match:
      # This only occurs in the case where we have "LIMIT X SEPARATED BY".
      self.lex_filter, _, push_back = filter_match.rpartition('LIMIT')
      self.PushBack('LIMIT {0:s} SEPARATED BY '.format(push_back))
    else:
      self.lex_filter = filter_match

  def SetLimit(self, match, **unused_kwargs):
    """Sets the row limit.

    Args:
      match: the match object (instance of re.MatchObject) that contains the
             row limit.
    """
    try:
      limit = int(match.group(1))
    except ValueError:
      self.Error('Invalid limit value, should be int [{}] = {}'.format(
          type(match.group(1)), match.group(1)))
      limit = 0

    self.limit = limit

  def SetSeparator(self, match, **unused_kwargs):
    """Sets the output field separator.

    Args:
      match: the match object (instance of re.MatchObject) that contains the
             output field separator. Note that only the first character is used.
    """
    separator = match.group(1)
    if separator:
      self.separator = separator[0]
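
A minimal usage sketch, assuming the lexer base class is importable and that tokens are tried in the listed order; the query text is hypothetical:

lex = SelectiveLexer(
    data=u'SELECT date, time WHERE message contains "fail" LIMIT 100')
while not lex.Empty():
  lex.NextToken()

print(lex.fields)      # [u'date', u'time']
print(lex.lex_filter)  # message contains "fail"
print(lex.limit)       # 100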
Example No. 8
class SlowLexicalTextParser(interface.BaseParser, lexer.SelfFeederMixIn):
    """Generic text based parser that uses lexer to assist with parsing.

  This text parser is based on a rather slow lexer, which makes the
  use of this interface highly discouraged. Parsers that already
  implement it will most likely all be rewritten to support faster
  text parsing implementations.

  This text based parser needs to be extended to provide an accurate
  list of tokens that define the structure of the log file that the
  parser is designed for.
  """

    # Define the max number of lines before we determine this is
    # not the correct parser.
    MAX_LINES = 15

    # List of tokens that describe the structure of the log file.
    tokens = [
        lexer.Token('INITIAL', '(.+)\n', 'ParseString', ''),
    ]

    def __init__(self, local_zone=True):
        """Constructor for the SlowLexicalTextParser.

    Args:
      local_zone: A boolean value that determines if the entries
                  in the log file are stored in the local time
                  zone of the computer that stored it or in a fixed
                  timezone, like UTC.
    """
        # TODO: remove the multiple inheritance.
        lexer.SelfFeederMixIn.__init__(self)
        interface.BaseParser.__init__(self)
        self.line_ready = False
        self.attributes = {
            'body': '',
            'iyear': 0,
            'imonth': 0,
            'iday': 0,
            'time': '',
            'hostname': '',
            'username': '',
        }
        self.local_zone = local_zone
        self.file_entry = None

    def ClearValues(self):
        """Clears all the values inside the attributes dict.

    All attributes whose names start with the letter 'i' are considered
    to be integers, otherwise a string value is assumed.
    """
        self.line_ready = False
        for attr in self.attributes:
            if attr[0] == 'i':
                self.attributes[attr] = 0
            else:
                self.attributes[attr] = ''

    def ParseIncomplete(self, match=None, **unused_kwargs):
        """Indication that we've got a partial line to match against.

    Args:
      match: The regular expression match object.
    """
        self.attributes['body'] += match.group(0)
        self.line_ready = True

    def ParseMessage(self, **unused_kwargs):
        """Signal that a line is ready to be parsed."""
        self.line_ready = True

    def SetMonth(self, match=None, **unused_kwargs):
        """Parses the month.

       This is a callback function for the text parser (lexer) and is
       called by the corresponding lexer state.

    Args:
      match: The regular expression match object.
    """
        self.attributes['imonth'] = int(
            timelib.MONTH_DICT.get(match.group(1).lower(), 1))

    def SetDay(self, match=None, **unused_kwargs):
        """Parses the day of the month.

       This is a callback function for the text parser (lexer) and is
       called by the corresponding lexer state.

    Args:
      match: The regular expression match object.
    """
        self.attributes['iday'] = int(match.group(1))

    def SetTime(self, match=None, **unused_kwargs):
        """Set the time attribute.

    Args:
      match: The regular expression match object.
    """
        self.attributes['time'] = match.group(1)

    def SetYear(self, match=None, **unused_kwargs):
        """Parses the year.

       This is a callback function for the text parser (lexer) and is
       called by the corresponding lexer state.

    Args:
      match: The regular expression match object.
    """
        self.attributes['iyear'] = int(match.group(1))

    def Parse(self, parser_context, file_entry):
        """Extract data from a text file.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
        path_spec_printable = u'{0:s}:{1:s}'.format(
            file_entry.path_spec.type_indicator, file_entry.name)
        file_object = file_entry.GetFileObject()

        self.file_entry = file_entry
        # TODO: this is necessary since we inherit from lexer.SelfFeederMixIn.
        self.file_object = file_object

        # Before we proceed any further, start by checking whether this is a
        # text file or not.
        file_object.seek(0, os.SEEK_SET)
        if not utils.IsText(file_object.read(40)):
            raise errors.UnableToParseFile(
                u'Not a text file, unable to proceed.')

        file_object.seek(0, os.SEEK_SET)

        error_count = 0
        file_verified = False
        # We need to clear out a few values in the lexer before continuing;
        # there might be leftovers from a previous run.
        self.error = 0
        self.buffer = ''

        while True:
            _ = self.NextToken()

            if self.state == 'INITIAL':
                self.entry_offset = getattr(self, 'next_entry_offset', 0)
                self.next_entry_offset = file_object.tell() - len(self.buffer)

            if not file_verified and self.error >= self.MAX_LINES * 2:
                logging.debug(
                    u'Lexer error count: {0:d} and current state {1:s}'.format(
                        self.error, self.state))
                file_object.close()
                raise errors.UnableToParseFile(
                    u'[{0:s}] unsupported file: {1:s}.'.format(
                        self.NAME, path_spec_printable))

            if self.line_ready:
                try:
                    event_object = self.ParseLine(parser_context)
                    parser_context.ProduceEvent(event_object,
                                                parser_name=self.NAME,
                                                file_entry=file_entry)

                    file_verified = True

                except errors.TimestampNotCorrectlyFormed as exception:
                    error_count += 1
                    if file_verified:
                        logging.debug(
                            u'[{0:s} VERIFIED] Error count: {1:d} and ERROR: {2:d}'
                            .format(path_spec_printable, error_count,
                                    self.error))
                        logging.warning(
                            u'[{0:s}] Unable to parse timestamp with error: {1:s}'
                            .format(self.NAME, exception))

                    else:
                        logging.debug((
                            u'[{0:s} EVALUATING] Error count: {1:d} and ERROR: '
                            u'{2:d})').format(path_spec_printable, error_count,
                                              self.error))

                        if error_count >= self.MAX_LINES:
                            file_object.close()
                            raise errors.UnableToParseFile(
                                u'[{0:s}] unsupported file: {1:s}.'.format(
                                    self.NAME, path_spec_printable))

                finally:
                    self.ClearValues()

            if self.Empty():
                # Try to fill the buffer to prevent the parser from ending prematurely.
                self.Feed()

            if self.Empty():
                break

        if not file_verified:
            file_object.close()
            raise errors.UnableToParseFile(
                u'[{0:s}] unable to parse file: {1:s}.'.format(
                    self.NAME, path_spec_printable))

        file_offset = file_object.get_offset()
        if file_offset < file_object.get_size():
            logging.error(
                (u'{0:s} prematurely terminated parsing: {1:s} at offset: '
                 u'0x{2:08x}.').format(self.NAME, path_spec_printable,
                                       file_offset))
        file_object.close()

    def ParseString(self, match=None, **unused_kwargs):
        """Return a string with combined values from the lexer.

    Args:
      match: The regular expression match object.

    Returns:
      A string that combines the values that are so far
      saved from the lexer.
    """
        try:
            self.attributes['body'] += match.group(1).strip('\n')
        except IndexError:
            self.attributes['body'] += match.group(0).strip('\n')

    def PrintLine(self):
        """"Return a string with combined values from the lexer."""
        # Note that self.attributes is a dict, hence the use of get() and
        # not getattr().
        year = self.attributes.get('iyear', None)
        month = self.attributes.get('imonth', None)
        day = self.attributes.get('iday', None)

        if None in [year, month, day]:
            date_string = u'[DATE NOT SET]'
        else:
            try:
                # The attribute values can be stored as strings or integers.
                year = int(year)
                month = int(month)
                day = int(day)

                date_string = u'{0:04d}-{1:02d}-{2:02d}'.format(
                    year, month, day)
            except (TypeError, ValueError):
                date_string = u'[DATE INVALID]'

        time_string = self.attributes.get('time', u'[TIME NOT SET]')
        hostname_string = self.attributes.get('hostname',
                                              u'[HOSTNAME NOT SET]')
        reporter_string = self.attributes.get('reporter',
                                              u'[REPORTER NOT SET]')
        body_string = self.attributes.get('body', u'[BODY NOT SET]')

        # TODO: this is a work in progress. The reason for the try-except is
        # that the text parser can be handed a non-text file and must deal
        # with converting arbitrary binary data.
        try:
            line = u'{0:s} {1:s} [{2:s}] {3:s} => {4:s}'.format(
                date_string, time_string, hostname_string, reporter_string,
                body_string)
        except UnicodeError:
            line = u'Unable to print line - due to encoding error.'

        return line

    def ParseLine(self, parser_context):
        """Return an event object extracted from the current line.

    Args:
      parser_context: A parser context object (instance of ParserContext).

    Returns:
      An event object (instance of TextEvent).
    """
        if not self.attributes['time']:
            raise errors.TimestampNotCorrectlyFormed(
                u'Unable to parse timestamp, time not set.')

        if not self.attributes['iyear']:
            raise errors.TimestampNotCorrectlyFormed(
                u'Unable to parse timestamp, year not set.')

        times = self.attributes['time'].split(':')
        if self.local_zone:
            timezone = parser_context.timezone
        else:
            timezone = pytz.UTC

        if len(times) < 3:
            raise errors.TimestampNotCorrectlyFormed(
                (u'Unable to parse timestamp, not of the format HH:MM:SS '
                 u'[{0:s}]').format(self.PrintLine()))
        try:
            secs = times[2].split('.')
            if len(secs) == 2:
                sec, us = secs
            else:
                sec = times[2]
                us = 0

            timestamp = timelib.Timestamp.FromTimeParts(
                int(self.attributes['iyear']),
                self.attributes['imonth'],
                self.attributes['iday'],
                int(times[0]),
                int(times[1]),
                int(sec),
                microseconds=int(us),
                timezone=timezone)

        except ValueError as exception:
            raise errors.TimestampNotCorrectlyFormed(
                u'Unable to parse: {0:s} with error: {1:s}'.format(
                    self.PrintLine(), exception))

        return self.CreateEvent(timestamp, getattr(self, 'entry_offset', 0),
                                self.attributes)

    # TODO: this is a rough initial implementation to get this working.
    def CreateEvent(self, timestamp, offset, attributes):
        """Creates an event.

       This function should be overwritten by text parsers that require
       the generation of a specific event object type; the default is
       TextEvent.

    Args:
      timestamp: The timestamp time value. The timestamp contains the
                 number of microseconds since Jan 1, 1970 00:00:00 UTC.
      offset: The offset of the event.
      attributes: A dict that contains the events attributes.

    Returns:
      An event object (instance of TextEvent).
    """
        return text_events.TextEvent(timestamp, offset, attributes)
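
ParseLine above splits the time attribute on ':' and treats a '.'-suffixed seconds field as seconds plus microseconds. A standalone sketch of that split, using a hypothetical time string:

time_string = u'13:27:09.811'
times = time_string.split(u':')

secs = times[2].split(u'.')
if len(secs) == 2:
  sec, us = secs
else:
  sec = times[2]
  us = 0

print(u'{0:d}:{1:d}:{2:d}.{3:d}'.format(
    int(times[0]), int(times[1]), int(sec), int(us)))
# 13:27:9.811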
Example No. 9
class SelectiveLexer(lexer.Lexer):
  """A simple selective filter lexer implementation."""

  tokens = [
      lexer.Token('INITIAL', r'SELECT', '', 'FIELDS'),
      lexer.Token('FIELDS', r'(.+) WHERE ', 'SetFields', 'FILTER'),
      lexer.Token('FIELDS', r'(.+) LIMIT', 'SetFields', 'LIMIT_END'),
      lexer.Token('FIELDS', r'(.+) SEPARATED BY', 'SetFields', 'SEPARATE'),
      lexer.Token('FIELDS', r'(.+)$', 'SetFields', 'END'),
      lexer.Token('FILTER', r'(.+) SEPARATED BY', 'SetFilter', 'SEPARATE'),
      lexer.Token('FILTER', r'(.+) LIMIT', 'SetFilter', 'LIMIT_END'),
      lexer.Token('FILTER', r'(.+)$', 'SetFilter', 'END'),
      lexer.Token('SEPARATE', r' ', '', ''),  # Ignore white space here.
      lexer.Token('SEPARATE', r'LIMIT', '', 'LIMIT_END'),
      lexer.Token(
          'SEPARATE', r'[\'"]([^ \'"]+)[\'"] LIMIT', 'SetSeparator',
          'LIMIT_END'),
      lexer.Token(
          'SEPARATE', r'[\'"]([^ \'"]+)[\'"]$', 'SetSeparator', 'END'),
      lexer.Token(
          'SEPARATE', r'(.+)$', 'SetSeparator', 'END'),
      lexer.Token(
          'LIMIT_END', r'SEPARATED BY [\'"]([^\'"]+)[\'"]', 'SetSeparator', ''),
      lexer.Token('LIMIT_END', r'(.+) SEPARATED BY', 'SetLimit', 'SEPARATE'),
      lexer.Token('LIMIT_END', r'(.+)$', 'SetLimit', 'END')]

  def __init__(self, data=''):
    """Initialize the lexer."""
    self.fields = []
    self.limit = 0
    self.lex_filter = None
    self.separator = u','
    super(SelectiveLexer, self).__init__(data)

  def SetFilter(self, match, **_):
    """Set the filter query."""
    filter_match = match.group(1)
    if 'LIMIT' in filter_match:
      # This only occurs in the case where we have "LIMIT X SEPARATED BY".
      self.lex_filter, _, push_back = filter_match.rpartition('LIMIT')
      self.PushBack('LIMIT {} SEPARATED BY '.format(push_back))
    else:
      self.lex_filter = filter_match

  def SetSeparator(self, match, **_):
    """Set the separator of the output, only uses the first char."""
    separator = match.group(1)
    if separator:
      self.separator = separator[0]

  def SetLimit(self, match, **_):
    """Set the row limit."""
    try:
      limit = int(match.group(1))
    except ValueError:
      self.Error('Invalid limit value, should be int [{}] = {}'.format(
          type(match.group(1)), match.group(1)))
      limit = 0

    self.limit = limit

  def SetFields(self, match, **_):
    """Set the selective fields."""
    text = match.group(1).lower()
    field_text, _, _ = text.partition(' from ')

    use_field_text = field_text.replace(' ', '')
    if ',' in use_field_text:
      self.fields = use_field_text.split(',')
    else:
      self.fields = [use_field_text]
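
The LIMIT branch in SetFilter only fires for queries of the form '... WHERE x LIMIT n SEPARATED BY ...': the FILTER token has already consumed the text up to SEPARATED BY, so the LIMIT part is split off and pushed back onto the lexer buffer. A standalone sketch of that split, with a hypothetical filter:

# What the FILTER token captured for:
#   SELECT f WHERE message contains "x" LIMIT 10 SEPARATED BY ';'
filter_match = u'message contains "x" LIMIT 10'

lex_filter, _, push_back = filter_match.rpartition(u'LIMIT')
print(lex_filter)  # message contains "x" (with a trailing space)
print(u'LIMIT {0:s} SEPARATED BY '.format(push_back))  # re-fed to the lexer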
Example No. 10
class SyslogParser(text_parser.SlowLexicalTextParser):
    """Parse text based syslog files."""

    NAME = 'syslog'
    DESCRIPTION = u'Parser for syslog files.'

    # TODO: can we change this, similar to SQLite, where we create an
    # event-specific object for different lines using a callback function?
    # Define the tokens that make up the structure of a syslog file.
    tokens = [
        lexer.Token('INITIAL',
                    '(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) ',
                    'SetMonth', 'DAY'),
        lexer.Token('DAY', r'\s?(\d{1,2})\s+', 'SetDay', 'TIME'),
        lexer.Token('TIME', r'([0-9:\.]+) ', 'SetTime', 'STRING_HOST'),
        lexer.Token('STRING_HOST', r'^--(-)', 'ParseHostname', 'STRING'),
        lexer.Token('STRING_HOST', r'([^\s]+) ', 'ParseHostname',
                    'STRING_PID'),
        lexer.Token('STRING_PID', r'([^\:\n]+)', 'ParsePid', 'STRING'),
        lexer.Token('STRING', r'([^\n]+)', 'ParseString', ''),
        lexer.Token('STRING', r'\n\t', None, ''),
        lexer.Token('STRING', r'\t', None, ''),
        lexer.Token('STRING', r'\n', 'ParseMessage', 'INITIAL'),
        lexer.Token('.', '([^\n]+)\n', 'ParseIncomplete', 'INITIAL'),
        lexer.Token('.', '\n[^\t]', 'ParseIncomplete', 'INITIAL'),
        lexer.Token('S[.]+', '(.+)', 'ParseString', ''),
    ]

    def __init__(self):
        """Initializes a syslog parser object."""
        super(SyslogParser, self).__init__(local_zone=True)
        # Set the initial year to 0 (fixed in the actual Parse method)
        self._year_use = 0
        self._last_month = 0

        # Set some additional attributes.
        self.attributes['reporter'] = ''
        self.attributes['pid'] = ''

    def _GetYear(self, stat, timezone):
        """Retrieves the year either from the input file or from the settings."""
        time = getattr(stat, 'crtime', 0)
        if not time:
            time = getattr(stat, 'ctime', 0)

        if not time:
            current_year = timelib.GetCurrentYear()
            logging.error(
                (u'Unable to determine year of syslog file.\nDefaulting to: '
                 u'{0:d}').format(current_year))
            return current_year

        try:
            timestamp = datetime.datetime.fromtimestamp(time, timezone)
        except ValueError as exception:
            current_year = timelib.GetCurrentYear()
            logging.error(
                u'Unable to determine year of syslog file with error: {0:s}\n'
                u'Defaulting to: {1:d}'.format(exception, current_year))
            return current_year

        return timestamp.year

    def ParseLine(self, parser_mediator):
        """Parse a single line from the syslog file.

    This method extends the one from TextParser slightly, adding
    the context of the reporter and pid values found inside syslog
    files.

    Args:
      parser_mediator: A parser mediator object (instance of ParserMediator).

    Returns:
      An event object (instance of TextEvent).
    """
        # Note: this is an older comment that applied to a similar approach
        # previously used in the init function.
        # TODO: this is a HACK to get the tests working, let's discuss this.
        if not self._year_use:
            self._year_use = parser_mediator.year

        if not self._year_use:
            # TODO: Find a decent way to actually calculate the correct year
            # from the syslog file, instead of relying on stats object.
            stat = self.file_entry.GetStat()
            self._year_use = self._GetYear(stat, parser_mediator.timezone)

            if not self._year_use:
                # TODO: Make this sensible, not have the year permanent.
                self._year_use = 2012

        month_compare = int(self.attributes['imonth'])
        if month_compare and self._last_month > month_compare:
            self._year_use += 1

        self._last_month = int(self.attributes['imonth'])

        self.attributes['iyear'] = self._year_use

        return super(SyslogParser, self).ParseLine(parser_mediator)

    def ParseHostname(self, match=None, **unused_kwargs):
        """Parses the hostname.

       This is a callback function for the text parser (lexer) and is
       called by the STRING_HOST lexer state.

    Args:
      match: The regular expression match object.
    """
        self.attributes['hostname'] = match.group(1)

    def ParsePid(self, match=None, **unused_kwargs):
        """Parses the process identifier (PID).

       This is a callback function for the text parser (lexer) and is
       called by the STRING_PID lexer state.

    Args:
      match: The regular expression match object.
    """
        # TODO: Change this logic and rather add more Tokens that
        # fully cover all variations of the various PID stages.
        line = match.group(1)
        if line[-1] == ']':
            splits = line.split('[')
            if len(splits) == 2:
                self.attributes['reporter'], pid = splits
            else:
                pid = splits[-1]
                self.attributes['reporter'] = '['.join(splits[:-1])
            try:
                self.attributes['pid'] = int(pid[:-1])
            except ValueError:
                self.attributes['pid'] = 0
        else:
            self.attributes['reporter'] = line

    def ParseString(self, match=None, **unused_kwargs):
        """Parses a (body text) string.

       This is a callback function for the text parser (lexer) and is
       called by the STRING lexer state.

    Args:
      match: The regular expression match object.
    """
        self.attributes['body'] += utils.GetUnicodeString(match.group(1))

    def PrintLine(self):
        """Prints a log line."""
        self.attributes['iyear'] = 2012
        return super(SyslogParser, self).PrintLine()

    # TODO: this is a rough initial implementation to get this working.
    def CreateEvent(self, timestamp, offset, attributes):
        """Creates a syslog line event.

       This overrides the default function in TextParser to create
       syslog line events instead of text events.

    Args:
      timestamp: The timestamp time value. The timestamp contains the
                 number of microseconds since Jan 1, 1970 00:00:00 UTC.
      offset: The offset of the event.
      attributes: A dict that contains the events attributes.

    Returns:
      A text event (SyslogLineEvent).
    """
        return SyslogLineEvent(timestamp, offset, attributes)
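
_GetYear above prefers the file's crtime, falls back to ctime and only then to the current year; the year itself comes from converting the POSIX timestamp in the supplied time zone. A standalone sketch of that conversion, with a hypothetical timestamp value:

import datetime

import pytz

posix_time = 1325418000  # hypothetical crtime value
timestamp = datetime.datetime.fromtimestamp(posix_time, pytz.UTC)
print(timestamp.year)  # 2012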
Example No. 11
class SlowLexicalTextParser(interface.FileObjectParser, lexer.SelfFeederMixIn):
    """Generic text based parser that uses lexer to assist with parsing.

  This text parser is based on a rather slow lexer, which makes the
  use of this interface highly discouraged. Parsers that already
  implement it will most likely all be rewritten to support faster
  text parsing implementations.

  This text based parser needs to be extended to provide an accurate
  list of tokens that define the structure of the log file that the
  parser is designed for.
  """

    _INITIAL_FILE_OFFSET = None

    # Define the max number of lines before we determine this is
    # not the correct parser.
    MAX_LINES = 15

    # List of tokens that describe the structure of the log file.
    tokens = [
        lexer.Token(u'INITIAL', r'(.+)\n', u'ParseString', u''),
    ]

    def __init__(self, local_zone=True):
        """Constructor for the SlowLexicalTextParser.

    Args:
      local_zone: a boolean value that determines if the entries
                  in the log file are stored in the local time
                  zone of the computer that stored it or in a fixed
                  timezone, like UTC.
    """
        # TODO: remove the multiple inheritance.
        lexer.SelfFeederMixIn.__init__(self)
        interface.FileObjectParser.__init__(self)
        self._file_verified = False

        self.attributes = {
            u'body': u'',
            u'iyear': 0,
            u'imonth': 0,
            u'iday': 0,
            u'time': u'',
            u'hostname': u'',
            u'username': u'',
        }
        self.entry_offset = None
        self.line_ready = False
        self.local_zone = local_zone
        self.next_entry_offset = 0

    def ClearValues(self):
        """Clears all the values inside the attributes dict.

    All attributes whose names start with the letter 'i' are considered
    to be integers, otherwise a string value is assumed.
    """
        self.line_ready = False
        for attr in self.attributes:
            if attr.startswith(u'i'):
                self.attributes[attr] = 0
            else:
                self.attributes[attr] = u''

    def CreateEvent(self, timestamp, offset, attributes):
        """Creates an event.

       This function should be overwritten by text parsers that require
       the generation of a specific event object type; the default event
       type is TextEvent.

    Args:
      timestamp: the timestamp time value. The timestamp contains the
                 number of microseconds since Jan 1, 1970 00:00:00 UTC.
      offset: the offset of the event.
      attributes: a dictionary that contains the event's attributes.

    Returns:
      An event object (instance of TextEvent).
    """
        return text_events.TextEvent(timestamp, offset, attributes)

    def ParseIncomplete(self, match=None, **unused_kwargs):
        """Parse a partial line match and append to the body attribute.

    Args:
      match: optional regular expression match object (instance of SRE_Match).
    """
        if not match:
            return

        try:
            self.attributes[u'body'] += match.group(0)
        except UnicodeDecodeError:
            # TODO: Support other encodings than UTF-8 here, read from the
            # knowledge base or parse from the file itself.
            self.attributes[u'body'] += u'{0:s}'.format(
                match.group(0).decode(u'utf-8', errors=u'replace'))

        self.line_ready = True

    def ParseMessage(self, **unused_kwargs):
        """Signal that a line is ready to be parsed."""
        self.line_ready = True

    def ParseFileObject(self, parser_mediator, file_object, **kwargs):
        """Parses a text file-like object using a lexer.

    Args:
      parser_mediator: a parser mediator object (instance of ParserMediator).
      file_object: a file-like object.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
        file_entry = parser_mediator.GetFileEntry()
        path_spec_printable = u'{0:s}:{1:s}'.format(
            file_entry.path_spec.type_indicator, file_entry.name)

        # TODO: this is necessary since we inherit from lexer.SelfFeederMixIn.
        self.file_object = file_object
        self._file_verified = False

        # Before we proceed any further, start by checking whether this is a
        # text file or not.
        file_object.seek(0, os.SEEK_SET)
        if not utils.IsText(file_object.read(40)):
            raise errors.UnableToParseFile(
                u'Not a text file, unable to proceed.')

        file_object.seek(0, os.SEEK_SET)

        error_count = 0
        # We need to clear out a few values in the lexer before continuing;
        # there might be leftovers from a previous run.
        self.error = 0
        self.buffer = b''

        while True:
            _ = self.NextToken()

            if self.state == u'INITIAL':
                self.entry_offset = self.next_entry_offset
                self.next_entry_offset = file_object.tell() - len(self.buffer)

            if not self._file_verified and self.error >= self.MAX_LINES * 2:
                logging.debug(
                    u'Lexer error count: {0:d} and current state {1:s}'.format(
                        self.error, self.state))
                raise errors.UnableToParseFile(
                    u'[{0:s}] unsupported file: {1:s}.'.format(
                        self.NAME, path_spec_printable))

            if self.line_ready:
                try:
                    self.ParseLine(parser_mediator)
                    self._file_verified = True

                except errors.TimestampError as exception:
                    error_count += 1
                    if self._file_verified:
                        logging.debug(
                            u'[{0:s} VERIFIED] Error count: {1:d} and ERROR: {2:d}'
                            .format(path_spec_printable, error_count,
                                    self.error))
                        logging.warning(
                            u'[{0:s}] Unable to parse timestamp with error: {1:s}'
                            .format(self.NAME, exception))

                    else:
                        logging.debug((
                            u'[{0:s} EVALUATING] Error count: {1:d} and ERROR: '
                            u'{2:d})').format(path_spec_printable, error_count,
                                              self.error))

                        if error_count >= self.MAX_LINES:
                            raise errors.UnableToParseFile(
                                u'[{0:s}] unsupported file: {1:s}.'.format(
                                    self.NAME, path_spec_printable))

                finally:
                    self.ClearValues()

            if self.Empty():
                # Try to fill the buffer to prevent the parser from ending prematurely.
                self.Feed()

            if self.Empty():
                break

        if not self._file_verified:
            raise errors.UnableToParseFile(
                u'[{0:s}] unable to parse file: {1:s}.'.format(
                    self.NAME, path_spec_printable))

        file_offset = file_object.get_offset()
        if file_offset < file_object.get_size():
            parser_mediator.ProduceParseError(
                (u'{0:s} prematurely terminated parsing: {1:s} at offset: '
                 u'0x{2:08x}.').format(self.NAME, path_spec_printable,
                                       file_offset))

    def ParseString(self, match=None, **unused_kwargs):
        """Return a string with combined values from the lexer.

    Args:
      match: optional regular expression match object (instance of SRE_Match).

    Returns:
      A string that combines the values that are so far
      saved from the lexer.
    """
        try:
            self.attributes[u'body'] += match.group(1).strip(u'\n')
        except IndexError:
            self.attributes[u'body'] += match.group(0).strip(u'\n')

    def PrintLine(self):
        """"Return a string with combined values from the lexer."""
        # Note that self.attributes is a dict, hence the use of get() and
        # not getattr().
        year = self.attributes.get(u'iyear', None)
        month = self.attributes.get(u'imonth', None)
        day = self.attributes.get(u'iday', None)

        if None in [year, month, day]:
            date_string = u'[DATE NOT SET]'
        else:
            try:
                # The attribute values can be stored as strings or integers.
                year = int(year)
                month = int(month)
                day = int(day)

                date_string = u'{0:04d}-{1:02d}-{2:02d}'.format(
                    year, month, day)
            except (TypeError, ValueError):
                date_string = u'[DATE INVALID]'

        time_string = self.attributes.get(u'time', u'[TIME NOT SET]')
        hostname_string = self.attributes.get(u'hostname',
                                              u'[HOSTNAME NOT SET]')
        reporter_string = self.attributes.get(u'reporter',
                                              u'[REPORTER NOT SET]')
        body_string = self.attributes.get(u'body', u'[BODY NOT SET]')

        # TODO: this is a work in progress. The reason for the try-except is
        # that the text parser can be handed a non-text file and must deal
        # with converting arbitrary binary data.
        try:
            line = u'{0:s} {1:s} [{2:s}] {3:s} => {4:s}'.format(
                date_string, time_string, hostname_string, reporter_string,
                body_string)
        except UnicodeError:
            line = u'Unable to print line - due to encoding error.'

        return line

    def ParseLine(self, parser_mediator):
        """Parses the current log line for events.

    Args:
      parser_mediator: a parser mediator object (instance of ParserMediator).
    """
        year_string = self.attributes.get(u'iyear')
        if not year_string:
            if not self._file_verified:
                raise errors.UnableToParseFile()
            parser_mediator.ProduceParseError(
                u'year missing in log line: {0:s}'.format(self.PrintLine()))
            return

        time_string = self.attributes.get(u'time')
        if not time_string:
            if not self._file_verified:
                raise errors.UnableToParseFile()
            parser_mediator.ProduceParseError(
                u'time values missing in log line: {0:s}'.format(
                    self.PrintLine()))
            return

        time_values = time_string.split(u':')
        if len(time_values) < 3:
            if not self._file_verified:
                raise errors.UnableToParseFile()
            parser_mediator.ProduceParseError(
                u'unsupported time format in log line: {0:s}'.format(
                    self.PrintLine()))
            return

        seconds_values = time_values[2].split(u'.')
        if len(seconds_values) == 2:
            seconds_string, microseconds_string = seconds_values
        else:
            seconds_string = time_values[2]
            microseconds_string = 0

        try:
            # TODO: fix the need to convert non string values into integers and
            # string to integer conversion without an explicit base.
            year = int(year_string)
            hours = int(time_values[0])
            minutes = int(time_values[1])
            seconds = int(seconds_string)
            microseconds = int(microseconds_string)

        except ValueError as exception:
            if not self._file_verified:
                raise errors.UnableToParseFile()
            parser_mediator.ProduceParseError(
                u'unable to parse log line: {0:s} with error: {1:s}'.format(
                    self.PrintLine(), exception))
            return

        if self.local_zone:
            timezone = parser_mediator.timezone
        else:
            timezone = pytz.UTC

        try:
            timestamp = timelib.Timestamp.FromTimeParts(
                year,
                self.attributes[u'imonth'],
                self.attributes[u'iday'],
                hours,
                minutes,
                seconds,
                microseconds=microseconds,
                timezone=timezone)
        except errors.TimestampError as exception:
            timestamp = timelib.Timestamp.NONE_TIMESTAMP
            parser_mediator.ProduceParseError(
                u'unable to determine timestamp with error: {0:s}'.format(
                    exception))

        event_object = self.CreateEvent(timestamp,
                                        getattr(self, u'entry_offset', 0),
                                        self.attributes)
        parser_mediator.ProduceEvent(event_object)

    def SetDay(self, match=None, **unused_kwargs):
        """Parses the day of the month.

       This is a callback function for the text parser (lexer) and is
       called by the corresponding lexer state.

    Args:
      match: optional regular expression match object (instance of SRE_Match).
    """
        self.attributes[u'iday'] = int(match.group(1))

    def SetMonth(self, match=None, **unused_kwargs):
        """Parses the month.

       This is a callback function for the text parser (lexer) and is
       called by the corresponding lexer state.

    Args:
      match: optional regular expression match object (instance of SRE_Match).
    """
        self.attributes[u'imonth'] = int(
            timelib.MONTH_DICT.get(match.group(1).lower(), 1))

    def SetTime(self, match=None, **unused_kwargs):
        """Set the time attribute.

    Args:
      match: optional regular expression match object (instance of SRE_Match).
    """
        self.attributes[u'time'] = match.group(1)

    def SetYear(self, match=None, **unused_kwargs):
        """Parses the year.

       This is a callback function for the text parser (lexer) and is
       called by the corresponding lexer state.

    Args:
      match: optional regular expression match object (instance of SRE_Match).
    """
        self.attributes[u'iyear'] = int(match.group(1))
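
SetMonth above maps a month-name abbreviation to its 1-based ordinal via timelib.MONTH_DICT. A minimal sketch, assuming MONTH_DICT maps lower-case month abbreviations to integers as the lookup implies:

MONTH_DICT = {u'jan': 1, u'feb': 2, u'mar': 3}  # abbreviated for the sketch

match_text = u'Mar'  # what the INITIAL token of a syslog parser captures
print(int(MONTH_DICT.get(match_text.lower(), 1)))  # 3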