def testReadline(self):
  """Tests reading single lines, with and without a size limit."""
  path = self._GetTestFilePath(['another_file'])
  self._SkipIfPathNotExists(path)

  path_spec = os_path_spec.OSPathSpec(location=path)
  file_object = os_file_io.OSFile(self._resolver_context)
  file_object.open(path_spec)

  # A plain readline() returns the full line including the end-of-line
  # character and advances the offset past it.
  reader = line_reader_file.BinaryLineReader(file_object)
  self.assertEqual(reader.readline(), b'This is another file.\n')
  self.assertEqual(reader.tell(), 22)

  # A fresh reader on the same file object with a size argument returns at
  # most that many bytes and leaves the offset at the truncation point.
  reader = line_reader_file.BinaryLineReader(file_object)
  self.assertEqual(reader.readline(size=11), b'This is ano')
  self.assertEqual(reader.tell(), 11)

  file_object.close()
def testReadlineMultipleLines(self):
  """Test the readline() function on multiple lines.

  Reads successive lines from the same reader, mixing size-limited and
  unlimited reads, and checks that tell() tracks the byte offset.
  """
  # Consistency fix: guard on the test file's existence and use the
  # test_file_path name, matching the sibling testReadline test.
  test_file_path = self._GetTestFilePath(['password.csv'])
  self._SkipIfPathNotExists(test_file_path)

  test_path_spec = os_path_spec.OSPathSpec(location=test_file_path)
  file_object = os_file_io.OSFile(self._resolver_context)
  file_object.open(test_path_spec)

  line_reader = line_reader_file.BinaryLineReader(file_object)

  line = line_reader.readline()
  self.assertEqual(line, b'place,user,password\n')

  offset = line_reader.tell()
  self.assertEqual(offset, 20)

  # A size-limited read stops mid-line; the next readline() continues from
  # where this one stopped.
  line = line_reader.readline(size=5)
  self.assertEqual(line, b'bank,')

  offset = line_reader.tell()
  self.assertEqual(offset, 25)

  line = line_reader.readline()
  self.assertEqual(line, b'joesmith,superrich\n')

  offset = line_reader.tell()
  self.assertEqual(offset, 44)

  line = line_reader.readline()
  self.assertEqual(line, b'alarm system,-,1234\n')

  offset = line_reader.tell()
  self.assertEqual(offset, 64)

  file_object.close()
def _CreateLineReader(self, file_object):
  """Creates an object that reads lines from a text file.

  The line reader is advanced to the beginning of the DSV content, skipping
  any header lines.

  Args:
    file_object (dfvfs.FileIO): file-like object.

  Returns:
    TextFile|BinaryLineReader: an object that implements an iterator
        over lines in a text file.

  Raises:
    UnicodeDecodeError: if the file cannot be read with the specified
        encoding.
  """
  # The Python 2 csv module reads bytes and the Python 3 csv module Unicode
  # reads strings.
  if py2to3.PY_3:
    line_reader = text_file.TextFile(
        file_object, encoding=self._encoding, end_of_line=self._end_of_line)
  else:
    line_reader = line_reader_file.BinaryLineReader(
        file_object, end_of_line=self._end_of_line)

  # If we specifically define a number of lines we should skip, do that here.
  # A UnicodeDecodeError raised by readline() propagates to the caller; the
  # original try/except that only re-raised it was a no-op and was removed.
  for _ in range(0, self.NUMBER_OF_HEADER_LINES):
    line_reader.readline(self._maximum_line_length)

  return line_reader
def testIterator(self):
  """Tests iterating a BinaryDSVReader over a comma-delimited file."""
  path = self._GetTestFilePath(['password.csv'])
  self._SkipIfPathNotExists(path)

  resolver_context = context.Context()
  path_spec = os_path_spec.OSPathSpec(location=path)
  file_object = path_spec_resolver.Resolver.OpenFileObject(
      path_spec, resolver_context=resolver_context)

  reader = line_reader_file.BinaryLineReader(file_object)
  dsv_reader = line_reader_file.BinaryDSVReader(reader, delimiter=b',')

  # Each iteration yields one row split on the delimiter, as bytes values.
  rows = list(dsv_reader)

  self.assertEqual(len(rows), 5)
  self.assertEqual(rows[0], [b'place', b'user', b'password'])
  self.assertEqual(rows[1], [b'bank', b'joesmith', b'superrich'])
  self.assertEqual(rows[2], [b'alarm system', b'-', b'1234'])
  self.assertEqual(rows[3], [b'treasure chest', b'-', b'1111'])
  self.assertEqual(rows[4], [b'uber secret laire', b'admin', b'admin'])
def testReadlinesWithFileWithoutNewLineAtEnd(self):
  """Test reading lines from a file without a new line char at the end."""
  test_file = self._GetTestFilePath(['mactime.body'])
  test_file_path_spec = os_path_spec.OSPathSpec(location=test_file)
  file_object = os_file_io.OSFile(self._resolver_context)
  file_object.open(test_file_path_spec)

  line_reader = line_reader_file.BinaryLineReader(file_object)

  # The final line has no trailing newline but must still be counted.
  lines = line_reader.readlines()

  self.assertEqual(len(lines), 17)

  # Fix: close the file object, matching the other tests in this suite that
  # explicitly open an OSFile; previously it leaked.
  file_object.close()
def testReadlinesWithFileWithoutNewLineAtEnd(self):
  """Test reading lines from a file without a new line char at the end."""
  path = self._GetTestFilePath(['mactime.body'])
  self._SkipIfPathNotExists(path)

  path_spec = os_path_spec.OSPathSpec(location=path)
  file_object = path_spec_resolver.Resolver.OpenFileObject(
      path_spec, resolver_context=self._resolver_context)

  # The final line has no trailing newline but must still be counted.
  reader = line_reader_file.BinaryLineReader(file_object)
  read_lines = reader.readlines()

  self.assertEqual(len(read_lines), 22)
def testReadlinesWithSizeHint(self):
  """Tests readlines() stopping near the sizehint byte budget."""
  path = self._GetTestFilePath(['password.csv'])
  path_spec = os_path_spec.OSPathSpec(location=path)
  file_object = os_file_io.OSFile(self._resolver_context)
  file_object.open(path_spec)

  reader = line_reader_file.BinaryLineReader(file_object)

  # With a 60-byte size hint only the first three lines are returned.
  read_lines = reader.readlines(sizehint=60)

  expected_lines = [
      b'place,user,password\n',
      b'bank,joesmith,superrich\n',
      b'alarm system,-,1234\n']

  self.assertEqual(len(read_lines), 3)
  self.assertEqual(read_lines[0], expected_lines[0])
  self.assertEqual(read_lines[1], expected_lines[1])
  self.assertEqual(read_lines[2], expected_lines[2])

  file_object.close()
def testReadlinesWithSizeHint(self):
  """Tests readlines() stopping near the sizehint byte budget."""
  path = self._GetTestFilePath(['password.csv'])
  self._SkipIfPathNotExists(path)

  path_spec = os_path_spec.OSPathSpec(location=path)
  file_object = path_spec_resolver.Resolver.OpenFileObject(
      path_spec, resolver_context=self._resolver_context)

  reader = line_reader_file.BinaryLineReader(file_object)

  # With a 60-byte size hint only the first three lines are returned.
  read_lines = reader.readlines(sizehint=60)

  expected_lines = [
      b'place,user,password\n',
      b'bank,joesmith,superrich\n',
      b'alarm system,-,1234\n']

  self.assertEqual(len(read_lines), 3)
  self.assertEqual(read_lines[0], expected_lines[0])
  self.assertEqual(read_lines[1], expected_lines[1])
  self.assertEqual(read_lines[2], expected_lines[2])
def testReadlines(self):
  """Tests reading every line of a file with readlines()."""
  path = self._GetTestFilePath(['password.csv'])
  path_spec = os_path_spec.OSPathSpec(location=path)
  file_object = os_file_io.OSFile(self._resolver_context)
  file_object.open(path_spec)

  reader = line_reader_file.BinaryLineReader(file_object)
  read_lines = reader.readlines()

  expected_lines = [
      b'place,user,password\n',
      b'bank,joesmith,superrich\n',
      b'alarm system,-,1234\n',
      b'treasure chest,-,1111\n',
      b'uber secret laire,admin,admin\n']

  # Every line, including the last one, keeps its trailing newline.
  self.assertEqual(len(read_lines), 5)
  self.assertEqual(read_lines[0], expected_lines[0])
  self.assertEqual(read_lines[1], expected_lines[1])
  self.assertEqual(read_lines[2], expected_lines[2])
  self.assertEqual(read_lines[3], expected_lines[3])
  self.assertEqual(read_lines[4], expected_lines[4])

  file_object.close()
def _CreateLineReader(self, file_object): """Creates an object that reads lines from a text file. The line reader is advanced to the beginning of the DSV content, skipping any header lines. Args: file_object (dfvfs.FileIO): file-like object. Returns: TextFile|BinaryLineReader: an object that implements an iterator over lines in a text file. Raises: UnicodeDecodeError: if the file cannot be read with the specified encoding. """ # The Python 2 csv module reads bytes and the Python 3 csv module Unicode # reads strings. if py2to3.PY_3: line_reader = text_file.TextFile(file_object, encoding=self._encoding, end_of_line=self._end_of_line) # pylint: disable=protected-access maximum_read_buffer_size = line_reader._MAXIMUM_READ_BUFFER_SIZE else: line_reader = line_reader_file.BinaryLineReader( file_object, end_of_line=self._end_of_line) maximum_read_buffer_size = line_reader.MAXIMUM_READ_BUFFER_SIZE # Line length is one less than the maximum read buffer size so that we # tell if there's a line that doesn't end at the end before the end of # the file. if self._maximum_line_length > maximum_read_buffer_size: self._maximum_line_length = maximum_read_buffer_size - 1 # If we specifically define a number of lines we should skip, do that here. for _ in range(0, self.NUMBER_OF_HEADER_LINES): line_reader.readline(self._maximum_line_length) return line_reader
def testIterator(self):
  """Tests iterating a BinaryLineReader line by line."""
  path = self._GetTestFilePath(['password.csv'])
  self._SkipIfPathNotExists(path)

  path_spec = os_path_spec.OSPathSpec(location=path)
  file_object = path_spec_resolver.Resolver.OpenFileObject(
      path_spec, resolver_context=self._resolver_context)

  reader = line_reader_file.BinaryLineReader(file_object)

  # Iterating the reader yields one bytes line at a time, newline included.
  read_lines = list(reader)

  expected_lines = [
      b'place,user,password\n',
      b'bank,joesmith,superrich\n',
      b'alarm system,-,1234\n',
      b'treasure chest,-,1111\n',
      b'uber secret laire,admin,admin\n']

  self.assertEqual(len(read_lines), 5)
  self.assertEqual(read_lines[0], expected_lines[0])
  self.assertEqual(read_lines[1], expected_lines[1])
  self.assertEqual(read_lines[2], expected_lines[2])
  self.assertEqual(read_lines[3], expected_lines[3])
  self.assertEqual(read_lines[4], expected_lines[4])
def _ParseFileData(self, mediator, file_object):
  """Parses file content (data) for user account preprocessing attributes.

  Reads the file as colon-delimited rows (passwd-style layout — TODO
  confirm against the artifact definition) and produces one user account
  artifact per well-formed row. Malformed or undecodable rows produce a
  preprocessing warning and are skipped.

  Args:
    mediator (PreprocessMediator): mediates interactions between preprocess
        plugins and other components, such as storage and knowledge base.
    file_object (dfvfs.FileIO): file-like object that contains the artifact
        value data.

  Raises:
    errors.PreProcessFail: if the preprocessing fails.
  """
  line_reader = line_reader_file.BinaryLineReader(file_object)

  try:
    reader = line_reader_file.BinaryDSVReader(line_reader, b':')
  except csv.Error as exception:
    raise errors.PreProcessFail(
        'Unable to read: {0:s} with error: {1!s}'.format(
            self.ARTIFACT_DEFINITION_NAME, exception))

  for line_number, row in enumerate(reader):
    # A usable row needs at least 7 values and non-empty values at index 0
    # (username) and index 2 (user identifier). NOTE(review): the warning
    # text only mentions the value count, though this branch also fires for
    # empty required values.
    if len(row) < 7 or not row[0] or not row[2]:
      mediator.ProducePreprocessingWarning(
          self.ARTIFACT_DEFINITION_NAME,
          'Unsupported number of values in line: {0:d}.'.format(
              line_number))
      continue

    # Username and identifier are required: a decode failure skips the row.
    try:
      username = row[0].decode('utf-8')
    except UnicodeDecodeError:
      mediator.ProducePreprocessingWarning(
          self.ARTIFACT_DEFINITION_NAME, 'Unable to decode username.')
      continue

    try:
      identifier = row[2].decode('utf-8')
    except UnicodeDecodeError:
      mediator.ProducePreprocessingWarning(
          self.ARTIFACT_DEFINITION_NAME, 'Unable to decode user identifier.')
      continue

    # The remaining fields are optional: a decode failure only produces a
    # warning and leaves the attribute as None.
    group_identifier = None
    if row[3]:
      try:
        group_identifier = row[3].decode('utf-8')
      except UnicodeDecodeError:
        mediator.ProducePreprocessingWarning(
            self.ARTIFACT_DEFINITION_NAME,
            'Unable to decode group identifier.')

    full_name = None
    if row[4]:
      try:
        full_name = row[4].decode('utf-8')
      except UnicodeDecodeError:
        mediator.ProducePreprocessingWarning(
            self.ARTIFACT_DEFINITION_NAME, 'Unable to decode full name.')

    user_directory = None
    if row[5]:
      try:
        user_directory = row[5].decode('utf-8')
      except UnicodeDecodeError:
        mediator.ProducePreprocessingWarning(
            self.ARTIFACT_DEFINITION_NAME, 'Unable to decode user directory.')

    shell = None
    if row[6]:
      try:
        shell = row[6].decode('utf-8')
      except UnicodeDecodeError:
        mediator.ProducePreprocessingWarning(
            self.ARTIFACT_DEFINITION_NAME, 'Unable to decode shell.')

    user_account = artifacts.UserAccountArtifact(
        identifier=identifier, username=username)
    user_account.group_identifier = group_identifier
    user_account.full_name = full_name
    user_account.user_directory = user_directory
    user_account.shell = shell

    # AddUserAccount raises KeyError for an already-registered account —
    # presumably a duplicate identifier; verify against the mediator API.
    try:
      mediator.AddUserAccount(user_account)
    except KeyError as exception:
      mediator.ProducePreprocessingWarning(
          self.ARTIFACT_DEFINITION_NAME,
          'Unable to add user account with error: {0!s}'.format(
              exception))
def ParseFileObject(self, parser_mediator, file_object, **unused_kwargs):
  """Parses a DSV text file-like object.

  Validates the first row (column count, signature string, VerifyRow) before
  committing to parse the remaining rows.

  Args:
    parser_mediator (ParserMediator): mediates interactions between parsers
        and other components, such as storage and dfvfs.
    file_object (dfvfs.FileIO): file-like object.

  Raises:
    UnableToParseFile: when the file cannot be parsed.
  """
  file_size = file_object.get_size()

  # The csv module can consume a lot of memory, 1 GiB for a 100 MiB file.
  # Hence that the maximum supported file size is restricted.
  if file_size > self._MAXIMUM_SUPPORTED_FILE_SIZE:
    display_name = parser_mediator.GetDisplayName()
    raise errors.UnableToParseFile((
        '[{0:s}] Unable to parse DSV file: {1:s} size of file exceeds '
        'maximum supported size').format(self.NAME, display_name))

  # TODO: Replace this with detection of the file encoding via byte-order
  # marks. Also see: https://github.com/log2timeline/plaso/issues/1971
  if not self._encoding:
    self._encoding = parser_mediator.codepage

  # The Python 2 csv module reads bytes and the Python 3 csv module Unicode
  # reads strings.
  if py2to3.PY_3:
    line_reader = text_file.TextFile(file_object, encoding=self._encoding)
  else:
    line_reader = line_reader_file.BinaryLineReader(file_object)

  # If we specifically define a number of lines we should skip, do that here.
  for _ in range(0, self.NUMBER_OF_HEADER_LINES):
    line_reader.readline()

  reader = self._CreateDictReader(line_reader)

  # Capture the offset before reading each row so ParseRow can report where
  # the row started.
  row_offset = line_reader.tell()

  try:
    row = next(reader)
  except (StopIteration, csv.Error, UnicodeDecodeError) as exception:
    display_name = parser_mediator.GetDisplayName()
    raise errors.UnableToParseFile(
        '[{0:s}] Unable to parse DSV file: {1:s} with error: {2!s}.'.format(
            self.NAME, display_name, exception))

  number_of_columns = len(self.COLUMNS)
  number_of_records = len(row)

  if number_of_records != number_of_columns:
    display_name = parser_mediator.GetDisplayName()
    raise errors.UnableToParseFile((
        '[{0:s}] Unable to parse DSV file: {1:s}. Wrong number of '
        'records (expected: {2:d}, got: {3:d})').format(
            self.NAME, display_name, number_of_columns, number_of_records))

  # The magic test string marks overflow/missing fields in the DictReader
  # output; its presence means the row did not match the expected columns.
  for key, value in row.items():
    if self._MAGIC_TEST_STRING in (key, value):
      display_name = parser_mediator.GetDisplayName()
      raise errors.UnableToParseFile((
          '[{0:s}] Unable to parse DSV file: {1:s}. Signature '
          'mismatch.').format(self.NAME, display_name))

  # NOTE: in this variant the row is converted to Unicode before VerifyRow
  # is called.
  row = self._ConvertRowToUnicode(parser_mediator, row)

  if not self.VerifyRow(parser_mediator, row):
    display_name = parser_mediator.GetDisplayName()
    raise errors.UnableToParseFile((
        '[{0:s}] Unable to parse DSV file: {1:s}. Verification '
        'failed.').format(self.NAME, display_name))

  self.ParseRow(parser_mediator, row_offset, row)
  row_offset = line_reader.tell()

  for row in reader:
    if parser_mediator.abort:
      break
    row = self._ConvertRowToUnicode(parser_mediator, row)
    self.ParseRow(parser_mediator, row_offset, row)
    row_offset = line_reader.tell()
def ParseFileObject(self, parser_mediator, file_object, **unused_kwargs):
  """Parses a DSV text file-like object.

  Validates the first row (column count, signature string, VerifyRow) before
  committing to parse the remaining rows.

  Args:
    parser_mediator (ParserMediator): mediates interactions between parsers
        and other components, such as storage and dfvfs.
    file_object (dfvfs.FileIO): file-like object.

  Raises:
    UnableToParseFile: when the file cannot be parsed.
  """
  line_reader = line_reader_file.BinaryLineReader(file_object)

  # If we specifically define a number of lines we should skip, do that here.
  for _ in range(0, self.NUMBER_OF_HEADER_LINES):
    line_reader.readline()

  reader = self._CreateDictReader(parser_mediator, line_reader)

  # Capture the offset before reading each row so ParseRow can report where
  # the row started.
  row_offset = line_reader.tell()

  try:
    row = next(reader)
  except (StopIteration, csv.Error) as exception:
    display_name = parser_mediator.GetDisplayName()
    raise errors.UnableToParseFile(
        '[{0:s}] Unable to parse DSV file: {1:s} with error: {2!s}.'.format(
            self.NAME, display_name, exception))

  number_of_columns = len(self.COLUMNS)
  number_of_records = len(row)

  if number_of_records != number_of_columns:
    display_name = parser_mediator.GetDisplayName()
    raise errors.UnableToParseFile((
        '[{0:s}] Unable to parse DSV file: {1:s}. Wrong number of '
        'records (expected: {2:d}, got: {3:d})').format(
            self.NAME, display_name, number_of_columns, number_of_records))

  # The magic test string marks overflow/missing fields in the DictReader
  # output; its presence means the row did not match the expected columns.
  for key, value in row.items():
    if self._MAGIC_TEST_STRING in (key, value):
      display_name = parser_mediator.GetDisplayName()
      raise errors.UnableToParseFile((
          '[{0:s}] Unable to parse DSV file: {1:s}. Signature '
          'mismatch.').format(self.NAME, display_name))

  # NOTE: in this variant VerifyRow sees the raw row; the Unicode
  # conversion happens afterwards.
  if not self.VerifyRow(parser_mediator, row):
    display_name = parser_mediator.GetDisplayName()
    raise errors.UnableToParseFile((
        '[{0:s}] Unable to parse DSV file: {1:s}. Verification '
        'failed.').format(self.NAME, display_name))

  row = self._ConvertRowToUnicode(parser_mediator, row)
  self.ParseRow(parser_mediator, row_offset, row)
  row_offset = line_reader.tell()

  for row in reader:
    if parser_mediator.abort:
      break
    row = self._ConvertRowToUnicode(parser_mediator, row)
    self.ParseRow(parser_mediator, row_offset, row)
    row_offset = line_reader.tell()
def _ParseFileData(self, knowledge_base, file_object):
  """Parses file content (data) for user account preprocessing attributes.

  Reads the file as colon-delimited rows (passwd-style layout — TODO
  confirm against the artifact definition) and adds one user account
  artifact per well-formed row to the knowledge base. Malformed or
  undecodable rows are logged and skipped.

  Args:
    knowledge_base (KnowledgeBase): to fill with preprocessing information.
    file_object (dfvfs.FileIO): file-like object that contains the artifact
        value data.

  Raises:
    errors.PreProcessFail: if the preprocessing fails.
  """
  line_reader = line_reader_file.BinaryLineReader(file_object)

  try:
    reader = line_reader_file.BinaryDSVReader(line_reader, b':')
  except csv.Error as exception:
    raise errors.PreProcessFail(
        'Unable to read: {0:s} with error: {1!s}'.format(
            self.ARTIFACT_DEFINITION_NAME, exception))

  for row in reader:
    # A usable row needs at least 7 values and non-empty values at index 0
    # (username) and index 2 (identifier).
    if len(row) < 7 or not row[0] or not row[2]:
      # TODO: add and store preprocessing errors.
      continue

    # Username and identifier are required: a decode failure skips the row.
    try:
      username = row[0].decode('utf-8')
    except UnicodeDecodeError:
      # TODO: add and store preprocessing errors.
      logger.error('Unable to decode username.')
      continue

    try:
      identifier = row[2].decode('utf-8')
    except UnicodeDecodeError:
      # TODO: add and store preprocessing errors.
      logger.error('Unable to decode identifier.')
      continue

    # The remaining fields are optional: a decode failure only logs an
    # error and leaves the attribute as None.
    group_identifier = None
    if row[3]:
      try:
        group_identifier = row[3].decode('utf-8')
      except UnicodeDecodeError:
        # TODO: add and store preprocessing errors.
        logger.error('Unable to decode group identifier.')

    full_name = None
    if row[4]:
      try:
        full_name = row[4].decode('utf-8')
      except UnicodeDecodeError:
        # TODO: add and store preprocessing errors.
        logger.error('Unable to decode full name.')

    user_directory = None
    if row[5]:
      try:
        user_directory = row[5].decode('utf-8')
      except UnicodeDecodeError:
        # TODO: add and store preprocessing errors.
        logger.error('Unable to decode user directory.')

    shell = None
    if row[6]:
      try:
        shell = row[6].decode('utf-8')
      except UnicodeDecodeError:
        # TODO: add and store preprocessing errors.
        logger.error('Unable to decode shell.')

    user_account = artifacts.UserAccountArtifact(
        identifier=identifier, username=username)
    user_account.group_identifier = group_identifier
    user_account.full_name = full_name
    user_account.user_directory = user_directory
    user_account.shell = shell

    # AddUserAccount raises KeyError for an already-registered account —
    # presumably a duplicate identifier; verify against the API. The
    # duplicate is deliberately ignored here.
    try:
      knowledge_base.AddUserAccount(user_account)
    except KeyError:
      # TODO: add and store preprocessing errors.
      pass
def ParseFileObject(self, parser_mediator, file_object, **unused_kwargs):
  """Parses a DSV text file-like object.

  Validates the first row (column count, signature string, VerifyRow) before
  committing to parse the remaining rows.

  Args:
    parser_mediator (ParserMediator): mediates interactions between parsers
        and other components, such as storage and dfvfs.
    file_object (dfvfs.FileIO): file-like object.

  Raises:
    UnableToParseFile: when the file cannot be parsed.
  """
  if not self._encoding:
    self._encoding = parser_mediator.codepage

  delimiter = self.DELIMITER
  quotechar = self.QUOTE_CHAR
  magic_test_string = self._MAGIC_TEST_STRING

  # Python 3 csv module requires arguments to constructor to be of type str.
  if sys.version_info[0] >= 3:
    delimiter = delimiter.decode(self._encoding)
    quotechar = quotechar.decode(self._encoding)
    magic_test_string = magic_test_string.decode(self._encoding)

  line_reader = line_reader_file.BinaryLineReader(file_object)

  # If we specifically define a number of lines we should skip, do that here.
  for _ in range(0, self.NUMBER_OF_HEADER_LINES):
    line_reader.readline()

  # restkey/restval mark overflow and missing fields with the magic test
  # string so the signature check below can detect malformed rows.
  reader = csv.DictReader(
      line_reader, delimiter=delimiter, fieldnames=self.COLUMNS,
      quotechar=quotechar, restkey=magic_test_string,
      restval=magic_test_string)

  # Capture the offset before reading each row so ParseRow can report where
  # the row started.
  row_offset = line_reader.tell()

  try:
    row = next(reader)
  except (StopIteration, csv.Error) as exception:
    display_name = parser_mediator.GetDisplayName()
    raise errors.UnableToParseFile(
        '[{0:s}] Unable to parse DSV file: {1:s} with error: {2!s}.'.format(
            self.NAME, display_name, exception))

  number_of_columns = len(self.COLUMNS)
  number_of_records = len(row)

  if number_of_records != number_of_columns:
    display_name = parser_mediator.GetDisplayName()
    raise errors.UnableToParseFile((
        '[{0:s}] Unable to parse DSV file: {1:s}. Wrong number of '
        'records (expected: {2:d}, got: {3:d})').format(
            self.NAME, display_name, number_of_columns, number_of_records))

  for key, value in row.items():
    if self._MAGIC_TEST_STRING in (key, value):
      display_name = parser_mediator.GetDisplayName()
      raise errors.UnableToParseFile((
          '[{0:s}] Unable to parse DSV file: {1:s}. Signature '
          'mismatch.').format(self.NAME, display_name))

  # NOTE: in this variant VerifyRow sees the raw row; the Unicode
  # conversion happens afterwards.
  if not self.VerifyRow(parser_mediator, row):
    display_name = parser_mediator.GetDisplayName()
    raise errors.UnableToParseFile((
        '[{0:s}] Unable to parse DSV file: {1:s}. Verification '
        'failed.').format(self.NAME, display_name))

  row = self._ConvertRowToUnicode(parser_mediator, row)
  self.ParseRow(parser_mediator, row_offset, row)
  row_offset = line_reader.tell()

  for row in reader:
    if parser_mediator.abort:
      break
    row = self._ConvertRowToUnicode(parser_mediator, row)
    self.ParseRow(parser_mediator, row_offset, row)
    row_offset = line_reader.tell()