def parse_file(self, file_number):
    """Parses the file into protocol buffer block objects.

    The file at self.file_path is read as TSV (csv.reader with TsvDialect),
    through gzip when self.is_compressed() is true. Rows whose first cell
    starts with constants.COMMENT_SIGN are skipped. An error.ValidationError
    raised while handling a row is logged through self.logger and parsing
    continues with the next row.

    Side effect: when a 'HEAD' row is read, self.row_validators_list is
    replaced with the result of self.get_row_validators(line).

    The file handle is closed as soon as the last row has been read, and also
    when the generator is closed or discarded before it is exhausted.

    Args:
      file_number: The file number in the report (eg. "3of4" -> 3).

    Yields:
      Each yield is a single block object (block_pb2.Block).
    """
    row_number = 0
    block_number = 0
    # NOTE(review): the 'U' mode flag is deprecated on Python 3 and rejected
    # by open() from 3.11 on. It is left untouched because the targeted
    # interpreter version is not visible from this block - confirm.
    if self.is_compressed():
        tsv = gzip.open(self.file_path, 'rU')
    else:
        tsv = open(self.file_path, 'rU')
    try:
        current_block = block_pb2.Block(file_number=file_number)
        self.logger.info(
            'Start parsing the HEAD block in file number %s.', file_number)
        for line in csv.reader(tsv, dialect=TsvDialect):
            row_number += 1
            # Comment row.
            # NOTE(review): a completely empty row ([]) would raise IndexError
            # here, outside the ValidationError handler - confirm such rows
            # cannot occur in the input.
            if line[0].startswith(constants.COMMENT_SIGN):
                continue
            try:
                row_type = self._get_row_type(line, row_number)
                # End of block check: hand out the finished block and start
                # collecting rows into a fresh one.
                if self.is_end_of_block(
                        line, row_type, row_number, current_block):
                    yield current_block
                    current_block = block_pb2.Block(file_number=file_number)
                # HEAD/FOOT row.
                if (constants.HEADER_ROW_PATTERN.match(row_type) or
                        row_type in constants.FOOT_ROWS):
                    current_block.type = block_pb2.HEAD
                    if row_type in constants.FOOT_ROWS:
                        self.logger.info(
                            'Start parsing the FOOT block in file number %s.',
                            file_number)
                        current_block.type = block_pb2.FOOT
                    if row_type == 'HEAD':
                        self.row_validators_list = self.get_row_validators(
                            line)
                        current_block.version = line[1]
                    current_block.rows.extend([
                        self.get_row_object(
                            line, row_type, row_number, block_number)])
                    continue
                # Body row.
                block_number = self.get_block_number(line, row_number)
                row = self.get_row_object(
                    line, row_type, row_number, block_number)
                if not current_block.type:
                    current_block.type = block_pb2.BODY
                    current_block.number = block_number
                current_block.rows.extend([row])
            except error.ValidationError as e:
                self.logger.error(e)
    finally:
        # Runs on normal exhaustion, on an unexpected exception, and when the
        # consumer closes the generator early, so the handle never leaks.
        tsv.close()
    # The last block has no following row to trigger the end-of-block check.
    yield current_block
def read_blocks_from_queue(self):
    """Returns a generator of the blocks in the queue.

    Standard input is consumed line by line as bytes. Lines are accumulated
    until a line containing constants.QUEUE_DELIMITER is seen; the accumulated
    lines (trailing newlines stripped) are then joined with newlines and
    parsed as one serialized block_pb2.Block. Lines that follow the last
    delimiter are not yielded.

    Override this method if you wish to change the queue (blocks
    transformation) form.

    Yields:
      Each yield is a single block object (block_pb2.Block).

    Raises:
      SystemExit: via sys.exit(-1), after writing a message to stderr, when
        the accumulated bytes cannot be decoded as a block.
    """
    # The messages are binary protocol buffers. Text-mode sys.stdin yields str
    # lines on Python 3, which cannot be stripped with or joined by bytes, so
    # read the underlying byte stream when there is one. Replacement stdin
    # objects without a `buffer` attribute are used as they are.
    stream = getattr(sys.stdin, 'buffer', sys.stdin)
    delimiter = constants.QUEUE_DELIMITER
    if not isinstance(delimiter, bytes):
        # Lets the membership test below work against bytes lines.
        delimiter = delimiter.encode('utf8')
    newline = b'\n'
    message_lines = []
    for line in stream:
        if delimiter in line:
            block = block_pb2.Block()
            try:
                block.ParseFromString(newline.join(message_lines))
            except message_mod.DecodeError:
                sys.stderr.write(
                    'ERROR: Can not read protocol buffer from queue. Is '
                    'human_readable perhaps set to true? I am not a human. '
                    'Aborting...\n')
                sys.exit(-1)
            yield block
            message_lines = []
        else:
            message_lines.append(line.rstrip(newline))
def _create_test_block(row_types):
    """Builds a BODY block holding one row per entry of row_types.

    The block is created with number=0 and file_number=1. Rows are added in
    the order given and numbered consecutively starting at 1.

    Args:
      row_types: Iterable of values assigned to each row's `type` field.

    Returns:
      The populated block_pb2.Block.
    """
    test_block = block_pb2.Block(
        type=block_pb2.BODY, number=0, file_number=1)
    for position, kind in enumerate(row_types, start=1):
        new_row = test_block.rows.add()
        new_row.type = kind
        new_row.row_number = position
    return test_block
def block_from_ascii(cls, text):
    """Builds a Block protobuf from its ASCII text representation.

    Args:
      text: The ASCII text merged into a new block_pb2.Block via
        text_format.Merge.

    Returns:
      The new block_pb2.Block after the merge.
    """
    parsed_block = block_pb2.Block()
    text_format.Merge(text, parsed_block)
    return parsed_block
def test_is_end_of_block_true(self):
    """An SU02 row checked against a freshly built Block ends the block."""
    row_cells = [
        'SU02', 'BL8', '11', 'SR1', 'AdSupport', 'NonInterStream',
    ]
    fresh_block = block_pb2.Block()
    file_parser = self._get_file_parser()
    ends_block = file_parser.is_end_of_block(
        row_cells, 'SU02', 5, fresh_block)
    self.assertTrue(ends_block)
def test_is_end_of_block_false(self):
    """A HEAD row checked against a freshly built Block does not end it."""
    head_cells = ['HEAD', '123']
    fresh_block = block_pb2.Block()
    file_parser = self._get_file_parser()
    ends_block = file_parser.is_end_of_block(
        head_cells, 'HEAD', 5, fresh_block)
    self.assertFalse(ends_block)