示例#1
0
    def deserialize_from_bytes(self, data_frame, column_serializers):
        """
        Deserialize all cells in the provided data frame from a bytes representation (inplace).

        A cell holding a non-empty list or set is replaced by a new list or set whose elements were deserialized
        one by one. Cells and elements that are falsy (None, empty values) or a numpy.float64 NaN are not passed
        to the deserializer; None is stored in their place.
        @param data_frame a pandas.DataFrame containing columns to deserialize
        @param column_serializers dict containing column names present in data_frame as keys and deserializer_ids as
                                  values. A deserializer_id should be the id of the java extension point on which the
                                  deserializer is registered. Each column identified by the dict keys is deserialized
                                  using the deserializer provided by the TypeExtensionManager for the given
                                  deserializer_id.
        """

        def deserialize_or_none(deserialize, raw):
            # Map one raw value to its deserialized form, or to None if it is NaN or falsy.
            if isinstance(raw, numpy.float64) and numpy.isnan(raw):
                return None
            return deserialize(raw) if raw else None

        row_count = len(data_frame)
        row_labels = data_frame.index
        for column in column_serializers:
            deserializer = self._type_extension_manager.get_deserializer_by_id(
                column_serializers[column])
            if row_count == 0:
                # No cells to convert. Skipping here means the column lookups below only run when there are rows.
                continue
            deserialize = deserializer.deserialize
            # Loop invariants: the column position and the column itself do not change while its rows are rewritten.
            col_idx = data_frame.columns.get_loc(column)
            column_values = data_frame[column]
            # Last percentage that was reported, so that every 5% step is logged exactly once per column.
            last_percent = -1
            for i in range(row_count):
                if debug_util.is_debug_enabled():
                    percent = i * 100 // row_count
                    if percent % 5 == 0 and percent != last_percent:
                        debug_util.debug_msg(
                            str(percent) + ' percent done (deserialize)')
                        last_percent = percent
                # Using bracket accessor is necessary here for ensuring that there are no unwanted type conversions.
                value = column_values[row_labels[i]]
                if isinstance(value, numpy.float64) and numpy.isnan(value):
                    value = None
                if not value:
                    data_frame.iat[i, col_idx] = None
                elif isinstance(value, list):
                    data_frame.iat[i, col_idx] = [
                        deserialize_or_none(deserialize, inner_value)
                        for inner_value in value
                    ]
                elif isinstance(value, set):
                    data_frame.iat[i, col_idx] = {
                        deserialize_or_none(deserialize, inner_value)
                        for inner_value in value
                    }
                else:
                    data_frame.iat[i, col_idx] = deserialize(value)
示例#2
0
 def serialize_objects_to_bytes(self, data_frame, column_serializers):
     """
     Serialize all cells in the provided data frame to a bytes representation (inplace).

     Columns whose dtype is not 'object' are converted to 'object' first. A cell holding a list or a set is
     replaced by a new list or set whose elements were serialized one by one; None elements stay None. Cells
     that are None are left untouched.
     @param data_frame a pandas.DataFrame containing columns to serialize
     @param column_serializers dict containing column names present in data_frame as keys and serializer_ids as
                               values.
                               A serializer_id should be the id of the java extension point on which the serializer
                               is registered. Each column identified by the dict keys is serialized using the
                               serializer provided by the TypeExtensionManager for the given serializer_id.
     """

     def serialize_or_none(serialize, item):
         # Serialize one collection element; None marks a missing element and is passed through.
         return None if item is None else serialize(item)

     row_count = len(data_frame)
     row_labels = data_frame.index
     for column in column_serializers:
         serializer = self._type_extension_manager.get_serializer_by_id(
             column_serializers[column])
         serialize = serializer.serialize
         col_idx = data_frame.columns.get_loc(column)
         if data_frame[column].dtype != 'object':
             data_frame[column] = data_frame[column].astype('object')
         # Fetch the column once, after the dtype conversion above; rows are only read before they are rewritten.
         column_values = data_frame[column]
         # Last percentage that was reported, so that every 5% step is logged exactly once per column.
         last_percent = -1
         for i in range(row_count):
             if debug_util.is_debug_enabled():
                 percent = i * 100 // row_count
                 if percent % 5 == 0 and percent != last_percent:
                     debug_util.debug_msg(
                         str(percent) + ' percent done (serialize)')
                     last_percent = percent
             # Using bracket accessor is necessary here for ensuring that there are
             # no unwanted type conversions
             value = column_values[row_labels[i]]
             if value is None:
                 continue
             if isinstance(value, list):
                 data_frame.iat[i, col_idx] = [
                     serialize_or_none(serialize, inner_value)
                     for inner_value in value
                 ]
             elif isinstance(value, set):
                 data_frame.iat[i, col_idx] = {
                     serialize_or_none(serialize, inner_value)
                     for inner_value in value
                 }
             else:
                 data_frame.iat[i, col_idx] = serialize(value)