def __init__(self, source_col, transform, stage=None):
    """
    Set up a LazyColumn representing source_col with transform applied.

    @param source_col is the original Column or LazyColumn that this object
           represents a transformed version of. Its original_col and
           identity_hash attributes are read here.
    @param transform should be a Transform object.
    @param stage is a string identifier for what this particular stage of
           transformation represents. This is so we can later go through a
           series of transforms on a Column and search for key
           transformation points. Stage names should be unique, so you
           cannot set a stage name that is the same as any LazyColumn that
           exists in our list of ancestors.
    @raise ValueError if stage is not None and equals the stage of any
           column in self.transformed_ancestors.
    """
    self._source_col = source_col
    self._transform = transform
    self._original_col = source_col.original_col

    # DataColumns don't have stage names, so search all the way up until
    # the end. An absent stage (None) can never collide with anything, so
    # the ancestor chain is only walked when a stage was actually given.
    if stage is not None:
        for lazy_col in self.transformed_ancestors:
            if stage == lazy_col.stage:
                raise ValueError(
                    "Cannot create LazyColumn with stage '%s' "
                    "because this identifier is used in ancestor"
                    " column %s " % (stage, lazy_col)
                )
    self._stage = stage

    # FIXME: The hashing mechanism for transforms needs work...
    # NOTE(review): the builtin hash() goes through transform.__hash__;
    # whether that value is stable across runs depends on Transform --
    # confirm before relying on identity_hash being persistent.
    self._identity_hash = hashhelper.data_hash(source_col.identity_hash,
                                               hash(transform))

    # _transformed_col says: Don't look at me from outside! I don't have a
    # hash, and throwing me around outside of this warm, cozy LazyColumn
    # could have bad and not immediately apparent effects on performance.
    self._transformed_col = None
def __init__(self, source_col, transform, stage=None):
    """
    Set up a LazyColumn representing source_col with transform applied.

    @param source_col is the original Column or LazyColumn that this object
           represents a transformed version of. Its original_col and
           identity_hash attributes are read here.
    @param transform should be a Transform object.
    @param stage is a string identifier for what this particular stage of
           transformation represents. This is so we can later go through a
           series of transforms on a Column and search for key
           transformation points. Stage names should be unique, so you
           cannot set a stage name that is the same as any LazyColumn that
           exists in our list of ancestors.
    @raise ValueError if stage is not None and equals the stage of any
           column in self.transformed_ancestors.
    """
    self._source_col = source_col
    self._transform = transform
    self._original_col = source_col.original_col

    # DataColumns don't have stage names, so search all the way up until
    # the end. A stage of None cannot clash with any ancestor, so skip the
    # walk entirely in that case instead of re-testing it per ancestor.
    if stage is not None:
        for ancestor in self.transformed_ancestors:
            if stage == ancestor.stage:
                raise ValueError(
                    "Cannot create LazyColumn with stage '%s' "
                    "because this identifier is used in ancestor"
                    " column %s " % (stage, ancestor)
                )
    self._stage = stage

    # FIXME: The hashing mechanism for transforms needs work...
    # NOTE(review): hash(transform) is the builtin hash, i.e. it uses
    # transform.__hash__ -- confirm that is what Transform intends here.
    self._identity_hash = hashhelper.data_hash(
        source_col.identity_hash,
        hash(transform)
    )

    # _transformed_col says: Don't look at me from outside! I don't have a
    # hash, and throwing me around outside of this warm, cozy LazyColumn
    # could have bad and not immediately apparent effects on performance.
    self._transformed_col = None
def value_hash(self):
    """
    Return a SHA1 hash of the contents of this DataColumn.

    The hash is computed from self._row_values with hashhelper.data_hash
    the first time it is requested (i.e. while self._value_hash is None)
    and stored in self._value_hash; later calls return the stored value.
    """
    memoised = self._value_hash
    if memoised is not None:
        # Already computed on an earlier call.
        return memoised

    self._value_hash = hashhelper.data_hash(self._row_values)
    return self._value_hash
def param_hash(self):
    """
    Return the hash of this object's parameters.

    The only state hashed is self._value_mapping, which is passed to
    hashhelper.data_hash; the result is recomputed on every call.
    """
    mapping = self._value_mapping
    return hashhelper.data_hash(mapping)
def hash(self):
    """
    Return a hash identifying both this object's class and its parameters.

    Combines hashhelper.source_hash(self.__class__) with self.param_hash()
    through hashhelper.data_hash. Nothing is cached; each call recomputes
    both parts.
    """
    # NOTE(review): source_hash presumably hashes the class's source code,
    # so that editing the class changes the result -- confirm in hashhelper.
    class_part = hashhelper.source_hash(self.__class__)
    param_part = self.param_hash()
    return hashhelper.data_hash(class_part, param_part)