def batch_tokenizer(tokenizer, txtfile_location, by_line=False, ftfy=True):
    # Just convert to the token ids; we will do adaptive padding at training time.
    with tf.io.gfile.GFile(txtfile_location, "rb") as f:
        if by_line:
            sources = [l.decode("utf-8") for l in f.readlines()]
        else:
            sources = [f.read().decode("utf-8")]

    if len(sources) <= 0:
        # The tokenizer crashes when given an empty list, so give it an empty string
        # (this happens in --by_line mode for empty files).
        sources = ['']

    if ftfy:
        sources = [ftfy_text(source) for source in sources]

    uids = [farmhash.fingerprint64(source) for source in sources]

    batches = tokenizer.batch_encode_plus(
        sources,
        return_token_type_ids=True,
        pad_to_max_length=False,
        truncation=False,
        add_special_tokens=True,
        return_offsets_mapping=True,
        verbose=False,
    )

    return zip(
        uids,
        sources,
        batches["input_ids"],
        [[start for start, end in offsets] for offsets in batches["offset_mapping"]],
        [[end for start, end in offsets] for offsets in batches["offset_mapping"]],
    )
def variants_to_features(sample_variants):
    """Convert variant calls to TensorFlow features.

    See also
    https://www.tensorflow.org/versions/r0.10/how_tos/reading_data/index.html

    Args:
      sample_variants: the sample's variant calls

    Returns:
      A dictionary of TensorFlow features.
    """
    variants_by_feature = collections.defaultdict(list)
    for variant in sample_variants:
        feature_name = variant_to_feature_name_fn(variant)
        words = variant_to_words_fn(variant)
        variants_by_feature[feature_name].extend(
            # fingerprint64 returns an unsigned int64 but int64 features are
            # signed. Convert from unsigned to signed.
            [
                struct.unpack('q', struct.pack('Q', farmhash.fingerprint64(w)))[0]
                for w in words
            ])

    # Fill in features from variants.
    features = {}
    for feature, variants in variants_by_feature.items():
        if variants:
            features['variants_' + feature] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=variants))

    return features
def _hash_to_float(self,
                   input_string: str,
                   hash_range: Tuple[float, float],
                   precision: int = tf.int32.max) -> float:
    """Hashes a string and returns a `float`.

    `hash_range` is evenly divided into a number of buckets. The hashed value
    of `input_string` is mapped to one of these buckets.

    TODO(b/158684105): Update this function to directly map the hash to the
    index rather than converting hash -> float -> index.

    Args:
      input_string: An input string.
      hash_range: A tuple representing the range for the hashed value.
      precision: The number of buckets in `hash_range`. Must be a positive
        integer.

    Returns:
      A float value that is the lower bound of the bucket that the hashed
      string value falls into.
    """
    (low, high) = hash_range
    hashed_value = farmhash.fingerprint64(input_string)
    hashed_value = hashed_value % precision
    hashed_value = ((float(hashed_value) / precision) * (high - low)) + low
    return hashed_value
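A minimal standalone sketch of the same bucketing idea, assuming the `farmhash` package is installed. The free function `hash_to_float` below is hypothetical and simply mirrors the method above outside of its class; the default of 2**31 - 1 matches `tf.int32.max`.

import farmhash

def hash_to_float(input_string, hash_range, precision=2**31 - 1):
    # Map the 64-bit fingerprint into one of `precision` buckets, then scale
    # the bucket index into [low, high).
    low, high = hash_range
    bucket = farmhash.fingerprint64(input_string) % precision
    return (float(bucket) / precision) * (high - low) + low

# The same string always lands in the same bucket, so the value is stable
# across runs and machines.
print(hash_to_float("user_12345", (0.0, 1.0)))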
def _generate_unsigned_hash_code(strings, max_hash_value=sys.maxsize):
    # type: (List[str], int) -> int
    """Generates a forever-fixed hash code for `strings`.

    The hash code generated is in the range [0, max_hash_value). Note that the
    hash code generated by farmhash.fingerprint64 is unsigned.
    """
    return farmhash.fingerprint64(json.dumps(strings)) % max_hash_value
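A hedged usage sketch, assuming `farmhash`, `json`, and `sys` are importable; the list of names is illustrative only. It demonstrates that hashing the JSON-serialized list yields the same code on every run.

import json
import sys

import farmhash

names = ["gene_a", "gene_b"]
code = farmhash.fingerprint64(json.dumps(names)) % sys.maxsize
# farmhash.fingerprint64 is deterministic across processes and machines,
# so `code` stays fixed as long as the serialized list is byte-identical.
print(code)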
def get_data_split(fl_date):
    fl_date_str = str(fl_date)
    # Use farm fingerprint just like in BigQuery.
    x = np.abs(
        np.uint64(farmhash.fingerprint64(fl_date_str)).astype('int64') % 100)
    if x < 60:
        data_split = 'TRAIN'
    elif x < 80:
        data_split = 'VALIDATE'
    else:
        data_split = 'TEST'
    return data_split
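A small check of the 60/20/20 split, assuming the `get_data_split` function above is in scope and that `np` and `farmhash` refer to the imports shown below; the dates are illustrative only.

import collections

import farmhash
import numpy as np

counts = collections.Counter(
    get_data_split(f"2015-01-{day:02d}") for day in range(1, 32))
# Roughly 60% of dates should hash into TRAIN, 20% into VALIDATE, and 20%
# into TEST, and a given date always lands in the same split.
print(counts)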
def create_fingerprint():
    """Create a unique fingerprint for the running process.

    The main function should call this function once to create a unique
    fingerprint for that run.

    Returns:
      Unique fingerprint for this run.
    """
    build_run_str = (
        build_data.BuildData() + 'time: %d' % (int(round(time.time()))))
    return farmhash.fingerprint64(build_run_str)
def Fingerprint(theorem):
    """Compute a unique, stable fingerprint for theorem objects.

    Args:
      theorem: proof_assistant_pb2.Theorem object

    Returns:
      62 bit non-negative integer fingerprint. Note that we truncate to 62 bits
      for OCaml compatibility. OCaml uses 63 bit signed integers.
    """
    if not theorem.HasField('conclusion') and theorem.HasField('fingerprint'):
        return theorem.fingerprint
    fp = farmhash.fingerprint64(theorem.conclusion)
    for hypothesis in theorem.hypotheses:
        tmp = farmhash.fingerprint64(hypothesis)
        fp = _PairFingerprint(fp, tmp)
    result = fp & MASK62
    assert (not theorem.HasField('fingerprint') or
            theorem.fingerprint == result), (
        'Inconsistent fingerprints %d != %d in Theorem protobuf.' %
        (result, theorem.fingerprint))
    return result
def _compute_skew_for_features(
        training_feature: tf.train.Feature,
        serving_feature: tf.train.Feature,
        float_round_ndigits: Optional[int],
        feature_name: str) -> feature_skew_results_pb2.FeatureSkew:
    """Computes feature skew for a pair of training and serving features.

    Args:
      training_feature: The feature to compare from the training example.
      serving_feature: The feature to compare from the serving example.
      float_round_ndigits: Number of digits precision after the decimal point to
        which to round float values before comparison.
      feature_name: The name of the feature for which to compute skew between
        the examples.

    Returns:
      A FeatureSkew proto containing information about skew for the specified
      feature.
    """
    skew_results = feature_skew_results_pb2.FeatureSkew()
    skew_results.feature_name = feature_name
    if training_feature is not None and serving_feature is not None:
        skew_results.training_count = 1
        skew_results.serving_count = 1
        if (farmhash.fingerprint64(
                _get_serialized_feature(training_feature, float_round_ndigits))
                == farmhash.fingerprint64(
                    _get_serialized_feature(serving_feature,
                                            float_round_ndigits))):
            skew_results.match_count = 1
        else:
            skew_results.mismatch_count = 1
    elif training_feature is not None:
        skew_results.training_count = 1
        skew_results.training_only = 1
    elif serving_feature is not None:
        skew_results.serving_count = 1
        skew_results.serving_only = 1
    return skew_results
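A hedged sketch of the comparison at the heart of the function above: hashing a stable serialization of two features and comparing fingerprints as a cheap equality check. `_get_serialized_feature` is not shown in the snippet, so `str()` of the proto stands in here as an assumption; it requires `tensorflow` and `farmhash`.

import farmhash
import tensorflow as tf

a = tf.train.Feature(float_list=tf.train.FloatList(value=[1.0, 2.0]))
b = tf.train.Feature(float_list=tf.train.FloatList(value=[1.0, 2.0]))
# Identical feature values serialize to the same text and therefore hash to
# the same 64-bit fingerprint.
fp_a = farmhash.fingerprint64(str(a))
fp_b = farmhash.fingerprint64(str(b))
print(fp_a == fp_b)  # True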
def get_data_split_2019(fl_date):
    fl_date_str = str(fl_date)
    if fl_date_str > '2019':
        data_split = 'TEST'
    else:
        # Use farm fingerprint just like in BigQuery.
        x = np.abs(
            np.uint64(farmhash.fingerprint64(fl_date_str)).astype('int64') % 100)
        if x < 95:
            data_split = 'TRAIN'
        else:
            data_split = 'VALIDATE'
    return data_split
def hashfile(job):
    sources, dst = job
    count = 0
    with tf.io.gfile.GFile(dst, "w") as wf:
        for src in sources:
            count += 1
            with tf.io.gfile.GFile(src, "rb") as fd:
                message_bytes = fd.read()
                base64_bytes = base64.b64encode(message_bytes)
                # There is no relationship here between these two methods;
                # the base64 encoding is a trick that lets us dedup gz/compressed files.
                hashvalue = farmhash.fingerprint64(base64_bytes.decode("ascii"))
                wf.write("%s\t%d\n" % (src, hashvalue))
    return dst, count
def process(
        self, example: tf.train.Example
) -> Iterable[Tuple[str, tf.train.Example]]:
    serialized_feature_values = []
    for identifier_feature in self._identifier_features:
        feature = example.features.feature.get(identifier_feature)
        if feature is None:
            _EXAMPLES_WITH_MISSING_IDENTIFIER_COUNTER.inc()
            return
        else:
            serialized_feature_values.append(
                _get_serialized_feature(feature, self._float_round_ndigits))
    yield (str(farmhash.fingerprint64("".join(serialized_feature_values))),
           example)
def get_hash_indices(self, data_strings: List[str]) -> List[List[int]]:
    """Computes the indices at which `data_strings` are stored in the IBLT.

    Args:
      data_strings: A list of strings to be hashed.

    Returns:
      hash_indices: For each string, a list of `repetitions` hash values,
        each in {0, ..., `table_size` - 1}.
    """
    all_hash_indices = []
    for data_string in data_strings:
        hash_indices = []
        for i in range(self._repetitions):
            hash_indices.append(
                farmhash.fingerprint64(str(self._salt[i]) + data_string) %
                self._table_size)
        all_hash_indices.append(hash_indices)
    return all_hash_indices
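A standalone sketch of the salted multi-hash indexing used above, assuming only the `farmhash` package; the salts and table size are hypothetical parameters, not values from the original class.

import farmhash

# Hypothetical parameters: three salted hash functions over a table of 1000 cells.
salts = [0, 1, 2]
table_size = 1000

def hash_indices(data_string):
    # Prepending a distinct salt to the string yields len(salts) different
    # positions for the same key, as in the method above.
    return [
        farmhash.fingerprint64(str(salt) + data_string) % table_size
        for salt in salts
    ]

print(hash_indices("hello"))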
def batch_tokenizer(tokenizer, txtfile_location, strategy="file"):
    # Just convert to the token ids; we will do adaptive padding at training time.
    sources = lm.parsers.parse_url(txtfile_location, strategy=strategy)
    if len(sources) == 0:
        return
    uids = [farmhash.fingerprint64(source) for source in sources]
    batches = tokenizer.batch_encode_plus(
        sources,
        return_token_type_ids=True,
        pad_to_max_length=False,
        truncation=False,
        add_special_tokens=True,
        return_offsets_mapping=True,
        verbose=False,
    )
    yield from zip(
        uids,
        sources,
        batches["input_ids"],
        [[start for start, end in offsets] for offsets in batches["offset_mapping"]],
        [[end for start, end in offsets] for offsets in batches["offset_mapping"]],
    )
def sample_variants_to_example(self, sample, sample_variants, samples_metadata):
    """Convert variant calls to TensorFlow Example protocol buffers.

    See also
    https://www.tensorflow.org/versions/r0.10/how_tos/reading_data/index.html

    Args:
      sample: the identifier for the sample
      sample_variants: the sample's variant calls
      samples_metadata: dictionary of metadata for all samples

    Returns:
      A filled in TensorFlow Example proto for this sample.
    """
    # Some samples may have no metadata, but we may still want to preprocess
    # the data for prediction use cases.
    metadata = defaultdict(lambda: self.NA_STRING)
    if sample in samples_metadata:
        metadata.update(samples_metadata[sample])

    variants_by_feature = collections.defaultdict(list)
    for variant in sample_variants:
        feature_name = self.variant_to_feature_name(variant)
        words = self.variant_to_words(variant)
        variants_by_feature[feature_name].extend(
            # fingerprint64 returns an unsigned int64 but int64 features are
            # signed. Convert from unsigned to signed.
            [
                struct.unpack('q', struct.pack('Q', farmhash.fingerprint64(w)))[0]
                for w in words
            ])

    features = {
        'sample_name':
            util.bytes_feature(str(sample)),
        # Normalize population to integer or NA_INTEGER if no match.
        'population':
            util.int64_feature(self.POPULATION_MAP[str(
                metadata[self.POPULATION_COLUMN])]),
        # Use verbatim value of population.
        'population_string':
            util.bytes_feature(str(metadata[self.POPULATION_COLUMN])),
        # Normalize super population to integer or NA_INTEGER if no match.
        'super_population':
            util.int64_feature(self.SUPER_POPULATION_MAP[str(
                metadata[self.SUPER_POPULATION_COLUMN])]),
        # Use verbatim value of super population.
        'super_population_string':
            util.bytes_feature(str(metadata[self.SUPER_POPULATION_COLUMN])),
        # Normalize sex/gender to integer or NA_INTEGER if no match.
        'gender':
            util.int64_feature(self.GENDER_MAP[str(
                metadata[self.GENDER_COLUMN])]),
        # Use verbatim value of sex/gender.
        'gender_string':
            util.bytes_feature(str(metadata[self.GENDER_COLUMN]))
    }

    for feature, variants in variants_by_feature.items():
        if variants:
            features['variants_' + feature] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=variants))

    return tf.train.Example(features=tf.train.Features(feature=features))
def to_node_id(sexp: Text) -> NodeID:
    return NodeID(farmhash.fingerprint64(sexp))
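A self-contained sketch of the snippet above, assuming `Text` is `str` and `NodeID` is a `NewType` over `int`; those definitions are assumptions, since the snippet does not show them.

from typing import NewType

import farmhash

Text = str
NodeID = NewType("NodeID", int)  # assumed definition for this sketch

def to_node_id(sexp: Text) -> NodeID:
    # The 64-bit fingerprint of an s-expression string serves as a stable node id.
    return NodeID(farmhash.fingerprint64(sexp))

print(to_node_id("(f x y)"))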