Example #1
def batch_tokenizer(tokenizer, txtfile_location, by_line=False, ftfy=True):
    # just convert to the token ids, we will do adaptive padding at training time.
    with tf.io.gfile.GFile(txtfile_location, "rb") as f:
        if by_line:
            sources = [l.decode("utf-8") for l in f.readlines()]
        else:
            sources = [f.read().decode("utf-8")]
    if len(sources) <= 0:
        # tokenizer crashes when given an empty list, so give it an empty string
        # (this happens in --by_line mode for empty files)
        sources = ['']
    if ftfy:
        sources = [ftfy_text(source) for source in sources]
    uids = [farmhash.fingerprint64(source) for source in sources]
    batches = tokenizer.batch_encode_plus(
        sources,
        return_token_type_ids=True,
        pad_to_max_length=False,
        truncation=False,
        add_special_tokens=True,
        return_offsets_mapping=True,
        verbose=False,
    )

    return zip(
        uids,
        sources,
        batches["input_ids"],
        [[start for start, end in offsets]
         for offsets in batches["offset_mapping"]],
        [[end for start, end in offsets]
         for offsets in batches["offset_mapping"]],
    )
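A minimal usage sketch for the function above, assuming a HuggingFace fast tokenizer (required for `return_offsets_mapping`), the `farmhash` package, and TensorFlow for `tf.io.gfile`; the model name and file path are illustrative, and `ftfy=False` avoids the `ftfy_text` helper that is not shown in the excerpt.

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
for uid, text, input_ids, starts, ends in batch_tokenizer(
        tokenizer, "corpus.txt", by_line=True, ftfy=False):
    # uid is the 64-bit farmhash fingerprint of the raw source text.
    print(uid, len(input_ids))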
Example #2
    def variants_to_features(sample_variants):
        """Convert variant calls to TensorFlow features.

    See also
    https://www.tensorflow.org/versions/r0.10/how_tos/reading_data/index.html

    Args:
      sample_variants: the sample's variant calls

    Returns:
      A dictionary of TensorFlow features.
    """
        variants_by_feature = collections.defaultdict(list)
        for variant in sample_variants:
            feature_name = variant_to_feature_name_fn(variant)
            words = variant_to_words_fn(variant)
            variants_by_feature[feature_name].extend(
                # fingerprint64 returns an unsigned int64 but int64 features are
                # signed.  Convert from unsigned to signed.
                [
                    struct.unpack('q',
                                  struct.pack('Q',
                                              farmhash.fingerprint64(w)))[0]
                    for w in words
                ])

        # Fill in features from variants.
        features = {}
        for feature, variants in variants_by_feature.items():
            if variants:
                features['variants_' + feature] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=variants))

        return features
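A standalone sketch of the unsigned-to-signed conversion described in the comment above: the struct round-trip reinterprets the unsigned 64-bit fingerprint as a signed int64, and the arithmetic form next to it is equivalent; the input string is made up.

import struct

import farmhash

u = farmhash.fingerprint64("chr1:12345:A>T")                 # unsigned 64-bit value
signed_struct = struct.unpack('q', struct.pack('Q', u))[0]   # reinterpret bits as signed
signed_arith = u - (1 << 64) if u >= (1 << 63) else u        # same value, arithmetically
assert signed_struct == signed_arith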
Example #3
    def _hash_to_float(self,
                       input_string: str,
                       hash_range: Tuple[float, float],
                       precision: int = tf.int32.max) -> float:
        """Hashes a string and returns a `float`.

    `hash_range` is evenly divided into a number of buckets. The hashed value of
    `input_string` is mapped to one of these buckets.

    TODO(b/158684105): Update this function to directly map the hash to the
    index rather than converting hash -> float -> index.

    Args:
      input_string: An input string.
      hash_range: A tuple representing the range for the hashed value.
      precision: The number of buckets in `hash_range`. Must be a positive
        integer.

    Returns:
      A float value being the lower bound of the bucket that the hashed string
      value falls into.
    """
        (low, high) = hash_range
        hashed_value = farmhash.fingerprint64(input_string)
        hashed_value = hashed_value % precision
        hashed_value = ((float(hashed_value) / precision) * (high - low)) + low
        return hashed_value
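A standalone sketch of the same bucketing logic, written without the class so it can be run directly; the default precision mirrors tf.int32.max, and the arguments are illustrative.

import farmhash

def hash_to_float(s, low, high, precision=2**31 - 1):
    # Map the fingerprint to one of `precision` buckets, then return the lower
    # bound of that bucket inside [low, high).
    bucket = farmhash.fingerprint64(s) % precision
    return (float(bucket) / precision) * (high - low) + low

print(hash_to_float("user_42", 0.0, 1.0))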
Example #4
def _generate_unsigned_hash_code(strings, max_hash_value=sys.maxsize):
    # type: (List[str], int) -> int
    """Generates a forever-fixed hash code for `strings`.

  The hash code generated is in the range [0, max_hash_value). Note that the
  hash code generated by farmhash.fingerprint64 is unsigned.
  """
    return farmhash.fingerprint64(json.dumps(strings)) % max_hash_value
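A quick illustrative check of the helper above: the hash depends only on the JSON serialization of the list, so identical string lists always produce identical codes (the strings here are made up, and `sys` is already required by the default argument).

code = _generate_unsigned_hash_code(["chr1", "12345", "A", "T"])
assert code == _generate_unsigned_hash_code(["chr1", "12345", "A", "T"])
assert 0 <= code < sys.maxsize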
Example #5
def get_data_split(fl_date):
    fl_date_str = str(fl_date)
    # Use farm fingerprint just like in BigQuery
    x = np.abs(
        np.uint64(farmhash.fingerprint64(fl_date_str)).astype('int64') % 100)
    if x < 60:
        data_split = 'TRAIN'
    elif x < 80:
        data_split = 'VALIDATE'
    else:
        data_split = 'TEST'
    return data_split
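The comment above refers to BigQuery's FARM_FINGERPRINT function, which computes the same 64-bit farmhash value, so a row with a given FL_DATE lands in the same split whether the hashing is done in SQL or in Python. A small determinism check, assuming `numpy` and `farmhash` are imported as in the surrounding module (the dates are made up):

for d in ['2015-03-02', '2016-07-14', '2018-11-30']:
    assert get_data_split(d) == get_data_split(d)   # same date, same split, every run
    print(d, get_data_split(d))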
Example #6
def create_fingerprint():
  """Create a unique fingerprint for the running process.

  The main function should call this function once to create a unique
  fingerprint for that run.

  Returns:
    Unique fingerprint for this run.
  """
  build_run_str = (
      build_data.BuildData() + 'time: %d' % (int(round(time.time()))))
  return farmhash.fingerprint64(build_run_str)
Example #7
def Fingerprint(theorem):
    """Compute a unique, stable fingerprint for theorem objects.

  Args:
    theorem: proof_assistant_pb2.Theorem object

  Returns:
    62 bit non-negative integer fingerprint. Note that we truncate to 62 bits
    for OCaml compatibility. OCaml uses 63 bit signed integers.
  """
    if not theorem.HasField('conclusion') and theorem.HasField('fingerprint'):
        return theorem.fingerprint
    fp = farmhash.fingerprint64(theorem.conclusion)
    for hypothesis in theorem.hypotheses:
        tmp = farmhash.fingerprint64(hypothesis)
        fp = _PairFingerprint(fp, tmp)
    result = fp & MASK62
    assert (not theorem.HasField('fingerprint')
            or theorem.fingerprint == result), (
                'Inconsistent fingerprints %d != %d in Theorem protobuf.' %
                (result, theorem.fingerprint))
    return result
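MASK62 and _PairFingerprint are defined elsewhere in the source module and are not part of this excerpt. A definition of the mask consistent with the docstring's 62-bit truncation would be (assumed, not taken from the source):

MASK62 = (1 << 62) - 1  # assumed: keep only the low 62 bits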
Example #8
def _compute_skew_for_features(
        training_feature: tf.train.Feature, serving_feature: tf.train.Feature,
        float_round_ndigits: Optional[int],
        feature_name: str) -> feature_skew_results_pb2.FeatureSkew:
    """Computes feature skew for a pair of training and serving features.

  Args:
    training_feature: The feature to compare from the training example.
    serving_feature: The feature to compare from the serving example.
    float_round_ndigits: Number of digits precision after the decimal point to
      which to round float values before comparison.
    feature_name: The name of the feature for which to compute skew between the
      examples.

  Returns:
    A FeatureSkew proto containing information about skew for the specified
      feature.
  """
    skew_results = feature_skew_results_pb2.FeatureSkew()
    skew_results.feature_name = feature_name
    if training_feature is not None and serving_feature is not None:
        skew_results.training_count = 1
        skew_results.serving_count = 1
        if (farmhash.fingerprint64(
                _get_serialized_feature(
                    training_feature,
                    float_round_ndigits)) == farmhash.fingerprint64(
                        _get_serialized_feature(serving_feature,
                                                float_round_ndigits))):
            skew_results.match_count = 1
        else:
            skew_results.mismatch_count = 1
    elif training_feature is not None:
        skew_results.training_count = 1
        skew_results.training_only = 1
    elif serving_feature is not None:
        skew_results.serving_count = 1
        skew_results.serving_only = 1
    return skew_results
Example #9
def get_data_split_2019(fl_date):
    fl_date_str = str(fl_date)
    if fl_date_str > '2019':
        data_split = 'TEST'
    else:
        # Use farm fingerprint just like in BigQuery
        x = np.abs(
            np.uint64(farmhash.fingerprint64(fl_date_str)).astype('int64') %
            100)
        if x < 95:
            data_split = 'TRAIN'
        else:
            data_split = 'VALIDATE'
    return data_split
Example #10
def hashfile(job):
    sources, dst = job
    count = 0
    with tf.io.gfile.GFile(dst, "w") as wf:
        for src in sources:
            count += 1
            with tf.io.gfile.GFile(src, "rb") as fd:
                message_bytes = fd.read()
                base64_bytes = base64.b64encode(message_bytes)
                # there is no relationship here between these two methods;
                # the base64 is just a trick that lets us dedup gz/compressed files
                hashvalue = farmhash.fingerprint64(base64_bytes.decode("ascii"))
                wf.write("%s\t%d\n" % (src, hashvalue))
    return dst, count
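A usage sketch for the function above: `job` is a `(source_paths, destination_path)` pair, and the output file gets one tab-separated `path<TAB>fingerprint` line per source; the paths are illustrative.

dst, count = hashfile((["data/part-000.gz", "data/part-001.gz"], "hashes.tsv"))
print("wrote %d hashes to %s" % (count, dst))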
Example #11
 def process(
         self, example: tf.train.Example
 ) -> Iterable[Tuple[str, tf.train.Example]]:
     serialized_feature_values = []
     for identifier_feature in self._identifier_features:
         feature = example.features.feature.get(identifier_feature)
         if feature is None:
             _EXAMPLES_WITH_MISSING_IDENTIFIER_COUNTER.inc()
             return
         else:
             serialized_feature_values.append(
                 _get_serialized_feature(feature,
                                         self._float_round_ndigits))
     yield (str(farmhash.fingerprint64("".join(serialized_feature_values))),
            example)
Example #12
    def get_hash_indices(self, data_strings: List[str]) -> List[List[int]]:
        """Computes the indices at which `data_strings` in IBLT.

    Args:
      data_strings: A list of strings to be hashed.

    Returns:
      hash_indices: For each string in `data_strings`, a list of `repetitions`
      hash values, each in {0, ..., `table_size` - 1}.
    """
        all_hash_indices = []
        for data_string in data_strings:
            hash_indices = []
            for i in range(self._repetitions):
                hash_indices.append(
                    farmhash.fingerprint64(str(self._salt[i]) + data_string) %
                    self._table_size)
            all_hash_indices.append(hash_indices)

        return all_hash_indices
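A standalone sketch of the salting idea used above: `repetitions` independent hash functions are simulated by prefixing the string with a per-repetition salt before fingerprinting; the salts and table size are illustrative.

import farmhash

def hash_indices(data_string, salts, table_size):
    return [farmhash.fingerprint64(str(salt) + data_string) % table_size
            for salt in salts]

print(hash_indices("hello", salts=[0, 1, 2], table_size=1000))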
Example #13
def batch_tokenizer(tokenizer, txtfile_location, strategy="file"):
    # just convert to the token ids, we will do adaptive padding at training time.
    sources = lm.parsers.parse_url(txtfile_location, strategy=strategy)
    if len(sources) == 0:
        return
    uids = [farmhash.fingerprint64(source) for source in sources]
    batches = tokenizer.batch_encode_plus(
        sources,
        return_token_type_ids=True,
        pad_to_max_length=False,
        truncation=False,
        add_special_tokens=True,
        return_offsets_mapping=True,
        verbose=False,
    )

    yield from zip(
        uids,
        sources,
        batches["input_ids"],
        [[start for start, end in offsets] for offsets in batches["offset_mapping"]],
        [[end for start, end in offsets] for offsets in batches["offset_mapping"]],
    )
Example #14
    def sample_variants_to_example(self, sample, sample_variants,
                                   samples_metadata):
        """Convert variant calls to TensorFlow Example protocol buffers.

    See also
    https://www.tensorflow.org/versions/r0.10/how_tos/reading_data/index.html

    Args:
      sample: the identifier for the sample
      sample_variants: the sample's variant calls
      samples_metadata: dictionary of metadata for all samples

    Returns:
      A filled in TensorFlow Example proto for this sample.
    """
        # Some samples may have no metadata, but we may still want to preprocess
        # the data for prediction use cases.
        metadata = defaultdict(lambda: self.NA_STRING)
        if sample in samples_metadata:
            metadata.update(samples_metadata[sample])

        variants_by_feature = collections.defaultdict(list)
        for variant in sample_variants:
            feature_name = self.variant_to_feature_name(variant)
            words = self.variant_to_words(variant)
            variants_by_feature[feature_name].extend(
                # fingerprint64 returns an unsigned int64 but int64 features are
                # signed.  Convert from unsigned to signed.
                [
                    struct.unpack('q',
                                  struct.pack('Q',
                                              farmhash.fingerprint64(w)))[0]
                    for w in words
                ])

        features = {
            'sample_name':
            util.bytes_feature(str(sample)),
            # Normalize population to integer or NA_INTEGER if no match.
            'population':
            util.int64_feature(self.POPULATION_MAP[str(
                metadata[self.POPULATION_COLUMN])]),
            # Use verbatim value of population.
            'population_string':
            util.bytes_feature(str(metadata[self.POPULATION_COLUMN])),
            # Normalize super population to integer or NA_INTEGER if no match.
            'super_population':
            util.int64_feature(self.SUPER_POPULATION_MAP[str(
                metadata[self.SUPER_POPULATION_COLUMN])]),
            # Use verbatim value of super population.
            'super_population_string':
            util.bytes_feature(str(metadata[self.SUPER_POPULATION_COLUMN])),
            # Normalize sex/gender to integer or NA_INTEGER if no match.
            'gender':
            util.int64_feature(self.GENDER_MAP[str(
                metadata[self.GENDER_COLUMN])]),
            # Use verbatim value of sex/gender.
            'gender_string':
            util.bytes_feature(str(metadata[self.GENDER_COLUMN]))
        }

        for feature, variants in variants_by_feature.items():
            if variants:
                features['variants_' + feature] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=variants))

        return tf.train.Example(features=tf.train.Features(feature=features))
Example #15
def to_node_id(sexp: Text) -> NodeID:
  return NodeID(farmhash.fingerprint64(sexp))