Example #1
    def fit(self, data_pack: DataPack, verbose: int = 1):
        """
        Fit pre-processing context for transformation.

        :param data_pack: Data pack to be preprocessed.
        :param verbose: Verbosity.
        :return: :class:`DSSMPreprocessor` instance.
        """
        DEBUG = False
        if DEBUG:
            # Diagnostic path: build a vocabulary with the old processing
            # units (tokenize, remove stop words, etc.) but without the
            # letter n-gram hashing, to compare vocabulary sizes.
            func2 = chain_transform(self.old_units())
            data_packx = data_pack.apply_on_text(func2, verbose=verbose)
            vocab_unit2 = build_vocab_unit(data_packx, verbose=verbose)
            vocab_size_without_letter_ngram = len(
                vocab_unit2.state['term_index']) + 1
            print("Vocab size without letter n-grams:",
                  vocab_size_without_letter_ngram)

        func = chain_transform(self._default_units())
        data_pack = data_pack.apply_on_text(func, verbose=verbose)
        vocab_unit = build_vocab_unit(data_pack, verbose=verbose)

        self._context['vocab_unit'] = vocab_unit
        vocab_size = len(vocab_unit.state['term_index']) + 1
        if DEBUG:
            print("Vocab size with letter n-grams:", vocab_size)
        self._context['vocab_size'] = vocab_size
        self._context['embedding_input_dim'] = vocab_size
        self._context['input_shapes'] = [(vocab_size, ), (vocab_size, )]
        return self
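Below is a minimal usage sketch of how a fit call like this is typically driven, assuming a MatchZoo 2.x style API (matchzoo, mz.datasets.toy.load_data, mz.preprocessors.DSSMPreprocessor); exact module paths and signatures may differ between versions.

import matchzoo as mz

# Load the toy DataPack bundled with MatchZoo and fit the preprocessor on it.
train_pack = mz.datasets.toy.load_data()
preprocessor = mz.preprocessors.DSSMPreprocessor()
preprocessor.fit(train_pack, verbose=0)

# The fitted context now holds the vocabulary unit and the derived sizes.
print(preprocessor.context['vocab_size'])
print(preprocessor.context['input_shapes'])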
Example #2
    def reorganize_data_pack(cls,
                             data_pack: DataPack,
                             num_dup: int = 1,
                             num_neg: int = 1):
        """Re-organize the data pack as pair-wise format.

        :param data_pack: the input :class:`DataPack`.
        :param num_dup: number of duplicates for each positive sample.
        :param num_neg: number of negative samples associated with each
            positive sample.
        :return: the reorganized :class:`DataPack` object.
        """
        pairs = []
        groups = data_pack.relation.sort_values(
            'label', ascending=False).groupby('id_left')
        for idx, group in groups:
            labels = group.label.unique()
            for label in labels[:-1]:
                pos_samples = group[group.label == label]
                pos_samples = pd.concat([pos_samples] * num_dup)
                neg_samples = group[group.label < label]
                for _, pos_sample in pos_samples.iterrows():
                    pos_sample = pd.DataFrame([pos_sample])
                    neg_sample = neg_samples.sample(num_neg, replace=True)
                    pairs.extend((pos_sample, neg_sample))
        new_relation = pd.concat(pairs, ignore_index=True)
        return DataPack(relation=new_relation,
                        left=data_pack.left.copy(),
                        right=data_pack.right.copy())
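For illustration, the same pair-wise grouping logic can be exercised on a small hand-made relation table with plain pandas; the column names (id_left, id_right, label) mirror the DataPack.relation layout used above, and the table itself is made up.

import pandas as pd

relation = pd.DataFrame({
    'id_left':  ['q1', 'q1', 'q1', 'q2', 'q2'],
    'id_right': ['d1', 'd2', 'd3', 'd4', 'd5'],
    'label':    [1, 0, 0, 1, 0],
})

pairs = []
groups = relation.sort_values('label', ascending=False).groupby('id_left')
for _, group in groups:
    labels = group.label.unique()
    for label in labels[:-1]:                    # skip the lowest label
        pos_samples = group[group.label == label]
        neg_samples = group[group.label < label]
        for _, pos_sample in pos_samples.iterrows():
            pairs.append(pd.DataFrame([pos_sample]))
            pairs.append(neg_samples.sample(1, replace=True))

# Each positive row is now followed by one sampled lower-label row.
print(pd.concat(pairs, ignore_index=True))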
Example #3
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data, creating a `tri-letter` representation.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        data_pack = data_pack.copy()
        units_ = self._default_units()
        if self._with_word_hashing:
            term_index = self._context['vocab_unit'].state['term_index']
            units_.append(units.WordHashing(term_index))
        func = chain_transform(units_)
        data_pack.apply_on_text(func, inplace=True, verbose=verbose)
        return data_pack
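As a side note on the `tri-letter` representation mentioned in the docstring, the sketch below shows the letter-trigram idea on a single word in plain Python; it only illustrates the representation and is not MatchZoo's WordHashing unit.

def tri_letters(word: str) -> list:
    """Pad a word with '#' markers and slide a window of width three."""
    padded = f'#{word}#'
    return [padded[i:i + 3] for i in range(len(padded) - 2)]

print(tri_letters('word'))   # ['#wo', 'wor', 'ord', 'rd#']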
Example #4
    def fit(self, data_pack: DataPack, verbose: int = 1):
        """
        Fit pre-processing context for transformation.

        :param data_pack: Data pack to be preprocessed.
        :param verbose: Verbosity.
        :return: :class:`DSSMPreprocessor` instance.
        """

        func = chain_transform(self._default_units())
        data_pack = data_pack.apply_on_text(func, verbose=verbose)
        vocab_unit = build_vocab_unit(data_pack, verbose=verbose)

        self._context['vocab_unit'] = vocab_unit
        vocab_size = len(vocab_unit.state['term_index'])
        self._context['vocab_size'] = vocab_size
        self._context['embedding_input_dim'] = vocab_size
        return self
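The context values filled in here are typically consumed when sizing the model's embedding layer; the sketch below uses a plain dict and NumPy as a stand-in for the fitted context, so the numbers and wiring are illustrative only.

import numpy as np

# Stand-in for preprocessor.context after fit().
context = {'vocab_size': 10000, 'embedding_input_dim': 10000}

embedding_dim = 300
embedding_matrix = np.random.normal(
    size=(context['embedding_input_dim'], embedding_dim))
print(embedding_matrix.shape)   # (10000, 300)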