Пример #1
0
 def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
     """
     Build the vectorization steps for this field plus a feature-name getter.

     The steps are, in order: 'item_select', 'clean', 'vect' (word-level
     CountVectorizer with unicode accent stripping and English stop words)
     and 'tfidf'.

     :return: tuple of (list of ``(step_name, transformer)`` pairs,
              the result of ``self._wrap_get_feature_names`` for the count vectorizer)
     """
     word_counter = CountVectorizer(strip_accents='unicode',
                                    analyzer='word',
                                    stop_words='english')

     # presumably DictItemSelector('address') picks the 'address' entry of a dict value and
     # ReplaceNoneTransformer('') substitutes '' for None - confirm in the vectorizers module.
     steps = [
         ('item_select', vectorizers.DictItemSelector('address')),
         ('clean', vectorizers.ReplaceNoneTransformer('')),
         ('vect', word_counter),
         ('tfidf', TfidfTransformer()),
     ]

     # The getter is bound to the same vectorizer instance that is placed in the steps.
     feature_names_getter = self._wrap_get_feature_names(word_counter)
     return steps, feature_names_getter
    def build_vectorization_pipeline(
            self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        """
        Build the vectorization steps for a value with 'currency' and 'amount' items.

        A single 'vect' step is returned: a FeatureUnion of two sub-pipelines,
        'currency' (selector -> clean -> count vectorizer -> tf-idf) and
        'amount' (selector -> number vectorizer).

        :return: tuple of (list of ``(step_name, transformer)`` pairs,
                 zero-argument callable returning the feature names of the currency
                 vectorizer prefixed with 'currency_' followed by those of the amount
                 vectorizer prefixed with 'amount_')
        """
        # presumably whole_value_as_token keeps the entire value as one token - confirm
        # in the vectorizers module.
        currency_vectorizer = CountVectorizer(
            strip_accents='unicode',
            analyzer='word',
            stop_words='english',
            tokenizer=vectorizers.whole_value_as_token,
        )
        amount_vectorizer = vectorizers.NumberVectorizer()

        def feature_names():
            # NOTE(review): CountVectorizer.get_feature_names() was removed in
            # scikit-learn 1.2 in favour of get_feature_names_out(); confirm the
            # sklearn version this project pins before upgrading.
            currency_names = ['currency_' + str(token)
                              for token in currency_vectorizer.get_feature_names()]
            amount_names = ['amount_' + str(feature)
                            for feature in amount_vectorizer.get_feature_names()]
            return currency_names + amount_names

        currency_branch = Pipeline([
            ('selector', vectorizers.DictItemSelector(item='currency')),
            ('clean', vectorizers.ReplaceNoneTransformer('')),
            ('vect', currency_vectorizer),
            ('tfidf', TfidfTransformer()),
        ])
        amount_branch = Pipeline([
            ('selector', vectorizers.DictItemSelector(item='amount')),
            ('vect', amount_vectorizer),
        ])
        union = FeatureUnion(transformer_list=[
            ('currency', currency_branch),
            ('amount', amount_branch),
        ])

        return [('vect', union)], feature_names
Пример #3
0
 def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
     """
     Build the vectorization steps for this field plus a feature-name getter.

     The steps are, in order: 'clean', 'vect' (CountVectorizer using
     ``vectorizers.list_items_as_tokens`` as its tokenizer) and 'tfidf'.

     :return: tuple of (list of ``(step_name, transformer)`` pairs,
              the result of ``self._wrap_get_feature_names`` for the count vectorizer)
     """
     # presumably list_items_as_tokens turns each item of a list value into one token -
     # confirm in the vectorizers module.
     item_counter = CountVectorizer(
         strip_accents='unicode',
         analyzer='word',
         stop_words='english',
         tokenizer=vectorizers.list_items_as_tokens,
     )

     steps = [
         ('clean', vectorizers.ReplaceNoneTransformer('')),
         ('vect', item_counter),
         ('tfidf', TfidfTransformer()),
     ]

     # The getter is bound to the same vectorizer instance that is placed in the steps.
     feature_names_getter = self._wrap_get_feature_names(item_counter)
     return steps, feature_names_getter
Пример #4
0
 def build_vectorization_pipeline(self) -> List[Tuple[str, Any]]:
     """
     Build the vectorization steps for a value with 'currency' and 'amount' items.

     A single 'vect' step is returned: a FeatureUnion of two sub-pipelines,
     'currency' (selector -> clean -> count vectorizer -> tf-idf) and
     'amount' (selector -> number vectorizer).

     :return: list of ``(step_name, transformer)`` pairs
     """
     # presumably whole_value_as_token keeps the entire value as one token - confirm
     # in the vectorizers module.
     currency_branch = Pipeline([
         ('selector', vectorizers.DictItemSelector(item='currency')),
         ('clean', vectorizers.ReplaceNoneTransformer('')),
         ('vect', CountVectorizer(
             strip_accents='unicode',
             analyzer='word',
             stop_words='english',
             tokenizer=vectorizers.whole_value_as_token,
         )),
         ('tfidf', TfidfTransformer()),
     ])

     amount_branch = Pipeline([
         ('selector', vectorizers.DictItemSelector(item='amount')),
         ('vect', vectorizers.NumberVectorizer()),
     ])

     union = FeatureUnion(transformer_list=[
         ('currency', currency_branch),
         ('amount', amount_branch),
     ])
     return [('vect', union)]
Пример #5
0
    def build_vectorization_pipeline(self) -> List[Tuple[str, Any]]:
        """
        Build the SKLearn vectorization pipeline steps for this field.

        The steps are used by field-based machine learning, where the value of one
        field is calculated from the values of other fields of the same document.
        At the moment only choice fields can be detected this way.

        Detection requires a feature vector covering every dependency of the field
        being detected; that vector is the union of the feature vectors of the
        individual dependencies.

        See FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline(..) for how the
        whole pipeline is assembled.

        :return: list of ``(step_name, transformer)`` pairs: 'clean', 'vect', 'tfidf'
        """
        # presumably ReplaceNoneTransformer('') substitutes '' for None values - confirm
        # in the vectorizers module.
        none_cleaner = vectorizers.ReplaceNoneTransformer('')
        word_counter = CountVectorizer(
            strip_accents='unicode',
            analyzer='word',
            stop_words='english',
        )
        tfidf = TfidfTransformer()

        return [
            ('clean', none_cleaner),
            ('vect', word_counter),
            ('tfidf', tfidf),
        ]
Пример #6
0
 def build_vectorization_pipeline(self) -> List[Tuple[str, Any]]:
     """
     Build the vectorization steps for this field.

     The steps are, in order: 'clean', 'vect' (CountVectorizer using
     ``vectorizers.whole_value_as_token`` as its tokenizer) and 'tfidf'.

     :return: list of ``(step_name, transformer)`` pairs
     """
     none_cleaner = vectorizers.ReplaceNoneTransformer('')
     # presumably whole_value_as_token keeps the entire value as one token - confirm
     # in the vectorizers module.
     value_counter = CountVectorizer(
         strip_accents='unicode',
         analyzer='word',
         stop_words='english',
         tokenizer=vectorizers.whole_value_as_token,
     )
     tfidf = TfidfTransformer()

     return [
         ('clean', none_cleaner),
         ('vect', value_counter),
         ('tfidf', tfidf),
     ]
Пример #7
0
 def build_vectorization_pipeline(self) -> List[Tuple[str, Any]]:
     """
     Build the vectorization steps for this field.

     The steps are, in order: 'item_select', 'clean', 'vect' (word-level
     CountVectorizer with unicode accent stripping and English stop words)
     and 'tfidf'.

     :return: list of ``(step_name, transformer)`` pairs
     """
     # presumably DictItemSelector('address') picks the 'address' entry of a dict value -
     # confirm in the vectorizers module.
     address_selector = vectorizers.DictItemSelector('address')
     none_cleaner = vectorizers.ReplaceNoneTransformer('')
     word_counter = CountVectorizer(
         strip_accents='unicode',
         analyzer='word',
         stop_words='english',
     )
     tfidf = TfidfTransformer()

     return [
         ('item_select', address_selector),
         ('clean', none_cleaner),
         ('vect', word_counter),
         ('tfidf', tfidf),
     ]