Пример #1
0
    def __init__(self):
        """Build and register the node and edge feature transformers.

        Nodes use a single geometry feature group. Edges use a 1-hot encoding
        of the edge class followed by geometry features. When
        ``self.bShiftEdgeByClass`` is true, the edge union is wrapped in a
        pipeline that ends with an ``EdgeClassShifter``. Both transformers are
        handed to ``self.setTransformers``.
        """
        FeatureDefinition.__init__(self)

        # Node side: geometry only (the original note says nQuantile=... can
        # be passed to Node_Geometry).
        node_steps = [("geometry", Node_Geometry())]
        node_transformer = FeatureUnion(node_steps)

        # Edge side: the possible edge classes depend on the type of graph.
        lEdgeClass = [HorizontalEdge, VerticalEdge]
        edge_steps = [
            # The 1-hot edge-class encoding must stay FIRST in the union.
            ('1hot', Edge_Type_1Hot(lEdgeClass=lEdgeClass)),
            # The original note says nQuantile=... can be passed here too.
            ('geom', Edge_Geometry()),
        ]
        edge_transformer = FeatureUnion(edge_steps)

        # Optionally use one range of features per edge class. This relies on
        # the 1-hot block being first in the union and on the shifter getting
        # the correct number of edge classes.
        if self.bShiftEdgeByClass:
            shifter = EdgeClassShifter(len(lEdgeClass))
            edge_transformer = Pipeline([
                ('edge_transformer', edge_transformer),
                ('shifter', shifter),
            ])

        self.setTransformers(node_transformer, edge_transformer)
 def __init__(self, **kwargs):
     """
     Initialise the base class and reset the three transformer slots.

     ``_node_transformer``, ``_edge_transformer`` and ``_node_text_vectorizer``
     all start out as None. ``kwargs`` is accepted but not used in this method.
     """
     FeatureDefinition.__init__(self)
     # Nothing is built here: every slot starts empty.
     self._node_transformer = self._edge_transformer = self._node_text_vectorizer = None
Пример #3
0
    def __init__(self):
        """Build and register node and edge transformers, including text n-grams.

        The node and edge feature lists are kept on the instance as
        ``self.lNodeFeature`` and ``self.lEdgeFeature``. Two separator features
        are appended to the edge list when ``self.bSeparator`` is true, and the
        edge union is wrapped with an ``EdgeClassShifter`` when
        ``self.bShiftEdgeByClass`` is true. The result is handed to
        ``self.setTransformers``.
        """
        FeatureDefinition.__init__(self)

        # Node side. The n-gram arguments are, per the original annotations:
        # analyzer kind ('char' = character n-grams), number of n-grams,
        # the N range, and the lowercase flag.
        self.lNodeFeature = [
            ("geometry", Node_Geometry()),
            ("neighbor_count", Node_Neighbour_Count()),
            ("text", Node_Text_NGram('char', 500, (2, 3), False)),
        ]
        node_transformer = FeatureUnion(self.lNodeFeature)

        # Edge side: the possible edge classes depend on the type of graph.
        lEdgeClass = [HorizontalEdge, VerticalEdge]
        self.lEdgeFeature = [
            # The 1-hot edge-class encoding must stay FIRST in the union.
            ('1hot', Edge_Type_1Hot(lEdgeClass=lEdgeClass)),
            # Optional constant 1 for the CRF.
            ('1', Edge_1()),
            ('geom', Edge_Geometry()),
            # Same argument layout as the node n-grams above.
            ('src_txt', Edge_Source_Text_NGram('char', 250, (2, 3), False)),
            ('tgt_txt', Edge_Target_Text_NGram('char', 250, (2, 3), False)),
        ]
        if self.bSeparator:
            separator_steps = [
                ('sprtr_bool', Separator_boolean()),
                ('sprtr_num', Separator_num()),
            ]
            self.lEdgeFeature = self.lEdgeFeature + separator_steps

        # The plain union of features is usable directly as the transformer.
        edge_transformer = FeatureUnion(self.lEdgeFeature)

        # Optionally use one range of features per edge class. This relies on
        # the 1-hot block being first in the union and on the shifter getting
        # the correct number of edge classes.
        if self.bShiftEdgeByClass:
            edge_transformer = Pipeline([
                ('fu', edge_transformer),
                ('shifter', EdgeClassShifter(len(lEdgeClass))),
            ])

        self.setTransformers(node_transformer, edge_transformer)
Пример #4
0
    def __init__(self):
        """Build quantile-scaled node and edge transformers (no TF-IDF here).

        Sets ``self._node_transformer`` and ``self._edge_transformer`` and
        leaves ``self.tfidfNodeTextVectorizer`` as None.
        """
        FeatureDefinition.__init__(self)

        def scaled(step_name, selector, n_quantiles):
            # Selector followed by a QuantileTransformer that scales in place
            # (copy=False).
            return Pipeline([
                ('selector', selector),
                (step_name,
                 QuantileTransformer(n_quantiles=n_quantiles, copy=False)),
            ])

        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!
        # The group names and step names below are kept exactly as they were.
        node_groups = [
            ("xywh", scaled('xywh', NodeTransformerXYWH(), self.n_QUANTILES)),
            ("neighbors",
             scaled('neighbors', NodeTransformerNeighbors(), self.n_QUANTILES)),
            # This group uses a fixed 16 quantiles instead of self.n_QUANTILES.
            ("twy", scaled('twy', NodeTransformerTWY(), 16)),
        ]
        node_transformer = FeatureUnion(node_groups)

        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!
        lEdgeFeature = [
            ("boolean",
             Pipeline([('boolean', EdgeBooleanAlignmentFeatures())])),
            ("numerical",
             scaled('numerical', EdgeNumericalSelector_noText(),
                    self.n_QUANTILES)),
        ]
        edge_transformer = FeatureUnion(lEdgeFeature)

        # Store the results; no node text vectorizer is built in this method.
        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
        self.tfidfNodeTextVectorizer = None
Пример #5
0
    def __init__(self):
        """Build and register geometry-based node and edge transformers.

        The feature lists are kept on the instance as ``self.lNodeFeature`` and
        ``self.lEdgeFeature``. Two separator features are appended to the edge
        list when ``self.bSeparator`` is true, and the edge union is wrapped
        with an ``EdgeClassShifter`` when ``self.bShiftEdgeByClass`` is true.
        The result is handed to ``self.setTransformers``.
        """
        FeatureDefinition.__init__(self)

        # Node side: geometry only (the original note says nQuantile=... can
        # be passed to Node_Geometry).
        self.lNodeFeature = [("geometry", Node_Geometry())]
        node_transformer = FeatureUnion(self.lNodeFeature)

        # NOTE(review): lEdgeClass is never assigned inside this method, so it
        # has to come from an enclosing or module scope - confirm it is
        # defined there, otherwise this raises NameError.
        self.lEdgeFeature = [
            # The 1-hot edge-class encoding must stay FIRST in the union.
            ('1hot', Edge_Type_1Hot(lEdgeClass=lEdgeClass)),
            # The original note says nQuantile=... can be passed here too.
            ('geom', Edge_Geometry()),
        ]
        if self.bSeparator:
            separator_steps = [
                ('sprtr_bool', Separator_boolean()),
                ('sprtr_num', Separator_num()),
            ]
            self.lEdgeFeature = self.lEdgeFeature + separator_steps
        edge_transformer = FeatureUnion(self.lEdgeFeature)

        # Optionally use one range of features per edge class. This relies on
        # the 1-hot block being first in the union and on the shifter getting
        # the correct number of edge classes.
        if self.bShiftEdgeByClass:
            shifter = EdgeClassShifter(len(lEdgeClass))
            edge_transformer = Pipeline([
                ('edge_transformer', edge_transformer),
                ('shifter', shifter),
            ])

        self.setTransformers(node_transformer, edge_transformer)
    def __init__(self,
                 n_tfidf_node=None,
                 t_ngrams_node=None,
                 b_tfidf_node_lc=None,
                 n_tfidf_edge=None,
                 t_ngrams_edge=None,
                 b_tfidf_edge_lc=None,
                 bMirrorPage=True,
                 bMultiPage=True):
        """Build node and edge transformers with character TF-IDF text features.

        The ``*_node`` arguments configure the node TF-IDF vectorizer and the
        ``*_edge`` arguments configure the edge ones: ``n_tfidf_*`` is passed
        as ``max_features``, ``t_ngrams_*`` as ``ngram_range`` and
        ``b_tfidf_*_lc`` as ``lowercase``. ``bMirrorPage`` and ``bMultiPage``
        are forwarded to the edge text selectors; the two edge text groups are
        only added when ``bMultiPage`` is true. Two separator features are
        added to the edge list when ``self.bSeparator`` is true.

        Sets ``self._node_transformer``, ``self._edge_transformer`` and
        ``self.tfidfNodeTextVectorizer`` (the node TF-IDF vectorizer).
        """
        FeatureDefinition.__init__(self)

        # Keep the configuration on the instance.
        self.n_tfidf_node = n_tfidf_node
        self.t_ngrams_node = t_ngrams_node
        self.b_tfidf_node_lc = b_tfidf_node_lc
        self.n_tfidf_edge = n_tfidf_edge
        self.t_ngrams_edge = t_ngrams_edge
        self.b_tfidf_edge_lc = b_tfidf_edge_lc
        self.bMirrorPage = bMirrorPage
        self.bMultiPage = bMultiPage

        def char_tfidf(lowercase, max_features, ngram_range):
            # Character n-gram TF-IDF producing float64 values.
            return TfidfVectorizer(lowercase=lowercase,
                                   max_features=max_features,
                                   analyzer='char',
                                   ngram_range=ngram_range,
                                   dtype=np.float64)

        def scaled(step_name, selector):
            # Selector followed by a QuantileTransformer that scales in place
            # (copy=False).
            return Pipeline([
                ('selector', selector),
                (step_name,
                 QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                     copy=False)),
            ])

        def edge_text(selector):
            # Selector -> edge TF-IDF -> dense array (the original note says
            # pystruct needs an array, not a sparse matrix).
            return Pipeline([
                ('selector', selector),
                ('tfidf', char_tfidf(self.b_tfidf_edge_lc,
                                     self.n_tfidf_edge,
                                     self.t_ngrams_edge)),
                ('todense', SparseToDense()),
            ])

        # Created outside the pipeline so it can be used on its own once the
        # pipeline has been fitted.
        node_text_vectorizer = char_tfidf(self.b_tfidf_node_lc,
                                          self.n_tfidf_node,
                                          self.t_ngrams_node)

        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!
        # Earlier revisions carried further node groups as commented-out code
        # ("sem", "ocr", "pnumre", "doc_tfidf"); they are not part of the union.
        node_groups = [
            ("text", Pipeline([
                ('selector', NodeTransformerTextEnclosed()),
                ('tfidf', node_text_vectorizer),
                ('todense', SparseToDense()),
            ])),
            ("textlen", scaled('textlen', NodeTransformerTextLen())),
            ("xywh", scaled('xywh', NodeTransformerXYWH())),
            ("neighbors", scaled('neighbors', NodeTransformerNeighbors())),
            # This step does the 1-hot encoding directly.
            ("1hot", Pipeline([('1hot', Node1HotFeatures())])),
        ]
        node_transformer = FeatureUnion(node_groups)

        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!
        # Earlier revisions carried "sourcetext0/1" and "targettext0/1" groups
        # as commented-out code; only the "...2" variants below are active.
        lEdgeFeature = [
            ("1hot", Pipeline([
                ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
            ])),
            ("boolean", Pipeline([('boolean', EdgeBooleanFeatures())])),
            ("numerical", scaled('numerical', EdgeNumericalSelector())),
        ]

        if self.bSeparator:
            lEdgeFeature += [
                ('sprtr_bool', Separator_boolean()),
                ('sprtr_num', Separator_num()),
            ]

        if bMultiPage:
            lEdgeFeature.append(
                ("sourcetext2",
                 edge_text(EdgeTransformerSourceText(2,
                                                     bMirrorPage=bMirrorPage,
                                                     bMultiPage=bMultiPage))))
            lEdgeFeature.append(
                ("targettext2",
                 edge_text(EdgeTransformerTargetText(2,
                                                     bMirrorPage=bMirrorPage,
                                                     bMultiPage=bMultiPage))))

        edge_transformer = FeatureUnion(lEdgeFeature)

        # Store the results on the instance.
        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
        self.tfidfNodeTextVectorizer = node_text_vectorizer
Пример #7
0
    def __init__(
        self,
        n_tfidf_node=None,
        t_ngrams_node=None,
        b_tfidf_node_lc=None
    ):
        """Build node transformers with text TF-IDF and text-free edge transformers.

        ``n_tfidf_node`` is passed to the node TF-IDF vectorizer as
        ``max_features``, ``t_ngrams_node`` as ``ngram_range`` and
        ``b_tfidf_node_lc`` as ``lowercase``. There are no edge TF-IDF
        parameters in this signature.

        Sets ``self._node_transformer`` and ``self._edge_transformer``, and
        sets ``self.tfidfNodeTextVectorizer`` to None.
        """
        FeatureDefinition.__init__(self)

        # Keep the node text configuration on the instance.
        self.n_tfidf_node = n_tfidf_node
        self.t_ngrams_node = t_ngrams_node
        self.b_tfidf_node_lc = b_tfidf_node_lc

        def scaled(step_name, selector, n_quantiles):
            # Selector followed by a QuantileTransformer that scales in place
            # (copy=False).
            return Pipeline([
                ('selector', selector),
                (step_name,
                 QuantileTransformer(n_quantiles=n_quantiles, copy=False)),
            ])

        # Character n-gram TF-IDF producing float64 values.
        node_text_vectorizer = TfidfVectorizer(
            lowercase=self.b_tfidf_node_lc,
            max_features=self.n_tfidf_node,
            analyzer='char',
            ngram_range=self.t_ngrams_node,
            dtype=np.float64)

        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!
        node_groups = [
            ("text", Pipeline([
                ('selector', NodeTransformerTextEnclosed()),
                ('tfidf', node_text_vectorizer),
                # The original note says pystruct needs an array, not a
                # sparse matrix.
                ('todense', SparseToDense()),
            ])),
            ("textlen",
             scaled('textlen', NodeTransformerTextLen(), self.n_QUANTILES)),
            ("xywh", scaled('xywh', NodeTransformerXYWH(), self.n_QUANTILES)),
            ("neighbors",
             scaled('neighbors', NodeTransformerNeighbors(), self.n_QUANTILES)),
            # This group uses a fixed 16 quantiles instead of self.n_QUANTILES.
            ("twy", scaled('twy', NodeTransformerTWY(), 16)),
        ]
        node_transformer = FeatureUnion(node_groups)

        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!
        lEdgeFeature = [
            ("boolean",
             Pipeline([('boolean', EdgeBooleanAlignmentFeatures())])),
            ("numerical",
             scaled('numerical', EdgeNumericalSelector_noText(),
                    self.n_QUANTILES)),
        ]
        edge_transformer = FeatureUnion(lEdgeFeature)

        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
        # NOTE(review): a vectorizer is built and used in the "text" pipeline
        # above, yet None is stored here - confirm this is intentional.
        self.tfidfNodeTextVectorizer = None