Example No. 1
    def __init__(self, **kwargs):
        """
        set _node_transformer, _edge_transformer, tdifNodeTextVectorizer
        """
        FeatureDefinition.__init__(self)

        nbTypes = self._getTypeNumber(kwargs)
        
        print("BETTER FEATURES")
        
        
        block_transformer = FeatureUnion( [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                                    ("xywh", Pipeline([
                                                         ('selector', NodeTransformerXYWH_v2()),
                                                         #v1 ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                                                         ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
                                                         ])
                                       )
                                    , ("neighbors", Pipeline([
                                                         ('selector', NodeTransformerNeighbors()),
                                                         #v1 ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                                                         ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
                                                         ])
                                       )
                                    , ("1hot", Pipeline([
                                                         ('1hot', Node1HotFeatures())  #does the 1-hot encoding directly
                                                         ])
                                       )
                                      ])
        grid_line_transformer = GridLine_NodeTransformer_v2()
        
        self._node_transformer = TransformerListByType([block_transformer, grid_line_transformer]) 
        
        edge_BB_transformer = FeatureUnion( [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                                      ("1hot", Pipeline([
                                                         ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))
                                                         ])
                                        )
                                    , ("boolean", Pipeline([
                                                         ('boolean', EdgeBooleanFeatures_v2())
                                                         ])
                                        )
                                    , ("numerical", Pipeline([
                                                         ('selector', EdgeNumericalSelector()),
                                                         #v1 ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                                                         ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
                                                         ])
                                        )
                                          ] )
        edge_BL_transformer = Block2GridLine_EdgeTransformer()
        edge_LL_transformer = GridLine2GridLine_EdgeTransformer()
        self._edge_transformer = TransformerListByType([edge_BB_transformer,
                                                  edge_BL_transformer,
                                                  edge_BL_transformer,  # useless but required
                                                  edge_LL_transformer 
                                                  ])
          
        self.tfidfNodeTextVectorizer = None #tdifNodeTextVectorizer
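A minimal, self-contained sketch of the "selector followed by in-place QuantileTransformer" pattern used in the block transformer above. NodeLike and XYWHSelector are hypothetical stand-ins for the project's node objects and NodeTransformerXYWH_v2; only the scikit-learn parts (FeatureUnion, Pipeline, QuantileTransformer) are the real APIs.

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import QuantileTransformer

class NodeLike:
    # hypothetical stand-in for a graph node carrying geometric attributes
    def __init__(self, x, y, w, h):
        self.x, self.y, self.w, self.h = x, y, w, h

class XYWHSelector(BaseEstimator, TransformerMixin):
    # hypothetical stand-in for NodeTransformerXYWH_v2: node objects -> float matrix
    def fit(self, lNode, y=None):
        return self
    def transform(self, lNode):
        return np.array([[nd.x, nd.y, nd.w, nd.h] for nd in lNode], dtype=np.float64)

node_transformer = FeatureUnion([
    ("xywh", Pipeline([
        ("selector", XYWHSelector()),
        ("xywh", QuantileTransformer(n_quantiles=8, copy=False)),  # in-place scaling to [0, 1]
    ])),
])

lNode = [NodeLike(10 * i, 5 * i, 100, 20) for i in range(8)]
X = node_transformer.fit_transform(lNode)
print(X.shape)  # (8, 4): one row per node, one quantile-scaled column per geometric feature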
Example No. 2
    def __init__(self): 
        FeatureDefinition.__init__(self)
        
#         self.n_tfidf_node, self.t_ngrams_node, self.b_tfidf_node_lc = n_tfidf_node, t_ngrams_node, b_tfidf_node_lc
#         self.n_tfidf_edge, self.t_ngrams_edge, self.b_tfidf_edge_lc = n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc

#         tdifNodeTextVectorizer = TfidfVectorizer(lowercase=self.b_tfidf_node_lc, max_features=self.n_tfidf_node
#                                                                                   , analyzer = 'char', ngram_range=self.t_ngrams_node #(2,6)
#                                                                                   , dtype=np.float64)
        
        node_transformer = FeatureUnion( [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                                    ("xywh", Pipeline([
                                                         ('selector', NodeTransformerXYWH_v2()),
                                                         #v1 ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                                                         ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
                                                         ])
                                       )
                                    , ("neighbors", Pipeline([
                                                         ('selector', NodeTransformerNeighbors()),
                                                         #v1 ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                                                         ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
                                                         ])
                                       )
                                    , ("1hot", Pipeline([
                                                         ('1hot', Node1HotFeatures())  #does the 1-hot encoding directly
                                                         ])
                                       )
                                      ])
    
        lEdgeFeature = [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                                      ("1hot", Pipeline([
                                                         ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))
                                                         ])
                                        )
                                    , ("boolean", Pipeline([
                                                         ('boolean', EdgeBooleanFeatures_v2())
                                                         ])
                                        )
                                    , ("numerical", Pipeline([
                                                         ('selector', EdgeNumericalSelector()),
                                                         #v1 ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                                                         ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
                                                         ])
                                        )
                        ]
                        
        edge_transformer = FeatureUnion( lEdgeFeature )
          
        #return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
        self.tfidfNodeTextVectorizer = None #tdifNodeTextVectorizer
Example No. 3
    def __init__(self, **kwargs):
        FeatureDefinition.__init__(self, **kwargs)

        nbTypes = self._getTypeNumber(kwargs)
        
        node_transformer = TransformerListByType([ FeatureUnion( [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                                    ("xywh", Pipeline([
                                                         ('selector', NodeTransformerXYWH_v2()),
                                                         #v1 ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                                                         ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
                                                         ])
                                       )
                                    , ("neighbors", Pipeline([
                                                         ('selector', NodeTransformerNeighbors()),
                                                         #v1 ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                                                         ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
                                                         ])
                                       )
                                    , ("1hot", Pipeline([
                                                         ('1hot', Node1HotFeatures())  #does the 1-hot encoding directly
                                                         ])
                                       )
                                      ]) for _i in range(nbTypes) ])
    
        edge_transformer = TransformerListByType([ FeatureUnion( [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                                      ("1hot", Pipeline([
                                                         ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))
                                                         ])
                                        )
                                    , ("boolean", Pipeline([
                                                         ('boolean', EdgeBooleanFeatures_v2())
                                                         ])
                                        )
                                    , ("numerical", Pipeline([
                                                         ('selector', EdgeNumericalSelector()),
                                                         #v1 ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                                                         ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
                                                         ])
                                        )
                                          ] ) for _i in range(nbTypes*nbTypes) ])
          
        #return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
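TransformerListByType is project code; assuming it simply holds one transformer per node type (and one per ordered type pair for edges, hence the nbTypes*nbTypes edge entries) and applies the i-th transformer to the i-th block of inputs, a rough stand-in could look like the hypothetical TypedTransformerList below. Only the scikit-learn QuantileTransformer is a real API.

import numpy as np
from sklearn.preprocessing import QuantileTransformer

class TypedTransformerList:
    # hypothetical stand-in: one transformer per type, applied block-wise
    def __init__(self, transformers):
        self.transformers = transformers
    def fit(self, lX):
        for trf, X in zip(self.transformers, lX):   # lX: one feature matrix per type
            trf.fit(X)
        return self
    def transform(self, lX):
        return [trf.transform(X) for trf, X in zip(self.transformers, lX)]

nbTypes = 2
node_trf = TypedTransformerList([QuantileTransformer(n_quantiles=4) for _ in range(nbTypes)])
lX = [np.random.rand(10, 3), np.random.rand(6, 3)]  # toy node features, one block per type
lXt = node_trf.fit(lX).transform(lX)
print([Xt.shape for Xt in lXt])  # [(10, 3), (6, 3)]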
Example No. 4
    def __init__(self):
        FeatureDefinition.__init__(self)

        node_transformer = FeatureUnion(
            [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                (
                    "xywh",
                    Pipeline([
                        ('selector', NodeTransformerXYWH_v2()),
                        ('xywh',
                         StandardScaler(copy=False,
                                        with_mean=True,
                                        with_std=True))  #use in-place scaling
                    ])),
                ("neighbors", NodeTransformerNeighbors_v2()),
                ("1hot", Node1HotFeatures())  #does the 1-hot encoding directly
            ])

        lEdgeFeature = [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
            ("1hot", Edge1HotFeatures(PageNumberSimpleSequenciality())),
            ('boolean', EdgeBooleanFeatures_v2()),
            (
                "numerical",
                Pipeline([
                    ('selector', EdgeNumericalSelector()),
                    ('numerical',
                     StandardScaler(copy=False, with_mean=True,
                                    with_std=True))  #use in-place scaling
                ]))
        ]

        edge_transformer = FeatureUnion(lEdgeFeature)

        #return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
        self.tfidfNodeTextVectorizer = None  #tdifNodeTextVectorizer
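This variant uses StandardScaler(copy=False, ...) instead of the QuantileTransformer of the previous examples: each column is standardized to zero mean and unit variance, and copy=False lets scikit-learn overwrite the input array rather than allocate a new one. A tiny sketch of that in-place behavior with plain scikit-learn:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [3.0, 30.0]])
scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
scaler.fit_transform(X)   # with copy=False, X itself is overwritten with the scaled values
print(X.mean(axis=0))     # ~[0. 0.]  zero mean per column
print(X.std(axis=0))      # ~[1. 1.]  unit variance per column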
Example No. 5
    def __init__(self,
                 n_tfidf_node=None,
                 t_ngrams_node=None,
                 b_tfidf_node_lc=None,
                 n_tfidf_edge=None,
                 t_ngrams_edge=None,
                 b_tfidf_edge_lc=None,
                 bMirrorPage=True,
                 bMultiPage=True):
        FeatureDefinition.__init__(self)

        self.n_tfidf_node, self.t_ngrams_node, self.b_tfidf_node_lc = n_tfidf_node, t_ngrams_node, b_tfidf_node_lc
        self.n_tfidf_edge, self.t_ngrams_edge, self.b_tfidf_edge_lc = n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc
        self.bMirrorPage = bMirrorPage
        self.bMultiPage = bMultiPage
        tdifNodeTextVectorizer = TfidfVectorizer(
            lowercase=self.b_tfidf_node_lc,
            max_features=self.n_tfidf_node,
            analyzer='char',
            ngram_range=self.t_ngrams_node  #(2,6)
            ,
            dtype=np.float64)

        node_transformer = FeatureUnion(
            [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                (
                    "text",
                    Pipeline([
                        ('selector', NodeTransformerTextEnclosed()),
                        #                                                         ('tfidf', TfidfVectorizer(lowercase=self.b_tfidf_node_lc, max_features=self.n_tfidf_node
                        #                                                                                   , analyzer = 'char', ngram_range=self.tNODE_NGRAMS #(2,6)
                        #                                                                                   , dtype=np.float64)),
                        (
                            'tfidf', tdifNodeTextVectorizer
                        ),  #we can use it separately from the pipeline once fitted
                        ('todense', SparseToDense()
                         )  #pystruct needs an array, not a sparse matrix
                    ])),
                (
                    "textlen",
                    Pipeline([
                        ('selector', NodeTransformerTextLen()),
                        ('textlen',
                         QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                             copy=False)
                         )  #use in-place scaling
                    ])),
                (
                    "xywh",
                    Pipeline([
                        ('selector', NodeTransformerXYWH_v2()),
                        #v1 ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                        ('xywh',
                         QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                             copy=False)
                         )  #use in-place scaling
                    ])),
                (
                    "neighbors",
                    Pipeline([
                        ('selector', NodeTransformerNeighbors()),
                        #v1 ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                        ('neighbors',
                         QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                             copy=False)
                         )  #use in-place scaling
                    ])),
                (
                    "1hot",
                    Pipeline([('1hot', Node1HotFeatures()
                               )  #does the 1-hot encoding directly
                              ]))
                #                                     , ('ocr' , Pipeline([
                #                                                          ('ocr', NodeOCRFeatures())
                #                                                          ])
                #                                        )
                #                                     , ('pnumre' , Pipeline([
                #                                                          ('pnumre', NodePNumFeatures())
                #                                                          ])
                #                                        )
                #                                     , ("doc_tfidf", Pipeline([
                #                                                          ('zero', Zero2Features())
                #                                                          #THIS ONE MUST BE LAST, because it include a placeholder column for the doculent-level tfidf
                #                                                          ])
                #                                        )
            ])

        lEdgeFeature = [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
            ("1hot",
             Pipeline([('1hot',
                        Edge1HotFeatures(PageNumberSimpleSequenciality()))])),
            ("boolean", Pipeline([('boolean', EdgeBooleanFeatures_v2())])),
            (
                "numerical",
                Pipeline([
                    ('selector', EdgeNumericalSelector()),
                    #v1 ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                    ('numerical',
                     QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                         copy=False))  #use in-place scaling
                ])),
            (
                "sourcetext0",
                Pipeline([
                    ('selector',
                     EdgeTransformerSourceText(0,
                                               bMirrorPage=bMirrorPage,
                                               bMultiPage=bMultiPage)),
                    (
                        'tfidf',
                        TfidfVectorizer(
                            lowercase=self.b_tfidf_edge_lc,
                            max_features=self.n_tfidf_edge,
                            analyzer='char',
                            ngram_range=self.t_ngrams_edge  #(2,6)
                            ,
                            dtype=np.float64)),
                    ('todense', SparseToDense()
                     )  #pystruct needs an array, not a sparse matrix
                ])),
            (
                "targettext0",
                Pipeline([
                    ('selector',
                     EdgeTransformerTargetText(0,
                                               bMirrorPage=bMirrorPage,
                                               bMultiPage=bMultiPage)),
                    (
                        'tfidf',
                        TfidfVectorizer(
                            lowercase=self.b_tfidf_edge_lc,
                            max_features=self.n_tfidf_edge,
                            analyzer='char',
                            ngram_range=self.t_ngrams_edge
                            #, analyzer = 'word', ngram_range=self.tEDGE_NGRAMS
                            ,
                            dtype=np.float64)),
                    ('todense', SparseToDense()
                     )  #pystruct needs an array, not a sparse matrix
                ])),
            (
                "sourcetext1",
                Pipeline([
                    ('selector',
                     EdgeTransformerSourceText(1,
                                               bMirrorPage=bMirrorPage,
                                               bMultiPage=bMultiPage)),
                    (
                        'tfidf',
                        TfidfVectorizer(
                            lowercase=self.b_tfidf_edge_lc,
                            max_features=self.n_tfidf_edge,
                            analyzer='char',
                            ngram_range=self.t_ngrams_edge  #(2,6)
                            ,
                            dtype=np.float64)),
                    ('todense', SparseToDense()
                     )  #pystruct needs an array, not a sparse matrix
                ])),
            (
                "targettext1",
                Pipeline([
                    ('selector',
                     EdgeTransformerTargetText(1,
                                               bMirrorPage=bMirrorPage,
                                               bMultiPage=bMultiPage)),
                    (
                        'tfidf',
                        TfidfVectorizer(
                            lowercase=self.b_tfidf_edge_lc,
                            max_features=self.n_tfidf_edge,
                            analyzer='char',
                            ngram_range=self.t_ngrams_edge
                            #, analyzer = 'word', ngram_range=self.tEDGE_NGRAMS
                            ,
                            dtype=np.float64)),
                    ('todense', SparseToDense()
                     )  #pystruct needs an array, not a sparse matrix
                ]))
        ]
        if bMultiPage:
            lEdgeFeature.extend([
                (
                    "sourcetext2",
                    Pipeline([
                        ('selector',
                         EdgeTransformerSourceText(2,
                                                   bMirrorPage=bMirrorPage,
                                                   bMultiPage=bMultiPage)),
                        (
                            'tfidf',
                            TfidfVectorizer(
                                lowercase=self.b_tfidf_edge_lc,
                                max_features=self.n_tfidf_edge,
                                analyzer='char',
                                ngram_range=self.t_ngrams_edge  #(2,6)
                                ,
                                dtype=np.float64)),
                        ('todense', SparseToDense()
                         )  #pystruct needs an array, not a sparse matrix
                    ])),
                (
                    "targettext2",
                    Pipeline([
                        ('selector',
                         EdgeTransformerTargetText(2,
                                                   bMirrorPage=bMirrorPage,
                                                   bMultiPage=bMultiPage)),
                        (
                            'tfidf',
                            TfidfVectorizer(
                                lowercase=self.b_tfidf_edge_lc,
                                max_features=self.n_tfidf_edge,
                                analyzer='char',
                                ngram_range=self.t_ngrams_edge
                                #, analyzer = 'word', ngram_range=self.tEDGE_NGRAMS
                                ,
                                dtype=np.float64)),
                        ('todense', SparseToDense()
                         )  #pystruct needs an array, not a sparse matrix
                    ]))
            ])

        edge_transformer = FeatureUnion(lEdgeFeature)

        #return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
        self.tfidfNodeTextVectorizer = tdifNodeTextVectorizer
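Example No. 5 adds text features: a character n-gram TfidfVectorizer over the node text, followed by SparseToDense because pystruct needs a dense array. The sketch below reproduces only that sub-pipeline with plain scikit-learn; DenseTransformer is a hypothetical stand-in for the project's SparseToDense, and the selector that extracts text from node objects is omitted (toy strings are fed directly).

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

class DenseTransformer(BaseEstimator, TransformerMixin):
    # hypothetical stand-in for SparseToDense: scipy sparse matrix -> dense ndarray
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.toarray()

text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=True, max_features=200,
                              analyzer="char", ngram_range=(2, 6),  # char 2- to 6-grams
                              dtype=np.float64)),
    ("todense", DenseTransformer()),
])

texts = ["Seite 12", "page 13", "Chapter I", "page 14"]
X = text_pipeline.fit_transform(texts)
print(X.shape, X.dtype)  # (4, n_features) dense float64 matrix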
Example No. 6
    def __init__(self,
                 nbClass=None,
                 n_feat_node=None,
                 t_ngrams_node=None,
                 b_node_lc=None,
                 n_feat_edge=None,
                 t_ngrams_edge=None,
                 b_edge_lc=None,
                 n_jobs=1):
        FeatureDefinition.__init__(self)
        assert nbClass, "Error: indicate the number of classes"
        self.nbClass = nbClass
        self.n_feat_node, self.t_ngrams_node, self.b_node_lc = n_feat_node, t_ngrams_node, b_node_lc
        self.n_feat_edge, self.t_ngrams_edge, self.b_edge_lc = n_feat_edge, t_ngrams_edge, b_edge_lc

        #         tdifNodeTextVectorizer = TfidfVectorizer(lowercase=self.b_node_lc, max_features=self.n_feat_node
        #                                                                                   , analyzer = 'char', ngram_range=self.t_ngrams_node #(2,6)
        #                                                                                   , dtype=np.float64)
        """
        - loading pre-computed data from: CV_5/model_A_fold_1_transf.pkl
                 no such file : CV_5/model_A_fold_1_transf.pkl
Traceback (most recent call last):
  File "/opt/project/read/jl_git/TranskribusDU/src/tasks/DU_GTBooks_5labels.py", line 216, in <module>
    oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum, options.warm)
  File "/opt/project/read/jl_git/TranskribusDU/src/tasks/DU_CRF_Task.py", line 481, in _nfold_RunFoldFromDisk
    oReport = self._nfold_RunFold(iFold, ts_trn, lFilename_trn, train_index, test_index, bWarm=bWarm)
  File "/opt/project/read/jl_git/TranskribusDU/src/tasks/DU_CRF_Task.py", line 565, in _nfold_RunFold
    fe.fitTranformers(lGraph_trn)
  File "/opt/project/read/jl_git/TranskribusDU/src/crf/FeatureDefinition_PageXml_logit_v2.py", line 141, in fitTranformers
    self._node_transformer.fit(lAllNode)
  File "/opt/project/read/VIRTUALENV_PYTHON_FULL_type/lib/python2.7/site-packages/sklearn/pipeline.py", line 712, in fit
    for _, trans, _ in self._iter())
  File "/opt/project/read/VIRTUALENV_PYTHON_FULL_type/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 768, in __call__
    self.retrieve()
  File "/opt/project/read/VIRTUALENV_PYTHON_FULL_type/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 719, in retrieve
    raise exception
RuntimeError: maximum recursion depth exceeded
"""
        """
        I guess this is due to the cyclic links to a node's neighbours.
        But joblib.Parallel uses cPickle, so we cannot specialize the serialization of the Block objects.
        
        JLM April 2017
        """
        n_jobs_from_graph = 1  #we cannot pickle the list of graphs, so n_jobs = 1 for this part!
        #         n_jobs_NodeTransformerLogit = max(1, n_jobs/2)  #half of the jobs for the NodeTransformerLogit, the rest for the others
        n_jobs_NodeTransformerLogit = max(1, n_jobs - 1)

        #we keep a ref onto it because its fitting needs not only all the nodes, but also additional info, available on the graph objects
        self._node_transf_logit = NodeTransformerLogit(
            nbClass,
            self.n_feat_node,
            self.t_ngrams_node,
            self.b_node_lc,
            n_jobs=n_jobs_NodeTransformerLogit)

        node_transformer = FeatureUnion(
            [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                ("text", self._node_transf_logit),
                (
                    "textlen",
                    Pipeline([
                        ('selector', NodeTransformerTextLen()),
                        ('textlen',
                         StandardScaler(copy=False,
                                        with_mean=True,
                                        with_std=True))  #use in-place scaling
                    ])),
                (
                    "xywh",
                    Pipeline([
                        ('selector', NodeTransformerXYWH_v2()),
                        ('xywh',
                         StandardScaler(copy=False,
                                        with_mean=True,
                                        with_std=True))  #use in-place scaling
                    ])),
                (
                    "neighbors",
                    Pipeline([
                        ('selector', NodeTransformerNeighbors()),
                        ('neighbors',
                         StandardScaler(copy=False,
                                        with_mean=True,
                                        with_std=True))  #use in-place scaling
                    ])),
                (
                    "1hot",
                    Pipeline([('1hot', Node1HotFeatures()
                               )  #does the 1-hot encoding directly
                              ]))
                #                                     , ('ocr' , Pipeline([
                #                                                          ('ocr', NodeOCRFeatures())
                #                                                          ])
                #                                        )
                #                                     , ('pnumre' , Pipeline([
                #                                                          ('pnumre', NodePNumFeatures())
                #                                                          ])
                #                                        )
                #                                     , ("doc", Pipeline([
                #                                                          ('zero', Zero2Features())
                #                                                          #THIS ONE MUST BE LAST, because it include a placeholder column for the doculent-level tfidf
                #                                                          ])
                #                                        )
            ],
            n_jobs=n_jobs_from_graph)

        lEdgeFeature = [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
            ("1hot",
             Pipeline([('1hot',
                        Edge1HotFeatures(PageNumberSimpleSequenciality()))])),
            ("boolean", Pipeline([('boolean', EdgeBooleanFeatures_v2())])),
            (
                "numerical",
                Pipeline([
                    ('selector', EdgeNumericalSelector()),
                    ('numerical',
                     StandardScaler(copy=False, with_mean=True,
                                    with_std=True))  #use in-place scaling
                ])),
            ("nodetext", EdgeTransformerLogit(nbClass,
                                              self._node_transf_logit))
        ]

        edge_transformer = FeatureUnion(lEdgeFeature, n_jobs=n_jobs_from_graph)

        #return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
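NodeTransformerLogit is kept as an attribute because fitting it needs more than the node list (extra information held by the graph objects), and it is shared with EdgeTransformerLogit. Assuming, and this is an assumption rather than the project's documented behavior, that it trains a character n-gram logistic regression on node text and emits the per-class probabilities as nbClass features, a rough approximation with plain scikit-learn could look like:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

class TextLogitFeatures(BaseEstimator, TransformerMixin):
    # hypothetical sketch of the idea: a text classifier whose class probabilities become features
    def __init__(self, n_feat=500, ngram_range=(2, 4), lowercase=False):
        self.n_feat = n_feat
        self.ngram_range = ngram_range
        self.lowercase = lowercase
    def fit(self, texts, y):
        self.clf_ = Pipeline([
            ("tfidf", TfidfVectorizer(lowercase=self.lowercase, max_features=self.n_feat,
                                      analyzer="char", ngram_range=self.ngram_range)),
            ("logit", LogisticRegression(max_iter=1000)),
        ]).fit(texts, y)
        return self
    def transform(self, texts):
        return self.clf_.predict_proba(texts)   # shape (n_nodes, nbClass)

texts = ["page 1", "Total 12 fl.", "page 2", "Summe 3 fl."]
y = [0, 1, 0, 1]                                # toy node labels
feat = TextLogitFeatures().fit(texts, y)
print(feat.transform(["page 3"]).round(2))      # one probability per class, used as node features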
Example No. 7
    def __init__(self,
                 nbClass=None,
                 n_feat_node=None,
                 t_ngrams_node=None,
                 b_node_lc=None,
                 n_feat_edge=None,
                 t_ngrams_edge=None,
                 b_edge_lc=None,
                 n_jobs=1):
        FeatureDefinition.__init__(self, nbClass)
        assert nbClass, "Error: indicate the number of classes"
        self.nbClass = nbClass
        self.n_feat_node, self.t_ngrams_node, self.b_node_lc = n_feat_node, t_ngrams_node, b_node_lc
        self.n_feat_edge, self.t_ngrams_edge, self.b_edge_lc = n_feat_edge, t_ngrams_edge, b_edge_lc

        #         tdifNodeTextVectorizer = TfidfVectorizer(lowercase=self.b_node_lc, max_features=self.n_feat_node
        #                                                                                   , analyzer = 'char', ngram_range=self.t_ngrams_node #(2,6)
        #                                                                                   , dtype=np.float64)
        """
        I tried to parallelize this code but I'm getting an error on Windows:
        
  File "c:\Local\meunier\git\TranskribusDU\src\crf\FeatureDefinition_PageXml_logit.py", line 144, in fitTranformers
    self._node_transformer.fit(lAllNode)
  File "C:\Anaconda2\lib\site-packages\sklearn\pipeline.py", line 709, in fit
    for _, trans, _ in self._iter())
  File "C:\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 768, in __call__
    self.retrieve()
  File "C:\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 719, in retrieve
    raise exception
TypeError: can't pickle PyCapsule objects        

(virtual_python_pystruct) (C:\Anaconda2) c:\tmp_READ\tuto>python -c "import sklearn; print sklearn.__version__"
0.18.1
        => I force n_jobs to 1
        
        """
        n_jobs = 1

        n_jobs_NodeTransformerLogit = max(
            1, n_jobs // 2
        )  #half of the jobs for the NodeTransformerLogit, the rest for the others

        #we keep a ref onto it because its fitting needs not only all the nodes, but also additional info, available on the graph objects
        self._node_transf_logit = NodeTransformerLogit(
            nbClass,
            self.n_feat_node,
            self.t_ngrams_node,
            self.b_node_lc,
            n_jobs=n_jobs_NodeTransformerLogit)

        node_transformer = FeatureUnion(
            [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                ("text", self._node_transf_logit),
                (
                    "textlen",
                    Pipeline([
                        ('selector', NodeTransformerTextLen()),
                        ('textlen',
                         StandardScaler(copy=False,
                                        with_mean=True,
                                        with_std=True))  #use in-place scaling
                    ])),
                (
                    "xywh",
                    Pipeline([
                        ('selector', NodeTransformerXYWH()),
                        ('xywh',
                         StandardScaler(copy=False,
                                        with_mean=True,
                                        with_std=True))  #use in-place scaling
                    ])),
                (
                    "neighbors",
                    Pipeline([
                        ('selector', NodeTransformerNeighbors()),
                        ('neighbors',
                         StandardScaler(copy=False,
                                        with_mean=True,
                                        with_std=True))  #use in-place scaling
                    ])),
                (
                    "1hot",
                    Pipeline([('1hot', Node1HotFeatures()
                               )  #does the 1-hot encoding directly
                              ]))
                #                                     , ('ocr' , Pipeline([
                #                                                          ('ocr', NodeOCRFeatures())
                #                                                          ])
                #                                        )
                #                                     , ('pnumre' , Pipeline([
                #                                                          ('pnumre', NodePNumFeatures())
                #                                                          ])
                #                                        )
                #                                     , ("doc", Pipeline([
                #                                                          ('zero', Zero2Features())
                #                                                          #THIS ONE MUST BE LAST, because it include a placeholder column for the doculent-level tfidf
                #                                                          ])
                #                                        )
            ],
            n_jobs=max(1, n_jobs - n_jobs_NodeTransformerLogit))

        lEdgeFeature = [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
            ("1hot",
             Pipeline([('1hot',
                        Edge1HotFeatures(PageNumberSimpleSequenciality()))])),
            ("boolean", Pipeline([('boolean', EdgeBooleanFeatures())])),
            (
                "numerical",
                Pipeline([
                    ('selector', EdgeNumericalSelector()),
                    ('numerical',
                     StandardScaler(copy=False, with_mean=True,
                                    with_std=True))  #use in-place scaling
                ])),
            ("nodetext", EdgeTransformerLogit(nbClass,
                                              self._node_transf_logit))
        ]

        edge_transformer = FeatureUnion(lEdgeFeature, n_jobs=n_jobs)

        #return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
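Here n_jobs is forced to 1 because joblib would have to pickle the transformers and the objects they reference in order to ship them to worker processes, which fails in this environment. The sketch below only illustrates the job-budget arithmetic and the n_jobs parameter of FeatureUnion with standard scikit-learn transformers; no project classes are involved.

import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import QuantileTransformer, StandardScaler

n_jobs = 1                                    # forced to 1: workers would need to pickle the graph
n_jobs_logit = max(1, n_jobs // 2)            # budget for the nested text model
n_jobs_union = max(1, n_jobs - n_jobs_logit)  # remainder for the outer FeatureUnion

union = FeatureUnion([
    ("std", StandardScaler()),
    ("qt", QuantileTransformer(n_quantiles=5)),
], n_jobs=n_jobs_union)                       # n_jobs=1 keeps everything in-process

X = np.random.rand(10, 3)
print(union.fit_transform(X).shape)           # (10, 6): both branches concatenated column-wise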
Example No. 8
    def __init__(self,
                 n_tfidf_node=None,
                 t_ngrams_node=None,
                 b_tfidf_node_lc=None,
                 n_tfidf_edge=None,
                 t_ngrams_edge=None,
                 b_tfidf_edge_lc=None,
                 feat_select=None,
                 text_neighbors=False,
                 n_tfidf_node_neighbors=500,
                 XYWH_v2=False,
                 edge_features=False):

        FeatureDefinition.__init__(self)

        self.n_tfidf_node, self.t_ngrams_node, self.b_tfidf_node_lc = n_tfidf_node, t_ngrams_node, b_tfidf_node_lc
        self.n_tfidf_edge, self.t_ngrams_edge, self.b_tfidf_edge_lc = n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc

        self.text_neighbors = text_neighbors
        self.n_tfidf_node_neighbors = n_tfidf_node_neighbors
        self.XYWH_v2 = XYWH_v2
        self.edge_features = edge_features
        #TODO n_jobs=4

        if feat_select == 'chi2':
            feat_selector = SelectKBest(chi2, k=self.n_tfidf_node)

        elif feat_select == 'mi_rr':
            print('Using Mutual Information Round Robin as Feature Selection')
            feat_selector = SelectRobinBest(mutual_information,
                                            k=self.n_tfidf_node)
            feat_selector_neigh = SelectRobinBest(mutual_information,
                                                  k=self.n_tfidf_node)

        elif feat_select == 'chi2_rr':
            #chi_score = lambda x,y : chi2(x,y)[0] #this can not be pickled ...
            feat_selector = SelectRobinBest(chi2_scores, k=self.n_tfidf_node)
            feat_selector_neigh = SelectRobinBest(chi2_scores,
                                                  k=self.n_tfidf_node)

        elif feat_select == 'tf' or feat_select is None:
            feat_selector = None

        else:
            raise ValueError('Invalid Feature Selection method', feat_select)

        if feat_selector:
            tdifNodeTextVectorizer = TfidfVectorizer(
                lowercase=self.b_tfidf_node_lc,
                max_features=10000,
                analyzer='char',
                ngram_range=self.t_ngrams_node)  #(2,6)

            text_pipeline = Pipeline([
                ('selector', NodeTransformerTextEnclosed()),
                ('tf', tdifNodeTextVectorizer),
                ('word_selector', feat_selector),
                #('todense', SparseToDense()) #Here we don't need to convert to Dense anymore
            ])
        else:
            tdifNodeTextVectorizer = TfidfVectorizer(
                lowercase=self.b_tfidf_node_lc,
                max_features=self.n_tfidf_node,
                analyzer='char',
                ngram_range=self.t_ngrams_node  #(2,6)
                ,
                dtype=np.float64)
            text_pipeline = Pipeline([
                ('selector', NodeTransformerTextEnclosed()),
                ('tf', tdifNodeTextVectorizer),
                #('todense', SparseToDense()) #Here we don't need to convert to Dense anymore
            ])

        node_transformer_ops = [
            ("text", text_pipeline),
            (
                "textlen",
                Pipeline([
                    ('selector', NodeTransformerTextLen()),
                    ('textlen',
                     StandardScaler(copy=False, with_mean=True,
                                    with_std=True))  #use in-place scaling
                ])),
            (
                "neighbors",
                Pipeline([
                    ('selector', NodeTransformerNeighbors()),
                    ('neighbors',
                     StandardScaler(copy=False, with_mean=True,
                                    with_std=True))  #use in-place scaling
                ])),
            (
                "1hot",
                Pipeline([('1hot', Node1HotFeatures()
                           )  #does the 1-hot encoding directly
                          ]))
        ]

        if self.XYWH_v2 is True:
            feat_xy = (
                "xywh",
                Pipeline([
                    ('selector', NodeTransformerXYWH_v2()),
                    ('xywh',
                     StandardScaler(copy=False, with_mean=True,
                                    with_std=True))  #use in-place scaling
                ]))

        else:
            feat_xy = (
                "xywh",
                Pipeline([
                    ('selector', NodeTransformerXYWH()),
                    ('xywh',
                     StandardScaler(copy=False, with_mean=True,
                                    with_std=True))  #use in-place scaling
                ]))

        node_transformer_ops.append(feat_xy)

        if text_neighbors:
            #BY DEFAULT we use chi2
            if self.n_tfidf_node_neighbors > 0:
                feat_selector_neigh = SelectKBest(
                    chi2, k=self.n_tfidf_node_neighbors)
                neighborsTextVectorizer = TfidfVectorizer(
                    lowercase=self.b_tfidf_node_lc,
                    analyzer='char',
                    ngram_range=self.t_ngrams_node)  #(2,6)
                neighbors_text_pipeline = Pipeline([
                    ('selector', NodeTransformerNeighborsAllText()),
                    ('tf_neighbors', neighborsTextVectorizer),
                    ('feat_selector', feat_selector_neigh),
                ])

            else:
                neighborsTextVectorizer = TfidfVectorizer(
                    lowercase=self.b_tfidf_node_lc,
                    analyzer='char',
                    ngram_range=self.t_ngrams_node)  #(2,6)
                neighbors_text_pipeline = Pipeline([
                    ('selector', NodeTransformerNeighborsAllText()),
                    ('tf_neighbors', neighborsTextVectorizer)
                ])
            node_transformer_ops.append(
                ('text_neighbors', neighbors_text_pipeline))

        print(node_transformer_ops)

        node_aggregated_edge_features = [('1hot_edge',
                                          NodeEdgeTransformer(Edge1HotFeatures(
                                              PageNumberSimpleSequenciality()),
                                                              agg_func='sum'))]
        node_aggregated_edge_features.append(
            ('boolean_edge',
             NodeEdgeTransformer(EdgeBooleanFeatures_v2(), agg_func='sum')))
        #Aggregated Numerical Features do not make a lot of sense here ....

        if edge_features:
            node_transformer_ops.extend(node_aggregated_edge_features)

        print(node_transformer_ops)
        node_transformer = FeatureUnion(node_transformer_ops)

        #Minimal EdgeFeature Here
        lEdgeFeature = [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
            ("1hot",
             Pipeline([('1hot',
                        Edge1HotFeatures(PageNumberSimpleSequenciality()))])),
            ("boolean", Pipeline([('boolean', EdgeBooleanFeatures())])),
            (
                "numerical",
                Pipeline([
                    ('selector', EdgeNumericalSelector()),
                    ('numerical',
                     StandardScaler(copy=False, with_mean=True,
                                    with_std=True))  #use in-place scaling
                ]))
        ]

        edge_transformer = FeatureUnion(lEdgeFeature)
        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
        self.tfidfNodeTextVectorizer = tdifNodeTextVectorizer
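With feat_select='chi2', the text branch caps the TF-IDF vocabulary at 10000 character n-grams and then keeps the k columns most dependent on the labels via SelectKBest(chi2, k=...). A minimal sketch of that selection step with plain scikit-learn and toy data (the project's node selectors and the round-robin variants are not reproduced):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

text_pipeline = Pipeline([
    ("tf", TfidfVectorizer(lowercase=True, analyzer="char",
                           ngram_range=(2, 3), max_features=10000)),
    ("word_selector", SelectKBest(chi2, k=10)),  # keep the 10 n-grams most associated with the labels
])

texts = ["page 1", "page 2", "Total 12 fl.", "Summe 3 fl.", "page 3", "Total 9 fl."]
y     = [0,        0,        1,              1,             0,        1]
X = text_pipeline.fit_transform(texts, y)        # chi2 is supervised, so fit needs the labels y
print(X.shape)                                   # (6, 10) sparse matrix of selected n-grams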