예제 #1
0
    def __init__(self, **kwargs):
        """
        Set _node_transformer, _edge_transformer and tfidfNodeTextVectorizer.

        The node side is a TransformerListByType holding the block feature
        union followed by the grid-line transformer. The edge side is a
        TransformerListByType holding four entries: the BB feature union, the
        BL transformer (listed twice) and the LL transformer.
        tfidfNodeTextVectorizer is set to None.
        """
        FeatureDefinition.__init__(self)

        # NOTE(review): the returned number of types is not used below
        nbTypes = self._getTypeNumber(kwargs)

        print("BETTER FEATURES")

        # ---- node features -------------------------------------------------
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        # (v1 scaled with a StandardScaler; quantile scaling is used instead,
        #  with copy=False so as to scale in place)
        xywh_pipeline = Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
        ])
        neighbors_pipeline = Pipeline([
            ('selector', NodeTransformerNeighbors()),
            ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
        ])
        # does the 1-hot encoding directly
        node_1hot_pipeline = Pipeline([('1hot', Node1HotFeatures())])

        block_transformer = FeatureUnion([
            ("xywh", xywh_pipeline),
            ("neighbors", neighbors_pipeline),
            ("1hot", node_1hot_pipeline),
        ])
        grid_line_transformer = GridLine_NodeTransformer_v2()

        self._node_transformer = TransformerListByType([block_transformer,
                                                        grid_line_transformer])

        # ---- edge features -------------------------------------------------
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        edge_1hot_pipeline = Pipeline([
            ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
        ])
        edge_boolean_pipeline = Pipeline([('boolean', EdgeBooleanFeatures_v2())])
        edge_numerical_pipeline = Pipeline([
            ('selector', EdgeNumericalSelector()),
            ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
        ])

        edge_BB_transformer = FeatureUnion([
            ("1hot", edge_1hot_pipeline),
            ("boolean", edge_boolean_pipeline),
            ("numerical", edge_numerical_pipeline),
        ])
        edge_BL_transformer = Block2GridLine_EdgeTransformer()
        edge_LL_transformer = GridLine2GridLine_EdgeTransformer()

        lEdgeTransformer = [edge_BB_transformer]
        lEdgeTransformer.append(edge_BL_transformer)
        lEdgeTransformer.append(edge_BL_transformer)  # useless but required
        lEdgeTransformer.append(edge_LL_transformer)
        self._edge_transformer = TransformerListByType(lEdgeTransformer)

        # no text vectorizer in this feature definition
        self.tfidfNodeTextVectorizer = None
예제 #2
0
    def __init__(self):
        """
        Set _node_transformer and _edge_transformer to FeatureUnion objects
        made of geometric / 1-hot / boolean features only.

        No text feature is computed here, hence tfidfNodeTextVectorizer is
        set to None.
        """
        FeatureDefinition.__init__(self)

        # CAREFUL IF YOU CHANGE THE LISTS BELOW - see cleanTransformers method!!!!
        # Numerical features are quantile-scaled in place (copy=False);
        # v1 used a StandardScaler instead.

        # ---- node features ----
        lNodeFeature = []
        lNodeFeature.append(
            ("xywh", Pipeline([
                ('selector', NodeTransformerXYWH_v2()),
                ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
            ])))
        lNodeFeature.append(
            ("neighbors", Pipeline([
                ('selector', NodeTransformerNeighbors()),
                ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
            ])))
        lNodeFeature.append(
            # does the 1-hot encoding directly
            ("1hot", Pipeline([('1hot', Node1HotFeatures())])))
        node_transformer = FeatureUnion(lNodeFeature)

        # ---- edge features ----
        lEdgeFeature = []
        lEdgeFeature.append(
            ("1hot", Pipeline([
                ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
            ])))
        lEdgeFeature.append(
            ("boolean", Pipeline([('boolean', EdgeBooleanFeatures_v2())])))
        lEdgeFeature.append(
            ("numerical", Pipeline([
                ('selector', EdgeNumericalSelector()),
                ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
            ])))
        edge_transformer = FeatureUnion(lEdgeFeature)

        # publish the results on the instance
        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
        self.tfidfNodeTextVectorizer = None
예제 #3
0
    def __init__(self, **kwargs):
        """
        Set _node_transformer and _edge_transformer as TransformerListByType
        objects.

        With nbTypes the value returned by self._getTypeNumber(kwargs), the
        node list holds nbTypes feature unions and the edge list holds
        nbTypes*nbTypes feature unions. Every entry is a distinct, freshly
        built FeatureUnion.

        kwargs are forwarded to FeatureDefinition.__init__.
        """
        FeatureDefinition.__init__(self, **kwargs)

        nbTypes = self._getTypeNumber(kwargs)

        # CAREFUL IF YOU CHANGE THE TWO FACTORIES BELOW - see cleanTransformers method!!!!
        # Numerical features are quantile-scaled in place (copy=False);
        # v1 used a StandardScaler instead.

        def _new_node_union():
            # a brand new node feature union at each call
            return FeatureUnion([
                ("xywh", Pipeline([
                    ('selector', NodeTransformerXYWH_v2()),
                    ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
                ])),
                ("neighbors", Pipeline([
                    ('selector', NodeTransformerNeighbors()),
                    ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
                ])),
                # does the 1-hot encoding directly
                ("1hot", Pipeline([('1hot', Node1HotFeatures())])),
            ])

        def _new_edge_union():
            # a brand new edge feature union at each call
            return FeatureUnion([
                ("1hot", Pipeline([
                    ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
                ])),
                ("boolean", Pipeline([('boolean', EdgeBooleanFeatures_v2())])),
                ("numerical", Pipeline([
                    ('selector', EdgeNumericalSelector()),
                    ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
                ])),
            ])

        lNodeUnion = []
        for _ in range(nbTypes):
            lNodeUnion.append(_new_node_union())
        node_transformer = TransformerListByType(lNodeUnion)

        lEdgeUnion = []
        for _ in range(nbTypes * nbTypes):
            lEdgeUnion.append(_new_edge_union())
        edge_transformer = TransformerListByType(lEdgeUnion)

        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
예제 #4
0
    def __init__(self,
                 n_tfidf_node=None,
                 t_ngrams_node=None,
                 b_tfidf_node_lc=None,
                 n_tfidf_edge=None,
                 t_ngrams_edge=None,
                 b_tfidf_edge_lc=None,
                 bMirrorPage=True,
                 bMultiPage=True):
        """
        Set _node_transformer, _edge_transformer and tfidfNodeTextVectorizer.

        n_tfidf_node, t_ngrams_node, b_tfidf_node_lc:
            passed to the node-text TfidfVectorizer as max_features,
            ngram_range and lowercase respectively.
        n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc:
            same, for each of the edge-text TfidfVectorizer objects.
        bMirrorPage, bMultiPage:
            forwarded to the EdgeTransformerSourceText and
            EdgeTransformerTargetText selectors. In addition, the
            "sourcetext2" and "targettext2" edge features exist only when
            bMultiPage is true.

        All constructor arguments are also stored on the instance under
        their own name.
        """
        FeatureDefinition.__init__(self)

        self.n_tfidf_node, self.t_ngrams_node, self.b_tfidf_node_lc = n_tfidf_node, t_ngrams_node, b_tfidf_node_lc
        self.n_tfidf_edge, self.t_ngrams_edge, self.b_tfidf_edge_lc = n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc
        self.bMirrorPage = bMirrorPage
        self.bMultiPage = bMultiPage

        # ---- local factories: each call builds brand new objects ----------

        def _char_tfidf(bLowercase, nMaxFeatures, tNgrams):
            # character n-gram TfidfVectorizer (tNgrams is e.g. (2,6))
            return TfidfVectorizer(lowercase=bLowercase,
                                   max_features=nMaxFeatures,
                                   analyzer='char',
                                   ngram_range=tNgrams,
                                   dtype=np.float64)

        def _scaled(sName, selector):
            # (name, pipeline) feature: selector then in-place quantile scaling
            # (v1 used a StandardScaler instead)
            return (sName, Pipeline([
                ('selector', selector),
                (sName, QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                            copy=False)),
            ]))

        def _edge_text(sName, clsSelector, iIndex):
            # (name, pipeline) feature: text selector, char tf-idf, densify
            return (sName, Pipeline([
                ('selector', clsSelector(iIndex,
                                         bMirrorPage=bMirrorPage,
                                         bMultiPage=bMultiPage)),
                ('tfidf', _char_tfidf(self.b_tfidf_edge_lc,
                                      self.n_tfidf_edge,
                                      self.t_ngrams_edge)),
                # pystruct needs an array, not a sparse matrix
                ('todense', SparseToDense()),
            ]))

        # ---- node features -------------------------------------------------
        # built on its own because we can use it separately from the
        # pipeline once fitted
        tdifNodeTextVectorizer = _char_tfidf(self.b_tfidf_node_lc,
                                             self.n_tfidf_node,
                                             self.t_ngrams_node)

        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        # NOTE: the former 'ocr', 'pnumre' and 'doc_tfidf' node features are
        # disabled.
        node_transformer = FeatureUnion([
            ("text", Pipeline([
                ('selector', NodeTransformerTextEnclosed()),
                ('tfidf', tdifNodeTextVectorizer),
                # pystruct needs an array, not a sparse matrix
                ('todense', SparseToDense()),
            ])),
            _scaled("textlen", NodeTransformerTextLen()),
            _scaled("xywh", NodeTransformerXYWH_v2()),
            _scaled("neighbors", NodeTransformerNeighbors()),
            # does the 1-hot encoding directly
            ("1hot", Pipeline([('1hot', Node1HotFeatures())])),
        ])

        # ---- edge features -------------------------------------------------
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        lEdgeFeature = [
            ("1hot", Pipeline([
                ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
            ])),
            ("boolean", Pipeline([('boolean', EdgeBooleanFeatures_v2())])),
            _scaled("numerical", EdgeNumericalSelector()),
        ]

        # One source-text and one target-text feature per selector index.
        # Index 2 is only used when bMultiPage is true.
        # NOTE(review): the meaning of the index is defined by the selector
        # classes, which are not visible from here.
        lTextIndex = [0, 1]
        if bMultiPage:
            lTextIndex.append(2)
        for iIndex in lTextIndex:
            lEdgeFeature.append(_edge_text("sourcetext%d" % iIndex,
                                           EdgeTransformerSourceText, iIndex))
            lEdgeFeature.append(_edge_text("targettext%d" % iIndex,
                                           EdgeTransformerTargetText, iIndex))

        edge_transformer = FeatureUnion(lEdgeFeature)

        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
        self.tfidfNodeTextVectorizer = tdifNodeTextVectorizer
    def __init__(self, nbClass=None,
                 n_feat_node=None, t_ngrams_node=None, b_node_lc=None,
                 n_feat_edge=None, t_ngrams_edge=None, b_edge_lc=None,
                 n_jobs=1):
        """
        Set _node_transformer and _edge_transformer, using a
        NodeTransformerLogit for the node text and an EdgeTransformerLogit
        (built on the same NodeTransformerLogit) for the edges.

        nbClass: number of classes, mandatory (asserted to be truthy).
        n_feat_node, t_ngrams_node, b_node_lc: passed to NodeTransformerLogit.
        n_feat_edge, t_ngrams_edge, b_edge_lc: only stored on the instance here.
        n_jobs: NodeTransformerLogit gets max(1, n_jobs - 1) jobs, while both
            FeatureUnion objects are built with n_jobs=1.
        """
        FeatureDefinition.__init__(self)

        # NOTE(review): an assert statement is skipped when Python runs with -O
        assert nbClass, "Error: indicate the number of classes"
        self.nbClass = nbClass

        self.n_feat_node = n_feat_node
        self.t_ngrams_node = t_ngrams_node
        self.b_node_lc = b_node_lc

        self.n_feat_edge = n_feat_edge
        self.t_ngrams_edge = t_ngrams_edge
        self.b_edge_lc = b_edge_lc

        # Why the feature unions stay at n_jobs=1:
        # a run under Python 2.7 (DU_GTBooks_5labels.py, after
        # "no such file : CV_5/model_A_fold_1_transf.pkl") failed in
        # fitTranformers, at self._node_transformer.fit(lAllNode). The error,
        # re-raised by sklearn's bundled joblib (parallel.py, retrieve), was
        #     RuntimeError: maximum recursion depth exceeded
        # This is presumably due to the cyclic links to node's neighbours.
        # But joblib.Parallel uses cPickle, so we cannot specialize the
        # serialization of the Block objects.
        #     JLM April 2017
        # In short: we cannot pickle the list of graph, so n_jobs = 1 for
        # this part!
        n_jobs_from_graph = 1
        # all jobs but one go to the NodeTransformerLogit
        # (an earlier version gave it half of the jobs)
        n_jobs_NodeTransformerLogit = max(1, n_jobs - 1)

        # we keep a ref onto it because its fitting needs not only all the
        # nodes, but also additional info, available on the graph objects
        self._node_transf_logit = NodeTransformerLogit(nbClass,
                                                       self.n_feat_node,
                                                       self.t_ngrams_node,
                                                       self.b_node_lc,
                                                       n_jobs=n_jobs_NodeTransformerLogit)

        # CAREFUL IF YOU CHANGE THE LISTS BELOW - see cleanTransformers method!!!!
        # Numerical features are quantile-scaled in place (copy=False);
        # v2 used a StandardScaler instead.
        # NOTE: the former 'ocr', 'pnumre' and 'doc' node features are disabled.

        # ---- node features ----
        lNodeFeature = [("text", self._node_transf_logit)]
        lNodeFeature.append(
            ("textlen", Pipeline([
                ('selector', NodeTransformerTextLen()),
                ('textlen', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
            ])))
        lNodeFeature.append(
            ("xywh", Pipeline([
                ('selector', NodeTransformerXYWH_v2()),
                ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
            ])))
        lNodeFeature.append(
            ("neighbors", Pipeline([
                ('selector', NodeTransformerNeighbors()),
                ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
            ])))
        lNodeFeature.append(
            # does the 1-hot encoding directly
            ("1hot", Pipeline([('1hot', Node1HotFeatures())])))
        node_transformer = FeatureUnion(lNodeFeature, n_jobs=n_jobs_from_graph)

        # ---- edge features ----
        lEdgeFeature = []
        lEdgeFeature.append(
            ("1hot", Pipeline([
                ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
            ])))
        lEdgeFeature.append(
            ("boolean", Pipeline([('boolean', EdgeBooleanFeatures_v2())])))
        lEdgeFeature.append(
            ("numerical", Pipeline([
                ('selector', EdgeNumericalSelector()),
                ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
            ])))
        lEdgeFeature.append(
            ("nodetext", EdgeTransformerLogit(nbClass, self._node_transf_logit)))
        edge_transformer = FeatureUnion(lEdgeFeature, n_jobs=n_jobs_from_graph)

        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer