def __init__(self, **kwargs): """ set _node_transformer, _edge_transformer, tdifNodeTextVectorizer """ FeatureDefinition.__init__(self) nbTypes = self._getTypeNumber(kwargs) print("BETTER FEATURES") block_transformer = FeatureUnion( [ #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!! ("xywh", Pipeline([ ('selector', NodeTransformerXYWH_v2()), #v1 ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True)) #use in-place scaling ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)) #use in-place scaling ]) ) , ("neighbors", Pipeline([ ('selector', NodeTransformerNeighbors()), #v1 ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True)) #use in-place scaling ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)) #use in-place scaling ]) ) , ("1hot", Pipeline([ ('1hot', Node1HotFeatures()) #does the 1-hot encoding directly ]) ) ]) grid_line_transformer = GridLine_NodeTransformer_v2() self._node_transformer = TransformerListByType([block_transformer, grid_line_transformer]) edge_BB_transformer = FeatureUnion( [ #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!! ("1hot", Pipeline([ ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())) ]) ) , ("boolean", Pipeline([ ('boolean', EdgeBooleanFeatures_v2()) ]) ) , ("numerical", Pipeline([ ('selector', EdgeNumericalSelector()), #v1 ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True)) #use in-place scaling ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)) #use in-place scaling ]) ) ] ) edge_BL_transformer = Block2GridLine_EdgeTransformer() edge_LL_transformer = GridLine2GridLine_EdgeTransformer() self._edge_transformer = TransformerListByType([edge_BB_transformer, edge_BL_transformer, edge_BL_transformer, # useless but required edge_LL_transformer ]) self.tfidfNodeTextVectorizer = None #tdifNodeTextVectorizer
def __init__(self):
    FeatureDefinition.__init__(self)

    # self.n_tfidf_node, self.t_ngrams_node, self.b_tfidf_node_lc = n_tfidf_node, t_ngrams_node, b_tfidf_node_lc
    # self.n_tfidf_edge, self.t_ngrams_edge, self.b_tfidf_edge_lc = n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc

    # tdifNodeTextVectorizer = TfidfVectorizer(lowercase=self.b_tfidf_node_lc, max_features=self.n_tfidf_node
    #                                          , analyzer='char', ngram_range=self.t_ngrams_node  #(2,6)
    #                                          , dtype=np.float64)

    node_transformer = FeatureUnion([  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        ("xywh", Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            #v1 ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
            ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
        ])),
        ("neighbors", Pipeline([
            ('selector', NodeTransformerNeighbors()),
            #v1 ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
            ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
        ])),
        ("1hot", Pipeline([
            ('1hot', Node1HotFeatures())  #does the 1-hot encoding directly
        ]))
    ])

    lEdgeFeature = [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        ("1hot", Pipeline([
            ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))
        ])),
        ("boolean", Pipeline([
            ('boolean', EdgeBooleanFeatures_v2())
        ])),
        ("numerical", Pipeline([
            ('selector', EdgeNumericalSelector()),
            #v1 ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
            ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
        ]))
    ]
    edge_transformer = FeatureUnion(lEdgeFeature)

    #return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
    self._node_transformer = node_transformer
    self._edge_transformer = edge_transformer
    self.tfidfNodeTextVectorizer = None  #tdifNodeTextVectorizer
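# Why the commented-out #v1 StandardScaler steps gave way to QuantileTransformer:
# page-geometry features are heavy-tailed, and a quantile mapping bounds them in
# [0, 1] regardless of outliers. Self-contained illustration (the data and the
# n_quantiles value are made up, not taken from the repo):
import numpy as np
from sklearn.preprocessing import QuantileTransformer
X = np.array([[1.], [2.], [3.], [1000.]])  # one outlier-ridden feature
print(QuantileTransformer(n_quantiles=4).fit_transform(X).ravel())
# -> approximately [0. 0.33 0.67 1.]: the outlier no longer dominates the scale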
def __init__(self, **kwargs):
    FeatureDefinition.__init__(self, **kwargs)

    nbTypes = self._getTypeNumber(kwargs)

    node_transformer = TransformerListByType([
        FeatureUnion([  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
            ("xywh", Pipeline([
                ('selector', NodeTransformerXYWH_v2()),
                #v1 ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
            ])),
            ("neighbors", Pipeline([
                ('selector', NodeTransformerNeighbors()),
                #v1 ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
            ])),
            ("1hot", Pipeline([
                ('1hot', Node1HotFeatures())  #does the 1-hot encoding directly
            ]))
        ]) for _i in range(nbTypes)
    ])

    edge_transformer = TransformerListByType([
        FeatureUnion([  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
            ("1hot", Pipeline([
                ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))
            ])),
            ("boolean", Pipeline([
                ('boolean', EdgeBooleanFeatures_v2())
            ])),
            ("numerical", Pipeline([
                ('selector', EdgeNumericalSelector()),
                #v1 ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
            ]))
        ]) for _i in range(nbTypes * nbTypes)
    ])

    #return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
    self._node_transformer = node_transformer
    self._edge_transformer = edge_transformer
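# Why nbTypes node transformers but nbTypes*nbTypes edge transformers: a typed
# graph has one node feature space per type, and one edge feature space per
# (source type, target type) pair. A plausible flat indexing convention for the
# list built above (assumed here purely for illustration, not taken from the repo):
nbTypes = 2  # e.g. two node types
for i_src in range(nbTypes):
    for i_tgt in range(nbTypes):
        print("edge (type %d -> type %d) uses transformer #%d"
              % (i_src, i_tgt, i_src * nbTypes + i_tgt))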
def __init__(self, n_tfidf_node=None, t_ngrams_node=None, b_tfidf_node_lc=None,
             n_tfidf_edge=None, t_ngrams_edge=None, b_tfidf_edge_lc=None,
             bMirrorPage=True, bMultiPage=True):
    FeatureDefinition.__init__(self)

    self.n_tfidf_node, self.t_ngrams_node, self.b_tfidf_node_lc = n_tfidf_node, t_ngrams_node, b_tfidf_node_lc
    self.n_tfidf_edge, self.t_ngrams_edge, self.b_tfidf_edge_lc = n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc
    self.bMirrorPage = bMirrorPage
    self.bMultiPage = bMultiPage

    tdifNodeTextVectorizer = TfidfVectorizer(lowercase=self.b_tfidf_node_lc, max_features=self.n_tfidf_node,
                                             analyzer='char', ngram_range=self.t_ngrams_node,  #(2,6)
                                             dtype=np.float64)

    node_transformer = FeatureUnion([  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        ("text", Pipeline([
            ('selector', NodeTransformerTextEnclosed()),
            # ('tfidf', TfidfVectorizer(lowercase=self.b_tfidf_node_lc, max_features=self.n_tfidf_node
            #                           , analyzer='char', ngram_range=self.tNODE_NGRAMS  #(2,6)
            #                           , dtype=np.float64)),
            ('tfidf', tdifNodeTextVectorizer),  #we can use it separately from the pipeline once fitted
            ('todense', SparseToDense())  #pystruct needs an array, not a sparse matrix
        ])),
        ("textlen", Pipeline([
            ('selector', NodeTransformerTextLen()),
            ('textlen', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
        ])),
        ("xywh", Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            #v1 ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
            ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
        ])),
        ("neighbors", Pipeline([
            ('selector', NodeTransformerNeighbors()),
            #v1 ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
            ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  #use in-place scaling
        ])),
        ("1hot", Pipeline([
            ('1hot', Node1HotFeatures())  #does the 1-hot encoding directly
        ]))
        # , ('ocr', Pipeline([
        #     ('ocr', NodeOCRFeatures())
        # ]))
        # , ('pnumre', Pipeline([
        #     ('pnumre', NodePNumFeatures())
        # ]))
        # , ("doc_tfidf", Pipeline([
        #     ('zero', Zero2Features())
        #     #THIS ONE MUST BE LAST, because it includes a placeholder column for the document-level tfidf
        # ]))
    ])

    lEdgeFeature = [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
("1hot", Pipeline([('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))])), ("boolean", Pipeline([('boolean', EdgeBooleanFeatures_v2())])), ( "numerical", Pipeline([ ('selector', EdgeNumericalSelector()), #v1 ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True)) #use in-place scaling ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)) #use in-place scaling ])), ( "sourcetext0", Pipeline([ ('selector', EdgeTransformerSourceText(0, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)), ( 'tfidf', TfidfVectorizer( lowercase=self.b_tfidf_edge_lc, max_features=self.n_tfidf_edge, analyzer='char', ngram_range=self.t_ngrams_edge #(2,6) , dtype=np.float64)), ('todense', SparseToDense() ) #pystruct needs an array, not a sparse matrix ])), ( "targettext0", Pipeline([ ('selector', EdgeTransformerTargetText(0, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)), ( 'tfidf', TfidfVectorizer( lowercase=self.b_tfidf_edge_lc, max_features=self.n_tfidf_edge, analyzer='char', ngram_range=self.t_ngrams_edge #, analyzer = 'word', ngram_range=self.tEDGE_NGRAMS , dtype=np.float64)), ('todense', SparseToDense() ) #pystruct needs an array, not a sparse matrix ])), ( "sourcetext1", Pipeline([ ('selector', EdgeTransformerSourceText(1, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)), ( 'tfidf', TfidfVectorizer( lowercase=self.b_tfidf_edge_lc, max_features=self.n_tfidf_edge, analyzer='char', ngram_range=self.t_ngrams_edge #(2,6) , dtype=np.float64)), ('todense', SparseToDense() ) #pystruct needs an array, not a sparse matrix ])), ( "targettext1", Pipeline([ ('selector', EdgeTransformerTargetText(1, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)), ( 'tfidf', TfidfVectorizer( lowercase=self.b_tfidf_edge_lc, max_features=self.n_tfidf_edge, analyzer='char', ngram_range=self.t_ngrams_edge #, analyzer = 'word', ngram_range=self.tEDGE_NGRAMS , dtype=np.float64)), ('todense', SparseToDense() ) #pystruct needs an array, not a sparse matrix ])) ] if bMultiPage: lEdgeFeature.extend([ ( "sourcetext2", Pipeline([ ('selector', EdgeTransformerSourceText(2, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)), ( 'tfidf', TfidfVectorizer( lowercase=self.b_tfidf_edge_lc, max_features=self.n_tfidf_edge, analyzer='char', ngram_range=self.t_ngrams_edge #(2,6) , dtype=np.float64)), ('todense', SparseToDense() ) #pystruct needs an array, not a sparse matrix ])), ( "targettext2", Pipeline([ ('selector', EdgeTransformerTargetText(2, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)), ( 'tfidf', TfidfVectorizer( lowercase=self.b_tfidf_edge_lc, max_features=self.n_tfidf_edge, analyzer='char', ngram_range=self.t_ngrams_edge #, analyzer = 'word', ngram_range=self.tEDGE_NGRAMS , dtype=np.float64)), ('todense', SparseToDense() ) #pystruct needs an array, not a sparse matrix ])) ]) edge_transformer = FeatureUnion(lEdgeFeature) #return _node_transformer, _edge_transformer, tdifNodeTextVectorizer self._node_transformer = node_transformer self._edge_transformer = edge_transformer self.tfidfNodeTextVectorizer = tdifNodeTextVectorizer
def __init__(self, nbClass=None,
             n_feat_node=None, t_ngrams_node=None, b_node_lc=None,
             n_feat_edge=None, t_ngrams_edge=None, b_edge_lc=None,
             n_jobs=1):
    FeatureDefinition.__init__(self)

    assert nbClass, "Error: indicate the number of classes"
    self.nbClass = nbClass
    self.n_feat_node, self.t_ngrams_node, self.b_node_lc = n_feat_node, t_ngrams_node, b_node_lc
    self.n_feat_edge, self.t_ngrams_edge, self.b_edge_lc = n_feat_edge, t_ngrams_edge, b_edge_lc

    # tdifNodeTextVectorizer = TfidfVectorizer(lowercase=self.b_node_lc, max_features=self.n_feat_node
    #                                          , analyzer='char', ngram_range=self.t_ngrams_node  #(2,6)
    #                                          , dtype=np.float64)
    """
    - loading pre-computed data from: CV_5/model_A_fold_1_transf.pkl
            no such file : CV_5/model_A_fold_1_transf.pkl
    Traceback (most recent call last):
      File "/opt/project/read/jl_git/TranskribusDU/src/tasks/DU_GTBooks_5labels.py", line 216, in <module>
        oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum, options.warm)
      File "/opt/project/read/jl_git/TranskribusDU/src/tasks/DU_CRF_Task.py", line 481, in _nfold_RunFoldFromDisk
        oReport = self._nfold_RunFold(iFold, ts_trn, lFilename_trn, train_index, test_index, bWarm=bWarm)
      File "/opt/project/read/jl_git/TranskribusDU/src/tasks/DU_CRF_Task.py", line 565, in _nfold_RunFold
        fe.fitTranformers(lGraph_trn)
      File "/opt/project/read/jl_git/TranskribusDU/src/crf/FeatureDefinition_PageXml_logit_v2.py", line 141, in fitTranformers
        self._node_transformer.fit(lAllNode)
      File "/opt/project/read/VIRTUALENV_PYTHON_FULL_type/lib/python2.7/site-packages/sklearn/pipeline.py", line 712, in fit
        for _, trans, _ in self._iter())
      File "/opt/project/read/VIRTUALENV_PYTHON_FULL_type/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 768, in __call__
        self.retrieve()
      File "/opt/project/read/VIRTUALENV_PYTHON_FULL_type/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 719, in retrieve
        raise exception
    RuntimeError: maximum recursion depth exceeded
    """
    """
    I guess this is due to the cyclic links to the nodes' neighbours.
    But joblib.Parallel uses cPickle, so we cannot specialize the serialization of the Block objects.
    JLM April 2017
    """
    n_jobs_from_graph = 1  #we cannot pickle the list of graphs, so n_jobs=1 for this part!
    # n_jobs_NodeTransformerLogit = max(1, n_jobs/2)  #half of the jobs for the NodeTransformerLogit, the rest for the others
    n_jobs_NodeTransformerLogit = max(1, n_jobs - 1)

    #we keep a ref onto it because its fitting needs not only all the nodes, but also additional info, available on the graph objects
    self._node_transf_logit = NodeTransformerLogit(nbClass, self.n_feat_node, self.t_ngrams_node, self.b_node_lc,
                                                   n_jobs=n_jobs_NodeTransformerLogit)

    node_transformer = FeatureUnion([  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
("text", self._node_transf_logit), ( "textlen", Pipeline([ ('selector', NodeTransformerTextLen()), ('textlen', StandardScaler(copy=False, with_mean=True, with_std=True)) #use in-place scaling ])), ( "xywh", Pipeline([ ('selector', NodeTransformerXYWH_v2()), ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True)) #use in-place scaling ])), ( "neighbors", Pipeline([ ('selector', NodeTransformerNeighbors()), ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True)) #use in-place scaling ])), ( "1hot", Pipeline([('1hot', Node1HotFeatures() ) #does the 1-hot encoding directly ])) # , ('ocr' , Pipeline([ # ('ocr', NodeOCRFeatures()) # ]) # ) # , ('pnumre' , Pipeline([ # ('pnumre', NodePNumFeatures()) # ]) # ) # , ("doc", Pipeline([ # ('zero', Zero2Features()) # #THIS ONE MUST BE LAST, because it include a placeholder column for the doculent-level tfidf # ]) # ) ], n_jobs=n_jobs_from_graph) lEdgeFeature = [ #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!! ("1hot", Pipeline([('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))])), ("boolean", Pipeline([('boolean', EdgeBooleanFeatures_v2())])), ( "numerical", Pipeline([ ('selector', EdgeNumericalSelector()), ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True)) #use in-place scaling ])), ("nodetext", EdgeTransformerLogit(nbClass, self._node_transf_logit)) ] edge_transformer = FeatureUnion(lEdgeFeature, n_jobs=n_jobs_from_graph) #return _node_transformer, _edge_transformer, tdifNodeTextVectorizer self._node_transformer = node_transformer self._edge_transformer = edge_transformer
def __init__(self, nbClass=None,
             n_feat_node=None, t_ngrams_node=None, b_node_lc=None,
             n_feat_edge=None, t_ngrams_edge=None, b_edge_lc=None,
             n_jobs=1):
    FeatureDefinition.__init__(self, nbClass)

    assert nbClass, "Error: indicate the number of classes"
    self.nbClass = nbClass
    self.n_feat_node, self.t_ngrams_node, self.b_node_lc = n_feat_node, t_ngrams_node, b_node_lc
    self.n_feat_edge, self.t_ngrams_edge, self.b_edge_lc = n_feat_edge, t_ngrams_edge, b_edge_lc

    # tdifNodeTextVectorizer = TfidfVectorizer(lowercase=self.b_node_lc, max_features=self.n_feat_node
    #                                          , analyzer='char', ngram_range=self.t_ngrams_node  #(2,6)
    #                                          , dtype=np.float64)
    """
    I tried to parallelize this code but I'm getting an error on Windows:
      File "c:\Local\meunier\git\TranskribusDU\src\crf\FeatureDefinition_PageXml_logit.py", line 144, in fitTranformers
        self._node_transformer.fit(lAllNode)
      File "C:\Anaconda2\lib\site-packages\sklearn\pipeline.py", line 709, in fit
        for _, trans, _ in self._iter())
      File "C:\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 768, in __call__
        self.retrieve()
      File "C:\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 719, in retrieve
        raise exception
    TypeError: can't pickle PyCapsule objects

    (virtual_python_pystruct) (C:\Anaconda2) c:\tmp_READ\tuto>python -c "import sklearn; print sklearn.__version__"
    0.18.1

    => I force n_jobs to 1
    """
    n_jobs = 1
    n_jobs_NodeTransformerLogit = max(1, n_jobs // 2)  #half of the jobs for the NodeTransformerLogit, the rest for the others

    #we keep a ref onto it because its fitting needs not only all the nodes, but also additional info, available on the graph objects
    self._node_transf_logit = NodeTransformerLogit(nbClass, self.n_feat_node, self.t_ngrams_node, self.b_node_lc,
                                                   n_jobs=n_jobs_NodeTransformerLogit)

    node_transformer = FeatureUnion([  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        ("text", self._node_transf_logit),
        ("textlen", Pipeline([
            ('selector', NodeTransformerTextLen()),
            ('textlen', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
        ])),
        ("xywh", Pipeline([
            ('selector', NodeTransformerXYWH()),
            ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
        ])),
        ("neighbors", Pipeline([
            ('selector', NodeTransformerNeighbors()),
            ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
        ])),
        ("1hot", Pipeline([
            ('1hot', Node1HotFeatures())  #does the 1-hot encoding directly
        ]))
        # , ('ocr', Pipeline([
        #     ('ocr', NodeOCRFeatures())
        # ]))
        # , ('pnumre', Pipeline([
        #     ('pnumre', NodePNumFeatures())
        # ]))
        # , ("doc", Pipeline([
        #     ('zero', Zero2Features())
        #     #THIS ONE MUST BE LAST, because it includes a placeholder column for the document-level tfidf
        # ]))
    ], n_jobs=max(1, n_jobs - n_jobs_NodeTransformerLogit))

    lEdgeFeature = [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        ("1hot", Pipeline([('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))])),
        ("boolean", Pipeline([('boolean', EdgeBooleanFeatures())])),
        ("numerical", Pipeline([
            ('selector', EdgeNumericalSelector()),
            ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
        ])),
        ("nodetext", EdgeTransformerLogit(nbClass, self._node_transf_logit))
    ]
    edge_transformer = FeatureUnion(lEdgeFeature, n_jobs=n_jobs)

    #return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
    self._node_transformer = node_transformer
    self._edge_transformer = edge_transformer
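# The Windows failure documented above, reduced to its core: with n_jobs > 1,
# FeatureUnion pickles each transformer to ship it to a worker process, so a
# single unpicklable attribute aborts the whole fit. Self-contained simulation
# (__getstate__ raising stands in for the PyCapsule member):
import pickle

class _UnpicklableSketch:
    def __getstate__(self):
        raise TypeError("can't pickle this object")

try:
    pickle.dumps(_UnpicklableSketch())
except TypeError as e:
    print(e)  # the same class of error as the traceback above -> force n_jobs=1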
def __init__(self, n_tfidf_node=None, t_ngrams_node=None, b_tfidf_node_lc=None,
             n_tfidf_edge=None, t_ngrams_edge=None, b_tfidf_edge_lc=None,
             feat_select=None, text_neighbors=False, n_tfidf_node_neighbors=500,
             XYWH_v2=False, edge_features=False):
    FeatureDefinition.__init__(self)

    self.n_tfidf_node, self.t_ngrams_node, self.b_tfidf_node_lc = n_tfidf_node, t_ngrams_node, b_tfidf_node_lc
    self.n_tfidf_edge, self.t_ngrams_edge, self.b_tfidf_edge_lc = n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc
    self.text_neighbors = text_neighbors
    self.n_tfidf_node_neighbors = n_tfidf_node_neighbors
    self.XYWH_v2 = XYWH_v2
    self.edge_features = edge_features

    #TODO n_jobs=4
    if feat_select == 'chi2':
        feat_selector = SelectKBest(chi2, k=self.n_tfidf_node)
    elif feat_select == 'mi_rr':
        print('Using Mutual Information Round Robin as Feature Selection')
        feat_selector = SelectRobinBest(mutual_information, k=self.n_tfidf_node)
        feat_selector_neigh = SelectRobinBest(mutual_information, k=self.n_tfidf_node)
    elif feat_select == 'chi2_rr':
        #chi_score = lambda x, y: chi2(x, y)[0]  #this can not be pickled ...
        feat_selector = SelectRobinBest(chi2_scores, k=self.n_tfidf_node)
        feat_selector_neigh = SelectRobinBest(chi2_scores, k=self.n_tfidf_node)
    elif feat_select == 'tf' or feat_select is None:
        feat_selector = None
    else:
        raise ValueError('Invalid Feature Selection method', feat_select)

    if feat_selector:
        tdifNodeTextVectorizer = TfidfVectorizer(lowercase=self.b_tfidf_node_lc, max_features=10000,
                                                 analyzer='char', ngram_range=self.t_ngrams_node)  #(2,6)
        text_pipeline = Pipeline([
            ('selector', NodeTransformerTextEnclosed()),
            ('tf', tdifNodeTextVectorizer),
            ('word_selector', feat_selector),
            #('todense', SparseToDense())  #Here we don't need to convert to Dense anymore
        ])
    else:
        tdifNodeTextVectorizer = TfidfVectorizer(lowercase=self.b_tfidf_node_lc, max_features=self.n_tfidf_node,
                                                 analyzer='char', ngram_range=self.t_ngrams_node,  #(2,6)
                                                 dtype=np.float64)
        text_pipeline = Pipeline([
            ('selector', NodeTransformerTextEnclosed()),
            ('tf', tdifNodeTextVectorizer),
            #('todense', SparseToDense())  #Here we don't need to convert to Dense anymore
        ])

    node_transformer_ops = [
        ("text", text_pipeline),
        ("textlen", Pipeline([
            ('selector', NodeTransformerTextLen()),
            ('textlen', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
        ])),
        ("neighbors", Pipeline([
            ('selector', NodeTransformerNeighbors()),
            ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
        ])),
        ("1hot", Pipeline([
            ('1hot', Node1HotFeatures())  #does the 1-hot encoding directly
        ]))
    ]

    if self.XYWH_v2 is True:
        feat_xy = ("xywh", Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
        ]))
    else:
        feat_xy = ("xywh", Pipeline([
            ('selector', NodeTransformerXYWH()),
            ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
        ]))
    node_transformer_ops.append(feat_xy)

    if text_neighbors:
        #BY DEFAULT we use chi2
        if self.n_tfidf_node_neighbors > 0:
            feat_selector_neigh = SelectKBest(chi2, k=self.n_tfidf_node_neighbors)
            neighborsTextVectorizer = TfidfVectorizer(lowercase=self.b_tfidf_node_lc,
                                                      analyzer='char', ngram_range=self.t_ngrams_node)  #(2,6)
            neighbors_text_pipeline = Pipeline([
                ('selector', NodeTransformerNeighborsAllText()),
                ('tf_neighbors', neighborsTextVectorizer),
                ('feat_selector', feat_selector_neigh),
            ])
        else:
            neighborsTextVectorizer = TfidfVectorizer(lowercase=self.b_tfidf_node_lc,
                                                      analyzer='char',
                                                      ngram_range=self.t_ngrams_node)  #(2,6)
            neighbors_text_pipeline = Pipeline([
                ('selector', NodeTransformerNeighborsAllText()),
                ('tf_neighbors', neighborsTextVectorizer)
            ])
        node_transformer_ops.append(('text_neighbors', neighbors_text_pipeline))
        print(node_transformer_ops)

    node_aggregated_edge_features = [
        ('1hot_edge', NodeEdgeTransformer(Edge1HotFeatures(PageNumberSimpleSequenciality()), agg_func='sum'))
    ]
    node_aggregated_edge_features.append(
        ('boolean_edge', NodeEdgeTransformer(EdgeBooleanFeatures_v2(), agg_func='sum'))
    )
    #Aggregated numerical features do not make a lot of sense here ...

    if edge_features:
        node_transformer_ops.extend(node_aggregated_edge_features)

    print(node_transformer_ops)
    node_transformer = FeatureUnion(node_transformer_ops)

    #Minimal EdgeFeature here
    lEdgeFeature = [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        ("1hot", Pipeline([('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))])),
        ("boolean", Pipeline([('boolean', EdgeBooleanFeatures())])),
        ("numerical", Pipeline([
            ('selector', EdgeNumericalSelector()),
            ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
        ]))
    ]
    edge_transformer = FeatureUnion(lEdgeFeature)

    self._node_transformer = node_transformer
    self._edge_transformer = edge_transformer
    self.tfidfNodeTextVectorizer = tdifNodeTextVectorizer
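# Hedged end-to-end sketch of the pattern shared by all the definitions above:
# a FeatureUnion of named (selector -> scaler) pipelines, fitted once on all
# training nodes and then applied per graph. The selectors below are stand-ins
# (FunctionTransformer over column slices), not the project's Node* transformers:
import numpy as np
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

union = FeatureUnion([
    ("xywh", Pipeline([
        ('selector', FunctionTransformer(lambda X: X[:, :4])),   # stand-in selector
        ('xywh', StandardScaler(copy=False)),
    ])),
    ("textlen", Pipeline([
        ('selector', FunctionTransformer(lambda X: X[:, 4:5])),  # stand-in selector
        ('textlen', StandardScaler(copy=False)),
    ])),
])
X = np.random.rand(10, 5)
print(union.fit_transform(X).shape)  # (10, 5): 4 xywh columns + 1 textlen column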