def __init__(self):
    """
    Build the node and edge transformers of this feature definition and
    register them with setTransformers.

    Nodes: geometric features only.
    Edges: 1-hot encoding of the edge class, then the edge geometric features.
    When self.bShiftEdgeByClass is set, the edge union is followed by an
    EdgeClassShifter, giving one range of features per class of edge.
    """
    FeatureDefinition.__init__(self)

    # --- NODES (Node_Geometry: one can set nQuantile=...)
    lNodeStep = [("geometry", Node_Geometry())]
    node_union = FeatureUnion(lNodeStep)

    # --- EDGES
    # the classes of edge we can get depend on the type of graph
    lEdgeClass = [HorizontalEdge, VerticalEdge]
    lEdgeStep = [
        # edge class, 1-hot encoded: KEEP IT FIRST in the union
        ('1hot', Edge_Type_1Hot(lEdgeClass=lEdgeClass)),
        # Edge_Geometry: one can set nQuantile=...
        ('geom', Edge_Geometry()),
    ]
    edge_union = FeatureUnion(lEdgeStep)

    # Optional shift of the edge features by class of edge.
    # IMPORTANT: the shifter relies on the 1-hot encoding being the first
    # part of the union AND on being given the right number of edge classes.
    if not self.bShiftEdgeByClass:
        edge_final = edge_union
    else:
        edge_final = Pipeline([
            ('edge_transformer', edge_union),
            ('shifter', EdgeClassShifter(len(lEdgeClass))),
        ])

    self.setTransformers(node_union, edge_final)
def __init__(self, **kwargs):
    """
    Initialise the base FeatureDefinition and reset the three members this
    class works with: _node_transformer, _edge_transformer and
    _node_text_vectorizer (the tfidf node text vectorizer). All start as None.

    The keyword arguments are accepted but not used by this constructor.
    """
    FeatureDefinition.__init__(self)
    # nothing is built yet: every transformer slot starts empty
    self._node_transformer = self._edge_transformer = self._node_text_vectorizer = None
def __init__(self):
    """
    Build the node and edge transformers of this feature definition and
    register them with setTransformers.

    Nodes: geometry, neighbour count and character n-grams of the text.
    Edges: 1-hot edge class, constant 1, geometry, character n-grams of the
    source and target texts, plus the two separator features when
    self.bSeparator is set.
    When self.bShiftEdgeByClass is set, the edge union is followed by an
    EdgeClassShifter, giving one range of features per class of edge.

    The lists of (name, transformer) pairs are kept in self.lNodeFeature and
    self.lEdgeFeature.
    """
    FeatureDefinition.__init__(self)

    # --- NODES
    lNodeStep = [
        ("geometry", Node_Geometry()),                # one can set nQuantile=...
        ("neighbor_count", Node_Neighbour_Count()),   # one can set nQuantile=...
        # character n-grams: 500 n-grams, N in (2,3), no lowercasing
        ("text", Node_Text_NGram('char', 500, (2, 3), False)),
    ]
    self.lNodeFeature = lNodeStep
    node_union = FeatureUnion(lNodeStep)

    # --- EDGES
    # the classes of edge we can get depend on the type of graph
    lEdgeClass = [HorizontalEdge, VerticalEdge]
    lEdgeStep = [
        # edge class, 1-hot encoded: KEEP IT FIRST in the union
        ('1hot', Edge_Type_1Hot(lEdgeClass=lEdgeClass)),
        # optional constant 1 for CRF
        ('1', Edge_1()),
        # one can set nQuantile=...
        ('geom', Edge_Geometry()),
        # character n-grams: 250 n-grams, N in (2,3), no lowercasing
        ('src_txt', Edge_Source_Text_NGram('char', 250, (2, 3), False)),
        ('tgt_txt', Edge_Target_Text_NGram('char', 250, (2, 3), False)),
    ]
    self.lEdgeFeature = lEdgeStep
    if self.bSeparator:
        # separator features go at the end of the union
        self.lEdgeFeature = lEdgeStep + [
            ('sprtr_bool', Separator_boolean()),
            ('sprtr_num', Separator_num()),
        ]

    # this union of features can be used directly as edge transformer...
    edge_union = FeatureUnion(self.lEdgeFeature)

    # ... or, optionally, followed by a shift of the features by class of edge.
    # IMPORTANT: the shifter relies on the 1-hot encoding being the first
    # part of the union AND on being given the right number of edge classes.
    if not self.bShiftEdgeByClass:
        edge_final = edge_union
    else:
        edge_final = Pipeline([
            ('fu', edge_union),
            ('shifter', EdgeClassShifter(len(lEdgeClass))),
        ])

    self.setTransformers(node_union, edge_final)
def __init__(self):
    """
    Build the text-free node and edge transformers and store them in
    self._node_transformer and self._edge_transformer.

    Nodes: "xywh", "neighbors" and "twy" selectors, each followed by a
    QuantileTransformer working in place (copy=False).
    Edges: boolean alignment features, and numerical features followed by a
    QuantileTransformer working in place.

    self.tfidfNodeTextVectorizer is set to None.
    """
    FeatureDefinition.__init__(self)

    # NOTE: in every pipeline below, the quantile scaling replaced a former
    #       ("v1") StandardScaler(copy=False, with_mean=True, with_std=True)

    # --- NODES
    pipe_xywh = Pipeline([
        ('selector', NodeTransformerXYWH()),
        ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    pipe_neighbors = Pipeline([
        ('selector', NodeTransformerNeighbors()),
        ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    pipe_twy = Pipeline([
        ('selector', NodeTransformerTWY()),
        ('twy', QuantileTransformer(n_quantiles=16, copy=False)),
    ])
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    node_union = FeatureUnion([
        ("xywh", pipe_xywh),
        ("neighbors", pipe_neighbors),
        ("twy", pipe_twy),
    ])

    # --- EDGES
    pipe_boolean = Pipeline([('boolean', EdgeBooleanAlignmentFeatures())])
    pipe_numerical = Pipeline([
        ('selector', EdgeNumericalSelector_noText()),
        ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    edge_union = FeatureUnion([
        ("boolean", pipe_boolean),
        ("numerical", pipe_numerical),
    ])

    # the three members: _node_transformer, _edge_transformer, tfidfNodeTextVectorizer
    self._node_transformer = node_union
    self._edge_transformer = edge_union
    self.tfidfNodeTextVectorizer = None  # no text feature here
def __init__(self):
    """
    Build the node and edge transformers of this feature definition and
    register them with setTransformers.

    Nodes: geometric features only.
    Edges: 1-hot encoding of the edge class, then the edge geometric
    features, plus the two separator features when self.bSeparator is set.
    When self.bShiftEdgeByClass is set, the edge union is followed by an
    EdgeClassShifter, giving one range of features per class of edge.

    The lists of (name, transformer) pairs are kept in self.lNodeFeature and
    self.lEdgeFeature.
    """
    FeatureDefinition.__init__(self)

    # --- NODES (Node_Geometry: one can set nQuantile=...)
    lNodeStep = [("geometry", Node_Geometry())]
    self.lNodeFeature = lNodeStep
    node_union = FeatureUnion(lNodeStep)

    # --- EDGES
    # NOTE(review): lEdgeClass is not defined in this method; presumably a
    #               module-level list of edge classes - verify.
    lEdgeStep = [
        # edge class, 1-hot encoded: KEEP IT FIRST in the union
        ('1hot', Edge_Type_1Hot(lEdgeClass=lEdgeClass)),
        # Edge_Geometry: one can set nQuantile=...
        ('geom', Edge_Geometry()),
    ]
    self.lEdgeFeature = lEdgeStep
    if self.bSeparator:
        # separator features go at the end of the union
        self.lEdgeFeature = lEdgeStep + [
            ('sprtr_bool', Separator_boolean()),
            ('sprtr_num', Separator_num()),
        ]
    edge_union = FeatureUnion(self.lEdgeFeature)

    # Optional shift of the edge features by class of edge.
    # IMPORTANT: the shifter relies on the 1-hot encoding being the first
    # part of the union AND on being given the right number of edge classes.
    if not self.bShiftEdgeByClass:
        edge_final = edge_union
    else:
        edge_final = Pipeline([
            ('edge_transformer', edge_union),
            ('shifter', EdgeClassShifter(len(lEdgeClass))),
        ])

    self.setTransformers(node_union, edge_final)
def __init__(self,
             n_tfidf_node=None, t_ngrams_node=None, b_tfidf_node_lc=None,
             n_tfidf_edge=None, t_ngrams_edge=None, b_tfidf_edge_lc=None,
             bMirrorPage=True, bMultiPage=True):
    """
    Build the node and edge transformers and store them in
    self._node_transformer and self._edge_transformer. The tfidf vectorizer
    of the node text is also kept in self.tfidfNodeTextVectorizer.

    n_tfidf_node, t_ngrams_node, b_tfidf_node_lc:
        max_features, ngram_range and lowercase of the character-level
        TfidfVectorizer applied to the node text.
    n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc:
        same settings, for the TfidfVectorizer applied to the source and
        target texts of the edges (only built when bMultiPage is true).
    bMirrorPage, bMultiPage:
        stored on self and passed to the edge text selectors; bMultiPage
        also decides whether the "sourcetext2"/"targettext2" edge features
        are part of the union.

    All constructor arguments are stored on self under the same names.
    """
    FeatureDefinition.__init__(self)

    self.n_tfidf_node = n_tfidf_node
    self.t_ngrams_node = t_ngrams_node
    self.b_tfidf_node_lc = b_tfidf_node_lc
    self.n_tfidf_edge = n_tfidf_edge
    self.t_ngrams_edge = t_ngrams_edge
    self.b_tfidf_edge_lc = b_tfidf_edge_lc
    self.bMirrorPage = bMirrorPage
    self.bMultiPage = bMultiPage

    # kept in a variable: it can be used separately from the pipeline once fitted
    node_tfidf = TfidfVectorizer(lowercase=self.b_tfidf_node_lc,
                                 max_features=self.n_tfidf_node,
                                 analyzer='char',
                                 ngram_range=self.t_ngrams_node,  # e.g. (2,6)
                                 dtype=np.float64)

    # NOTE: in the numerical pipelines below, the quantile scaling (in place)
    #       replaced a former ("v1") StandardScaler(copy=False, with_mean=True, with_std=True)

    # --- NODES
    pipe_text = Pipeline([
        ('selector', NodeTransformerTextEnclosed()),
        ('tfidf', node_tfidf),
        ('todense', SparseToDense()),  # pystruct needs an array, not a sparse matrix
    ])
    pipe_textlen = Pipeline([
        ('selector', NodeTransformerTextLen()),
        ('textlen', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    pipe_xywh = Pipeline([
        ('selector', NodeTransformerXYWH()),
        ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    pipe_neighbors = Pipeline([
        ('selector', NodeTransformerNeighbors()),
        ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    pipe_node_1hot = Pipeline([
        ('1hot', Node1HotFeatures()),  # does the 1-hot encoding directly
    ])
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    # Disabled node features (not part of the union): "sem" (semantic labels),
    # "ocr", "pnumre" and "doc_tfidf" (which had to be LAST, since it includes
    # a placeholder column for the document-level tfidf).
    node_union = FeatureUnion([
        ("text", pipe_text),
        ("textlen", pipe_textlen),
        ("xywh", pipe_xywh),
        ("neighbors", pipe_neighbors),
        ("1hot", pipe_node_1hot),
    ])

    # --- EDGES
    pipe_edge_1hot = Pipeline([
        ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
    ])
    pipe_boolean = Pipeline([('boolean', EdgeBooleanFeatures())])
    pipe_numerical = Pipeline([
        ('selector', EdgeNumericalSelector()),
        ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    # Disabled edge features (not part of the union): "sourcetext0",
    # "targettext0", "sourcetext1" and "targettext1".
    lEdgeStep = [
        ("1hot", pipe_edge_1hot),
        ("boolean", pipe_boolean),
        ("numerical", pipe_numerical),
    ]

    if self.bSeparator:
        lEdgeStep += [
            ('sprtr_bool', Separator_boolean()),
            ('sprtr_num', Separator_num()),
        ]

    if bMultiPage:
        pipe_src_text = Pipeline([
            ('selector', EdgeTransformerSourceText(2, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)),
            ('tfidf', TfidfVectorizer(lowercase=self.b_tfidf_edge_lc,
                                      max_features=self.n_tfidf_edge,
                                      analyzer='char',
                                      ngram_range=self.t_ngrams_edge,  # e.g. (2,6)
                                      dtype=np.float64)),
            ('todense', SparseToDense()),  # pystruct needs an array, not a sparse matrix
        ])
        pipe_tgt_text = Pipeline([
            ('selector', EdgeTransformerTargetText(2, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)),
            ('tfidf', TfidfVectorizer(lowercase=self.b_tfidf_edge_lc,
                                      max_features=self.n_tfidf_edge,
                                      analyzer='char',
                                      ngram_range=self.t_ngrams_edge,
                                      dtype=np.float64)),
            ('todense', SparseToDense()),  # pystruct needs an array, not a sparse matrix
        ])
        lEdgeStep += [
            ("sourcetext2", pipe_src_text),
            ("targettext2", pipe_tgt_text),
        ]

    edge_union = FeatureUnion(lEdgeStep)

    # the three members: _node_transformer, _edge_transformer, tfidfNodeTextVectorizer
    self._node_transformer = node_union
    self._edge_transformer = edge_union
    self.tfidfNodeTextVectorizer = node_tfidf
def __init__(self, n_tfidf_node=None, t_ngrams_node=None, b_tfidf_node_lc=None):
    """
    Build the node and edge transformers and store them in
    self._node_transformer and self._edge_transformer.

    n_tfidf_node, t_ngrams_node, b_tfidf_node_lc:
        max_features, ngram_range and lowercase of the character-level
        TfidfVectorizer applied to the node text. They are stored on self
        under the same names.

    Nodes: tfidf of the text, text length, "xywh", "neighbors" and "twy";
    the numerical ones are each followed by a QuantileTransformer working
    in place (copy=False).
    Edges: boolean alignment features, and numerical features followed by a
    QuantileTransformer working in place. There is no text feature on the
    edges (the former n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc
    parameters are disabled).

    self.tfidfNodeTextVectorizer is set to None.
    """
    FeatureDefinition.__init__(self)

    self.n_tfidf_node = n_tfidf_node
    self.t_ngrams_node = t_ngrams_node
    self.b_tfidf_node_lc = b_tfidf_node_lc

    node_tfidf = TfidfVectorizer(lowercase=self.b_tfidf_node_lc,
                                 max_features=self.n_tfidf_node,
                                 analyzer='char',
                                 ngram_range=self.t_ngrams_node,  # e.g. (2,6)
                                 dtype=np.float64)

    # NOTE: in the numerical pipelines below, the quantile scaling replaced a
    #       former ("v1") StandardScaler(copy=False, with_mean=True, with_std=True)

    # --- NODES
    pipe_text = Pipeline([
        ('selector', NodeTransformerTextEnclosed()),
        ('tfidf', node_tfidf),
        ('todense', SparseToDense()),  # pystruct needs an array, not a sparse matrix
    ])
    pipe_textlen = Pipeline([
        ('selector', NodeTransformerTextLen()),
        ('textlen', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    pipe_xywh = Pipeline([
        ('selector', NodeTransformerXYWH()),
        ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    pipe_neighbors = Pipeline([
        ('selector', NodeTransformerNeighbors()),
        ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    pipe_twy = Pipeline([
        ('selector', NodeTransformerTWY()),
        ('twy', QuantileTransformer(n_quantiles=16, copy=False)),
    ])
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    node_union = FeatureUnion([
        ("text", pipe_text),
        ("textlen", pipe_textlen),
        ("xywh", pipe_xywh),
        ("neighbors", pipe_neighbors),
        ("twy", pipe_twy),
    ])

    # --- EDGES
    pipe_boolean = Pipeline([('boolean', EdgeBooleanAlignmentFeatures())])
    pipe_numerical = Pipeline([
        ('selector', EdgeNumericalSelector_noText()),
        ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    edge_union = FeatureUnion([
        ("boolean", pipe_boolean),
        ("numerical", pipe_numerical),
    ])

    # the three members: _node_transformer, _edge_transformer, tfidfNodeTextVectorizer
    self._node_transformer = node_union
    self._edge_transformer = edge_union
    # NOTE(review): the vectorizer built above is part of the node pipeline,
    #               yet this member is left to None (the assignment of the
    #               vectorizer is disabled) - looks deliberate, confirm.
    self.tfidfNodeTextVectorizer = None