예제 #1
0
 def convertStringtoPattern(xcur):
     """
         Convert a (possibly nested) list of values into the same nested
         structure of featureObject instances.

         - a sub-list is converted recursively and kept as a sub-list
         - a value accepted by float() becomes a NUMERICAL feature named "x"
           (weight 1, threshold self.THNUMERICAL)
         - any other value becomes an EDITDISTANCE feature named "f"
           (threshold 100.0)

         NOTE(review): `self` and `featureObject` are not defined in this
         block; they are assumed to come from the enclosing scope.

         @param xcur: list of values and/or nested lists
         @return: list of featureObject (and nested lists of them)
     """
     ## need to integrate the 'virtual level'
     lRes = []
     for elt in xcur:
         if isinstance(elt, list):
             lRes.append(convertStringtoPattern(elt))
             continue

         # Only the numeric test is guarded: errors raised while building
         # the feature must not be silently turned into a text feature.
         try:
             float(elt)
         except (TypeError, ValueError):
             isNumerical = False
         else:
             isNumerical = True

         f = featureObject()
         if isNumerical:
             f.setName("x")
             f.setType(featureObject.NUMERICAL)
             f.setValue(elt)
             f.setObjectName(elt)
             f.setWeight(1)
             f.setTH(self.THNUMERICAL)
         else:
             f.setName("f")
             f.setType(featureObject.EDITDISTANCE)
             f.setValue(elt)
             f.setTH(100.0)
         lRes.append(f)

     return lRes
예제 #2
0
    def getSetOfVInfoFeatures(self, TH, lAttributes, myObject):
        """
            Build one NUMERICAL feature per entry of lAttributes and attach
            it to this object.

            For each entry `attr`, the feature name and value are read from
            attr[0].getName() and attr[0].getValue().
            If no feature is registered afterwards, a single BOOLEAN feature
            named 'EMPTY' (value True) is added instead.

            Features already computed are returned as is (canonical values
            must be kept).

            @param TH: threshold set on every created feature
            @param lAttributes: sequence whose items expose, at index 0, an
                   object with getName() and getValue()
            @param myObject: unused here
            @return: self.getSetofFeatures()
        """
        from spm.feature import featureObject

        if self._lBasicFeatures is None:
            self._lBasicFeatures = []
        # needed to keep canonical values!
        elif self.getSetofFeatures() != []:
            return self.getSetofFeatures()

        # This loop used to be indented under the 'return' above and was
        # therefore never executed.
        for attr in lAttributes:
            name = attr[0].getName()
            value = attr[0].getValue()
            feature = featureObject()
            feature.setName(name)
            feature.setTH(TH)
            feature.addNode(self)
            feature.setObjectName(self)
            feature.setValue(value)
            feature.setType(feature.NUMERICAL)
            self.addFeature(feature)

        if self.getSetofFeatures() == []:
            feature = featureObject()
            feature.setName('EMPTY')
            feature.setTH(TH)
            feature.addNode(self)
            feature.setObjectName(self)
            feature.setValue(True)
            feature.setType(featureObject.BOOLEAN)
            self.addFeature(feature)

        return self.getSetofFeatures()
예제 #3
0
    def getSetOfListedAttributes(self, TH, lAttributes, myObject):
        """
            move to XMLObjectClass ??

            Generate a set of features from the listed attributes.

            The objects returned by self.getAllNamedObjects(myObject) are
            grouped, per attribute, by the rounded float value of that
            attribute; one NUMERICAL feature is added per (attribute, value).
            When 'virtual' is listed, a BOOLEAN feature named 'f' holding
            self.getAttribute('virtual') is added as well.

            Features already computed are returned unchanged.

            @param TH: threshold set on every created feature
            @param lAttributes: attribute names to look at
            @param myObject: passed to self.getAllNamedObjects()
            @return: self.getSetofFeatures()
        """
        from spm.feature import featureObject

        if self._lBasicFeatures is None:
            self._lBasicFeatures = []
        elif self.getSetofFeatures() != []:
            # needed to keep canonical values!
            return self.getSetofFeatures()

        def _register(name, value, kind):
            # create a feature attached to self and store it
            feat = featureObject()
            feat.setName(name)
            feat.setTH(TH)
            feat.addNode(self)
            feat.setObjectName(self)
            feat.setValue(value)
            feat.setType(kind)
            self.addFeature(feat)

        # attribute name -> {rounded value -> [objects having that value]}
        histogram = {}
        for obj in self.getAllNamedObjects(myObject):
            for attrName in lAttributes:
                buckets = histogram.setdefault(attrName, {})
                if obj.hasAttribute(attrName):
                    key = round(float(obj.getAttribute(attrName)))
                    buckets.setdefault(key, []).append(obj)

        if histogram:
            for attrName in lAttributes:
                for key in histogram[attrName]:
                    if not len(histogram[attrName][key]) > 0.1:
                        continue
                    _register(attrName, key, featureObject.NUMERICAL)

        if 'virtual' in lAttributes:
            _register('f', self.getAttribute('virtual'),
                      featureObject.BOOLEAN)

        return self.getSetofFeatures()
예제 #4
0
    def getSetOfFeaturesBB(self, TH, lAttributes, myObject):
        """
            Features derived from the bounding box (unpacked as x, y, h, w):
              'lm' : round(x)
              'rm' : round(x + w)

            The bounding box is built with self.addBoundingBox() when
            self.getBB() returns None. If no feature is registered after
            that, a BOOLEAN 'EMPTY' feature is added.

            Features already computed are returned unchanged.

            @param TH: threshold set on every created feature
            @param lAttributes: unused here
            @param myObject: unused here
            @return: self.getSetofFeatures()
        """
        from spm.feature import featureObject

        if self._lBasicFeatures is None:
            self._lBasicFeatures = []
        elif self.getSetofFeatures() != []:
            # needed to keep canonical values!
            return self.getSetofFeatures()

        def _register(name, value, kind):
            # create a feature attached to self and store it
            feat = featureObject()
            feat.setName(name)
            feat.setTH(TH)
            feat.addNode(self)
            feat.setObjectName(self.getName())
            feat.setValue(value)
            feat.setType(kind)
            self.addFeature(feat)

        # make sure a bounding box exists before reading it
        if self.getBB() is None:
            self.addBoundingBox()
        left, _top, _height, width = self.getBB()

        _register('lm', round(left), featureObject.NUMERICAL)
        _register('rm', round(left + width), featureObject.NUMERICAL)

        if self.getSetofFeatures() == []:
            _register('EMPTY', True, featureObject.BOOLEAN)

        return self.getSetofFeatures()
예제 #5
0
    def getContentAnaphoraAttributes(self,TH,lAttributes,myObject):
        """
            textual content + position wrt parent

            For every object returned by self.getAllNamedObjects(myObject):
              - its position among its parent's objects is recorded under
                'position' (the last child is recorded as -1)
              - for each listed attribute the object has, the object is
                grouped under the rounded float value of that attribute
                (values raising TypeError in float() are skipped)
              - for 'text', when the object has no such attribute, the object
                is grouped under elt.getContent()

            One feature is then added per (listed attribute, value):
            EDITDISTANCE for 'position' and 'text', NUMERICAL otherwise.
            Position features are only produced when 'position' is part of
            lAttributes.

            Features already computed are returned unchanged.

            @param TH: threshold set on every created feature
            @param lAttributes: attribute names to look at
            @param myObject: passed to self.getAllNamedObjects()
            @return: self.getSetofFeatures()
        """
        from spm.feature import featureObject

        if self._lBasicFeatures is None:
            self._lBasicFeatures = []
        # needed to keep canonical values!
        elif self.getSetofFeatures() != []:
            return self.getSetofFeatures()

        # attribute name -> {value -> [objects having that value]}
        lHisto = {'position': {}}
        for elt in self.getAllNamedObjects(myObject):
            lSiblings = elt.getParent().getObjects()
            position = lSiblings.index(elt)
            if position == len(lSiblings) - 1:
                position = -1
            lHisto['position'].setdefault(str(position), []).append(elt)

            for attr in lAttributes:
                dValues = lHisto.setdefault(attr, {})
                if elt.hasAttribute(attr):
                    try:
                        key = round(float(elt.getAttribute(attr)))
                    except TypeError:
                        # best effort: ignore values float() cannot take
                        continue
                    dValues.setdefault(key, []).append(elt)
                elif attr == 'text':
                    dValues.setdefault(elt.getContent(), []).append(elt)

        for attr in lAttributes:
            if attr in ('position', 'text'):
                ftype = featureObject.EDITDISTANCE
            else:
                ftype = featureObject.NUMERICAL
            # .get(): when no named object was found, lHisto has no entry
            # for attr; indexing it directly raised a KeyError.
            for value, lElts in lHisto.get(attr, {}).items():
                if len(lElts) > 0.1:  # 0.1: keep all!
                    feature = featureObject()
                    feature.setName(attr)
                    feature.setTH(TH)
                    feature.addNode(self)
                    feature.setObjectName(self)
                    feature.setValue(value)
                    feature.setType(ftype)
                    self.addFeature(feature)

        return self.getSetofFeatures()
예제 #6
0
    def getSetOfFeaturesPageSize(self, TH, lAttributes, myObject):
        """
            Features derived from the 'height' and 'width' attributes:
              'h' : round(float(self.getAttribute('height')))
              'w' : round(float(self.getAttribute('width')))

            If no feature is registered after that, a BOOLEAN 'EMPTY'
            feature is added.

            Features already computed are returned unchanged.

            @param TH: threshold set on every created feature
            @param lAttributes: unused here
            @param myObject: unused here
            @return: self.getSetofFeatures()
        """
        from spm.feature import featureObject

        if self._lBasicFeatures is None:
            self._lBasicFeatures = []
        elif self.getSetofFeatures() != []:
            # needed to keep canonical values!
            return self.getSetofFeatures()

        def _register(name, value, kind):
            # create a feature attached to self and store it
            feat = featureObject()
            feat.setName(name)
            feat.setTH(TH)
            feat.addNode(self)
            feat.setObjectName(self)
            feat.setValue(value)
            feat.setType(kind)
            self.addFeature(feat)

        _register('h', round(float(self.getAttribute('height'))),
                  featureObject.NUMERICAL)
        _register('w', round(float(self.getAttribute('width'))),
                  featureObject.NUMERICAL)

        if self.getSetofFeatures() == []:
            _register('EMPTY', True, featureObject.BOOLEAN)

        return self.getSetofFeatures()
예제 #7
0
    def getSetOfFeaturesXPos(self, TH, lAttr, myObject):
        """
            Horizontal position features, all NUMERICAL:
              'x'  : round(getX())
              'x2' : round(getX() + getWidth())
              'xc' : round(getX() + getWidth() / 2)

            There is no early return on previously computed features:
            addFeature() is called three times on every invocation.

            @param TH: threshold set on every created feature
            @param lAttr: unused here
            @param myObject: unused here
            @return: self.getSetofFeatures()
        """
        from spm.feature import featureObject

        if self._lBasicFeatures is None:
            self._lBasicFeatures = []

        def _register(name, value):
            # create a NUMERICAL feature attached to self and store it
            kind = featureObject.NUMERICAL
            feat = featureObject()
            feat.setName(name)
            feat.setTH(TH)
            feat.addNode(self)
            feat.setObjectName(self)
            feat.setValue(value)
            feat.setType(kind)
            self.addFeature(feat)

        _register('x', round(self.getX()))
        _register('x2', round(self.getX() + self.getWidth()))
        _register('xc', round(self.getX() + self.getWidth() / 2))

        return self.getSetofFeatures()
예제 #8
0
    def getSetOfListedAttributes(self, TH, lAttributes, myObject):
        """
            Generate a set of features: X start of the lines

            The objects returned by self.getAllNamedObjects(myObject) are
            grouped, per listed attribute, by the rounded float value of
            that attribute (values raising TypeError in float() are
            skipped); one NUMERICAL feature is added per (attribute, value).

            Extra features, added when the keyword is part of lAttributes:
              'text'     : EDITDISTANCE 'f', first word of the content,
                           threshold 90
              'tokens'   : EDITDISTANCE 'token' for each lower-cased word
                           longer than 4 characters
              'xc'       : NUMERICAL 'xc', round(getX() + getWidth() / 2)
              'virtual'  : BOOLEAN 'f', self.getAttribute('virtual')
              'bl'       : NUMERICAL 'bl', rounded distance between the
                           baseline middle of self and of each object in
                           self.next (when both baselines exist)
              'linegrid' : BOOLEAN 'linegrid<rowh>' with value ystart for
                           each entry of self.lgridlist

            Features already computed are returned unchanged.

            @param TH: threshold set on created features (except 'text')
            @param lAttributes: attribute names / keywords to look at
            @param myObject: passed to self.getAllNamedObjects()
            @return: self.getSetofFeatures()
        """
        from spm.feature import featureObject

        if self._lBasicFeatures is None:
            self._lBasicFeatures = []
        # needed to keep canonical values!
        elif self.getSetofFeatures() != []:
            return self.getSetofFeatures()

        def _addFeature(name, value, ftype, th=TH):
            # create a feature attached to self and store it
            feature = featureObject()
            feature.setName(name)
            feature.setTH(th)
            feature.addNode(self)
            feature.setObjectName(self)
            feature.setValue(value)
            feature.setType(ftype)
            self.addFeature(feature)

        # attribute name -> {rounded value -> [objects having that value]}
        lHisto = {}
        for elt in self.getAllNamedObjects(myObject):
            for attr in lAttributes:
                dValues = lHisto.setdefault(attr, {})
                if elt.hasAttribute(attr):
                    try:
                        key = round(float(elt.getAttribute(attr)))
                    except TypeError:
                        # best effort: ignore values float() cannot take
                        continue
                    dValues.setdefault(key, []).append(elt)

        for attr in lAttributes:
            # .get(): when no named object was found, lHisto is empty and
            # indexing it directly raised a KeyError.
            for value, lElts in lHisto.get(attr, {}).items():
                if len(lElts) > 0.1:
                    _addFeature(attr, value, featureObject.NUMERICAL)

        if 'text' in lAttributes:
            if len(self.getContent()):
                # whitespace-only content has a length but no word:
                # split()[0] raised an IndexError in that case.
                lWords = self.getContent().split()
                if lWords:
                    _addFeature('f', lWords[0], featureObject.EDITDISTANCE,
                                th=90)

        if 'tokens' in lAttributes:
            if len(self.getContent()):
                for token in self.getContent().split():
                    if len(token) > 4:
                        _addFeature('token', token.lower(),
                                    featureObject.EDITDISTANCE)

        if 'xc' in lAttributes:
            _addFeature('xc', round(self.getX() + self.getWidth() / 2),
                        featureObject.NUMERICAL)

        if 'virtual' in lAttributes:
            _addFeature('f', self.getAttribute('virtual'),
                        featureObject.BOOLEAN)

        if 'bl' in lAttributes:
            for inext in self.next:
                baseline = self.getBaseline()
                nbl = inext.getBaseline()
                if baseline and nbl:
                    # avg of baseline?
                    avg1 = baseline.getY() + (baseline.getY2() -
                                              baseline.getY()) / 2
                    avg2 = nbl.getY() + (nbl.getY2() - nbl.getY()) / 2
                    _addFeature('bl', round(abs(avg2 - avg1)),
                                featureObject.NUMERICAL)

        if 'linegrid' in lAttributes:
            # lgridlist entries: (ystart, rowH, y1, yoverlap)
            for ystart, rowh, _, _ in self.lgridlist:
                _addFeature('linegrid%s' % rowh, ystart,
                            featureObject.BOOLEAN)

        return self.getSetofFeatures()
예제 #9
0
    def getSetOfListedAttributes(self, TH, lAttributes, myObject):
        """
            Generate a set of features: X start of the lines

            When 'virtual' is part of lAttributes, a single BOOLEAN feature
            'f' holding self.getAttribute('virtual') is added and the method
            returns immediately.

            Otherwise the objects returned by
            self.getAllNamedObjects(myObject) whose 'width' attribute is
            greater than 0 are grouped, per listed attribute, by the rounded
            float value of that attribute. One NUMERICAL feature is added
            per (attribute, value), weighted by the sum of getHeight() of
            the grouped objects.
            If no feature is registered after that, a BOOLEAN 'EMPTY'
            feature is added.

            Features already computed are returned unchanged.

            @param TH: threshold set on every created feature
            @param lAttributes: attribute names to look at
            @param myObject: passed to self.getAllNamedObjects()
            @return: self.getSetofFeatures()
        """
        from spm.feature import featureObject

        if self._lBasicFeatures is None:
            self._lBasicFeatures = []
        # needed to keep canonical values!
        elif self.getSetofFeatures() != []:
            return self.getSetofFeatures()

        def _addFeature(name, value, ftype, weight=None):
            # create a feature attached to self and store it
            feature = featureObject()
            feature.setName(name)
            if weight is not None:
                feature.setWeight(weight)
            feature.setTH(TH)
            feature.addNode(self)
            feature.setObjectName(self)
            feature.setValue(value)
            feature.setType(ftype)
            self.addFeature(feature)

        if 'virtual' in lAttributes:
            _addFeature('f', self.getAttribute('virtual'),
                        featureObject.BOOLEAN)
            return self.getSetofFeatures()

        # attribute name -> {rounded value -> [objects having that value]}
        lHisto = {}
        for elt in self.getAllNamedObjects(myObject):
            if float(elt.getAttribute('width')) > 0:
                for attr in lAttributes:
                    dValues = lHisto.setdefault(attr, {})
                    if elt.hasAttribute(attr):
                        # setdefault replaces a bare 'except:' that was only
                        # meant to catch the missing key
                        key = round(float(elt.getAttribute(attr)))
                        dValues.setdefault(key, []).append(elt)

        for attr in lAttributes:
            # .get(): when no object qualified, lHisto is empty and indexing
            # it directly raised a KeyError before 'EMPTY' could be added.
            for value, lElts in lHisto.get(attr, {}).items():
                if len(lElts) > 0.1:
                    totalHeight = sum(x.getHeight() for x in lElts)
                    _addFeature(attr, value, featureObject.NUMERICAL,
                                weight=totalHeight)

        if self.getSetofFeatures() == []:
            _addFeature('EMPTY', True, featureObject.BOOLEAN)

        return self.getSetofFeatures()
예제 #10
0
    def getSetOfListedAttributes(self, TH, lAttributes, myObject):
        """
            move to XMLObjectClass ??

            Generate a set of features from the listed attributes.

            The objects returned by self.getAllNamedObjects(myObject) are
            grouped, per attribute, by the rounded float value of that
            attribute (values raising TypeError in float() are skipped).
            One NUMERICAL feature is added per (attribute, value); every
            grouped object is attached to it as a node.
            When nothing was collected at all, the method returns right
            away, before the 'tokens' and 'virtual' features.

            Extra features, added when the keyword is part of lAttributes:
              'tokens'  : EDITDISTANCE 'token' for each lower-cased word of
                          the content longer than 4 characters
              'virtual' : BOOLEAN 'f', self.getAttribute('virtual')

            Features already computed are returned unchanged.

            @param TH: threshold set on every created feature
            @param lAttributes: attribute names / keywords to look at
            @param myObject: passed to self.getAllNamedObjects()
            @return: self.getSetofFeatures()
        """
        from spm.feature import featureObject

        if self._lBasicFeatures is None:
            self._lBasicFeatures = []
        elif self.getSetofFeatures() != []:
            # needed to keep canonical values!
            return self.getSetofFeatures()

        def _register(name, value, kind, nodes):
            # create a feature attached to the given nodes and store it
            feat = featureObject()
            feat.setName(name)
            feat.setTH(TH)
            for node in nodes:
                feat.addNode(node)
            feat.setObjectName(self)
            feat.setValue(value)
            feat.setType(kind)
            self.addFeature(feat)

        # attribute name -> {rounded value -> [objects having that value]}
        histogram = {}
        for obj in self.getAllNamedObjects(myObject):
            for attrName in lAttributes:
                buckets = histogram.setdefault(attrName, {})
                if not obj.hasAttribute(attrName):
                    continue
                try:
                    key = round(float(obj.getAttribute(attrName)))
                except TypeError:
                    continue
                buckets.setdefault(key, []).append(obj)

        # empty object
        if not histogram:
            return self.getSetofFeatures()

        for attrName in lAttributes:
            for key in histogram[attrName]:
                members = histogram[attrName][key]
                if len(members) > 0.1:
                    _register(attrName, key, featureObject.NUMERICAL,
                              members)

        if 'tokens' in lAttributes and len(self.getContent()):
            for word in self.getContent().split():
                if len(word) <= 4:
                    continue
                _register('token', word.lower(),
                          featureObject.EDITDISTANCE, [self])

        if 'virtual' in lAttributes:
            _register('f', self.getAttribute('virtual'),
                      featureObject.BOOLEAN, [self])

        return self.getSetofFeatures()