示例#1
0
    def train(self):
        """Train the model using the known output for the given urls

        For each output position of the first document, gather the
        best-scoring xpaths (the lowest accumulated score wins) from every
        document that has an output at that position. The gathered xpaths are
        appended to the model either abstracted via self.abstractXpaths (when
        any document's output at that position is a list) or ranked via
        self.rankXpaths and stored as a tuple.

        Returns the HtmlXpathSet built this way; self.addAttributes is applied
        to it first when self._attributes is set.
        """
        # NOTE: single-argument print(...) behaves exactly like the print
        # statement in Python 2 and is also valid Python 3 syntax.
        model = HtmlXpathSet()
        for i, expected in enumerate(self._docs[0].outputs()):
            isGroup = False
            xpaths = []
            if self._debug:
                print(expected)
            for docIndex, doc in enumerate(self._docs):
                docOutputs = doc.outputs()
                if i >= len(docOutputs):
                    # this document has fewer outputs than the first one
                    continue
                outputs = docOutputs[i]
                if isinstance(outputs, list):
                    isGroup = True
                else:
                    outputs = [outputs]
                for output in outputs:
                    # rate xpaths by the similarity of their content with the output
                    outputScores = defaultdict(int)
                    for xpath, score in doc.matchXpaths(normalizeStr(output)):
                        outputScores[xpath] += score

                    if not outputScores:
                        # no candidate at all: min() would raise ValueError,
                        # so warn and move on to the next output instead
                        print("Warning: could not find '%s' (no matching xpaths)" % (output,))
                        continue

                    # select best xpath match for each output
                    bestScore = min(outputScores.values())
                    if bestScore > 0:
                        # report the best score itself, not a leftover loop variable
                        print("Warning: could not find '%s' (score=%d)" % (output, bestScore))
                    bestXpaths = [xpath for (xpath, score) in outputScores.items() if score == bestScore]
                    xpaths.extend(bestXpaths)
                    if self._debug:
                        print(pretty([(docIndex, xpath, bestScore) for xpath in bestXpaths]))
            if xpaths:
                if isGroup:
                    model.append(self.abstractXpaths(xpaths))
                else:
                    model.append(tuple(self.rankXpaths(xpaths)))
                if self._debug:
                    # wrap in a 1-tuple so a tuple entry is formatted as a
                    # whole rather than unpacked as format arguments
                    print('Best:\n%s\n' % (model[-1],))

        if self._attributes:
            self.addAttributes(model)
        return model
示例#2
0
 def getElementText(self, e):
     """Return the text content of element ``e``, stripped of surrounding
     whitespace and passed through normalizeStr
     """
     rawText = e.text_content()
     trimmedText = rawText.strip()
     return normalizeStr(trimmedText)
示例#3
0
 def getElementText(self, e):
     """Return the text content of element ``e``, stripped of surrounding
     whitespace and passed through normalizeStr
     """
     rawText = e.text_content()
     trimmedText = rawText.strip()
     return normalizeStr(trimmedText)