示例#1
0
class CarbonaraBros():
    """Score every table and list node of a DOM and split them by relevance.

    One classifier is used for ``<table>`` nodes and another for
    ``<ul>``/``<ol>``/``<dl>`` nodes. A node whose score reaches
    ``relevant_threshold`` is filed under ``'relevant'``, every other node
    under ``'not_relevant'``.
    """

    def __init__(self, relevant_threshold=0.8):
        """Build the feature extractor and the two classifiers.

        Args:
            relevant_threshold: inclusive lower bound a node's score must
                reach to be reported as relevant.
        """
        self.fe = FeaturesExtractor()
        self.relevant_threshold = relevant_threshold
        self.tableClassifier = Classifier('models/table_classifier.h5')
        self.listClassifier = Classifier('models/list_classifier.h5')

    def processDom(self, dom):
        """Classify all table and list nodes found in ``dom``.

        Args:
            dom: object exposing ``xpath(query)`` and returning lists of
                nodes (presumably an lxml tree -- confirm against callers).

        Returns:
            ``{'table': {...}, 'list': {...}}`` where each inner dict has the
            keys ``'relevant'`` and ``'not_relevant'``, each holding a list of
            ``(score, node)`` tuples in the order the nodes were visited.
        """
        table_analysis = self._classify_nodes(
            dom.xpath("//table"),
            self.tableClassifier,
            DefaultFeatures.table_selected,
            DefaultFeatures.table)

        # Three separate queries on purpose: the result keeps all <ul> nodes
        # first, then all <ol>, then all <dl>, instead of document order.
        list_nodes = dom.xpath("//ul") + dom.xpath("//ol") + dom.xpath("//dl")
        list_analysis = self._classify_nodes(
            list_nodes,
            self.listClassifier,
            DefaultFeatures.list_selected,
            DefaultFeatures.list)

        return {
            'table': table_analysis,
            'list': list_analysis,
        }

    def _classify_nodes(self, nodes, classifier, selected, descriptor):
        """Score ``nodes`` with ``classifier`` and bucket them by threshold.

        Args:
            nodes: iterable of DOM nodes to score.
            classifier: object whose ``classify(features_array)`` returns an
                indexable result; index 1 is used as the score (presumably
                the probability of the "relevant" class -- confirm).
            selected: feature selection forwarded to the extractor.
            descriptor: features descriptor forwarded to the extractor.

        Returns:
            Dict with the keys ``'relevant'`` and ``'not_relevant'``, each a
            list of ``(score, node)`` tuples.
        """
        relevant = []
        not_relevant = []
        for node in nodes:
            features = self.fe.extract(
                node,
                selected=selected,
                features_descriptor=descriptor)
            probabilities = classifier.classify(self.fe.toArray(features))

            score = probabilities[1]
            # The threshold is inclusive: a score equal to it is relevant.
            bucket = relevant if score >= self.relevant_threshold else not_relevant
            bucket.append((score, node))

        return {
            'relevant': relevant,
            'not_relevant': not_relevant,
        }
示例#2
0
def print_result(result, color):
    """Print one line per scored node: score, quoted summary, feature vector.

    Args:
        result: iterable of ``(score, node)`` pairs; each node must expose a
            ``tag`` attribute.
        color: forwarded to ``with_color`` when rendering the score column.
    """
    summary_length = 60

    for raw_score, node in result:
        # Column 1: the score, rounded to two decimals.
        shown_score = round(raw_score, 2)

        # Column 2: a quoted text summary of the node.
        quoted_summary = '"{}"'.format(
            node_text_summary(node, length=summary_length))

        # Column 3: the feature vector. Tables use the table feature
        # configuration; every other tag uses the list configuration.
        if node.tag == "table":
            descriptor = DefaultFeatures.table
            selected = DefaultFeatures.table_selected
        else:
            descriptor = DefaultFeatures.list
            selected = DefaultFeatures.list_selected
        extractor = FeaturesExtractor()
        extracted = extractor.extract(node,
                                      selected=selected,
                                      features_descriptor=descriptor)
        vector = extractor.toArray(extracted)

        # Fill the quoted summary up to summary_length characters; a negative
        # count simply yields an empty string.
        gap = " " * (summary_length - len(quoted_summary))

        columns = (
            with_color(shown_score, color=color),
            quoted_summary,
            gap,
            str(list(vector)),
        )
        print(*columns)