class CarbonaraBros:
    """Split the tables and lists of a DOM into relevant / not relevant.

    Two classifiers are loaded at construction time, one for ``<table>``
    nodes and one for list nodes (``<ul>``, ``<ol>``, ``<dl>``). Every node
    is turned into a feature array by a shared ``FeaturesExtractor`` and
    scored by the matching classifier.
    """

    def __init__(self, relevant_threshold=0.8):
        """Create the feature extractor and load both classifiers.

        Args:
            relevant_threshold: Minimum score (inclusive) for a node to be
                reported as relevant.
        """
        self.fe = FeaturesExtractor()
        self.relevant_threshold = relevant_threshold
        self.tableClassifier = Classifier('models/table_classifier.h5')
        self.listClassifier = Classifier('models/list_classifier.h5')

    def _classify_nodes(self, nodes, classifier, selected, features_descriptor):
        """Score ``nodes`` with ``classifier`` and bucket them by threshold.

        Returns:
            A dict with the keys ``'relevant'`` and ``'not_relevant'``, each
            holding ``(score, node)`` tuples in the order of ``nodes``.
        """
        buckets = {'relevant': [], 'not_relevant': []}
        for node in nodes:
            features = self.fe.extract(
                node,
                selected=selected,
                features_descriptor=features_descriptor)
            features_array = self.fe.toArray(features)
            probabilities = classifier.classify(features_array)
            # The element at index 1 of the classifier output is the value
            # compared against the threshold.
            score = probabilities[1]
            if score >= self.relevant_threshold:
                buckets['relevant'].append((score, node))
            else:
                buckets['not_relevant'].append((score, node))
        return buckets

    def processDom(self, dom):
        """Classify every table and list found in ``dom``.

        Args:
            dom: Object exposing ``xpath(query)`` that returns the matching
                nodes as a list.

        Returns:
            ``{'table': {...}, 'list': {...}}`` where each inner dict has the
            keys ``'relevant'`` and ``'not_relevant'`` mapped to lists of
            ``(score, node)`` tuples.
        """
        analysis = {}

        # Tables are scored before the list queries are evaluated.
        analysis['table'] = self._classify_nodes(
            dom.xpath("//table"),
            self.tableClassifier,
            DefaultFeatures.table_selected,
            DefaultFeatures.table)

        # Three separate queries keep the result grouped by tag
        # (all <ul>, then all <ol>, then all <dl>) instead of document order.
        list_nodes = [
            node
            for query in ("//ul", "//ol", "//dl")
            for node in dom.xpath(query)
        ]
        analysis['list'] = self._classify_nodes(
            list_nodes,
            self.listClassifier,
            DefaultFeatures.list_selected,
            DefaultFeatures.list)

        return analysis
def print_result(result, color):
    """Print one line per classified node: score, text summary, features.

    Args:
        result: Iterable of ``(score, node)`` tuples.
        color: Color passed to ``with_color`` for the score column.
    """
    summary_length = 60
    # One extractor is enough for the whole listing; it is only used through
    # extract() and toArray().
    extractor = FeaturesExtractor()

    for score, node in result:
        # 1st column: score rounded to two decimals.
        rounded_score = round(score, 2)

        # 2nd column: quoted text summary of the node.
        node_summary = node_text_summary(node, length=summary_length)
        node_summary = '"{}"'.format(node_summary)

        # 3rd column: feature vector, using the table or list feature set
        # depending on the node's tag.
        if node.tag == "table":
            descriptor = DefaultFeatures.table
            selected = DefaultFeatures.table_selected
        else:
            descriptor = DefaultFeatures.list
            selected = DefaultFeatures.list_selected
        features = extractor.extract(
            node, selected=selected, features_descriptor=descriptor)
        features_array = extractor.toArray(features)

        # Pads the summary column towards summary_length characters; a
        # negative count yields an empty string, so long summaries are
        # printed unpadded.
        padding = " " * (summary_length - len(node_summary))
        print(with_color(rounded_score, color=color),
              node_summary,
              padding,
              str(list(features_array)))