def extract_blocks(input_file, k_base):
    '''Extract block structure for each value in input file'''
    r_input = [r for r in F.read_input(input_file)]
    normalized_input = [
        F.remove_stop_words(F.normalize_str(v)) for v in r_input
    ]
    blocks = []
    for raw_terms, record in zip(r_input, normalized_input):
        blocks.append(build_blocks(record.split(), raw_terms.split(), k_base))
    return blocks
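# The F helper module used above is not part of this excerpt. The following is
# a minimal, hypothetical sketch of the behavior extract_blocks appears to
# assume (reading one raw record per line, lowercasing/stripping punctuation,
# dropping stop words); the repo's actual F module may differ.
import string

STOP_WORDS = {'the', 'a', 'an', 'of', 'and', 'in', 'on', 'for'}  # assumed stop-word list


def read_input(input_file):
    '''Yield one raw record per non-empty line of the input file (assumed format).'''
    with open(input_file, encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()
            if line:
                yield line


def normalize_str(value):
    '''Lowercase and strip punctuation (assumed normalization).'''
    return value.lower().translate(str.maketrans('', '', string.punctuation))


def remove_stop_words(value):
    '''Drop stop-word terms from a normalized string (assumed behavior).'''
    return ' '.join(t for t in value.split() if t not in STOP_WORDS)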
def create_dataframe(self, kb_file):
    '''Build a pandas DataFrame of KB segments and encode attribute labels'''
    num_attributes = len(self.get_attributes())
    try:
        tree = ET.parse(kb_file)
    except ET.ParseError as error:
        print("Error reading KB file for Pandas Dataframe. Cause: " + error.msg)
        sys.exit(1)
    record = tree.getroot()
    data = {'segment': [], 'attribute': []}
    for segment in record:
        data['segment'].append(F.normalize_str(segment.text))
        data['attribute'].append(segment.tag)
    self.df = pd.DataFrame(data, columns=['segment', 'attribute', 'label'])
    le = preprocessing.LabelEncoder()
    self.df['label'] = le.fit_transform(self.df['attribute'])
    self.labels_dict = le.inverse_transform(list(range(num_attributes)))
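# Both create_dataframe and init_kb iterate over the children of the KB root,
# taking each child's tag as the attribute name and its text as a segment
# value. A minimal, hypothetical KB fragment illustrating that expected shape
# (the tags and values are illustrative only, not taken from the real KB):
import xml.etree.ElementTree as ET

sample_kb = '''
<kb>
    <title>the art of computer programming</title>
    <author>donald knuth</author>
    <venue>addison wesley</venue>
</kb>
'''

root = ET.fromstring(sample_kb)
for segment in root:
    # e.g. ('title', 'the art of computer programming')
    print(segment.tag, segment.text.strip())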
def init_kb(self, kb_file):
    '''Parse the Knowledge Base and count term frequencies per attribute'''
    try:
        tree = ET.parse(kb_file)
    except ET.ParseError as error:
        print("Error reading KB file. Cause: " + error.msg)
        sys.exit(1)
    data = tree.getroot()
    for segment in data:
        attr = segment.tag
        text = F.normalize_str(segment.text)
        self.registers.add(text)
        if attr not in self.k_base:
            self.k_base[attr] = {}
        terms = text.split()
        for term in terms:
            if term not in self.k_base[attr]:
                self.k_base[attr][term] = 0
            self.k_base[attr][term] += 1
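# For a KB like the fragment shown above, this version of init_kb builds a
# nested term-frequency dictionary (illustrative values only, assuming
# F.normalize_str leaves this already-lowercase text unchanged):
#
#   self.k_base == {
#       'title':  {'the': 1, 'art': 1, 'of': 1, 'computer': 1, 'programming': 1},
#       'author': {'donald': 1, 'knuth': 1},
#       'venue':  {'addison': 1, 'wesley': 1},
#   }
#   self.registers == {'the art of computer programming', 'donald knuth', 'addison wesley'}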
def init_kb(self, kb_file):
    '''Parse Knowledge Base and prepare it to extract the content-based features'''
    logger.info('Parsing knowledge base file...')
    data = F.read_k_base(kb_file)
    for item in data:
        attribute = item.tag
        value = F.remove_stop_words(F.normalize_str(item.text))
        # Skip values that contain only stop words
        if not value:
            continue
        terms = value.split()
        # Record, for each term, which terms follow it and under which attribute
        i = 0
        while i < len(terms) - 1:
            if terms[i] in self.co_occurrences:
                if (terms[i + 1], attribute) not in self.co_occurrences[terms[i]]:
                    self.co_occurrences[terms[i]].append((terms[i + 1], attribute))
            else:
                # Start the term's co-occurrence list with its first pair
                self.co_occurrences[terms[i]] = [(terms[i + 1], attribute)]
            i += 1
        if terms[-1] not in self.co_occurrences:
            self.co_occurrences[terms[-1]] = []
        # Track per-attribute term frequencies as Occurrence objects
        for term in terms:
            occurrence = Occurrence(term)
            if attribute in self.k_base:
                if term not in [obj.term for obj in self.k_base[attribute]]:
                    self.k_base[attribute].append(occurrence)
                else:
                    occ = [v for v in self.k_base[attribute] if v.term == term]
                    occ[0].frequency += 1
            else:
                self.k_base[attribute] = [occurrence]
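# Occurrence is referenced above but not defined in this excerpt. A minimal
# sketch consistent with how init_kb uses it (a term plus a frequency that
# starts at 1 and is incremented on repeats); the real class may carry more.
class Occurrence:
    '''A term seen under some attribute, with its frequency in the KB.'''

    def __init__(self, term):
        self.term = term
        self.frequency = 1  # first sighting counts as one occurrence

    def __repr__(self):
        return 'Occurrence({!r}, frequency={})'.format(self.term, self.frequency)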
def record_evaluation(reference_file, results_file, attributes):
    '''Compute evaluation metrics per Record'''
    step = 'Matching Step' if 'matching_results.xml' in results_file else 'Reinforcement Step'
    # Materialize both inputs so len(results) below works even if read_input yields lazily
    reference = list(F.read_input(reference_file))
    results = list(F.read_input(results_file))
    record_evaluation = []
    for res, ref in zip(results, reference):
        results_stats = {}
        reference_stats = {}
        right_answers = {}
        attr_evaluation = {}
        result_record = ET.fromstring('<record>' + res + '</record>')
        reference_record = ET.fromstring('<record>' + ref + '</record>')
        # Count reference terms per attribute
        for reference_block in reference_record:
            if reference_block.tag not in reference_stats:
                reference_stats[reference_block.tag] = len(reference_block.text.split())
            else:
                reference_stats[reference_block.tag] += len(reference_block.text.split())
        # Count result terms per attribute, ignoring unlabeled ('none') blocks
        for result_block in result_record:
            if result_block.tag == 'none':
                continue
            if result_block.tag not in results_stats:
                results_stats[result_block.tag] = len(result_block.text.split())
            else:
                results_stats[result_block.tag] += len(result_block.text.split())
        # Count correctly labeled terms
        for result_block in result_record:
            for reference_block in reference_record:
                if F.normalize_str(result_block.text) in F.normalize_str(reference_block.text) \
                        and result_block.tag == reference_block.tag:
                    if result_block.tag not in right_answers:
                        right_answers[result_block.tag] = len(result_block.text.split())
                    else:
                        right_answers[result_block.tag] += len(result_block.text.split())
                    break
        for attr in attributes:
            if attr in results_stats and attr in reference_stats and attr in right_answers:
                attr_evaluation[attr] = Metrics()
                attr_evaluation[attr].precision = right_answers[attr] / results_stats[attr]
                attr_evaluation[attr].recall = right_answers[attr] / reference_stats[attr]
                attr_evaluation[attr].f_measure = (
                    2 * attr_evaluation[attr].precision * attr_evaluation[attr].recall /
                    (attr_evaluation[attr].precision + attr_evaluation[attr].recall))
            elif attr in results_stats and attr not in reference_stats:
                attr_evaluation[attr] = Metrics()
        record = Metrics()
        for attr in attr_evaluation:
            record.precision += attr_evaluation[attr].precision
            record.recall += attr_evaluation[attr].recall
            record.f_measure += attr_evaluation[attr].f_measure
        record.precision /= len(attr_evaluation)
        record.recall /= len(attr_evaluation)
        record.f_measure /= len(attr_evaluation)
        record_evaluation.append(record)
    precision = 0
    recall = 0
    f_measure = 0
    for record in record_evaluation:
        precision += record.precision
        recall += record.recall
        f_measure += record.f_measure
    precision /= len(results)
    recall /= len(results)
    f_measure /= len(results)
    print('----------------------------------------------------------------------------')
    print('{0} - Results Evaluation Per Record'.format(step))
    print('----------------------------------------------------------------------------')
    print('{:<20} {:<20} {:<18}'.format('Precision', 'Recall', 'F-Measure'))
    print('{:<20} {:<20} {:<18}'.format(precision, recall, f_measure))
    print()
def attribute_evaluation(reference_file, results_file, attributes):
    '''Compute evaluation metrics per Attribute'''
    step = 'Matching Step' if 'matching_results.xml' in results_file else 'Reinforcement Step'
    reference = F.read_input(reference_file)
    results = F.read_input(results_file)
    results_stats = {}
    reference_stats = {}
    right_answers = {}
    attr_evaluation = {}
    for attr in attributes:
        attr_evaluation[attr] = Metrics()
    for res, ref in zip(results, reference):
        result_record = ET.fromstring('<record>' + res + '</record>')
        reference_record = ET.fromstring('<record>' + ref + '</record>')
        # Count reference terms per attribute
        for reference_block in reference_record:
            if reference_block.tag not in reference_stats:
                reference_stats[reference_block.tag] = len(reference_block.text.split())
            else:
                reference_stats[reference_block.tag] += len(reference_block.text.split())
        # Count result terms per attribute, ignoring unlabeled ('none') blocks
        for result_block in result_record:
            if result_block.tag == 'none':
                continue
            if result_block.tag not in results_stats:
                results_stats[result_block.tag] = len(result_block.text.split())
            else:
                results_stats[result_block.tag] += len(result_block.text.split())
        # Count correctly labeled terms
        for result_block in result_record:
            for reference_block in reference_record:
                if F.normalize_str(result_block.text) in F.normalize_str(reference_block.text) \
                        and result_block.tag == reference_block.tag:
                    if result_block.tag not in right_answers:
                        right_answers[result_block.tag] = len(result_block.text.split())
                    else:
                        right_answers[result_block.tag] += len(result_block.text.split())
                    break
    for attr in attributes:
        if attr in results_stats and attr in reference_stats and attr in right_answers:
            attr_evaluation[attr].precision = right_answers[attr] / results_stats[attr]
            attr_evaluation[attr].recall = right_answers[attr] / reference_stats[attr]
            attr_evaluation[attr].f_measure = (
                2 * attr_evaluation[attr].precision * attr_evaluation[attr].recall /
                (attr_evaluation[attr].precision + attr_evaluation[attr].recall))
    print('----------------------------------------------------------------------------')
    print('{0} - Results Evaluation Per Attribute'.format(step))
    print('----------------------------------------------------------------------------')
    print('{:<15} {:<20} {:<20} {:<18}'.format('Attribute', 'Precision', 'Recall', 'F-Measure'))
    for k, v in attr_evaluation.items():
        if v.f_measure > 0:
            print('{:<15} {:<20} {:<20} {:<18}'.format(k, v.precision, v.recall, v.f_measure))
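# Metrics is used throughout the evaluation code but is not defined in this
# excerpt. A minimal sketch of what the usage implies: precision, recall and
# f_measure starting at zero, plus the calculate_f_measure() helper called by
# the per-record/per-attribute evaluators below. The real class may differ.
class Metrics:
    '''Holds precision, recall and F-measure for one attribute or record.'''

    def __init__(self):
        self.precision = 0.0
        self.recall = 0.0
        self.f_measure = 0.0

    def calculate_f_measure(self):
        '''Harmonic mean of precision and recall (0 when both are 0).'''
        if self.precision + self.recall == 0:
            self.f_measure = 0.0
        else:
            self.f_measure = (2 * self.precision * self.recall /
                              (self.precision + self.recall))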
def build_blocks(record, k_base):
    '''Split a comma-separated record into Block objects'''
    segments = record.split(",")
    blocks_list = []
    for b in segments:
        blocks_list.append(Block(F.normalize_str(b), b))
    return blocks_list
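# Block is not defined in this excerpt. Judging from the constructor call above
# and the result_block.attr / result_block.value accesses in the evaluation
# functions, it pairs a normalized segment with its raw text and carries the
# attribute assigned later. A hypothetical sketch only; the real class (and the
# variant taking k_base-related arguments) is likely richer.
class Block:
    '''A candidate segment: normalized value, original text and assigned attribute.'''

    def __init__(self, value, raw_value, attr='none'):
        self.value = value          # normalized segment text
        self.raw_value = raw_value  # original, un-normalized text
        self.attr = attr            # attribute label, 'none' until matched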
def evaluate_results_per_record(results, reference_file, attributes):
    '''Compute evaluation metrics per record'''
    reference = F.read_file(reference_file)
    record_evaluation = []
    for result_record, ref in zip(results, reference):
        results_stats = {}
        reference_stats = {}
        right_answers = {}
        attr_evaluation = {}
        reference_record = ET.fromstring('<record>' + ref + '</record>')
        # Count reference terms per attribute
        for reference_block in reference_record:
            if reference_block.tag not in reference_stats:
                reference_stats[reference_block.tag] = len(reference_block.text.split())
            else:
                reference_stats[reference_block.tag] += len(reference_block.text.split())
        # Count result terms per attribute, ignoring unlabeled ('none') blocks
        for result_block in result_record:
            if result_block.attr == 'none':
                continue
            if result_block.attr not in results_stats:
                results_stats[result_block.attr] = len(result_block.value.split())
            else:
                results_stats[result_block.attr] += len(result_block.value.split())
        # Count correctly labeled terms
        for result_block in result_record:
            for reference_block in reference_record:
                if result_block.value in F.normalize_str(reference_block.text) \
                        and result_block.attr == reference_block.tag:
                    if result_block.attr not in right_answers:
                        right_answers[result_block.attr] = len(result_block.value.split())
                    else:
                        right_answers[result_block.attr] += len(result_block.value.split())
                    break
        for attr in attributes:
            if attr in results_stats and attr in reference_stats and attr in right_answers:
                attr_evaluation[attr] = Metrics()
                attr_evaluation[attr].precision = right_answers[attr] / results_stats[attr]
                attr_evaluation[attr].recall = right_answers[attr] / reference_stats[attr]
                attr_evaluation[attr].calculate_f_measure()
            elif attr in results_stats and attr not in reference_stats:
                attr_evaluation[attr] = Metrics()
        record = Metrics()
        for attr in attr_evaluation:
            record.precision += attr_evaluation[attr].precision
            record.recall += attr_evaluation[attr].recall
            record.f_measure += attr_evaluation[attr].f_measure
        record.precision /= len(attr_evaluation)
        record.recall /= len(attr_evaluation)
        record.f_measure /= len(attr_evaluation)
        record_evaluation.append(record)
    final_metrics = Metrics()
    for record in record_evaluation:
        final_metrics.precision += record.precision
        final_metrics.recall += record.recall
        final_metrics.f_measure += record.f_measure
    final_metrics.precision /= len(record_evaluation)
    final_metrics.recall /= len(record_evaluation)
    final_metrics.f_measure /= len(record_evaluation)
    print('---------- Results Evaluation Per Record ----------')
    print('{:<20} {:<20} {:<18}'.format('Precision', 'Recall', 'F-Measure'))
    print('{:<20} {:<20} {:<18}'.format(final_metrics.precision,
                                        final_metrics.recall,
                                        final_metrics.f_measure))
    print()
def evaluate_results_per_attribute(results, reference_file, attributes):
    '''Compute evaluation metrics per attribute'''
    reference = F.read_file(reference_file)
    results_stats = {}
    reference_stats = {}
    right_answers = {}
    attr_evaluation = {}
    for attr in attributes:
        attr_evaluation[attr] = Metrics()
    for result_record, ref in zip(results, reference):
        reference_record = ET.fromstring('<record>' + ref + '</record>')
        # Count reference terms per attribute
        for reference_block in reference_record:
            if reference_block.tag not in reference_stats:
                reference_stats[reference_block.tag] = len(reference_block.text.split())
            else:
                reference_stats[reference_block.tag] += len(reference_block.text.split())
        # Count result terms per attribute, ignoring unlabeled ('none') blocks
        for result_block in result_record:
            if result_block.attr != 'none':
                if result_block.attr not in results_stats:
                    results_stats[result_block.attr] = len(result_block.value.split())
                else:
                    results_stats[result_block.attr] += len(result_block.value.split())
        # Count correctly labeled terms
        for result_block in result_record:
            for reference_block in reference_record:
                if result_block.value in F.normalize_str(reference_block.text) \
                        and result_block.attr == reference_block.tag:
                    if result_block.attr not in right_answers:
                        right_answers[result_block.attr] = len(result_block.value.split())
                    else:
                        right_answers[result_block.attr] += len(result_block.value.split())
                    break
    for attr in attributes:
        if attr in results_stats and attr in reference_stats and attr in right_answers:
            attr_evaluation[attr].precision = right_answers[attr] / results_stats[attr]
            attr_evaluation[attr].recall = right_answers[attr] / reference_stats[attr]
            attr_evaluation[attr].calculate_f_measure()
    print()
    print('---------- Results Evaluation Per Attribute ----------')
    print('{:<15} {:<20} {:<20} {:<18}'.format('Attribute', 'Precision', 'Recall', 'F-Measure'))
    total_metrics = Metrics()
    non_zero_attrs = 0
    for k, v in attr_evaluation.items():
        if v.f_measure > 0:
            print('{:<15} {:<20} {:<20} {:<18}'.format(k, v.precision, v.recall, v.f_measure))
            total_metrics.precision += v.precision
            total_metrics.recall += v.recall
            total_metrics.f_measure += v.f_measure
            non_zero_attrs += 1
    total_metrics.precision /= non_zero_attrs
    total_metrics.recall /= non_zero_attrs
    total_metrics.f_measure /= non_zero_attrs
    print()
    print('{:<15} {:<20} {:<20} {:<18}'.format('Total', total_metrics.precision,
                                               total_metrics.recall,
                                               total_metrics.f_measure))
    print()
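# A tiny, self-contained usage sketch for the two evaluators above. It fakes
# matching output with lightweight objects exposing .attr and .value (the shape
# the evaluators expect) and writes a one-line reference file. All names and
# data here are illustrative only, and the sketch assumes F.read_file yields
# one record per line and F.normalize_str leaves this already-lowercase text
# unchanged.
import os
import tempfile
from collections import namedtuple

FakeBlock = namedtuple('FakeBlock', ['attr', 'value'])


def _demo_evaluation():
    reference_line = ('<title>the art of computer programming</title>'
                      '<author>donald knuth</author>')
    results = [[FakeBlock('title', 'the art of computer programming'),
                FakeBlock('author', 'donald knuth')]]
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as handle:
        handle.write(reference_line + '\n')
        reference_file = handle.name
    try:
        evaluate_results_per_record(results, reference_file, ['title', 'author'])
        evaluate_results_per_attribute(results, reference_file, ['title', 'author'])
    finally:
        os.remove(reference_file)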