def generate(hg):
    """Scan every entity of ``hg`` and link qualifying concepts to a synonym.

    For each entity whose type starts with ``'c'``, the subtype holding the
    largest share of the subtypes' total degree is taken as the synonym
    candidate. ``make_synonyms`` is called for the pair only when every
    degree-based threshold in the final condition is met.

    Progress is reported through a ``progressbar.ProgressBar`` that is
    advanced once per entity returned by ``hg.all()``.

    Args:
        hg: hypergraph object; this function calls its ``atom_count``,
            ``edge_count``, ``all``, ``degree`` and ``deep_degree`` methods.

    Returns:
        The number of times ``make_synonyms`` was called.
    """
    count = 0
    ent_count = hg.atom_count() + hg.edge_count() + 1
    i = 0
    with progressbar.ProgressBar(max_value=ent_count) as bar:
        for ent in hg.all():
            if entity_type(ent)[0] == 'c':
                # Materialise so that len() and indexing are always valid.
                subs = tuple(subtypes(hg, ent))

                # check if the concept should be assigned to a synonym set
                if subs:
                    # find the subtype with the highest degree, normalizing
                    # degrees by the total degree of all subtypes
                    sub_degs = [hg.degree(sub) for sub in subs]
                    total_deg = sum(sub_degs)
                    total_deg = 1 if total_deg == 0 else total_deg
                    max_ratio = 0.
                    best_pos = -1
                    for pos, sub_deg in enumerate(sub_degs):
                        ratio = sub_deg / total_deg
                        if ratio > max_ratio:
                            max_ratio = ratio
                            best_pos = pos

                    # max_ratio is part of the acceptance test below, so
                    # checking it first skips the remaining hypergraph
                    # queries when it cannot pass. This also covers the case
                    # where no subtype has a positive degree (best_pos is
                    # still -1), which previously indexed subs[-1].
                    if max_ratio >= .7:
                        best_sub = subs[best_pos]

                        # compute some degree-related metrics; every ratio
                        # falls back to 0 when its denominator is 0 instead
                        # of raising ZeroDivisionError
                        sdd = hg.deep_degree(best_sub)
                        rdd = root_deep_degree(hg, ent)
                        sub_to_root_dd = \
                            0. if rdd == 0 else float(sdd) / float(rdd)
                        d = hg.degree(ent)
                        dd = hg.deep_degree(ent)
                        r = 0. if dd == 0 else float(d) / float(dd)
                        ld, ldd = lemma_degrees(hg, ent)
                        lr = 0. if ldd == 0 else float(ld) / float(ldd)

                        # use metrics to decide
                        if (rdd > 5 and
                                r >= .05 and
                                lr >= .05 and
                                sub_to_root_dd >= .1 and
                                (is_edge(ent) or len(root(ent)) > 2)):
                            make_synonyms(hg, ent, best_sub)
                            count += 1
            i += 1
            bar.update(i)
    return count
def process_edge(self, edge, depth):
    """Yield coreference operations between a concept and its main subtype.

    Edges whose type does not start with ``'C'``, or that are already in
    ``self.done``, produce nothing. Otherwise the edge is recorded in
    ``self.done`` and the subtype with the largest share of the subtypes'
    total deep degree is selected. When the edge's own deep degree exceeds
    that subtype's and the share is at least 0.7, ``self.corefs`` is
    incremented and the operations from ``make_corefs_ops`` are yielded.

    Args:
        edge: the edge to examine.
        depth: not used by this method.

    Yields:
        The operations produced by ``make_corefs_ops`` for the pair.
    """
    graph = self.system.get_hg(self)

    # Only unseen concept edges are processed.
    if edge.type()[0] != 'C' or edge in self.done:
        return
    self.done.add(edge)

    subs = tuple(subtypes(graph, edge))
    if len(subs) == 0:
        return

    # Share of the total deep degree held by each subtype; keep the first
    # strictly largest one (index stays -1 if no share is positive).
    degrees = [graph.deep_degree(sub) for sub in subs]
    total = sum(degrees)
    if total == 0:
        total = 1
    top_ratio = 0.
    top_idx = -1
    for idx, degree in enumerate(degrees):
        share = degree / total
        if share > top_ratio:
            top_ratio, top_idx = share, idx

    # Compare deep degree of the chosen subtype against the concept's own.
    sdd = graph.deep_degree(subs[top_idx])
    dd = graph.deep_degree(edge)
    if not dd > sdd:
        return
    sdd_dd = float(sdd) / float(dd)

    debug_lines = (
        ('concept: {}', edge.to_str()),
        ('subconcepts: {}', subs),
        ('# subs: {}', len(subs)),
        ('max_ratio: {}', top_ratio),
        ('sdd: {}', sdd),
        ('dd: {}', dd),
        ('sdd_dd: {}', sdd_dd),
    )
    for template, value in debug_lines:
        self.logger.debug(template.format(value))

    # An additional `sdd_dd < .5` condition is currently disabled.
    if top_ratio < .7:
        return

    chosen = subs[top_idx]
    self.logger.debug('are corefs: {} | {}'.format(
        edge.to_str(), chosen.to_str()))

    self.corefs += 1
    yield from make_corefs_ops(graph, edge, chosen)
def _make_singular_plural_relation(self, singular, plural):
    """Yield the operations relating ``singular`` to ``plural``.

    First yields the operations from ``make_singular_plural_ops`` and bumps
    ``self.sng_pl``, then yields those from ``make_corefs_ops`` and bumps
    ``self.corefs``. Afterwards, for every subtype of ``singular`` whose
    main concept can be replaced by ``plural`` to give an edge that exists
    in the hypergraph, the same is done recursively for that pair.

    Args:
        singular: edge for the singular form.
        plural: edge for the plural form.

    Yields:
        Operations for this pair and for every matching subtype pair.
    """
    graph = self.system.get_hg(self)
    self.logger.debug('singular: {}; plural: {}'.format(singular, plural))

    yield from make_singular_plural_ops(graph, singular, plural)
    self.sng_pl += 1

    yield from make_corefs_ops(graph, singular, plural)
    self.corefs += 1

    for child in subtypes(graph, singular):
        plural_child = child.replace_main_concept(plural)
        # Skip subtypes with no plural counterpart in the hypergraph.
        if not plural_child or not graph.exists(plural_child):
            continue
        yield from self._make_singular_plural_relation(child, plural_child)
def input_edge(self, edge):
    """Link a concept edge to its dominant subtype as a coreference.

    Edges whose type does not start with ``'c'`` are ignored. Otherwise the
    subtype holding the largest share of the subtypes' total degree is taken
    as the candidate. ``make_corefs`` is called for the pair, and
    ``self.corefs`` incremented, only when every degree-based threshold in
    the final condition is met.

    Args:
        edge: the edge to examine; this method calls its ``type``,
            ``is_atom`` and ``root`` methods.

    Returns:
        None.
    """
    if edge.type()[0] != 'c':
        return

    subs = tuple(subtypes(self.hg, edge))

    # check if the concept should be assigned to a synonym set
    if not subs:
        return

    # find the subtype with the highest degree, normalizing degrees by the
    # total degree of all subtypes
    sub_degs = [self.hg.degree(sub) for sub in subs]
    total_deg = sum(sub_degs)
    total_deg = 1 if total_deg == 0 else total_deg
    max_ratio = 0.
    best_pos = -1
    for pos, sub_deg in enumerate(sub_degs):
        ratio = sub_deg / total_deg
        if ratio > max_ratio:
            max_ratio = ratio
            best_pos = pos

    # max_ratio is part of the acceptance test below, so checking it first
    # skips the remaining hypergraph queries when it cannot pass. This also
    # covers the case where no subtype has a positive degree (best_pos is
    # still -1), which previously indexed subs[-1].
    if max_ratio < .7:
        return
    best_sub = subs[best_pos]

    # compute some degree-related metrics; every ratio falls back to 0 when
    # its denominator is 0 instead of raising ZeroDivisionError
    sdd = self.hg.deep_degree(best_sub)
    _, rdd = self.hg.root_degrees(edge)
    sub_to_root_dd = 0. if rdd == 0 else float(sdd) / float(rdd)
    d = self.hg.degree(edge)
    dd = self.hg.deep_degree(edge)
    r = 0. if dd == 0 else float(d) / float(dd)
    ld, ldd = self.hg.lemma_degrees(edge)
    lr = 0. if ldd == 0 else float(ld) / float(ldd)

    # use metrics to decide
    if (rdd > 5 and
            r >= .05 and
            lr >= .05 and
            sub_to_root_dd >= .1 and
            (not edge.is_atom() or len(edge.root()) > 2)):
        make_corefs(self.hg, edge, best_sub)
        self.corefs += 1