예제 #1
0
def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0. if denominator is 0."""
    return 0. if denominator == 0 else float(numerator) / float(denominator)


def _dominant_subtype(hg, subs):
    """Find the subtype holding the largest share of the total degree.

    The degree of every element of `subs` is divided by the sum of all of
    their degrees.

    Returns:
        A (subtype, share) pair for the largest share. The subtype is None
        and the share is 0. when no subtype has a positive share.
    """
    sub_degs = [hg.degree(sub) for sub in subs]
    total_deg = sum(sub_degs)
    best_sub = None
    max_ratio = 0.
    for sub, sub_deg in zip(subs, sub_degs):
        ratio = _ratio(sub_deg, total_deg)
        # strict comparison: on ties the earliest subtype is kept
        if ratio > max_ratio:
            max_ratio = ratio
            best_sub = sub
    return best_sub, max_ratio


def _find_synonym(hg, ent):
    """Return the subtype that `ent` should be made a synonym of, or None.

    Every ratio is computed with a zero-denominator guard, so an entity
    whose deep degree or lemma deep degree is 0 is rejected instead of
    raising ZeroDivisionError.
    """
    # materialize so that a generator result can be measured and re-iterated
    subs = tuple(subtypes(hg, ent))
    if not subs:
        return None

    # find set with the highest degree, normalized by total degree
    best_sub, max_ratio = _dominant_subtype(hg, subs)
    # cheapest test first: without a dominant subtype nothing else is needed
    if best_sub is None or max_ratio < .7:
        return None

    # compute some degree-related metrics
    sdd = hg.deep_degree(best_sub)
    rdd = root_deep_degree(hg, ent)
    d = hg.degree(ent)
    dd = hg.deep_degree(ent)
    ld, ldd = lemma_degrees(hg, ent)

    sub_to_root_dd = _ratio(sdd, rdd)
    r = _ratio(d, dd)
    lr = _ratio(ld, ldd)

    # use metrics to decide
    if (rdd > 5 and r >= .05 and lr >= .05 and sub_to_root_dd >= .1
            and (is_edge(ent) or len(root(ent)) > 2)):
        return best_sub
    return None


def generate(hg):
    """Create synonym links between concepts and their dominant subtypes.

    Iterates over everything returned by `hg.all()`, reporting progress on a
    progress bar. For each entity whose type starts with 'c', the subtype
    with the largest degree share is looked up and, when all the
    degree-based thresholds are met, `make_synonyms` is called on the
    entity and that subtype.

    Args:
        hg: the hypergraph to scan and update.

    Returns:
        The number of times `make_synonyms` was called.
    """
    count = 0
    ent_count = hg.atom_count() + hg.edge_count() + 1
    with progressbar.ProgressBar(max_value=ent_count) as bar:
        for i, ent in enumerate(hg.all(), start=1):
            if entity_type(ent)[0] == 'c':
                synonym = _find_synonym(hg, ent)
                if synonym is not None:
                    make_synonyms(hg, ent, synonym)
                    count += 1
            bar.update(i)
    return count
예제 #2
0
    def process_edge(self, edge, depth):
        """Yield coreference operations between a concept and a subtype.

        Only edges whose type starts with 'C' and that are not yet in
        `self.done` are considered; such an edge is added to `self.done`.
        The subtype with the largest share of the subtypes' total deep
        degree is selected, and if the edge's own deep degree exceeds that
        subtype's and the share is at least .7, `self.corefs` is incremented
        and the operations from `make_corefs_ops` are yielded.

        `depth` is not used by this method.
        """
        hg = self.system.get_hg(self)

        if edge.type()[0] != 'C' or edge in self.done:
            return
        self.done.add(edge)

        candidates = tuple(subtypes(hg, edge))

        # nothing to link to when the concept has no subtypes
        if len(candidates) == 0:
            return

        # share of each subtype in the summed deep degree of all subtypes
        deep_degrees = [hg.deep_degree(candidate) for candidate in candidates]
        denominator = sum(deep_degrees)
        if denominator == 0:
            denominator = 1
        shares = [deg / denominator for deg in deep_degrees]

        # first position with the strictly largest share; -1 (i.e. the last
        # subtype) when no share is positive
        top_share = 0.
        top_index = -1
        for index, share in enumerate(shares):
            if share > top_share:
                top_share, top_index = share, index
        top_candidate = candidates[top_index]

        # compute some degree-related metrics
        sdd = hg.deep_degree(top_candidate)
        dd = hg.deep_degree(edge)
        if not (dd > sdd):
            return

        # sdd_dd is currently only reported, not used in the decision
        sdd_dd = float(sdd) / float(dd)

        report = (
            ('concept: {}', edge.to_str()),
            ('subconcepts: {}', candidates),
            ('# subs: {}', len(candidates)),
            ('max_ratio: {}', top_share),
            ('sdd: {}', sdd),
            ('dd: {}', dd),
            ('sdd_dd: {}', sdd_dd),
        )
        for template, value in report:
            self.logger.debug(template.format(value))

        if not (top_share >= .7):
            return

        self.logger.debug('are corefs: {} | {}'.format(
            edge.to_str(), top_candidate.to_str()))

        self.corefs += 1
        for operation in make_corefs_ops(hg, edge, top_candidate):
            yield operation
예제 #3
0
    def _make_singular_plural_relation(self, singular, plural):
        """Yield operations relating `singular` to `plural`, recursively.

        First the operations from `make_singular_plural_ops` are yielded and
        `self.sng_pl` is incremented, then those from `make_corefs_ops` are
        yielded and `self.corefs` is incremented. Afterwards, for every
        subtype of `singular`, the main concept is replaced by `plural`; if
        the resulting edge exists in the hypergraph, the same procedure is
        applied to that (subtype, resulting edge) pair.
        """
        hg = self.system.get_hg(self)

        self.logger.debug('singular: {}; plural: {}'.format(singular, plural))

        # the counters are bumped only after their operations were consumed
        for operation in make_singular_plural_ops(hg, singular, plural):
            yield operation
        self.sng_pl += 1

        for operation in make_corefs_ops(hg, singular, plural):
            yield operation
        self.corefs += 1

        for sub_singular in subtypes(hg, singular):
            sub_plural = sub_singular.replace_main_concept(plural)
            # skip subtypes without an existing plural counterpart
            if not sub_plural or not hg.exists(sub_plural):
                continue
            nested = self._make_singular_plural_relation(sub_singular,
                                                         sub_plural)
            for operation in nested:
                yield operation
예제 #4
0
    def input_edge(self, edge):
        """Link a concept to its dominant subtype as coreferences.

        Only edges whose type starts with 'c' are considered. The subtype
        with the largest share of the subtypes' total degree is selected
        and, when all the degree-based thresholds are met, `make_corefs` is
        called on the edge and that subtype and `self.corefs` is
        incremented.

        Ratios whose denominator is 0 are treated as 0., so an edge with a
        deep degree or lemma deep degree of 0 is rejected instead of raising
        ZeroDivisionError.

        Args:
            edge: the edge to inspect.
        """
        if edge.type()[0] != 'c':
            return

        subs = tuple(subtypes(self.hg, edge))

        # check if the concept should be assigned to a synonym set
        if len(subs) == 0:
            return

        # find set with the highest degree and normalize set
        # degrees by total degree
        sub_degs = [self.hg.degree(sub) for sub in subs]
        total_deg = sum(sub_degs)
        total_deg = 1 if total_deg == 0 else total_deg
        max_ratio = 0.
        best_pos = -1
        for pos, sub_deg in enumerate(sub_degs):
            ratio = sub_deg / total_deg
            if ratio > max_ratio:
                max_ratio = ratio
                best_pos = pos

        # cheapest test first; this also covers the case where no subtype
        # has a positive degree (best_pos is still -1 and max_ratio is 0.)
        if max_ratio < .7:
            return
        best_sub = subs[best_pos]

        # compute some degree-related metrics
        sdd = self.hg.deep_degree(best_sub)
        _, rdd = self.hg.root_degrees(edge)
        sub_to_root_dd = 0. if rdd == 0 else float(sdd) / float(rdd)
        d = self.hg.degree(edge)
        dd = self.hg.deep_degree(edge)
        r = 0. if dd == 0 else float(d) / float(dd)
        ld, ldd = self.hg.lemma_degrees(edge)
        lr = 0. if ldd == 0 else float(ld) / float(ldd)

        # use metric to decide
        if (rdd > 5 and r >= .05 and lr >= .05 and sub_to_root_dd >= .1 and
                (not edge.is_atom() or len(edge.root()) > 2)):
            make_corefs(self.hg, edge, best_sub)
            self.corefs += 1