示例#1
0
def exclude(edge):
    """Tell whether ``edge`` should be excluded.

    Anything that is not an edge is excluded. For an edge, the first element
    is treated as its relationship: if that relationship is itself an edge
    the edge is kept, otherwise it is excluded exactly when the relationship
    is listed in ``EXCLUDE_RELS``.
    """
    if not sym.is_edge(edge):
        return True
    head = edge[0]
    if sym.is_edge(head):
        return False
    return head in EXCLUDE_RELS
示例#2
0
def rel_contains(full_edge, term):
    """Check whether the relationship of ``full_edge`` involves ``term``.

    The check only applies to edges with more than two elements that either
    have more than three elements or whose third element is itself an edge;
    every other input yields False. When it applies, the first element is
    the relationship: if it is an edge, ``term`` must be one of its items,
    otherwise it must equal ``term``.
    """
    if not sym.is_edge(full_edge) or len(full_edge) <= 2:
        return False
    if len(full_edge) <= 3 and not sym.is_edge(full_edge[2]):
        return False
    relation = full_edge[0]
    return term in relation if sym.is_edge(relation) else relation == term
示例#3
0
 def rel_has_term(self, edge):
     """Check whether the relationship of ``edge`` involves ``self.term``.

     Only edges with more than two elements qualify, and then only if they
     have more than three elements or their third element is itself an
     edge; anything else yields False. For a qualifying edge the first
     element is the relationship: a compound relationship must contain
     ``self.term``, a plain one must equal it.
     """
     if not sym.is_edge(edge) or len(edge) <= 2:
         return False
     if len(edge) <= 3 and not sym.is_edge(edge[2]):
         return False
     relation = edge[0]
     if sym.is_edge(relation):
         return self.term in relation
     return relation == self.term
示例#4
0
def edge2label(edge):
    """Build a space-separated text label for an edge or symbol.

    A non-edge is simply converted with ``str``. For an edge, a leading
    ``'+'`` element is dropped, a ``'+'`` prefix on the (new) first element
    is stripped when that element is not itself an edge, and the labels of
    the remaining elements are joined with single spaces.

    An empty edge, or an edge holding only the ``'+'`` connector, yields an
    empty string instead of raising ``IndexError``.
    """
    if not sym.is_edge(edge):
        return str(edge)

    items = list(edge)
    if items and items[0] == '+':
        items = items[1:]
    # Slicing with [:1] (rather than indexing [0]) keeps an empty-string
    # symbol from raising; ``items`` itself may also be empty at this point.
    if items and not sym.is_edge(items[0]) and items[0][:1] == '+':
        items[0] = items[0][1:]
    return ' '.join(edge2label(item) for item in items)
示例#5
0
 def get_label(self, edge):
     """Return the label string for ``edge``.

     Edges matching ``[const.has_label, edge, None]`` are looked up; if any
     exist, the third element of one of them is used as the label, provided
     it is not itself an edge. In every other case the string form of
     ``edge`` (via ``sym.symbol2str``) is returned.
     """
     source = edge
     matches = self.pattern2edges([const.has_label, edge, None])
     if matches:
         candidate = matches.pop()[2]
         if not sym.is_edge(candidate):
             source = candidate
     return sym.symbol2str(source)
示例#6
0
    def add_edge(self, edge_ns):
        """Register ``edge_ns`` (and, recursively, its elements).

        The namespace-free string form of the edge becomes a vertex, its
        depth is stored in ``self.atoms`` and the original namespaced form
        is remembered in ``self.edge_map``. For an edge, every element with
        a non-empty string form is added recursively; a link from the edge
        to the element is created only if that recursive add succeeded and
        the edge passes ``is_concept``.

        Returns False when a non-edge is rejected because its word
        probability exceeds ``MAX_PROB``, True otherwise.
        """
        compound = sym.is_edge(edge_ns)
        bare = ed.without_namespaces(edge_ns)

        # Common words are discarded.
        if not compound and self.parser.make_word(bare).prob > MAX_PROB:
            return False

        key = edge2str(bare)

        # Track every namespaced form that maps onto this string form.
        self.edge_map.setdefault(key, set()).add(edge_ns)

        linkable = is_concept(bare)

        self.vertices.add(key)
        self.atoms[key] = ed.depth(bare)

        if compound:
            for child in edge_ns:
                child_key = edge2str(child)
                # The recursive add must run before ``linkable`` is
                # consulted: children are registered even when no link is
                # created.
                if child_key and self.add_edge(child) and linkable:
                    self.add_link(key, child_key)
        return True
示例#7
0
def enrich_edge(parser, edge):
    """Annotate an edge or symbol with word-probability statistics.

    Args:
        parser: object providing ``make_word(token)``; each returned word
            must expose a ``prob`` attribute, which is passed through
            ``math.exp`` here.
        edge: an edge (as judged by ``sym.is_edge``) or a single symbol.

    Returns:
        A dict. For a symbol the keys are ``'symbol'``, ``'words'``,
        ``'prob'`` (product of the per-word values), ``'word_count'`` and
        ``'mean_prob'``. For an edge the keys are ``'edge'``, ``'eedge'``
        (the enriched elements), ``'words'``, ``'prob'``, ``'word_count'``
        and ``'mean_prob'``, aggregated over the elements. ``'mean_prob'``
        is ``1.`` whenever no words were found, for symbols and edges alike.
    """
    if sym.is_edge(edge):
        eedge = [enrich_edge(parser, item) for item in edge]
        prob = 1.
        total_prob = 0.
        word_count = 0
        words = []
        for item in eedge:
            word_count += item['word_count']
            prob *= item['prob']
            # NOTE(review): this weights each element's joint 'prob', not
            # its 'mean_prob' -- confirm that is the intended average.
            total_prob += item['prob'] * item['word_count']
            words += item['words']
        # An edge made only of word-less symbols (e.g. bare '+') has
        # word_count == 0; mirror the symbol branch instead of dividing
        # by zero.
        if word_count > 0:
            mean_prob = total_prob / word_count
        else:
            mean_prob = 1.
        return {'edge': edge, 'eedge': eedge, 'words': words, 'prob': prob, 'word_count': word_count,
                'mean_prob': mean_prob}

    ngram = sym.symbol2str(edge)
    # Strip one leading '+' from every token, then drop tokens that are
    # (or have become) empty.
    stripped = (token[1:] if token.startswith('+') else token
                for token in ngram.split(' '))
    tokens = [token for token in stripped if token]
    words = [parser.make_word(token) for token in tokens]
    prob = 1.
    total_prob = 0.
    for word in words:
        p = math.exp(word.prob)
        prob *= p
        total_prob += p
    word_count = len(words)
    if word_count > 0:
        mean_prob = total_prob / word_count
    else:
        mean_prob = 1.
    return {'symbol': edge, 'words': words, 'prob': prob, 'word_count': word_count, 'mean_prob': mean_prob}
示例#8
0
 def infer_from_edge(self, edge, arity):
     """Record claims and conflicts that can be read off ``edge``.

     Nothing happens unless ``self.pred_table`` enables the predicate
     (``edge[0]``) for this ``arity`` and the main synonym of ``edge[1]``
     is an actor. The remaining elements are split into actor targets and
     concepts. If the predicate is flagged ``'claim'`` and the third
     element is an edge, a claim is counted and a mention is added per
     concept. If it is flagged ``'conflict'``, a conflict is counted and
     added for each actor target, plus a conflict-over entry per concept.
     """
     pred = edge[0]
     if not self.pred_table[pred][str(arity)]:
         return
     source = syn.main_synonym(self.hg, edge[1])
     if not self.is_actor(source):
         return

     targets = set()
     topics = set()
     for entity in edge[2:]:
         canonical = syn.main_synonym(self.hg, entity)
         if self.is_actor(canonical):
             targets.add(canonical)
         else:
             # Concepts are extracted from the original entity, not from
             # its main synonym.
             topics |= self.get_concepts(entity)

     if self.pred_table[pred]['claim'] and len(edge) > 2 and sym.is_edge(edge[2]):
         self.claims += 1
         for topic in topics:
             self.add_mention(source, topic, edge)

     if self.pred_table[pred]['conflict']:
         for target in targets:
             self.conflicts += 1
             self.add_conflict(source, target)
             for topic in topics:
                 self.add_conflict_over(source, target, topic, edge)
示例#9
0
def is_concept(edge):
    """Tell whether ``edge`` is a concept.

    Every non-edge counts as a concept. An edge is a concept when all of
    its elements after the first are concepts and its first element equals
    ``'+'``.
    """
    if not sym.is_edge(edge):
        return True
    # The tail is checked first, so a non-concept element short-circuits
    # before the head is even looked at.
    if not all(is_concept(child) for child in edge[1:]):
        return False
    return edge[0] == '+'
示例#10
0
def is_candidate(edge):
    """Tell whether ``edge`` is an acceptable candidate.

    Non-edges and edges with fewer than two elements are always accepted.
    Any other edge is rejected when its second element is one of a fixed
    set of function words (possessive marker, prepositions, etc.).
    """
    if not sym.is_edge(edge) or len(edge) <= 1:
        return True
    # Discard possessives and similar connecting words.
    rejected = {
        "'s", 'in', 'of', 'with', 'and', 'a', 'on', 'for', 'to', 'from'
    }
    return edge[1] not in rejected
示例#11
0
 def post_assignments(self, edge):
     """Attach noun/proper-noun symbols to already known ``edge_map`` entries.

     Edges are walked recursively. For each symbol, its string form (via
     ``self.edge2str``) is looked up in ``self.edge_map``; if present and
     the symbol ends in ``'noun'`` or ``'propn'``, the symbol is added to
     that entry.
     """
     if sym.is_edge(edge):
         for child in edge:
             self.post_assignments(child)
         return

     key = self.edge2str(edge)
     if key not in self.edge_map:
         return
     nominal = edge[-4:] == 'noun' or edge[-5:] == 'propn'
     if nominal:
         self.edge_map[key].add(edge)
示例#12
0
    def add_edges(self, edge):
        """Count occurrences of ``edge`` and, recursively, of its elements.

        Counts are kept in ``self.edge_counts``, keyed by the namespace-free
        string form. A symbol has one leading ``'+'`` stripped first and is
        skipped when the result is empty, does not start with an
        alphanumeric character, or has a word probability above
        ``MAX_PROB``. Edges themselves are always counted.
        """
        is_edge = sym.is_edge(edge)
        if is_edge:
            for item in edge:
                self.add_edges(item)

        edge_str = ed.edge2str(edge, namespaces=False)
        if not is_edge:
            # startswith() rather than edge_str[0]: an empty string must
            # reach the emptiness check below instead of raising IndexError.
            if edge_str.startswith('+'):
                edge_str = edge_str[1:]
            if not edge_str:
                return
            if not edge_str[0].isalnum():
                return
            if self.parser.make_word(edge_str).prob > MAX_PROB:
                return
        self.edge_counts[edge_str] = self.edge_counts.get(edge_str, 0) + 1
示例#13
0
 def recover_words(self, edge):
     """Add noun/proper-noun symbols to matching ``edge_map`` entries.

     Descends through edges recursively. A symbol whose string form (via
     ``edge2str``) is already a key of ``self.edge_map`` is added to that
     entry when the symbol ends in ``'noun'`` or ``'propn'``.
     """
     if not sym.is_edge(edge):
         key = edge2str(edge)
         known = key in self.edge_map
         if known and (edge[-4:] == 'noun' or edge[-5:] == 'propn'):
             self.edge_map[key].add(edge)
         return

     for child in edge:
         self.recover_words(child)
示例#14
0
def contains(edge, concept, deep=False):
    """Tell whether ``concept`` occurs in ``edge``.

    A non-edge is simply compared to ``concept``. For an edge, its direct
    elements are compared; with ``deep`` set, each element is also searched
    recursively.
    """
    if not sym.is_edge(edge):
        return edge == concept
    return any(
        item == concept or (deep and contains(item, concept, True))
        for item in edge
    )
示例#15
0
def main_synonym(hg, edge, in_adp=False):
    """Return the main synonym of an edge or symbol.

    The main synonym is usually a special kind of symbol that every member
    of a synonym set points to, serving as the identifier of that set. It
    is found by asking ``hg`` for edges matching
    ``[cons.are_synonyms, edge, None]`` and taking the third element of one
    of them.

    With ``in_adp`` set, adpositional phrases are looked into: for a
    three-element edge headed by ``'+/gb'`` where one of the other two
    elements is a symbol ending in ``'.adp'``, the main synonym of the
    remaining element is returned. For example, for
    (+/gb with/nlp.with.adp india/nlp.india.propn) the result is the main
    synonym of india/nlp.india.propn. No concept check is applied to that
    remaining element.

    When no main synonym exists, ``edge`` itself is returned.
    """
    if in_adp and sym.is_edge(edge) and len(edge) == 3 and edge[0] == '+/gb':
        first, second = edge[1], edge[2]
        if not sym.is_edge(first) and first[-4:] == '.adp':
            return main_synonym(hg, second)
        if not sym.is_edge(second) and second[-4:] == '.adp':
            return main_synonym(hg, first)

    matches = hg.pattern2edges([cons.are_synonyms, edge, None])
    if matches:
        return matches.pop()[2]
    return edge
示例#16
0
 def add_claim(self, edge):
     """Add ``edge`` and, recursively, its elements to the graph.

     The string form of ``edge`` (via ``self.edge2str``) becomes a vertex
     with its depth stored in ``self.atoms``; nothing is done when that
     string form is empty or None. For an edge, every element with a usable
     string form is registered the same way and linked from the edge, and
     each element is then processed recursively.
     """
     source = self.edge2str(edge)
     if not source:
         return
     self.vertices.add(source)
     self.atoms[source] = ed.depth(edge)
     if not sym.is_edge(edge):
         return

     for part in edge:
         target = self.edge2str(part)
         if target:
             self.vertices.add(target)
             self.atoms[target] = ed.depth(part)
             self.add_link(source, target)
         # Recurse even when the element has no usable string form.
         self.add_claim(part)
示例#17
0
 def synonym_ids_in(self, edge):
     """Collect the synonym ids found in ``edge`` and all of its elements.

     The id of the edge itself (``self.syn_id`` applied to its string
     form) is included when truthy, and for an edge the same is done for
     every element, recursively. Returns a set.
     """
     found = set()
     own_id = self.syn_id(self.edge2str(edge))
     if own_id:
         found.add(own_id)
     if sym.is_edge(edge):
         for part in edge:
             # NOTE(review): the recursive call below looks up this same
             # id again; the direct lookup appears redundant.
             part_id = self.syn_id(self.edge2str(part))
             if part_id:
                 found.add(part_id)
             found |= self.synonym_ids_in(part)
     return found
示例#18
0
    def lemmatize(self, edge):
        """Return the lemma of ``edge``, caching results in ``self.lemmas``.

        An edge is lemmatized element by element and returned as a tuple.
        For a symbol, edges matching ``(const.have_same_lemma, edge, None)``
        are looked up in ``self.hg``; the third element of one of them is
        the lemma, and the symbol is its own lemma when none exist.
        """
        if edge in self.lemmas:
            return self.lemmas[edge]

        if sym.is_edge(edge):
            result = tuple(self.lemmatize(part) for part in edge)
        else:
            matches = self.hg.pattern2edges((const.have_same_lemma, edge, None))
            result = matches.pop()[2] if matches else edge

        self.lemmas[edge] = result
        return result
示例#19
0
    def find_co_synonyms(self, edge):
        """Collect the synonym-map entries for ``edge`` and its elements.

        Elements of an edge are processed recursively. The namespace-free
        string form of ``edge`` is then looked up in ``self.atom_set``; when
        present, ``self.synonym_map`` for that string is added to the
        result. Returns a set.
        """
        co_syns = set()
        if sym.is_edge(edge):
            for item in edge:
                co_syns |= self.find_co_synonyms(item)

        edge_str = ed.edge2str(edge, namespaces=False)

        # A direct membership test replaces the former scan over every atom
        # looking for one equal to edge_str; the outcome is the same.
        if edge_str in self.atom_set:
            co_syns.add(self.synonym_map[edge_str])

        return co_syns
示例#20
0
 def get_concepts(self, edge):
     """Return the set of concepts (as main synonyms) found in ``edge``.

     For an edge, the result holds the main synonym of the edge itself plus
     the concepts of every element after the first, recursively. A symbol
     contributes its main synonym unless its word probability exceeds
     ``MAX_PROB`` or it starts with a backtick, underscore or apostrophe,
     in which case the result is empty.
     """
     if sym.is_edge(edge):
         found = {syn.main_synonym(self.hg, edge)}
         for item in edge[1:]:
             found |= self.get_concepts(item)
         return found

     word = self.parser.make_word(unidecode(ed.without_namespaces(edge)))
     if word.prob > MAX_PROB or edge[0] in {'`', '_', "'"}:
         return set()
     return {syn.main_synonym(self.hg, edge)}
示例#21
0
    def edge2str(self, edge):
        """Return the namespace-free string form of ``edge``, or None.

        Edges are returned as produced by ``ed.edge2str``. A symbol has one
        leading ``'+'`` stripped; the result is returned only if it is
        non-empty and its word probability is below ``MAX_PROB``, otherwise
        None is returned.
        """
        s = ed.edge2str(edge, namespaces=False)
        if sym.is_edge(edge):
            return s

        # startswith() rather than s[0]: an empty string must fall through
        # to the emptiness check below instead of raising IndexError.
        if s.startswith('+'):
            s = s[1:]

        if not s:
            return None

        word = self.parser.make_word(s)
        if word.prob < MAX_PROB:
            return s

        return None
示例#22
0
def valid_symbol(s):
    """Tell whether ``s`` is acceptable.

    Edges are always accepted. A symbol is rejected when it is a root, when
    its namespace is ``'gb'``, when it starts with ``'+'``, when its
    namespace does not start with ``'nlp'``, or when its namespace ends in
    ``'adp'``, ``'det'``, ``'verb'`` or ``'pron'``.
    """
    if sym.is_edge(s):
        return True
    if sym.is_root(s):
        return False

    namespace = sym.nspace(s)
    if namespace == 'gb' or s[0] == '+':
        return False
    if namespace[:3] != 'nlp':
        return False
    if namespace[-3:] in ('adp', 'det'):
        return False
    return namespace[-4:] not in ('verb', 'pron')
示例#23
0
    def add_claim(self, edge):
        """Add ``edge`` and, recursively, its elements to the graph.

        ``self.edge2syn`` maps the edge to a vertex; nothing is done when it
        returns a falsy value. For an edge, every element that maps to a
        vertex is linked with the edge, each element is processed
        recursively, and finally every pair of mapped elements is linked
        together.
        """
        whole = self.edge2syn(edge)
        if not whole:
            return
        self.vertices.add(whole)
        if not sym.is_edge(edge):
            return

        # Links between the whole and each of its parts.
        parts = []
        for element in edge:
            part = self.edge2syn(element)
            if part:
                parts.append(part)
                self.vertices.add(part)
                self.add_link(whole, part)
            self.add_claim(element)

        # Links between sibling parts.
        for left, right in itertools.combinations(parts, 2):
            self.add_link(left, right)
示例#24
0
def json_str(hg, symbol):
    """Assemble the conflict data for ``symbol`` as a plain dict.

    Despite the name, the return value is a dict, not a string. It holds:
    ``'entity'`` (the symbol), ``'labels'`` (label per symbol/topic seen),
    ``'conflict'`` (one entry per opposing actor with a tuple of topics)
    and ``'conflict_graph'`` (``'nodes'`` and ``'links'`` among the
    opposing actors, where a node's ``'r'`` starts at 3 and grows by one
    for every link it takes part in).

    Conflicts are read from ``hg`` edges headed by ``'conflict/gb.inf'``
    with ``symbol`` in either the second or the third position; opposing
    parties that are themselves edges are ignored.
    """
    labels = {symbol: hg.get_label(symbol)}
    actors = set()
    conflict_map = {}

    # Each query pairs a pattern with the position of the opposing party
    # inside a matching edge.
    queries = (
        (('conflict/gb.inf', symbol, None, None), 2),
        (('conflict/gb.inf', None, symbol, None), 1),
    )
    for pattern, other_pos in queries:
        for edge in hg.pattern2edges(pattern):
            other = edge[other_pos]
            if sym.is_edge(other):
                continue
            actors.add(other)
            labels[other] = hg.get_label(other)
            entry = conflict_map.setdefault(other, {'topics': set()})
            topic = edge[3]
            labels[topic] = hg.get_label(topic)
            entry['topics'].add(topic)

    conflict = [
        {'target': other, 'topics': tuple(info['topics'])}
        for other, info in conflict_map.items()
    ]

    actor_id = {}
    nodes = []
    for index, actor in enumerate(actors):
        actor_id[actor] = index
        nodes.append({'label': labels[actor], 'r': 3})

    links = []
    for actor in actors:
        source_id = actor_id[actor]
        for target in conflict_targets(hg, actors, actor):
            target_id = actor_id[target]
            links.append({'source': source_id, 'target': target_id})
            nodes[source_id]['r'] += 1
            nodes[target_id]['r'] += 1

    return {
        'entity': symbol,
        'labels': labels,
        'conflict': conflict,
        'conflict_graph': {
            'nodes': nodes,
            'links': links
        }
    }
示例#25
0
def generate(hg):
    """Build synonym sets from the beliefs in ``hg`` and write them back.

    Steps, each announced on stdout:
      1. Feed every belief edge of ``hg`` to a ``Meronomy``.
      2. Run the meronomy's post-assignment pass over the same beliefs.
      3. Generate and normalize the meronomy graph, then generate synonyms.
      4. For every synonym set, pick the member edge with the highest
         count in ``mer.edge_counts``, use its label to build a synonym
         symbol, and add to ``hg`` one ``cons.are_synonyms`` edge per
         member plus one ``cons.has_label`` edge for the synonym symbol.
    """
    print('starting parser...')
    parser = par.Parser()

    mer = Meronomy(hg, parser)

    print('reading edges...')
    edge_total = 0
    belief_total = 0

    vertex_total = hg.symbol_count() + hg.edge_count()
    with progressbar.ProgressBar(max_value=vertex_total) as bar:
        for seen, vertex in enumerate(hg.all(), start=1):
            if sym.is_edge(vertex):
                edge_total += 1
                if hg.is_belief(vertex):
                    mer.add_edge(vertex)
                    belief_total += 1
            # Refresh the bar only every 1000 vertices.
            if seen % 1000 == 0:
                bar.update(seen)

    print('edges: %s; beliefs: %s' % (edge_total, belief_total))

    print('post assignments...')
    with progressbar.ProgressBar(max_value=vertex_total) as bar:
        for seen, vertex in enumerate(hg.all(), start=1):
            if sym.is_edge(vertex) and hg.is_belief(vertex):
                mer.post_assignments(vertex)
            if seen % 1000 == 0:
                bar.update(seen)

    print('generating meronomy graph...')
    mer.generate()

    print('normalizing meronomy graph...')
    mer.normalize_graph()

    print('generating synonyms...')
    mer.generate_synonyms()

    print('writing synonyms...')
    written = 0
    with progressbar.ProgressBar(max_value=len(mer.synonym_sets)) as bar:
        for written, syn_id in enumerate(mer.synonym_sets, start=1):
            # All namespaced edges behind the atoms of this synonym set.
            members = set()
            for atom in mer.synonym_sets[syn_id]:
                if atom in mer.edge_map:
                    members |= mer.edge_map[atom]

            # The most frequent member provides the label; stays None when
            # the set has no members.
            top_count = -1
            top_edge = None
            for candidate in members:
                count = mer.edge_counts[candidate]
                if count > top_count:
                    top_count = count
                    top_edge = candidate

            label = hg.get_label(top_edge)
            syn_symbol = sym.build(label, 'syn%s' % syn_id)
            for member in members:
                hg.add((cons.are_synonyms, member, syn_symbol))
            label_symbol = sym.build(label, cons.label_namespace)
            hg.add((cons.has_label, syn_symbol, label_symbol))

            if written % 1000 == 0:
                bar.update(written)
        # Final refresh so the bar reflects the last partial batch.
        bar.update(written)

    print('%s synonym sets created' % len(mer.synonym_sets))
    print('done.')
示例#26
0
def is_concept(edge):
    """Tell whether ``edge`` is a concept.

    The first element is taken as the relationship. The edge is a concept
    when that relationship is not itself an edge and starts with ``'+'``.
    """
    head = edge[0]
    return not sym.is_edge(head) and head[0] == '+'