def get_label(self, edge):
    """Return a human-readable label for *edge*.

    Searches for a (has_label, edge, label) relation in the hypergraph.
    If a matching relation exists and its label is a plain symbol (not an
    edge), that symbol's string form is returned; otherwise the string
    form of *edge* itself is used as a fallback.
    """
    label_edges = self.pattern2edges([const.has_label, edge, None])
    if label_edges:
        candidate = label_edges.pop()[2]
        if not sym.is_edge(candidate):
            return sym.symbol2str(candidate)
    return sym.symbol2str(edge)
def down(hg, symbol, visited=None):
    """Recursively build a tree of synonyms and derived concepts rooted at *symbol*.

    Args:
        hg: hypergraph instance providing ``star``.
        symbol: the symbol (or edge) to expand.
        visited: set of already-expanded symbol strings, shared across the
            recursion to prevent infinite cycles. ``None`` means start fresh.

    Returns:
        A dict with keys 'symbol', 'synonyms' and 'derived_symbols', or
        ``None`` if *symbol* was already visited.
    """
    # BUG FIX: was `if not visited:` — a caller-supplied *empty* set was
    # discarded and replaced, so cycle-tracking did not propagate back to
    # the caller. Only a missing argument should create a new set.
    if visited is None:
        visited = set()
    key = sym.symbol2str(symbol)
    if key in visited:
        return None
    visited.add(key)
    synonyms = list(syn.synonyms(hg, symbol))
    # keep only star edges that represent concepts
    edges = [edge for edge in hg.star(symbol) if is_concept(edge)]
    return {'symbol': symbol,
            'synonyms': [down(hg, synonym, visited) for synonym in synonyms],
            'derived_symbols': [down(hg, edge, visited) for edge in edges]}
def best_sense(self, roots, aux_text, namespaces=None):
    """Pick the candidate symbol (among all symbols with the given roots)
    whose surrounding words best match *aux_text*.

    Args:
        roots: iterable of root symbols to expand into candidate symbols.
        aux_text: free text providing disambiguation context.
        namespaces: optional namespace filter passed to ``check_namespace``.

    Returns:
        (best_candidate_or_None, CandidateMetrics) tuple.

    Side effects: updates profiling counters ``candidates``, ``words1``,
    ``words2`` and accumulates elapsed time in ``best_sense_t``.
    """
    start = time.time()

    # reset profiling counters
    self.candidates = 0
    self.words1 = 0
    self.words2 = 0

    candidates = set()
    exclude = set()
    for root in roots:
        candidates = candidates.union(self.hg.symbols_with_root(root))
        # exclude the root's own tokens from similarity scoring
        text = sym.symbol2str(root)
        for token in text.split():
            exclude.add(token)
    self.candidates = len(candidates)

    words1 = self.words_from_text(aux_text)
    self.words1 = len(words1)

    best = None
    best_cm = CandidateMetrics()
    for candidate in candidates:
        if check_namespace(candidate, namespaces):
            words2 = self.words_around_symbol(candidate)
            # BUG FIX: previously accumulated len(words1) here, so the
            # words2 profiling counter never reflected the candidate's
            # own word set.
            self.words2 += len(words2)
            cm = CandidateMetrics()
            cm.score = self.words_similarity(words1, words2, exclude)
            cm.degree = ksyn.degree(self.hg, candidate)
            logging.info('%s %s' % (candidate, cm))
            if cm.better_than(best_cm):
                best_cm = cm
                best = candidate

    self.best_sense_t += time.time() - start
    return best, best_cm
def enrich_edge(parser, edge):
    """Annotate an edge (recursively) or a symbol with word statistics.

    For an edge, recurses into each item and aggregates word counts and
    probabilities. For a symbol, tokenizes its string form (stripping
    leading '+' markers), looks each token up via ``parser.make_word``
    and computes per-word probabilities.

    Returns a dict with 'words', 'prob' (product of word probabilities),
    'word_count' and 'mean_prob'; edges additionally carry 'edge'/'eedge',
    symbols carry 'symbol'.
    """
    if sym.is_edge(edge):
        eedge = [enrich_edge(parser, item) for item in edge]
        prob = 1.
        total_prob = 0.
        word_count = 0
        words = []
        for item in eedge:
            word_count += item['word_count']
            prob *= item['prob']
            # weight each sub-item's probability by its word count
            total_prob += item['prob'] * item['word_count']
            words += item['words']
        # BUG FIX: original divided unconditionally, raising
        # ZeroDivisionError when every sub-item had zero words.
        # Mirrors the symbol branch's guard (defaults to 1.).
        if word_count > 0:
            mean_prob = total_prob / word_count
        else:
            mean_prob = 1.
        return {'edge': edge, 'eedge': eedge, 'words': words, 'prob': prob,
                'word_count': word_count, 'mean_prob': mean_prob}

    ngram = sym.symbol2str(edge)
    tokens = [token for token in ngram.split(' ') if len(token) > 0]
    # strip a single leading '+' marker, then drop tokens emptied by it
    tokens = [token[1:] if token.startswith('+') else token
              for token in tokens]
    tokens = [token for token in tokens if len(token) > 0]
    words = [parser.make_word(token) for token in tokens]
    prob = 1.
    total_prob = 0.
    for word in words:
        p = math.exp(word.prob)  # word.prob is a log-probability
        prob *= p
        total_prob += p
    word_count = len(words)
    if word_count > 0:
        mean_prob = total_prob / word_count
    else:
        mean_prob = 1.
    return {'symbol': edge, 'words': words, 'prob': prob,
            'word_count': word_count, 'mean_prob': mean_prob}
def derived_symbols(hg, ont, symbols=None, depth=0):
    """Flatten an ontology tree into {symbol-string: {'degree', 'depth'}}.

    Args:
        hg: hypergraph instance, passed through to ``syn.degree``.
        ont: ontology node dict with 'symbol' and 'derived_symbols' keys
            (the structure produced by ``down``).
        symbols: accumulator dict, shared across the recursion. ``None``
            means start fresh.
        depth: current tree depth, recorded per symbol.

    Returns:
        The accumulator dict.
    """
    # BUG FIX: was `if not symbols:` — a caller-supplied *empty* dict was
    # discarded and replaced, so results never reached the caller's dict.
    if symbols is None:
        symbols = {}
    symbol = ont['symbol']
    degree = syn.degree(hg, symbol)
    symbols[sym.symbol2str(symbol)] = {'degree': degree, 'depth': depth}
    for subont in ont['derived_symbols']:
        derived_symbols(hg, subont, symbols, depth + 1)
    return symbols
def symbol_html(symbol, rel):
    """Render *symbol* as an HTML fragment linking to its vertex page.

    When *rel* is truthy the symbol is shown as a relation header with an
    arrow; otherwise as a colored button, cycling through SYMBOL_CLASSES.
    """
    label = sym.symbol2str(symbol)
    if rel:
        template = ('<div class="rel"><a href="/vertex?id=%s">%s</a></div>'
                    '<div class="arrow"></div>')
        return template % (symbol, label)
    extra_class = SYMBOL_CLASSES[symbol_to_int(symbol) % 5]
    template = ('<button type="button" class="btn %s symbol">'
                '<a class="symbol" href="/vertex?id=%s">%s</a></button>')
    return template % (extra_class, symbol, label)
def html(hg, eid):
    """Build the HTML page body for the vertex identified by *eid*.

    Edges get a rendered edge header via ``edge_html``; plain symbols get
    an <h1> with their string form. The star edges of the vertex are
    appended via ``edges_html``.
    """
    vertex = ed.str2edge(eid)
    if sym.sym_type(vertex) == sym.SymbolType.EDGE:
        title = edge_html(hg, vertex)
    else:
        title = '<h1>%s</h1>' % sym.symbol2str(eid)
    page = """
    <div class="container" role="main">
        <div class="page-header">
            %s
            <h4>%s</h4>
        </div>
        %s
    </div>
    """
    return page % (title, eid, edges_html(hg, vertex))
def words_around_symbol(self, symbol):
    """Collect word objects found near *symbol* in the hypergraph.

    Walks the star edges of *symbol* (up to STAR_LIMIT), extracts every
    symbol from every entity, tokenizes its string form and keeps words
    that are rare enough (prob < MAX_PROB) and have a non-zero vector.

    Returns a set of word objects. Side effect: accumulates elapsed time
    in ``words_around_symbol_t``.
    """
    start = time.time()
    edges = self.hg.star(symbol, limit=STAR_LIMIT)
    words = set()
    for edge in edges:
        for entity in edge:
            # FIX: the inner loop previously reused the name 'symbol',
            # shadowing the parameter. Harmless only because the parameter
            # is not read after the loop; renamed to prevent future bugs.
            for inner_symbol in ed.symbols(entity):
                term = sym.symbol2str(inner_symbol)
                for token in term.split():
                    word = self.parser.make_word(token)
                    if word.prob < MAX_PROB and np.count_nonzero(
                            word.vector) > 0:
                        words.add(word)
    self.words_around_symbol_t += time.time() - start
    return words