import numpy as np
from pygtrie import CharTrie  # prefix trie used for ORF/tile name lookups


def tile_stats(orfs, tiles):
    """Compute tile stats.

    orfs and tiles are name->seq dicts.

    NOTE: for prefix trie stats (e.g., num of tiles per ORF), it is assumed
    the ORF name is a prefix of the name of each tile derived from that ORF.
    """
    tile_lens = np.asarray([len(t) for t in tiles.values()])
    orf_lens = np.asarray([len(o) for o in orfs.values()])
    tile_size = int(round(np.median(tile_lens)))

    # compute tile counts for each orf
    orf_prefixes = CharTrie()
    for name in orfs:
        orf_prefixes[name] = True
    # ensure that no ORF name is a prefix of a different valid ORF
    for name in orfs:
        if len(orf_prefixes.keys(name)) != 1:
            print(orf_prefixes.keys(name))
            raise ValueError("some ORF name is a prefix of a different valid ORF")
    tile_prefixes = CharTrie()
    for name in tiles:
        tile_prefixes[name] = True

    # compute orf coverages:
    # what is the tile coverage of each ORF (total tile residues / orf residues)?
    # tiles are assigned to ORFs if the ORF name is a prefix of the tile name
    orf_coverages = {}
    for (orf, seq) in orfs.items():
        orf_residues = len(seq)
        tile_residues = 0.0
        if tile_prefixes.has_subtrie(orf) or (orf in tile_prefixes):
            for tile in tile_prefixes.keys(orf):
                tile_residues += len(tiles[tile])
        orf_coverages[orf] = tile_residues / orf_residues

    stats = {}
    stats["tile_size"] = tile_size
    stats["num_tiles"] = len(tiles)
    stats["total_tile_residues"] = tile_lens.sum().tolist()
    stats["num_orfs_smaller_than_tile_size"] = (orf_lens < tile_size).sum().tolist()
    stats["approx_num_tiles_naive_1x_tiling"] = np.ceil(orf_lens / tile_size).sum().tolist()
    # unweighted mean of per-ORF coverage (a residue-weighted alternative
    # would be tile_lens.sum() / orf_lens.sum())
    stats["avg_orf_coverage"] = sum(orf_coverages.values()) / len(orf_coverages)
    stats["max_tiles_per_len_normed_orf"] = max(orf_coverages.values())
    # compute_int_hist / compute_float_hist are histogram helpers defined
    # elsewhere in this module
    stats["tile_len_hist"] = compute_int_hist(tile_lens)
    stats["orf_coverage_hist"] = compute_float_hist(list(orf_coverages.values()))
    stats["top_5_orf_cov"] = list(
        map(list,
            sorted(orf_coverages.items(), key=lambda tup: tup[1], reverse=True)[:5]))
    stats["bot_5_orf_cov"] = list(
        map(list, sorted(orf_coverages.items(), key=lambda tup: tup[1])[:5]))
    return stats
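# Usage sketch (illustrative, not part of the original module): toy ORFs and
# tiles that follow the assumed naming convention, i.e. each tile name starts
# with the name of the ORF it was derived from.  It is left commented out
# because compute_int_hist and compute_float_hist must be importable from
# this module for the call to succeed.
#
# example_orfs = {
#     "orfA": "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ",
#     "orfB": "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEG",
# }
# example_tiles = {
#     "orfA|tile0": "MKTAYIAKQRQISFVK",
#     "orfA|tile1": "QRQISFVKSHFSRQLE",
#     "orfB|tile0": "MSKGEELFTGVVPILV",
# }
# stats = tile_stats(example_orfs, example_tiles)
# print(stats["tile_size"], stats["avg_orf_coverage"])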
print('route computed:', route[:-3])

''' Command processing '''
fragments = action.split()
command_name = fragments[0]

# Map command names to handler functions; the CharTrie lets the player type
# any unambiguous prefix of a command name.
command_trie = CharTrie()
command_trie['?'] = question_command
command_trie['back'] = back_command          # Can change location
command_trie['bearings'] = bearings_command
command_trie['history'] = history_command
command_trie['moveto'] = move_command        # Can change location
command_trie['route'] = route_command
command_trie['rope'] = rope_command
command_trie['shop'] = shop_command

if command_trie.has_subtrie(command_name) or command_trie.has_key(command_name):
    # Call the first handler whose name starts with what the player typed
    list(command_trie[command_name:])[0]()
elif command_name == 'q':
    continue_loop = False
elif command_name == '':
    print('You think now might be the time for action.')
else:
    print("You're not really sure what that means.")
print('')

print('Quitting...')
''' End runstrip '''
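# A minimal, self-contained sketch of the same prefix-dispatch idea: map
# command names to handlers in a CharTrie and invoke the first handler whose
# name starts with whatever was typed.  The handlers below are stand-ins for
# illustration, not the game's real command functions.
from pygtrie import CharTrie

def _demo_help():
    print('help: type any unambiguous prefix of a command')

def _demo_quit():
    print('bye')

_demo_commands = CharTrie()
_demo_commands['help'] = _demo_help
_demo_commands['quit'] = _demo_quit

def _demo_dispatch(name):
    # trie[name:] yields the handlers of every command starting with `name`;
    # taking the first one mirrors the dispatch in the loop above.
    if _demo_commands.has_subtrie(name) or name in _demo_commands:
        list(_demo_commands[name:])[0]()
    else:
        print('unknown command: ' + name)

# _demo_dispatch('he')  # -> help: type any unambiguous prefix of a command
# _demo_dispatch('q')   # -> bye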
import logging
import os
import pickle
from collections import defaultdict
from itertools import chain, permutations
from math import floor
from multiprocessing import Pool

import networkx as nx
from cytoolz import first, groupby, second
from fuzzywuzzy import fuzz
from pygtrie import CharTrie
from rdflib import Literal

# NOTE: the imports above are reconstructed for this snippet. Project-level
# names used below (gall, dbo, NERTypeResolver, ENT_CLASSES, ENT_MAPPING,
# TrieEntry, get_label, get_fellow_redirects, get_fellow_disambiguations,
# raw, raw_d) are assumed to come from the surrounding project.

log = logging.getLogger(__name__)


class NERLinker:
    """
    Disambiguates named entities and stores new unknown ones if they satisfy
    type restrictions.
    """

    def __init__(self, outer_graph=gall, ner_type_resolver=NERTypeResolver(),
                 metric_threshold=0.8, strict_type_match=True):
        self.ntr = ner_type_resolver

        # Init storage
        self._trie = CharTrie()

        self._metric_threshold = metric_threshold
        self._strict_type_match = strict_type_match
        self._allowed_types = ENT_CLASSES

        self.predicate_namespace = dbo  # todo: move to constructor args
        self.outer_graph = outer_graph

        self.cache = dict()

    def update(self, uri_sf_pairs):
        """
        :param uri_sf_pairs: List[Tuple[URIRef, Option[str]]]: tolerable to None surface forms
        :return:
        """
        uri2sf = groupby(first, uri_sf_pairs)  # group by the same uri
        uris = list(uri2sf.keys())

        with Pool() as pool:
            def mmap(f, it):
                # todo: pool.map doesn't work: pickle issues with decorators
                return list(map(f, it))

            ent_types = mmap(self.ntr.get_by_uri, uris)
            labels = mmap(get_label, uris)
            all_redirects = mmap(get_fellow_redirects, uris)  # lists of synonyms for each uri
            all_disambigs = mmap(get_fellow_disambiguations, uris)  # lists of homonyms for each uri

            for i, (ent_uri, ent_type, base_label, redirects, disambigs) in enumerate(
                    zip(uris, ent_types, labels, all_redirects, all_disambigs), 1):
                if ent_type:
                    entries = {TrieEntry(ent_uri, base_label, ent_type)}  # basic entry
                    entries.update(TrieEntry(ent_uri, sf, ent_type)
                                   for sfs in uri2sf[ent_uri]
                                   for sf in sfs)  # entries from provided surface forms

                    redirects_labels = mmap(get_label, redirects)
                    entries.update(TrieEntry(ent_uri, sf, ent_type) for sf in redirects_labels)

                    disambigs_labels = mmap(get_label, disambigs)
                    disambigs_types = mmap(self.ntr.get_by_uri, disambigs)
                    entries.update(TrieEntry(duri, dsf, dtype)
                                   for duri, dsf, dtype
                                   in zip(disambigs, disambigs_labels, disambigs_types))

                    entries = filter(all, entries)  # all fields of entry should evaluate to True
                    sfgroups = groupby(lambda entry: entry.sf.lower(), entries)

                    # build the 'index' of the trie
                    _new = _upd = 0
                    for sfkey, group in sfgroups.items():
                        if not self._trie.has_key(sfkey):
                            self._trie[sfkey] = set(group)
                            _new += 1
                        else:
                            self._trie[sfkey].update(group)
                            _upd += 1
                    log.info('NERLinker: ent #{}: added {:3d}, updated {:3d} sfgroups; "{}"'
                             .format(i, _new, _upd, str(ent_uri)))

    def _resolve_edges(self, target_nodes, source_nodes):
        ont_graph = self.outer_graph
        ns = self.predicate_namespace

        new_edges = set()
        targets = set(target_nodes)
        for s in source_nodes:
            for rel, obj in ont_graph.predicate_objects(subject=s):
                if obj in targets and rel.startswith(ns) and obj != s:
                    new_edges.add((s, obj))
        return new_edges

    def _resolve_nodes(self, uri):
        ont_graph = self.outer_graph
        ns = self.predicate_namespace

        objs = {obj for rel, obj in ont_graph.predicate_objects(subject=uri)
                if rel.startswith(ns) and not isinstance(obj, Literal)}
        subjs = {subj for subj, rel in ont_graph.subject_predicates(object=uri)
                 if rel.startswith(ns)}

        new_edges = {(uri, obj) for obj in objs}.union({(subj, uri) for subj in subjs})
        new_nodes = objs.union(subjs)
        return new_nodes, new_edges

    def get_path_graph(self, uris, depth=2):
        """
        Based on the paper: https://arxiv.org/pdf/1707.05288.pdf
        :param uris: uris to build graph for
        :param depth: depth of the paths to search
        :return:
        """
        edges = set()
        nodes = set(uris)
        log.info('linker: started building subgraph on {} nodes with depth {}'
                 .format(len(nodes), depth))
        mmap = map  # todo: make parallel queries
        for i in range(depth - 1):
            new_nodes = set()
            for uri_nodes, uri_edges in mmap(self._resolve_nodes, nodes):
                new_nodes.update(uri_nodes)
                edges.update(uri_edges)
            nodes = new_nodes
            log.info('linker: finished iter {}/{} with {} new nodes, {} edges'
                     .format(i + 1, depth, len(new_nodes), len(edges)))

        # Last step can be done easier
        edges.update(self._resolve_edges(uris, nodes))
        log.info('linker: finished building subgraph: {} edges'.format(len(edges)))

        graph = nx.DiGraph()
        graph.add_nodes_from(uris)  # need only original entities
        graph.add_edges_from(edges)
        subgraph = nx.transitive_closure(graph).subgraph(nbunch=uris)
        log.info('linker: ended extracting subgraph: {} edges'.format(len(subgraph.edges())))
        return subgraph

    def link(self, ents, depth=2):
        """
        :param ents:
        :return: Dict[spacy.token.Span, rdflib.URIRef]
        """
        answers = {ent: None for ent in ents if ent.label_ in self._allowed_types}

        # Get candidate sets for all ents
        all_candidates = [(cand.uri, ent) for ent in ents for cand in self.get_candidates(ent)]
        # Each candidate can resolve multiple entities
        candidates = defaultdict(list)
        for cand_uri, ent in all_candidates:
            candidates[cand_uri].append(ent)

        # Build subgraph for these candidates
        graph = self.get_path_graph(candidates, depth=depth)
        # Apply HITS or PageRank algorithm
        hubs, authorities = nx.hits(graph, max_iter=20)

        # Sort according to authority value
        authorities = sorted(authorities.items(), key=second, reverse=True)

        # todo: what to do with equally probable authorities? or with 'zero' authorities?
        #   maybe somehow preserve initial sort by get_candidates()? or returned weights (if any)
        for uri, auth_value in authorities:
            ents = candidates.get(uri, list())
            for ent in ents:
                if not answers[ent]:
                    answers[ent] = uri
        return answers

    def __call__(self, doc):
        answer_lists = {ent: self.get_candidates(ent) for ent in doc.ents}
        self.cache.update({ent: [str(entry.uri) for entry in answers]
                           for ent, answers in answer_lists.items()})
        return doc

    def get(self, span, default=None):
        # cache can return some value that evaluates to False
        return self.cache.get(span, None) or default

    # todo: return some kind of weight or probability with matches
    def get_candidates(self, span):
        """
        :param span: spacy.token.Span
        :return: List[TrieEntry]
        """
        _trie = self._trie
        text = span.text.lower()
        candidates_filtered = []
        if span.label_ in self._allowed_types:
            # Determine how it's better to search
            if span.label_ == 'PERSON':
                # If ner type is Person: try all permutations of tokens
                tokens = filter(bool, text.split(' '))
                lprefixes = [self._longest_prefix(' '.join(p)) for p in permutations(tokens)]
                lprefixes = filter(bool, lprefixes)
                lprefix = max(lprefixes, key=len, default=None)
            else:
                lprefix = self._longest_prefix(text)

            if lprefix is not None:
                # log.info('span: "{}"; found prefix: "{}"'.format(span, lprefix))
                candidate_sets = _trie.itervalues(prefix=lprefix)
                candidates = list(chain.from_iterable(candidate_sets))

                # todo: temporary, for keeping consistency with the old entity
                #   type schema saved in the trie
                tmap = ENT_MAPPING
                typed = groupby(lambda e: (tmap.get(e.ent_type) or e.ent_type) == span.label_,
                                candidates)
                # typed = groupby(lambda entry: entry.ent_type == span.label_, candidates)

                search_in = [True]
                if not self._strict_type_match:
                    search_in.append(False)
                # Search with the same type first
                for is_same_type in search_in:
                    typed_candidates = typed.get(is_same_type)
                    if typed_candidates:
                        candidates_filtered.extend(self._fuzzy_filter(span.text, typed_candidates))
        return candidates_filtered  # in the not-found case, just the empty list

    def _fuzzy_filter(self, text, candidates, metric=fuzz.ratio):
        """
        :param text: str
        :param candidates: List[TrieEntry]
        :param metric: (str, str) -> Numeric
        :return: List[TrieEntry]
        """
        # similar = groupby(lambda entry: metric(entry.sf, text), candidates)  # group by val of metric
        # Calculate a metric
        measured = [(metric(entry.sf, text), entry) for entry in candidates]
        # Group by the same uri
        similar = groupby(lambda entry: entry[1].uri, measured)  # uri: (m, entry)
        # In each group of same matches leave only the one with the highest match-metric
        similar = [max(sames, key=first) for sames in similar.values()]
        # Sort by the metric
        best_matches = sorted(similar, key=first, reverse=True)
        # Filter bad matches
        best_matches = [entry for m, entry in best_matches
                        if m >= self._metric_threshold * 100]
        # Some more checks on the best matches if there are several of them
        if len(best_matches) > 1:
            # best_matches = [max(best_matches, key=lambda entry: metric(raw_d(raw(entry.uri)), text))]
            best_matches = groupby(lambda entry: metric(raw_d(raw(entry.uri)), text), best_matches)
            best_matches = best_matches[max(best_matches)]
        return best_matches

    def _is_acronym(self, text, len_threshold=5):
        return len(text) < len_threshold and text.isupper()

    def _longest_prefix(self, text):
        l = len(text)
        left = max(1, floor(l * self._metric_threshold))
        for end in range(l, left - 1, -1):
            if self._trie.has_subtrie(text[:end]):
                return text[:end]

    def save(self, model_dir):
        model_name = type(self).__name__.lower() + '.pck'
        with open(os.path.join(model_dir, model_name), 'wb') as f:
            pickle.dump(self._trie, f)

    def load(self, model_dir):
        model_name = type(self).__name__.lower() + '.pck'
        with open(os.path.join(model_dir, model_name), 'rb') as f:
            self._trie = pickle.load(f)
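# Illustrative usage sketch (kept commented out because it needs network
# access and the project's type resolver / DBpedia graph at runtime; the
# spaCy model name, the model directory, and the URI below are placeholder
# assumptions, not part of this module):
#
#   import spacy
#   from rdflib import URIRef
#
#   nlp = spacy.load('en_core_web_sm')    # any spaCy model with an NER component
#   linker = NERLinker()                  # default graph and type resolver
#   linker.update([(URIRef('http://dbpedia.org/resource/Berlin'), 'Berlin')])
#
#   doc = linker(nlp('Berlin is the capital of Germany.'))  # caches candidates per entity
#   for ent in doc.ents:
#       print(ent.text, linker.get(ent))  # list of candidate URI strings, or None
#
#   linker.save('some_model_dir')         # pickles the surface-form trie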