def _classify_node(self, node):
    """Classifies the node, returns True if relevant, False otherwise.

    Runs the classifier on the single node and logs the verdict at debug
    level, prefixed with the node's depth.
    """
    # Dead code removed: a local `prefix` ("re" for revisited nodes) was
    # computed but only used by a commented-out debug line.
    is_relevant = self._classifier.predict(self, [node])[0]
    # Single log call; the emitted message is identical to the former
    # two-branch version ("... is relevant" / "... is irrelevant").
    status = "relevant" if is_relevant else "irrelevant"
    logging.debug("Node {}:'{}' is {}".format(
        self.get_depth(node), uri_to_title(node), status))
    return is_relevant
def get_depth(self, node):
    """Returns the previously computed depth of the node."""
    try:
        return self._depth[node]
    except KeyError:
        # Unknown node: it was never processed, so no depth exists for it.
        raise ValueError("Cannot compute the depth of '{}': "
                         "the node has not been processed.".format(
                             uri_to_title(node)))
def supercats(uri):
    """Returns the URIs of the non-hidden parent categories of a category.

    Returns None when the Wikipedia page is missing and [] when the
    category exists but has no parent categories.
    """
    title = uri_to_title(uri)
    params = {
        'action': 'query',
        'prop': 'categories',
        'format': 'json',
        'titles': 'Category:{}'.format(title),
        'clshow': '!hidden'
    }
    data = requests.get(API_URL, params=params, headers=HEADERS).json()
    page = list(data['query']['pages'].values())[0]
    # The legacy JSON format marks missing pages with an empty 'missing' value.
    if page.get('missing') == '':
        logging.warning('Wikipedia page not found: title={}'.format(title))
        return None
    if 'categories' not in page:
        logging.warning('Category: {} has no parent categories.'.format(title))
        return []
    return [
        title_to_uri(re.sub('^Category:', '', cat['title']), category=True)
        for cat in page['categories']
    ]
def sample_paths_through_from_anywhere(self, n_nodes, fixed_length=True,
                                       random_seed=29121985):
    """Samples full paths through random start nodes until n_nodes distinct
    nodes are collected; returns the nodes in first-seen order."""
    random.seed(random_seed)
    collected = collections.OrderedDict()
    candidates = list(self.selection)
    while len(collected) < n_nodes:
        origin = random.choice(candidates)
        path = self._sample_path_through(origin)
        if fixed_length:
            # Truncate to the selection's depth limit.
            path = path[:self.selection._max_depth]
        print("* " + uri_to_title(origin) + " *")
        print(' -> '.join(uri_to_title(step) for step in path))
        collected.update((step, None) for step in path)
    return collected.keys()
def _should_schedule_child(self, node, child):
    """Decides whether `child` (reached from `node`) should be scheduled.

    Side effects: updates the node caches, records detected loops in
    self._loops, and recomputes the child's relevance status. Returns
    True only when visiting the child again could change the selection.
    """
    if not self._classifier and child in self._visited:
        # not re-scheduling the visited nodes
        # when the selection is unconditional (without classifier)
        return False
    # Children of nodes at the maximum depth are never scheduled.
    if self.get_depth(node) >= self._max_depth:
        return False
    # NOTE(review): caches are updated before the ancestor test below —
    # presumably _is_ancestor reads them; confirm the ordering is required.
    self._update_caches(node, child)
    if self._is_ancestor(node, child):
        # TODO: report the whole loop rather than only the endpoints.
        self._loops.append((to_title(node), to_title(child)))
        logging.warning("Loop '{}'<->'{}'".format(uri_to_title(node),
                                                  uri_to_title(child)))
        return False
    # Capture the relevance before and after recomputation so we can tell
    # whether this encounter flipped the child from irrelevant to relevant.
    child_was_irrelevant = not self.is_relevant(child)
    self._compute_and_update_relevance_status(child)
    child_is_relevant = self.is_relevant(child)
    # Re-scheduling only new nodes or nodes that were previously irrelevant;
    # this is an approximation.
    return child_was_irrelevant and child_is_relevant
def sample_paths_down_from_root(self, n_nodes, random_seed=29121985):
    """Samples downward paths from the root until n_nodes distinct nodes
    are collected; returns the nodes in first-seen order (root excluded)."""
    random.seed(random_seed)
    collected = collections.OrderedDict()
    while len(collected) < n_nodes:
        path = self._sample_path_down(self.selection._root)[1:]  # without root
        print(' -> '.join(uri_to_title(step) for step in path))
        collected.update((step, None) for step in path)
    return collected.keys()
def sample_paths_down_from_anywhere(self, n_nodes, random_seed=29121985):
    """Samples downward paths from uniformly random start nodes until
    n_nodes distinct nodes are collected; returns them in first-seen order."""
    random.seed(random_seed)
    collected = collections.OrderedDict()
    candidates = list(self.selection)
    while len(collected) < n_nodes:
        origin = random.choice(candidates)
        path = self._sample_path_down(origin)
        print(' -> '.join(uri_to_title(step) for step in path))
        collected.update((step, None) for step in path)
    return collected.keys()
def _step(self):
    """Performs a single step of the currently BFS-like selection algorithm.

    Examines the next node in the schedule, computes the scores for its
    children, and schedules them. Returns the node.
    """
    node = self._next_node()
    node_title = uri_to_title(node)
    logging.info("Processing the node {}:'{}'".format(
        self.get_depth(node), node_title))
    # Children are visited in sorted order for deterministic processing.
    for child in sorted(self._compute_children(node)):
        logging.info("Processing the child '{}'->'{}'".format(
            node_title, uri_to_title(child)))
        if self._should_schedule_child(node, child):
            self._schedule_node(child)
    logging.debug("Finished processing the children of '{}'".format(node))
    return node
def _get_related_topics(self, topic, relation, cache, api_get): """Gets the topics related to the given topic, e.g. subcats or supercats.""" if topic.encode('utf-8') in cache: related = cache[topic].decode('utf-8').split() else: related = api_get(topic) if related is None: logging.warning( 'Page not in Wikipedia, perhaps deleted: {}'.format( uri_to_title(topic))) related = [] cache[topic] = ' '.join(related) return related
def save_topic_labels(self, topics, labels, filename,
                      data_sampler: TrainingDataSelection = None):
    """Writes one line per topic: its root path (or the topic alone when no
    sampler is given) followed by the label string."""
    if data_sampler:
        # Path from the root to the topic, root excluded.
        paths = (data_sampler._shortest_path_from_root(topic)[1:]
                 for topic in topics)
    else:
        paths = ([topic] for topic in topics)
    lines = []
    for path, label in zip(paths, labels):
        joined = ' -> '.join(
            uri_to_title(step).replace(' ', '_') for step in path)
        lines.append(joined + ' ' + self._label_string(label))
    with open(filename, 'w', encoding='utf-8') as file:
        file.write('\n'.join(lines))
def sample_paths_up_from_leaves(self, n_nodes, random_seed=29121985):
    """Samples upward paths from random leaf nodes until n_nodes distinct
    nodes are collected; returns the nodes in first-seen order."""
    random.seed(random_seed)
    collected = collections.OrderedDict()

    def is_leaf(node):
        return not self.selection.get_children(node)

    leaves = [node
              for node, _ in self.selection._bfs(should_report=is_leaf)]
    while len(collected) < n_nodes:
        leaf = random.choice(leaves)
        path = self._sample_path_up(leaf)[1:]  # without the root
        print(' -> '.join(uri_to_title(step) for step in path))
        collected.update((step, None) for step in path)
    return collected.keys()
def subcats(uri):
    """Returns the URIs of the direct subcategories of the given category."""
    title = uri_to_title(uri)
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': 'Category:{}'.format(title),
        'cmtype': 'subcat',
        'cmlimit': '500',
        'format': 'json'
    }
    data = requests.get(API_URL, params=params, headers=HEADERS).json()
    members = data['query']['categorymembers']
    return [
        title_to_uri(re.sub('^Category:', '', member['title']), category=True)
        for member in members
    ]
def ftr(uri):
    # True when the first token's POS tag equals the captured `pos`.
    tagged = pos_tag(uri_to_title(uri))
    return tagged[0][1] == pos
def ftr(uri):
    # True when the lower-cased title ends with the captured `letter`.
    lowered = uri_to_title(uri).lower()
    return lowered.endswith(letter)
def generate_all_suffices(topic_uris):
    """Returns the 1-, 2- and 3-character suffixes of every title token,
    grouped by suffix length (all length-1 first, then 2, then 3)."""
    # Tokenize each title once instead of once per suffix length.
    words = [word
             for uri in topic_uris
             for word in nltk.word_tokenize(uri_to_title(uri))]
    suffixes = []
    for length in (1, 2, 3):
        suffixes.extend(word[-length:] for word in words)
    return suffixes
def save_pairs(self, pairs, filename):
    """Writes each pair as 'A -> B' (spaces underscored), one per line."""
    def format_topic(topic):
        return uri_to_title(topic).replace(' ', '_')

    lines = (' -> '.join(format_topic(topic) for topic in pair)
             for pair in pairs)
    with open(filename, 'w', encoding='utf-8') as file:
        file.write('\n'.join(lines))
def ftr(uri):
    # True when the title ends with the captured `suffix`.
    title = uri_to_title(uri)
    return title.endswith(suffix)
def report_parent_child(parent, child):
    """Prints a sampled parent->child edge, with spaces in titles underscored."""
    # Bug fix: the original referenced an undefined name `node` while the
    # `parent` parameter went unused.
    parent_title = uri_to_title(parent).replace(' ', '_')
    child_title = uri_to_title(child).replace(' ', '_')
    print("Sampled '{}' -> '{}'".format(parent_title, child_title))
def ftr(uri):
    # True when any token of the title carries the captured POS tag `pos`.
    tagged = pos_tag(uri_to_title(uri))
    return any(tag == pos for _, tag in tagged)
def ftr(uri):
    # True when the title's head word has the captured POS tag `pos`.
    _, head_pos = head_word_pos(uri_to_title(uri))
    return head_pos == pos
def ftr(uri):
    # True when the title's head word ends with the captured `suffix`.
    head_word, _ = head_word_pos(uri_to_title(uri))
    return head_word.endswith(suffix)
def to_title(title_or_uri):
    """Converts a category URI to its title; passes plain titles through."""
    if not is_category_uri(title_or_uri):
        return title_or_uri
    return uri_to_title(title_or_uri)
def generate_all_pos(topic_uris):
    """Returns the POS tag of every token across all the topic titles."""
    tags = []
    for uri in topic_uris:
        tags.extend(tag for _, tag in pos_tag(uri_to_title(uri)))
    return tags