예제 #1
0
    def _classify_node(self, node):
        """Classifies the node, returns True if relevant, False otherwise.

        Logs the relevance decision together with the node's depth.
        """
        # NOTE(review): passing `self` as the first argument to predict()
        # is unusual — confirm the classifier's interface expects the
        # selection object in addition to the node list.
        is_relevant = self._classifier.predict(self, [node])[0]
        # Single log statement replaces the duplicated if/else branches;
        # the emitted messages are unchanged.
        logging.debug("Node {}:'{}' is {}".format(
            self.get_depth(node), uri_to_title(node),
            "relevant" if is_relevant else "irrelevant"))
        return is_relevant
예제 #2
0
 def get_depth(self, node):
     """Returns the previously computed depth of the node.

     Raises:
         ValueError: if the node has not been processed yet.
     """
     if node in self._depth:
         return self._depth[node]
     raise ValueError("Cannot compute the depth of '{}': "
                      "the node has not been processed.".format(
                          uri_to_title(node)))
예제 #3
0
def supercats(uri):
    """Returns the URIs of the parent categories of the given category URI.

    Returns None when the Wikipedia page does not exist, and an empty list
    when the category has no (non-hidden) parent categories.
    """
    title = uri_to_title(uri)

    params = {
        'action': 'query',
        'prop': 'categories',
        'format': 'json',
        'titles': 'Category:{}'.format(title),
        'clshow': '!hidden'
    }

    response = requests.get(API_URL, params=params, headers=HEADERS)
    data = response.json()

    page = list(data['query']['pages'].values())[0]

    # The legacy (formatversion=1) JSON API marks a missing page with an
    # empty-string 'missing' attribute.
    if page.get('missing') == '':
        logging.warning('Wikipedia page not found: title={}'.format(title))
        return None

    # Renamed from get_title: this returns a URI, matching subcats().
    def get_uri(cat):
        return title_to_uri(re.sub('^Category:', '', cat['title']),
                            category=True)

    if 'categories' in page:
        return list(map(get_uri, page['categories']))
    else:
        logging.warning('Category: {} has no parent categories.'.format(title))
        return []
예제 #4
0
 def sample_paths_through_from_anywhere(self,
                                        n_nodes,
                                        fixed_length=True,
                                        random_seed=29121985):
     """Samples paths through random start nodes until n_nodes are collected.

     Returns the collected nodes in first-seen order, without duplicates.
     """
     random.seed(random_seed)
     sampled = collections.OrderedDict()
     candidates = list(self.selection)
     while len(sampled) < n_nodes:
         origin = random.choice(candidates)
         path = self._sample_path_through(origin)
         if fixed_length:
             path = path[:self.selection._max_depth]
         print("* " + uri_to_title(origin) + " *")
         print(' -> '.join(uri_to_title(step) for step in path))
         for step in path:
             sampled[step] = None
     return sampled.keys()
예제 #5
0
 def _should_schedule_child(self, node, child):
     """Decides whether `child` should be (re-)scheduled while processing `node`.

     Returns False for already-visited children (when running without a
     classifier), for children at or beyond the depth limit, and for edges
     that would form a loop.  Otherwise a child is scheduled only when it
     just transitioned from irrelevant (or unseen) to relevant.
     """
     if not self._classifier and child in self._visited:
         return False  # not re-scheduling the visited nodes
         # when the selection is unconditional (without classifier)
     if self.get_depth(node) >= self._max_depth:
         return False
     # NOTE(review): caches are updated before the ancestor test below —
     # presumably _is_ancestor reads them; confirm before reordering.
     self._update_caches(node, child)
     if self._is_ancestor(node, child):
         # TODO: report the whole loop rather than only the endpoints.
         self._loops.append((to_title(node), to_title(child)))
         logging.warning("Loop '{}'<->'{}'".format(uri_to_title(node),
                                                   uri_to_title(child)))
         return False
     # Relevance is sampled before and after recomputation so the
     # transition (irrelevant -> relevant) can be detected.
     child_was_irrelevant = not self.is_relevant(child)
     self._compute_and_update_relevance_status(child)
     child_is_relevant = self.is_relevant(child)
     # Re-scheduling only new nodes or nodes that were previously irrelevant;
     # this is an approximation.
     return child_was_irrelevant and child_is_relevant
예제 #6
0
 def sample_paths_down_from_root(self, n_nodes, random_seed=29121985):
     """Samples downward paths from the root until n_nodes are collected."""
     random.seed(random_seed)
     collected = collections.OrderedDict()
     while len(collected) < n_nodes:
         origin = self.selection._root
         path = self._sample_path_down(origin)[1:]  # without root
         print(' -> '.join(uri_to_title(step) for step in path))
         for step in path:
             collected[step] = None
     return collected.keys()
예제 #7
0
 def sample_paths_down_from_anywhere(self, n_nodes, random_seed=29121985):
     """Samples downward paths from random start nodes until n_nodes are collected."""
     random.seed(random_seed)
     collected = collections.OrderedDict()
     candidates = list(self.selection)
     while len(collected) < n_nodes:
         origin = random.choice(candidates)
         path = self._sample_path_down(origin)
         print(' -> '.join(uri_to_title(step) for step in path))
         for step in path:
             collected[step] = None
     return collected.keys()
예제 #8
0
    def _step(self):
        """Performs a single step of the currently BFS-like selection algorithm.

        Examines the next node in the schedule, computes the scores for its
        children, and schedules them. Returns the node.

        """
        node = self._next_node()

        logging.info("Processing the node {}:'{}'".format(
            self.get_depth(node), uri_to_title(node)))

        children = sorted(self._compute_children(node))
        for child in children:
            logging.info("Processing the child '{}'->'{}'".format(
                uri_to_title(node), uri_to_title(child)))
            should_schedule = self._should_schedule_child(node, child)
            if should_schedule:
                self._schedule_node(child)

        logging.debug("Finished processing the children of '{}'".format(node))
        return node
예제 #9
0
 def _get_related_topics(self, topic, relation, cache, api_get):
     """Gets the topics related to the given topic, e.g. subcats or supercats."""
     # NOTE(review): `relation` is not used in this body — confirm whether
     # it is needed here or only by callers.
     # NOTE(review): membership is tested with a bytes key, but reads and
     # writes use the str key.  This works for dbm-style caches that encode
     # str keys transparently — confirm the cache type before changing.
     if topic.encode('utf-8') in cache:
         # Cached value is a space-separated list of related topics.
         related = cache[topic].decode('utf-8').split()
     else:
         related = api_get(topic)
         if related is None:
             # api_get returns None when the page does not exist.
             logging.warning(
                 'Page not in Wikipedia, perhaps deleted: {}'.format(
                     uri_to_title(topic)))
             related = []
         # Cache the (possibly empty) result so the API is not hit again.
         cache[topic] = ' '.join(related)
     return related
예제 #10
0
 def save_topic_labels(self,
                       topics,
                       labels,
                       filename,
                       data_sampler: TrainingDataSelection = None):
     """Writes one "<path> <label>" line per topic to the given file.

     When data_sampler is provided, each topic is expanded into its shortest
     path from the root (root excluded); otherwise the path is the topic
     alone.
     """
     if data_sampler:
         paths = (data_sampler._shortest_path_from_root(topic)[1:]
                  for topic in topics)
     else:
         paths = ([topic] for topic in topics)
     # The outer loop variable was previously also named 'topics',
     # shadowing the parameter inside the generator expression; renamed
     # to 'path' for clarity (behavior unchanged).
     path_strings = (' -> '.join(
         uri_to_title(topic).replace(' ', '_')
         for topic in path) + ' ' + self._label_string(label)
                     for path, label in zip(paths, labels))
     with open(filename, 'w', encoding='utf-8') as file:
         file.write('\n'.join(path_strings))
예제 #11
0
    def sample_paths_up_from_leaves(self, n_nodes, random_seed=29121985):
        """Samples upward paths from random leaves until n_nodes are collected."""
        random.seed(random_seed)
        collected = collections.OrderedDict()

        def is_leaf(node):
            return not self.selection.get_children(node)

        leaves = [
            node for node, _ in self.selection._bfs(should_report=is_leaf)
        ]
        while len(collected) < n_nodes:
            leaf = random.choice(leaves)
            path = self._sample_path_up(leaf)[1:]  # without the root
            print(' -> '.join(uri_to_title(step) for step in path))
            for step in path:
                collected[step] = None
        return collected.keys()
예제 #12
0
def subcats(uri):
    """Returns the URIs of the direct subcategories of the given category URI.

    Follows API continuation so that categories with more than 500
    subcategories are returned in full; the previous single request
    silently truncated the result at 'cmlimit'.
    """
    title = uri_to_title(uri)

    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': 'Category:{}'.format(title),
        'cmtype': 'subcat',
        'cmlimit': '500',
        'format': 'json'
    }

    categories = []
    while True:
        response = requests.get(API_URL, params=params, headers=HEADERS)
        data = response.json()
        categories.extend(data['query']['categorymembers'])
        # The API signals further results with a top-level 'continue'
        # object whose parameters must be echoed in the next request.
        if 'continue' not in data:
            break
        params.update(data['continue'])

    def get_uri(cat):
        return title_to_uri(re.sub('^Category:', '', cat['title']),
                            category=True)

    return list(map(get_uri, categories))
예제 #13
0
 def ftr(uri):
     """True iff the first token of the title carries the POS tag `pos`."""
     first_token_pos = pos_tag(uri_to_title(uri))[0][1]
     return first_token_pos == pos
예제 #14
0
 def ftr(uri):
     """True iff the lower-cased title ends with `letter`."""
     lowered = uri_to_title(uri).lower()
     return lowered.endswith(letter)
예제 #15
0
def generate_all_suffices(topic_uris):
    """Returns the 1-, 2- and 3-character suffixes of every title word.

    The ordering of the original implementation is preserved: all
    1-character suffixes first, then the 2-character ones, then the
    3-character ones.
    """
    # Tokenize each title once instead of once per suffix length.
    words = [
        word for uri in topic_uris
        for word in nltk.word_tokenize(uri_to_title(uri))
    ]
    return [word[-i:] for i in [1, 2, 3] for word in words]
예제 #16
0
 def save_pairs(self, pairs, filename):
     """Writes one "topic -> topic" line per pair to the given file."""
     def format_pair(pair):
         # Underscore-join multi-word titles so each title is one token.
         return ' -> '.join(
             uri_to_title(topic).replace(' ', '_') for topic in pair)

     with open(filename, 'w', encoding='utf-8') as file:
         file.write('\n'.join(format_pair(pair) for pair in pairs))
예제 #17
0
 def ftr(uri):
     """True iff the title ends with `suffix`."""
     title = uri_to_title(uri)
     return title.endswith(suffix)
예제 #18
0
 def report_parent_child(parent, child):
     """Prints the sampled parent -> child edge."""
     # Bug fix: the original read the free variable `node` instead of the
     # `parent` parameter (which was unused), so the reported parent could
     # be stale or wrong.
     parent_title = uri_to_title(parent).replace(' ', '_')
     child_title = uri_to_title(child).replace(' ', '_')
     print("Sampled '{}' -> '{}'".format(parent_title, child_title))
예제 #19
0
 def ftr(uri):
     """True iff any token of the title carries the POS tag `pos`."""
     tagged = pos_tag(uri_to_title(uri))
     return any(tag == pos for _, tag in tagged)
예제 #20
0
 def ftr(uri):
     """True iff the head word of the title carries the POS tag `pos`."""
     _, head_pos = head_word_pos(uri_to_title(uri))
     return head_pos == pos
예제 #21
0
 def ftr(uri):
     """True iff the head word of the title ends with `suffix`."""
     head_word, _ = head_word_pos(uri_to_title(uri))
     return head_word.endswith(suffix)
예제 #22
0
def to_title(title_or_uri):
    """Converts a category URI to its title; returns any other string unchanged."""
    is_uri = is_category_uri(title_or_uri)
    return uri_to_title(title_or_uri) if is_uri else title_or_uri
예제 #23
0
def generate_all_pos(topic_uris):
    """Returns the POS tag of every token across all topic titles."""
    tags = []
    for uri in topic_uris:
        for _token, tag in pos_tag(uri_to_title(uri)):
            tags.append(tag)
    return tags