def find_events_from_wikipedia_baseline(self,
                                        word,
                                        max_events_per_year,
                                        years,
                                        include_score=False,
                                        min_occurrences=5):
        """
        For each given year, find the events whose text mentions the word most.

        The score of an event is the number of whole-word, case-insensitive
        occurrences of `word` in `self.event_to_text_content[event]`.

        :param word: the word to look for; a falsy value yields None
        :param max_events_per_year: passed to MaxHeap as its size argument
            (presumably caps the number of events kept per year - confirm
            against MaxHeap)
        :param years: iterable of years; each must be a key of
            `self.year_to_event`
        :param include_score: if True, every entry is 'event--score',
            otherwise just the event
        :param min_occurrences: an event is kept only when its score is
            strictly greater than this value
        :return: OrderedDict mapping each year to its events, sorted by
            descending heap entry, or None when `word` is falsy
        """
        if not word:
            return None
        # The pattern depends only on the word, so build it once instead of
        # once per event per year.
        word_pattern = re.compile(r'\b%s\b' % re.escape(word.lower()))
        key_years_to_events = OrderedDict([(year, []) for year in years])
        for key_year in years:
            # collect the best-scoring events of that year
            top_key_events = MaxHeap(max_events_per_year)
            for e in self.year_to_event[key_year]:
                # count occurrences of the word in the event's text content
                text = self.event_to_text_content[e].lower()
                score = sum(1 for _ in word_pattern.finditer(text))
                if score > min_occurrences:
                    top_key_events.add(score, e)
            ranked = sorted(top_key_events.heap, reverse=True)
            if include_score:
                key_years_to_events[key_year] = [
                    item[1] + '--' + str(round(item[0], 2)) for item in ranked
                ]
            else:
                key_years_to_events[key_year] = [item[1] for item in ranked]

        return key_years_to_events
예제 #2
0
 def getSkyline(self, buildings):
     """
     Compute the skyline (outer contour) of a set of rectangular buildings.

     Each building is given as [left, right, height]. The result lists the
     key points [x, height] at which the contour height changes, ordered
     by x; the last point of every contiguous group has height 0.

     :type buildings: List[List[int]]
     :rtype: List[List[int]]
     """
     import heapq

     if not buildings:
         return []

     # One event per building edge. Start events carry the negated height
     # (so that, at equal x, taller starts sort first and all starts sort
     # before ends) together with the x at which the building expires.
     events = sorted(
         [(b[0], -b[2], b[1]) for b in buildings] +
         [(b[1], 0, 0) for b in buildings])

     # Min-heap of (-height, right); the sentinel is the ground level and
     # never expires. The leading [0, 0] in result is a placeholder that
     # lets the loop compare against "previous height" uniformly.
     live = [(0, float('inf'))]
     result = [[0, 0]]
     for x, neg_height, right in events:
         # Lazy deletion: drop buildings that ended at or before x only
         # once they surface at the top of the heap.
         while live[0][1] <= x:
             heapq.heappop(live)
         if neg_height:
             heapq.heappush(live, (neg_height, right))
         cur_height = -live[0][0]
         if result[-1][1] != cur_height:
             result.append([x, cur_height])
     return result[1:]
    def find_key_events_by_word(self,
                                word,
                                max_events_per_year,
                                years,
                                include_score=False):
        """
        For each given year, collect the events most similar to the word.

        An event qualifies when the lower-cased word occurs in
        `self.event_to_content[event]`, the year's model knows both the
        event and the word, and their similarity exceeds
        `self.knn_threshold`.

        Returns an OrderedDict from year to a list of events sorted by
        descending heap entry ('event--score' strings when `include_score`
        is set), or None when `word` is falsy.
        """
        if not word:
            return None
        query = word.lower()
        results = OrderedDict((year, []) for year in years)
        for year in years:
            year_model = self.get_model(year)
            best = MaxHeap(max_events_per_year)
            for event in self.get_relevant_events(year):
                # guard clauses: skip events that cannot be scored
                if query not in self.event_to_content[event]:
                    continue
                if not year_model.contains_all_words([event, query]):
                    continue
                sim = year_model.similarity(event, query)
                if sim > self.knn_threshold:
                    best.add(sim, event)
            ranked = sorted(best.heap, reverse=True)
            if include_score:
                labels = [
                    entry[1] + '--' + str(round(entry[0], 2))
                    for entry in ranked
                ]
            else:
                labels = [entry[1] for entry in ranked]
            results[year] = labels

        return results
 def find_key_events_by_classifier(self,
                                   word,
                                   min_classifier_score,
                                   max_events_per_year,
                                   existing_key_years_to_events,
                                   include_score=False):
     """
     find important events using our events classifier, and word2vec similarities as a filter.
     'key_years_to_events' should be calculated by another method ('find_key_events_...'),
     preferably with a bigger max_events_num, as we don't want to just filter an existing method.

     :param word: the query word; a falsy value yields None
     :param min_classifier_score: only events whose combined score is
         strictly greater than this value are returned
     :param max_events_per_year: passed to MaxHeap as its size argument
     :param existing_key_years_to_events: mapping year -> iterable of
         (event, score) pairs produced by a previous method; each score
         must be convertible with float()
     :param include_score: if True, entries are 'event--score' strings
     :return: OrderedDict keyed by `all_years` (a name defined outside this
         method), with the selected events of every processed year, or
         None when `word` is falsy
     """
     if not word:
         return None
     word = word.lower()
     key_years_to_events = OrderedDict([(year, []) for year in all_years])
     for key_year, top_events_scores in existing_key_years_to_events.items(
     ):
         if not top_events_scores:
             continue
         # run the classifier for these events
         event_to_features = {}
         event_to_prev_method_score = {}
         for event, score in top_events_scores:
             event_to_prev_method_score[event] = float(score)
             feature_vector, _ = self.classifier.featurize_event_word(
                 (event, word))
             if feature_vector is not None:
                 event_to_features[event] = feature_vector
         if not event_to_features:
             # No event could be featurized: there is nothing to classify,
             # and indexing column 1 of an empty prediction would fail.
             continue
         probs = list(
             self.classifier.classifier.classifier.predict_proba(
                 list(event_to_features.values()))
         )  # probabilities for the true class
         y_prob = np.array(probs)[:, 1]
         top_key_events = MaxHeap(max_events_per_year)
         for event_i, event in enumerate(event_to_features):
             # weighted blend: 40% classifier probability, 60% previous score
             event_score = (y_prob[event_i] * 4 +
                            event_to_prev_method_score[event] * 6) / 10
             top_key_events.add(event_score, event)
         top_key_events = sorted(top_key_events.heap, reverse=True)
         key_years_to_events[key_year] = [
             item[1] + '--' +
             str(round(item[0], 2)) if include_score else item[1]
             for item in top_key_events if item[0] > min_classifier_score
         ]
     return key_years_to_events
예제 #5
0
class PriorityQueue(QueueBase):
    """Queue whose elements are stored in a ``MaxHeap``.

    Every operation is a direct delegation to the wrapped heap, so the
    ordering of dequeued elements is whatever ``MaxHeap`` defines
    (presumably largest element first - confirm against MaxHeap).
    """

    def __init__(self):
        # Backing store; all queue operations below delegate to it.
        self._max_heap = MaxHeap()

    def get_size(self):
        """Return the value of the heap's ``size()``."""
        return self._max_heap.size()

    def is_empty(self):
        """Return the value of the heap's ``is_empty()``."""
        return self._max_heap.is_empty()

    def get_front(self):
        """Return the heap's ``find_max()`` result (the front of the queue)."""
        return self._max_heap.find_max()

    def enqueue(self, e):
        """Add element ``e`` to the underlying heap."""
        self._max_heap.add(e)

    def dequeue(self):
        """Return the heap's ``extract_max()`` result."""
        return self._max_heap.extract_max()
    def find_key_events_by_knn(self,
                               word,
                               max_events_per_year,
                               years,
                               include_score=False):
        """
        For each given year, collect the events closest to the word and to
        the word's similar words for that year.

        An event's score is the mean similarity between the event and every
        query word (the word itself plus that year's similar words) that the
        year's model knows together with the event; only events whose
        content contains the word are scored, and only scores above
        `self.knn_threshold` are kept.

        Returns an OrderedDict from year to a list of events sorted by
        descending heap entry ((event, score-string) tuples when
        `include_score` is set), or None when `word` is falsy.
        """
        if not word:
            return None
        word = word.lower()
        neighbours_by_year = self.get_similar_words_per_year(word)
        results = OrderedDict((year, []) for year in years)

        for year in years:
            year_model = self.get_model(year)
            best = MaxHeap(max_events_per_year)
            # query words: the word itself, plus its neighbours when known
            neighbours = neighbours_by_year[year]
            if neighbours is None:
                query_words = [word]
            else:
                query_words = [word] + neighbours
            for event in self.get_relevant_events(year):
                sims = []
                for candidate in query_words:
                    if (word in self.event_to_content[event]
                            and year_model.contains_all_words(
                                [event, candidate])):
                        sims.append(year_model.similarity(event, candidate))
                if not sims:
                    continue
                avg = np.mean(sims)
                if avg > self.knn_threshold:
                    best.add(avg, event)
            ranked = sorted(best.heap, reverse=True)
            if include_score:
                results[year] = [(entry[1], str(round(entry[0], 2)))
                                 for entry in ranked]
            else:
                results[year] = [entry[1] for entry in ranked]

        return results
예제 #7
0
    def knn_search(self, Xi):
        """
        Search `self.tree` for the nearest neighbours of `Xi`.

        Candidate nodes are collected in a MaxHeap built with
        `self.k_neighbors` and keyed on each node's `dist` attribute, which
        this method sets to `tree.get_eu_dist(Xi, node)` before adding the
        node. Returns that heap.
        """
        from collections import deque

        tree = self.tree
        heap = MaxHeap(self.k_neighbors, lambda x: x.dist)
        # Node reached when searching for Xi from the root down the tree
        nd = tree.search(Xi, tree.root)
        # Initialise the FIFO work list of (subtree root, node reached in
        # that subtree) pairs; a deque gives O(1) pops from the left.
        que = deque([(tree.root, nd)])
        while que:
            # Distance between Xi and the root of the current subtree
            nd_root, nd_cur = que.popleft()
            nd_root.dist = tree.get_eu_dist(Xi, nd_root)
            heap.add(nd_root)
            # Walk back up from the reached node to the subtree root
            while nd_cur is not nd_root:
                # Distance between Xi and the current node
                nd_cur.dist = tree.get_eu_dist(Xi, nd_cur)
                # Update the best nodes and distances
                heap.add(nd_cur)
                # Schedule the sibling subtree when the heap's first item is
                # farther away than the parent's splitting hyperplane.
                # NOTE(review): `not heap` relies on MaxHeap's truthiness; if
                # the intent is "heap not yet full", confirm it behaves so.
                if nd_cur.brother and (not heap or heap.items[0].dist > tree.get_hyper_plane_dist(Xi, nd_cur.father)):
                    _nd = tree.search(Xi, nd_cur.brother)
                    que.append((nd_cur.brother, _nd))
                nd_cur = nd_cur.father

        return heap