示例#1
0
	def __init__(self, *args, **kwargs):
		"""Initialise the heap.

		Positional arguments are forwarded to ``list.__init__`` and the
		resulting list is heapified in place.  The optional ``maxsize``
		keyword sets the capacity limit (default 10).
		"""
		list.__init__(self, *args)
		heapq.heapify(self)
		self.maxsize = kwargs.get('maxsize', 10)
示例#2
0
    def _update_unavailable_forwarders(self, forwarder, action):
        """
        Update forwarder status.

        Move ``forwarder`` from the available-load list to the
        unavailable-load list (restoring the heap invariant of the
        available list), or, if it is already unavailable, only adjust
        its recorded load.

        :param forwarder: the forwarder to mark as unavailable.
        :param action: the triggering operation; when it equals
            ``self.DEPLOY_INPUT`` the matching entry's ``load`` counter
            is decremented by one.
        """

        # NOTE(review): both forwarder-load lists are mutated while
        # holding only reader_lock — confirm that this lock permits
        # concurrent mutation, otherwise writer_lock looks safer.
        with self._forwarders_lock.reader_lock:
            if forwarder in {
                    forwarder_load.forwarder
                    for forwarder_load in self._available_forwarderloads
            }:
                # Enumerate a deep copy so the `del` below cannot upset
                # iteration; indices still line up because deepcopy
                # preserves list order.
                for index, entry in enumerate(
                        deepcopy(self._available_forwarderloads)):
                    if entry.forwarder == forwarder:
                        if action == self.DEPLOY_INPUT:
                            entry.load -= 1
                        # The deep-copied entry (not the live one) is
                        # what gets moved to the unavailable list.
                        self._unavailable_forwarderloads.append(entry)
                        del self._available_forwarderloads[index]
                        # Removal from the middle breaks the heap
                        # property; rebuild it.
                        heapq.heapify(self._available_forwarderloads)
                        break
            else:
                # Forwarder is already unavailable: just update the
                # stored load in place.
                for index, entry in enumerate(
                        self._unavailable_forwarderloads):
                    if entry.forwarder == forwarder:
                        if action == self.DEPLOY_INPUT:
                            entry.load -= 1
                        break
示例#3
0
    def __init__(self, items=None):
        """ Create a new PriorityQueueSet.

            items:
                An initial item list - it can be unsorted and
                non-unique. The data structure will be created in
                O(N).
        """
        # Use None as the default: a mutable default ([]) would be
        # shared across all instances created without arguments.
        if items is None:
            items = []
        self.set = dict((item, True) for item in items)
        # list(...) materialises the keys: on Python 3 a bare .keys()
        # is a view object and heapq.heapify() cannot mutate it.
        self.heap = list(self.set)
        heapq.heapify(self.heap)
示例#4
0
    def __init__(self, items=None):
        """ Create a new PriorityQueueSet.

            items:
                An initial item list - it can be unsorted and
                non-unique. The data structure will be created in
                O(N).
        """
        # Use None as the default: a mutable default ([]) would be
        # shared across all instances created without arguments.
        if items is None:
            items = []
        self.set = dict((item, True) for item in items)
        # list(...) materialises the keys: on Python 3 a bare .keys()
        # is a view object and heapq.heapify() cannot mutate it.
        self.heap = list(self.set)
        heapq.heapify(self.heap)
示例#5
0
    def set_proxy_status(self, url, proxy, status):
        """.. :py:method::
            Now, crawler only set status after proxy failed.
            remove this proxy from self._pool, self._table[domain],
            and priority queue.

        :param url: URL whose netloc selects the per-domain proxy table
        :param proxy: the proxy whose status is being reported
        :param status: ``self.FAILED`` or ``self.SUCCESS``
        """
        domain = urlparse.urlparse(url).netloc
        proxies_table = self._table[domain]
        now = time.time()

        if status == self.FAILED:
            if proxy in proxies_table:
                last_time, count = proxies_table[proxy]
                _count = self._count_rule('set', count)
                if _count >= self.FAIL_THRESHOLD:
                    # Too many failures: drop the proxy for this domain.
                    proxies_table.pop(proxy)
                    # 1. this proxy not available, remove from pool
                    # 2. this proxy available for other sites, not remove
                    self._pool.remove(proxy)

                    if 'priority' in proxies_table:
                        # Remove the matching heap entry, then rebuild
                        # the heap since list.remove breaks its order.
                        proxies_table['priority'].remove(
                            [last_time, count, proxy])
                        heapq.heapify(proxies_table['priority'])
                else:
                    # NOTE(review): this item assignment needs the stored
                    # value to be a mutable list, but the branch below
                    # stores a tuple — confirm which shape callers use.
                    proxies_table[proxy][1] = _count
                    # Keep the heap entry's fail count in sync as well.
                    idx = proxies_table['priority'].index(
                        [last_time, count, proxy])
                    proxies_table['priority'][idx][1] = _count

            else:  # not execute here now
                proxies_table[proxy] = (now, 1)
        elif status == self.SUCCESS:
            pass
示例#6
0
    def _handle_forwarders(self):
        """
        Handle settings of forwarders.

        Rebuilds the available-forwarder load heap from current
        settings, classifies forwarders into new / existing-disabled /
        deleted relative to the stored snapshot, persists the updated
        snapshot (best effort), and dispatches a reset task per
        classified forwarder on the thread pool, blocking until all
        resets finish.
        """

        forwarder_schema = self._dispatch_schema_manager.get_forwarder_schema()
        forwarders = {
            forwarder_name: forwarder_setting
            for forwarder_name, forwarder_setting in
            self._settings[forwarder_schema].iteritems()
        }

        # Update available forwarders and forwarders dispatch map
        with self._forwarders_lock.writer_lock:
            # Only forwarders that are not disabled become available;
            # each entry pairs the forwarder with its current load.
            self._available_forwarderloads = [
                ForwarderLoad(
                    forwarder,
                    self._dispatch_snapshot_manager.get_forwarder_load(
                        forwarder))
                for forwarder, forwarder_setting in forwarders.iteritems()
                if not self._dispatch_schema_manager.forwarder_is_disabled(
                    forwarder_setting)
            ]

            # Turn the list into a min-heap so the least-loaded
            # forwarder can be popped first.
            heapq.heapify(self._available_forwarderloads)
            self._unavailable_forwarderloads = []

        with self._forwarders_snapshot_lock.writer_lock:
            self._forwarders_snapshot = \
                self._get_forwarders_snapshot_callback()

            # Forwarders new to reset
            forwarders_reset_new = {
                forwarder_name: forwarder_setting
                for forwarder_name, forwarder_setting in
                forwarders.iteritems()
                if forwarder_name not in self._forwarders_snapshot
            }

            # Forwarders exist to reset (present in snapshot but now
            # disabled in settings)
            forwarders_reset_exist = {
                forwarder_name: forwarder_setting
                for forwarder_name, forwarder_setting in
                forwarders.iteritems()
                if forwarder_name in self._forwarders_snapshot
                and self._dispatch_schema_manager.forwarder_is_disabled(
                    forwarder_setting)
            }

            # Forwarders delete to reset (in snapshot but removed from
            # current settings)
            forwarders_reset_delete = {
                forwarder_name: forwarder_setting
                for forwarder_name, forwarder_setting in
                self._forwarders_snapshot.iteritems()
                if forwarder_name not in forwarders
            }

            # Update forwarder snapshot with the latest settings for
            # forwarders the snapshot already knows about.
            for forwarder_name, forwarder_setting in forwarders.iteritems():
                if forwarder_name in self._forwarders_snapshot:
                    self._forwarders_snapshot[forwarder_name] = \
                        deepcopy(forwarder_setting)
            # Persistence is best effort: a failure is logged and
            # retried on the next invocation.
            try:
                self._update_forwarders_snapshot_callback(
                    self._forwarders_snapshot)
            except Exception as e:
                log.logger.warn(
                    "message=\"Update forwarders snapshot failed, "
                    "will try to update forwarders snapshot next "
                    "time\" "
                    "detail_info=\"%s\"", traceback.format_exc(e))
        # Fan the reset work out to the thread pool, one task per
        # classified forwarder.
        handle_futures = []
        for forwarder_name, forwarder_setting in \
                forwarders_reset_new.iteritems():
            handle_futures.append(
                self._threadpool_executor.submit(self._reset_forwarder,
                                                 forwarder_name,
                                                 forwarder_setting,
                                                 self.FORWARDER_NEW))

        for forwarder_name, forwarder_setting in \
                forwarders_reset_exist.iteritems():
            handle_futures.append(
                self._threadpool_executor.submit(self._reset_forwarder,
                                                 forwarder_name,
                                                 forwarder_setting,
                                                 self.FORWARDER_EXIST))

        for forwarder_name, forwarder_setting in \
                forwarders_reset_delete.iteritems():
            handle_futures.append(
                self._threadpool_executor.submit(self._reset_forwarder,
                                                 forwarder_name,
                                                 forwarder_setting,
                                                 self.FORWARDER_DELETE))
        # Wait until all tasks are done
        futures.wait(handle_futures, return_when=futures.ALL_COMPLETED)
示例#7
0
 def findKthLargest(self, nums, k):
     """Return the k-th largest element of ``nums``.

     :param nums: sequence of comparable values (left unmodified,
         unlike the previous version which emptied the list).
     :param k: 1-based rank from the largest element; assumed
         1 <= k <= len(nums).
     """
     # Plain stdlib import: the old ``from Queue import heapq`` only
     # worked because Python 2's Queue module happened to import heapq
     # internally (an implementation detail, gone in Python 3).
     import heapq
     # nlargest keeps a k-sized heap: O(n log k) instead of heapifying
     # and popping the entire list.
     return heapq.nlargest(k, nums)[-1]
示例#8
0
 def search(self, query, return_length=100, passage_len=50, return_urls_only=False):
     '''
         Performs search on loaded data.
         Returns a list sorted by descending passage score of:
           * tuples (url_id, url, score) if return_urls_only == False
           * tuples (url_id, url)        if return_urls_only == True

         return_length: maximum number of candidates kept after the
             BM25 pre-ranking stage.
         passage_len: maximum window width (in token positions) used
             by the passage-scoring stage.
     '''
     query = query.strip()
     # Split on single spaces and drop empty tokens (Py2: filter
     # returns a list here).
     words = filter(lambda x: x != '', query.split(" "))
     result = None
     if len(words) == 0:
         return []
     # word_index[z]: per-word posting data, or None if the word is
     # not in the dictionary.
     word_index = [None] * len(words)
     for z, word in enumerate(words):
         word = self.norm(word.decode('utf-8').strip())
         if word in self.dictionary:
             # Dictionary maps word -> byte offset of its posting
             # line in the index file.
             self.index.seek(self.dictionary[word], 0)
             compressed = self.index.readline().strip()
             decompressed = None
             if self.encoding == VARBYTE:
                 decompressed = decode_varbyte(base64.b64decode(compressed))
             elif self.encoding == SIMPLE9:
                 decompressed = decode_simple9(base64.b64decode(compressed))
             decompressed, word_index[z] = from_flat(decompressed)
             # Intersect posting lists across query words (AND query).
             if result == None:
                 result = decompressed
             else:
                 result = join(result, decompressed)

     # BM25 parameters.
     k1 = 2
     b = 0.75
     if result == None or len(result) == 0:
         return []

     # Now we have a list of candidates. We apply BM25 to leave only
     # return_length of them.
     avg_len = 0.
     # j: first query word that actually has posting data.
     j = 0
     while word_index[j] == None:
         j += 1

     # Average document length over the candidate set; word_index
     # entries map doc_id -> (doc_len, positions) — doc_len at [0].
     for i in xrange(len(result)):
         if result[i] in word_index[j]:
             avg_len += word_index[j][result[i]][0]

     avg_len /= len(result)
     BM25 = [0] * len(result)
     for j in xrange(len(words)):
         if word_index[j] != None:
             idf = log(float(self.N) / len(word_index[j]))
             for i in xrange(len(result)):
                 if result[i] in word_index[j]:
                     # tf = occurrences / doc length.
                     tf = float(len(word_index[j][result[i]][1])) / word_index[j][result[i]][0]
                     BM25[i] += tf * idf / (tf + k1 * (b + word_index[j][result[i]][0] / avg_len * (1 - b)))
     if len(result) > return_length:
         # Keep the top return_length candidates with a min-heap of
         # (rank, doc_id) pairs.
         tpr = [(x, y) for x, y in zip(BM25, result)]
         heap = tpr[:return_length]
         heapq.heapify(heap)
         for rank, ind in tpr[return_length:]:
             # heap[0] is the current minimum; replace it when beaten.
             if heapq.nsmallest(1, heap)[0][0] < rank:
                 heapq.heappop(heap)
                 heapq.heappush(heap, (rank, ind))
         result = [ind for rank, ind in heap]
     # Now we have a shortened list of candidates. We apply the passage
     # algorithm to score each remaining document.
     scores = [0] * len(result)
     for i in xrange(len(result)):
         # passage: sorted (position, word_idx) pairs for this doc.
         passage = []
         for j in xrange(len(words)):
              if word_index[j] != None and result[i] in word_index[j]:
                     passage.extend([(x, j) for x in word_index[j][result[i]][1]])
         passage.sort()
         l = 0
         r = 0
         # Five passage features; their sum is the document score.
         features = [0] * 5
         for l in xrange(len(passage)):
             for r in xrange(l, len(passage)):
                 # Skip windows wider than passage_len positions.
                 if passage[r][0] - passage[l][0] + 1 > passage_len:
                     continue
                 passage_w = [x[1] for x in passage[l:r+1]]
                 # [0] query-word coverage of the window.
                 features[0] = len(set([x[1] for x in passage[l:r+1]])) / float(len(words))
                 # [1] closeness of the window to document start.
                 features[1] = 1 - float(passage[l][0]) / word_index[passage[l][1]][result[i]][0]
                 # [2] window density (hits per position).
                 features[2] = 1 - float(r - l + 1) / (passage[r][0] - passage[l][0] + 1)

                 # [3] tf-idf mass of the window (idf normalised by log N).
                 features[3] = 0
                 for j in xrange(len(words)):
                     if word_index[j] != None:
                         idf = log(float(self.N) / len(word_index[j])) / log(self.N)
                         tf = float(passage_w.count(j)) / (passage[r][0] - passage[l][0] + 1)
                         features[3] += tf * idf
                 # [4] fraction of out-of-query-order word pairs
                 # (normalised inversion count).
                 features[4] = 0
                 for j in xrange(len(passage_w)-1):
                     for k in xrange(j + 1, len(passage_w)):
                         if passage_w[j] > passage_w[k]:
                             features[4] += 1
                 if len(passage_w) != 1:
                     features[4] /= float(len(passage_w) * (len(passage_w) - 1) / 2)

                 # Keep the best window score per document.
                 score = reduce(lambda x,y: x + y, features)
                 if score > scores[i]:
                     scores[i] = score

     final_result = []
     for score, url_id in sorted(zip(scores, result), reverse=True):
         final_result.append((url_id, self.urls[url_id], score))
     if return_urls_only:
         # Drop the score, keeping (url_id, url) pairs.
         final_result = [x[:-1] for x in final_result]
     return final_result