Example #1
def update_keywords():
    sm_api = SmugmugAPI()

    def worker():
        logger.info('[Worker started]')
        while True:
            item = q.get()
            try:
                sm_api.update_image_keywords(*item)
            finally:
                q.task_done()

    q = JoinableQueue(maxsize=100)
    for i in range(50):
        gevent.spawn(worker)

    photos = (Photo.select(Photo.local_path, Photo.ext_key).where(
        (Photo.status == 'uploaded')))
    photos = list(photos)
    print("Total photos to update:", len(photos))
    cnt = 0
    for p in photos:
        cnt += 1
        print(cnt)
        keywords = get_keywords(p.local_path)
        q.put((p.ext_key, keywords))

    q.join()
Example #2
def upload_photos_in_pending(with_failed=True):
    q_filter = ['pending']
    if with_failed:
        q_filter.append('failed')

    photos = (Photo.select(Photo.local_path, Photo.ext_album_key).where(
        (Photo.status << q_filter)))
    photos = list(photos)

    def worker():
        logger.info('[New worker started]')
        while True:
            item = q.get()
            try:
                upload_photo(item)
            finally:
                q.task_done()

    q = JoinableQueue(maxsize=10)
    for i in range(UPLOADING_WORKERS_COUNT):
        gevent.spawn(worker)

    for p in photos:
        q.put((p.local_path, p.ext_album_key))

    q.join()
Example #3
File: gqueue.py Project: whtsky/gqueue
class GQueue(object):
    def __init__(self):
        self.__QUEUE = JoinableQueue()

    def job(self, func):
        @functools.wraps(func)
        def f(*args, **kwargs):
            self.__QUEUE.put([func, args, kwargs])

        return f

    def join(self):
        self.__QUEUE.join()

    def work(self):
        while True:
            func, args, kwargs = self.__QUEUE.get()
            try:
                func(*args, **kwargs)
            finally:
                self.__QUEUE.task_done()

    def run_worker(self, num=1):
        for i in range(num):
            gevent.spawn(self.work)
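A minimal usage sketch for the GQueue class above; the crawl function, its arguments, and the worker count are illustrative and not part of the whtsky/gqueue project:

queue = GQueue()

@queue.job
def crawl(url):
    # The decorated call enqueues (crawl, args, kwargs) instead of running it immediately.
    print('crawling', url)

queue.run_worker(num=4)          # spawn 4 consumer greenlets
for u in ['http://example.com/a', 'http://example.com/b']:
    crawl(u)
queue.join()                     # block until every queued job has been processed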
Example #4
def extract(input_dir, output_path, func):
    with open(output_path, 'w') as output:

        tasks = JoinableQueue()
        for file_name in os.listdir(input_dir):
            tasks.put(file_name)

        def _extract(file_name):
            file_path = os.path.join(input_dir, file_name)

            with open(file_path) as f:
                try:
                    json = simplejson.load(f)
                except Exception as e:
                    print(str(e))
                    print('Failed to load json file {}'.format(file_path))
                    return  # json is undefined past this point, so skip this file

                for pair in func(json):
                    output.write('\t'.join([str(x) for x in pair]) + '\n')

        def worker():
            while True:
                file_name = tasks.get()
                try:
                    _extract(file_name)
                    print(file_name)
                finally:
                    tasks.task_done()


        for i in range(10):
            gevent.spawn(worker)

        tasks.join()
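A brief usage sketch for extract() above; the input directory, output path, and the pair-yielding callback are illustrative:

def user_pairs(doc):
    # Hypothetical callback: yield (key, value) tuples from one parsed JSON document.
    yield doc.get('user', ''), doc.get('id', '')

extract('input_json_dir', 'pairs.tsv', user_pairs)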
Example #5
def processor(data):
    """
    Each launched process(=NUM_CORES) executes 1 item in the list map_data as data.
    For given start_id and batch_size, launches gevent consumers to scrape data for the given ID
    Also, the main thread acts as a producer to produce the data for the workers to use 
    """
    try:
        NUM_GREENLETS = 8  # Depending on how much I/O block is expected. Varies for each problem.
        process_id = multiprocessing.current_process()
        monkey.patch_all()  # Patch the standard library so blocking I/O becomes cooperative

        start_id = data["start_id"]
        batch_size = data["batch_size"]

        joinable_queue = JoinableQueue()

        # Launch NUM_GREENLETS workers
        for i in range(NUM_GREENLETS):
            gevent.spawn(worker,
                         joinable_queue=joinable_queue,
                         greenlet_id=i,
                         process_id=process_id)

        # Producer
        for id in range(start_id, start_id + batch_size):
            joinable_queue.put(id)

        joinable_queue.join()

    except:
        # Uncaught errors in a forked child process are not redirected to stderr
        # (each fork has its own pipe), so print the traceback explicitly.
        print(traceback.format_exc())
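processor() spawns a worker function that is not shown in this excerpt. A hedged sketch of what such a consumer might look like, with scrape_id() as a hypothetical stand-in for the real I/O-bound call (traceback is already imported by this module):

def worker(joinable_queue, greenlet_id, process_id):
    # Hypothetical consumer: drains the IDs that processor() produces above.
    while True:
        item_id = joinable_queue.get()
        try:
            scrape_id(item_id)          # placeholder for the real scraping call
        except Exception:
            print(traceback.format_exc())
        finally:
            joinable_queue.task_done()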
Example #6
def handle():
    connection = create_postgresql_connection()

    cursor = connection.cursor()
    cursor.execute("BEGIN;")
    cursor.execute("DELETE FROM core_ratequery;")
    cursor.execute("COMMIT;")
    cursor.close()

    queue = JoinableQueue()
    event = Event()

    age_ids = age_map(connection).values() + [None]
    sex_ids = sex_map(connection).values() + [None]
    education_ids = education_map(connection).values() + [None]
    province_ids = province_map(connection).values() + [None]

    cursor = connection.cursor()
    cursor.execute("SELECT DISTINCT cycle FROM core_microdata;");
    cycles = [row[0] for row in cursor]
    cursor.close()

    greenlets = []

    for i in range(50):
        gv = gevent.spawn(worker, queue, event)
        greenlets.append(gv)

    combs = itertools.product(age_ids, sex_ids, province_ids, education_ids, cycles)
    for c in combs:
        queue.put(c)

    queue.join()
    event.set()
    gevent.joinall(greenlets)
Example #7
class GeventPoolExecutor2(LoggerMixin):
    def __init__(
        self,
        max_works,
    ):
        check_gevent_monkey_patch()
        self._q = JoinableQueue(maxsize=max_works)
        # self._q = Queue(maxsize=max_works)
        for _ in range(max_works):
            # self.logger.debug('yyyyyy')
            gevent.spawn(self.__worker)
        atexit.register(self.__atexit)

    def __worker(self):
        while True:
            fn, args, kwargs = self._q.get()
            # noinspection PyBroadException
            try:
                fn(*args, **kwargs)
            except Exception as exc:
                self.logger.exception(
                    f'Error in function {fn.__name__}, caused by {type(exc)} {exc}')
            finally:
                self._q.task_done()

    def submit(self, fn: Callable, *args, **kwargs):
        # self.logger.debug(self._q.qsize())
        self._q.put((fn, args, kwargs))

    def __atexit(self):
        self.logger.critical('The program is about to exit.')
        self._q.join()
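A hedged usage sketch for the executor above; the fetch task and the pool size are illustrative, and gevent monkey patching is assumed to be in place, as check_gevent_monkey_patch() requires:

import gevent

def fetch(n):
    gevent.sleep(0.1)                # stand-in for blocking I/O made cooperative by gevent
    print('done', n)

pool = GeventPoolExecutor2(max_works=8)
for i in range(20):
    pool.submit(fetch, i)            # blocks when the bounded queue is full (back-pressure)
gevent.sleep(1)                      # let the workers drain; __atexit also joins the queue at exit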
Example #8
class GeventPoolExecutor2(LoggerMixin):
    def __init__(
        self,
        max_works,
    ):
        self._q = JoinableQueue(maxsize=max_works)
        # self._q = Queue(maxsize=max_works)
        for _ in range(max_works):
            gevent.spawn(self.__worker)
        # atexit.register(self.__atexit)
        self._q.join(timeout=100)

    def __worker(self):
        while True:
            fn, args, kwargs = self._q.get()
            try:
                fn(*args, **kwargs)
            except Exception as exc:
                self.logger.exception(
                    f'Error in function {fn.__name__}, caused by {type(exc)} {exc}')
            finally:
                self._q.task_done()

    def submit(self, fn: Callable, *args, **kwargs):
        self._q.put((fn, args, kwargs))

    def __atexit(self):
        self.logger.critical('The program is about to exit.')
        self._q.join()
Example #9
class Dispatcher(gevent.Greenlet):
    """
    The Dispatcher class handles routing communications to and from the Gateway.
    It implements an Actor interface as made popular by Erlang.
    """
    def __init__(self):
        self._gw_inbox = JoinableQueue()
        super().__init__()

    def _run(self):
        while True:
            try:
                event = self._gw_inbox.get(block=False)
                # Dispatch the event back to the interface
                self._gw_inbox.task_done()
            except Empty:
                # Inbox is empty (Empty comes from gevent.queue); keep polling.
                pass
            finally:
                gevent.sleep(1)

    @property
    def gw_inbox(self):
        """
        This is the inbox for the Gateway. It's not accessible outside the class methods.

        :return: None
        """
        return None

    @gw_inbox.setter
    def gw_inbox(self, message):
        self._gw_inbox.put(message)
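A brief usage sketch for the Dispatcher above; the message payload is illustrative:

import gevent

dispatcher = Dispatcher()
dispatcher.start()                       # schedules _run() on the gevent hub
dispatcher.gw_inbox = {'op': 'ping'}     # the property setter enqueues the message
gevent.sleep(2)                          # give the dispatcher a chance to poll its inbox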
Example #10
def get_movie_id():
    baidu_tool = MysqlCurd('douban_movie')
    baidu_tool.connect_mysql()
    result = baidu_tool.query_mysql_condition('movie_name', [{'version': 0}, ['name']])
    q = JoinableQueue()
    for temp in result:
        if not baidu_tool.query_mysql_condition('name_id', [{'movie_name': temp[0]}, ['movie_id']]):
            q.put(temp[0])
    baidu_tool.close_connect()
    error_q = JoinableQueue()

    def crawl(time):
        while not q.empty():
            tool = MysqlCurd('douban_movie')
            tool.connect_mysql()
            name = q.get()
            try:
                page = super_downloader('https://movie.douban.com/subject_search?', params={'search_text': name},
                                        cookies=True, proxy=True)
            except requests.exceptions.RequestException:
                print('get movie id ' + name + 'download error!')
                return False
            page = etree.HTML(page)
            gevent.sleep(random.uniform(time[0], time[1]))
            try:
                count = 0
                count1 = 0
                for _ in page.xpath('//*[@id="content"]/div/div[1]/div[2]/table[@width="100%"]'):
                    try:
                        mark = _.xpath('tr/td[2]/div')[0]
                        id = mark.xpath('a')[0].get('href')[33:-1]
                        _name = mark.xpath('a')[0].text.split('/')[0].strip()
                        # score = mark.xpath('div/span[2]')[0].text
                        # comment_num = mark.xpath('div/span[3]')[0].text[1:-4]
                        tool.replace_mysql('name_id', {'movie_id': id, 'movie_name': _name})
                        count1 += 1
                        print('get movie id '+_name+'completed!!!')
                    except IndexError as e:
                        print('get movie id sub error!!!'+repr(e))
                        continue
                    count += 1
                    if count == 3:
                        break
                if count1>0:
                    # tool.replace_mysql('movie_name', {'version': 1, 'name': name})
                    tool.close_connect()
                print('get movie id ' + name + ' completed!')
            except Exception as e:
                error_q.put(name)
                print('get movie id ' + name + ' error!')
                print(e)
    worker = SleepFunction()
    worker.run(crawl)
    with open('errorlist//movie_id.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
Example #11
    def test_main(self):
        queue = JoinableQueue()
        print dir(queue)
        queue.put(1)
        queue.put(3)
        queue.put(2)
        queue.put(6)
        print queue.qsize()

        print '1', queue.get(), queue.get()
Example #12
def fetch_worker(fetch_queue: JoinableQueue, save_queue: JoinableQueue, direction: str):

    while True:
        word = fetch_queue.get()
        print(word)
        res = fetch(word)
        if res:
            save_queue.put((word, direction, res))
        else:
            fetch_queue.put(word)  # re-queue the word for another attempt
        # task_done() must be called once per get(), otherwise fetch_queue.join() never returns.
        fetch_queue.task_done()
Example #13
class TaskList:
    def __init__(self):
        self.queue = JoinableQueue()
        self.all_tasks = {}

    def add_task(self, task):
        self.all_tasks[task.get_id()] = task
        self.queue.put(task)

    def get_queue(self):
        return self.queue

    def join(self, timeout=None):
        return self.queue.join(timeout)
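A short usage sketch for TaskList; my_task is a hypothetical object that only needs to expose get_id():

tasks = TaskList()
tasks.add_task(my_task)              # my_task: any object with a get_id() method
work_queue = tasks.get_queue()       # workers would get()/task_done() items from this queue
tasks.join(timeout=30)               # wait up to 30 seconds for the queue to drain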
Example #14
def save_worker(dsn: str, save_queue: JoinableQueue):
    conn = psycopg2.connect(dsn)
    while True:
        word, direction, data = save_queue.get()
        try:
            with conn:
                with conn.cursor() as cur:
                    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE, cur)
                    cur.execute("INSERT INTO youdao_bilingual (keyword, direction, data) VALUES (%s, %s, %s)",
                                (word, direction, data))
        except Exception as e:
            print(e)
            save_queue.put((word, direction, data))  # re-queue the failed row
        finally:
            # Call task_done() once per get() so save_queue.join() can complete.
            save_queue.task_done()

    conn.close()
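Examples #12 and #14 are two halves of one fetch/save pipeline. A minimal wiring sketch, assuming both workers are importable; the word list, direction tag, DSN, and worker count are purely illustrative:

import gevent
from gevent.queue import JoinableQueue

fetch_queue = JoinableQueue()
save_queue = JoinableQueue()

for _ in range(10):
    gevent.spawn(fetch_worker, fetch_queue, save_queue, 'en2zh')
gevent.spawn(save_worker, 'dbname=youdao user=postgres', save_queue)

for word in ['hello', 'world']:
    fetch_queue.put(word)

fetch_queue.join()                   # every word fetched (and task_done() called)
save_queue.join()                    # every fetched result written to the database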
Example #15
class ApartmentManager(Greenlet):
    def __init__(self, name, urls):
        Greenlet.__init__(self)
        self.JobQueue = JoinableQueue()
        self.name = name
        self.assigning = True
        self.urls = urls

    def assignJob(self, job):
        print 'Manager {0} -> {1}'.format(self.name, job)
        self.JobQueue.put(job)
        gevent.sleep(0)

    def _run(self):
        for url in self.urls:
            self.assignJob(url)
        self.assigning = False
Example #16
def get_movie_comment(movie_name):
    tool = MysqlCurd('douban_movie')
    tool.connect_mysql()
    try:
        data = tool.query_mysql_condition('movie_info', [{'movie_name': movie_name}, ['movie_id',
                                                                                      'movie_name',
                                                                                      'comment_num']])[0]
        tool.close_connect()
    except IndexError:
        print('dont have this movie id!')
        return False
    print(data[1] + ' started!' + str(data[2]) + 'comments!')
    m_table = Movie(data[0], data[1])
    m_table.tool.create_table(m_table.name, {'user_name': 'varchar(255) not null primary key',
                                             'score': 'int',
                                             'comment_time': 'varchar(45)',
                                             'vote': 'int',
                                             'comment': 'varchar(1000) not null'})
    m_table.tool.close_connect()
    comment_num = data[2]
    num_q = JoinableQueue()
    for i in range(0, comment_num, 20):
        num_q.put(str(i))

    def temp(time):
        while not num_q.empty():
            m = Movie(data[0], data[1])
            index = num_q.get()  # non-blocking
            try:
                m.result = m.downloader(m.comment_url.format(index), cookies=True, proxy=True)
            except requests.exceptions.RequestException as e:
                print(index + 'download error!' + repr(e))
            gevent.sleep(random.uniform(time[0], time[1]))
            try:
                m.analysis_comment()
                print(index + 'completed')
            except Exception as e:
                print(e)
                print(index + 'error')
            m.tool.close_connect()
            print(len(num_q))
    worker = SleepFunction()
    worker.run(temp)
Example #17
class NoticeClient(object):
    init_flag = False

    def __init__(self, app_key, app_secret, api_url):
        if not self.init_flag:  # avoid running the init logic more than once
            self.api_url = api_url
            self.app_key = app_key
            self.app_secret = app_secret
            self.req_q = JoinableQueue(MAXSIZE)
            self.init_flag = True
            t1 = threading.Thread(target=http_request, args=[self.api_url, self.req_q])
            t1.start()
        else:
            return

    def sys_params(self, body):
        """构造请求参数参数"""
        time.sleep(1)
        now = int(time.time())
        auth_key = '%d-%s-%s' % (now, self.app_secret, self.app_key)
        auth_key_md5 = get_md5(auth_key)
        auth_str = auth_key_md5[0:4] + str(random.randint(100, 999)) + auth_key_md5[4:24] + str(
            random.randint(10000, 99999)) + auth_key_md5[24:]
        _params = {
            "key": self.app_key,
            "auth_str": auth_str,
            "timestamp": now,
            "req_msg": body,
        }
        return _params

    def send(self, data, to_users):
        to_users = "|".join(to_users)
        data = message_format(data)
        body = {
            "to_user": to_users,
            "content": data
        }
        _params = self.sys_params(body)
        self.req_q.put(_params)

        return True
Example #18
def get_person_info():
    baidu_tool = MysqlCurd('douban_person')
    baidu_tool.connect_mysql()
    result = baidu_tool.query_mysql_condition('person_name_id', [{
        'version': 0
    }, ['person_id', 'person_name']])
    print(result)
    print(result.__len__())
    q = JoinableQueue()
    for _ in result:
        if not baidu_tool.query_mysql_condition('person_info',
                                                [{
                                                    'person_id': _[0]
                                                }, ['person_name']]):
            q.put(_)
    error_q = JoinableQueue()
    baidu_tool.close_connect()

    def temp(param):
        while not q.empty():
            i = q.get()
            p = Person(id=i[0], name=i[1])
            flag = p.analysis_person_info()
            if flag:
                name_id_tool = MysqlCurd('douban_person')
                name_id_tool.connect_mysql()
                name_id_tool.replace_mysql('person_name_id', {
                    'person_id': p.id,
                    'person_name': p.name,
                    'version': 1
                })
                name_id_tool.close_connect()
            else:
                error_q.put((p.id, p.name))

    worker = SleepFunction()
    worker.run(temp)
    with open('errorlist//person_id.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
Example #19
def spider(start_url, max_depth=1, no_of_workers=10, page_fn=check_page_for_profanities):
    """
    Concurrently spider the web, starting from web page, executing page_fn
    on each page.

    start_url specifies the document the spider starts from.
    max_depth specifies the maximum link depth from the start_url that
    processing will occur.
    no_of_workers specifies how many concurrent workers process the job queue.
    page_fn is a function that takes BeautifulSoup parsed html and a url and
    processes them as required
    """
    seen_urls = set((start_url,))
    job_queue = JoinableQueue()
    job_queue.put((start_url, max_depth))

    for i in range(no_of_workers):
        gevent.spawn(job_worker, job_queue, seen_urls, page_fn)

    job_queue.join()
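spider() relies on a job_worker function that is not part of this excerpt. A hedged sketch of such a worker, assuming requests, bs4.BeautifulSoup, and urllib.parse.urljoin are imported, and with link extraction deliberately simplified:

def job_worker(job_queue, seen_urls, page_fn):
    # Illustrative consumer: fetch a page, hand it to page_fn, and enqueue unseen links.
    while True:
        url, depth = job_queue.get()
        try:
            soup = BeautifulSoup(requests.get(url).text, 'html.parser')
            page_fn(soup, url)
            if depth > 0:
                for a in soup.find_all('a', href=True):
                    link = urljoin(url, a['href'])
                    if link not in seen_urls:
                        seen_urls.add(link)
                        job_queue.put((link, depth - 1))
        except Exception:
            pass                      # a real worker would log the failure
        finally:
            job_queue.task_done()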
Example #20
class Receiver(gevent.Greenlet):
    PORT = 20000
    CHUNK = 512

    def __init__(self):
        gevent.Greenlet.__init__(self)
        self.queue = JoinableQueue()

    def _run(self):
        context = zmq.Context()
        receiver = context.socket(zmq.PULL)
        receiver.connect("tcp://localhost:%s" % self.PORT)
        print 'rcv_on'
        while True:
            frame =  receiver.recv()
            sys.stdout.write('.')
            sys.stdout.flush()

            self.queue.put(frame)
            time.sleep(0.0001)
Example #21
def handle():
    #The expected format is:
    #ciclo	edad	sexo	nforma	prov	aoi	factorel
    csv_path = sys.argv[1]

    queue = JoinableQueue()
    event = Event()

    greenlets = []

    for i in range(90):
        gv = gevent.spawn(worker, queue, event)
        greenlets.append(gv)

    with io.open(csv_path, 'r') as f:
        for line in f:
            queue.put(line)

    queue.join()
    event.set()
    gevent.joinall(greenlets)
Example #22
    def start(self):
        if not self.__threads:
            self.__threads = len(IPNetwork(self.__ip)) if len(IPNetwork(self.__ip)) <= 10 else 10
        if len(IPNetwork(self.__ip)) < int(self.__threads):
            print "Please decrease number of threads to number of hosts <= %s" % len(IPNetwork(self.__ip))
            exit()

        queue = JoinableQueue()
        [queue.put(str(ip)) for ip in IPNetwork(self.__ip)]

        workers = [spawn(self.get_ip_info, queue, self.__apis) for t in range(int(self.__threads))]

        queue.join()
Example #23
def get_movie_info(name=None):
    q = JoinableQueue()
    tool = MysqlCurd('douban_movie')
    tool.connect_mysql()
    if name:
        try:
            movie_id = tool.query_mysql_condition('name_id', [{'movie_name': name}, ['movie_id']])[0][0]
            q.put((movie_id, name))
        except IndexError:
            print('no id!')
    else:
        result = tool.query_mysql_condition('name_id', [{'version': 0}, ['movie_id', 'movie_name']])
        for temp in result:
            if not tool.query_mysql_condition('movie_info', [{'movie_id': temp[0]}, ['movie_name']]):
                q.put(temp)
    tool.close_connect()
    error_q = JoinableQueue()

    def temp(time):
        while not q.empty():
            data = q.get()
            m = Movie(data[0], data[1])
            try:
                print('analysis movie info ' + data[1] + 'started')  # log which movie is currently being processed
                m.analysis_movie_info()
                gevent.sleep(random.uniform(time[0], time[1]))
            except Exception as e:
                print(e)
                print('analysis movie info ' + data[1] + 'error')
                error_q.put(data[1])
            m.tool.close_connect()
            print(len(q), 'remain!')
    worker = SleepFunction()
    worker.run(temp)
    with open('errorlist//movie_info.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
Example #24
def handle():
    connection = create_postgresql_connection()

    cursor = connection.cursor()
    cursor.execute("BEGIN;")
    cursor.execute("DELETE FROM core_ratequery;")
    cursor.execute("COMMIT;")
    cursor.close()

    queue = JoinableQueue()
    event = Event()

    age_ids = age_map(connection).values() + [None]
    sex_ids = sex_map(connection).values() + [None]
    education_ids = education_map(connection).values() + [None]
    province_ids = province_map(connection).values() + [None]

    cursor = connection.cursor()
    cursor.execute("SELECT DISTINCT cycle FROM core_microdata;")
    cycles = [row[0] for row in cursor]
    cursor.close()

    greenlets = []

    for i in range(50):
        gv = gevent.spawn(worker, queue, event)
        greenlets.append(gv)

    combs = itertools.product(age_ids, sex_ids, province_ids, education_ids,
                              cycles)
    for c in combs:
        queue.put(c)

    queue.join()
    event.set()
    gevent.joinall(greenlets)
Example #25
    def start(self):
        if not self.__threads:
            self.__threads = len(IPNetwork(
                self.__ip)) if len(IPNetwork(self.__ip)) <= 10 else 10
        if len(IPNetwork(self.__ip)) < int(self.__threads):
            print "Please decrease number of threads to number of hosts <= %s" % len(
                IPNetwork(self.__ip))
            exit()

        queue = JoinableQueue()
        [queue.put(str(ip)) for ip in IPNetwork(self.__ip)]

        workers = [
            spawn(self.get_ip_info, queue, self.__apis)
            for t in range(int(self.__threads))
        ]

        queue.join()
Example #26
File: massget.py Project: beched/hehdirb
class MassGet(FastGet):
    def __init__(self, urls, dic, threads=10, report_db=False, keepalive=None, each_threads=10):
        self.dic = dic
        self.report_db = report_db
        self.table = None
        if report_db:
            self.sql_conn(report_db)
        self.keepalive = keepalive
        self.each_threads = each_threads
        self.queue = JoinableQueue()
        [self.queue.put(x.strip()) for x in urls]
        [spawn(self.worker) for _ in xrange(threads)]
        self.queue.join()

    def worker(self):
        while not self.queue.empty():
            url = self.queue.get()
            try:
                FastGet(url, self.dic, self.each_threads, self.report_db, self.keepalive, self.table)
            except Exception as e:
                logging.error('Worker global exception for %s: %s' % (url, e))
            finally:
                self.queue.task_done()
Example #27
class HttpScanner(object):
    def __init__(self, args):
        """
        Initialise HTTP scanner
        :param args:
        :return:
        """
        self.args = args
        self.output = HttpScannerOutput(args)
        self._init_scan_options()

        # Reading files
        self.output.write_log("Reading files and deduplicating.", logging.INFO)
        self.hosts = self._file_to_list(args.hosts)
        self.urls = self._file_to_list(args.urls)

        #
        self._calc_urls()
        out = 'Loaded %i hosts %i urls' % (self.hosts_count, self.urls_count)
        if self.args.ports is not None:
            out += ' %i ports' % len(self.args.ports)
        self.output.print_and_log(out)

        if self.args.ports is not None and not self.args.syn:
            new_hosts = []
            for host in self.hosts:
                for port in self.args.ports:
                    # print(host, port)
                    new_hosts.append(helper.generate_url(host, port))
            self.hosts = new_hosts

        #
        self._calc_urls()
        self.output.print_and_log('%i full urls to scan' % self.full_urls_count)

        # Queue and workers
        self.hosts_queue = JoinableQueue()
        self.workers = []

    def _file_to_list(self, filename, dedup=True):
        """
        Get list from file
        :param filename: file to read
        :return: list of lines
        """
        if not path.exists(filename) or not path.isfile(filename):
            self.output.print_and_log('File %s not found!' % filename, logging.ERROR)
            exit(-1)

        # Preparing lines list
        lines = filter(lambda line: line is not None and len(line) > 0, open(filename).read().split('\n'))
        if len(lines) == 0:
            self.output.print_and_log('File %s is empty!' % filename, logging.ERROR)
            exit(-1)

        return helper.deduplicate(lines) if dedup else lines

    def _init_scan_options(self):
        # Session
        self.session = session()
        self.session.timeout = self.args.timeout
        self.session.verify = False

        # TODO: debug and check
        # self.session.mount("http://", HTTPAdapter(max_retries=self.args.max_retries))
        # self.session.mount("https://", HTTPAdapter(max_retries=self.args.max_retries))
        # http://stackoverflow.com/questions/15431044/can-i-set-max-retries-for-requests-request
        # Max retries
        adapters.DEFAULT_RETRIES = self.args.max_retries

        # TOR
        if self.args.tor:
            self.output.write_log("TOR usage detected. Making some checks.")
            self.session.proxies = {
                'http': 'socks5://127.0.0.1:9050',
                'https': 'socks5://127.0.0.1:9050'
            }

            url = 'http://ifconfig.me/ip'
            real_ip, tor_ip = None, None

            # Get real IP address
            try:
                real_ip = get(url).text.strip()
            except Exception as exception:
                self.output.print_and_log("Couldn't get real IP address. Check yout internet connection.",
                                          logging.ERROR)
                self.output.write_log(str(exception), logging.ERROR)
                exit(-1)

            # Get TOR IP address
            try:
                tor_ip = self.session.get(url).text.strip()
            except Exception as exception:
                self.output.print_and_log("TOR socks proxy doesn't seem to be working.", logging.ERROR)
                self.output.write_log(str(exception), logging.ERROR)
                exit(-1)

            # Show IP addresses
            self.output.print_and_log('Real IP: %s TOR IP: %s' % (real_ip, tor_ip))
            if real_ip == tor_ip:
                self.output.print_and_log("TOR doesn't work! Stop to be secure.", logging.ERROR)
                exit(-1)

        # Proxy
        if self.args.proxy is not None:
            self.session.proxies = {"https": self.args.proxy,
                                    "http": self.args.proxy}

        # Auth
        if self.args.auth is not None:
            items = self.args.auth.split(':')
            self.session.auth = (items[0], items[1])

        # Cookies
        self.cookies = {}
        if self.args.cookies is not None:
            self.cookies = Cookies.from_request(self.args.cookies)

        # Cookies from file
        if self.args.load_cookies is not None:
            if not path.exists(self.args.load_cookies) or not path.isfile(self.args.load_cookies):
                self.output.print_and_log('Could not find cookie file: %s' % self.args.load_cookies, logging.ERROR)
                exit(-1)

            self.cookies = MozillaCookieJar(self.args.load_cookies)
            self.cookies.load()

        self.session.cookies = self.cookies

        # User-Agent
        self.ua = UserAgent() if self.args.random_agent else None

    def worker(self, worker_id):
        self.output.write_log('Worker %i started.' % worker_id)
        while not self.hosts_queue.empty():
            host = self.hosts_queue.get()
            try:
                self.scan_host(worker_id, host)
            finally:
                self.output.write_log('Worker %i finished.' % worker_id)
                self.hosts_queue.task_done()

    def _head_available(self, host):
        """
        Determine if HEAD requests is allowed
        :param host:
        :return:
        """
        # Trying to use OPTIONS request
        try:
            response = self.session.options(host, headers=self._fill_headers())
            o = response.headers['allow'] if 'allow' in response.headers else None
            if o is not None and o.find('HEAD') != -1:
                return True
        except:
            # TODO: fix
            pass

        try:
            return False if self.session.head(host, headers=self._fill_headers()).status_code == 405 else True
        except:
            # TODO: fix
            return False

    def scan_host(self, worker_id, host):
        # check if resolvable
        ip = helper.url_to_ip(host)
        if ip is None:
            self.output.write_log('Could not resolve %s  Skipping...' % host, logging.WARNING)
            self.output.urls_scanned += len(self.urls)
            return

        # Check for HEAD
        host_url = helper.host_to_url(host)
        head_available = False
        if self.args.head:
            head_available = self._head_available(host)
            if head_available:
                self.output.write_log('HEAD is supported for %s' % host)

        errors_count, urls_scanned = 0, 0
        for url in self.urls:
            full_url = urljoin(host_url, url)
            r = self.scan_url(full_url, head_available)
            urls_scanned += 1
            self.output.urls_scanned += 1

            # Output
            r['worker'] = worker_id
            self.output.write(**r)
            if r['exception'] is not None:
                errors_count += 1

            # Skip host on errors
            if self.args.skip is not None and errors_count == self.args.skip:
                self.output.write_log('Errors limit reached on %s Skipping other urls.' % host, logging.WARNING)
                self.output.urls_scanned += len(self.urls) - urls_scanned
                break

        # cookies bugfix?
        self.session.cookies.clear()

    def _fill_headers(self):
        # Fill UserAgent in headers
        headers = {}
        if self.args.user_agent is not None:
            headers['User-agent'] = self.args.user_agent
        elif self.args.random_agent:
            headers['User-agent'] = self.ua.random

        # Fill Referer in headers
        if self.args.referer is not None:
            headers['Referer'] = self.args.referer

        return headers

    def _parse_response(self, url, response, exception):
        res = {'url': url,
               'response': response,
               'exception': exception}

        if response is None or exception is not None:
            res.update({
                'status': -1,
                'length': -1,
            })
            return res

        try:
            length = int(response.headers['content-length']) if 'content-length' in response.headers else len(
                response.text)
        except Exception as exception:
            self.output.write_log(
                "Exception while getting content length for URL: %s Exception: %s" % (url, str(exception)),
                logging.ERROR)
            length = 0

        res.update({
            'status': response.status_code,
            'length': length,
        })
        return res

    def scan_url(self, url, use_head=False):
        self.output.write_log('Scanning %s' % url, logging.DEBUG)

        # Query URL and handle exceptions
        response, exception = None, None
        method = 'HEAD' if use_head else 'GET'
        try:
            # TODO: add support for user:password in URL
            response = self.session.request(method, url, headers=self._fill_headers(),
                                            allow_redirects=self.args.allow_redirects)
        except ConnectionError as ex:
            self.output.write_log('Connection error while querying %s' % url, logging.ERROR)
            exception = ex
        except HTTPError as ex:
            self.output.write_log('HTTP error while querying %s' % url, logging.ERROR)
            exception = ex
        except Timeout as ex:
            self.output.write_log('Timeout while querying %s' % url, logging.ERROR)
            exception = ex
        except TooManyRedirects as ex:
            self.output.write_log('Too many redirects while querying %s' % url, logging.ERROR)
            exception = ex
        except Exception as ex:
            self.output.write_log('Unknown exception while querying %s' % url, logging.ERROR)
            exception = ex


        # print('cookies: %s' % self.cookies)
        print('session.cookies: %s' % self.session.cookies)
        # self.session.cookies = self.cookies

        return self._parse_response(url, response, exception)

    def signal_handler(self):
        """
        Signal handler
        :return:
        """
        # TODO: add saving status via pickle
        self.output.print_and_log('Signal caught. Stopping...', logging.WARNING)
        self.stop()
        exit(signal.SIGINT)

    def _calc_urls(self):
        # Calculations
        self.urls_count = len(self.urls)
        self.hosts_count = len(self.hosts)
        self.full_urls_count = len(self.urls) * len(self.hosts)
        self.output.args.urls_count = self.full_urls_count

    def start(self):
        """
        Start multithreaded scan
        :return:
        """
        # Set signal handler
        gevent.signal(signal.SIGTERM, self.signal_handler)
        gevent.signal(signal.SIGINT, self.signal_handler)
        gevent.signal(signal.SIGQUIT, self.signal_handler)

        # ICMP scan
        if self.args.icmp:
            if geteuid() != 0:
                self.output.print_and_log('To use ICMP scan option you must run as root. Skipping ICMP scan', logging.WARNING)
            else:
                self.output.print_and_log('Starting ICMP scan.')
                self.hosts = helper.icmp_scan(self.hosts, self.args.timeout)
                self._calc_urls()
                self.output.print_and_log('After ICMP scan %i hosts %i urls loaded, %i urls to scan' %
                                          (self.hosts_count, self.urls_count, self.full_urls_count))

        # SYN scan
        if self.args.syn:
            if self.args.tor or self.args.proxy is not None:
                self.output.print_and_log('SYN scan via tor or proxy is impossible!', logging.WARNING)
                self.output.print_and_log('Stopping to prevent deanonymization!', logging.WARNING)
                exit(-1)

            if geteuid() != 0:
                self.output.print_and_log('To use SYN scan option you must run as root. Skipping SYN scan', logging.WARNING)
            else:
                self.output.print_and_log('Starting SYN scan.')
                self.hosts = helper.syn_scan(self.hosts, self.args.ports, self.args.timeout)
                self._calc_urls()
                self.output.print_and_log('After SYN scan %i hosts %i urls loaded, %i urls to scan' %
                                          (self.hosts_count, self.urls_count, self.full_urls_count))

        # Check threads count vs hosts count
        if self.args.threads > self.hosts_count:
            self.output.write_log('Too many threads! Fixing threads count to %i' % self.hosts_count, logging.WARNING)
            threads_count = self.hosts_count
        else:
            threads_count = self.args.threads

        # Output urls count
        self.output.args.urls_count = self.full_urls_count

        # Start workers
        self.workers = [spawn(self.worker, i) for i in range(threads_count)]

        # Fill and join queue
        [self.hosts_queue.put(host) for host in self.hosts]
        self.hosts_queue.join()

    def stop(self):
        """
        Stop scan
        :return:
        """
        # TODO: stop correctly
        gevent.killall(self.workers)
Example #28
class WebServer(Flask):
    def __init__(self, *args, **kwargs):
        super(WebServer, self).__init__(*args, **kwargs)
        print 'Webserver started'
        self.debug = True
        self.cmd_queue = JoinableQueue()
        self.event_queue = JoinableQueue()
        self.cmd_id = 0
        self.cmd_results = {}
        gevent.spawn(self.send_commands_to_debugger)
        gevent.spawn(self.receive_events_from_debugger)

    def do_command(self, cmd, args=''):
        cmd_id = self.generate_cmd_id()
        self.cmd_results[cmd_id] = AsyncResult()
        self.cmd_queue.put((
            cmd_id, 
            json.dumps({
                'cmd' : cmd,
                'args' : args, 
            }))
        )
        result = self.cmd_results[cmd_id].wait()
        return json.loads(result)

    def generate_cmd_id(self):
        self.cmd_id += 1
        return self.cmd_id

    def send_commands_to_debugger(self):
        print 'start send_commands_to_debugger'
        conn = None
        while True:
            cmd_id, cmd = self.cmd_queue.get()
            if not cmd:
                break
            print 'send command', cmd
            conn = socket.create_connection(config.command_socket_addr)
            conn.send(cmd)
            result = ''
            while True:
                data = conn.recv(4096)
                if not data: break
                result += data
            cmd_result = self.cmd_results.pop(cmd_id)
            cmd_result.set(result)
            conn.close()
        
    def receive_events_from_debugger(self):
        print 'start receive_events_from_debugger'
        self.event_server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.event_server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.event_server.bind(config.event_socket_addr)
        self.event_server.listen(16)
        conn, _ = self.event_server.accept()
        while True:
            self.event_queue.put(conn.recv(4096))

    def clear_event_queue(self):
        self.event_queue = JoinableQueue()

    def shutdown(self):
        self.event_server.close()
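A hedged sketch of a Flask route that drives the WebServer above; the route path and command name are illustrative, and json is the same module do_command already uses:

app = WebServer(__name__)

@app.route('/step')
def step():
    # Forwards {"cmd": "step", "args": ""} to the debugger and returns its reply as JSON text.
    return json.dumps(app.do_command('step'))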
Example #29
def get_person_id():
    baidu_tool = MysqlCurd('douban_person')
    baidu_tool.connect_mysql()
    result = baidu_tool.query_mysql_condition('person_name', [{
        'version': 0
    }, ['name']])
    print(result)
    print(result.__len__())
    q = JoinableQueue()
    for _ in result:
        if not baidu_tool.query_mysql_condition('person_name_id',
                                                [{
                                                    'person_name': _[0]
                                                }, ['person_id']]):
            q.put(_[0].strip('\n'))
    error_q = JoinableQueue()

    def crawl(param):
        while not q.empty():
            tool = MysqlCurd('douban_person')
            tool.connect_mysql()
            name = q.get()
            try:
                result = super_downloader(
                    'https://movie.douban.com/subject_search?',
                    params={'search_text': name},
                    proxy=True,
                    cookies=True)
                gevent.sleep(random.uniform(2, 6.5))
            except requests.exceptions.RequestException as e:
                print(name + 'download error!')
                continue
            try:
                page = etree.HTML(result)
                basic = page.xpath(
                    '//*[@id="content"]/div/div[@class="article"]/div[1]/'
                    'div[@class="result-item"]/div[@class="content"]/h3/a')[0]
                id = basic.get('href')[35:-1]
                name = basic.text.split()[0]
                tool.replace_mysql('person_name_id', {
                    'person_id': id,
                    'person_name': name,
                })
                baidu_tool = MysqlCurd('douban_person')
                baidu_tool.connect_mysql()
                baidu_tool.replace_mysql('person_name', {
                    'name': name,
                    'version': 1
                })
                baidu_tool.close_connect()
                tool.close_connect()
                print(name + 'completed')
            except IndexError:
                error_q.put(name)
                print(name + 'error!')

    worker = SleepFunction()
    worker.run(crawl)
    with open('errorlist//person_id.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
Example #30
File: scraper.py Project: mbr/ragstoriches
    def scrape(self,
               url=None,
               scraper_name='index',
               session=None,
               burst_limit=None,
               rate_limit=None,
               receivers=[],
               initial_scope={},
               exception_handler=None):
        pool = Pool(10000)  # almost no limit, limit connections instead
        job_queue = JoinableQueue()
        data_queue = JoinableQueue()

        scope = Scope()
        scope['log'] = logbook.Logger(self.name)
        scope['push_data'] = lambda name, data:\
            data_queue.put((name, data))

        rs = session or requests.Session()
        rs.hooks['response'] = lambda r: glocal.log.info(r.url)
        cticket_gen = TicketGenerator(rate_limit, burst_limit)
        adapter = TicketBoundHTTPAdapter(cticket_gen)
        rs.mount('http://', adapter)
        rs.mount('https://', adapter)
        scope['requests'] = rs
        scope.update(initial_scope)

        job_queue.put(Job(self, scraper_name, url, scope))

        aborted = False

        def run_job(job):
            # runs a single job in the current greenlet
            try:
                # setup new log
                for val in job.run():
                    job_queue.put(job.from_yield(val))
            except CapacityError as e:
                job.log.warning('CapacityError: %s, backing off' % e)
                job.log.debug(traceback.format_exc())
                # FIXME: throttle
            except TemporaryError as e:
                job.log.warning('Temporary failure on %s, rescheduling' % job)
                job.log.debug(traceback.format_exc())
                job_queue.put(job.retry())
                # FIXME: add limit for retries
            except PermanentError as e:
                job.log.error(e)
                job.log.debug(traceback.format_exc())
            except CriticalError as e:
                job.log.critical(e)
                job.log.debug(traceback.format_exc())
                job.log.debug('Aborting scrape...')
            except Exception as e:
                job.log.error('Error handling job "%s" "%s": %s' %
                              (scraper_name, url, e))
                job.log.debug(traceback.format_exc())
                if exception_handler:
                    exception_handler(sys.exc_info())
            finally:
                job_queue.task_done()

        def job_spawner():
            # using the pool, spawns a new job for every job in the queue
            while not aborted:
                job = job_queue.get()
                if job is None:
                    break
                pool.spawn(run_job, job)

        def receiver_spawner():
            while not aborted:
                record = data_queue.get()
                if record is None:
                    break

                for receiver in receivers:
                    pool.spawn(receiver.process, record, scope)

                data_queue.task_done()

        spawner_greenlet = pool.spawn(job_spawner)
        receiver_greenlet = pool.spawn(receiver_spawner)

        # join queue
        job_queue.join()
        data_queue.join()

        # tell spawner to exit
        job_queue.put(None)
        data_queue.put(None)

        pool.join()

        # now perform all post-processing
        for receiver in receivers:
            if receiver._post_process:
                post_scope = scope.new_child()
                post_scope['log'] = logbook.Logger('%s-post_process' % self.name)
                post_scope.inject_and_call(receiver._post_process)
Example #31
File: migrate.py Project: tempodb/export
class Migrator:
    def __init__(self, scheme, create_devices=True,
                 write_data=True,
                 start_date="2000-01-01T00:00:00Z",
                 end_date="2014-12-31T00:00:00Z",
                 pool_size=3):
        self.scheme = scheme
        self.create_devices = create_devices
        self.should_write_data = write_data
        self.start_date = start_date
        self.end_date = end_date
        self.tdb = TDBClient(scheme.db_key, scheme.db_key,
                             scheme.db_secret,
                             base_url=scheme.db_baseurl)

        iq_endpoint = HTTPEndpoint(scheme.iq_baseurl,
                                   scheme.iq_key,
                                   scheme.iq_secret)
        self.tiq = TIQClient(iq_endpoint)
        self.queue = JoinableQueue()
        self.lock = Lock()
        self.dp_count = 0
        self.req_count = 0
        self.dp_reset = time.time()
        for i in range(pool_size):
            gevent.spawn(self.worker)

    def worker(self):
        while True:
            series = self.queue.get()
            try:
                self.migrate_series(series)
            finally:
                self.queue.task_done()

    def migrate_all_series(self, start_key="", limit=None):
        start_time = time.time()

        (keys, tags, attrs) = self.scheme.identity_series_filter()
        series_set = self.tdb.list_series(keys, tags, attrs)

        # Keep our own state of whether we passed the resume point, so we don't
        # need to assume client and server sort strings the same.
        found_first_series = False

        series_count = 0

        for series in series_set:
            if not found_first_series and series.key < start_key:
                continue
            else:
                found_first_series = True

            if limit and series_count >= limit:
                print("Reached limit of %d devices, stopping." % (limit))
                break

            if self.scheme.identity_series_client_filter(series):
                # If the series looks like an identity series,
                # queue it to be processed by the threadpool
                self.queue.put(series)
                series_count += 1

        self.queue.join()

        end_time = time.time()
        print("Exporting {} devices took {} seconds".format(series_count, end_time - start_time))

    def migrate_series(self, series):
        print("  Beginning to migrate series: %s" % (series.key))
        error = False
        try:
            if self.create_devices:
                error = self.create_device(series)

            if self.should_write_data and not error:
                error = self.write_data(series)
        except Exception, e:
            logging.exception(e)
            error = True

        if not error:
            print("COMPLETED migrating for series %s" % (series.key))
        else:
            print("ERROR migrating series %s" % (series.key))
Example #32
File: spindl.py Project: disruptek/spindl
class FileQueue(JoinableQueue):
    """Where files go to die"""
    def put(self, fn, *args, **kw):
        if isinstance(fn, MusicFile):
            mf = fn
        elif isinstance(fn, basestring):
            try:
                mf = MusicFile(fn.strip())
            except BadMusicFile, e:
                log.error(str(e))
                return
        else:
            raise NotImplementedError("bad input: {!r}".format(fn))
        log.debug("adding {} to queue".format(mf))
        return JoinableQueue.put(self, mf, *args, **kw)

    def put_iterable(self, iterable):
        for n in iterable:
            try:
                mf = MusicFile(n.strip())
            except BadMusicFile, e:
                log.error(str(e))
                continue
            if not mf.is_regular_file():
                log.info("skipping {}".format(mf))
                continue
            self.put(mf)

    def get(self, *args, **kw):
        result = JoinableQueue.get(self, *args, **kw)
Example #33
        assert last_nl != -1
        body = reply[:index].decode('utf-8')
        assert reply[-1] == '\n'
        try:
            (lang, page_count) = re_meta.match(reply[index:-1]).groups()
        except:
            print 'searching:', index, `reply[index:-1]`
            raise
        assert page_count.isdigit()
        if body != '':
            meta_xml = urlread_keep_trying('http://%s%s/%s_meta.xml' % (host, path, ia))
            root = fromstring(meta_xml)
            collection = [e.text for e in root.findall('collection')]

            #print 'solr_queue.put((ia, body, page_count))'
            solr_queue.put((ia, body, lang, page_count, collection))
            #print 'solr_queue.put() done'
            items_processed += 1
        else:
            done(ia, False)
        host_queues[host].task_done()

#def index_items():
#    while True:
#        (num, ia, host, path) = item_and_host_queue.get()
#
#        host_queues[host].put((num, ia, path, filename))
#        if host not in host_threads:
#            host_threads[host] = spawn_link_exception(read_text_from_node, host)
#        item_and_host_queue.task_done()
Example #34
File: ftp2.py Project: bearnard/ftptest
    zfile = '%s.zip' % site_id
    with zipfile.ZipFile(zfile) as z:
        z.extractall('tmp')

    file_workers = [
        pool.spawn(upload_files, i, worker_id, file_queue)
        for i in xrange(concurrency)
    ]

    for dirname, dirnames, filenames in os.walk('tmp/%s' % site_id):
        # print path to all subdirectories first.
        files = []
        for filename in filenames:
            files.append(os.path.join(dirname, filename))
        for f in files:
            file_queue.put(f, block=False)
        print "START_DIRS"
        dirs = []
        for subdirname in dirnames:
            dirs.append(os.path.join(dirname, subdirname))
        if dirs:
            print "POOLING:", dirs
            dir_pool.imap(mkdirs, dirs)
        print "END"
    #joinall(dir_jobs)
    #joinall([
    #    spawn([s_dir] + dirs) for s_dir, dirs in skel_dirs.iteritems()
    #])

    file_queue.join()
Example #35
class Migrator:
    def __init__(self,
                 scheme,
                 create_devices=True,
                 write_data=True,
                 start_date="2000-01-01T00:00:00Z",
                 end_date="2014-12-31T00:00:00Z",
                 pool_size=3):
        self.scheme = scheme
        self.create_devices = create_devices
        self.should_write_data = write_data
        self.start_date = start_date
        self.end_date = end_date
        self.tdb = TDBClient(scheme.db_key,
                             scheme.db_key,
                             scheme.db_secret,
                             base_url=scheme.db_baseurl)

        iq_endpoint = HTTPEndpoint(scheme.iq_baseurl, scheme.iq_key,
                                   scheme.iq_secret)
        self.tiq = TIQClient(iq_endpoint)
        self.queue = JoinableQueue()
        self.lock = Lock()
        self.dp_count = 0
        self.req_count = 0
        self.dp_reset = time.time()
        for i in range(pool_size):
            gevent.spawn(self.worker)

    def worker(self):
        while True:
            series = self.queue.get()
            try:
                self.migrate_series(series)
            finally:
                self.queue.task_done()

    def migrate_all_series(self, start_key="", limit=None):
        start_time = time.time()

        (keys, tags, attrs) = self.scheme.identity_series_filter()
        series_set = self.tdb.list_series(keys, tags, attrs)

        # Keep our own state of whether we passed the resume point, so we don't
        # need to assume client and server sort strings the same.
        found_first_series = False

        series_count = 0

        for series in series_set:
            if not found_first_series and series.key < start_key:
                continue
            else:
                found_first_series = True

            if limit and series_count >= limit:
                print("Reached limit of %d devices, stopping." % (limit))
                break

            if self.scheme.identity_series_client_filter(series):
                # If the series looks like an identity series,
                # queue it to be processed by the threadpool
                self.queue.put(series)
                series_count += 1

        self.queue.join()

        end_time = time.time()
        print("Exporting {} devices took {} seconds".format(
            series_count, end_time - start_time))

    def migrate_series(self, series):
        print("  Beginning to migrate series: %s" % (series.key))
        error = False
        try:
            if self.create_devices:
                error = self.create_device(series)

            if self.should_write_data and not error:
                error = self.write_data(series)
        except Exception, e:
            logging.exception(e)
            error = True

        if not error:
            print("COMPLETED migrating for series %s" % (series.key))
        else:
            print("ERROR migrating series %s" % (series.key))
Example #36
class FastGet:
    def __init__(self, url, dic, threads=100, report_db=False, keepalive=None, table_name=None):
        self.url = url
        parts = urlparse(url)
        self.scheme, self.host, self.port = parts.scheme, parts.hostname, parts.port
        if not self.port:
            self.port = 443 if self.scheme == 'https' else 80

        self.keepalive = keepalive
        try:
            instance = HehReq(self.host, int(self.port), self.scheme, self.keepalive)
        except Exception as e:
            logging.error('Init exception for %s: %s' % (self.url, e))
            return
        if not keepalive:
            self.keepalive = instance.detect_keepalive()
        if self.keepalive == 0:
            logging.error('Keep-Alive value for %s appears to be 0, check the connection' % url)
            return
        logging.warning('Calculated Keep-Alive for %s: %s' % (url, self.keepalive))

        self.report_db = report_db
        if report_db:
            self.table = table_name
            self.sql_conn(report_db)

        self.queue = JoinableQueue()
        [self.queue.put(dic[i:i + self.keepalive]) for i in xrange(0, len(dic), self.keepalive)]
        [spawn(self.worker) for _ in xrange(threads)]
        self.queue.join()

    def sql_conn(self, report_db):
        self.conn = MySQLdb.connect(report_db['host'], report_db['user'], report_db['passwd'], report_db['db'])
        self.cur = self.conn.cursor()
        if not self.table:
            self.table = 'scan_%s' % datetime.strftime(datetime.now(), '%Y_%m_%d_%H%M%S')
            self.cur.execute(
                'create table %s(scheme varchar(16), host varchar(128), port smallint, uri varchar(128),\
                code smallint, size int, type varchar(128))' % self.table)

    def report(self, result):
        if result[1] not in [302, 404]:
            logging.warning('Path %s://%s:%s/%s, response code %s, content-length %s, content-type %s' % (
                self.scheme, self.host, self.port, result[0], result[1], result[2], result[3]))
        if self.report_db:
            p = [self.scheme, self.host, self.port] + list(result)
            self.cur.execute('insert into %s values(%%s,%%s,%%s,%%s,%%s,%%s,%%s)' % self.table, p)

    def worker(self):
        try:
            instance = HehReq(self.host, int(self.port), self.scheme, self.keepalive)
        except Exception as e:
            logging.error('Worker init exception for %s: %s' % (self.url, e))
            return
        while not self.queue.empty():
            paths = self.queue.get()
            try:
                for x in instance.bulk_get(paths):
                    self.report(x)
            except Exception as e:
                logging.error('Worker loop exception for %s: %s' % (self.url, e))
            finally:
                if self.report_db:
                    self.conn.commit()
                self.queue.task_done()
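
The queue above is filled in keep-alive-sized batches (the `dic[i:i + self.keepalive]` slices hidden inside the list comprehension). A minimal, self-contained sketch of that batching pattern, with `fake_bulk_get` as a stand-in for `HehReq.bulk_get` (it is not part of the code above and only echoes its input):

import gevent
from gevent.queue import JoinableQueue

def fake_bulk_get(paths):
    # stand-in for a pipelined HTTP fetch of several paths over one connection
    return [(p, 200) for p in paths]

def worker(q):
    while not q.empty():
        batch = q.get()
        try:
            for path, code in fake_bulk_get(batch):
                print(path, code)
        finally:
            q.task_done()

if __name__ == '__main__':
    wordlist = ['admin', 'backup', 'index.php', 'robots.txt', 'old', 'test']
    keepalive = 2                      # batch size per connection
    q = JoinableQueue()
    for i in range(0, len(wordlist), keepalive):
        q.put(wordlist[i:i + keepalive])
    for _ in range(3):
        gevent.spawn(worker, q)
    q.join()                           # wait until every batch is processed
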
예제 #37
0
def start_fluud():
    parser = argparse.ArgumentParser()
    parser.add_argument('host', help='mongo host')
    parser.add_argument('port', help='mongo port')
    parser.add_argument('--login', help='mongo login')
    parser.add_argument('--password', help='mongo password')
    args = parser.parse_args()

    if args.login and args.password:
        login = urllib.quote_plus(args.login)
        password = urllib.quote_plus(args.password)
        uri = 'mongodb://{}:{}@{}:{}/'.format(login, password, args.host, args.port)
    else:
        uri = 'mongodb://{}:{}/'.format(args.host, args.port)

    client = MongoClient(uri)

    template = {
        "first_sample_timestamp": dateutil.parser.parse("2015-09-02T13:08:20.314Z"),
        "last_sample_timestamp":  dateutil.parser.parse("2015-09-02T13:08:20.314Z"),
        "metadata": {
            "typeURI": "http://schemas.dmtf.org/cloud/audit/1.0/event",
            "initiator": {
                "typeURI": "service/security/account/user",
                "host": {
                    "address": "192.168.0.2"
                },
                "id": "openstack:610e7d74-16af-4358-9b77-5275194fa6e4",
                "name": "8b07b49216d243d2b49561759bd104f4"
            },
            "target": {
                "typeURI": "service/security/account/user",
                "id": "openstack:fc43ddcf-d147-466c-adfe-d60bd2b773ba"
            },
            "observer": {
                "typeURI": "service/security",
                "id": "openstack:a256def4-0a36-472e-95e5-e456db4e0681"
            },
            "eventType": "activity",
            "eventTime": "2015-09-02T13:08:20.256770+0000",
            "host": "identity.node-1",
            "action": "authenticate",
            "outcome": "success",
            "id": "openstack:00244b9a-1a43-48a5-b75e-9d68dd647487",
            "event_type": "identity.authenticate"
        },
        "meter": [
            {
                "counter_name": "identity.authenticate.success",
                "counter_unit": "user",
                "counter_type": "delta"
            }
        ],
        "project_id": None,
        "source": "openstack",
        "user_id": "openstack:610e7d74-16af-4358-9b77-5275194fa6e4"
    }

    data = [copy.deepcopy(template) for _ in range(10000)]

    def progress():
        while True:
            print client.ceilometer.resource.count()
            sys.stdout.flush()
            sleep(2)

    spawn(progress)

    def worker():
        while True:
            q.get()
            try:
                client.ceilometer.resource.insert_many(copy.deepcopy(data), False)
            finally:
                q.task_done()

    q = JoinableQueue()
    for i in range(10):
        spawn(worker)

    for i in range(100):
        q.put(0)

    q.join()
예제 #38
0
class Importer(object):
    def __init__(self, creds, pool_size=POOL_SIZE):
        self.client = get_session(creds['host'],
                                  creds['key'],
                                  creds['secret'])
        self.queue = JoinableQueue(maxsize=pool_size * 2)
        for i in range(pool_size):
            gevent.spawn(self.worker)

    def worker(self):
        while True:
            job = self.queue.get()
            typ = job.get('type')
            try:
                if typ == 'device':
                    self._process_device(job['data'])
                elif typ == 'datapoints':
                    self._process_datapoints(job['data'])
            finally:
                self.queue.task_done()

    def write_devices(self, devices):
        for device in devices:
            self.queue.put({'type': 'device', 'data': device})
        self.queue.join()

    def write_datapoints_from_file(self, infile):
        points = {}
        lineno = 0
        for line in infile:
            lineno += 1
            (device, sensor, ts, val) = line.split('\t')
            pts = points.setdefault(device, {}).setdefault(sensor, [])
            pts.append({'t': ts, 'v': float(val)})

            if lineno % 1000 == 0:
                self.queue.put({'type': 'datapoints', 'data': points})
                points = {}

        if points:
            self.queue.put({'type': 'datapoints', 'data': points})
        self.queue.join()

    def _process_device(self, device, retries=5):
        res = self.client.create_device(device)
        if res.successful != tempoiq.response.SUCCESS:
            if 'A device with that key already exists' in res.body:
                print("Skipping creating existing device {}"
                      .format(device['key']))
                return

            if retries > 0:
                print("Retrying device create {}, error {}"
                      .format(device['key'], res.body))
                self._process_device(device, retries - 1)
            else:
                print("Retries exceeded; couldn't create device {}"
                      .format(device['key']))

    def _process_datapoints(self, write_request, retries=5):
        try:
            res = self.client.write(write_request)
        except Exception as e:
            print("ERROR with request: --->")
            print(json.dumps(write_request, default=WriteEncoder().default))
            raise e

        if res.successful != tempoiq.response.SUCCESS:
            if retries > 0:
                print("Retrying write, error was: {}".format(res.body))
                return self._process_datapoints(write_request, retries - 1)
            else:
                print("Retries exceeded; lost data!")
                print(json.dumps(write_request, default=WriteEncoder().default))
                return True
        return False
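
The bounded queue (maxsize of twice the pool size) is what keeps `write_devices` and `write_datapoints_from_file` from reading far ahead of the workers: `put()` blocks once the queue is full. A small stand-alone sketch of that backpressure behaviour; the sleep simulates a slow write and is purely illustrative:

import gevent
from gevent.queue import JoinableQueue

def slow_worker(q):
    while True:
        item = q.get()
        try:
            gevent.sleep(0.01)         # simulate a slow API write
        finally:
            q.task_done()

if __name__ == '__main__':
    q = JoinableQueue(maxsize=4)       # bounded: producer can't run far ahead
    for _ in range(2):
        gevent.spawn(slow_worker, q)
    for i in range(20):
        q.put(i)                       # blocks whenever 4 items are pending
    q.join()                           # wait for the workers to drain the rest
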
예제 #39
0
class InterceptedStreamsMixin(object):
    """
    Mixin class for GethProcess instances that feeds all of the stdout and
    stderr lines into some set of provided callback functions.
    """
    stdout_callbacks = None
    stderr_callbacks = None

    def __init__(self, *args, **kwargs):
        super(InterceptedStreamsMixin, self).__init__(*args, **kwargs)
        self.stdout_callbacks = []
        self.stdout_queue = JoinableQueue()

        self.stderr_callbacks = []
        self.stderr_queue = JoinableQueue()

    def register_stdout_callback(self, callback_fn):
        self.stdout_callbacks.append(callback_fn)

    def register_stderr_callback(self, callback_fn):
        self.stderr_callbacks.append(callback_fn)

    def produce_stdout_queue(self):
        for line in iter(self.proc.stdout.readline, b''):
            self.stdout_queue.put(line)
            gevent.sleep(0)

    def produce_stderr_queue(self):
        for line in iter(self.proc.stderr.readline, b''):
            self.stderr_queue.put(line)
            gevent.sleep(0)

    def consume_stdout_queue(self):
        while True:
            line = self.stdout_queue.get()
            for fn in self.stdout_callbacks:
                fn(line.strip())
            gevent.sleep(0)

    def consume_stderr_queue(self):
        while True:
            line = self.stderr_queue.get()
            for fn in self.stderr_callbacks:
                fn(line.strip())
            gevent.sleep(0)

    def start(self):
        super(InterceptedStreamsMixin, self).start()

        gevent.spawn(self.produce_stdout_queue)
        gevent.spawn(self.produce_stderr_queue)

        gevent.spawn(self.consume_stdout_queue)
        gevent.spawn(self.consume_stderr_queue)

    def stop(self):
        super(InterceptedStreamsMixin, self).stop()

        try:
            self.stdout_queue.join(5)
        except Timeout:
            pass

        try:
            self.stderr_queue.join(5)
        except Timeout:
            pass
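
The mixin pairs one producer greenlet per stream with one consumer greenlet per queue, so the blocking `readline` loop is decoupled from the callbacks. A self-contained sketch of the same pattern using `gevent.subprocess` in place of a GethProcess; the child command and the `show` callback are illustrative only:

import sys
import gevent
from gevent import subprocess
from gevent.queue import JoinableQueue

def produce(stream, queue):
    # read lines until EOF and hand each one to the consumer via the queue
    for line in iter(stream.readline, b''):
        queue.put(line)

def consume(queue, callbacks):
    while True:
        line = queue.get()
        try:
            for fn in callbacks:
                fn(line.strip())
        finally:
            queue.task_done()

def show(line):
    print('stdout:', line)

if __name__ == '__main__':
    proc = subprocess.Popen(
        [sys.executable, '-c', 'print("hello"); print("world")'],
        stdout=subprocess.PIPE)
    q = JoinableQueue()
    gevent.spawn(consume, q, [show])
    producer = gevent.spawn(produce, proc.stdout, q)
    producer.join()                  # EOF reached, all lines queued
    q.join()                         # every queued line has been handled
    proc.wait()
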
예제 #40
0
    m.update(response_content)
    m.digest()

    #Extract the links and add them to the queue. Using links_added
    #counter to limit the number of links to fetch.
    for link in re.findall('<a href="(http.*?)"', response_content):
        if links_added < num_to_crawl:
            links_added += 1
            q.put(link) 

#Worker spawned by gevent. Continuously gets links, works on them and marks
#them as done.
def worker(crawler_id):
    while True:
        item = q.get()
        try:
            do_work(item, crawler_id)
        finally:
            q.task_done()

#Spawning worker threads.
crawler_id = 0
for i in range(num_worker_threads):
    gevent.spawn(worker, crawler_id)
    crawler_id += 1 

q.put(source)
links_added += 1

q.join()  # block until all tasks are done
예제 #41
0
class Service(object):
    def __init__(self, callback, **args):
        self.callback = callback
        self.result_queue = args.get('result_queue')
        self.package_queue = JoinableQueue()
        self.failed_queue = []
        self.env = args.get('env')

        self.main_greenlet = None
        self.pool = Pool(args.get('concurrency'))
        self.should_run = True

        self.subscribers = []
        self.logger = Logger(self.name, args.get('log_level'))

    @property
    def name(self):
        return self.__class__.__name__.lower()

    def queue(self, package, sender_name, **data):
        assert (sender_name == 'downloadmanager' and data.get('path')) or True
        self.package_queue.put((package, (sender_name, data)))
        self.logger.level(3, ' * queue(from=%s, to=%s, package=%s, data=%s)',
                          sender_name, self.name, package, data)

    def consume(self):
        package, sender_data = self.package_queue.get()
        self.pool.spawn(self._run_service, package, sender_data)
        self.logger.level(3, ' * %s.run(package=%s, sender_data=%s)',
                          self.name, package, sender_data)

    def subscribe(self, other):
        other.subscribers.append(self)

    def loop(self):
        while self.should_run:
            self.consume()

    def start(self):
        self.main_greenlet = gevent.spawn(self.loop)

    def stop(self, force=False):
        # This will force the current iteration on `loop()` to be the last one,
        # so the thing we're processing will be able to finish;
        self.should_run = False

        # if the caller is in a hurry, we'll just kill everything mercilessly
        if force and self.main_greenlet:
            self.main_greenlet.kill()

    def _run_service(self, package, sender_data):
        try:
            data = self.callback(package, sender_data)
        except NotForMe:
            return
        except ReportableError as exc:
            self.failed_queue.append((package, exc))
            self.logger.level(0, "Error: %s", exc)
        except BaseException as exc:
            self.failed_queue.append((package, exc))
            self.logger.traceback(4,
                'failed to run %s (requested by:%s) for package %s:',
                self.name, sender_data[0], package, exc=exc)
        else:
            # Let's notify our subscribers
            for subscriber in self.subscribers:
                subscriber.queue(package, self.name, **(data or {}))

            # If the callback worked, let's go ahead and tell the world. If and
            # only if requested by the caller, of course.
            if self.result_queue:
                self.result_queue.put(package)
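
The interesting part of `Service` is that `consume()` does not run the callback inline: it hands each queued package to a bounded `Pool`, so at most `concurrency` callbacks run at once while the loop keeps draining the queue. A reduced, self-contained sketch of that pattern (the `handle` callback and sizes are made up, and `task_done`/`join` are added here so the main greenlet can wait; the original tracks completion differently):

import gevent
from gevent.pool import Pool
from gevent.queue import JoinableQueue

def handle(item):
    gevent.sleep(0.01)                 # simulate the service callback
    print('handled', item)

def loop(queue, pool):
    while True:
        item = queue.get()
        try:
            pool.spawn(handle, item)   # blocks if the pool is already full
        finally:
            queue.task_done()

if __name__ == '__main__':
    queue = JoinableQueue()
    pool = Pool(2)                     # at most two callbacks at a time
    gevent.spawn(loop, queue, pool)
    for i in range(10):
        queue.put(i)
    queue.join()                       # every item has been picked up...
    pool.join()                        # ...and every spawned callback finished
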
예제 #42
0
    def _response_handler (self, env, start_response):
        """handle HTTP request/response"""
        uri_path = env["PATH_INFO"]
        body = JoinableQueue()

        if self._uow and self._uow.handle_endpoints(self, uri_path, env, start_response, body):
            pass

        ##########################################
        # Worker endpoints

        elif uri_path == '/shard/config':
            # configure the service to run a shard
            Greenlet(self.shard_config, env, start_response, body).start()

        elif uri_path == '/shard/stop':
            # shutdown the service
            ## NB: must parse POST data specially, to avoid exception
            payload = loads(env["wsgi.input"].read())
            Greenlet(self.shard_stop, payload).start_later(1)

            # HTTP response starts first, to avoid error after server stops
            start_response('200 OK', [('Content-Type', 'text/plain')])
            body.put("Goodbye\r\n")
            body.put(StopIteration)

        elif uri_path == '/queue/wait':
            # wait until all shards have finished sending task_queue requests
            Greenlet(self.queue_wait, env, start_response, body).start()

        elif uri_path == '/queue/join':
            # join on the task_queue, as a barrier to wait until it empties
            Greenlet(self.queue_join, env, start_response, body).start()

        elif uri_path == '/check/persist':
            ## NB: TODO checkpoint the service state to durable storage
            start_response('200 OK', [('Content-Type', 'text/plain')])
            body.put("Bokay\r\n")
            body.put(StopIteration)

        elif uri_path == '/check/recover':
            ## NB: TODO restart the service, recovering from most recent checkpoint
            start_response('200 OK', [('Content-Type', 'text/plain')])
            body.put("Bokay\r\n")
            body.put(StopIteration)

        ##########################################
        # HashRing endpoints

        elif uri_path == '/ring/init':
            # initialize the HashRing
            Greenlet(self.ring_init, env, start_response, body).start()

        elif uri_path == '/ring/add':
            ## NB: TODO add a node to the HashRing
            start_response('200 OK', [('Content-Type', 'text/plain')])
            body.put("Bokay\r\n")
            body.put(StopIteration)

        elif uri_path == '/ring/del':
            ## NB: TODO delete a node from the HashRing
            start_response('200 OK', [('Content-Type', 'text/plain')])
            body.put("Bokay\r\n")
            body.put(StopIteration)

        ##########################################
        # utility endpoints

        elif uri_path == '/':
            # dump info about the service in general
            start_response('200 OK', [('Content-Type', 'text/plain')])
            body.put(str(env) + "\r\n")
            body.put(StopIteration)

        else:
            # ne znayu
            start_response('404 Not Found', [('Content-Type', 'text/plain')])
            body.put('Not Found\r\n')
            body.put(StopIteration)

        return body
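
Here the WSGI response body itself is a `JoinableQueue`: the server iterates it, each `put()` becomes a chunk of the response, and putting `StopIteration` ends the stream. A minimal runnable sketch of that trick with `gevent.pywsgi` (the port and payload are arbitrary):

import gevent
from gevent.pywsgi import WSGIServer
from gevent.queue import JoinableQueue

def app(env, start_response):
    start_response('200 OK', [('Content-Type', 'text/plain')])
    body = JoinableQueue()

    def fill():
        for i in range(3):
            body.put(b'chunk %d\r\n' % i)   # each put is streamed to the client
            gevent.sleep(0.5)
        body.put(StopIteration)             # ends iteration, closes the response

    gevent.spawn(fill)
    return body                             # the server iterates the queue

if __name__ == '__main__':
    WSGIServer(('127.0.0.1', 8099), app).serve_forever()
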
예제 #43
0
파일: logging.py 프로젝트: ZigmundRat/moat
class BaseLogger(Collected, Jobber):
    """\
		This class implements one particular way to log things.
		"""
    storage = Loggers.storage
    q = None
    job = None
    ready = False
    _in_flush = False

    def __init__(self, level):
        self.level = level

        global logger_nr
        logger_nr += 1

        if not hasattr(self, "name") or self.name is None:
            self.name = Name(self.__class__.__name__, "x" + str(logger_nr))

        super(BaseLogger, self).__init__()
        self._init()

    def _init(self):
        """Fork off the writer thread.
		   Override this to do nothing if you don't have one."""

        self.q = JoinableQueue(100)
        self.start_job("job", self._writer)
        self.job.link(self.delete)
        if self.ready is False:
            self.ready = True
        else:
            self.stop_job("job")  # concurrency issues?

    def _writer(self):
        errs = 0
        for r in self.q:
            try:
                if r is FlushMe:
                    self._flush()
                else:
                    self._log(*r)
            except Exception as ex:
                errs += 1
                fix_exception(ex)
                from moat.run import process_failure
                process_failure(ex)
                if errs > 10:
                    reraise(ex)
            else:
                if errs:
                    errs -= 1
            finally:
                self.q.task_done()
        self.q.task_done()  # for the StopIter

    # Collection stuff
    def list(self):
        yield super(BaseLogger, self)
        yield ("Type", self.__class__.__name__)
        yield ("Level", LogNames[self.level])
        yield ("Queue", self.q.qsize())

    def info(self):
        return LogNames[self.level] + ": " + self.__class__.__name__

    def delete(self, ctx=None):
        if self.ready:
            self.ready = None
            super(BaseLogger, self).delete(ctx)
        try:
            if self.q:
                self.q.put(StopIteration, block=False)
        except Full:
            ## panic?
            pass
        if self.job is not None:
            self.job.join(timeout=1)
            self.stop_job("job")

    def _wlog(self, *a):
        try:
            self.q.put(a, block=False)
        except Full:
            ## panic?
            self.delete()

    def _log(self, level, *a):
        a = " ".join(
            (x if isinstance(x, six.string_types) else str(x) for x in a))
        self._slog(level, a)

    def _slog(self, level, a):
        raise NotImplementedError("You need to override %s._log or ._slog" %
                                  (self.__class__.__name__, ))

    def _flush(self):
        pass

    def log(self, level, *a):
        if LogLevels[level] >= self.level:
            self._wlog(level, *a)
            if TESTING and not (hasattr(a[0], "startswith")
                                and a[0].startswith("TEST")):
                self.flush()
            else:
                gevent.sleep(0)

    def log_event(self, event, level):
        if level >= self.level:
            for r in report_(event, 99):
                self._wlog(LogNames[level], r)
            if TESTING:
                self.flush()

    def log_failure(self, err, level=WARN):
        if level >= self.level:
            self._wlog(LogNames[level], format_exception(err))
            if TESTING:
                self.flush()

    def flush(self):
        if self._in_flush: return
        if self.q is not None:
            try:
                self._in_flush = True
                self.q.put(FlushMe)
                self.q.join()
            finally:
                self._in_flush = False

    def end_logging(self):
        self.flush()
        self.delete()
        pass
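
Two details in the logger above are worth spelling out: `flush()` works by queueing a sentinel (`FlushMe`) and then joining the queue, so the caller blocks until the writer has drained everything queued before the flush; and the extra `task_done()` after the `for` loop accounts for the `StopIteration` item, which ends the iteration without ever reaching the loop body. A small sketch of both, with a stand-in `FLUSH` sentinel:

import gevent
from gevent.queue import JoinableQueue

FLUSH = object()                      # stand-in for FlushMe

def writer(q, out):
    for item in q:                    # iteration ends when StopIteration is queued
        try:
            if item is FLUSH:
                out.append('flushed')
            else:
                out.append(item)
        finally:
            q.task_done()
    q.task_done()                     # for the StopIteration item itself

if __name__ == '__main__':
    q = JoinableQueue()
    out = []
    gevent.spawn(writer, q, out)
    for i in range(5):
        q.put(i)
    q.put(FLUSH)
    q.join()                          # returns only after 0..4 and FLUSH ran
    print(out)
    q.put(StopIteration)
    q.join()                          # writer has exited cleanly
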
예제 #44
0
    #print ('%s: %s bytes: %r' % (url, len(data), data[:50]))

def worker():
    while True:
        url = q.get()
        try:
            print_head(url)
        finally:
            q.task_done()

NUM_WORKER_THREADS = 50
NUM_REQUESTS = 5000

q = JoinableQueue()
for i in range(NUM_WORKER_THREADS):
     gevent.spawn(worker)

start_time = time.time()

for i in xrange(NUM_REQUESTS):
    url = 'http://127.0.0.1/' + str(i)
    q.put(url)

q.join()  # block until all tasks are done

end_time = time.time()

show_stats( start_time, end_time, NUM_REQUESTS)

예제 #45
0
            index += 10
        print('analysis person work: ' + self.name + 'finished!')
        return True


if __name__ == '__main__':
    tool = MysqlCurd('douban_person')
    tool.connect_mysql()
    persons = tool.query_mysql_condition('person_info', [{
        'version': 0
    }, ['person_name', 'person_id']])
    print(persons)
    print(len(persons))
    q = JoinableQueue()
    for _ in persons:
        q.put(_)

    def temp(param):
        while not q.empty():
            i = q.get()
            p = Person(id=i[1], name=i[0])

            flag = p.analysis_person_info()
            if flag:
                name_id_tool = MysqlCurd('douban_person')
                name_id_tool.connect_mysql()
                name_id_tool.replace_mysql('person_name_id',
                                           [{
                                               'person_id': p.id,
                                               'person_name': p.name,
                                               'version': 0
예제 #46
0
            save_queue.put((word, direction, data))

    conn.close()



arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--dict', type=str, help='dict path')
arg_parser.add_argument('--direction', type=str, help='direction')

args = arg_parser.parse_args()

word_list = get_words(args.dict)
print('word list size = %d' % (len(word_list)))
print(word_list[0:10])

fetch_queue = JoinableQueue()

save_queue = JoinableQueue()

for i in range(100):
    gevent.spawn(fetch_worker, fetch_queue, save_queue, args.direction)

gevent.spawn(save_worker, DSN, save_queue)

for word in word_list:
    fetch_queue.put(word)

fetch_queue.join()
save_queue.join()
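
The join order at the end matters: `fetch_queue.join()` only returns once every fetch worker has called `task_done()`, and each worker queues its result on `save_queue` before doing so, so by the time the second `join()` starts every save has already been enqueued. A compressed sketch of that two-stage pipeline, with the fetch faked by `.upper()`:

import gevent
from gevent.queue import JoinableQueue

def fetch_worker(fetch_q, save_q):
    while True:
        word = fetch_q.get()
        try:
            save_q.put((word, word.upper()))   # stand-in for a network fetch
        finally:
            fetch_q.task_done()

def save_worker(save_q):
    while True:
        word, data = save_q.get()
        try:
            print('saving', word, data)
        finally:
            save_q.task_done()

if __name__ == '__main__':
    fetch_q, save_q = JoinableQueue(), JoinableQueue()
    for _ in range(4):
        gevent.spawn(fetch_worker, fetch_q, save_q)
    gevent.spawn(save_worker, save_q)
    for word in ['alpha', 'beta', 'gamma']:
        fetch_q.put(word)
    fetch_q.join()   # all fetches done and their results queued for saving
    save_q.join()    # all queued saves flushed
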
예제 #47
0
class ScoringService(Service):
    """A service that assigns a score to submission results.

    A submission result is ready to be scored when its compilation is
    unsuccessful (in this case, no evaluation will be performed) or
    after it has been evaluated. The goal of scoring is to use the
    evaluations to determine score, score_details, public_score,
    public_score_details and ranking_score_details (all non-null).
    Scoring is done by the compute_score method of the ScoreType
    defined by the dataset of the result.

    ScoringService keeps a queue of (submission_id, dataset_id) pairs
    identifying submission results to score. A greenlet is spawned to
    consume this queue, one item at a time. The queue is filled by the
    new_evaluation and the invalidate_submissions RPC methods, and by a
    sweeper greenlet, whose duty is to regularly check all submissions
    in the database and put the unscored ones in the queue (this check
    can also be forced by the search_jobs_not_done RPC method).

    """

    # How often we look for submission results not scored.
    SWEEPER_TIMEOUT = 347.0

    def __init__(self, shard):
        """Initialize the ScoringService.

        """
        Service.__init__(self, shard)

        # Set up communication with ProxyService.
        self.proxy_service = self.connect_to(ServiceCoord("ProxyService", 0))

        # Set up and spawn the scorer.
        # TODO Link to greenlet: when it dies, log CRITICAL and exit.
        self._scorer_queue = JoinableQueue()
        gevent.spawn(self._scorer_loop)

        # Set up and spawn the sweeper.
        # TODO Link to greenlet: when it dies, log CRITICAL and exit.
        self._sweeper_start = None
        self._sweeper_event = Event()
        gevent.spawn(self._sweeper_loop)

    def _scorer_loop(self):
        """Monitor the queue, scoring its top element.

        This is an infinite loop that, at each iteration, gets an item
        from the queue (blocking until there is one, if the queue is
        empty) and scores it. Any error during the scoring is sent to
        the logger and then suppressed, because the loop must go on.

        """
        while True:
            submission_id, dataset_id = self._scorer_queue.get()
            try:
                self._score(submission_id, dataset_id)
            except Exception:
                logger.error("Unexpected error when scoring submission %d on "
                             "dataset %d.", submission_id, dataset_id,
                             exc_info=True)
            finally:
                self._scorer_queue.task_done()

    def _score(self, submission_id, dataset_id):
        """Assign a score to a submission result.

        This is the core of ScoringService: here we retrieve the result
        from the database, check if it is in the correct status,
        instantiate its ScoreType, compute its score, store it back in
        the database and tell ProxyService to update RWS if needed.

        submission_id (int): the id of the submission that has to be
            scored.
        dataset_id (int): the id of the dataset to use.

        """
        with SessionGen() as session:
            # Obtain submission.
            submission = Submission.get_from_id(submission_id, session)
            if submission is None:
                raise ValueError("Submission %d not found in the database." %
                                 submission_id)

            # Obtain dataset.
            dataset = Dataset.get_from_id(dataset_id, session)
            if dataset is None:
                raise ValueError("Dataset %d not found in the database." %
                                 dataset_id)

            # Obtain submission result.
            submission_result = submission.get_result(dataset)

            # It means it was not even compiled (for some reason).
            if submission_result is None:
                raise ValueError("Submission result %d(%d) was not found." %
                                 (submission_id, dataset_id))

            # Check if it's ready to be scored.
            if not submission_result.needs_scoring():
                if submission_result.scored():
                    logger.info("Submission result %d(%d) is already scored.",
                                submission_id, dataset_id)
                    return
                else:
                    raise ValueError("The state of the submission result "
                                     "%d(%d) doesn't allow scoring." %
                                     (submission_id, dataset_id))

            # Instantiate the score type.
            score_type = get_score_type(dataset=dataset)

            # Compute score and fill it in the database.
            submission_result.score, \
                submission_result.score_details, \
                submission_result.public_score, \
                submission_result.public_score_details, \
                submission_result.ranking_score_details = \
                score_type.compute_score(submission_result)

            # Store it.
            session.commit()

            # If dataset is the active one, update RWS.
            if dataset is submission.task.active_dataset:
                self.proxy_service.submission_scored(
                    submission_id=submission.id)

    def _sweeper_loop(self):
        """Regularly check the database for unscored results.

        Try to sweep the database once every SWEEPER_TIMEOUT seconds
        but make sure that no two sweeps run simultaneously. That is,
        start a new sweep SWEEPER_TIMEOUT seconds after the previous
        one started or when the previous one finished, whatever comes
        last.

        The search_jobs_not_done RPC method can interfere with this
        regularity, as it tries to run a sweeper as soon as possible:
        immediately, if no sweeper is running, or as soon as the
        current one terminates.

        Any error during the sweep is sent to the logger and then
        suppressed, because the loop must go on.

        """
        while True:
            self._sweeper_start = monotonic_time()
            self._sweeper_event.clear()

            try:
                self._sweep()
            except Exception:
                logger.error("Unexpected error when searching for unscored "
                             "submissions.", exc_info=True)

            self._sweeper_event.wait(max(self._sweeper_start +
                                         self.SWEEPER_TIMEOUT -
                                         monotonic_time(), 0))

    def _sweep(self):
        """Check the database for unscored submission results.

        Obtain a list of all the submission results in the database,
        check each of them to see if it's still unscored and, in case,
        put it in the queue.

        """
        counter = 0

        with SessionGen() as session:
            for sr in get_submission_results(session=session):
                if sr is not None and sr.needs_scoring():
                    self._scorer_queue.put((sr.submission_id, sr.dataset_id))
                    counter += 1

        if counter > 0:
            logger.info("Found %d unscored submissions.", counter)

    @rpc_method
    def search_jobs_not_done(self):
        """Make the sweeper loop fire the sweeper as soon as possible.

        """
        self._sweeper_event.set()

    @rpc_method
    def new_evaluation(self, submission_id, dataset_id):
        """Schedule the given submission result for scoring.

        Put it in the queue to have it scored, sooner or later. Usually
        called by EvaluationService when it's done with a result.

        submission_id (int): the id of the submission that has to be
            scored.
        dataset_id (int): the id of the dataset to use.

        """
        self._scorer_queue.put((submission_id, dataset_id))

    @rpc_method
    def invalidate_submission(self, submission_id=None, dataset_id=None,
                              user_id=None, task_id=None, contest_id=None):
        """Invalidate (and re-score) some submission results.

        Invalidate the scores of the submission results that:
        - belong to submission_id or, if None, to any submission of
          user_id and/or task_id or, if both None, to any submission
          of contest_id or, if None, to any submission in the database.
        - belong to dataset_id or, if None, to any dataset of task_id
          or, if None, to any dataset of contest_id or, if None, to any
          dataset in the database.

        submission_id (int|None): id of the submission whose results
            should be invalidated, or None.
        dataset_id (int|None): id of the dataset whose results should
            be invalidated, or None.
        user_id (int|None): id of the user whose results should be
            invalidated, or None.
        task_id (int|None): id of the task whose results should be
            invalidated, or None.
        contest_id (int|None): id of the contest whose results should
            be invalidated, or None.

        """
        logger.info("Invalidation request received.")

        # We can put results in the scorer queue only after they have
        # been invalidated (and committed to the database). Therefore
        # we temporarily save them somewhere else.
        temp_queue = list()

        with SessionGen() as session:
            submission_results = \
                get_submission_results(contest_id, user_id, task_id,
                                       submission_id, dataset_id,
                                       session=session)

            for sr in submission_results:
                if sr.scored():
                    sr.invalidate_score()
                    temp_queue.append((sr.submission_id, sr.dataset_id))

            session.commit()

        for item in temp_queue:
            self._scorer_queue.put(item)

        logger.info("Invalidated %d submissions.", len(temp_queue))
예제 #48
0
파일: ftp2.py 프로젝트: bearnard/ftptest
    zfile = '%s.zip' % site_id
    with zipfile.ZipFile(zfile) as z:
        z.extractall('tmp')

    file_workers = [
        pool.spawn(upload_files, i, worker_id, file_queue) for i in xrange(concurrency)
    ]

    for dirname, dirnames, filenames in os.walk('tmp/%s' % site_id):
        # print path to all subdirectories first.
        files = []
        for filename in filenames:
            files.append(os.path.join(dirname, filename))
        for f in files:
            file_queue.put(f, block=False)
        print "START_DIRS"
        dirs = []
        for subdirname in dirnames:
            dirs.append(os.path.join(dirname, subdirname))
        if dirs:
            print "POOLING:", dirs
            dir_pool.imap(mkdirs, dirs)
        print "END"
    #joinall(dir_jobs)
    #joinall([
    #    spawn([s_dir] + dirs) for s_dir, dirs in skel_dirs.iteritems()
    #])

    file_queue.join()
예제 #50
0
from datetime import datetime, timedelta
from textmeplz.utils import get_twilio
from gevent.queue import JoinableQueue
import gevent
from gevent import monkey
monkey.patch_all()

seven_days_ago = datetime.now() - timedelta(days=7)
tasks = JoinableQueue()
tw = get_twilio()
msgsiterable = tw.messages.iter(after=seven_days_ago)

print "Getting all messages from Twilio."
count = 0
for msg in msgsiterable:
    tasks.put(msg)
    count += 1
    if count % 50 == 0:
        print "Got %s messages." % count


def process_messages(thread_num):
    print "Thread %s starting up." % thread_num
    while not tasks.empty():
        msg = tasks.get()
        try:
            for media in msg.media_list.list():
                print "Thread %s deleting media %s" % (thread_num, media.sid)
                media.delete()
        except:
            pass
예제 #51
0
class BaseCrawler(object):
    def __init__(self,
                 requestHandler=BaseRequestHandler(),
                 parseHandler=BaseParseHandler(),
                 sheduler=BaseScheduler(),
                 pipeline=BasePipeline()):
        self.requestHandler = requestHandler
        self.parseHandler = parseHandler
        self.sheduler = sheduler
        self.pipeline = pipeline
        self.task_queue = JoinableQueue()
        self.response_queue = JoinableQueue()
        self.tasks_cnt = 0
        self.result_queue = JoinableQueue()
        self.jobs_cnt = config.num_threads
        self.start_time = time.time()
        self.stop = False

    def doScheduler(self):
        """Generate tasks, one thread
        """
        logging.info('scheduler started!')
        for task in self.sheduler.init_generator():
            self.task_queue.put(task)
            self.tasks_cnt += 1

        while self.tasks_cnt > 0 and not self.stop:
            gevent.sleep(config.new_task_check_time)

        logging.info('scheduler finished! All task done.')

        for i in xrange(config.num_threads):
            self.task_queue.put(StopIteration)

    def worker(self):
        """Fetch url and parse, config.num_threads threads
        """
        task = self.task_queue.get()
        cnt = config.error_retry_cnt
        while task != StopIteration:
            try:
                #timeout = gevent.Timeout(config.TASK_TIMEOUT)
                #timeout.start()
                response = self.requestHandler.handle(task)
                result, new_tasks = self.parseHandler.handle(response)
                #timeout.cancel()
                #if isinstance(result, collections.Iterable):
                #if isinstance(result, list):
                #    for ret in result:
                #        self.result_queue.put(ret)
                #else:
                if result:
                    self.result_queue.put(result)
                for task in new_tasks:
                    self.task_queue.put(task)
                    self.tasks_cnt += 1
                #self.task_queue.task_done()
                self.tasks_cnt -= 1
                task = self.task_queue.get()
                cnt = config.error_retry_cnt
            except Exception as e:
                try:
                    #timeout.cancel()
                    cnt -= 1
                    logging.exception(e)
                    if cnt <= 0:
                        #self.task_queue.task_done()
                        self.tasks_cnt -= 1
                        task = self.task_queue.get()
                        logging.error(
                            'task failed, try \033[31m%d\033[0m times! will not try'
                            % (config.error_retry_cnt - cnt))
                        cnt = config.error_retry_cnt
                    #logging.exception('task failed!')
                    else:
                        logging.error(
                            'task failed, try \033[31m%d\033[0m times!' %
                            (config.error_retry_cnt - cnt))
                except Exception as e:
                    self.tasks_cnt -= 1
                    #self.jobs_cnt -= 1
                    raise
            finally:
                #timeout.cancel()
                pass
        self.jobs_cnt -= 1

    def doPipeline(self):
        while self.jobs_cnt > 0 or not self.result_queue.empty():
            gevent.sleep(config.pipeline_sleeptime)
            results = []
            try:
                while 1:
                    results.append(self.result_queue.get_nowait())
                    if len(results) > 100:
                        raise gevent.queue.Empty
            except gevent.queue.Empty:
                if results:
                    try:
                        self.pipeline.process(results)
                    except:
                        logging.exception('')
                #logging.exception('')
            except:
                logging.exception('')

    def run(self):
        jobs = [
            gevent.spawn(self.doScheduler),
            gevent.spawn(self.doPipeline),
        ]
        for i in xrange(config.num_threads):
            job = gevent.spawn(self.worker)
            jobs.append(job)
            #thread.start_new_thread(self.worker)
        try:
            timeout = gevent.Timeout(config.CRAWLER_TIMEOUT)
            timeout.start()
            #self.task_queue.join()
            gevent.joinall(jobs)
        except:
            logging.exception('pipeline error!')
        finally:
            timeout.cancel()
            self.end_time = time.time()
            logging.info('run times: %f s' % (self.end_time - self.start_time))
예제 #52
0
#    project_cache = shelve.open("project_cache.shelve")

    q = JoinableQueue()
    project_queue = JoinableQueue()
    out_queue = Queue()
    length_queue = Queue()

    for i in range(NUM_THEME_WORKER_THREADS):
         gevent.spawn(theme_worker)

    for i in range(NUM_PROJECT_WORKER_THREADS):
         gevent.spawn(project_worker)

#    i = 0
    for item in get_themes():
        q.put(item)
#        i += 1
#        if i >= 1:
#            break

    try:
        q.join()  # block until all tasks are done
        project_queue.join()
    except KeyboardInterrupt:
        logging.info('CTRL-C: save before exit')
        raise

    length_queue.put(StopIteration)
    max_length = 0
    for length in length_queue:
        if max_length < length: