Example No. 1
# Snippet assumes: from random import shuffle; from time import sleep; plus the RedisQueue wrapper.
class Chunker(object):
    def __init__(self, redis_host):
        self.work_queue = RedisQueue(redis_host, "inqueue")

    def run(self):
        chunk_id = 0
        # xrange objects cannot be concatenated in Python 2, so build plain lists,
        # and note that random.shuffle() shuffles in place and returns None.
        a_range = range(1, 10) + range(10, 256)
        shuffle(a_range)
        for a in a_range:
            b_range = range(1, 255)
            shuffle(b_range)
            for b in b_range:
                if a == 172 and b in xrange(16, 32):   # skip 172.16.0.0/12
                    continue
                if a == 192 and b == 168:              # skip 192.168.0.0/16
                    continue
                c_range = range(1, 255)
                shuffle(c_range)
                for c in c_range:
                    ip_range = "{0}.{1}.{2}.0/24".format(a, b, c)
                    print "Sending chunk {0} range: {1}".format(chunk_id,
                            ip_range)
                    task = {
                            "range": ip_range,
                            "id": chunk_id
                           }
                    self.work_queue.put(task)
                    chunk_id += 1
                    sleep(10)

    def run_test(self):
        self.work_queue.put({"range": "129.21.50.0/24", "id":0})
        self.work_queue.put({"range": "129.21.49.0/24", "id":1})
Example No. 2
# Snippet assumes: import requests, plus the project's WebPage helper and RedisQueue wrapper.
class Crawler(object):
    def __init__(self, redis_host, depth=10):
        self.links_queue = RedisQueue(redis_host, "linksqueue")
        self.pages_queue = RedisQueue(redis_host, "pagesqueue")

    def run(self):
        while True:
            link = self.links_queue.get().data
            try:
                page = WebPage(requests.get(link).text, link, 80)
            except Exception:  # skip any link that fails to fetch or parse
                print("Exception GETing {0}".format(link))
                continue
            self.pages_queue.put(page.to_dict())
Example No. 3
    def process_request_origin(self, request, spider):
        redis = RedisQueue('proxy_ip')
        if not redis.empty():
            proxy_ip = redis.get()
        else:
            proxy_ip = get_ip()

        proxy_para = {'ip_port': proxy_ip, 'user_pass': ''}
        request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
        if proxy_para['user_pass']:
            # an empty user_pass would otherwise produce a bogus 'Basic \n' header;
            # encodestring() also appends a newline, so strip it
            encoded_user_pass = base64.encodestring(proxy_para['user_pass']).strip()
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        print "*********************** RedisProxyMiddleware Using proxy ip: %s *****" % proxy_para['ip_port']
        redis.put(proxy_ip)
Example No. 4
# Snippet assumes: import pyelasticsearch, plus the project's genTags() helper and RedisQueue wrapper.
class Indexer(object):
    def __init__(self, redis_host, es_urls):
        self.pages_queue = RedisQueue(redis_host, "pagesqueue") # take pages out of this queue
        self.links_queue = RedisQueue(redis_host, "linksqueue") # put links into this queue
        self.connection = pyelasticsearch.ElasticSearch(es_urls)
        try:
            self.connection.create_index("webpages")
        except Exception:
            # the index probably already exists
            pass

    def run(self):
        while True:
            result = self.pages_queue.get().data
            result['tags'] = genTags(result['html'])
            self.connection.index('webpages', 'webpage', result, id=result['ip'])
            print('Indexed {0}'.format(result['ip']))
            for link in result['links']:
                self.links_queue.put(link)
Example No. 5
class StudyscrapyPipeline(object):
    def __init__(self):
        self.q = RedisQueue(name='CSDN', host='localhost', port=6379, db=3)
        # redis_db and redis_data_dict are module-level globals: a Redis hash used for de-duplication
        if redis_db.hlen(redis_data_dict) == 0:
            pass  # nothing cached yet, nothing to do

    def process_item(self, item, spider):
        # fp = open(r'F:\Spider\Spider\studyscrapy\out.txt', 'a+')
        if redis_db.hexists(redis_data_dict, item['title']):
            print('item is already in the queue <--')
        else:
            # fp.write(item['title']+', '+item['time']+'\n')
            self.q.put(item['title'] + ':' + item['time'])
            redis_db.hset(redis_data_dict, item['title'], item['time'])
            print('title: {0}, time: {1} added to the queue'.format(
                item['title'], item['time']))

        return item
Example No. 6
    def dispose_ip(self, proxy_ip, redis_label):
        # Proxies are organised into REDIS_NUM tiers; each failure moves a proxy up one tier,
        # and a proxy that fails in the last tier is retired to the 'invalid_ip' queue.
        redis_list = []
        for i in range(REDIS_NUM):
            redis_list.append(RedisQueue('proxy_ip_%d' % i))
        redis_invalid_ip = RedisQueue('invalid_ip')
        if redis_label == REDIS_NUM - 1:
            redis_invalid_ip.put(proxy_ip)
            redis_list[0].put(get_ip())
        else:
            redis_list[redis_label].remove(proxy_ip)
            redis_list[redis_label + 1].put(proxy_ip)
            if redis_list[0].empty():
                redis_list[0].put(get_ip())

        # pick a random non-empty tier and hand out one of its proxies (put back so it stays available)
        new_redis_label = random.choice(range(REDIS_NUM))
        while redis_list[new_redis_label].empty():
            new_redis_label = random.choice(range(REDIS_NUM))
        new_proxy_ip = redis_list[new_redis_label].get()
        redis_list[new_redis_label].put(new_proxy_ip)
        return new_proxy_ip, new_redis_label
Example No. 7
    def process_exception(self, request, exception, spider):
        request_ip = request.meta['proxy']
        invalid_ip = request_ip.split('//')[1]
        redis = RedisQueue('proxy_ip')
        redis_invalid_ip = RedisQueue('invalid_ip')
        if not redis.empty():
            redis.remove(invalid_ip)
            redis_invalid_ip.put(invalid_ip)
            print '+++++++++++++++++++++++%s' % exception
            print '-----------------------removing ip from redis: %s' % invalid_ip

        new_ip = get_ip()
        proxy_para = {'ip_port': new_ip, 'user_pass': ''}
        request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
        if proxy_para['user_pass']:
            # encodestring() appends a newline, so strip it before building the header
            encoded_user_pass = base64.encodestring(proxy_para['user_pass']).strip()
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        print ">>>>>>>>>>>>>>>>>>>>>>>>>>> switch %s to ip: %s *****" % (invalid_ip, proxy_para['ip_port'])
        redis.put(new_ip)
Example No. 8
def main():
    # Assumes: cPickle, sqlite3, gzip and time are imported, and that geturls() and the
    # bfdone "already crawled" filter are defined elsewhere in the original script.

    done_que = RedisQueue('seed')
    run_que = RedisQueue('run')

    run_que.flushdb()

    conn = sqlite3.connect('site_data.db')
    conn.execute(
        "create table if not exists mainpages (id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)"
    )

    spend = 0
    cnt = 0
    size = 0
    while True:

        data = cPickle.loads(done_que.get())
        st = time.time()
        urls = geturls(data['url'], data['content'])
        if len(urls) == 0:
            continue

        for url in urls:
            if url not in bfdone:
                run_que.put(url)

        gziphtml = sqlite3.Binary(gzip.zlib.compress(data['content']))
        size += len(gziphtml)
        conn.execute(
            "insert into mainpages (url,headers,content) values (?,?,?)",
            (data['url'], str(data['headers']), gziphtml))

        et = time.time()
        spend += (et - st)
        cnt += 1
        if cnt % 10 == 0:
            print "cost:", spend / cnt, cnt, done_que.qsize(
            ), size / 1024 / 1024
            conn.commit()
Example No. 9
class ParserWorker():

    def __init__(self, in_queue_namespace, out_queue_namespace):
        
        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace

        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)

        print "Parser worker loaded"

    def run(self):

        while 1:
            xml_text = self.in_queue.get()
            print "Received XML"
            if xml_text == "None":
                self.out_queue.put("None")
                break

            json_doc = DataParser.parse_get_state_stats_resp(xml_text)
            print "Made JSON"
            self.out_queue.put(json_doc)
Example No. 10
class FetcherWorker:

    def __init__(self, in_queue_namespace, out_queue_namespace, apikey):

        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace
        self.apikey = apikey

        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)

        print "Fetcher loaded with apikey", self.apikey


    def run(self):

        while 1:

            base_url = self.in_queue.get()

            if base_url == "None":
                # add end-of-queue markers for parsers
                self.out_queue.put("None") 

                # ends program
                break

            url = base_url + self.apikey 
            
            t1 = time.time()
            
            print "fetching try 1", url

            # urllib2.urlopen raises URLError/HTTPError on failures instead of
            # returning a non-200 response, so wrap each attempt in try/except
            try:
                resp = urllib2.urlopen(url)
                self.out_queue.put(resp.read())
            except urllib2.URLError:
                print 'failed once', url
                time.sleep(10)
                print "fetching try 2", url
                try:
                    resp = urllib2.urlopen(url)
                    self.out_queue.put(resp.read())
                except urllib2.URLError:
                    print 'giving up on', url
            print "done fetching"

            # make sure we don't use the same API key within 2 seconds
            t2 = time.time()
            if t2 - t1 < 2.0:
                time.sleep(2.0 - (t2 - t1))
Example No. 11
#coding=utf-8
from RedisQueue import RedisQueue
redis = RedisQueue('0','testno1080')

with open("testcn1080.txt") as f:
    for line in f:
        proxy = line.strip() + ":1080"
        print(proxy)
        redis.put(proxy)
Example No. 12
    # tail of the user_agent(url) helper; req_header is built earlier in the original file
    req_timeout = 20
    req = urllib2.Request(url, None, req_header)
    page = urllib2.urlopen(req, None, req_timeout)
    return page


def next_page():
    base_url = 'http://jandan.net/ooxx/page-1006#comments'
    for i in range(3):
        html = user_agent(base_url).read()
        soup = BeautifulSoup(html)
        next_url = soup.find('a', {
            'class': 'next-comment-page',
            'title': 'Newer Comments'
        }).get('href')
        yield base_url
        base_url = next_url


# 'queue' and 'redis' are RedisQueue instances created earlier in the original script
for page in next_page():
    queue.put(page)
print 'There are %d pages' % queue.qsize()

while not queue.empty():
    page_url = queue.get()
    html = user_agent(page_url).read()
    soup = BeautifulSoup(html)
    img_urls = soup.find_all(['img'])
    for myimg in img_urls:
        Jpgurl = myimg.get('src')
        redis.put(Jpgurl)
print 'There are %d pictures' % redis.qsize()
Example No. 13
    if len(sys.argv) > 2:
        time_duration = float(sys.argv[2])   # optional sampling period in seconds

    start = time.time()
    print("Starting sensor readings at {}".format(start))

    previous = start
    try:
        while True:

            current = time.time()

            acc = mpu.get_accel_data()
            q.put("{:.4f}, {:.3f}, {:.3f}, {:.3f}\n".format(
                current - start, (acc['x'] - X_OFFSET), (acc['y'] - Y_OFFSET),
                (acc['z'] - Z_OFFSET)))

            if ((current - start) > monitor_time):
                q.put('finished')
                break

            current_sensed = time.time()

            #print("Sensed and written in {}".format(current_sensed-current))
            while (current_sensed - previous) < time_duration:
                current_sensed = time.time()
                continue
            previous = current_sensed

    except KeyboardInterrupt:
        # stop cleanly on Ctrl-C and tell the consumer we are done
        q.put('finished')
Example No. 14
#!/usr/bin/python
from RedisQueue import RedisQueue
import subprocess
import json
import base64
import sys
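# usage: <this script> <queue-name> <message-type> <payload-string>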

q = RedisQueue(sys.argv[1],
               namespace='ansible',
               host='internal-redis.ovmdvp.0001.use2.cache.amazonaws.com',
               port=6379,
               db=1)
q.put(json.dumps({'type': sys.argv[2], 'payload': sys.argv[3]}))
Example No. 15
from RedisQueue import RedisQueue
import time

q = RedisQueue('test')

for i in xrange(20):
    q.put(i)
    print i, "put into queue"
    time.sleep(0.5)

q.put("None")
Example No. 16
            combinations.append(feature)


    shuffle(combinations)

    print('starting', len(combinations))
    input()
    # clear the queue
    while not q.empty():
        q.get()
    print("empty")



    # start six workers; 'machine' is a worker class defined elsewhere in the original script
    for i in range(6):
        x = machine(df)
        print('starting...')
        x.start()

    for feature in combinations:
        q.put(feature)
    print('all put')

    # wait for the workers to drain the queue; Ctrl-C breaks out early
    while not q.empty():
        try:
            sleep(1)
        except KeyboardInterrupt:
            break

    del q
Example No. 17
#!/usr/bin/env python

# UniPi Python Control Panel
# stop_server.py
# requires Python 3.5 or later
# Author: Johannes Untiedt
# Version 10.0, 26.03.2018

from RedisQueue import RedisQueue
import threading

if __name__ == '__main__':
    print("stop_server.py started")
    lock = threading.Lock()
    q = RedisQueue('ws_2')
    payload = "close"
    with lock:
        q.put(payload)
    print("Stop_server.py send ", payload)
Example No. 18
from RedisQueue import RedisQueue
q = RedisQueue('test')
q.put("你好")                      # "hello": round-trips a UTF-8 string through Redis
print(q.get().decode('utf-8'))


Example No. 19
from RedisQueue import RedisQueue
from pymongo import MongoClient

if __name__ == '__main__':
    db = MongoClient()
    exists = db.zhihu.zhihu_answers
    exist_owners = []
    for e in exists.find():
        exist_owners.append(e['owner'])
    print(len(exist_owners))
    all_ids = [line.strip().split('\t')[0]
               for line in open('./user_followees.data')]
    candidates = list(set(all_ids) - set(exist_owners))
    queue = RedisQueue('answer_queue')
    queue.clear()
    print('Count: %d' % len(candidates))
    for c in candidates:
        queue.put(c)
Example No. 20
from flask import Flask
import celeryTask
from RedisQueue import RedisQueue

flask_app = Flask(__name__)


# Example URL to stop the currently running celery task
@flask_app.route("/terminate", methods=['GET'])
def terminate_task():
    result.revoke(terminate=True)
    return "Celery task terminated"


# Create a Redis queue, which lives on the Redis server.
q = RedisQueue('test')
q.put('Task 1')
q.put('Task 2')
q.put('Task 3')
q.put('Task 4')

print("Celery background tasks starting.....")

# This dispatches 'basic_celery_task' to a celery worker.
# Calling .delay() runs it in the background so Flask can keep serving web requests.
result = celeryTask.basic_celery_task.delay()
print("Started!")

flask_app.run(host="127.0.0.1", port=5001, threaded=True)
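
The celeryTask module is not shown above. As a rough, hypothetical counterpart consistent with the calls in this example (the module layout, broker URL and task body are assumptions, not the project's actual code), it could look like:

# celeryTask.py (hypothetical sketch)
from celery import Celery
from RedisQueue import RedisQueue
import time

app = Celery('celeryTask', broker='redis://localhost:6379/0')

@app.task
def basic_celery_task():
    # drain the shared RedisQueue in the background
    q = RedisQueue('test')
    while not q.empty():
        print("processing:", q.get())
        time.sleep(1)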
Example No. 21
def get_all_url(url, biz):
    """
    根据URL解析JSON,得到文章信息保存到Redis队列中
    :param url:
    :param biz:
    :return:
    """
    if biz is None:
        print("空")
        return
    q = RedisQueue(biz.strip())
    json_str = get_page_detail(url)
    json_re = parse_page_index(json_str)
    general_msg_list = parse_page_index(json_re['general_msg_list'])
    for list_re in general_msg_list['list']:
        print("当前biz为", biz)
        datetime = list_re['comm_msg_info']['datetime']
        try:
            title = list_re['app_msg_ext_info']['title']
            digest = list_re['app_msg_ext_info']['digest']
            content_url = list_re['app_msg_ext_info']['content_url']
            author = list_re['app_msg_ext_info']['author']
            print(url)
            # content = get_content(content_url)
            data1 = {
                'title': title,
                'digest': digest,
                'datetime': datetime,
                'content_url': content_url,
                'author': author
                # 'content': content
            }
            data1 = json.dumps(data1)
            q.put(data1)
            print(data1)

            for multi_app_msg_item_list in list_re['app_msg_ext_info']['multi_app_msg_item_list']:
                title = multi_app_msg_item_list['title']
                digest = multi_app_msg_item_list['digest']
                content_url = multi_app_msg_item_list['content_url']
                print(content_url)

                author = multi_app_msg_item_list['author']
                data2 = {
                    'title': title,
                    'digest': digest,
                    'datetime': datetime,
                    'content_url': content_url,
                    'author': author
                    # 'content': content
                }
                data2 = json.dumps(data2)
                q.put(data2)
                print(data2)
        except KeyError as e:
            print("error", e)

    # get the next_offset for the next page
    if len(general_msg_list['list']) < 10:
        return None
    return json_re['next_offset']
Example No. 22
def url_consumer(url_queue):
    while True:
        company_url = url_queue.get()
        download_data(company_url, 5)
        time.sleep(2)
        url_queue.task_done()   # only meaningful if the RedisQueue wrapper implements task_done()


def url_producer(name_queue, url_queue):
    while True:
        company_name = name_queue.get()
        download_url(company_name, 5)


#name_queue = queue.Queue()
i = 0
name_queue = RedisQueue("name" + str(i + 1))
csv_reader = csv.reader(open('./company.csv', encoding="utf8"))
for row in csv_reader:
    name_queue.put(row[0])
url_queue = RedisQueue("url" + str(i + 1))
for n in range(4):
    producer_thread = threading.Thread(target=url_producer,
                                       args=(
                                           name_queue,
                                           url_queue,
                                       ))
    producer_thread.start()
for n in range(5):
    consumer_thread = threading.Thread(target=url_consumer, args=(url_queue, ))
    consumer_thread.start()
#url_queue.join()