示例#1
0
class spider(object):
    '''
    A base Spider for v2ex.

    Creating an instance does the whole job: it sleeps for ``sleep_time``
    seconds, opens the database, downloads ``url``, stores every topic of
    the JSON answer and finally closes the database again.
    '''
    def __init__(self, url, sleep_time):
        '''
        >>>from v2ex_spider import base_spider
        >>>base_spider.start(url,sleep_time)

        :param url: address that answers with a JSON list of topics.
        :param sleep_time: seconds to wait before any work is done; the
            value is passed through ``int()``.
        :raises APIError: when the server answers with a non-200 status.
        '''
        self.url = url
        self.sleep_time = sleep_time
        time.sleep(int(self.sleep_time))
        self.SQ = SQL()
        self.SQ.open_datebase()
        try:
            #run
            self.load_config()
            self.spider()
        finally:
            #end
            # Always release the database, whatever went wrong above
            # (network error, bad JSON, missing key, APIError, ...).
            self.SQ.close_datebase()

    def spider(self):
        '''
        Download ``self.url`` and write each topic to the database.

        The database is NOT closed here; ``__init__`` does that in its
        ``finally`` clause so that it happens exactly once on every path.

        :raises APIError: when the response status code is not 200.
        '''
        # A timeout keeps a dead connection from blocking the worker forever.
        resp = self.s.get(self.url, timeout=10)
        if resp.status_code != 200:
            error_info = 'proxy status: %s, proxy: %s' % (str(
                settings.proxy_enable), str(self.s.proxies))
            raise APIError(error_info)
        for topic in resp.json():
            member = topic["member"]
            self.SQ.write_to_db_base(
                topic["id"],
                topic["title"],
                member["username"],
                member["id"],
                topic["content"],
                topic["content_rendered"],
                topic["replies"],
                topic["node"]["id"],
                topic["created"],
                int(time.time()),  # moment the row was fetched
            )
        # One commit for the whole batch.
        self.SQ.conn.commit()
        return

    def load_config(self):
        '''
        Build the HTTP session ``self.s`` from the project settings.

        Headers always come from ``settings.API_headers``; proxies are only
        installed when ``settings.proxy_enable`` is truthy.
        '''
        self.proxy_enable = settings.proxy_enable
        self.s = requests.session()
        self.s.headers = settings.API_headers
        if self.proxy_enable:
            self.s.proxies = settings.proxies
        return
示例#2
0
class Start(object):
    '''
    Start the project.

    Building an instance opens the SQL database and a Redis connection,
    prepares the HTTP session, loads the persisted state files and refreshes
    the node table when it is older than three hours. ``Mode1`` or ``Mode2``
    then runs one scheduling pass and always finishes through ``end``.

    State kept between runs (files in the current working directory):

    * ``.time_log.json``    - name -> unix timestamp (as a string) of the
      last time each periodic job ran.
    * ``.node_number.json`` - list of node ids collected by
      ``update_nodes`` and consumed by ``tasker``.
    * ``.topics_tester.json`` - topic ids already handed to the tester queue.
    '''
    def __init__(self):
        '''
        $ python run.py
        or
        $ ./Run.sh
        '''
        logging.info('start')
        logging.debug('open sql database.')
        self.SQ = SQL()
        self.SQ.open_datebase()
        self.redis_conn = Redis()
        self.load_config()
        #base
        self.load_json()
        #         self.update_cookies()
        try:
            self.update_nodes()
        except APIError:
            # update_nodes() has already logged the failure; the run goes on
            # with whatever node data is in the database.
            pass

    def Mode1(self):
        '''
        Run the RSS spider, the node queues, the topic re-enqueue and the
        tester queue, then shut down.

        ``end`` runs even when one of the steps raises, so the database is
        closed and the state files are written on every path.
        '''
        logging.info('start mode1')
        try:
            #start
            self.get_rss()
            self.tasker()
            self.topic_ids_enqueue()
            self.tester_tasker()
        finally:
            #end
            self.end()

    def Mode2(self):
        '''
        Same as ``Mode1`` but without the per-node queues (``tasker``).

        ``end`` runs even when one of the steps raises.
        '''
        logging.info('start mode2')
        try:
            #start
            self.get_rss()
            self.topic_ids_enqueue()
            self.tester_tasker()
        finally:
            #end
            self.end()

    def end(self):
        '''Close the database and persist the state files.'''
        self.SQ.close_datebase()
        self.dump_json()
        logging.info('end')

    def load_json(self):
        '''
        Load ``self.time_log`` and ``self.node_number`` from disk.

        Values found in ``.time_log.json`` are laid over the built-in
        defaults, so a file written before a job name existed still yields
        an entry (``'0'``, i.e. "never ran") for that job.
        '''
        logging.debug('load json')
        #load .time_log.json
        self.time_log = {
            'cookies_time': '0',
            'nodes_time': '0',
            '8000_node': '0',
            '4000_node': '0',
            '1000_node': '0',
            '500_node': '0',
            '0_node': '0',
            'rss_time': '0',
            'tester': '0',
            'topic_id_reenqueue': '0'
        }
        if os.path.exists('.time_log.json'):
            with open('.time_log.json', 'r') as f:
                self.time_log.update(json.load(f))
        #load .node_number.json
        if os.path.exists('.node_number.json'):
            with open('.node_number.json', 'r') as f:
                self.node_number = json.load(f)
        else:
            self.node_number = list()
        return

    def dump_json(self):
        '''Write ``self.time_log`` and the de-duplicated ``self.node_number``.'''
        #dump .time_log.json
        with open('.time_log.json', 'w') as f1:
            json.dump(self.time_log, f1)
        #dump .node_number.json
        self.node_number = list(set(self.node_number))
        with open('.node_number.json', 'w') as f2:
            json.dump(self.node_number, f2)
        return

    def topic_ids_enqueue(self):
        '''
        Every 30 minutes, re-enqueue the topic ids from ``max_id - 2000`` to
        ``max_id - 29`` through the ``topic_id_reenqueue`` module.
        '''
        if int(time.time()) - int(self.time_log['topic_id_reenqueue']) < 1800:
            return
        logging.info('start topic id reenqueue')
        max_id = topic_id_reenqueue.max_id
        topic_id_reenqueue.reenqueue_m(max_id - 2000, max_id - 29)
        self.time_log['topic_id_reenqueue'] = str(int(time.time()))
        return

    def update_cookies(self):
        '''
        Log in again and save the cookies when ``cookies.txt`` is missing or
        the last refresh is at least four days old.

        A ``log_in.LogError`` aborts the refresh without touching the
        timestamp, so the next run tries again.
        '''
        expired = (int(time.time()) -
                   int(self.time_log["cookies_time"]) >= 86400 * 4)
        if not os.path.exists('cookies.txt') or expired:
            logging.debug('update cookies')
            try:
                log_s = log_in.v2ex_log_in()
                log_s.log_in(3)
                log_s.save_cookies()
            except log_in.LogError:
                return
            self.time_log["cookies_time"] = str(int(time.time()))
        return

    def update_nodes(self):
        '''
        Refresh the node table from the API when the last refresh is at
        least three hours (10800 s) old.

        ``self.node_number`` is de-duplicated on the way out, whether or
        not a refresh happened.

        :raises APIError: when the API answers with a non-200 status.
        '''
        try:
            if int(time.time()) - int(self.time_log["nodes_time"]) >= 10800:
                self._fetch_and_store_nodes()
        finally:
            self.node_number = list(set(self.node_number))
        return

    def _fetch_and_store_nodes(self):
        '''Download all nodes, store them and stamp ``nodes_time``.'''
        logging.info('update nodes')
        try:
            resp = self.s.get('https://www.v2ex.com/api/nodes/all.json',
                              timeout=10)
        except requests.exceptions.RequestException as e:
            # Network trouble is logged and tolerated; the timestamp stays
            # untouched so the next run retries.
            self._log_update_node_failure(e)
            return
        if resp.status_code != 200:
            error = APIError('update_node')
            self._log_update_node_failure(error)
            raise error
        for node in resp.json():
            n_id = node["id"]
            name = node["name"]
            url = node["url"]
            title = node["title"]
            title_alternative = node["title_alternative"]
            topics = node["topics"]
            header = node["header"]
            footer = node["footer"]
            created = node["created"]
            n_time = int(time.time())
            # node_test() is asked BEFORE the row is overwritten.
            # NOTE(review): presumably it reports a changed topic count;
            # confirm against SQL.node_test.
            if self.SQ.node_test(n_id, topics) is True:
                self.node_number.append(int(n_id))
            self.SQ.write_to_db_node(n_id, name, url, title,
                                     title_alternative, topics, header,
                                     footer, created, n_time)
        self.time_log["nodes_time"] = str(int(time.time()))

    def _log_update_node_failure(self, error):
        '''Log a failed node refresh together with the proxy in use.'''
        logging.error('update_node failed.')
        logging.error('proxy_status: %s' % settings.i_proxy_enable)
        if settings.i_proxy_enable is True:
            logging.error('proxy: %s' % self.s.proxies)
        logging.error(error)

    def tasker(self):
        '''
        Enqueue one ``node_spider.start`` job per node, tier by tier.

        Nodes are grouped into five tiers by topic count. A tier is enqueued
        only when its own interval has elapsed; the interval is the shorter
        one while the local hour is 08..23 or 00..01 and the longer one
        otherwise. For the two smallest tiers (``node4``/``node5``) only
        nodes listed in ``self.node_number`` are enqueued, and each is
        removed from that list once enqueued.
        '''
        # (sql, sleep_time, day interval, night interval, time_log key, queue)
        # NOTE(review): BETWEEN is inclusive, so a node with exactly 8000,
        # 3000, 1000 or 100 topics matches two neighbouring tiers.
        tiers = (
            ('SELECT ID FROM NODES WHERE topics >= 8000;',
             5, 900, 1800, '8000_node', 'node1'),
            ('SELECT ID FROM NODES WHERE topics BETWEEN 3000 AND 8000;',
             10, 1800, 3600, '4000_node', 'node2'),
            ('SELECT ID FROM NODES WHERE topics BETWEEN 1000 AND 3000;',
             20, 7200, 14400, '1000_node', 'node3'),
            ('SELECT ID FROM NODES WHERE topics BETWEEN 100 AND 1000;',
             90, 86400, 86400, '500_node', 'node4'),
            ('SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 100;',
             90, 86400, 86400, '0_node', 'node5'),
        )
        # NOTE(review): rebinding time.tzname does not change the zone used
        # by time.strftime(); the hour below is the process's local hour.
        # Kept because other code may read the attribute.
        time.tzname = ('CST', 'CST')
        hour = int(time.strftime('%H'))
        day_time = hour >= 8 or hour < 2
        for sql, sleep_time, day_gap, night_gap, time_log_name, queue_name in tiers:
            between_time = day_gap if day_time else night_gap
            q_node = Queue(queue_name, connection=self.redis_conn)
            if int(time.time()) - int(
                    self.time_log[time_log_name]) < between_time:
                continue
            logging.info('start enqueue, queue name: %s' % queue_name)
            self.SQ.cursor.execute(sql)
            listed_only = queue_name in ('node4', 'node5')
            for row in self.SQ.cursor.fetchall():
                node_id = row[0]
                if listed_only:
                    if node_id not in self.node_number:
                        continue
                    self.node_number.remove(int(node_id))
                q_node.enqueue(node_spider.start, node_id, sleep_time)
            self.time_log[time_log_name] = str(int(time.time()))
        return

    def get_rss(self):
        '''
        Run the RSS spider at most once every ten minutes.

        A ``requests`` error is logged and tolerated; the timestamp is
        updated either way, so a failing feed is not retried before the
        interval has passed again.
        '''
        if int(time.time()) - int(self.time_log["rss_time"]) < 600:
            return
        logging.debug('start get_rss')
        try:
            rss_spider.Rss_spider()
        except requests.exceptions.RequestException as e:
            logging.warning('get_rss failed: %s' % e)
        self.time_log["rss_time"] = str(int(time.time()))
        return

    def load_config(self):
        '''
        Build the HTTP session ``self.s`` from the project settings.

        Proxies are only installed when ``settings.i_proxy_enable`` is truthy.
        '''
        logging.debug('load config')
        self.proxy_enable = settings.i_proxy_enable
        self.s = requests.session()
        self.s.headers = settings.API_headers
        if self.proxy_enable:
            self.s.proxies = settings.i_proxies()
        return

    def tester_tasker(self):
        '''
        Every 30 minutes, enqueue ``topic_tester.start`` for topics that
        have not been handed to the tester queue before.

        Candidates are the topics whose stored ``time`` is less than
        345600 s (4 days) after ``created``, that have no row in ``STATUS``
        and that were created more than 1209600 s (14 days) ago. Ids that
        were enqueued are remembered in ``.topics_tester.json``.
        '''
        if int(time.time()) - int(self.time_log["tester"]) < 1800:
            return
        logging.info('start enqueue tester')
        #load json
        if os.path.exists('.topics_tester.json'):
            with open('.topics_tester.json', 'r') as f:
                already_sent = set(json.load(f))
        else:
            already_sent = set()
        #main
        sql = "SELECT ID FROM TOPIC WHERE (time - created) < 345600 AND ID NOT IN (SELECT T_ID FROM STATUS) AND (STRFTIME('%s','now') - created) > 1209600;"
        sleep_time = 20
        self.SQ.cursor.execute(sql)
        topic_ids = [x[0] for x in self.SQ.cursor.fetchall()]
        q = Queue('tester', connection=self.redis_conn)
        for topic_id in topic_ids:
            # Set membership keeps this loop linear in the number of topics.
            if topic_id not in already_sent:
                q.enqueue(topic_tester.start, topic_id, sleep_time)
                already_sent.add(topic_id)
        #end
        with open('.topics_tester.json', 'w') as f:
            json.dump(list(already_sent), f)
        self.time_log["tester"] = str(int(time.time()))
        return
示例#3
0
class Rss_spider(object):
    '''
    A Spider for v2ex's Rss.
    Get the latest and hot topic on the index.
    Using the rss generate the topic list that need to spider.

    Creating an instance does all the work: it stores the topics returned
    by the "latest" and "hot" API endpoints, then enqueues a
    ``topic_spider.start`` job for every topic id found in the RSS feeds
    that is neither in the database nor in ``.topics_all.json``.
    '''

    def __init__(self):
        '''
        >>>from v2ex_spider import rss_spider
        >>>rss_spider.Rss_spider()

        :raises requests.exceptions.RequestException: when one of the API
            requests fails; the database is closed before it propagates.
        '''
        logging.info('start Rss spider')
        self.v2ex_rss_url_list = [
            'https://www.v2ex.com/index.xml',
            'https://www.v2ex.com/feed/tab/qna.xml',
            'https://www.v2ex.com/feed/tab/jobs.xml',
            'https://www.v2ex.com/feed/tab/deals.xml',
            'https://www.v2ex.com/feed/tab/city.xml',
            'https://www.v2ex.com/feed/tab/play.xml',
            'https://www.v2ex.com/feed/tab/apple.xml',
            'https://www.v2ex.com/feed/tab/creative.xml',
            'https://www.v2ex.com/feed/tab/tech.xml',
        ]
        self.latest_hot_api = [
            'https://www.v2ex.com/api/topics/latest.json',
            'https://www.v2ex.com/api/topics/hot.json',
        ]
        # Passed to every enqueued topic_spider.start job.
        self.topic_sleep_time = 10
        logging.debug('open sql database')
        self.SQ = SQL()
        self.SQ.open_datebase()
        try:
            self.redis_conn = Redis()
            self.load_config()
            #run
            try:
                self.latest_and_hot()
            except APIError:
                # Already logged by latest_and_hot(); the RSS part below
                # does not depend on it.
                pass
            self.gen_topic_queue()
        finally:
            #end
            # Release the database on every path, including the request
            # errors that latest_and_hot() re-raises.
            self.SQ.close_datebase()
        logging.info('end the Rss spider')

    def topics_id_rss(self):
        '''
        Collect the topic ids linked from every configured RSS feed.

        Only the ``link`` of each feed item is read. Items whose link does
        not contain a ``t/<digits>`` part are skipped.

        :return: set of topic ids (ints).
        '''
        logging.debug('fetch rss feeds')
        topic_ids = set()
        for v2ex_rss_url in self.v2ex_rss_url_list:
            feed = feedparser.parse(v2ex_rss_url)
            logging.debug('fetch rss feed: %s' % v2ex_rss_url)
            for item in feed["items"]:
                found = re.search(r't\/(\d+)#?', item["link"])
                if found is None:
                    continue
                topic_ids.add(int(found.group(1)))
        return topic_ids

    def topics_id_sqlite(self):
        '''
        Return the ids of all rows in the ``TOPIC`` table.

        :return: list with the first column of every row.
        '''
        logging.debug('SELECT ID FROM TOPIC')
        sql = 'SELECT ID FROM TOPIC;'
        self.SQ.cursor.execute(sql)
        return [row[0] for row in self.SQ.cursor.fetchall()]

    def latest_and_hot(self):
        '''
        Store the topics returned by the "latest" and "hot" endpoints.

        Each endpoint's topics are committed as one batch.

        :raises requests.exceptions.RequestException: on a failed request
            (logged, then re-raised).
        :raises APIError: when an endpoint answers with a non-200 status.
        '''
        logging.debug('start latest_and_hot')
        for url in self.latest_hot_api:
            try:
                resp = self.s.get(url, timeout=10)
            except requests.exceptions.RequestException as e:
                self._log_latest_hot_failure(e)
                raise
            if resp.status_code != 200:
                error = APIError('latest_and_hot')
                self._log_latest_hot_failure(error)
                raise error
            for topic in resp.json():
                member = topic["member"]
                self.SQ.write_to_db_base(
                    topic["id"],
                    topic["title"],
                    member["username"],
                    member["id"],
                    topic["content"],
                    topic["content_rendered"],
                    topic["replies"],
                    topic["node"]["id"],
                    topic["created"],
                    int(time.time()),  # moment the row was fetched
                )
            self.SQ.conn.commit()
        return

    def _log_latest_hot_failure(self, error):
        '''Log a failed API call together with the proxy in use.'''
        logging.error('latest_and_hot error')
        logging.error('proxy_status: %s' % self.proxy_enable)
        if self.proxy_enable is True:
            logging.error('proxy: %s' % self.s.proxies)
        logging.error(error)

    def gen_topic_queue(self):
        '''
        Enqueue a ``topic_spider.start`` job for every unseen RSS topic.

        Nothing is done while the ``TOPIC`` table holds 2000 rows or fewer.
        A topic is "unseen" when its id is neither in the table nor in
        ``.topics_all.json``. Afterwards that file is rewritten with the
        union of the old file, the RSS ids and the table ids.
        '''
        logging.debug('start topic enqueue')
        topics_sql = self.topics_id_sqlite()
        if len(topics_sql) <= 2000:
            return
        topics_rss = self.topics_id_rss()
        # load topics
        if os.path.exists('.topics_all.json'):
            with open('.topics_all.json', 'r') as f:
                tmp_topics = json.load(f)
        else:
            tmp_topics = list()
        t_queue = Queue('topic', connection=self.redis_conn)
        # gen queue
        # One set for membership keeps the check O(1) per RSS topic instead
        # of scanning two potentially large lists each time.
        known = set(topics_sql)
        known.update(tmp_topics)
        for topic in topics_rss:
            if topic not in known:
                t_queue.enqueue(topic_spider.start, int(topic),
                                self.topic_sleep_time)
        #save topics
        known.update(topics_rss)
        with open('.topics_all.json', 'w') as f:
            json.dump(list(known), f)
        return

    def load_config(self):
        '''
        Build the HTTP session ``self.s`` from the project settings.

        Proxies are only installed when ``settings.i_proxy_enable`` is truthy.
        '''
        logging.debug('load config')
        self.proxy_enable = settings.i_proxy_enable
        self.s = requests.session()
        self.s.headers = settings.API_headers
        if self.proxy_enable:
            self.s.proxies = settings.i_proxies()