def store(topic):
    """Recursively persist a topic and every topic beneath it.

    Each child returned by ``topic.get_child()`` is stored first; the topic
    itself is then written with ``DataBase.put_topic_in_db`` and its name is
    printed.  A failure while writing or printing a single topic is logged
    and skipped so that the rest of the tree is still processed.

    Args:
        topic: Object exposing ``get_child()`` and ``get_topic_name()``.
            ``None`` is accepted and ignored.
    """
    if topic is None:
        return

    for child in topic.get_child():
        store(child)

    try:
        DataBase.put_topic_in_db(topic)
        # The parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print(topic.get_topic_name())
    except Exception as err:
        # Still best-effort, but unlike a bare ``except:`` this lets
        # KeyboardInterrupt / SystemExit propagate and leaves a trace.
        Logging.debug("Failed to store topic: %s" % err)
def user_spider(user):
    """Store a user, each of the user's followers, and the follow links.

    Args:
        user: Object exposing ``get_followers()``.  The user and every
            follower are passed to ``DataBase.put_user_in_db``; each
            (user, follower) pair is passed to
            ``DataBase.put_follow_user_in_db``.
    """
    DataBase.put_user_in_db(user)
    for fan in user.get_followers():
        DataBase.put_user_in_db(fan)
        DataBase.put_follow_user_in_db(user, fan)
def _mark_seen(lock, bloom, status_name, element_id):
    """Insert ``element_id`` into ``bloom`` and record its status under ``lock``."""
    lock.acquire()
    try:
        bloom.insert_element(element_id)
        Worm_status.record_status(status_name, bloom)
    finally:
        # Release even if the insert or the status write raises; otherwise
        # every other worker waiting on this lock would block forever.
        lock.release()


def _store_user_once(user):
    """Write ``user`` to the database unless ``userBloom`` already has its id."""
    user_id = user.get_user_id()
    if not userBloom.is_element_exist(user_id):
        DataBase.put_user_in_db(user)
        _mark_seen(userlock, userBloom, "userBloom", user_id)


def spider(question):
    """Crawl one question: its topics, answers, upvoters and followers.

    A question whose id is already in ``questionBloom`` is skipped with a
    debug message.  Otherwise the question is marked as seen and stored,
    followed by its topics, its not-yet-seen answers (with author and
    upvoters), and its followers.  Topics, answers and users are each
    de-duplicated through their own bloom filter.

    Args:
        question: Object exposing ``get_question_id()``, ``get_topics()``,
            ``get_answers()`` and ``get_followers()``.
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation had been lost; relation rows (question-topic, vote,
    # follow) are written for every item, entity rows only for unseen ones.
    # Confirm against the canonical file.
    question_id = question.get_question_id()
    if questionBloom.is_element_exist(question_id):
        Logging.debug("Exist Question")
        return

    _mark_seen(questionlock, questionBloom, "questionBloom", question_id)
    DataBase.put_question_in_db(question)

    Logging.info("Topics of question id %s" % question_id)
    for topictag in question.get_topics():
        topic_id = topictag.get_topic_id()
        if not topicBloom.is_element_exist(topic_id):
            DataBase.put_topic_in_db(topictag)
            _mark_seen(topiclock, topicBloom, "topicBloom", topic_id)
        DataBase.put_question_topic_in_db(question, topictag)

    Logging.info("Answers of question id %s" % question_id)
    for answer in question.get_answers():
        answer_id = answer.get_answer_id()
        if answerBloom.is_element_exist(answer_id):
            continue
        _mark_seen(answerlock, answerBloom, "answerBloom", answer_id)

        # The author object is only fetched when the author is new, so this
        # check stays inline instead of going through _store_user_once().
        author_id = answer.get_author_id()
        if not userBloom.is_element_exist(author_id):
            DataBase.put_user_in_db(answer.get_author())
            _mark_seen(userlock, userBloom, "userBloom", author_id)
        DataBase.put_answer_in_db(answer)

        for voter in answer.get_upvoters():
            _store_user_once(voter)
            DataBase.put_vote_in_db(answer, voter)

    Logging.info("Follower of question id %s" % question_id)
    for follower in question.get_followers():
        _store_user_once(follower)
        DataBase.put_follow_question_in_db(question, follower)
#time.sleep(0.1)
# NOTE(review): the disabled sleep above appears to be left over from the end
# of spider(); confirm whether it is still wanted.


def user_spider(user):
    """Store a user, each of the user's followers, and the follow links.

    Args:
        user: Object exposing ``get_followers()``.  The user and every
            follower are passed to ``DataBase.put_user_in_db``; each
            (user, follower) pair is passed to
            ``DataBase.put_follow_user_in_db``.
    """
    DataBase.put_user_in_db(user)
    for follower in user.get_followers():
        DataBase.put_user_in_db(follower)
        DataBase.put_follow_user_in_db(user, follower)


if __name__ == '__main__':
    import sys

    # Recursion limit raised far above the interpreter default.
    # NOTE(review): the reason is not visible in this file — confirm it is
    # still needed.
    sys.setrecursionlimit(1000000)
    THREADS = 10
    p = mp.Pool(processes=THREADS)
    try:
        topic = Topic("http://www.zhihu.com/topic/19554927")
        if not topicBloom.is_element_exist(topic.get_topic_id()):
            topicBloom.insert_element(topic.get_topic_id())
            Worm_status.record_status("topicBloom", topicBloom)
            DataBase.put_topic_in_db(topic)

        go = topic.get_questions()
        num = topic.get_question_num()
        # Hand the question iterator to the pool 20 items at a time; the
        # counter is decremented whether or not the batch succeeded.
        while num >= 0:
            try:
                p.map(spider, itertools.islice(go, 20))
            except AttributeError as err:
                # Still skipped, but no longer silently.
                Logging.debug("Skipped a question batch: %s" % err)
            finally:
                num -= 20
    finally:
        # Shut the worker processes down instead of leaking them.
        p.close()
        p.join()
Logging.debug("Exist Question")
# NOTE(review): the call above and the disabled sleep below look like the
# orphaned tail of spider()'s else branch; confirm whether they belong at
# module level.
#time.sleep(0.1)


def user_spider(user):
    """Store a user, each of the user's followers, and the follow links.

    Args:
        user: Object exposing ``get_followers()``.  The user and every
            follower are passed to ``DataBase.put_user_in_db``; each
            (user, follower) pair is passed to
            ``DataBase.put_follow_user_in_db``.
    """
    DataBase.put_user_in_db(user)
    for follower in user.get_followers():
        DataBase.put_user_in_db(follower)
        DataBase.put_follow_user_in_db(user, follower)


if __name__ == '__main__':
    import sys

    # Recursion limit raised far above the interpreter default.
    # NOTE(review): the reason is not visible in this file — confirm it is
    # still needed.
    sys.setrecursionlimit(1000000)
    THREADS = 10
    p = mp.Pool(processes=THREADS)
    try:
        topic = Topic("http://www.zhihu.com/topic/19554927")
        if not topicBloom.is_element_exist(topic.get_topic_id()):
            topicBloom.insert_element(topic.get_topic_id())
            Worm_status.record_status("topicBloom", topicBloom)
            DataBase.put_topic_in_db(topic)

        go = topic.get_questions()
        num = topic.get_question_num()
        # Hand the question iterator to the pool 20 items at a time; the
        # counter is decremented whether or not the batch succeeded.
        while num >= 0:
            try:
                p.map(spider, itertools.islice(go, 20))
            except AttributeError as err:
                # Still skipped, but no longer silently.
                Logging.debug("Skipped a question batch: %s" % err)
            finally:
                num -= 20
    finally:
        # Shut the worker processes down instead of leaking them.
        p.close()
        p.join()