def get_base_url(self):
    # Return a random proxy mirror when proxying is enabled,
    # otherwise hit weibo.cn directly.
    if self.use_proxy:
        return get_random_proxy()
    return "https://weibo.cn"
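# The scripts below push start URLs into Redis through a client `r` that is
# created elsewhere in the project. A minimal sketch of that setup, assuming
# LOCAL_REDIS_HOST / LOCAL_REDIS_PORT config names (not confirmed by the
# source):
import redis

r = redis.StrictRedis(host=LOCAL_REDIS_HOST, port=LOCAL_REDIS_PORT, db=0)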
import os

# Read seed user IDs and full URLs from sina/seeds.txt.
# (r, PROXY_BASEURL and get_random_proxy are assumed to come from the
# project's shared config/helpers.)
file_path = os.path.join(os.getcwd(), 'sina', 'seeds.txt')
start_uids = []
start_urls = []
with open(file_path, 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines; indexing line[0] on them would raise.
            continue
        if line[0].isdigit():
            # Lines like "1234567890 # note": keep only the ID.
            userid = line.split('#')[0].strip()
            start_uids.append(userid)
        elif line.startswith("http"):
            start_urls.append(line)

# Push profile-page URLs to Redis for the profile spider.
for uid in start_uids:
    if PROXY_BASEURL:
        base_url = get_random_proxy()
    else:
        base_url = "https://weibo.cn"
    start_url = base_url + ("/%s/info" % uid)
    print("[DEBUG] URL: " + start_url)
    r.lpush('weibo_user_profile_spider:start_urls', start_url)

for url in start_urls:
    if PROXY_BASEURL:
        url = url.replace("https://weibo.cn", get_random_proxy())
    print("[DEBUG] URL: " + url)
    r.lpush('weibo_user_profile_spider:start_urls', url)

print('Redis initialized')
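# Example sina/seeds.txt, with the format inferred from the parser above:
# numeric lines are user IDs (anything after '#' is treated as a comment),
# and lines starting with "http" are pushed as-is. The IDs shown are
# placeholders:
#
#   1234567890  # some account of interest
#   2345678901
#   https://weibo.cn/u/3456789012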
from pymongo import MongoClient

client = MongoClient(LOCAL_MONGO_HOST, LOCAL_MONGO_PORT)
profiles_collection = client[DB_NAME]['user_profiles']

# Select profiles whose current timeline crawl job is unfinished,
# optionally restricted to one group.
query = {"timelineCrawlJob_current_complete": False}
if PROFILE_GROUP > 0:
    query["group"] = PROFILE_GROUP
seeds = profiles_collection.find(query)
print(profiles_collection.count_documents(query), "profiles found")

# Queue each profile's next timeline page for the timeline spider.
for seed in seeds:
    if PROXY_BASEURL:
        base_url = get_random_proxy()
    else:
        base_url = "https://weibo.cn"
    start_url = base_url + '/{}?page={}'.format(
        seed['_id'], seed['timelineCrawlJob_current_page'])
    print("[DEBUG] start url: " + start_url)
    r.lpush('weibo_user_timeline_spider:start_urls', start_url)

print('Redis initialized')
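# On the consuming side, each spider is a scrapy-redis RedisSpider that pops
# URLs from the matching Redis list. A minimal sketch (the class name and
# parse body are assumptions; only the redis_key mirrors what is pushed
# above):
from scrapy_redis.spiders import RedisSpider

class UserTimelineSpider(RedisSpider):
    name = 'weibo_user_timeline_spider'
    redis_key = 'weibo_user_timeline_spider:start_urls'

    def parse(self, response):
        # Parse one timeline page and emit items here.
        pass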
import random

# Queue image URLs for statuses that have not been image-crawled yet
# (img_crawl_status == 0), up to CRAWL_BATCH_SIZE per run.
mydoc = collection.find({"img_crawl_status": 0}).limit(CRAWL_BATCH_SIZE)
queued = collection.count_documents({"img_crawl_status": 0},
                                    limit=CRAWL_BATCH_SIZE)
print("Number of queued URLs: " + str(queued))

for x in mydoc:
    if not x["img_truncated"]:
        # Non-truncated status: build the image URL from the stored image ID
        # on one of the wx*.sinaimg.cn image servers.
        img_id = x["multi_img_ids"]
        if PROXY_BASEURL:
            image_server_number = random.randint(1, 4)
            base_url = get_random_proxy("http://wx%d.sinaimg.cn/" % image_server_number)
        else:
            base_url = "http://wx1.sinaimg.cn"
        img_url = base_url + "/large/" + img_id
        r.lpush('weibo_image_spider:start_urls', img_url)
    else:
        # Truncated status: queue the multi-image page so the spider can
        # collect the image IDs from there.
        if PROXY_BASEURL:
            base_url = get_random_proxy("https://weibo.cn/")
        else:
            base_url = "https://weibo.cn"
        multi_img_url = x["multi_imgs_page_url"].replace("https://weibo.cn", base_url)
        r.lpush('weibo_image_spider:start_urls', multi_img_url)
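# A pipeline or downstream step presumably flips img_crawl_status once the
# images are fetched, so a document is not re-queued on the next batch. A
# hedged sketch of that update (the field semantics and status value 1 are
# assumptions, not confirmed by the source):
def mark_images_crawled(doc_id):
    # Mark one status document as image-crawled.
    collection.update_one({"_id": doc_id}, {"$set": {"img_crawl_status": 1}})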