def migrate_videos_to_server():
    with MongoDBCollection("website_pron", "video_info", host="192.168.1.103") as remote_coll:
        with MongoDBCollection("website_pron", "video_info") as local_coll:
            with ExtSSHConnection("192.168.1.103", "yuanyifan", "979323") as ext_ssh:
                for doc in local_coll.find({"source": {"$ne": None}}):
                    if remote_coll.find_one({"source": doc["source"]}) is None:
                        print("Uploading...")
                        print(doc)
                        _id = doc["_id"]
                        # allocate the next free _id on the remote: current max + 1
                        current_index = int(
                            remote_coll.find({}, {"_id": 1})
                            .sort("_id", -1).limit(1)[0]["_id"]) + 1
                        insert_doc = doc  # note: alias, not a copy; doc["_id"] is overwritten too
                        insert_doc["_id"] = current_index
                        ext_ssh.sftp_conn.put(
                            shortcuts_saving_path % _id,
                            remote_video_preview_path % current_index)
                        ext_ssh.sftp_conn.put(
                            video_saving_path % _id,
                            remote_video_path % current_index)
                        remote_coll.insert_one(insert_doc)
                        print("Uploaded.")

def migrate_videos_to_remote():
    with MongoDBCollection("website_pron", "video_info", host="192.168.1.103") as remote_coll:
        with MongoDBCollection("website_pron", "video_info") as local_coll:
            with ExtSSHConnection("192.168.1.103", "yuanyifan", "979323") as ext_ssh:
                for doc in local_coll.find({}):
                    video_id = int(doc["_id"])
                    if remote_coll.find_one({"_id": video_id}) is None:
                        print("Found a log to migrate:\n%s" % json.dumps(doc, indent=2))
                        ext_ssh.sftp_conn.put(video_saving_path % video_id,
                                              remote_video_path % video_id)
                        ext_ssh.sftp_conn.put(shortcuts_saving_path % video_id,
                                              remote_video_preview_path % video_id)
                        remote_coll.insert_one(doc)
                        print("Migrated a log.")

def migrate_images_to_remote():
    with MongoDBCollection("website_pron", "images_info", host="192.168.1.103") as remote_coll:
        with MongoDBCollection("website_pron", "images_info") as local_coll:
            with ExtSSHConnection("192.168.1.103", "yuanyifan", "979323") as ext_ssh:
                # every post from the DAGUERRE board
                for doc in local_coll.find({"block": "DAGUERRE"}):
                    image_page_index = int(doc["_id"])
                    print("Uploading...%08d" % image_page_index)
                    local_path = local_image_list_path % {"page_index": image_page_index}
                    remote_path = remote_images_path % image_page_index
                    ext_ssh.upload_dir(local_path, remote_path)
                    remote_coll.insert_one(doc)

def count_faces():
    def weed_url_to_fid(weed_url: str) -> str:
        # "volume/fid" -> "volume,fid", the form SeaweedFS expects
        parts = weed_url.split("/")
        return parts[0] + "," + parts[1]

    try:
        with MongoDBCollection("website_pron", "images_info_ahash_weed") as coll:
            indexes = get_face_uncounted_index(coll)
            print("{} image lists uncounted.".format(len(indexes)))
            for i in indexes:
                try:
                    info = coll.find_one({"_id": i})
                    face = count_face_in_weed([
                        weed_url_to_fid(image_url)
                        for image_url in info["image_list"]
                    ], 1.1, 5)
                    write_face_count(coll, i, face)
                    print("There are %d faces in index=%d" % (face, i))
                except Exception:
                    print("Error while counting faces in %d" % i)
                    print(traceback.format_exc(), file=sys.stderr)
    except Exception:
        print("Error while counting faces:\n" + traceback.format_exc(),
              file=sys.stderr)

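# A minimal sketch of what count_face_in_weed could look like, assuming the
# (1.1, 5) arguments above are OpenCV detectMultiScale's scaleFactor and
# minNeighbors, and that fids are fetched straight from a SeaweedFS volume
# server over HTTP. weed_volume_url, the fetch mechanism, and the function
# name are assumptions, not part of the original code.
import cv2
import numpy as np
import requests

_face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml")


def count_face_in_weed_sketch(fids, scale_factor, min_neighbors,
                              weed_volume_url="http://127.0.0.1:8080"):
    total = 0
    for fid in fids:
        resp = requests.get("%s/%s" % (weed_volume_url, fid), timeout=6)
        # decode the raw bytes straight to a grayscale OpenCV image
        img = cv2.imdecode(np.frombuffer(resp.content, dtype=np.uint8),
                           cv2.IMREAD_GRAYSCALE)
        if img is None:
            continue
        faces = _face_cascade.detectMultiScale(
            img, scaleFactor=scale_factor, minNeighbors=min_neighbors)
        total += len(faces)
    return total
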
def process_page_url(url):
    try:
        with MongoDBCollection("website_pron", "images_info_ahash_weed") as coll:
            if is_url_existed(coll, url.replace("http://%s/" % caoliu_host, "/")):
                raise Exception("URL already exists in db!")
            page_soup = get_soup(url)
            title = get_page_title(page_soup)
            images = get_page_images(page_soup)
            text = get_page_text(page_soup)
            # download the image files: submit every job first so the pool
            # runs them concurrently, then drain the results lazily
            image_futures = [download_pool.submit(GET_to_weed_hash, img) for img in images]
            images_buffer = (future.result() for future in image_futures)
            page_index = insert_log(coll, title, url, text, images, [
                img_info["weed_fid"].replace(",", "/") + "/" +
                img_info["_id"] + "." + img_info["file_type"]
                for img_info in images_buffer if img_info is not None
            ])
            print("Downloaded: {}-->{}".format(url, page_index))
    except Exception as ex:
        if str(ex).find("URL already exists") < 0:
            print("Error while downloading..." + traceback.format_exc())

def remove_invalid_dirs():
    with MongoDBCollection("website_pron", "images_info") as coll:
        id_list = {int(x) for x in os.listdir(local_images_path)}
        mg_id_list = {x["_id"] for x in coll.find({}, {"_id": 1})}
        id_dump = id_list.difference(mg_id_list)
        for _id in id_dump:
            shutil.rmtree(local_image_list_path % {"page_index": _id})

def process_page_url(url):
    try:
        with MongoDBCollection("website_pron", "images_info") as coll:
            if is_url_existed(coll, url.replace("http://%s/" % caoliu_host, "/")):
                raise Exception("URL already exists in db!")
            page_soup = get_soup(url)
            title = get_page_title(page_soup)
            images = get_page_images(page_soup)
            text = get_page_text(page_soup)
            # download the image files concurrently
            img_task_list = [
                LiteDataDownloader(image_url=img, tag="%d%s" % (i, get_extension(img)))
                for i, img in enumerate(images)
            ]
            for task in img_task_list:
                task.start()
            for task in img_task_list:
                task.join()
            page_index = insert_log(coll, title, url, text, images)
            page_path = os.path.join(local_images_path, "%08d" % page_index)
            os.makedirs(page_path, exist_ok=True)
            # flush the downloaded images to disk
            for task in img_task_list:
                task.write_file(os.path.join(page_path, task.tag))
            print("Downloaded: %s" % url)
    except Exception as ex:
        if str(ex).find("URL already exists") < 0:
            print("Error while downloading..." + traceback.format_exc())

def remove_remote_DAGUERRE():
    # input() returns a string, so the original truthiness check passed for any
    # non-empty answer; require an explicit "y" instead
    assert input("Delete all DAGUERRE? (y/n): ").lower() == "y", "Paused"
    with MongoDBCollection("website_pron", "images_info", host="192.168.1.103") as remote_coll:
        with ExtSSHConnection("192.168.1.103", "yuanyifan", "979323") as ext_ssh:
            # every post from the DAGUERRE board
            daguerre_ids = [int(doc["_id"]) for doc in remote_coll.find({"block": "DAGUERRE"})]
            for daguerre_id in daguerre_ids:
                print("Removing...%08d" % daguerre_id)
                ext_ssh.run_command('rm -rf "%s"' % (remote_images_path % daguerre_id))
                remote_coll.delete_one({"_id": daguerre_id})

def clean_invalid_files():
    with MongoDBCollection("website_pron", "video_info") as coll:
        id_list = {x["_id"] for x in coll.find({}, {"_id": 1})}
        # strip the 5-character filename prefix and the ".mp4" suffix to get the id
        video_id_list = [
            int(filename[5:-4]) for filename in os.listdir(videos_path)
            if filename.endswith("mp4")
        ]
        for video_id in video_id_list:
            if video_id not in id_list:
                shutil.move(video_saving_path % video_id, trash_video_file % video_id)

def remove_trash_images():
    """
    Remove spam image posts matching the rule
    {"comment": {"$regex": "at\\.umeng\\.com"}}
    :return: number of deleted documents
    """
    with MongoDBCollection("website_pron", "images_info") as coll:
        return coll.delete_many({
            "comment": {"$regex": r"at\.umeng\.com"}
        }).deleted_count

def get_xhamster_detail(request):
    url = request.GET["url"]
    if not str(url).startswith("http"):
        url = "https://m.xhamster.com/videos/" + url
    with MongoDBCollection("spider", "xhamster_storage") as coll:
        doc = coll.find_one({"_id": url})
        if doc is None:
            data = query_url(url)
            data.pop("related")
            coll.insert_one(data)
            return data
        else:
            return doc

def set_xhamster_rate(req):
    url = req.POST["url"]
    if not str(url).startswith("http"):
        url = "https://m.xhamster.com/videos/" + url
    rate = int(req.POST["rate"])
    with MongoDBCollection("spider", "xhamster_storage") as coll:
        assert coll.update_one(
            {"_id": url},
            {"$set": {"myrate": rate}}
        ).modified_count > 0, "update failed: no document's 'myrate' was modified."

def migrate_videos_from_server():
    with MongoDBCollection("website_pron", "video_info", host="192.168.1.103") as remote_coll:
        with MongoDBCollection("website_pron", "video_info") as local_coll:
            with ExtSSHConnection("192.168.1.103", "yuanyifan", "979323") as ext_ssh:
                for doc in remote_coll.find(
                        {"$or": [{"source": {"$ne": None}}, {"like": True}]}):
                    _id = doc["_id"]
                    if local_coll.find_one({"_id": _id}) is None:
                        print("Downloading...")
                        print(doc)
                        ext_ssh.sftp_conn.get(remote_video_preview_path % _id,
                                              shortcuts_saving_path % _id)
                        ext_ssh.sftp_conn.get(remote_video_path % _id,
                                              video_saving_path % _id)
                        local_coll.insert_one(doc)

def remove_replication():
    """
    Remove duplicate posts, keeping only the earliest downloaded copy.
    :return:
    """
    with MongoDBCollection("website_pron", "images_info") as coll:
        res = coll.aggregate([
            # page_url must not be null
            {"$match": {"page_url": {"$ne": None}}},
            # count documents per URL
            {"$group": {"_id": {"url": "$page_url"}, "count": {"$sum": 1}}},
            # keep only URLs seen more than once
            {"$match": {"count": {"$gte": 2}}}
        ])
        for group in res:
            url = group["_id"]["url"]
            _id_list = [
                doc["_id"]
                for doc in coll.find({"page_url": url}, {"_id": 1}).sort("_id")
            ]
            # keep the earliest _id, delete the rest
            for dup_id in _id_list[1:]:
                coll.delete_one({"_id": dup_id})

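# An alternative single-pass version of the dedup above, sketched under the
# assumption that ascending _id order matches download order: pushing every
# _id inside the $group stage avoids one extra find() per duplicated URL.
# The function name is hypothetical.
def remove_replication_single_pass():
    with MongoDBCollection("website_pron", "images_info") as coll:
        res = coll.aggregate([
            {"$match": {"page_url": {"$ne": None}}},
            {"$group": {"_id": "$page_url",
                        "ids": {"$push": "$_id"},
                        "count": {"$sum": 1}}},
            {"$match": {"count": {"$gte": 2}}},
        ])
        for group in res:
            # keep the smallest (earliest) _id, delete the rest
            for dup_id in sorted(group["ids"])[1:]:
                coll.delete_one({"_id": dup_id})
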
def count_faces():
    try:
        with MongoDBCollection("website_pron", "images_info") as coll:
            indexes = get_face_uncounted_index(coll)
            for i in indexes:
                try:
                    dir_name = os.path.join(local_images_path, "%08d" % i)
                    face = count_face_in_dir(dir_name, 1.1, 5)
                    write_face_count(coll, i, face)
                    print("There are %d faces in index=%d" % (face, i))
                except Exception:
                    print("Error while counting faces in %d" % i)
    except Exception:
        print("Error while counting faces:\n" + traceback.format_exc())

def query_xhamster_bylabel(request):
    tags = json.loads(request.GET["tags"])
    page_size = int(get_request_with_default(request, "n", "30"))
    page_index = int(get_request_with_default(request, "p", "1"))
    with MongoDBCollection("spider", "xhamster_storage") as coll:
        return [
            x for x in coll.find({
                "$or": [{"label": {"$regex": keyword, "$options": "i"}}
                        for keyword in tags]
            }).skip((page_index - 1) * page_size).limit(page_size)
        ]

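# The $or of per-keyword regexes above could also be collapsed into a single
# alternation pattern. A sketch, assuming the tags are literal keywords that
# should be escaped before joining; the helper name is hypothetical:
def tags_to_label_filter(tags):
    pattern = "|".join(re.escape(keyword) for keyword in tags)
    return {"label": {"$regex": pattern, "$options": "i"}}
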
def remove_invalid_log():
    """
    Delete records whose image directory no longer exists on disk.
    :return:
    """
    with MongoDBCollection("website_pron", "images_info") as coll:
        deprecated_id = [
            x["_id"] for x in coll.find({}, {"_id": 1})
            if not os.path.exists(local_image_list_path % {"page_index": x["_id"]})
        ]
        if len(deprecated_id) > 0:
            if prompt("Delete these images:\n" +
                      ",".join(["%d" % x for x in deprecated_id]) +
                      "\n(y/n):").lower() == "y":
                coll.delete_many({"_id": {"$in": deprecated_id}})

def GET_to_weed_hash(url: str):
    try:
        image_bytes = GET(url, 6)
        with MongoDBCollection("website_pron", "image_hash_pool") as coll:
            with BytesIO(image_bytes) as bio:
                hash_info = hash_algorithm(bio)
            find_hash_in_lib = coll.find_one({"_id": hash_info})
            if find_hash_in_lib is None:
                # new image: push the bytes into SeaweedFS and record the fid
                weed_fs = WeedFS()
                weed_fid = weed_fs.upload_file(stream=image_bytes, name=url)
                find_hash_in_lib = {
                    "_id": hash_info,
                    "weed_fid": weed_fid,
                    "file_type": re.findall(r"\.(\w+)$", url)[0]
                }
                coll.insert_one(find_hash_in_lib)
            return find_hash_in_lib
    except Exception:
        print("Error while get+insert image from web:\n{}\n".format(url),
              file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
        return None

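# hash_algorithm is not defined in this file; given the images_info_ahash_weed
# collection name, it is presumably an average hash ("ahash"). A minimal
# sketch using the imagehash package (the package choice and the function
# name are assumptions):
from PIL import Image
import imagehash


def hash_algorithm_sketch(stream) -> str:
    # str() yields a stable hex digest, usable as a MongoDB _id for dedup
    return str(imagehash.average_hash(Image.open(stream)))
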
def remove_empty_dir(delete_limit):
    try:
        with MongoDBCollection("website_pron", "images_info") as coll:
            img_pool_dirs = os.listdir(local_images_path)
            for img_dir in img_pool_dirs:
                img_list_index = int(img_dir)
                current_dir = os.path.join(local_images_path, img_dir)
                # drop corrupt/unreadable images first
                for filename in os.listdir(current_dir):
                    full_filename = os.path.join(current_dir, filename)
                    if not is_valid_image(full_filename):
                        os.remove(full_filename)
                        print("Image: %s deleted" % full_filename)
                files_in_dir = os.listdir(current_dir)
                # then remove near-empty directories, unless they are liked
                if len(files_in_dir) <= delete_limit:
                    if not get_is_like(coll, img_list_index):
                        shutil.rmtree(current_dir)
                        remove_log(coll, img_list_index)
                        print("%d files in %s, removed." % (len(files_in_dir), img_dir))
    except Exception:
        print("Error while removing empty dir:\n" + traceback.format_exc())

print("%s dumped" % url) failed_times = 0 except: print("%s download error, excepted." % url) waitingColl.insert_one({"_id": url}) failed_times += 1 if __name__ == '__main__': dbName = "spider" collPrefix = "xhamster" with MongoDBConnection() as mongo: mongo.drop_database(dbName) with MongoDBCollection(dbName, collPrefix + "_queue") as coll: if coll.count() <= 0: for i in range(3): for url in getTopURLs(i + 1): try: coll.insert({"_id": url}) except: print("Error while inserting " + url) thread_list = [ SpiderThread(db_name=dbName, coll_prefix=collPrefix) for _ in range(concurrency_num) ] [t.start() for t in thread_list] [t.join() for t in thread_list]
def remove_empty_dir(min_file_tol: int):
    with MongoDBCollection("website_pron", "images_info_ahash_weed") as coll:
        return coll.delete_many({
            "$where": "this.image_list.length<{}".format(min_file_tol)
        }).deleted_count

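# $where evaluates JavaScript per document; on MongoDB 3.6+ the same filter
# can be written with $expr/$size, which is faster and also works when
# server-side JS is disabled. A sketch of the equivalent call (the function
# name is hypothetical):
def remove_empty_dir_no_js(min_file_tol: int):
    with MongoDBCollection("website_pron", "images_info_ahash_weed") as coll:
        return coll.delete_many({
            "$expr": {"$lt": [{"$size": "$image_list"}, min_file_tol]}
        }).deleted_count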