def migrate_videos_to_server():
    with MongoDBCollection("website_pron", "video_info",
                           host="192.168.1.103") as remote_coll:
        with MongoDBCollection("website_pron", "video_info") as local_coll:
            with ExtSSHConnection("192.168.1.103", "yuanyifan",
                                  "979323") as ext_ssh:
                for doc in local_coll.find({"source": {"$ne": None}}):
                    if remote_coll.find_one({"source": doc["source"]}) is None:
                        print("Uploading...")
                        print(doc)
                        _id = doc["_id"]
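                        # next remote _id: current largest remote _id + 1 (assumes the remote collection is non-empty)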
                        current_index = int(
                            remote_coll.find({}, {
                                "_id": 1
                            }).sort("_id", -1).limit(1)[0]["_id"]) + 1
                        insert_doc = doc
                        insert_doc["_id"] = current_index
                        ext_ssh.sftp_conn.put(
                            shortcuts_saving_path % _id,
                            remote_video_preview_path % current_index)
                        ext_ssh.sftp_conn.put(
                            video_saving_path % _id,
                            remote_video_path % current_index)
                        remote_coll.insert_one(insert_doc)
                        print("Uploaded.")
Example #2
def migrate_videos_to_remote():
    with MongoDBCollection("website_pron", "video_info", host="192.168.1.103") as remote_coll:
        with MongoDBCollection("website_pron", "video_info") as local_coll:
            with ExtSSHConnection("192.168.1.103", "yuanyifan", "979323") as ext_ssh:
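                # copy every local video that the remote collection does not have yet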
                for doc in local_coll.find({}):
                    video_id = int(doc["_id"])
                    if remote_coll.find_one({"_id": video_id}) is None:
                        print("Found a log to migrate:\n%s" % json.dumps(doc, indent=2))
                        ext_ssh.sftp_conn.put(video_saving_path % video_id, remote_video_path % video_id)
                        ext_ssh.sftp_conn.put(shortcuts_saving_path % video_id, remote_video_preview_path % video_id)
                        remote_coll.insert_one(doc)
                        print("Migrated a log.")
Example #3
def migrate_images_to_remote():
    with MongoDBCollection("website_pron", "images_info", host="192.168.1.103") as remote_coll:
        with MongoDBCollection("website_pron", "images_info") as local_coll:
            with ExtSSHConnection("192.168.1.103", "yuanyifan", "979323") as ext_ssh:
                for doc in local_coll.find({"block": "DAGUERRE"}):  # 找到所有的达盖尔的旗帜
                    image_page_index = int(doc["_id"])
                    print("Uploading...%08d" % image_page_index)
                    local_path = local_image_list_path % {
                        "page_index": image_page_index
                    }
                    remote_path = remote_images_path % image_page_index
                    ext_ssh.upload_dir(local_path, remote_path)
                    remote_coll.insert_one(doc)
Example #4
def count_faces():
    def weed_url_to_fid(weed_url: str) -> str:
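        # convert a "volume/fileKey" style path into the "volume,fileKey" fid format used by SeaweedFS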
        parts = weed_url.split("/")
        return parts[0] + "," + parts[1]

    try:
        with MongoDBCollection("website_pron",
                               "images_info_ahash_weed") as coll:
            indexes = get_face_uncounted_index(coll)
            print("{} image lists uncounted.".format(len(indexes)))
            for i in indexes:
                try:
                    info = coll.find_one({"_id": i})
                    face = count_face_in_weed([
                        weed_url_to_fid(image_url)
                        for image_url in info["image_list"]
                    ], 1.1, 5)
                    write_face_count(coll, i, face)
                    print("There're %d faces in index=%d" % (face, i))
                except Exception as ex:
                    print("Error while counting faces in %d" % i)
                    print(traceback.format_exc(), file=sys.stderr)
    except Exception as err:
        print("Error while counting faces:\n" + traceback.format_exc(),
              file=sys.stderr)
Example #5
def process_page_url(url):
    try:
        with MongoDBCollection("website_pron",
                               "images_info_ahash_weed") as coll:
            if is_url_existed(coll, url.replace("http://%s/" % caoliu_host,
                                                "/")):
                raise Exception("URL already exists in db!")
            page_soup = get_soup(url)
            title = get_page_title(page_soup)
            images = get_page_images(page_soup)
            text = get_page_text(page_soup)
            # download the small image files

            images_buffer = (download_pool.submit(GET_to_weed_hash,
                                                  img).result()
                             for img in images)
            page_index = insert_log(coll, title, url, text, images, [
                img_info["weed_fid"].replace(",", "/") + "/" +
                img_info["_id"] + "." + img_info["file_type"]
                for img_info in images_buffer if img_info is not None
            ])
            print("Downloaded: {}-->{}".format(url, page_index))

    except Exception as ex:
        if str(ex).find("URL already exists") < 0:
            print("Error while downloading..." + traceback.format_exc())
Example #6
def remove_invalid_dirs():
    with MongoDBCollection("website_pron", "images_info") as coll:
        id_list = {int(x) for x in os.listdir(local_images_path)}
        mg_id_list = {x["_id"] for x in coll.find({}, {"_id": 1})}
        id_dump = id_list.difference(mg_id_list)
        for _id in id_dump:
            shutil.rmtree(local_image_list_path % {"page_index": _id})
Example #7
def process_page_url(url):
    try:
        with MongoDBCollection("website_pron", "images_info") as coll:
            if is_url_existed(coll, url.replace("http://%s/" % caoliu_host,
                                                "/")):
                raise Exception("URL already exists in db!")
            page_soup = get_soup(url)
            title = get_page_title(page_soup)
            images = get_page_images(page_soup)
            text = get_page_text(page_soup)
            # download the small image files
            img_task_list = [
                LiteDataDownloader(image_url=img,
                                   tag="%d%s" % (i, get_extension(img)))
                for i, img in enumerate(images)
            ]
            for task in img_task_list:
                task.start()
            for task in img_task_list:
                task.join()
            page_index = insert_log(coll, title, url, text, images)
            page_path = os.path.join(local_images_path, "%08d" % page_index)
            try:
                os.makedirs(page_path)
            except FileExistsError:
                print("directory already exists.")
            for task in img_task_list:
                task.write_file(os.path.join(page_path, task.tag))
            # create tasks
            print("Downloaded: %s" % url)

    except Exception as ex:
        if str(ex).find("URL already exists") < 0:
            print("Error while downloading..." + traceback.format_exc())
Example #8
def remove_remote_DAGUERRE():
    assert input("Delete all DAGUERRE? (True/False): ") == "True", "Paused"
    with MongoDBCollection("website_pron", "images_info", host="192.168.1.103") as remote_coll:
        with ExtSSHConnection("192.168.1.103", "yuanyifan", "979323") as ext_ssh:
            # every post from the DAGUERRE ("Daguerre's Flag") block
            daguerre_ids = [int(doc["_id"]) for doc in remote_coll.find({"block": "DAGUERRE"})]
            for daguerre_id in daguerre_ids:
                print("Removing...%08d" % daguerre_id)
                ext_ssh.run_command('rm -rf "%s"' % (remote_images_path % daguerre_id))
                remote_coll.delete_one({"_id": daguerre_id})
Example #9
def clean_invalid_files():
    with MongoDBCollection("website_pron", "video_info") as coll:
        id_list = {x["_id"] for x in coll.find({}, {"_id": 1})}
        video_id_list = [
            int(filename[5:-4]) for filename in os.listdir(videos_path)
            if filename.endswith("mp4")
        ]
        for video_id in video_id_list:
            if video_id not in id_list:
                shutil.move(video_saving_path % video_id, trash_video_file % video_id)
Example #10
def remove_trash_images():
    """
    Remove spam image posts matching the rule {"comment": {"$regex": "at\\.umeng\\.com"}}
    :return:
    """
    with MongoDBCollection("website_pron", "images_info") as coll:
        return coll.delete_many({
            "comment": {
                "$regex": "at\.umeng\.com"
            }
        }).deleted_count
Example #11
def get_xhamster_detail(request):
    url = request.GET["url"]
    if not str(url).startswith("http"):
        url = "https://m.xhamster.com/videos/" + url
    with MongoDBCollection("spider", "xhamster_storage") as coll:
        doc = coll.find_one({"_id": url})
        if doc is None:
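            # not cached yet: fetch the page, drop the "related" field and cache the document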
            data = query_url(url)
            data.pop("related")
            coll.insert_one(data)
            return data
        else:
            return doc
Example #12
def set_xhamster_rate(req):
    url = req.POST["url"]
    if not str(url).startswith("http"):
        url = "https://m.xhamster.com/videos/" + url
    rate = int(req.POST["rate"])
    with MongoDBCollection("spider", "xhamster_storage") as coll:
        assert coll.update_one({
            "_id": url
        }, {
            "$set": {
                "myrate": rate
            }
        }).modified_count > 0, "modify failed: no log's 'myrate' was modified."
Example #13
def migrate_videos_from_server():
    with MongoDBCollection("website_pron", "video_info",
                           host="192.168.1.103") as remote_coll:
        with MongoDBCollection("website_pron", "video_info") as local_coll:
            with ExtSSHConnection("192.168.1.103", "yuanyifan",
                                  "979323") as ext_ssh:
                for doc in remote_coll.find(
                    {"$or": [{
                        "source": {
                            "$ne": None
                        }
                    }, {
                        "like": True
                    }]}):
                    _id = doc["_id"]
                    if local_coll.find_one({"_id": _id}) is None:
                        print("Downloading...")
                        print(doc)
                        ext_ssh.sftp_conn.get(remote_video_preview_path % _id,
                                              shortcuts_saving_path % _id)
                        ext_ssh.sftp_conn.get(remote_video_path % _id,
                                              video_saving_path % _id)
                        local_coll.insert_one(doc)
Example #14
def remove_replication():
    """
    Remove duplicate entries, keeping only the earliest downloaded copy
    :return:
    """
    with MongoDBCollection("website_pron", "images_info") as coll:
        res = coll.aggregate([
            {
                # page_url must not be null
                "$match": {
                    "page_url": {
                        "$ne": None
                    }
                }
            },
            {
                # count documents per page_url
                "$group": {
                    "_id": {
                        "url": "$page_url"
                    },
                    "count": {
                        "$sum": 1
                    }
                }
            },
            {
                # keep only URLs that appear more than once
                "$match": {
                    "count": {
                        "$gte": 2
                    }
                }
            }
        ])
        for x in res:
            url = x["_id"]["url"]
            _id_list = [
                doc["_id"] for doc in coll.find({
                    "page_url": url
                }, {
                    "_id": 1
                }).sort("_id")
            ]
            # keep the earliest copy (smallest _id), delete the rest
            for _id in _id_list[1:]:
                coll.delete_one({"_id": _id})
Example #15
def count_faces():
    try:
        with MongoDBCollection("website_pron", "images_info") as coll:
            indexes = get_face_uncounted_index(coll)
            for i in indexes:
                try:
                    dir_name = os.path.join(local_images_path, "%08d" % i)
                    face = count_face_in_dir(dir_name, 1.1, 5)
                    write_face_count(coll, i, face)
                    print("There're %d faces in index=%d" % (face, i))
                except:
                    print("Error while counting faces in %d" % i)
    except Exception as err:
        print("Error while counting faces:\n" + traceback.format_exc())
Example #16
def query_xhamster_bylabel(request):
    tags = json.loads(request.GET["tags"])
    page_size = int(get_request_with_default(request, "n", "30"))
    page_index = int(get_request_with_default(request, "p", "1"))
    with MongoDBCollection("spider", "xhamster_storage") as coll:
        return [
            x for x in coll.find({
                "$or": [{
                    "label": {
                        "$regex": keyword,
                        "$options": "i"
                    }
                } for keyword in tags]
            }).skip((page_index - 1) * page_size).limit(page_size)
        ]
Example #17
def remove_invalid_log():
    """
    Delete database records whose image directory no longer exists on disk
    :return:
    """
    with MongoDBCollection("website_pron", "images_info") as coll:
        deprecated_id = [
            x["_id"] for x in coll.find({}, {"_id": 1})
            if not os.path.exists(local_image_list_path %
                                  {"page_index": x["_id"]})
        ]
        if len(deprecated_id) > 0:
            if prompt("Delete these images:\n" +
                      ",".join(["%d" % x for x in deprecated_id]) +
                      "\n(y/n):").lower() == "y":
                coll.delete_many({"_id": {"$in": deprecated_id}})
Example #18
def GET_to_weed_hash(url: str):
    try:
        image_bytes = GET(url, 6)
        with MongoDBCollection("website_pron", "image_hash_pool") as coll:
            with BytesIO(image_bytes) as bio:
                hash_info = hash_algorithm(bio)
            find_hash_in_lib = coll.find_one({"_id": hash_info})
            if find_hash_in_lib is None:
                weed_fs = WeedFS()
                weed_fid = weed_fs.upload_file(stream=image_bytes, name=url)
                find_hash_in_lib = {
                    "_id": hash_info,
                    "weed_fid": weed_fid,
                    "file_type": re.findall("\.(\w+)$", url)[0]
                }
                coll.insert_one(find_hash_in_lib)
            return find_hash_in_lib
    except:
        print("Error while get+insert image from web:\n{}\n".format(url),
              file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
        return None
Example #19
def remove_empty_dir(delete_limit):
    try:
        with MongoDBCollection("website_pron", "images_info") as coll:
            img_pool_dirs = os.listdir(local_images_path)
            for img_dir in img_pool_dirs:
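                # delete corrupt images first; if too few files remain and the list
                # is not liked, remove the whole directory and its database record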
                img_list_index = int(img_dir)
                current_dir = os.path.join(local_images_path, img_dir)
                files_in_dir = os.listdir(current_dir)
                for filename in files_in_dir:
                    full_filename = os.path.join(current_dir, filename)
                    is_valid = is_valid_image(full_filename)
                    if not is_valid:
                        os.remove(full_filename)
                        print("Image: %s deleted" % full_filename)
                files_in_dir = os.listdir(current_dir)
                if len(files_in_dir) <= delete_limit:
                    if not get_is_like(coll, img_list_index):
                        shutil.rmtree(current_dir)
                        remove_log(coll, img_list_index)
                        print("%d files in %s, removed." %
                              (len(files_in_dir), img_dir))

    except Exception as err:
        print("Error while removing empty dir:\n" + traceback.format_exc())
Example #20
                            print("%s dumped" % url)
                        failed_times = 0
                    except:
                        print("%s download error, excepted." % url)
                        waitingColl.insert_one({"_id": url})
                        failed_times += 1


if __name__ == '__main__':
    dbName = "spider"
    collPrefix = "xhamster"

    with MongoDBConnection() as mongo:
        mongo.drop_database(dbName)

    with MongoDBCollection(dbName, collPrefix + "_queue") as coll:
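        # seed the crawl queue from the first three listing pages if it is empty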
        if coll.count() <= 0:
            for i in range(3):
                for url in getTopURLs(i + 1):
                    try:
                        coll.insert({"_id": url})
                    except:
                        print("Error while inserting " + url)

    thread_list = [
        SpiderThread(db_name=dbName, coll_prefix=collPrefix)
        for _ in range(concurrency_num)
    ]
    for t in thread_list:
        t.start()
    for t in thread_list:
        t.join()
Example #21
def remove_empty_dir(min_file_tol: int):
    with MongoDBCollection("website_pron", "images_info_ahash_weed") as coll:
        return coll.delete_many({
            "$where":
            "this.image_list.length<{}".format(min_file_tol)
        }).deleted_count