Example #1
def thread_scraper(thread_id, board_name, session, directory):
    thread_id = str(thread_id)
    link = "https://bbw-chan.nl/" + board_name + "/res/" + thread_id + ".json"
    r = session.get(link)
    if r.status_code == 404:
        return
    thread = json.loads(r.text)
    thread_master = thread
    if "archived" in thread_master:
        location = "Archive"
    else:
        location = "Catalog"
    text = ""
    if thread_master["subject"]:
        title = thread_master["subject"].lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return
        else:
            text = thread_master["subject"][:maximum_length]

    if thread_master["message"]:
        title = thread_master["message"].lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return
        else:
            if not text:
                text = thread_master["message"][:maximum_length]
    thread_master2 = thread_master.copy()
    for key in thread_master2:
        if "posts" != key:
            del thread_master[key]
    del thread_master2["posts"]
    thread["download_path"] = ""
    thread["posts"] = [thread_master2]+thread_master["posts"]
    found = False
    new_directory = ""
    for post in thread["posts"]:
        date_object = datetime.strptime(
            post["creation"], "%Y-%m-%dT%H:%M:%S.%fZ")
        post["creation"] = date_object.timestamp()
        for media in post["files"]:
            ext = media["mime"].split("/")[1]
            media["ext"] = ext
            file_name = os.path.splitext(media["originalName"])[0].strip()
            text = main_helper.clean_text(text)
            new_directory = directory+"/"+text+" - "+thread_id+"/"
            if not text:
                new_directory = new_directory.replace(" - ", "")
            file_path = main_helper.reformat(new_directory, None, None, file_name,
                                             text, ext, date_object, post["name"], file_directory_format, file_name_format, date_format, maximum_length)
            media["download_path"] = file_path
            found = True
    if found:
        thread["directory"] = new_directory
        return thread
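
The function above returns None both for 404s and for keyword-filtered threads, so a caller only collects truthy results. A minimal driver sketch (not from the original project; it assumes the module-level globals used above, such as ignored_keywords and the format settings, are already defined):

import requests

def scrape_board(board_name, thread_ids, directory):
    # hypothetical helper: collect the threads that survive the keyword filter
    session = requests.Session()
    threads = []
    for thread_id in thread_ids:
        thread = thread_scraper(thread_id, board_name, session, directory)
        if thread:
            threads.append(thread)
    return threads
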
Example #2
File: start.py Project: zoerab/OnlyFans
 def update(filepath):
     temp = json.loads(
         json.dumps(reformat, default=lambda o: o.__dict__))
     filepath = os.path.abspath(filepath)
     new_format = main_helper.reformat(**temp)
     new_format = os.path.abspath(new_format)
     if filepath != new_format:
         shutil.move(filepath, new_format)
     return new_format
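
The json.dumps/json.loads round trip above is a compact way to turn an object tree into plain dicts so it can be splatted into main_helper.reformat with **. A self-contained illustration with a hypothetical class (not from the project):

import json

class Options:
    def __init__(self):
        self.directory = "downloads"  # hypothetical values
        self.filename = "example"

temp = json.loads(json.dumps(Options(), default=lambda o: o.__dict__))
print(temp)  # {'directory': 'downloads', 'filename': 'example'}
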
Example #3
def thread_scraper(thread_id, board_name, session, directory):
    thread_id = str(thread_id)
    link = "http://a.4cdn.org/" + board_name + "/thread/" + thread_id + ".json"
    r = session.get(link)
    if r.status_code == 404:
        return
    thread = json.loads(r.text)
    thread_master = thread["posts"][0]
    if "archived" in thread_master:
        location = "Archive"
    else:
        location = "Catalog"

    if "sub" in thread_master:
        title = thread_master["sub"].lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return

    if "com" in thread_master:
        title = thread_master["com"].lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return
    text = ""
    if "sub" in thread_master:
        text = thread_master["sub"][:maximum_length]
    else:
        text = thread_master["com"][:maximum_length]
    found = False
    new_directory = ""
    seen = set()
    for post in thread["posts"]:
        if "name" not in post:
            post["name"] = "Anonymous"
        if "filename" in post:
            ext = post["ext"].replace(".", "")
            filename = main_helper.clean_text(post["filename"])
            if not filename:
                filename = str(post["no"])
            result = main_helper.rename_duplicates(seen, filename)
            seen = result[0]
            file_name = result[1]
            text = main_helper.clean_text(text)
            new_directory = directory+"/"+text+" - "+thread_id+"/"
            if not text:
                new_directory = new_directory.replace(" - ", "")
            date_object = datetime.fromtimestamp(post["time"])
            file_path = main_helper.reformat(new_directory, None, None, file_name,
                                             text, ext, date_object, post["name"], file_directory_format, file_name_format, date_format, maximum_length)
            post["download_path"] = file_path
            found = True
    if found:
        thread["directory"] = new_directory
        return thread
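
main_helper.rename_duplicates is expected to return the updated seen set plus a collision-free name. A hedged sketch consistent with that call site (the project's actual implementation may differ):

def rename_duplicates(seen, filename):
    # first occurrence keeps its name; later ones get a numeric suffix
    if filename not in seen:
        seen.add(filename)
        return seen, filename
    count = 2
    while f"{filename} ({count})" in seen:
        count += 1
    new_name = f"{filename} ({count})"
    seen.add(new_name)
    return seen, new_name
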
Example #4
 def reformat(self, unformatted_list) -> list[str]:
     x = []
     format_variables2 = format_variables()
     for key, unformatted_item in unformatted_list.items():
         if "filename_format" == key:
             unformatted_item = os.path.join(x[1], unformatted_item)
         string = main_helper.reformat(self, unformatted_item)
         final_path = []
         paths = string.split(os.sep)
         for path in paths:
             key = main_helper.find_between(path, "{", "}")
             e = getattr(format_variables2, key, None)
             if path == e:
                 break
             final_path.append(path)
         final_path = os.sep.join(final_path)
         x.append(final_path)
     return x
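
main_helper.find_between is used above to pull a template key out of a path component. A hedged sketch of such a helper (assumption: it returns an empty string when the delimiters are missing, so the getattr lookup falls back to None):

def find_between(s, first, last):
    # e.g. find_between("{site_name}", "{", "}") -> "site_name"
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""
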
Example #5
def format_directories(directory,
                       site_name,
                       username,
                       unformatted,
                       locations=[],
                       api_type="") -> dict:
    x = {}
    option = {}
    option["site_name"] = site_name
    option["username"] = username
    option["directory"] = directory
    option["postedAt"] = datetime.today()
    option["date_format"] = date_format
    option["maximum_length"] = maximum_length
    prepared_format = prepare_reformat(option)
    legacy_model_directory = x["legacy_model_directory"] = os.path.join(
        directory, site_name, username)
    x["legacy_metadatas"] = {}
    x["legacy_metadatas"]["legacy_metadata"] = os.path.join(
        legacy_model_directory, api_type, "Metadata")
    x["legacy_metadatas"]["legacy_metadata2"] = os.path.join(
        legacy_model_directory, "Metadata")
    x["metadata_directory"] = main_helper.reformat(prepared_format,
                                                   unformatted)
    x["download_directory"] = directory
    x["locations"] = []
    for location in locations:
        directories = {}
        cats = ["Unsorted", "Free", "Paid"]
        for cat in cats:
            cat2 = cat
            if "Unsorted" in cat2:
                cat2 = ""
            path = os.path.join(api_type, cat2, location[0])
            directories[cat.lower()] = path
        y = {}
        y["sorted_directories"] = directories
        y["media_type"] = location[0]
        y["alt_media_type"] = location[1]
        x["locations"].append(y)
    return x
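
The cats loop above collapses "Unsorted" to an empty path component, so unsorted media lands directly under the api_type folder. A self-contained illustration with hypothetical inputs:

import os

api_type, location = "Posts", ("Images", ["photo"])  # hypothetical inputs
directories = {}
for cat in ["Unsorted", "Free", "Paid"]:
    cat2 = "" if cat == "Unsorted" else cat
    directories[cat.lower()] = os.path.join(api_type, cat2, location[0])
print(directories)
# on POSIX: {'unsorted': 'Posts/Images', 'free': 'Posts/Free/Images', 'paid': 'Posts/Paid/Images'}
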
Example #6
def media_scraper(results, api, formatted_directories, username, api_type, parent_type=""):
    new_set = {}
    new_set["content"] = []
    directories = []
    session = api.sessions[0]
    if api_type == "Stories":
        if "stories" in results:
            items = results["stories"]
            for item in items:
                item["text"] = results["title"]
            results = results["stories"]
    if api_type == "Archived":
        print
        pass
    if api_type == "Posts":
        print
    if api_type == "Messages":
        pass
    if not results or "error" in results:
        return new_set
    if "result" in results:
        session = results["session"]
        results = results["result"]
        if "error" in results:
            return new_set
    download_path = formatted_directories["download_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = copy.copy(location["sorted_directories"])
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        file_directory_format = json_settings["file_directory_format"]
        if api_type == "Archived":
            x = file_directory_format.split(os.sep)
            for y in x:
                substr = "{api_type}"
                if substr == y:
                    new_path = os.path.join(substr, parent_type)
                    file_directory_format = file_directory_format.replace(
                        substr, new_path)
                    break
        seperator = " | "
        print(
            f"Scraping [{seperator.join(alt_media_type)}]. Should take less than a minute.")
        for media_api in results:
            new_post = {}
            new_post["medias"] = []
            rawText = media_api.get("rawText", "")
            text = media_api.get("text", "")
            final_text = rawText if rawText else text
            # if media_api["responseType"] == "post":
            #     if media_api["isArchived"]:
            #         pass
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                media_user = media_api["fromUser"]
                media_username = media_user["username"]
                if media_username != username:
                    continue
            date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
            if date == "-001-11-30T00:00:00+00:00":
                date_string = master_date
                date_object = datetime.strptime(
                    master_date, "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(tzinfo=None).strftime(
                    "%d-%m-%Y %H:%M:%S")
                master_date = date_string
            new_post["post_id"] = media_api["id"]
            new_post["text"] = final_text
            new_post["postedAt"] = date_string
            new_post["paid"] = False
            price = new_post["price"] = media_api["price"]if "price" in media_api else None
            if price == None:
                price = 0
            canPurchase = media_api.get("canPurchase", None)
            canViewMedia = media_api.get("canViewMedia", None)
            if price:
                if not canPurchase or canViewMedia:
                    new_post["paid"] = True
            for media in media_api["media"]:
                media_id = media["id"]
                date = "-001-11-30T00:00:00+00:00"
                size = 0
                link = ""
                preview_link = ""
                if "source" in media:
                    quality_key = "source"
                    source = media[quality_key]
                    link = source[quality_key]
                    if link:
                        if media["type"] == "video":
                            qualities = media["videoSources"]
                            qualities = dict(
                                sorted(qualities.items(), reverse=False))
                            qualities[quality_key] = source[quality_key]
                            for quality, quality_link in qualities.items():
                                video_quality_json = json_settings["video_quality"]
                                video_quality_json = video_quality_json.removesuffix(
                                    "p")
                                if quality == video_quality_json:
                                    if link:
                                        link = quality_link
                                        break

                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                if "src" in media:
                    link = media["src"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["createdAt"]
                matches = ["us", "uk", "ca", "ca2", "de"]

                if not link:
                    continue
                url = urlparse(link)
                subdomain = url.hostname.split('.')[0]
                preview_link = media["preview"]
                if any(subdomain in nm for nm in matches):
                    subdomain = url.hostname.split('.')[1]
                    if "upload" in subdomain:
                        continue
                    if "convert" in subdomain:
                        link = preview_link
                rules = [link == "",
                         preview_link == ""]
                if all(rules):
                    continue
                new_media = dict()
                new_media["media_id"] = media_id
                new_media["links"] = []
                new_media["media_type"] = media_type
                for xlink in link, preview_link:
                    if xlink:
                        new_media["links"].append(xlink)
                        break
                session.links.extend(new_media["links"])

                if media["type"] not in alt_media_type:
                    continue
                matches = [s for s in ignored_keywords if s in final_text]
                if matches:
                    print("Matches: ", matches)
                    continue
                filename = link.rsplit('/', 1)[-1]
                filename, ext = os.path.splitext(filename)
                ext = ext.replace(".", "").split('?')[0]

                option = {}
                option = option | new_post
                option["site_name"] = "OnlyFans"
                option["media_id"] = media_id
                option["filename"] = filename
                option["api_type"] = api_type
                option["media_type"] = media_type
                option["ext"] = ext
                option["username"] = username
                option["date_format"] = date_format
                option["text_length"] = text_length
                option["directory"] = download_path

                prepared_format = prepare_reformat(option)
                file_directory = main_helper.reformat(
                    prepared_format, file_directory_format)
                prepared_format.directory = file_directory
                file_path = main_helper.reformat(
                    prepared_format, filename_format)
                new_media["directory"] = os.path.join(file_directory)
                new_media["filename"] = os.path.basename(file_path)
                if file_directory not in directories:
                    directories.append(file_directory)
                new_post["medias"].append(new_media)
            new_set["content"].append(new_post)
    new_set["directories"] = directories
    return new_set
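
The densest part above is the video-quality selection: the configured json_settings["video_quality"] value (e.g. "720p") is matched against the keys of videoSources after the trailing "p" is stripped, with "source" as the fallback. A self-contained sketch with hypothetical data (str.removesuffix needs Python 3.9+):

qualities = {"240": "https://cdn.example.com/240.mp4",  # hypothetical links
             "720": "https://cdn.example.com/720.mp4",
             "source": "https://cdn.example.com/source.mp4"}
video_quality = "720p"  # stands in for json_settings["video_quality"]
wanted = video_quality.removesuffix("p")
link = qualities.get(wanted, qualities["source"])
print(link)  # https://cdn.example.com/720.mp4
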
Example #7
def media_scraper(link, session, directory, username, api_type):
    media_set = [[], []]
    media_type = directory[-1]
    y = json_request(session, link)
    if not y or "error" in y:
        return media_set
    x = 0
    if api_type == "Highlights":
        y = y["stories"]
    if api_type == "Messages":
        y = y["list"]
    if api_type == "Mass Messages":
        y = y["list"]
    master_date = "01-01-0001 00:00:00"
    for media_api in y:
        if api_type == "Mass Messages":
            media_user = media_api["fromUser"]
            media_username = media_user["username"]
            if media_username != username:
                continue
        for media in media_api["media"]:
            date = "-001-11-30T00:00:00+00:00"
            size = 0
            if "source" in media:
                source = media["source"]
                link = source["source"]
                size = media["info"]["preview"][
                    "size"] if "info" in media_api else 1
                date = media_api[
                    "postedAt"] if "postedAt" in media_api else media_api[
                        "createdAt"]
            if "src" in media:
                link = media["src"]
                size = media["info"]["preview"][
                    "size"] if "info" in media_api else 1
                date = media_api["createdAt"]
            if not link:
                continue
            matches = ["us", "uk", "ca", "ca2", "de"]

            url = urlparse(link)
            subdomain = url.hostname.split('.')[0]
            preview_link = media["preview"]
            if any(subdomain in nm for nm in matches):
                subdomain = url.hostname.split('.')[1]
                if "upload" in subdomain:
                    continue
                if "convert" in subdomain:
                    link = preview_link
            rules = [link == "", preview_link == ""]
            if all(rules):
                continue
            new_dict = dict()
            new_dict["post_id"] = media_api["id"]
            new_dict["media_id"] = media["id"]
            new_dict["links"] = []
            for xlink in link, preview_link:
                if xlink:
                    new_dict["links"].append(xlink)
            new_dict["price"] = media_api["price"] if "price" in media_api else None
            if date == "-001-11-30T00:00:00+00:00":
                date_string = master_date
                date_object = datetime.strptime(master_date,
                                                "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(
                    tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
                master_date = date_string

            if media["type"] not in media_type:
                x += 1
                continue
            if "rawText" not in media_api:
                media_api["rawText"] = ""
            text = media_api["rawText"] if media_api["rawText"] else ""
            matches = [s for s in ignored_keywords if s in text]
            if matches:
                print("Matches: ", matches)
                continue
            text = clean_text(text)
            new_dict["postedAt"] = date_string
            post_id = new_dict["post_id"]
            media_id = new_dict["media_id"]
            file_name = link.rsplit('/', 1)[-1]
            file_name, ext = os.path.splitext(file_name)
            ext = ext.replace(".", "").split('?')[0]
            file_path = reformat(directory[0][1], post_id, media_id, file_name,
                                 text, ext, date_object, username, format_path,
                                 date_format, maximum_length)
            new_dict["text"] = text
            new_dict["paid"] = False
            if new_dict["price"]:
                if api_type in ["Messages", "Mass Messages"]:
                    new_dict["paid"] = True
                else:
                    if media["id"] not in media_api["preview"] and media[
                            "canView"]:
                        new_dict["paid"] = True
            new_dict["directory"] = os.path.join(directory[0][1])
            if sort_free_paid_posts:
                new_dict["directory"] = os.path.join(directory[1][1])
                if new_dict["paid"]:
                    new_dict["directory"] = os.path.join(directory[2][1])
            new_dict["filename"] = file_path.rsplit('/', 1)[-1]
            new_dict["size"] = size
            if size == 0:
                media_set[1].append(new_dict)
                continue
            media_set[0].append(new_dict)
    return media_set
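
The examples treat "-001-11-30T00:00:00+00:00" as a sentinel for a missing date and carry the last good date (master_date) forward. A minimal sketch of that fallback:

from datetime import datetime

def parse_posted_at(date, master_date="01-01-0001 00:00:00"):
    # the sentinel means "no date": reuse the previous post's date string
    if date == "-001-11-30T00:00:00+00:00":
        return datetime.strptime(master_date, "%d-%m-%Y %H:%M:%S"), master_date
    date_object = datetime.fromisoformat(date)
    return date_object, date_object.replace(tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
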
Example #8
def media_scraper(result, sessions, formatted_directories, username, api_type):
    link = result["link"]
    session = sessions[result["count"]]
    media_set = []
    y = main_helper.json_request(session, link)
    if not y or "error" in y:
        return media_set
    x = 0
    if api_type == "Highlights":
        y = y["stories"]
    if api_type == "Messages":
        y = y["list"]
    y = y["list"] if "list" in y else y
    model_directory = formatted_directories["model_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = location["sorted_directories"]
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        if result["count"] == 0:
            seperator = " | "
            print("Scraping [" + str(seperator.join(alt_media_type)) +
                  "]. Should take less than a minute.")
        media_set2 = {}
        media_set2["type"] = media_type
        media_set2["valid"] = []
        media_set2["invalid"] = []
        for media_api in y:
            if api_type == "Mass Messages":
                media_user = media_api["fromUser"]
                media_username = media_user["username"]
                if media_username != username:
                    continue
            new_api = (media_api["media"]
                       if "media" in media_api else [media_api])
            for media in new_api:
                date = "-001-11-30T00:00:00+00:00"
                size = 1
                src = media["src"]
                link = src["source"]
                date = media_api["createdAt"] if "createdAt" in media_api else media_api["postedAt"]
                if not link:
                    continue
                new_dict = dict()
                new_dict["post_id"] = media_api["id"]
                new_dict["links"] = [link]
                if date == "-001-11-30T00:00:00+00:00":
                    date_string = master_date
                    date_object = datetime.strptime(master_date,
                                                    "%d-%m-%Y %H:%M:%S")
                else:
                    date_object = datetime.fromisoformat(date)
                    date_string = date_object.replace(
                        tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
                    master_date = date_string
                media["mediaType"] = media[
                    "mediaType"] if "mediaType" in media else media["type"]
                if media["mediaType"] not in alt_media_type:
                    x += 1
                    continue
                if "text" not in media_api:
                    media_api["text"] = ""
                new_dict["text"] = media_api["text"] if media_api["text"] else ""
                new_dict["postedAt"] = date_string
                post_id = new_dict["post_id"]
                media_id = media["id"] if "id" in media else None
                media_id = media_id if isinstance(media_id, int) else None
                text = new_dict["text"]
                file_name = link.rsplit('/', 1)[-1]
                file_name, ext = os.path.splitext(file_name)
                ext = ext.replace(".", "").split('?')[0]
                media_directory = os.path.join(model_directory,
                                               sorted_directories["unsorted"])
                file_path = main_helper.reformat(media_directory, post_id,
                                                 media_id, file_name, text,
                                                 ext, date_object, username,
                                                 file_directory_format,
                                                 file_name_format, date_format,
                                                 maximum_length)
                file_directory = os.path.dirname(file_path)
                new_dict["directory"] = os.path.join(file_directory)
                new_dict["filename"] = os.path.basename(file_path)
                new_dict["size"] = size
                if size == 0:
                    media_set2["invalid"].append(new_dict)
                    continue
                new_dict["session"] = session
                media_set2["valid"].append(new_dict)
        media_set.append(media_set2)
    return media_set
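
The new_api line above normalizes two response shapes: an item that carries its own media list is iterated as-is, while a bare item is wrapped in a one-element list so both shapes go through the same loop. In isolation:

def iter_media(media_api):
    # hypothetical standalone version of the inline normalization
    return media_api["media"] if "media" in media_api else [media_api]

print(iter_media({"id": 1, "media": [{"id": 2}]}))  # [{'id': 2}]
print(iter_media({"id": 3, "src": {}}))             # [{'id': 3, 'src': {}}]
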
Example #9
def media_scraper(results, api, formatted_directories, username, api_type, parent_type=""):
    media_set = {}
    directories = []
    session = api.sessions[0]
    if api_type == "Stories":
        if "stories" in results:
            items = results["stories"]
            for item in items:
                item["text"] = results["title"]
            results = results["stories"]
    if api_type == "Archived":
        print
        pass
    if api_type == "Posts":
        print
    if api_type == "Messages":
        pass
    if not results or "error" in results:
        return media_set
    if "result" in results:
        session = results["session"]
        results = results["result"]
        if "error" in results:
            return media_set
    download_path = formatted_directories["download_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = copy.copy(location["sorted_directories"])
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        file_directory_format = json_settings["file_directory_format"]
        if api_type == "Archived":
            x = file_directory_format.split(os.sep)
            for y in x:
                substr = "{api_type}"
                if substr == y:
                    new_path = os.path.join(substr, parent_type)
                    file_directory_format = file_directory_format.replace(
                        substr, new_path)
                    break
        seperator = " | "
        print(
            f"Scraping [{seperator.join(alt_media_type)}]. Should take less than a minute.")
        media_set2 = {}
        media_set2["valid"] = []
        media_set2["invalid"] = []
        for media_api in results:
            # if media_api["responseType"] == "post":
            #     if media_api["isArchived"]:
            #         pass
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                media_user = media_api["fromUser"]
                media_username = media_user["username"]
                if media_username != username:
                    continue
            date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
            if date == "-001-11-30T00:00:00+00:00":
                date_string = master_date
                date_object = datetime.strptime(
                    master_date, "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(tzinfo=None).strftime(
                    "%d-%m-%Y %H:%M:%S")
                master_date = date_string
            if not media_api["media"] and "rawText" in media_api:
                if media_type == "Texts":
                    new_dict = dict()
                    new_dict["post_id"] = media_api["id"]
                    new_dict["text"] = media_api["rawText"]
                    new_dict["postedAt"] = date_string
                    media_set2["valid"].append(new_dict)
            for media in media_api["media"]:
                date = "-001-11-30T00:00:00+00:00"
                size = 0
                link = ""
                if "source" in media:
                    source = media["source"]
                    link = source["source"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                if "src" in media:
                    link = media["src"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["createdAt"]
                if not link:
                    continue
                matches = ["us", "uk", "ca", "ca2", "de"]

                url = urlparse(link)
                subdomain = url.hostname.split('.')[0]
                preview_link = media["preview"]
                if any(subdomain in nm for nm in matches):
                    subdomain = url.hostname.split('.')[1]
                    if "upload" in subdomain:
                        continue
                    if "convert" in subdomain:
                        link = preview_link
                rules = [link == "",
                         preview_link == ""]
                if all(rules):
                    continue
                new_dict = dict()
                new_dict["post_id"] = media_api["id"]
                new_dict["media_id"] = media["id"]
                new_dict["links"] = []
                for xlink in link, preview_link:
                    if xlink:
                        new_dict["links"].append(xlink)
                        break
                new_dict["price"] = media_api["price"]if "price" in media_api else None

                if media["type"] not in alt_media_type:
                    continue
                if "rawText" not in media_api:
                    media_api["rawText"] = ""
                text = media_api["rawText"] if media_api["rawText"] else ""
                matches = [s for s in ignored_keywords if s in text]
                if matches:
                    print("Matches: ", matches)
                    continue
                new_dict["postedAt"] = date_string
                post_id = new_dict["post_id"]
                media_id = new_dict["media_id"]
                filename = link.rsplit('/', 1)[-1]
                filename, ext = os.path.splitext(filename)
                ext = ext.replace(".", "").split('?')[0]
                price = new_dict["price"]
                new_dict["text"] = text

                option = {}
                option = option | new_dict
                option["site_name"] = "OnlyFans"
                option["filename"] = filename
                option["api_type"] = api_type
                option["media_type"] = media_type
                option["ext"] = ext
                option["username"] = username
                option["date_format"] = date_format
                option["maximum_length"] = maximum_length
                option["directory"] = download_path

                prepared_format = prepare_reformat(option)
                file_directory = main_helper.reformat(
                    prepared_format, file_directory_format)
                prepared_format.directory = file_directory
                file_path = main_helper.reformat(
                    prepared_format, filename_format)
                new_dict["directory"] = os.path.join(file_directory)
                new_dict["filename"] = os.path.basename(file_path)
                new_dict["session"] = session
                if size == 0:
                    media_set2["invalid"].append(new_dict)
                    continue
                if file_directory not in directories:
                    directories.append(file_directory)
                media_set2["valid"].append(new_dict)
        if media_set2["valid"] or media_set2["invalid"]:
            media_set[media_type] = media_set2
        else:
            print
    media_set["directories"] = directories
    return media_set
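
For "Archived" results, the directory template is rewritten so the "{api_type}" component gains the parent type as a subfolder. A self-contained illustration with a hypothetical template:

import os

file_directory_format = os.path.join("{site_name}", "{username}", "{api_type}", "{media_type}")
parent_type = "Posts"  # hypothetical
for part in file_directory_format.split(os.sep):
    if part == "{api_type}":
        file_directory_format = file_directory_format.replace(
            part, os.path.join(part, parent_type))
        break
print(file_directory_format)
# on POSIX: {site_name}/{username}/{api_type}/Posts/{media_type}
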
Example #10
def media_scraper(link, session, directory, username, api_type):
    media_set = [[], []]
    media_type = directory[-1]
    count = 0
    found = False
    y = json_request(session, link)
    if "error" in y:
        return media_set
    x = 0
    if api_type == "Highlights":
        y = y["stories"]
    y = y["list"] if "list" in y else y
    master_date = "01-01-0001 00:00:00"
    for media_api in y:
        if api_type == "Mass Messages":
            media_user = media_api["fromUser"]
            media_username = media_user["username"]
            if media_username != username:
                continue
        new_api = (media_api["media"] if "media" in media_api else [media_api])
        for media in new_api:
            date = "-001-11-30T00:00:00+00:00"
            size = 1
            src = media["src"]
            link = src["source"]
            date = media_api["createdAt"] if "createdAt" in media_api else media_api["postedAt"]
            if not link:
                continue
            new_dict = dict()
            new_dict["post_id"] = media_api["id"]
            new_dict["links"] = [link]
            if date == "-001-11-30T00:00:00+00:00":
                date_string = master_date
                date_object = datetime.strptime(master_date,
                                                "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(
                    tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
                master_date = date_string
            media["mediaType"] = media[
                "mediaType"] if "mediaType" in media else media["type"]
            if media["mediaType"] not in media_type:
                x += 1
                continue
            if "text" not in media_api:
                media_api["text"] = ""
            new_dict["text"] = media_api["text"] if media_api["text"] else ""
            new_dict["postedAt"] = date_string
            post_id = new_dict["post_id"]
            media_id = media["id"] if "id" in media else None
            media_id = media_id if isinstance(media_id, int) else None
            text = new_dict["text"]
            file_name = link.rsplit('/', 1)[-1]
            file_name, ext = os.path.splitext(file_name)
            ext = ext.replace(".", "").split('?')[0]
            file_path = reformat(directory[0][1], post_id, media_id, file_name,
                                 text, ext, date_object, username, format_path,
                                 date_format, maximum_length)
            new_dict["directory"] = directory[0][1]
            new_dict["filename"] = file_path.rsplit('/', 1)[-1]
            new_dict["size"] = size
            if size == 0:
                media_set[1].append(new_dict)
                continue
            media_set[0].append(new_dict)
    return media_set
Example #11
File: start.py Project: tgage/OnlyFans
    folders = os.listdir(metadata_directory)
    for metadata_file in folders:
        metadata_filepath = os.path.join(metadata_directory, metadata_file)
        metadatas = json.load(open(metadata_filepath))[0]["valid"]
        for metadata in metadatas:
            for model in metadata:
                model_folder = model["directory"]
                filename = model["filename"]
                post_id = str(model["post_id"])
                filepath = os.path.join(model_folder, filename)

                class prepare_reformat(object):
                    def __init__(self, option):
                        self.directory = option.get('directory', "")
                        self.post_id = option.get('post_id', "")
                        self.media_id = option.get('media_id', "")
                        self.filename = option.get('filename', "")
                        self.username = option.get('username', username)
                        self.text = option.get('text', "")
                        self.postedAt = option.get('postedAt', "")

                x = prepare_reformat(model)
                x = json.loads(json.dumps(x, default=lambda o: o.__dict__))
                if os.path.isfile(filepath):
                    new_format = main_helper.reformat(**x)
Example #12
def media_scraper(result, sessions, formatted_directories, username, api_type):
    link = result["link"]
    session = sessions[result["count"]]
    media_set = []
    y = main_helper.json_request(session, link)
    if not y or "error" in y:
        return media_set
    x = 0
    if api_type == "Highlights":
        y = y["stories"]
    if api_type == "Messages":
        y = y["list"]
    if api_type == "Mass Messages":
        y = y["list"]
    model_directory = formatted_directories["model_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = location["sorted_directories"]
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        if result["count"] == 0:
            seperator = " | "
            print("Scraping ["+str(seperator.join(alt_media_type)) +
                  "]. Should take less than a minute.")
        media_set2 = {}
        media_set2["type"] = media_type
        media_set2["valid"] = []
        media_set2["invalid"] = []
        for media_api in y:
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                media_user = media_api["fromUser"]
                media_username = media_user["username"]
                if media_username != username:
                    continue
            for media in media_api["media"]:
                date = "-001-11-30T00:00:00+00:00"
                size = 0
                if "source" in media:
                    source = media["source"]
                    link = source["source"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
                if "src" in media:
                    link = media["src"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["createdAt"]
                if not link:
                    continue
                matches = ["us", "uk", "ca", "ca2", "de"]

                url = urlparse(link)
                subdomain = url.hostname.split('.')[0]
                preview_link = media["preview"]
                if any(subdomain in nm for nm in matches):
                    subdomain = url.hostname.split('.')[1]
                    if "upload" in subdomain:
                        continue
                    if "convert" in subdomain:
                        link = preview_link
                rules = [link == "",
                         preview_link == ""]
                if all(rules):
                    continue
                new_dict = dict()
                new_dict["post_id"] = media_api["id"]
                new_dict["media_id"] = media["id"]
                new_dict["links"] = []
                for xlink in link, preview_link:
                    if xlink:
                        new_dict["links"].append(xlink)
                        break
                new_dict["price"] = media_api["price"]if "price" in media_api else None
                if date == "-001-11-30T00:00:00+00:00":
                    date_string = master_date
                    date_object = datetime.strptime(
                        master_date, "%d-%m-%Y %H:%M:%S")
                else:
                    date_object = datetime.fromisoformat(date)
                    date_string = date_object.replace(tzinfo=None).strftime(
                        "%d-%m-%Y %H:%M:%S")
                    master_date = date_string

                if media["type"] not in alt_media_type:
                    x += 1
                    continue
                if "rawText" not in media_api:
                    media_api["rawText"] = ""
                text = media_api["rawText"] if media_api["rawText"] else ""
                matches = [s for s in ignored_keywords if s in text]
                if matches:
                    print("Matches: ", matches)
                    continue
                text = main_helper.clean_text(text)
                new_dict["postedAt"] = date_string
                post_id = new_dict["post_id"]
                media_id = new_dict["media_id"]
                file_name = link.rsplit('/', 1)[-1]
                file_name, ext = os.path.splitext(file_name)
                ext = ext.replace(".", "").split('?')[0]
                media_directory = os.path.join(
                    model_directory, sorted_directories["unsorted"])
                new_dict["paid"] = False
                if new_dict["price"]:
                    if api_type in ["Messages", "Mass Messages"]:
                        new_dict["paid"] = True
                    else:
                        if media["id"] not in media_api["preview"] and media["canView"]:
                            new_dict["paid"] = True
                if sort_free_paid_posts:
                    media_directory = os.path.join(
                        model_directory, sorted_directories["free"])
                    if new_dict["paid"]:
                        media_directory = os.path.join(
                            model_directory, sorted_directories["paid"])
                file_path = main_helper.reformat(media_directory, post_id, media_id, file_name,
                                                 text, ext, date_object, username, file_directory_format, file_name_format, date_format, maximum_length)
                new_dict["text"] = text
                file_directory = os.path.dirname(file_path)
                new_dict["directory"] = os.path.join(file_directory)
                new_dict["filename"] = os.path.basename(file_path)
                new_dict["size"] = size
                if size == 0:
                    media_set2["invalid"].append(new_dict)
                    continue
                new_dict["session"] = session
                media_set2["valid"].append(new_dict)
        media_set.append(media_set2)
    return media_set
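
The sort_free_paid_posts branch above decides the target directory in two steps: the unsorted bucket by default, then free vs. paid when sorting is enabled. A minimal sketch of that decision:

import os

def pick_directory(model_directory, sorted_directories, sort_free_paid_posts, paid):
    # default bucket; switch to free/paid only when sorting is enabled
    chosen = sorted_directories["unsorted"]
    if sort_free_paid_posts:
        chosen = sorted_directories["paid"] if paid else sorted_directories["free"]
    return os.path.join(model_directory, chosen)
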