Пример #1
0
 async def profile_scraper(self, subscription: user_types):
     """Download the subscription's avatar and header images.

     Images are stored under ``<resolved directory>/Profile/Avatars`` and
     ``<resolved directory>/Profile/Headers``. A file that already exists on
     disk is skipped, as is any image whose link is empty.

     Returns ``None`` in every case. The function exits early when the
     subscription has no directory manager, when the site settings are
     unavailable, or when ``remove_non_unique`` does not yield a ``Path``.
     """
     authed = subscription.get_authed()
     site_settings = authed.api.get_site_settings()
     directory_manager = subscription.directory_manager
     if not (directory_manager and site_settings):
         return

     p_r = prepare_reformat()
     p_r.site_name = authed.api.site_name
     p_r.model_username = subscription.username
     p_r.api_type = "Profile"
     p_r.text_length = site_settings.text_length
     p_r.directory = directory_manager.root_download_directory
     directory = await p_r.remove_non_unique(
         directory_manager, "file_directory_format"
     )
     if not isinstance(directory, Path):
         return
     directory = directory.joinpath(p_r.api_type)

     override_media_types: list[tuple[str, Any]] = [
         ("Avatars", subscription.avatar),
         ("Headers", subscription.header),
     ]
     session_manager = authed.session_manager
     # Open the session only after every early-return check has passed, and
     # release it in ``finally`` so a failed request or write cannot leak it.
     session = await session_manager.create_client_session()
     progress_bar = None
     try:
         for media_type, media_link in override_media_types:
             if not media_link:
                 continue
             media_directory = directory.joinpath(media_type)
             media_directory.mkdir(parents=True, exist_ok=True)
             # The file is named after the second-to-last "/" segment of the link.
             download_path = media_directory.joinpath(
                 f"{media_link.split('/')[-2]}.jpg"
             )
             if download_path.is_file():
                 continue
             # HEAD request first: only its content length is used, to grow
             # the progress bar total before the body is streamed.
             head_response = await session_manager.json_request(
                 media_link, method="HEAD"
             )
             if not head_response:
                 continue
             if not progress_bar:
                 progress_bar = main_helper.download_session()
                 progress_bar.start(unit="B", unit_scale=True, miniters=1)
             progress_bar.update_total_size(head_response.content_length)
             response = await session_manager.json_request(
                 media_link,
                 session,
                 stream=True,
                 json_format=False,
             )
             await main_helper.write_data(response, download_path, progress_bar)
     finally:
         await session.close()
         if progress_bar:
             progress_bar.close()  # type: ignore
Пример #2
0
def start(subscription, api_type, api_path, site_name, json_settings):
    """Recompute file locations for the scraped metadata of one API type.

    Reads ``subscription.scraped.<api_type>``, resolves the base directory
    from the configured formats, lists every file below it, and runs each
    status group of every media type (except ``"Texts"``) through
    ``fix_directories`` and ``fix_metadata``. Directories reported by
    ``fix_directories`` are created. ``download_info["base_directory"]`` is
    set as a side effect.

    Returns the (mutated) metadata object.
    """
    metadata = getattr(subscription.scraped, api_type)
    download_info = subscription.download_info
    root_directory = download_info["directory"]
    date_format = json_settings["date_format"]
    text_length = json_settings["text_length"]
    format_names = (
        "metadata_directory_format",
        "file_directory_format",
        "filename_format",
    )
    reformats = {name: json_settings[name] for name in format_names}
    username = subscription.username
    option = {
        "site_name": site_name,
        "api_type": api_type,
        "username": username,
        "date_format": date_format,
        "maximum_length": text_length,
        "directory": root_directory,
    }

    # Cut every format string right after its first "unique" placeholder.
    unique = format_types(reformats).check_unique()["unique"]
    for name in format_names:
        marker = getattr(unique, name)[0]
        reformats[name] = reformats[name].split(marker, 1)[0] + marker

    _, base_directory, _ = prepare_reformat(
        option, keep_vars=True).reformat(reformats)
    download_info["base_directory"] = base_directory

    all_files = [
        os.path.join(root, file_name)
        for root, _subdirs, files in os.walk(base_directory)
        for file_name in files
    ]

    for media_type, value in metadata.content:
        if media_type == "Texts":
            continue
        for status, value2 in value:
            fixed, new_directories = fix_directories(
                value2, root_directory, site_name, api_path, media_type,
                username, all_files, json_settings)
            for new_directory in new_directories:
                os.makedirs(os.path.abspath(new_directory), exist_ok=True)
            setattr(value, status, fix_metadata(fixed))
        setattr(metadata.content, media_type, value)
    return metadata
Пример #3
0
def profile_scraper(api: start, site_name, api_type, username, text_length,
                    base_directory):
    """Download the avatar and header image of ``username``.

    The target directory is the second element returned by
    ``prepare_reformat(...).reformat(...)``; images go into its ``Avatars``
    and ``Headers`` sub-directories. Existing files are kept unless the
    module-level ``overwrite_files`` flag is truthy. Relies on the
    module-level ``json_settings`` and ``date_format`` as well.
    """
    reformats = {
        "metadata_directory_format": json_settings["metadata_directory_format"],
        # The "{value}" placeholder is stripped from the directory format.
        "file_directory_format": json_settings["file_directory_format"].replace(
            "{value}", ""),
        "filename_format": json_settings["filename_format"],
    }
    option = {
        "site_name": site_name,
        "api_type": api_type,
        "username": username,
        "date_format": date_format,
        "maximum_length": text_length,
        "directory": base_directory,
    }
    _, profile_directory, _ = prepare_reformat(
        option, keep_vars=True).reformat(reformats)

    subscription = api.get_subscription(identifier=username)
    profile_images = (
        ("Avatars", subscription.avatar),
        ("Headers", subscription.header),
    )
    for media_type, media_link in profile_images:
        if not media_link:
            continue
        target_directory = os.path.join(profile_directory, media_type)
        os.makedirs(target_directory, exist_ok=True)
        # The file is named after the second-to-last "/" segment of the link.
        download_path = os.path.join(
            target_directory, media_link.split("/")[-2] + ".jpg")
        if not overwrite_files and os.path.isfile(download_path):
            continue
        session = api.sessions[0]
        response = api.json_request(media_link,
                                    session,
                                    stream=True,
                                    json_format=False,
                                    sleep=False)
        if not isinstance(response, requests.Response):
            continue
        # Keep calling the downloader until it reports a truthy result.
        while not main_helper.downloader(response, download_path):
            pass
Пример #4
0
def fix_directories(post_item, base_directory, site_name, api_type, media_type,
                    username, all_files, json_settings):
    """Attach old and new file paths to every media object in ``post_item``.

    For each media object the new path is rendered from the configured
    directory and filename formats. The old path is the first entry of
    ``all_files`` that contains ``media.filename`` (made absolute), or ``""``
    when nothing matches. Both are stored on the media object as
    ``old_filepath`` and ``new_filepath``.

    Returns ``(post_item, new_directories)`` where ``new_directories`` is the
    de-duplicated list of parent directories of the new paths.
    """
    target_directories = []
    for posts in post_item:
        for media in posts:
            # Prefer the path of the first link; fall back to the filename.
            source_path = (urlparse.urlparse(media.links[0]).path
                           if media.links else media.filename)
            stem, extension = os.path.splitext(os.path.basename(source_path))
            file_directory_format = json_settings["file_directory_format"]
            filename_format = json_settings["filename_format"]
            date_format = json_settings["date_format"]
            text_length = json_settings["text_length"]
            # Keys set here override same-named keys coming from the media.
            option = {
                **media.convert(keep_empty_items=True),
                "site_name": site_name,
                "filename": stem,
                "api_type": api_type,
                "media_type": media_type,
                "ext": extension.replace(".", ""),
                "username": username,
                "date_format": date_format,
                "maximum_length": text_length,
                "directory": base_directory,
            }
            prepared_format = prepare_reformat(option)
            prepared_format.directory = main_helper.reformat(
                prepared_format, file_directory_format)
            candidates = [
                existing for existing in all_files
                if media.filename in existing
            ]
            old_filepath = os.path.abspath(candidates[0]) if candidates else ""
            new_filepath = main_helper.reformat(prepared_format,
                                                filename_format)
            media.old_filepath = old_filepath
            media.new_filepath = new_filepath
            target_directories.append(os.path.dirname(new_filepath))
    return post_item, list(set(target_directories))
Пример #5
0
def start(Session, parent_type, api_type, api_path, site_name, subscription,
          folder, json_settings):
    """Recompute file locations for every row of ``folder.api_table``.

    Opens a database session from ``Session``, loads all rows of the API
    table, resolves the base directory from the configured formats, lists
    every file below it and hands everything to ``fix_directories``.
    ``download_info["base_directory"]`` is set as a side effect.

    The session is closed even when one of the steps raises.

    Returns ``subscription.scraped.<api_type>``.
    """
    database_session = Session()
    try:
        result = database_session.query(folder.api_table).all()
        metadata = getattr(subscription.scraped, api_type)
        download_info = subscription.download_info
        root_directory = download_info["directory"]
        date_format = json_settings["date_format"]
        text_length = json_settings["text_length"]
        reformats = {
            "metadata_directory_format":
            json_settings["metadata_directory_format"],
            "file_directory_format": json_settings["file_directory_format"],
            "filename_format": json_settings["filename_format"],
        }
        username = subscription.username
        option = {
            "site_name": site_name,
            "api_type": api_type,
            "username": username,
            "date_format": date_format,
            "maximum_length": text_length,
            "directory": root_directory,
        }

        # Cut every format string right after its first "unique" placeholder.
        unique = format_types(reformats).check_unique()["unique"]
        for key, value in reformats.items():
            marker = getattr(unique, key)[0]
            reformats[key] = value.split(marker, 1)[0] + marker

        _, base_directory, _ = prepare_reformat(
            option, keep_vars=True).reformat(reformats)
        download_info["base_directory"] = base_directory

        all_files = [
            os.path.join(root, file_name)
            for root, _subdirs, files in os.walk(base_directory)
            for file_name in files
        ]

        fix_directories(result, all_files, database_session, folder,
                        site_name, parent_type, api_type, username,
                        root_directory, json_settings)
    finally:
        # Release the session on every path, including failures above.
        database_session.close()
    return metadata
Пример #6
0
def format_directories(directory,
                       site_name,
                       username,
                       unformatted,
                       locations=None,
                       api_type="") -> dict:
    """Build the directory layout description for one user and API type.

    Args:
        directory: Root download directory.
        site_name: Site name, used for the legacy paths and the formatter.
        username: User name, used for the legacy paths and the formatter.
        unformatted: Format string rendered into ``metadata_directory``.
        locations: Iterable of ``(media_type, alt_media_type)`` pairs;
            ``None`` (the default) means no locations.
        api_type: API type used as the first component of sorted paths.

    Returns:
        A dict with the keys ``legacy_model_directory``, ``legacy_metadatas``,
        ``metadata_directory``, ``download_directory`` and ``locations``.
        Each entry of ``locations`` holds ``sorted_directories`` (relative
        paths keyed by ``unsorted``/``free``/``paid``), ``media_type`` and
        ``alt_media_type``.

    Relies on the module-level ``date_format`` and ``maximum_length``.
    """
    option = {
        "site_name": site_name,
        "username": username,
        "directory": directory,
        "postedAt": datetime.today(),
        "date_format": date_format,
        "maximum_length": maximum_length,
    }
    prepared_format = prepare_reformat(option)
    legacy_model_directory = os.path.join(directory, site_name, username)
    formatted = {
        "legacy_model_directory": legacy_model_directory,
        "legacy_metadatas": {
            "legacy_metadata": os.path.join(
                legacy_model_directory, api_type, "Metadata"),
            "legacy_metadata2": os.path.join(
                legacy_model_directory, "Metadata"),
        },
        "metadata_directory": main_helper.reformat(prepared_format,
                                                   unformatted),
        "download_directory": directory,
        "locations": [],
    }
    categories = ("Unsorted", "Free", "Paid")
    # ``None`` replaces the former mutable ``[]`` default.
    for location in locations or ():
        media_type = location[0]
        # "Unsorted" media live directly under the API type directory.
        sorted_directories = {
            category.lower(): os.path.join(
                api_type, "" if category == "Unsorted" else category,
                media_type)
            for category in categories
        }
        formatted["locations"].append({
            "sorted_directories": sorted_directories,
            "media_type": media_type,
            "alt_media_type": location[1],
        })
    return formatted
Пример #7
0
async def start(
    subscription: user_types,
    api_type: str,
    Session: scoped_session,
    site_settings: SiteSettings,
):
    """Run ``fix_directories`` over every stored row of one API type.

    Opens a database session from ``Session``, loads all rows of the table
    selected by ``user_database.table_picker(api_type)`` and passes them to
    ``fix_directories``. The session is closed even when a step raises.

    Returns ``subscription.temp_scraped.<api_type>``.
    """
    authed = subscription.get_authed()
    directory_manager = subscription.directory_manager
    api_table_ = user_database.table_picker(api_type)
    database_session: scoped_session = Session()
    try:
        # Slow
        # NOTE(review): the prepared formatter is not read again in this
        # function; it is still built in case ``standard`` has side effects.
        p_r = await prepare_reformat().standard(
            site_name=authed.api.site_name,
            profile_username=authed.username,
            user_username=subscription.username,
            date=datetime.today(),
            date_format=site_settings.date_format,
            text_length=site_settings.text_length,
            directory=directory_manager.root_metadata_directory,
        )
        p_r.api_type = api_type
        result = database_session.query(api_table_).all()
        metadata = getattr(subscription.temp_scraped, api_type)

        await fix_directories(
            result,
            subscription,
            database_session,
            api_type,
        )
    finally:
        # Release the session on every path, including failures above.
        database_session.close()
    return metadata
Пример #8
0
def media_scraper(results, api, formatted_directories, username, api_type, parent_type=""):
    """Turn raw API results into post/media records with target paths.

    Args:
        results: Raw API payload: a list of post dicts, a dict with a
            ``"stories"`` list (for ``api_type == "Stories"``), or a wrapper
            dict with ``"result"`` and ``"session"`` keys.
        api: API object; ``api.sessions[0]`` is the default session.
        formatted_directories: Dict with ``"download_directory"`` and
            ``"locations"`` (each location has ``"media_type"`` and
            ``"alt_media_type"``).
        username: User name; for ``"Mass Messages"`` only posts sent by this
            user are kept.
        api_type: API type name.
        parent_type: Inserted after ``{api_type}`` in the directory format
            when ``api_type == "Archived"``.

    Returns:
        ``{"content": [...], "directories": [...]}``. When the payload is
        empty or carries an ``"error"`` key, only ``{"content": []}`` is
        returned.

    Side effects: every selected link is appended to ``session.links``; for
    ``"Messages"`` the post's ``"rawText"`` is overwritten with its
    ``"text"``. Relies on the module-level ``json_settings``,
    ``ignored_keywords``, ``date_format``, ``text_length`` and
    ``filename_format``.
    """
    new_set = {}
    new_set["content"] = []
    directories = []
    session = api.sessions[0]
    if api_type == "Stories" and "stories" in results:
        # Every story inherits the title of its container as text.
        for item in results["stories"]:
            item["text"] = results["title"]
        results = results["stories"]
    if not results or "error" in results:
        return new_set
    if "result" in results:
        session = results["session"]
        results = results["result"]
        if "error" in results:
            return new_set
    download_path = formatted_directories["download_directory"]
    # Posts carrying this sentinel date reuse the last real date seen.
    placeholder_date = "-001-11-30T00:00:00+00:00"
    region_subdomains = ["us", "uk", "ca", "ca2", "de"]
    for location in formatted_directories["locations"]:
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        file_directory_format = json_settings["file_directory_format"]
        if api_type == "Archived":
            substr = "{api_type}"
            # Only rewrite when the placeholder is a whole path component.
            if substr in file_directory_format.split(os.sep):
                file_directory_format = file_directory_format.replace(
                    substr, os.path.join(substr, parent_type))
        seperator = " | "
        print(
            f"Scraping [{seperator.join(alt_media_type)}]. Should take less than a minute.")
        for media_api in results:
            new_post = {}
            new_post["medias"] = []
            rawText = media_api.get("rawText", "")
            text = media_api.get("text", "")
            final_text = rawText if rawText else text
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                if media_api["fromUser"]["username"] != username:
                    continue
            date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
            if date == placeholder_date:
                date_string = master_date
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(tzinfo=None).strftime(
                    "%d-%m-%Y %H:%M:%S")
                master_date = date_string
            new_post["post_id"] = media_api["id"]
            new_post["text"] = final_text
            new_post["postedAt"] = date_string
            new_post["paid"] = False
            price = media_api.get("price")
            new_post["price"] = price
            can_purchase = media_api.get("canPurchase", None)
            can_view_media = media_api.get("canViewMedia", None)
            # A priced post counts as paid when it can no longer be purchased
            # or its media is viewable.
            if price and (not can_purchase or can_view_media):
                new_post["paid"] = True
            for media in media_api["media"]:
                media_id = media["id"]
                link = ""
                if "source" in media:
                    quality_key = "source"
                    link = media[quality_key][quality_key]
                    if link and media["type"] == "video":
                        # Swap in the configured quality when it is offered;
                        # the "source" entry stands for the original link.
                        qualities = {**media["videoSources"], quality_key: link}
                        wanted_quality = json_settings[
                            "video_quality"].removesuffix("p")
                        if wanted_quality in qualities:
                            link = qualities[wanted_quality]
                if "src" in media:
                    link = media["src"]

                if not link:
                    continue
                url = urlparse(link)
                # A link without a host cannot be classified; skip it
                # instead of failing on ``None.split``.
                if not url.hostname:
                    continue
                subdomain = url.hostname.split('.')[0]
                preview_link = media["preview"]
                if any(subdomain in nm for nm in region_subdomains):
                    subdomain = url.hostname.split('.')[1]
                    if "upload" in subdomain:
                        continue
                    if "convert" in subdomain:
                        link = preview_link
                # After the "convert" substitution the link may be empty or
                # missing; there is nothing to download in that case.
                if not link:
                    continue
                new_media = dict()
                new_media["media_id"] = media_id
                new_media["links"] = [link]
                new_media["media_type"] = media_type
                # Links are registered before the media-type and keyword
                # filters below.
                session.links.extend(new_media["links"])

                if media["type"] not in alt_media_type:
                    continue
                keyword_hits = [s for s in ignored_keywords if s in final_text]
                if keyword_hits:
                    print("Matches: ", keyword_hits)
                    continue
                filename = link.rsplit('/', 1)[-1]
                filename, ext = os.path.splitext(filename)
                ext = ext.replace(".", "").split('?')[0]

                option = {}
                option = option | new_post
                option["site_name"] = "OnlyFans"
                option["media_id"] = media_id
                option["filename"] = filename
                option["api_type"] = api_type
                option["media_type"] = media_type
                option["ext"] = ext
                option["username"] = username
                option["date_format"] = date_format
                option["text_length"] = text_length
                option["directory"] = download_path

                prepared_format = prepare_reformat(option)
                file_directory = main_helper.reformat(
                    prepared_format, file_directory_format)
                prepared_format.directory = file_directory
                file_path = main_helper.reformat(
                    prepared_format, filename_format)
                new_media["directory"] = os.path.join(file_directory)
                new_media["filename"] = os.path.basename(file_path)
                if file_directory not in directories:
                    directories.append(file_directory)
                new_post["medias"].append(new_media)
            new_set["content"].append(new_post)
    new_set["directories"] = directories
    return new_set
Пример #9
0
    async def fix_directories2(
        post: api_table, media_db: list[template_media_table], all_files: list[Path]
    ):
        """Relocate the files of one post's media rows to their current paths.

        For every row of ``media_db`` that belongs to ``post`` the target
        directory and file path are rendered from the site settings. If a
        matching file is found in ``all_files`` at a different location it is
        moved there (or copied, for linked media stored in another
        directory). Each row's ``directory`` and ``filename`` are updated,
        and ``downloaded`` is set when the file exists and the row has a size.

        Uses names defined outside this function: ``api_type``,
        ``site_settings``, ``subscription``, ``api``, ``authed`` and
        ``new_directories`` (which is appended to).

        Returns the rows of this post that share the new filename with a
        processed row but have a different ``id``; they are only collected
        when an existing old file was found for that row.
        """
        delete_rows = []
        # Archived posts are filed under "Archived/<api_type>".
        final_api_type = (
            os.path.join("Archived", api_type) if post.archived else api_type
        )
        post_id = post.post_id
        # Only the media rows of this post are processed.
        media_db = [x for x in media_db if x.post_id == post_id]
        for media in media_db:
            media_id = media.media_id
            # Derive name and extension from the link's URL path when there
            # is a link, otherwise from the stored filename.
            if media.link:
                url_path = urlparse.urlparse(media.link).path
                url_path = Path(url_path)
            else:
                url_path = Path(media.filename)
            new_filename = url_path.name
            original_filename, ext = (url_path.stem, url_path.suffix)
            ext = ext.replace(".", "")

            file_directory_format = site_settings.file_directory_format
            filename_format = site_settings.filename_format
            date_format = site_settings.date_format
            text_length = site_settings.text_length
            download_path = subscription.directory_manager.root_download_directory
            option = {}
            option["site_name"] = api.site_name
            option["post_id"] = post_id
            option["media_id"] = media_id
            option["profile_username"] = authed.username
            option["model_username"] = subscription.username
            option["api_type"] = final_api_type
            option["media_type"] = media.media_type
            option["filename"] = original_filename
            option["ext"] = ext
            option["text"] = post.text
            option["postedAt"] = media.created_at
            option["price"] = post.price
            option["date_format"] = date_format
            option["text_length"] = text_length
            option["directory"] = download_path
            option["preview"] = media.preview
            option["archived"] = post.archived
            prepared_format = prepare_reformat(option)
            file_directory = await prepared_format.reformat_2(file_directory_format)
            prepared_format.directory = file_directory
            old_filepath = ""
            # Linked media get a "linked_" prefix on the filename format.
            # NOTE(review): ``with_name`` implies ``filename_format`` is
            # path-like here; confirm against the settings type.
            if media.linked:
                filename_format = filename_format.with_name(f"linked_{filename_format}")
            new_filepath = await prepared_format.reformat_2(filename_format)
            # Candidates: known files whose name contains the original
            # filename and that are not already at the new location.
            old_filepaths = [
                x
                for x in all_files
                if original_filename in x.name and x.parts != new_filepath.parts
            ]
            # Fallback: any known file whose name contains the media id.
            if not old_filepaths:
                old_filepaths = [x for x in all_files if str(media_id) in x.name]
                print
            # NOTE(review): ``x.parts`` holds whole path components, so this
            # only drops paths with a component exactly equal to "linked_".
            # If the intent is to drop "linked_"-prefixed file names, this
            # filter looks ineffective; confirm.
            if not media.linked:
                old_filepaths: list[Path] = [
                    x for x in old_filepaths if "linked_" not in x.parts
                ]
            if old_filepaths:
                old_filepath = old_filepaths[0]
            # a = randint(0,1)
            # await asyncio.sleep(a)
            if old_filepath and old_filepath != new_filepath:
                moved = None
                # Loop until the move/copy returns a truthy value or the old
                # file no longer exists.
                # NOTE(review): an OSError is only printed and ``moved`` stays
                # None, so a persistent error is retried without a limit.
                while not moved:
                    try:
                        if old_filepath.exists():
                            # NOTE(review): the suffix checked here comes from
                            # ``url_path``, not from ``old_filepath``; confirm
                            # which one is meant.
                            _old_filename, old_ext = (url_path.stem, url_path.suffix)
                            if ".part" == old_ext:
                                # Delete the file; the next pass then leaves
                                # the loop because it no longer exists.
                                old_filepath.unlink()
                                continue
                            if media.size:
                                media.downloaded = True
                            # Other rows of this post with the same filename
                            # are reported to the caller.
                            found_dupes = [
                                x
                                for x in media_db
                                if x.filename == new_filename and x.id != media.id
                            ]
                            delete_rows.extend(found_dupes)
                            os.makedirs(os.path.dirname(new_filepath), exist_ok=True)
                            # Linked media are copied when the directory
                            # changes; everything else is moved.
                            if media.linked:
                                if os.path.dirname(old_filepath) == os.path.dirname(
                                    new_filepath
                                ):
                                    moved = shutil.move(old_filepath, new_filepath)
                                else:
                                    moved = shutil.copy(old_filepath, new_filepath)
                            else:
                                moved = shutil.move(old_filepath, new_filepath)
                        else:
                            break
                    except OSError as e:
                        print(traceback.format_exc())
                    print
                print

            if os.path.exists(new_filepath):
                if media.size:
                    media.downloaded = True
            if prepared_format.text:
                pass
            # Persist the new location on the row and record its directory.
            media.directory = file_directory.as_posix()
            media.filename = os.path.basename(new_filepath)
            new_directories.append(os.path.dirname(new_filepath))
        return delete_rows
Пример #10
0
    async def media_scraper(
        self,
        post_result: Union[create_story, create_post, create_message],
        subscription: create_user,
        formatted_directory: Path,
        api_type: str,
    ):
        authed = subscription.get_authed()
        api = authed.api
        site_settings = api.get_site_settings()
        if not site_settings:
            return
        new_set: dict[str, Any] = {}
        new_set["content"] = []
        directories: list[Path] = []
        if api_type == "Stories":
            pass
        if api_type == "Archived":
            pass
        if api_type == "Posts":
            pass
        if api_type == "Products":
            pass
        if api_type == "Messages":
            pass
        download_path = formatted_directory
        model_username = subscription.username
        date_format = site_settings.date_format
        locations = self.media_types
        for media_type, alt_media_types in locations.__dict__.items():
            date_today = datetime.now()
            master_date = datetime.strftime(date_today, "%d-%m-%Y %H:%M:%S")
            file_directory_format = site_settings.file_directory_format
            post_id = post_result.id
            new_post = {}
            new_post["medias"] = []
            new_post["archived"] = False
            rawText = ""
            text = ""
            previews = []
            date = None
            price = None

            if isinstance(post_result, create_story):
                date = post_result.createdAt
            if isinstance(post_result, create_post):
                if post_result.isReportedByMe:
                    continue
                rawText = post_result.rawText
                text = post_result.text
                previews = post_result.preview
                date = post_result.postedAt
                price = post_result.price
                new_post["archived"] = post_result.isArchived
            if isinstance(post_result, create_product):
                if post_result.isReportedByMe:
                    continue
                title = post_result.title
                rawText = post_result.rawText
                text = post_result.text
                previews = post_result.preview
                date = post_result.postedAt
                price = post_result.price
                new_post["title"] = title
                new_post["archived"] = post_result.isArchived
            if isinstance(post_result, create_message):
                if post_result.isReportedByMe:
                    continue
                text = post_result.text
                previews = post_result.previews
                date = post_result.createdAt
                price = post_result.price
                if api_type == "Mass Messages":
                    media_user = post_result.fromUser
                    media_username = media_user.username
                    if media_username != model_username:
                        continue
            final_text = rawText if rawText else text

            if date == "-001-11-30T00:00:00+00:00":
                date_string = master_date
                date_object = datetime.strptime(master_date,
                                                "%d-%m-%Y %H:%M:%S")
            else:
                if not date:
                    date = master_date
                if "T" in date:
                    date_object = datetime.fromisoformat(date)
                else:
                    date_object = datetime.strptime(date, "%d-%m-%Y %H:%M:%S")

                date_string = date_object.replace(
                    tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
                master_date = date_string
            new_post["post_id"] = post_id
            new_post["user_id"] = subscription.id
            if isinstance(post_result, create_message):
                new_post["user_id"] = post_result.fromUser.id

            new_post["text"] = final_text
            new_post["postedAt"] = date_string
            new_post["paid"] = False
            new_post["preview_media_ids"] = previews
            new_post["api_type"] = api_type
            new_post["price"] = 0
            if price is None:
                price = 0
            if price:
                if all(media["canView"] for media in post_result.media):
                    new_post["paid"] = True
                else:
                    print
            new_post["price"] = price
            for media in post_result.media:
                media_id = media["id"]
                preview_link = ""
                link = await post_result.link_picker(
                    media, site_settings.video_quality)
                matches = ["us", "uk", "ca", "ca2", "de"]

                if not link:
                    continue
                url = urlparse(link)
                if not url.hostname:
                    continue
                subdomain = url.hostname.split(".")[0]
                preview_link = media["preview"]
                if any(subdomain in nm for nm in matches):
                    subdomain = url.hostname.split(".")[1]
                    if "upload" in subdomain:
                        continue
                    if "convert" in subdomain:
                        link = preview_link
                rules = [link == "", preview_link == ""]
                if all(rules):
                    continue
                new_media: dict[str, Any] = dict()
                new_media["media_id"] = media_id
                new_media["links"] = []
                new_media["media_type"] = media_type
                new_media["preview"] = False
                new_media["created_at"] = new_post["postedAt"]
                if isinstance(post_result, create_story):
                    date_object = datetime.fromisoformat(media["createdAt"])
                    date_string = date_object.replace(
                        tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
                    new_media["created_at"] = date_string
                if int(media_id) in new_post["preview_media_ids"]:
                    new_media["preview"] = True
                for xlink in link, preview_link:
                    if xlink:
                        new_media["links"].append(xlink)
                        break

                if media["type"] not in alt_media_types:
                    continue
                matches = [
                    s for s in site_settings.ignored_keywords
                    if s in final_text
                ]
                if matches:
                    print("Ignoring - ", f"PostID: {post_id}")
                    continue
                filename = link.rsplit("/", 1)[-1]
                filename, ext = os.path.splitext(filename)
                ext = ext.__str__().replace(".", "").split("?")[0]
                final_api_type = (os.path.join("Archived", api_type)
                                  if new_post["archived"] else api_type)
                option: dict[str, Any] = {}
                option = option | new_post
                option["site_name"] = api.site_name
                option["media_id"] = media_id
                option["filename"] = filename
                option["api_type"] = final_api_type
                option["media_type"] = media_type
                option["ext"] = ext
                option["profile_username"] = authed.username
                option["model_username"] = model_username
                option["date_format"] = date_format
                option["postedAt"] = new_media["created_at"]
                option["text_length"] = site_settings.text_length
                option["directory"] = download_path
                option["preview"] = new_media["preview"]
                option["archived"] = new_post["archived"]

                prepared_format = prepare_reformat(option)
                file_directory = await prepared_format.reformat_2(
                    file_directory_format)
                prepared_format.directory = file_directory
                file_path = await prepared_format.reformat_2(
                    site_settings.filename_format)
                new_media["directory"] = os.path.join(file_directory)
                new_media["filename"] = os.path.basename(file_path)
                if file_directory not in directories:
                    directories.append(file_directory)
                new_media["linked"] = None
                for k, v in subscription.temp_scraped:
                    if k == api_type:
                        continue
                    if k == "Archived":
                        v = getattr(v, api_type, [])
                    if v:
                        for post in v:
                            found_medias = []
                            medias = post.media
                            if medias:
                                for temp_media in medias:
                                    temp_filename = temp_media.get("filename")
                                    if temp_filename:
                                        if temp_filename == new_media[
                                                "filename"]:
                                            found_medias.append(temp_media)
                                    else:
                                        continue
                            # found_medias = [x for x in medias
                            #                 if x["filename"] == new_media["filename"]]
                            if found_medias:
                                for found_media in found_medias:
                                    found_media["linked"] = api_type
                                new_media["linked"] = post["api_type"]
                                new_media[
                                    "filename"] = f"linked_{new_media['filename']}"
                                print
                            print
                        print
                    print
                new_post["medias"].append(new_media)
            found_post = [
                x for x in new_set["content"] if x["post_id"] == post_id
            ]
            if found_post:
                found_post = found_post[0]
                found_post["medias"] += new_post["medias"]
            else:
                new_set["content"].append(new_post)
        new_set["directories"] = directories
        return new_set
Пример #11
0
    def fix_directories(post: api_table, media_db: list[media_table]):
        """Move the files of one post to the paths the current formats produce.

        For every row of ``media_db`` that belongs to ``post`` the target
        directory and filename are rebuilt with ``prepare_reformat`` and
        ``main_helper.reformat`` from the templates in ``json_settings``.  If a
        path in ``all_files`` matches the media and differs from the target,
        the file is moved there; linked media whose source lives in a
        different directory are copied instead.  Each processed row gets its
        ``directory`` and ``filename`` updated in place (and ``downloaded`` set
        when it has a size and the file exists), and the target directory is
        appended to ``new_directories``.

        Relies on names from the enclosing scope: ``api_type``,
        ``parent_type``, ``json_settings``, ``base_directory``, ``site_name``,
        ``username``, ``all_files`` and ``new_directories``.

        Args:
            post: Post row; ``post_id``, ``text`` and ``price`` are read.
            media_db: Candidate media rows; only rows whose ``post_id``
                matches ``post`` are processed.

        Returns:
            Rows of this post whose ``filename`` equals the processed media's
            source basename but whose ``id`` differs (presumably duplicates
            for the caller to delete -- confirm against the caller).
        """
        delete_rows = []
        final_type = (f"{api_type}{os.path.sep}{parent_type}"
                      if parent_type else api_type)
        post_id = post.post_id
        # These settings do not change between media rows.
        file_directory_format = json_settings["file_directory_format"]
        base_filename_format = json_settings["filename_format"]
        date_format = json_settings["date_format"]
        text_length = json_settings["text_length"]
        media_db = [x for x in media_db if x.post_id == post_id]
        for media in media_db:
            media_id = media.media_id
            # Prefer the URL path; fall back to the stored filename.
            if media.link:
                path = urlparse.urlparse(media.link).path
            else:
                path = media.filename
            new_filename = os.path.basename(path)
            original_filename, ext = os.path.splitext(new_filename)
            ext = ext.replace(".", "")
            option = {
                "site_name": site_name,
                "post_id": post_id,
                "media_id": media_id,
                "username": username,
                "api_type": final_type,
                "media_type": media.media_type,
                "filename": original_filename,
                "ext": ext,
                "text": post.text,
                "postedAt": media.created_at,
                "price": post.price,
                "date_format": date_format,
                "text_length": text_length,
                "directory": base_directory,
                "preview": media.preview,
            }
            prepared_format = prepare_reformat(option)
            file_directory = main_helper.reformat(
                prepared_format, file_directory_format)
            prepared_format.directory = file_directory
            filename_format = (f"linked_{base_filename_format}"
                               if media.linked else base_filename_format)
            # Locate the existing file: by original filename first, then by
            # media id.
            old_filepaths = [
                x for x in all_files
                if original_filename in os.path.basename(x)]
            if not old_filepaths:
                media_id_text = str(media_id)
                old_filepaths = [
                    x for x in all_files
                    if media_id_text in os.path.basename(x)]
            if not media.linked:
                old_filepaths = [
                    x for x in old_filepaths if "linked_" not in x]
            old_filepath = old_filepaths[0] if old_filepaths else ""
            new_filepath = main_helper.reformat(
                prepared_format, filename_format)
            if old_filepath and old_filepath != new_filepath:
                moved = None
                # NOTE: retries without limit while the move keeps raising
                # OSError, exactly as before.
                while not moved:
                    if not os.path.exists(old_filepath):
                        # The source is gone, so there is nothing to move.
                        # Any file already at the destination is kept.
                        break
                    # Clear a stale destination only once the source is known
                    # to exist; deleting it up front could destroy the only
                    # remaining copy of the file.
                    if os.path.exists(new_filepath):
                        os.remove(new_filepath)
                    try:
                        if media.size:
                            media.downloaded = True
                        found_dupes = [
                            x for x in media_db
                            if x.filename == new_filename
                            and x.id != media.id]
                        delete_rows.extend(found_dupes)
                        os.makedirs(os.path.dirname(new_filepath),
                                    exist_ok=True)
                        same_directory = (os.path.dirname(old_filepath)
                                          == os.path.dirname(new_filepath))
                        if media.linked and not same_directory:
                            # Linked media in another directory is copied so
                            # the source file stays in place.
                            moved = shutil.copy(old_filepath, new_filepath)
                        else:
                            moved = shutil.move(old_filepath, new_filepath)
                    except OSError:
                        print(traceback.format_exc())

            if os.path.exists(new_filepath) and media.size:
                media.downloaded = True
            media.directory = file_directory
            media.filename = os.path.basename(new_filepath)
            new_directories.append(os.path.dirname(new_filepath))
        return delete_rows
Пример #12
0
def media_scraper(results, api, formatted_directories, username, api_type, parent_type=""):
    """Turn raw API results into per-media-type download descriptors.

    Relies on module-level names defined elsewhere in the file:
    ``json_settings``, ``ignored_keywords``, ``date_format``,
    ``maximum_length``, ``filename_format``, ``prepare_reformat`` and
    ``main_helper``.

    Args:
        results: API payload.  May be a list of posts, a mapping wrapping the
            posts under ``"result"`` (with a ``"session"``), or, for
            ``api_type == "Stories"``, a mapping with ``"stories"`` and
            ``"title"``.  A falsy value or one containing ``"error"`` yields
            an empty result.
        api: Object whose ``sessions[0]`` is used as the default session.
        formatted_directories: Mapping with ``"download_directory"`` and
            ``"locations"``; each location provides ``"media_type"`` and
            ``"alt_media_type"``.
        username: Passed to the path formatter; for ``"Mass Messages"`` posts
            from any other user are skipped.
        api_type: Name of the API section being scraped.
        parent_type: For ``api_type == "Archived"``, path component inserted
            after the ``{api_type}`` placeholder of the directory template.

    Returns:
        ``{}`` when there is nothing to process; otherwise a dict mapping each
        media type that produced entries to ``{"valid": [...],
        "invalid": [...]}``, plus a ``"directories"`` list of the target
        directories of valid entries.
    """
    output_date_format = "%d-%m-%Y %H:%M:%S"
    placeholder_date = "-001-11-30T00:00:00+00:00"
    known_prefixes = ("us", "uk", "ca", "ca2", "de")

    media_set = {}
    directories = []
    session = api.sessions[0]
    # Guard against a missing payload before probing it for "stories";
    # `"stories" in None` would raise TypeError.
    if api_type == "Stories" and results and "stories" in results:
        stories = results["stories"]
        for item in stories:
            item["text"] = results["title"]
        results = stories
    if not results or "error" in results:
        return media_set
    if "result" in results:
        session = results["session"]
        results = results["result"]
        if "error" in results:
            return media_set
    download_path = formatted_directories["download_directory"]
    for location in formatted_directories["locations"]:
        # Last successfully parsed date; reused for posts carrying the
        # placeholder date.
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        file_directory_format = json_settings["file_directory_format"]
        if api_type == "Archived":
            placeholder = "{api_type}"
            # Only rewrite when the placeholder is a whole path component.
            if placeholder in file_directory_format.split(os.sep):
                file_directory_format = file_directory_format.replace(
                    placeholder, os.path.join(placeholder, parent_type))
        separator = " | "
        print(
            f"Scraping [{separator.join(alt_media_type)}]. Should take less than a minute.")
        media_set2 = {"valid": [], "invalid": []}
        for media_api in results:
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                if media_api["fromUser"]["username"] != username:
                    continue
            date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
            if date == placeholder_date:
                date_string = master_date
            else:
                date_string = datetime.fromisoformat(date).replace(
                    tzinfo=None).strftime(output_date_format)
                master_date = date_string
            # Text-only posts are recorded only for the "Texts" location.
            if (not media_api["media"] and "rawText" in media_api
                    and media_type == "Texts"):
                media_set2["valid"].append({
                    "post_id": media_api["id"],
                    "text": media_api["rawText"],
                    "postedAt": date_string,
                })
            for media in media_api["media"]:
                size = 0
                link = ""
                # NOTE(review): the membership test looks at the post
                # (`media_api`) while the lookup indexes the media item;
                # confirm which object is meant to carry "info".
                if "source" in media:
                    link = media["source"]["source"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                if "src" in media:
                    link = media["src"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                if not link:
                    continue

                url = urlparse(link)
                # A link without a host cannot be classified; skip it instead
                # of failing on `None.split`.
                if not url.hostname:
                    continue
                host_labels = url.hostname.split(".")
                subdomain = host_labels[0]
                preview_link = media["preview"]
                # Substring test against the known prefixes, as before.
                if any(subdomain in prefix for prefix in known_prefixes):
                    subdomain = host_labels[1]
                    if "upload" in subdomain:
                        continue
                    if "convert" in subdomain:
                        link = preview_link
                if link == "" and preview_link == "":
                    continue
                if media["type"] not in alt_media_type:
                    continue
                # Keeps the original side effect of defaulting "rawText" on
                # the post itself.
                text = media_api.setdefault("rawText", "") or ""
                matched_keywords = [s for s in ignored_keywords if s in text]
                if matched_keywords:
                    print("Matches: ", matched_keywords)
                    continue

                first_link = link or preview_link
                new_dict = {
                    "post_id": media_api["id"],
                    "media_id": media["id"],
                    "links": [first_link] if first_link else [],
                    "price": media_api["price"] if "price" in media_api else None,
                    "postedAt": date_string,
                    "text": text,
                }
                filename, ext = os.path.splitext(link.rsplit("/", 1)[-1])
                # Drop the dot and any query string that trails the extension.
                ext = ext.replace(".", "").split("?")[0]

                option = {
                    **new_dict,
                    "site_name": "OnlyFans",
                    "filename": filename,
                    "api_type": api_type,
                    "media_type": media_type,
                    "ext": ext,
                    "username": username,
                    "date_format": date_format,
                    "maximum_length": maximum_length,
                    "directory": download_path,
                }
                prepared_format = prepare_reformat(option)
                file_directory = main_helper.reformat(
                    prepared_format, file_directory_format)
                prepared_format.directory = file_directory
                file_path = main_helper.reformat(
                    prepared_format, filename_format)
                new_dict["directory"] = os.path.join(file_directory)
                new_dict["filename"] = os.path.basename(file_path)
                new_dict["session"] = session
                if size == 0:
                    media_set2["invalid"].append(new_dict)
                    continue
                if file_directory not in directories:
                    directories.append(file_directory)
                media_set2["valid"].append(new_dict)
        if media_set2["valid"] or media_set2["invalid"]:
            media_set[media_type] = media_set2
    media_set["directories"] = directories
    return media_set
Пример #13
0
    async def fix_directories2(post: api_table,
                               media_db: list[template_media_table]):
        """Move the files of one post to the paths the current formats produce.

        For every row of ``media_db`` that belongs to ``post`` the target
        directory and filename are rebuilt with ``prepare_reformat`` and the
        awaited ``main_helper.reformat`` from the templates in
        ``json_settings``.  If a path in ``all_files`` matches the media and
        differs from the target, the file is moved there; linked media whose
        source lives in a different directory are copied instead.  Each
        processed row gets its ``directory`` and ``filename`` updated in place
        (and ``downloaded`` set when it has a size and the file exists), and
        the target directory is appended to ``new_directories``.

        Relies on names from the enclosing scope: ``api_type``,
        ``json_settings``, ``base_directory``, ``site_name``,
        ``subscription``, ``all_files`` and ``new_directories``.

        Args:
            post: Post row; ``post_id``, ``text``, ``price`` and ``archived``
                are read.
            media_db: Candidate media rows; only rows whose ``post_id``
                matches ``post`` are processed.

        Returns:
            Rows of this post whose ``filename`` equals the processed media's
            source basename but whose ``id`` differs (presumably duplicates
            for the caller to delete -- confirm against the caller).
        """
        delete_rows = []
        if post.archived:
            final_api_type = os.path.join("Archived", api_type)
        else:
            final_api_type = api_type
        post_id = post.post_id
        # These settings do not change between media rows.
        file_directory_format = json_settings["file_directory_format"]
        base_filename_format = json_settings["filename_format"]
        date_format = json_settings["date_format"]
        text_length = json_settings["text_length"]
        media_db = [x for x in media_db if x.post_id == post_id]
        for media in media_db:
            media_id = media.media_id
            # Prefer the URL path; fall back to the stored filename.
            if media.link:
                path = urlparse.urlparse(media.link).path
            else:
                path = media.filename
            new_filename = os.path.basename(path)
            original_filename, ext = os.path.splitext(new_filename)
            ext = ext.replace(".", "")
            option = {
                "site_name": site_name,
                "post_id": post_id,
                "media_id": media_id,
                "profile_username": subscription.subscriber.username,
                "model_username": subscription.username,
                "api_type": final_api_type,
                "media_type": media.media_type,
                "filename": original_filename,
                "ext": ext,
                "text": post.text,
                "postedAt": media.created_at,
                "price": post.price,
                "date_format": date_format,
                "text_length": text_length,
                "directory": base_directory,
                "preview": media.preview,
                "archived": post.archived,
            }
            prepared_format = prepare_reformat(option)
            file_directory = await main_helper.reformat(
                prepared_format, file_directory_format)
            prepared_format.directory = file_directory
            filename_format = (f"linked_{base_filename_format}"
                               if media.linked else base_filename_format)
            # Locate the existing file: by original filename first, then by
            # media id.
            old_filepaths = [
                x for x in all_files
                if original_filename in os.path.basename(x)
            ]
            if not old_filepaths:
                media_id_text = str(media_id)
                old_filepaths = [
                    x for x in all_files
                    if media_id_text in os.path.basename(x)
                ]
            if not media.linked:
                old_filepaths = [
                    x for x in old_filepaths if "linked_" not in x
                ]
            old_filepath = old_filepaths[0] if old_filepaths else ""
            new_filepath = await main_helper.reformat(prepared_format,
                                                      filename_format)
            if old_filepath and old_filepath != new_filepath:
                moved = None
                # NOTE: retries without limit while the move keeps raising
                # OSError, exactly as before.
                while not moved:
                    if not os.path.exists(old_filepath):
                        # The source is gone, so there is nothing to move.
                        # Any file already at the destination is kept.
                        break
                    # Clear a stale destination only once the source is known
                    # to exist; deleting it up front could destroy the only
                    # remaining copy of the file.
                    if os.path.exists(new_filepath):
                        os.remove(new_filepath)
                    try:
                        if media.size:
                            media.downloaded = True
                        found_dupes = [
                            x for x in media_db
                            if x.filename == new_filename
                            and x.id != media.id
                        ]
                        delete_rows.extend(found_dupes)
                        os.makedirs(os.path.dirname(new_filepath),
                                    exist_ok=True)
                        same_directory = (os.path.dirname(old_filepath)
                                          == os.path.dirname(new_filepath))
                        if media.linked and not same_directory:
                            # Linked media in another directory is copied so
                            # the source file stays in place.
                            moved = shutil.copy(old_filepath, new_filepath)
                        else:
                            moved = shutil.move(old_filepath, new_filepath)
                    except OSError:
                        print(traceback.format_exc())

            if os.path.exists(new_filepath) and media.size:
                media.downloaded = True
            media.directory = file_directory
            media.filename = os.path.basename(new_filepath)
            new_directories.append(os.path.dirname(new_filepath))
        return delete_rows
Пример #14
0
 def fix_directories(post):
     """Move the files of one post to the paths the current formats produce.

     Opens a database session via ``Session()``, loads the
     ``folder.media_table`` rows whose ``post_id`` equals ``post.id`` and, for
     each row, rebuilds the target directory and filename with
     ``prepare_reformat`` and ``main_helper.reformat`` from the templates in
     ``json_settings``.  If a path in ``all_files`` matches the media and
     differs from the target, the file is moved there.  The row's
     ``directory`` and ``filename`` are updated and committed, and the target
     directory is appended to ``new_directories``.

     Relies on names from the enclosing scope: ``api_type``, ``parent_type``,
     ``json_settings``, ``base_directory``, ``site_name``, ``username``,
     ``all_files``, ``new_directories``, ``Session`` and ``folder``.

     Args:
         post: Post row; ``id``, ``text`` and ``price`` are read.
     """
     final_type = (f"{api_type}{os.path.sep}{parent_type}"
                   if parent_type else api_type)
     database_session = Session()
     # try/finally so the session is closed even when a row fails midway.
     try:
         post_id = post.id
         result = database_session.query(folder.media_table)
         media_db = result.filter_by(post_id=post_id).all()
         # These settings do not change between media rows.
         file_directory_format = json_settings["file_directory_format"]
         filename_format = json_settings["filename_format"]
         date_format = json_settings["date_format"]
         text_length = json_settings["text_length"]
         for media in media_db:
             # Prefer the URL path; fall back to the stored filename.
             if media.link:
                 path = urlparse.urlparse(media.link).path
             else:
                 path = media.filename
             new_filename = os.path.basename(path)
             filename, ext = os.path.splitext(new_filename)
             ext = ext.replace(".", "")
             option = {
                 "site_name": site_name,
                 "post_id": post_id,
                 "media_id": media.id,
                 "username": username,
                 "api_type": final_type,
                 "media_type": media.media_type,
                 "filename": filename,
                 "ext": ext,
                 "text": post.text,
                 "postedAt": media.created_at,
                 "price": post.price,
                 "date_format": date_format,
                 "text_length": text_length,
                 "directory": base_directory,
             }
             prepared_format = prepare_reformat(option)
             file_directory = main_helper.reformat(prepared_format,
                                                   file_directory_format)
             prepared_format.directory = file_directory
             # Locate the existing file: by stored filename first, then by
             # the row id.
             old_filepaths = [
                 x for x in all_files
                 if media.filename in os.path.basename(x)
             ]
             if not old_filepaths:
                 media_id_text = str(media.id)
                 old_filepaths = [
                     x for x in all_files
                     if media_id_text in os.path.basename(x)
                 ]
             old_filepath = old_filepaths[0] if old_filepaths else ""
             new_filepath = main_helper.reformat(prepared_format,
                                                 filename_format)
             if (old_filepath and old_filepath != new_filepath
                     and os.path.exists(old_filepath)):
                 # Clear a stale destination only once the source is known
                 # to exist; deleting it up front could destroy the only
                 # remaining copy of the file.
                 if os.path.exists(new_filepath):
                     os.remove(new_filepath)
                 if media.size:
                     media.downloaded = True
                 moved = None
                 # Retry while the move raises OSError, but stop once the
                 # source has disappeared; retrying then could never succeed.
                 while not moved and os.path.exists(old_filepath):
                     try:
                         # A missing destination directory would otherwise
                         # make every attempt fail.
                         os.makedirs(os.path.dirname(new_filepath),
                                     exist_ok=True)
                         moved = shutil.move(old_filepath, new_filepath)
                     except OSError:
                         print(traceback.format_exc())
             media.directory = file_directory
             media.filename = os.path.basename(new_filepath)
             database_session.commit()
             new_directories.append(os.path.dirname(new_filepath))
     finally:
         database_session.close()