示例#1
0
class YoutubeFlagsSchema(SerializableSchema):
    """Schema declaring the flags of the Youtube offliner.

    Every attribute is one flag. ``data_key`` gives the external flag name
    when it differs from the attribute name, and ``metadata`` carries the
    ``label``/``description`` (plus optional ``placeholder``/``secret``)
    entries attached to the flag.

    Boolean flags are declared with ``truthy=[True]``/``falsy=[False]`` so
    that only real booleans are accepted, not look-alike strings or ints.
    """

    class Meta:
        # Preserve the declaration order of the fields below.
        ordered = True

    # --- what to scrape -------------------------------------------------
    kind = StringEnum(
        metadata={
            "label": "Type",
            "description": "Type of collection. Only `playlist` accepts multiple IDs.",
        },
        validate=validate.OneOf(["channel", "playlist", "user"]),
        data_key="type",
        required=True,
    )
    ident = fields.String(
        metadata={
            "label": "Youtube ID",
            "description": "Youtube ID of the collection. Separate multiple playlists with commas.",
        },
        data_key="id",
        required=True,
    )
    # Credential: flagged "secret" like the other credential-bearing fields
    # of this file (optimization cache URL, passwords).
    api_key = fields.String(
        metadata={
            "label": "API Key",
            "description": "Youtube API Token",
            "secret": True,
        },
        data_key="api-key",
        required=True,
    )

    name = fields.String(
        metadata={
            "label": "ZIM Name",
            "description": "Used as identifier and filename (date will be appended)",
            "placeholder": "mychannel_eng_all",
        },
        required=True,
    )

    # --- video processing -----------------------------------------------
    video_format = StringEnum(
        metadata={
            "label": "Video format",
            "description": "Format to download/transcode video to. webm is smaller",
        },
        validate=validate.OneOf(["webm", "mp4"]),
        data_key="format",
    )
    low_quality = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Low Quality",
            "description": "Re-encode video using stronger compression",
        },
        data_key="low-quality",
    )
    concurrency = fields.Integer(
        metadata={
            "label": "Concurrency",
            "description": "Number of concurrent threads to use",
        },
    )

    dateafter = fields.String(
        metadata={
            "label": "Only after date",
            "description": "Custom filter to download videos uploaded on or after specified date. Format: YYYYMMDD or (now|today)[+-][0-9](day|week|month|year)(s)?",
        }
    )

    # --- optimization cache ---------------------------------------------
    optimization_cache = fields.Url(
        metadata={
            "label": "Optimization Cache URL",
            "description": "S3 Storage URL including credentials and bucket",
            "secret": True,
        },
        data_key="optimization-cache",
    )

    # Strict boolean, consistent with every other Boolean flag in this schema.
    use_any_optimized_version = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Use any optimized version",
            "description": "Use the cached files if present, whatever the version",
        },
        data_key="use-any-optimized-version",
    )

    # --- content / presentation -----------------------------------------
    all_subtitles = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "All Subtitles",
            "description": "Include auto-generated subtitles",
        },
        data_key="all-subtitles",
    )
    pagination = fields.Integer(
        metadata={
            "label": "Pagination",
            "description": "Number of videos per page (40 otherwise)",
        },
    )
    autoplay = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Auto-play",
            "description": "Enable autoplay on video articles (home never have autoplay).",
        },
    )
    # Both load and dump fall back to "/output"; the value is additionally
    # checked by validate_output (defined outside this block).
    output = fields.String(
        metadata={
            "label": "Output folder",
            "placeholder": "/output",
            "description": "Output folder for ZIM file or build folder. Leave it as `/output`",
        },
        missing="/output",
        default="/output",
        validate=validate_output,
    )
    zim_file = fields.String(
        metadata={
            "label": "ZIM filename",
            "description": "ZIM file name (based on --name if not provided)",
        },
        data_key="zim-file",
    )

    # --- ZIM metadata ---------------------------------------------------
    language = fields.String(
        metadata={
            "label": "Language",
            "description": "ISO-639-3 (3 chars) language code of content",
        }
    )
    locale = fields.String(
        metadata={
            "label": "Locale",
            "description": "Locale name to use for translations (if avail) and time representations. Defaults to --language or English.",
        }
    )
    title = fields.String(
        metadata={
            "label": "Title",
            "description": "Custom title for your project and ZIM. Default to Channel name (of first video if playlists)",
        }
    )
    description = fields.String(metadata={"label": "Description", "description": ""})
    creator = fields.String(
        metadata={
            "label": "Content Creator",
            "description": "Name of content creator. Defaults to Channel name or “Youtube Channels”",
        }
    )
    tags = fields.String(
        metadata={
            "label": "ZIM Tags",
            "description": "List of Tags for the ZIM file. _videos:yes added automatically",
        }
    )

    # --- branding -------------------------------------------------------
    profile = fields.Url(
        metadata={
            "label": "Profile Image",
            "description": "Custom profile image. Squared. Will be resized to 100x100px",
        }
    )
    banner = fields.Url(
        metadata={
            "label": "Banner Image",
            "description": "Custom banner image. Will be resized to 1060x175px",
        }
    )
    main_color = HexColor(
        metadata={
            "label": "Main Color",
            "description": "Custom color. Hex/HTML syntax (#DEDEDE). Default to main color of profile image.",
        },
        data_key="main-color",
    )
    secondary_color = HexColor(
        metadata={
            "label": "Secondary Color",
            "description": "Custom secondary color. Hex/HTML syntax (#DEDEDE). Default to secondary color of profile image.",
        },
        data_key="secondary-color",
    )

    debug = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={"label": "Debug", "description": "Enable verbose output"},
    )
示例#2
0
class OpenedxFlagsSchema(SerializableSchema):
    """Schema declaring the flags of the openedx offliner.

    One attribute per flag, in the order they are exposed. For each field
    the wiring (``data_key``, ``required``, defaults, validators) is listed
    first and the presentation ``metadata`` (label, description, optional
    placeholder / secret marker) last.
    """

    class Meta:
        # Preserve the declaration order of the fields below.
        ordered = True

    # --- course and account ---------------------------------------------
    course_url = fields.Url(
        data_key="course-url",
        required=True,
        metadata={
            "label": "Course URL",
            "description": "URL of the course you wnat to scrape",
        },
    )

    email = fields.String(
        data_key="email",
        required=True,
        metadata={
            "label": "Registered e-mail",
            "description": "The registered e-mail ID on the openedx instance",
        },
    )

    password = fields.String(
        data_key="password",
        required=True,
        metadata={
            "label": "Password",
            "description": "Password to the account registered on the openedx instance",
            "secret": True,
        },
    )

    # --- instance-specific paths ----------------------------------------
    instance_login_page = fields.String(
        data_key="instance-login-page",
        metadata={
            "label": "Login page path",
            "description": "The login path in the instance. Must start with /",
            "placeholder": "/login_ajax",
        },
    )

    instance_course_page = fields.String(
        data_key="instance-course-page",
        metadata={
            "label": "Course page path",
            "description": "The path to the course page after the course ID. Must start with /",
            "placeholder": "/course",
        },
    )

    instance_course_prefix = fields.String(
        data_key="instance-course-prefix",
        metadata={
            "label": "Course prefix path",
            "description": "The prefix in the path before the course ID. Must start and end with /",
            "placeholder": "/courses/",
        },
    )

    favicon_url = fields.Url(
        data_key="favicon-url",
        metadata={
            "label": "Favicon URL",
            "description": "URL pointing to a favicon image. Recommended size >= (48px x 48px)",
            "placeholder": "https://github.com/edx/edx-platform/raw/master/lms/static/images/favicon.ico",
        },
    )

    # --- content selection (strict booleans: only True / False accepted) --
    ignore_missing_xblocks = fields.Boolean(
        data_key="ignore-missing-xblocks",
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Ignore unsupported xblocks",
            "description": "Ignore unsupported content (xblock(s))",
        },
    )

    add_wiki = fields.Boolean(
        data_key="add-wiki",
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Include wiki",
            "description": "Add wiki (if available) to the ZIM",
        },
    )

    add_forum = fields.Boolean(
        data_key="add-forum",
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Include forum",
            "description": "Add forum/discussion (if available) to the ZIM",
        },
    )

    remove_seq_nav = fields.Boolean(
        data_key="remove-seq-nav",
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "No top sequential navigation",
            "description": "Remove the top sequential navigation bar in the ZIM",
        },
    )

    # --- video handling -------------------------------------------------
    video_format = StringEnum(
        data_key="format",
        validate=validate.OneOf(["webm", "mp4"]),
        metadata={
            "label": "Video format",
            "description": "Format to download/transcode video to. webm is smaller",
        },
    )

    low_quality = fields.Boolean(
        data_key="low-quality",
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Low Quality",
            "description": "Re-encode video using stronger compression",
        },
    )

    autoplay = fields.Boolean(
        data_key="autoplay",
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Autoplay videos",
            "description": "Enable autoplay on videos. Behavior differs on platforms/browsers",
        },
    )

    # --- ZIM metadata ---------------------------------------------------
    name = fields.String(
        data_key="name",
        required=True,
        metadata={
            "label": "Name",
            "description": "ZIM name. Used as identifier and filename (date will be appended)",
            "placeholder": "topic_eng",
        },
    )

    title = fields.String(
        data_key="title",
        metadata={
            "label": "Title",
            "description": "Custom title for your ZIM. Based on MOOC otherwise",
        },
    )

    description = fields.String(
        data_key="description",
        metadata={
            "label": "Description",
            "description": "Custom description for your ZIM. Based on MOOC otherwise",
        },
    )

    creator = fields.String(
        data_key="creator",
        metadata={
            "label": "Content Creator",
            "description": "Name of content creator. Defaults to edX",
        },
    )

    tags = fields.String(
        data_key="tags",
        metadata={
            "label": "ZIM Tags",
            "description": "List of comma-separated Tags for the ZIM file. category:other, and openedx added automatically",
        },
    )

    # --- optimization cache ---------------------------------------------
    optimization_cache = fields.Url(
        data_key="optimization-cache",
        metadata={
            "label": "Optimization Cache URL",
            "description": "URL with credentials and bucket name to S3 Optimization Cache",
            "secret": True,
        },
    )

    use_any_optimized_version = fields.Boolean(
        data_key="use-any-optimized-version",
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Use any optimized version",
            "description": "Use the cached files if present, whatever the version",
        },
    )

    # --- output locations -----------------------------------------------
    # Both folders fall back to "/output" on load and dump, and are checked
    # by validate_output (defined outside this block).
    output = fields.String(
        data_key="output",
        missing="/output",
        default="/output",
        validate=validate_output,
        metadata={
            "label": "Output folder",
            "placeholder": "/output",
            "description": "Output folder for ZIM file(s). Leave it as `/output`",
        },
    )

    tmp_dir = fields.String(
        data_key="tmp-dir",
        missing="/output",
        default="/output",
        validate=validate_output,
        metadata={
            "label": "Temp folder",
            "description": "Where to create temporay build folder. Leave it as `/output`",
        },
    )

    zim_file = fields.String(
        data_key="zim-file",
        metadata={
            "label": "ZIM filename",
            "description": "ZIM file name (based on ZIM name if not provided)",
        },
    )

    # --- runtime behaviour ----------------------------------------------
    debug = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={"label": "Debug", "description": "Enable verbose output"},
    )

    # At least one thread is required.
    threads = fields.Integer(
        validate=validate.Range(min=1),
        metadata={
            "label": "Threads",
            "description": "Number of parallel threads to use while downloading",
        },
    )

    locale = fields.String(
        metadata={
            "label": "Locale",
            "description": "The locale to use for the translations in ZIM",
        }
    )
示例#3
0
class TedFlagsSchema(SerializableSchema):
    """Schema declaring the flags of the TED offliner.

    Two modes are supported, selected by ``indiv_zims``: in individual-ZIM
    mode ``name_format`` must be supplied, otherwise ``name`` must be. That
    cross-field rule is enforced by the schema-level hook at the end of the
    class, which is why neither field is declared ``required``.
    """

    class Meta:
        # Preserve the declaration order of the fields below.
        ordered = True

    indiv_zims = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Individual ZIM mode",
            "description": "Whether to produce one ZIM per topic/playlist",
        },
        data_key="indiv-zims",
    )

    # --- content selection ----------------------------------------------
    topics = fields.String(
        metadata={
            "label": "Topics",
            "description": "Comma-seperated list of topics to scrape; as given on ted.com/talks. Pass all for all topics",
        },
    )

    playlists = fields.String(
        metadata={
            "label": "TED Playlists",
            "description": "Comma-seperated list of TED playlist IDs to scrape. Pass all for all playlists",
        },
    )

    languages = fields.String(
        metadata={
            "label": "Languages",
            "description": "Comma-seperated list of languages to filter videos",
        },
    )

    subtitles_enough = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Subtitles enough?",
            "description": "Whether to include videos that have a subtitle in requested language(s) if audio is in another language",
        },
    )

    subtitles = fields.String(
        metadata={
            "label": "Subtitles Setting",
            "description": "Language setting for subtitles. all: include all available subtitles, matching (default): only subtitles matching language(s), none: include no subtitle. Also accepts comma-seperated list of language(s)",
        },
    )

    # --- video handling -------------------------------------------------
    video_format = StringEnum(
        metadata={
            "label": "Video format",
            "description": "Format to download/transcode video to. webm is smaller",
        },
        validate=validate.OneOf(["webm", "mp4"]),
        data_key="format",
    )

    low_quality = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Low Quality",
            "description": "Re-encode video using stronger compression",
        },
        data_key="low-quality",
    )

    autoplay = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Auto-play",
            "description": "Enable autoplay on video articles. Behavior differs on platforms/browsers.",
        },
    )

    # --- ZIM metadata (plain value, then per-ZIM format variant) --------
    name = fields.String(
        metadata={
            "label": "Name",
            "description": "ZIM name. Used as identifier and filename (date will be appended)",
            "placeholder": "topic_eng",
        },
    )

    name_format = fields.String(
        metadata={
            "label": "Name Format",
            "description": "Format for building individual --name argument. Use variable {identity} for playlist id or topic name",
            "placeholder": "{identity}_eng",
        },
        data_key="name-format",
    )

    title = fields.String(
        metadata={
            "label": "Title",
            "description": "Custom title for your ZIM. Based on selection otherwise",
        }
    )

    title_format = fields.String(
        metadata={
            "label": "Title Format",
            "description": "Custom title format for individual ZIMs",
        },
        data_key="title-format",
    )

    description = fields.String(
        metadata={
            "label": "Description",
            "description": "Custom description for your ZIM. Based on selection otherwise",
        }
    )

    description_format = fields.String(
        metadata={
            "label": "Description Format",
            "description": "Custom description format for individual ZIMs",
        },
        data_key="description-format",
    )

    creator = fields.String(
        metadata={
            "label": "Content Creator",
            "description": "Name of content creator. Defaults to TED",
        }
    )

    tags = fields.String(
        metadata={
            "label": "ZIM Tags",
            "description": "List of comma-separated Tags for the ZIM file. category:ted, ted, and _videos:yes added automatically",
        }
    )

    # --- optimization cache ---------------------------------------------
    optimization_cache = fields.Url(
        metadata={
            "label": "Optimization Cache URL",
            "description": "URL with credentials and bucket name to S3 Optimization Cache",
            "secret": True,
        },
        data_key="optimization-cache",
    )

    use_any_optimized_version = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Use any optimized version",
            "description": "Use the cached files if present, whatever the version",
        },
        data_key="use-any-optimized-version",
    )

    # --- output locations -----------------------------------------------
    # Both folders fall back to "/output" on load and dump, and are checked
    # by validate_output (defined outside this block).
    output = fields.String(
        metadata={
            "label": "Output folder",
            "placeholder": "/output",
            "description": "Output folder for ZIM file(s). Leave it as `/output`",
        },
        missing="/output",
        default="/output",
        validate=validate_output,
    )

    tmp_dir = fields.String(
        metadata={
            "label": "Temp folder",
            "description": "Where to create temporay build folder. Leave it as `/output`",
        },
        missing="/output",
        default="/output",
        validate=validate_output,
        data_key="tmp-dir",
    )

    metadata_from = fields.String(
        metadata={
            "label": "Metadata JSON",
            "description": "File path or URL to a JSON file holding custom metadata for individual playlists/topics",
        },
        data_key="metadata-from",
    )

    zim_file = fields.String(
        metadata={
            "label": "ZIM filename",
            "description": "ZIM file name (based on ZIM name if not provided)",
        },
        data_key="zim-file",
    )

    zim_file_format = fields.String(
        metadata={
            "label": "ZIM filename format",
            "description": "Format for building individual --zim-file argument for individual ZIMs. Uses --name-format otherwise",
        },
        data_key="zim-file-format",
    )

    debug = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={"label": "Debug", "description": "Enable verbose output"},
    )

    # NOTE(review): if SerializableSchema derives from marshmallow.Schema,
    # this hook's name replaces the inherited ``Schema.validate()`` method —
    # confirm no caller relies on that method for this schema.
    @validates_schema
    def validate(self, data, **kwargs):
        """Enforce the naming flag matching the selected mode.

        Args:
            data: the field values, keyed by attribute name.
            **kwargs: extra arguments supplied by the caller; unused.

        Raises:
            ValidationError: when ``indiv_zims`` is truthy and
                ``name_format`` is missing or empty, or when ``indiv_zims``
                is falsy and ``name`` is missing or empty.
        """
        if data.get("indiv_zims"):
            # individual ZIMs mode: each ZIM name is built from the format
            if not data.get("name_format"):
                raise ValidationError("name-format required in individual ZIMs mode")
            return

        # normal mode: a single ZIM, so a plain name is needed
        if not data.get("name"):
            raise ValidationError("name required in normal mode")
示例#4
0
class ZimitFlagsSchema(SerializableSchema):
    """Schema declaring the flags of the zimit offliner.

    One attribute per flag. ``data_key`` gives the external flag name where
    it differs from the attribute name (several use camelCase), and
    ``metadata`` carries the label/description/placeholder attached to the
    flag.
    """

    class Meta:
        # Preserve the declaration order of the fields below.
        ordered = True

    # --- mandatory flags ------------------------------------------------
    url = fields.Url(
        required=True,
        metadata={
            "label": "URL",
            "description": "The URL to start crawling from and main page for ZIM",
        },
    )

    name = fields.String(
        required=True,
        metadata={
            "label": "Name",
            "description": "Name of the ZIM. "
            "Used to compose filename if not otherwise defined",
        },
    )

    # --- ZIM metadata ---------------------------------------------------
    lang = fields.String(
        metadata={
            "label": "Language",
            "description": "ISO-639-3 (3 chars) language code of content. "
            "Default to `eng`",
        }
    )

    title = fields.String(
        metadata={
            "label": "Title",
            "description": "Custom title for ZIM. Default to title of main page",
        }
    )
    description = fields.String(
        metadata={"label": "Description", "description": "Description for ZIM"}
    )

    favicon = fields.Url(
        required=False,
        metadata={
            "label": "Favicon",
            "description": "URL for Favicon. "
            "If unspecified, will attempt to use the one used from main page.",
        },
    )

    zim_file = fields.String(
        data_key="zim-file",
        metadata={
            "label": "ZIM filename",
            "description": "ZIM file name (based on --name if not provided)",
        },
    )

    tags = fields.String(
        metadata={
            "label": "ZIM Tags",
            "description": "List of Tags for the ZIM file.",
        }
    )

    creator = fields.String(
        metadata={
            "label": "Content Creator",
            "description": "Name of content creator.",
        }
    )

    source = fields.String(
        metadata={
            "label": "Content Source",
            "description": "Source name/URL of content",
        }
    )

    # --- crawl tuning ---------------------------------------------------
    workers = fields.Integer(
        required=False,
        metadata={
            "label": "Workers",
            "description": "The number of workers to run in parallel. Default to 1",
        },
    )

    include_domains = fields.String(
        data_key="include-domains",
        required=False,
        metadata={
            "label": "Include domains",
            "description": "Limit to URLs from only certain domains. "
            "If not set, all URLs are included.",
        },
    )

    exclude = fields.String(
        required=False,
        metadata={
            "label": "Exclude",
            "description": "Regex of URLs that should be excluded from the crawl.",
        },
    )

    wait_until = fields.String(
        data_key="waitUntil",
        required=False,
        metadata={
            "label": "WaitUntil",
            "description": "Puppeteer page.goto() condition to wait for "
            "before continuing. Default to `load`",
        },
    )

    limit = fields.Integer(
        metadata={
            "label": "Limit",
            "description": "Limit crawl to this number of pages. 0 means no-limit.",
        },
    )

    timeout = fields.Integer(
        required=False,
        metadata={
            "label": "Timeout",
            "description": "Timeout for each page to load (in millis). "
            "Default to 30000",
        },
    )

    scope = fields.String(
        required=False,
        metadata={
            "label": "Scope",
            "description": "The scope of current page that should be included in the "
            "crawl (defaults to the domain of URL)",
        },
    )

    scroll = fields.Boolean(
        required=False,
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Scroll",
            "description": "If set, will autoscroll pages to bottom.",
        },
    )

    new_context = StringEnum(
        data_key="newContext",
        required=False,
        validate=validate.OneOf(["page", "session", "browser"]),
        metadata={
            "label": "New Context",
            "description": "The context for each new capture. Defaults to page",
        },
    )

    custom_css = fields.Url(
        data_key="custom-css",
        required=False,
        metadata={
            "label": "Custom CSS",
            "description": "URL to a CSS file to inject into pages",
        },
    )

    verbose = fields.Boolean(
        required=False,
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Verbose mode",
            "description": "Whether to display additional logs",
        },
    )

    # --- output locations -----------------------------------------------
    # Falls back to "/output" on load and dump; checked by validate_output
    # (defined outside this block).
    output = fields.String(
        missing="/output",
        default="/output",
        validate=validate_output,
        metadata={
            "label": "Output folder",
            "placeholder": "/output",
            "description": "Output folder for ZIM file(s). Leave it as `/output`",
        },
    )

    # Pinned: the only accepted value is the default path itself.
    stats_filename = fields.String(
        data_key="statsFilename",
        missing="/output/task_progress.json",
        default="/output/task_progress.json",
        validate=validate.Equal("/output/task_progress.json"),
        metadata={
            "label": "Stats filename",
            "placeholder": "/output/task_progress.json",
            "description": "Scraping progress file. "
            "Leave it as `/output/task_progress.json`",
        },
    )

    # --- advanced crawler options ---------------------------------------
    replay_viewer_source = fields.Url(
        data_key="replay-viewer-source",
        required=False,
        metadata={
            "label": "Replay Viewer Source",
            "description": "URL from which to load the ReplayWeb.page "
            "replay viewer from",
        },
    )

    use_sitemap = fields.Url(
        data_key="useSitemap",
        required=False,
        metadata={
            "label": "Use sitemap",
            "description": "Use as sitemap to get additional URLs for the crawl "
            "(usually at /sitemap.xml)",
        },
    )

    # Accepted values are decided by validate_devicelist (defined outside
    # this block).
    mobile_device = StringEnum(
        data_key="mobileDevice",
        required=False,
        validate=validate_devicelist,
        metadata={
            "label": "As device",
            "description": "Device to crawl as. Defaults to `Iphone X`. "
            "See Pupeeter's DeviceDescriptors.",
        },
    )

    admin_email = fields.String(
        data_key="adminEmail",
        required=False,
        metadata={
            "label": "Admin Email",
            "description": "Admin Email for crawler: used in UserAgent "
            "so website admin can contact us",
        },
    )
示例#5
0
class MWOfflinerFlagsSchema(SerializableSchema):
    """Schema declaring the flags of the mwoffliner offliner.

    Attribute names are camelCase and, except for ``formats`` (exposed as
    ``format``), are used directly as the external flag names. ``metadata``
    carries the label/description attached to each flag; credential-bearing
    fields are additionally flagged ``"secret": True``.
    """

    class Meta:
        # Preserve the declaration order of the fields below.
        ordered = True

    # --- mandatory flags ------------------------------------------------
    mwUrl = fields.Url(
        required=True,
        metadata={
            "label": "Wiki URL",
            "description": "The URL of the mediawiki to scrape",
        },
    )
    adminEmail = fields.Email(
        required=True,
        metadata={
            "label": "Admin Email",
            "description": "Email of the mwoffliner user which will be put in the HTTP user-agent string",
        },
    )

    # --- content selection and ZIM metadata -----------------------------
    articleList = fields.Url(
        metadata={
            "label": "Article List",
            "description": "URL to an UTF-8 tsv file containing article names to include (one per line)",
        }
    )
    customMainPage = fields.String(
        metadata={
            "label": "Main Page",
            "description": "Article Name to use as home page. Automatically built or guessed otherwise.",
        }
    )
    customZimTitle = fields.String(
        metadata={
            "label": "ZIM Title",
            "description": "Custom ZIM title. Wiki name otherwise.",
        }
    )
    customZimDescription = fields.String(metadata={"label": "ZIM Description"})
    customZimFavicon = fields.Url(
        metadata={
            "label": "ZIM favicon",
            "description": "URL to a png to use as favicon. Will be resized to 48x48px.",
        }
    )
    customZimTags = fields.String(
        metadata={
            "label": "ZIM Tags",
            "description": "Semi-colon separated list of ZIM tags",
        }
    )
    publisher = fields.String(
        metadata={
            "label": "Publisher",
            "description": "ZIM publisher metadata. `Kiwix` otherwise.",
        }
    )
    filenamePrefix = fields.String(
        metadata={
            "label": "Filename prefix",
            "description": "Custom filename up to the formats and date parts.",
        }
    )
    # Each entry is `<flavour>:<custom-suffix>`; the empty string selects the
    # full flavour without suffix.
    formats = ListOfStringEnum(
        fields.String(
            validate=validate.OneOf(
                [
                    "nodet,nopic:mini",
                    "nodet:mini",
                    "nopic:nopic",
                    "novid:maxi",
                    "",
                    "nodet",
                    "nopic",
                    "novid",
                    "nodet,nopic",
                ]
            )
        ),
        data_key="format",
        metadata={
            "label": "Flavours",
            "description": "Which flavours to build, as `<flavour>:<custom-suffix>`. Empty option is full without suffix.",
        },
    )
    customFlavour = StringEnum(
        metadata={
            "label": "Custom Flavour",
            "description": "Custom processor to filter and process articles (see extensions/*.js)",
        },
        validate=validate.OneOf(
            ["/tmp/mwoffliner/extensions/wiktionary_fr.js"]  # nosec
        ),
    )

    optimisationCacheUrl = fields.Url(
        metadata={
            "label": "Optimisation Cache URL",
            "description": "S3 Storage URL including credentials and bucket",
            "secret": True,
        }
    )

    # --- processing switches (strict booleans: only True / False) -------
    zstd = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Use Zstandard compression",
            "description": "Use Zstandard as ZIM compression (Lzma otherwise)",
        },
    )

    addNamespaces = fields.String(
        metadata={
            "label": "Add Namespaces",
            "description": "Include additional namespaces (comma separated numbers)",
        }
    )
    getCategories = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Add categories",
            "description": "[WIP] Download category pages",
        },
    )
    keepEmptyParagraphs = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Keep empty paragraphs",
            "description": "Keep all paragraphs, even empty ones.",
        },
    )
    minifyHtml = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Minify HTML",
            "description": "Try to reduce the size of the HTML",
        },
    )

    # --- mediawiki endpoints and credentials ----------------------------
    mwWikiPath = fields.String(
        metadata={
            "label": "Wiki Path",
            "description": "Mediawiki wiki base path. Otherwise `/wiki/`.",
        }
    )
    mwApiPath = fields.String(
        metadata={
            "label": "API Path",
            "description": "Mediawiki API path. Otherwise `/w/api.php`.",
        }
    )
    mwModulePath = fields.String(
        metadata={
            "label": "Module Path",
            "description": "Mediawiki module load path. Otherwise `/w/load.php`.",
        }
    )
    mwDomain = fields.String(
        metadata={
            "label": "User Domain",
            "description": "Mediawiki user domain (for private wikis)",
        }
    )
    mwUsername = fields.String(
        metadata={
            "label": "Username",
            "description": "Mediawiki username (for private wikis)",
        }
    )
    # Credential: flagged "secret" like optimisationCacheUrl above.
    mwPassword = fields.String(
        metadata={
            "label": "Password",
            "description": "Mediawiki user password (for private wikis)",
            "secret": True,
        }
    )

    # --- runtime environment --------------------------------------------
    osTmpDir = fields.String(
        metadata={
            "label": "OS Temp Dir",
            "description": "Override default operating system temporary directory path environment variable",
        }
    )
    # Falls back to "/output" on load and dump; checked by validate_output
    # (defined outside this block).
    outputDirectory = fields.String(
        metadata={
            "label": "Output folder",
            "placeholder": "/output",
            "description": "Output folder for ZIM file or build folder. Leave it as `/output`",
        },
        missing="/output",
        default="/output",
        validate=validate_output,
    )
    noLocalParserFallback = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Don't fallback to local Parser",
            "description": "Don't fall back to a local MCS or Parsoid, only use remote APIs",
        },
    )
    # At least one second.
    requestTimeout = fields.Integer(
        metadata={
            "label": "Request Timeout",
            "description": "Request timeout (in seconds)",
        },
        validate=validate.Range(min=1),
    )
    speed = fields.Float(
        metadata={
            "label": "Speed",
            "description": "Multiplicator for the number of parallel HTTP requests on Parsoid backend. Otherwise `1`. Reduce on throttled Wikis.",
        }
    )
    withoutZimFullTextIndex = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Without Full Text Index",
            "description": "Don't include a fulltext search index to the ZIM",
        },
    )
    verbose = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Verbose",
            "description": "Print debug information to the stdout",
        },
    )

    webp = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Webp",
            "description": "Convert images to Webp",
        },
    )
示例#6
0
class OpenedxFlagsSchema(SerializableSchema):
    """Schema of the flags describing an openedx course scrape.

    Each field carries UI metadata (`label`, `description`, optional
    `placeholder` / `secret`) and, where the external flag name differs from
    the attribute name, a dashed `data_key`. `course_url`, `email`,
    `password` and `name` are required; every other flag is optional.
    """

    class Meta:
        # Keep fields in declaration order.
        ordered = True

    # --- Course location and credentials (all required) ---
    course_url = fields.Url(
        metadata={
            "label": "Course URL",
            "description": "URL of the course you want to scrape",
        },
        data_key="course-url",
        required=True,
    )

    email = fields.String(
        metadata={
            "label": "Registered e-mail",
            "description": "The registered e-mail ID on the openedx instance",
        },
        data_key="email",
        required=True,
    )

    password = fields.String(
        metadata={
            "label": "Password",
            "description":
            "Password to the account registered on the openedx instance",
            "secret": True,
        },
        data_key="password",
        required=True,
    )

    # --- Content selection ---
    # Boolean flags list only `True` / `False` as truthy / falsy values.
    ignore_missing_xblocks = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Ignore unsupported xblocks",
            "description": "Ignore unsupported content (xblock(s))",
        },
        data_key="ignore-missing-xblocks",
    )

    add_wiki = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Include wiki",
            "description": "Add wiki (if available) to the ZIM",
        },
        data_key="add-wiki",
    )

    add_forum = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Include forum",
            "description": "Add forum/discussion (if available) to the ZIM",
        },
        data_key="add-forum",
    )

    # --- Video encoding ---
    video_format = StringEnum(
        metadata={
            "label":
            "Video format",
            "description":
            "Format to download/transcode video to. webm is smaller",
        },
        validate=validate.OneOf(["webm", "mp4"]),
        data_key="format",
    )

    low_quality = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Low Quality",
            "description": "Re-encode video using stronger compression",
        },
        data_key="low-quality",
    )

    # --- ZIM metadata ---
    name = fields.String(
        metadata={
            "label": "Name",
            "description":
            "ZIM name. Used as identifier and filename (date will be appended)",
            "placeholder": "topic_eng",
        },
        data_key="name",
        required=True,
    )

    title = fields.String(
        metadata={
            "label": "Title",
            "description":
            "Custom title for your ZIM. Based on MOOC otherwise",
        },
        data_key="title",
    )

    description = fields.String(
        metadata={
            "label":
            "Description",
            "description":
            "Custom description for your ZIM. Based on MOOC otherwise",
        },
        data_key="description",
    )

    creator = fields.String(
        metadata={
            "label": "Content Creator",
            "description": "Name of content creator. Defaults to edX",
        },
        data_key="creator",
    )

    tags = fields.String(
        metadata={
            "label":
            "ZIM Tags",
            "description":
            "List of comma-separated Tags for the ZIM file. category:openedx, and openedx added automatically",
        },
        data_key="tags",
    )

    # --- Optimization cache ---
    # Marked `secret` because the URL embeds credentials (see description).
    optimization_cache = fields.Url(
        metadata={
            "label": "Optimization Cache URL",
            "description":
            "URL with credentials and bucket name to S3 Optimization Cache",
            "secret": True,
        },
        data_key="optimization-cache",
    )

    use_any_optimized_version = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Use any optimized version",
            "description":
            "Use the cached files if present, whatever the version",
        },
        data_key="use-any-optimized-version",
    )

    # --- Output locations ---
    # Both folders fall back to "/output" through `missing`/`default` and are
    # checked by `validate_output`.
    output = fields.String(
        metadata={
            "label": "Output folder",
            "placeholder": "/output",
            "description":
            "Output folder for ZIM file(s). Leave it as `/output`",
        },
        missing="/output",
        default="/output",
        validate=validate_output,
        data_key="output",
    )

    tmp_dir = fields.String(
        metadata={
            "label":
            "Temp folder",
            "description":
            "Where to create temporary build folder. Leave it as `/output`",
        },
        missing="/output",
        default="/output",
        validate=validate_output,
        data_key="tmp-dir",
    )

    zim_file = fields.String(
        metadata={
            "label": "ZIM filename",
            "description": "ZIM file name (based on ZIM name if not provided)",
        },
        data_key="zim-file",
    )

    # --- Diagnostics ---
    debug = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Debug",
            "description": "Enable verbose output"
        },
    )
示例#7
0
class YoutubeFlagsSchema(SerializableSchema):
    """Schema of the flags describing a Youtube collection scrape.

    Each field carries UI metadata (`label`, `description`, optional
    `placeholder` / `secret`) and, where the external flag name differs from
    the attribute name, a dashed `data_key`. `kind`, `ident` and `api_key`
    are always required. Whether `name` or `playlists_name` is required
    depends on `indiv_playlists` and is enforced by the schema-level
    validator at the bottom of the class.
    """

    class Meta:
        # Keep fields in declaration order.
        ordered = True

    # Switches between "normal" mode and one-ZIM-per-playlist mode; drives
    # the schema-level validation below.
    indiv_playlists = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Playlists mode",
            "description": "Build one ZIM per playlist of the channel or user",
        },
        data_key="indiv-playlists",
    )

    # --- Collection identification (all required) ---
    kind = StringEnum(
        metadata={
            "label": "Type",
            "description": "Type of collection. Only `playlist` accepts multiple IDs.",
        },
        validate=validate.OneOf(["channel", "playlist", "user"]),
        data_key="type",
        required=True,
    )
    ident = fields.String(
        metadata={
            "label": "Youtube ID",
            "description": "Youtube ID of the collection. "
            "Separate multiple playlists with commas.",
        },
        data_key="id",
        required=True,
    )
    api_key = fields.String(
        metadata={"label": "API Key", "description": "Youtube API Token"},
        data_key="api-key",
        required=True,
    )

    # --- Naming ---
    # Neither field is `required` on its own: which one must be present is
    # decided by the schema-level validator.
    name = fields.String(
        metadata={
            "label": "ZIM Name",
            "description": "Used as identifier and filename (date will be appended)",
            "placeholder": "mychannel_eng_all",
        },
    )
    playlists_name = fields.String(
        metadata={
            "label": "Playlists name",
            "description": "Format for building individual --name argument. "
            "Required in playlist mode. Variables: {title}, {description}, "
            "{playlist_id}, {slug} (from title), {creator_id}, {creator_name}",
        },
        data_key="playlists-name",
    )

    # --- Video encoding and download behaviour ---
    video_format = StringEnum(
        metadata={
            "label": "Video format",
            "description": "Format to download/transcode video to. webm is smaller",
        },
        validate=validate.OneOf(["webm", "mp4"]),
        data_key="format",
    )
    low_quality = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Low Quality",
            "description": "Re-encode video using stronger compression",
        },
        data_key="low-quality",
    )
    concurrency = fields.Integer(
        metadata={
            "label": "Concurrency",
            "description": "Number of concurrent threads to use",
        },
    )

    dateafter = fields.String(
        metadata={
            "label": "Only after date",
            "description": "Custom filter to download videos uploaded on "
            "or after specified date. Format: YYYYMMDD or "
            "(now|today)[+-][0-9](day|week|month|year)(s)?",
        }
    )

    # --- Optimization cache ---
    # Marked `secret` because the URL embeds credentials (see description).
    optimization_cache = fields.Url(
        metadata={
            "label": "Optimization Cache URL",
            "description": "S3 Storage URL including credentials and bucket",
            "secret": True,
        },
        data_key="optimization-cache",
    )

    use_any_optimized_version = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Use any optimized version",
            "description": "Use the cached files if present, whatever the version",
        },
        data_key="use-any-optimized-version",
    )

    # --- Content presentation ---
    all_subtitles = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "All Subtitles",
            "description": "Include auto-generated subtitles",
        },
        data_key="all-subtitles",
    )
    pagination = fields.Integer(
        metadata={
            "label": "Pagination",
            "description": "Number of videos per page (40 otherwise)",
        },
    )
    autoplay = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={
            "label": "Auto-play",
            "description": "Enable autoplay on video articles "
            "(home never have autoplay).",
        },
    )

    # --- Output locations ---
    # Both folders fall back to "/output" through `missing`/`default` and are
    # checked by `validate_output`.
    output = fields.String(
        metadata={
            "label": "Output folder",
            "placeholder": "/output",
            "description": "Output folder for ZIM file(s). Leave it as `/output`",
        },
        missing="/output",
        default="/output",
        validate=validate_output,
    )
    tmp_dir = fields.String(
        metadata={
            "label": "Temp folder",
            "placeholder": "/output",
            "description": "Where to create temporary build folder. "
            "Leave it as `/output`",
        },
        missing="/output",
        default="/output",
        validate=validate_output,
        data_key="tmp-dir",
    )

    zim_file = fields.String(
        metadata={
            "label": "ZIM filename",
            "description": "ZIM file name (based on --name if not provided). "
            "Include {period} to insert date period dynamically",
        },
        data_key="zim-file",
    )
    playlists_zim_file = fields.String(
        metadata={
            "label": "Playlists ZIM filename",
            "description": "Format for building individual --zim-file argument. "
            "Uses --playlists-name otherwise",
        },
        data_key="playlists-zim-file",
    )

    # --- ZIM metadata ---
    language = fields.String(
        metadata={
            "label": "Language",
            "description": "ISO-639-3 (3 chars) language code of content",
        }
    )
    locale = fields.String(
        metadata={
            "label": "Locale",
            "description": "Locale name to use for translations (if avail) "
            "and time representations. Defaults to --language or English.",
        }
    )

    title = fields.String(
        metadata={
            "label": "Title",
            "description": "Custom title for your project and ZIM. Default to "
            "Channel name (of first video if playlists)",
        }
    )
    playlists_title = fields.String(
        metadata={
            "label": "Playlists title",
            "description": "Custom title format for individual playlist ZIM",
        },
        data_key="playlists-title",
    )

    description = fields.String(
        metadata={"label": "Description", "description": "Description for ZIM"}
    )
    playlists_description = fields.String(
        metadata={
            "label": "Playlists description",
            "description": "Custom description format for individual playlist ZIM",
        },
        data_key="playlists-description",
    )

    creator = fields.String(
        metadata={
            "label": "Content Creator",
            "description": "Name of content creator. Defaults to Channel name "
            "or “Youtube Channels”",
        }
    )
    tags = fields.String(
        metadata={
            "label": "ZIM Tags",
            "description": "List of Tags for the ZIM file. "
            "_videos:yes added automatically",
        }
    )

    metadata_from = fields.String(
        metadata={
            "label": "Metadata JSON",
            "description": "File path or URL to a JSON file holding custom metadata "
            "for individual playlists",
        },
        data_key="metadata-from",
    )

    # --- Branding ---
    profile = fields.Url(
        metadata={
            "label": "Profile Image",
            "description": "Custom profile image. Squared. "
            "Will be resized to 100x100px",
        }
    )
    banner = fields.Url(
        metadata={
            "label": "Banner Image",
            "description": "Custom banner image. Will be resized to 1060x175px",
        }
    )
    main_color = HexColor(
        metadata={
            "label": "Main Color",
            "description": "Custom color. Hex/HTML syntax (#DEDEDE). "
            "Default to main color of profile image.",
        },
        data_key="main-color",
    )
    secondary_color = HexColor(
        metadata={
            "label": "Secondary Color",
            "description": "Custom secondary color. Hex/HTML syntax (#DEDEDE). "
            "Default to secondary color of profile image.",
        },
        data_key="secondary-color",
    )

    # --- Diagnostics ---
    debug = fields.Boolean(
        truthy=[True],
        falsy=[False],
        metadata={"label": "Debug", "description": "Enable verbose output"},
    )

    # NOTE(review): marshmallow's `Schema` also exposes a public `validate()`
    # method; if SerializableSchema derives from it, this hook shadows that
    # method. The name is kept for compatibility — confirm no caller relies on
    # `Schema.validate` for this schema.
    @validates_schema
    def validate(self, data, **kwargs):
        """Require the naming flag that matches the selected mode.

        When `indiv_playlists` is truthy, `playlists_name` must be present and
        non-empty; otherwise `name` must be present and non-empty. Keys are
        looked up by attribute name (`playlists_name`), not by `data_key`.

        Raises:
            ValidationError: if the flag required by the mode is missing or
                empty.
        """
        if data.get("indiv_playlists"):
            if not data.get("playlists_name"):
                raise ValidationError("playlists-name required in playlists mode")
        else:
            if not data.get("name"):
                raise ValidationError("name required in normal mode")