Пример #1
0
    def validate_query(query, request, user):
        """
        Validate input for a dataset query on the 4chan data source.

        Will raise a QueryParametersException if invalid parameters are
        encountered. The combined `daterange` parameter is split into
        `min_date` and `max_date`; `query` is modified in place and returned.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request (not used by this validator)
        :param User user:  User object of user who has submitted the query
        (not used by this validator)
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If no search query was provided, or
        if only one end of the date range was set
        """
        # this is the bare minimum, else we can't narrow down the full data set
        if not query.get("body_match", None):
            raise QueryParametersException("Please provide a search query")

        # split the combined range into its two boundaries; a missing range is
        # treated the same as an empty one instead of failing with a KeyError
        after, before = query.pop("daterange", None) or (None, None)
        query["min_date"], query["max_date"] = after, before

        # both dates need to be set, or none - this has to hold in both
        # directions, so compare whether each end is set rather than only
        # checking for a lower limit without an upper one
        if bool(after) != bool(before):
            raise QueryParametersException(
                "When setting a date range, please provide both an upper and lower limit."
            )

        return query
Пример #2
0
    def validate_query(query, request, user):
        """
        Validate BitChute query input

        The query is a list of items separated by commas and/or newlines.
        Whitespace around each item is trimmed and blank items are dropped
        before the item limit is checked.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request (not used by this validator)
        :param User user:  User object of user who has submitted the query
        (not used by this validator)
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If no items remain after cleaning
        up the query, or if more than 15 items were provided
        """
        raw_query = query.get("query") or ""

        # no query 4 u
        if not raw_query.strip():
            raise QueryParametersException("You must provide a search query.")

        # reformat queries to be a comma-separated list with no wrapping
        # whitespace; items may contain spaces themselves, so only trim the
        # edges of each item
        items = [item.strip() for item in raw_query.replace("\n", ",").split(",")]

        # blank items (e.g. from a trailing newline) are neither searchable
        # nor should they count towards the limit
        items = [item for item in items if item]
        if not items:
            raise QueryParametersException("You must provide a search query.")

        if len(items) > 15:
            raise QueryParametersException(
                "You cannot query more than 15 items at a time.")

        # simple!
        return {
            "items": query.get("max_posts"),
            "query": ",".join(items),
            "scope": query.get("search_scope"),
            "item_type": query.get("search_type")
        }
Пример #3
0
    def validate_query(query, request, user):
        """
        Check and normalise the parameters of a Usenet dataset query.

        A query needs at least a body or subject keyword, unless it is a
        random sample or the user is an admin or has the
        `usenet.can_query_without_keyword` privilege. The `daterange`
        parameter is replaced by separate `min_date` and `max_date` values;
        `query` is modified in place and returned.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param User user:  User object of user who has submitted the query
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If the query is not specific
        enough, or only one end of the date range is set
        """
        # without any of these the full data set would have to be searched
        specific_enough = (
            user.is_admin()
            or user.get_value("usenet.can_query_without_keyword", False)
            or query.get("body_match", None)
            or query.get("subject_match", None)
            or query.get("search_scope", "") == "random-sample"
        )
        if not specific_enough:
            raise QueryParametersException(
                "Please provide a body query, subject query or random sample size."
            )

        # a date range is only usable if both of its ends are known
        daterange = query.get("daterange")
        query["min_date"], query["max_date"] = daterange

        partially_set = any(daterange) and not all(daterange)
        if partially_set:
            raise QueryParametersException(
                "When providing a date range, set both an upper and lower limit."
            )

        # the combined value has served its purpose
        del query["daterange"]

        # nothing left to object to
        return query
Пример #4
0
    def validate_query(query, request, user):
        """
        Validate a query consisting of a short list of items

        The items may be separated by commas or newlines; all whitespace is
        removed from them. At most five items may be queried at once.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param User user:  User object of user who has submitted the query
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If the query is empty or contains
        more than five items
        """
        raw_query = query.get("query", "")

        # an empty or whitespace-only query is of no use
        if not raw_query.strip():
            raise QueryParametersException("You must provide a search query.")

        # newlines separate items just like commas do; after that, strip any
        # whitespace that is left
        comma_separated = raw_query.replace("\n", ",")
        items = re.sub(r"\s+", "", comma_separated)

        # n commas separate n + 1 items
        if items.count(",") + 1 > 5:
            raise QueryParametersException(
                "You cannot query more than 5 items at a time.")

        scope = query.get("search_scope")

        # that's all there is to it
        return {
            "items": query.get("max_posts"),
            "query": items,
            "board": scope + "s",  # used in web interface
            "search_scope": scope,
            "scrape_comments": query.get("scrape_comments")
        }
Пример #5
0
    def validate_query(query, request, user):
        """
        Validate Parler query input

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request (not used by this validator)
        :param User user:  User object of user who has submitted the query
        (not used by this validator)
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If the query or the 'JST'/'MST'
        values are missing, the number of posts is not a number, more than 15
        items are queried, or the date range ends before it starts
        """
        # no query 4 u
        if not query.get("query", "").strip():
            raise QueryParametersException("You must provide a search query.")

        if not query.get("jst") or not query.get("mst"):
            raise QueryParametersException(
                "You must provide the 'JST' and 'MST' values")

        # upper limit for the number of posts; 2500 is mostly arbitrary - may
        # need tweaking
        max_posts = 2500
        if query.get("max_posts", ""):
            try:
                max_posts = min(abs(int(query.get("max_posts"))), max_posts)
            except (TypeError, ValueError):
                # int() raises ValueError for non-numeric strings and
                # TypeError for values that are not number-like at all
                raise QueryParametersException(
                    "Provide a valid number of posts to query.")

        # reformat queries to be a comma-separated list with no wrapping
        # whitespace
        whitespace = re.compile(r"\s+")
        items = whitespace.sub("", query.get("query").replace("\n", ","))
        if len(items.split(",")) > 15:
            raise QueryParametersException(
                "You cannot query more than 15 items at a time.")

        # the dates need to make sense as a range to search within; a missing
        # range is treated as an open one
        after, before = query.get("daterange") or (None, None)
        if before and after and before < after:
            raise QueryParametersException(
                "Date range must start before it ends")

        query["min_date"], query["max_date"] = (after, before)

        # simple!
        return {
            "items": max_posts,
            "query": items,
            "min_date": after,
            "max_date": before,
            "jst": query.get("jst"),
            "mst": query.get("mst"),
            "scrape_echoes": bool(query.get("scrape_echoes", False))
        }
Пример #6
0
    def validate_query(query, request, user):
        """
        Validate a query for a list of tags or blogs

        The query contains one tag or blog per line; hash signs are removed,
        surrounding whitespace is trimmed and blank lines are ignored. The
        `daterange` parameter is replaced by separate `min_date` and
        `max_date` values. `query` is modified in place and returned.

        :param dict query:  Query parameters, from client-side.
        :param request:  	Flask request (not used by this validator)
        :param User user:  	User object of user who has submitted the query
        (not used by this validator)
        :return dict:  		Safe query parameters
        :raises QueryParametersException:  If no or more than ten items are
        given, or only one end of the date range is set
        """
        # no query 4 u
        if not query.get("query", "").strip():
            raise QueryParametersException("You must provide a search query.")

        # one item per line; clean items up *before* counting them, so blank
        # lines (e.g. a trailing newline) do not count towards the limit
        lines = query.get("query").replace("#", "").split("\n")
        items = [line.strip() for line in lines]
        items = [item for item in items if item]

        # the query may have consisted of nothing but hash signs
        if not items:
            raise QueryParametersException("Search query cannot be empty.")

        # Not more than 10 plox
        if len(items) > 10:
            raise QueryParametersException(
                "Only query for ten or less tags or blogs (%i provided)." %
                len(items))

        # the dates need to make sense as a range to search within
        query["min_date"], query["max_date"] = query.get("daterange")
        if any(query.get("daterange")) and not all(query.get("daterange")):
            raise QueryParametersException(
                "When providing a date range, set both an upper and lower limit."
            )

        del query["daterange"]

        # So it shows nicely in the frontend.
        query["query"] = ", ".join(items)
        query["board"] = query.get(
            "search_scope") + "s"  # used in web interface

        # if we made it this far, the query can be executed
        return query
Пример #7
0
    def validate_query(query, request, user):
        """
        Validate input for a dataset query on the Guardian data source.

        Will raise a QueryParametersException if invalid parameters are
        encountered. The `daterange` parameter is replaced by separate
        `min_date` and `max_date` values; `query` is modified in place and
        returned.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request (not used by this validator)
        :param User user:  User object of user who has submitted the query
        (not used by this validator)
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If no search query is given, the
        dense thread parameters are invalid, or the date range is missing or
        spans more than six months
        """
        # this is the bare minimum, else we can't narrow down the full data set
        if not query.get("body_match", None) and not query.get(
                "subject_match", None):
            raise QueryParametersException("Please provide a search query")

        # the dense threads option requires valid density and length
        # parameters. full threads is implied, so it is otherwise left alone
        # here
        if query.get("search_scope", "") == "dense-threads":
            # int() raises TypeError rather than ValueError for None, so both
            # need to be caught to report a missing value properly
            try:
                dense_density = int(query.get("scope_density", ""))
            except (TypeError, ValueError):
                raise QueryParametersException(
                    "Please provide a valid numerical density percentage.")

            if not 15 <= dense_density <= 100:
                raise QueryParametersException(
                    "Please provide a density percentage between 15 and 100.")

            try:
                dense_length = int(query.get("scope_length", ""))
            except (TypeError, ValueError):
                raise QueryParametersException(
                    "Please provide a valid numerical dense thread length.")

            if dense_length < 30:
                raise QueryParametersException(
                    "Please provide a dense thread length of at least 30.")

        # a date range is mandatory here; a missing range is reported the same
        # way as an incomplete one
        daterange = query.get("daterange") or (None, None)
        if not all(daterange):
            raise QueryParametersException("You must provide a date range")

        query["min_date"], query["max_date"] = daterange
        del query["daterange"]

        # six months, counted as six periods of 31 days, in seconds
        max_span = 86400 * 31 * 6
        if (query["max_date"] - query["min_date"]) > max_span:
            raise QueryParametersException(
                "Date range may span 6 months at most")

        # if we made it this far, the query can be executed
        return query
Пример #8
0
    def validate_query(query, request, user):
        """
        Check and sanitise the parameters of a Twitter dataset query.

        Only the parameters that are relevant to the query are returned; any
        other input is discarded.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param User user:  User object of user who has submitted the query
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If the query or bearer token is
        missing, the query is too long, or the date range ends before it
        starts
        """
        search_query = query.get("query", None)
        bearer_token = query.get("api_bearer_token", None)

        # without a query there is nothing to narrow the data set down with
        if not search_query:
            raise QueryParametersException("Please provide a query.")

        if not bearer_token:
            raise QueryParametersException(
                "Please provide a valid bearer token.")

        if len(search_query) > 1024:
            raise QueryParametersException(
                "Twitter API queries cannot be longer than 1024 characters.")

        # either end of the range may be left open on Twitter, so the order
        # can only be checked if both ends are given
        start, end = query.get("daterange")
        both_given = end and start
        if both_given and end < start:
            raise QueryParametersException(
                "Date range must start before it ends")

        # all checks passed: hand back only what is needed
        return {
            "query": search_query,
            "api_bearer_token": bearer_token,
            "min_date": start,
            "max_date": end,
            "amount": query.get("amount")
        }
Пример #9
0
    def validate_query(query, request, user):
        """
        Validate Telegram query

        The query is a list of items separated by commas and/or newlines. All
        whitespace is removed and empty items are dropped before the item
        limit is checked.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request (not used by this validator)
        :param User user:  User object of user who has submitted the query
        (not used by this validator)
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If the query or the API credentials
        are missing, or if more than 25 items are queried
        """
        # no query 4 u
        if not query.get("query", "").strip():
            raise QueryParametersException("You must provide a search query.")

        if not query.get("api_id", None) or not query.get(
                "api_hash", None) or not query.get("api_phone", None):
            raise QueryParametersException(
                "You need to provide valid Telegram API credentials first.")

        # reformat queries to be a comma-separated list with no wrapping
        # whitespace
        whitespace = re.compile(r"\s+")
        cleaned = whitespace.sub("", query.get("query").replace("\n", ","))

        # eliminate empty queries *before* counting, so that e.g. a trailing
        # newline does not count as an item
        items = [item for item in cleaned.split(",") if item]
        if not items:
            raise QueryParametersException("You must provide a search query.")

        if len(items) > 25:
            raise QueryParametersException(
                "You cannot query more than 25 items at a time.")

        # simple!
        return {
            "items": query.get("max_posts"),
            "query": ",".join(items),
            "board": "",  # needed for web interface
            "api_id": query.get("api_id"),
            "api_hash": query.get("api_hash"),
            "api_phone": query.get("api_phone")
        }
Пример #10
0
    def parse_all(options, input, silently_correct=True):
        """
        Sanitise submitted form input according to a set of option definitions

        Only fields that correspond to one of the defined options end up in
        the result; everything else is discarded. Each value is parsed via
        `UserInput.parse_value()`. Options that are missing from the input get
        their default value, or `None` if they have no default. Date ranges
        are returned as an `(after, before)` tuple and toggles as a boolean
        that indicates whether the field was present in the input.

        :param dict options:  Options, as a name -> settings dictionary
        :param dict input:  Input, as a form field -> value dictionary
        :param bool silently_correct:  If true, replace invalid values with the
        given default value; else, raise a QueryParametersException if a value
        is invalid.

        :return dict:  Sanitised form input
        """
        # the web interface submits every parameter as option-[parameter ID];
        # drop the prefix so fields can be matched with option names
        prefix = re.compile(r"^option-")
        input = {prefix.sub("", key): input[key] for key in input}

        sanitised = {}
        for name, settings in options.items():
            option_type = settings.get("type")

            if option_type in (UserInput.OPTION_DIVIDER,
                               UserInput.OPTION_INFO):
                # purely structural form elements; there is no value to parse
                continue

            if option_type == UserInput.OPTION_DATERANGE:
                # a date range is submitted as two separate fields
                bounds = []
                for suffix in ("-min", "-max"):
                    field = name + suffix

                    # the client normally fills these in; if it did not,
                    # fall back on the proxy field
                    if field not in input or input.get(field) == "-1":
                        field += "_proxy"

                    bounds.append(
                        UserInput.parse_value(settings, input.get(field),
                                              silently_correct))

                # each bound is whatever parse_value returned for it
                after, before = bounds

                if before and after and after > before:
                    if not silently_correct:
                        raise QueryParametersException(
                            "End of date range must be after beginning of date range."
                        )

                    # collapse the range rather than rejecting it
                    before = after

                sanitised[name] = (after, before)

            elif option_type == UserInput.OPTION_TOGGLE:
                # unchecked checkboxes are not submitted at all, so presence
                # is what counts
                sanitised[name] = name in input

            elif name in input:
                # ordinary value: parse and sanitise it
                sanitised[name] = UserInput.parse_value(
                    settings, input[name], silently_correct)

            else:
                # not submitted, so fall back on the default
                sanitised[name] = settings.get("default", None)

        return sanitised
Пример #11
0
    def parse_value(settings, choice, silently_correct=True):
        """
        Filter user input

        Makes sure user input for post-processors is valid and within the
        parameters specified by the post-processor

        :param obj settings:  Settings, including defaults and valid options
        :param choice:  The chosen option, to be parsed
        :param bool silently_correct:  If true, replace invalid values with the
        given default value; else, raise a QueryParametersException if a value
        is invalid. Only choice and text options can raise; dates that cannot
        be parsed are always returned as `None`.

        :return:  Validated and parsed input
        :raises QueryParametersException:  If `silently_correct` is false and
        an invalid value is given for a choice or numeric text option
        """
        input_type = settings.get("type", "")
        if input_type in (UserInput.OPTION_INFO, UserInput.OPTION_DIVIDER):
            # these are structural form elements and can never return a value
            return None

        elif input_type == UserInput.OPTION_TOGGLE:
            # simple boolean toggle
            return choice is not None

        elif input_type in (UserInput.OPTION_DATE, UserInput.OPTION_DATERANGE):
            # parse either integers (unix timestamps) or try to guess the date
            # format (the latter may be used for input if JavaScript is turned
            # off in the front-end and the input comes from there)
            try:
                try:
                    return int(choice)
                except ValueError:
                    return int(parse_datetime(choice).timestamp())
            except Exception:
                # deliberately best-effort: anything that cannot be parsed
                # (including a missing value) counts as "no date". Unlike a
                # `return` inside `finally`, this lets KeyboardInterrupt and
                # SystemExit propagate.
                return None

        elif input_type == UserInput.OPTION_MULTI:
            # any number of values out of a list of possible values
            # comma-separated during input, returned as a list of valid options
            if not choice:
                return settings.get("default", [])

            valid_options = settings.get("options", [])
            return [item for item in choice.split(",") if item in valid_options]

        elif input_type == UserInput.OPTION_CHOICE:
            # select box
            # one out of multiple options
            # return option if valid, or default
            valid_options = settings.get("options") or {}
            if choice in valid_options:
                return choice

            if not silently_correct:
                # iterating yields the keys of a dictionary as well as the
                # items of a list
                raise QueryParametersException(
                    "Invalid value selected; must be one of %s." %
                    ", ".join(str(option) for option in valid_options))

            return settings.get("default", "")

        elif input_type in (UserInput.OPTION_TEXT,
                            UserInput.OPTION_TEXT_LARGE):
            # text string
            # optionally clamp it as an integer; return default if not a valid
            # integer
            if "max" in settings:
                try:
                    choice = min(settings["max"], int(choice))
                except (ValueError, TypeError):
                    if not silently_correct:
                        raise QueryParametersException(
                            "Provide a value of %i or lower." %
                            settings["max"])

                    choice = settings.get("default")

            if "min" in settings:
                try:
                    choice = max(settings["min"], int(choice))
                except (ValueError, TypeError):
                    if not silently_correct:
                        raise QueryParametersException(
                            "Provide a value of %i or more." % settings["min"])

                    choice = settings.get("default")

            if choice is None or choice == "":
                choice = settings.get("default")

            # without a default, fall back on a neutral value of the type the
            # option is expected to have
            if choice is None:
                choice = 0 if "min" in settings or "max" in settings else ""

            return choice

        else:
            # no filtering
            return choice
Пример #12
0
    def validate_query(query, request, user):
        """
        Validate custom data input

        Confirms that the uploaded file is a valid CSV or tab file and, if so, returns
        some metadata.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param User user:  User object of user who has submitted the query
        (not used by this validator)
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If no file was uploaded, the file
        cannot be read as a CSV or TAB file, required columns are missing, or
        the timestamp of the first row cannot be parsed
        """

        # do we have an uploaded file?
        if "option-data_upload" not in request.files:
            raise QueryParametersException("No file was offered for upload.")

        file = request.files["option-data_upload"]
        if not file:
            raise QueryParametersException("No file was offered for upload.")

        encoding = sniff_encoding(file)
        wrapped_file = io.TextIOWrapper(file, encoding=encoding)

        try:
            # reading the sample, sniffing the dialect and reading the header
            # can all fail on files that are not (properly encoded) CSV
            try:
                sample = wrapped_file.read(1024 * 1024)
                wrapped_file.seek(0)
                dialect = csv.Sniffer().sniff(sample,
                                              delimiters=(",", ";", "\t"))

                # With validated csvs, save as is but make sure the raw file is sorted
                reader = csv.DictReader(wrapped_file, dialect=dialect)

                # fieldnames is None for a file without any rows
                fields = reader.fieldnames or []
            except (UnicodeDecodeError, csv.Error):
                raise QueryParametersException(
                    "Uploaded file is not a well-formed CSV or TAB file.")

            # check if all required fields are present
            required = ("id", "thread_id", "subject", "author", "body",
                        "timestamp")
            missing = [field for field in required if field not in fields]
            if missing:
                raise QueryParametersException(
                    "The following required columns are not present in the csv file: %s"
                    % ", ".join(missing))

            # check the timestamp format using the first row, if there is one
            try:
                row = next(reader)
            except StopIteration:
                row = None
            except (UnicodeDecodeError, csv.Error):
                raise QueryParametersException(
                    "Uploaded file is not a well-formed CSV or TAB file.")

            if row is not None:
                try:
                    parse_datetime(row["timestamp"])
                except (ValueError, TypeError, OverflowError):
                    # presumably a missing cell (None) makes the parser
                    # raise TypeError; treat it like an unparseable value
                    raise QueryParametersException(
                        "Your 'timestamp' column does not use a recognisable format (yyyy-mm-dd hh:mm:ss is recommended)"
                    )
        finally:
            # always detach, also when validation fails, so that discarding
            # the wrapper does not close the underlying uploaded file
            wrapped_file.detach()

        # Whether to strip the HTML tags
        strip_html = bool(query.get("strip_html"))

        # return metadata - the filename is sanitised and serves no purpose at
        # this point in time, but can be used to uniquely identify a dataset
        disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
        return {
            "filename": disallowed_characters.sub("", file.filename),
            "time": time.time(),
            "datasource": "custom",
            "board": "upload",
            "strip_html": strip_html
        }
Пример #13
0
    def validate_query(query, request, user):
        """
        Validate input for a dataset query on the Reddit data source.

        Will raise a QueryParametersException if invalid parameters are
        encountered. Mutually exclusive parameters may also be sanitised by
        ignoring either of the mutually exclusive options. The `daterange`
        parameter is replaced by separate `min_date` and `max_date` values;
        `query` is modified in place and returned.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request (not used by this validator)
        :param User user:  User object of user who has submitted the query
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If no board is given, a keyword is
        required but missing, all keywords are negated, only one end of the
        date range is set, or wildcards are used without permission
        """
        # we need a board! trim each board *before* removing the r/ prefix,
        # else the prefix is not found for boards preceded by whitespace
        r_prefix = re.compile(r"^/?r/")
        boards = [
            r_prefix.sub("", board.strip()).strip()
            for board in (query.get("board") or "").split(",")
        ]
        boards = [board for board in boards if board]

        if not boards:
            raise QueryParametersException(
                "Please provide a board or a comma-separated list of boards to query."
            )

        # ignore leading r/ for boards
        query["board"] = ",".join(boards)

        body_match = (query.get("body_match") or "").strip()
        subject_match = (query.get("subject_match") or "").strip()

        # this is the bare minimum, else we can't narrow down the full data set
        if not user.is_admin() and not user.get_value(
                "reddit.can_query_without_keyword",
                False) and not body_match and not subject_match:
            raise QueryParametersException(
                "Please provide a body query or subject query.")

        # body query and full threads are incompatible, returning too many posts
        # in most cases
        if query.get("body_match", None):
            query.pop("full_threads", None)

        # Make sure no body or subject searches starting with just a minus sign are possible, e.g. "-Trump"
        # split on any whitespace, so repeated spaces do not produce empty
        # terms that would make the check pass
        terms = body_match.split() + subject_match.split()
        if terms and all(term.startswith("-") for term in terms):
            raise QueryParametersException(
                "Please provide body queries that do not start with a minus sign."
            )

        # the dates need to make sense as a range to search within; they need
        # to be extracted from the range before they can be checked
        after, before = query.pop("daterange", None) or (None, None)
        query["min_date"], query["max_date"] = after, before

        # both dates need to be set, or none
        if bool(after) != bool(before):
            raise QueryParametersException(
                "When setting a date range, please provide both an upper and lower limit."
            )

        may_use_wildcards = user.get_value("reddit.can_query_without_keyword",
                                           False)

        if "*" in (query.get("body_match") or "") and not may_use_wildcards:
            raise QueryParametersException(
                "Wildcard queries are not allowed as they typically return too many results to properly process."
            )

        if "*" in query["board"] and not may_use_wildcards:
            raise QueryParametersException(
                "Wildcards are not allowed for boards as this typically returns too many results to properly process."
            )

        # dense thread parameters are only relevant for that search scope;
        # they may not have been submitted at all, so do not insist on them
        if query.get("search_scope") not in ("dense-threads", ):
            query.pop("scope_density", None)
            query.pop("scope_length", None)

        # if we made it this far, the query can be executed
        return query
Пример #14
0
    def validate_query(query, request, user):
        """
        Validate custom data input

        Confirms that the uploaded file is a valid CSV file that contains the
        columns required for the chosen platform and, if so, returns some
        metadata.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param User user:  User object of user who has submitted the query
        (not used by this validator)
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If no file was uploaded, the
        platform is unknown, the file cannot be read as CSV, or required
        columns are missing
        """

        # do we have an uploaded file?
        if "option-data_upload" not in request.files:
            raise QueryParametersException("No file was offered for upload.")

        platform = query.get("platform", "")
        if platform not in ImportFromExternalTool.required_columns:
            raise QueryParametersException("Invalid platform")

        file = request.files["option-data_upload"]
        if not file:
            raise QueryParametersException("No file was offered for upload.")

        # the encoding is determined by the sniff_encoding helper
        encoding = sniff_encoding(file)
        wrapped_upload = io.TextIOWrapper(file, encoding=encoding)

        try:
            # validate file as csv
            reader = csv.DictReader(wrapped_upload, delimiter=",")

            # the header is only read when the field names are first accessed;
            # fieldnames is None for a file without any rows
            try:
                fields = reader.fieldnames or []
            except (UnicodeDecodeError, csv.Error):
                raise QueryParametersException(
                    "Uploaded file is not a well-formed CSV file.")

            # check if all required fields are present
            required = ImportFromExternalTool.required_columns[platform]
            missing = [field for field in required if field not in fields]

            if missing:
                raise QueryParametersException(
                    "The following required columns are not present in the csv file: %s. Provided field names: %s"
                    % (", ".join(missing), ", ".join(fields)))
        finally:
            # always detach, also when validation fails, so that discarding
            # the wrapper does not close the underlying uploaded file
            wrapped_upload.detach()

        # return metadata - the filename is sanitised and serves no purpose at
        # this point in time, but can be used to uniquely identify a dataset
        disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
        return {
            "filename": disallowed_characters.sub("", file.filename),
            "time": time.time(),
            "datasource": platform,
            "board": "upload",
            "platform": platform
        }