def restart_dataset(key):
    """
    Run a dataset's query again

    Deletes all underlying datasets, marks dataset as unfinished, and queues a
    job for it. Only the user stored in the dataset's `user` parameter or an
    administrator may restart a dataset.

    :param str key:  Dataset key
    :return:  A redirect to the dataset's result page if the dataset was
    queued; a 404 or 403 error response if the dataset does not exist or may
    not be restarted by this user; an error page if the dataset is unfinished
    or lacks the `type` parameter needed to queue it again.
    """
    try:
        dataset = DataSet(key=key, db=db)
    except TypeError:
        return error(404, message="Dataset not found.")

    # is_admin is used as a method elsewhere in this file, so it has to be
    # *called* here: a bare bound method is always truthy, which made
    # `not current_user.is_admin` always False and the check below a no-op
    is_owner = current_user.get_id() == dataset.parameters.get("user", "")
    if not is_owner and not current_user.is_admin():
        return error(403, message="Not allowed.")

    if not dataset.is_finished():
        return render_template("error.html", message="This dataset is not finished yet - you cannot re-run it.")

    if "type" not in dataset.parameters:
        return render_template("error.html",
                               message="This is an older dataset that unfortunately lacks the information necessary to properly restart it.")

    for child in dataset.children:
        child.delete()

    dataset.unfinish()

    # the job type to queue is the one stored with the dataset when it was
    # first created
    queue = JobQueue(logger=log, database=db)
    queue.add_job(jobtype=dataset.parameters["type"], remote_id=dataset.key)

    flash("Dataset queued for re-running.")
    return redirect("/results/" + dataset.key + "/")
def delete_dataset(key=None):
    """
    Delete a dataset

    Administrators only. The dataset, as well as any children linked to it,
    is removed from 4CAT. Calling this on a dataset that is currently being
    executed is undefined behaviour.

    :param str key:  Dataset key; if empty, the `key` form field of the
    request is used instead.

    :request-param str key:  ID of the dataset to delete
    :request-param str ?access_token:  Access token; only required if not
    logged in currently.

    :return: A dictionary with a successful `status`.
    :return-schema: {type=object,properties={status={type=string}}}

    :return-error 404:  If the dataset does not exist.
    """
    if not current_user.is_admin():
        return error(403, message="Not allowed")

    # an explicitly passed key takes precedence over the form field
    dataset_key = key or request.form.get("key", "")

    try:
        doomed = DataSet(key=dataset_key, db=db)
    except TypeError:
        return error(404, error="Dataset does not exist.")

    doomed.delete()
    return jsonify({"status": "success"})
def datasource_script(datasource_id):
    """
    Get data source tool javascript

    The data source needs to have been loaded as a module with a
    `ModuleCollector`, and also needs to be present in `config.py`. If so,
    this endpoint returns the data source's tool javascript file, if it
    exists as `tool.js` in the data source's `webtool` folder.

    :param datasource_id:  Datasource ID, as specified in the datasource and
    config.py
    :return: A javascript file
    :return-error 404: If the datasource does not exist, or has no `tool.js`.
    """
    loaded = backend.all_modules.datasources

    # all three failure modes are reported with the same message
    not_found = "Datasource '%s' does not exist" % datasource_id

    if datasource_id not in loaded or datasource_id not in config.DATASOURCES:
        return error(404, message=not_found)

    script_path = loaded[datasource_id]["path"].joinpath("webtool", "tool.js")
    if script_path.exists():
        return send_file(str(script_path))

    return error(404, message=not_found)
def api_thread(datasource, board, thread_id):
    """
    Emulate 4chan thread.json API endpoint

    :param str datasource:  Data source ID
    :param str board:  Board name
    :param int thread_id:  Thread ID

    :request-param str format:  Data format. Can be `json` (default) or `html`.

    :return: Thread data, as a list of `posts`.
    :return-schema: {type=object,properties={posts={type=object,additionalProperties={}}}}

    :return-error 404:  If the thread ID does not exist for the given data
    source, or no posts are available for it.
    """
    if datasource not in config.DATASOURCES:
        return error(404, error="Invalid data source")

    # a table name cannot be passed as a bound parameter; concatenating it is
    # only acceptable because `datasource` was checked against the configured
    # data sources above
    thread = db.fetchone(
        "SELECT * FROM threads_" + datasource + " WHERE board = %s AND id = %s",
        (board, thread_id))

    if thread is None:
        # documented as a 404 - a plain string would be sent with status 200
        return error(404, error="Thread is not anymore available on the server.")

    response = get_thread(datasource, board, thread, db)

    # this check has to come before response["posts"] is touched, else an
    # empty result crashes instead of producing the intended 404
    if not response:
        return error(404, error="No posts available for this datasource")

    def strip_html(post):
        post["com"] = strip_tags(post.get("com", ""))
        return post

    response["posts"] = [strip_html(post) for post in response["posts"]]

    if request.args.get("format", "json") != "html":
        return jsonify(response)

    def format_html(post):
        post["com"] = format_post(post.get("com", "")).replace("\n", "<br>")
        return post

    response["posts"] = [format_html(post) for post in response["posts"]]
    metadata = {
        "subject": "".join([post.get("sub", "") for post in response["posts"]]),
        "id": response["posts"][0]["no"]
    }
    return render_template("thread.html", datasource=datasource, board=board,
                           posts=response["posts"], thread=thread, metadata=metadata)
def queue_dataset():
    """
    Queue a 4CAT search query for processing into a dataset

    Requires authentication by logging in or providing a valid access token.

    Request parameters vary by data source. The ones mandated constitute the
    minimum but more may be required.

    :request-param str board:  Board ID to query
    :request-param str datasource:  Data source ID to query
    :request-param str body_match:  String to match in the post body
    :request-param str subject_match:  String to match in the post subject
    :request-param int min_date:  Timestamp marking the beginning of the match
    period
    :request-param int max_date:  Timestamp marking the end of the match period
    :request-param str ?access_token:  Access token; only required if not
    logged in currently.

    :return str:  The dataset key, which may be used to later retrieve dataset
    status and results.
    :return-error 404: If the datasource does not exist.
    """
    datasource_id = request.form.get("datasource", "")
    if datasource_id not in backend.all_modules.datasources:
        return error(404, message="Datasource '%s' does not exist" % datasource_id)

    search_worker_id = datasource_id + "-search"
    if search_worker_id not in backend.all_modules.workers:
        return error(404, message="Datasource '%s' has no search interface" % datasource_id)

    worker_class = backend.all_modules.workers[search_worker_id]["class"]

    if not hasattr(worker_class, "validate_query"):
        # no validator available: the form data is used as submitted
        sanitised_query = request.form.to_dict()
    else:
        try:
            sanitised_query = worker_class.validate_query(request.form.to_dict(), request, current_user)
        except QueryParametersException as e:
            return "Invalid query. %s" % e

    # bookkeeping values are set after validation so the request cannot
    # override them
    sanitised_query["user"] = current_user.get_id()
    sanitised_query["datasource"] = datasource_id
    sanitised_query["type"] = search_worker_id

    dataset = DataSet(parameters=sanitised_query, db=db, type="search")

    if hasattr(worker_class, "after_create"):
        worker_class.after_create(sanitised_query, dataset, request)

    queue.add_job(jobtype=search_worker_id, remote_id=dataset.key)

    return dataset.key
def api_board_catalog(datasource, board):
    """
    Emulate 4chan API /[board]/catalog.json endpoint

    :param str datasource:  Data source ID
    :param str board:  Board to get index for
    :return:  Board catalog, up to 150 threads divided over a list of
    20-thread pages, each page having a `page` number and a list of
    `threads`, each thread containing the first post.

    :return-schema:{type=array,items={type=object,properties={
        page={type=integer},
        threads={type=array,items={type=object,properties={
            no={type=integer},
            last_modified={type=integer},
            replies={type=integer}
        }}}
    }}}

    :return-error 404:  If the board does not exist for the given datasource.
    """
    if datasource not in config.DATASOURCES:
        return error(404, error="Invalid data source")

    # the table name is concatenated rather than bound; this relies on
    # `datasource` having been checked against the configured data sources
    threads = db.fetchall(
        "SELECT * FROM threads_" + datasource + " WHERE board = %s ORDER BY is_sticky DESC, timestamp_modified DESC LIMIT 150",
        (board,))

    if not threads:
        return error(404, error="No threads available for this datasource")

    page_size = 20
    response = []

    # walk over the result in chunks of 20, numbering the pages from 1
    for page, offset in enumerate(range(0, len(threads), page_size), start=1):
        page_threads = []

        for thread in threads[offset:offset + page_size]:
            thread_data = get_thread(datasource, board, thread, db, limit=6)
            if not thread_data:
                # log the database record, not the (empty) lookup result
                log.error("Thread %s is in database and was requested via API but has no posts." % thread["id"])
                continue

            posts = thread_data["posts"]
            first_post = posts[0]
            if len(posts) > 1:
                first_post["last_replies"] = posts[1:6]

            page_threads.append(first_post)

        response.append({"page": page, "threads": page_threads})

    return jsonify(response)
def datasource_form(datasource_id):
    """
    Get data source query form HTML

    The data source needs to have been loaded as a module with a
    `ModuleCollector`, and also needs to be present in `config.py`. If so,
    this endpoint returns the HTML form rendered from `query-form.html` in
    the data source's `webtool` folder, or from `tool_default.html` if the
    data source has no such template.

    If a file `tool.js` is available in the data source's `webtool` folder,
    the response will indicate that a javascript file is available for this
    data source.

    :param datasource_id:  Data source ID, as specified in the data source and
    config.py
    :return: A JSON object with the `html` of the template, a boolean
    `has_javascript` determining whether javascript should be loaded for this
    template, a `status` code and the `datasource` ID.

    :return-error 404: If the datasource does not exist, or no template could
    be found for it.
    """
    loaded = backend.all_modules.datasources
    if datasource_id not in loaded or datasource_id not in config.DATASOURCES:
        return error(404, message="Datasource '%s' does not exist" % datasource_id)

    datasource = loaded[datasource_id]
    webtool_files = {
        "template": datasource["path"].joinpath("webtool", "query-form.html"),
        "javascript": datasource["path"].joinpath("webtool", "tool.js"),
    }

    template_path = webtool_files["template"]
    if not template_path.exists():
        # fall back to the generic form
        template_path = Path("tool_default.html")

    has_javascript = webtool_files["javascript"].exists()

    if not template_path.exists():
        return error(404, message="No interface exists for datasource '%s'" % datasource_id)

    html = render_template_string(
        template_path.read_text(),
        datasource_id=datasource_id,
        datasource_config=config.DATASOURCES[datasource_id],
        datasource=datasource)

    payload = {
        "status": "success",
        "datasource": datasource_id,
        "has_javascript": has_javascript,
        "html": html
    }
    return jsonify(payload)
def datasource_call(datasource, action):
    """
    Call datasource function

    Datasources may define custom API calls as functions in a file
    'webtool/views.py'. These are then available as 'actions' with this API
    endpoint. Request parameters (the query string for GET requests, form
    data otherwise) are passed as keyword arguments to the function, after
    the request and the current user.

    :param str datasource:  Data source ID
    :param str action:  Action to call
    :return:  A JSON object with a boolean `success` and, if the call
    returned something other than `True`, its result as `data`
    """
    # allow prettier URLs
    action = action.replace("-", "_")

    if datasource not in backend.all_modules.datasources:
        return error(404, error="Datasource not found.")

    def no_such_call():
        # every way in which the call can be unavailable gets the same answer
        return error(406, error="Datasource '%s' has no call '%s'" % (datasource, action))

    # only plain identifiers that are not dunder names may be called
    if re.search(r"[^a-zA-Z0-9_]", action) or action.startswith("__"):
        return no_such_call()

    module_info = backend.all_modules.datasources[datasource]
    if not module_info["path"].joinpath("webtool", "views.py").exists():
        return no_such_call()

    datasource_calls = importlib.import_module("datasources.%s.webtool.views" % module_info["id"])

    handler = getattr(datasource_calls, action, None)
    if not callable(handler):
        return no_such_call()

    parameters = request.args if request.method == "GET" else request.form
    response = handler(request, current_user, **parameters)

    if not response:
        return jsonify({"success": False})

    if response is True:
        return jsonify({"success": True})

    return jsonify({"success": True, "data": response})
def delete_dataset(key=None):
    """
    Delete a dataset

    Only available to administrators and dataset owners. Deletes a dataset,
    as well as any children linked to it, from 4CAT. Also tells the backend
    to stop any jobs dealing with the dataset.

    :param str key:  Dataset key; if empty, the `key` form field of the
    request is used instead.

    :request-param str key:  ID of the dataset to delete
    :request-param str ?access_token:  Access token; only required if not
    logged in currently.

    :return: A dictionary with a successful `status` and the `key` of the
    deleted dataset.
    :return-schema: {type=object,properties={status={type=string}}}

    :return-error 404:  If the dataset does not exist.
    """
    dataset_key = request.form.get("key", "") if not key else key

    try:
        dataset = DataSet(key=dataset_key, db=db)
    except TypeError:
        return error(404, error="Dataset does not exist.")

    if not current_user.is_admin() and not current_user.get_id() == dataset.parameters.get("user"):
        return error(403, message="Not allowed")

    # if there is an active or queued job for some child dataset, cancel and
    # delete it
    for child in dataset.get_all_children():
        try:
            job = Job.get_by_remote_ID(child.key, database=db, jobtype=child.type)
            # the job was looked up by the *child's* type, so that is also the
            # type the backend has to be asked to cancel
            call_api("cancel-job", {"remote_id": child.key, "jobtype": child.type,
                                    "level": BasicWorker.INTERRUPT_CANCEL})
            job.finish()
        except JobNotFoundException:
            # nothing queued or running for this child
            pass

    # now cancel the job for this one (if it exists); the lookup is only done
    # to find out whether there is a job to cancel
    try:
        Job.get_by_remote_ID(dataset.key, database=db, jobtype=dataset.type)
        call_api("cancel-job", {"remote_id": dataset.key, "jobtype": dataset.type,
                                "level": BasicWorker.INTERRUPT_CANCEL})
    except JobNotFoundException:
        pass

    # and delete the dataset and child datasets
    dataset.delete()

    return jsonify({"status": "success", "key": dataset.key})
def api_board(datasource, board):
    """
    Emulate 4chan API /[board]/threads.json endpoint

    :param str datasource:  Data source ID
    :param str board:  Board to get index for
    :return:  Thread index for board, as a list of pages of at most 20
    threads, each page containing a page number `page` and a list of
    `threads`, each thread having the keys `no` and `last_modified`.

    :return-schema:{type=array,items={type=object,properties={
        page={type=integer},
        threads={type=array,items={type=object,properties={
            no={type=integer},
            last_modified={type=integer},
            replies={type=integer}
        }}}
    }}}

    :return-error 404:  If the board does not exist for the given datasource.
    """
    if datasource not in config.DATASOURCES:
        return error(404, error="Invalid data source")

    threads = db.fetchall(
        "SELECT * FROM threads_" + datasource + " WHERE board = %s ORDER BY is_sticky DESC, timestamp_modified DESC LIMIT 200",
        (board,))

    if not threads:
        return error(404, error="No threads available for this datasource")

    # pages are numbered from 1 and hold 20 threads each
    index = [
        {
            "page": page_number,
            "threads": [
                {"no": row["id"], "last_modified": row["timestamp_modified"]}
                for row in threads[offset:offset + 20]
            ]
        }
        for page_number, offset in enumerate(range(0, len(threads), 20), start=1)
    ]

    return jsonify(index)
def datasource_form(datasource_id):
    """
    Get data source query form HTML

    The data source needs to have been loaded as a module with a
    `ModuleCollector`, and also needs to be present in `config.py`. If so,
    this endpoint returns the HTML form built from the options of the data
    source's search worker.

    If a file `tool.js` is available in the data source's `webtool` folder,
    the response will indicate that a javascript file is available for this
    data source.

    If the data source has no search worker or its search worker does not
    have any parameters defined, this returns a 404 Not Found status.

    :param datasource_id:  Data source ID, as specified in the data source and
    config.py
    :return: A JSON object with the `html` of the template, a boolean
    `has_javascript` determining whether javascript should be loaded for this
    template, a `status` code and the `datasource` ID.

    :return-error 404: If the datasource does not exist.
    """
    modules = backend.all_modules

    if datasource_id not in modules.datasources or datasource_id not in config.DATASOURCES:
        return error(404, message="Datasource '%s' does not exist" % datasource_id)

    datasource = modules.datasources[datasource_id]

    worker = modules.workers.get(datasource_id + "-search")
    if not worker:
        return error(404, message="Datasource '%s' has no search worker" % datasource_id)

    worker_class = modules.load_worker_class(worker)
    if not hasattr(worker_class, "options"):
        return error(404, message="Datasource '%s' has no dataset parameter options defined" % datasource_id)

    # the rendered option template is itself rendered as a template string
    form = render_template("create-dataset-option.html", options=worker_class.options)
    has_javascript = datasource["path"].joinpath("webtool", "tool.js").exists()

    html = render_template_string(
        form,
        datasource_id=datasource_id,
        datasource_config=config.DATASOURCES[datasource_id],
        datasource=datasource)

    result = {
        "status": "success",
        "datasource": datasource_id,
        "has_javascript": has_javascript,
        "html": html
    }
    return jsonify(result)
def preview_csv(key):
    """
    Preview a CSV file

    Simply passes the first 25 rows of a dataset's csv result file to the
    template renderer.

    :param str key:  Dataset key
    :return:  HTML preview; a 404 response if the dataset or its result file
    does not exist.
    """
    from itertools import islice

    try:
        dataset = DataSet(key=key, db=db)
    except TypeError:
        # pass the message by keyword, like every other error() call here
        return error(404, message="Dataset not found.")

    results_path = dataset.get_results_path()

    try:
        # newline="" lets the csv module do its own newline handling, so
        # quoted fields containing line breaks are read correctly
        with results_path.open(encoding="utf-8", newline="") as csvfile:
            # islice stops after 25 rows, or earlier if the file is shorter
            rows = list(islice(csv.reader(csvfile), 25))
    except FileNotFoundError:
        abort(404)

    return render_template("result-csv-preview.html", rows=rows, filename=results_path.name)
def available_processors():
    """
    Get processors available for a dataset

    :request-param string key:  Dataset key to get processors for
    :return: An object containing the `error` if the request failed, or a
    list of processors, each with a `name`, a `type` ID, a `description` of
    what it does, the `extension` of the file it produces, a `category` name,
    what types of datasets it `accepts`, and a list of `options`, if
    applicable.

    :return-schema: {type=array,items={type=object,properties={
        name={type=string},
        type={type=string},
        description={type=string},
        extension={type=string},
        category={type=string},
        accepts={type=array,items={type=string}}
    }}}

    :return-error 404:  If the dataset does not exist.
    """
    try:
        dataset = DataSet(key=request.args.get("key"), db=db)
    except TypeError:
        return error(404, error="Dataset does not exist.")

    available = dataset.get_available_processors()

    # Class type is not JSON serialisable, so drop it where present
    for details in available.values():
        details.pop("class", None)

    return jsonify(available)
def toggle_favourite(key):
    """
    'Like' a dataset

    Toggles whether the dataset is marked as a favourite of the currently
    active user, which can be used for organisation in the front-end: if it
    is not a favourite yet it becomes one, otherwise the mark is removed.

    :param str key: Key of the dataset to mark as favourite.

    :return: A JSON object with the status of the request
    :return-schema: {type=object,properties={success={type=boolean},favourite_status={type=boolean}}}

    :return-error 404:  If the dataset key was not found
    """
    try:
        dataset = DataSet(key=key, db=db)
    except TypeError:
        return error(404, error="Dataset does not exist.")

    favourite = {"name": current_user.get_id(), "key": dataset.key}
    already_marked = db.fetchone("SELECT * FROM users_favourites WHERE name = %s AND key = %s",
                                 (favourite["name"], favourite["key"]))

    if already_marked:
        db.delete("users_favourites", where=favourite)
        return jsonify({"success": True, "favourite_status": False})

    db.insert("users_favourites", data=favourite)
    return jsonify({"success": True, "favourite_status": True})
def check_processor():
    """
    Check processor status

    Keys that do not refer to an existing dataset, or that refer to a dataset
    without a parent, are left out of the response.

    :request-param str subqueries:  A JSON-encoded list of dataset keys to get
    the status of
    :return: A list of dataset data, with each dataset an item with a `key`,
    whether it had `finished`, a `html` snippet containing details, and a
    `url` at which the result may be downloaded when finished.

    :return-schema:{type=array,items={type=object,properties={
        key={type=string},
        finished={type=boolean},
        html={type=string},
        url={type=string}
    }}}

    :return-error 406:  If the list of subqueries could not be parsed.
    """
    try:
        keys = json.loads(request.args.get("subqueries"))
    except (TypeError, json.decoder.JSONDecodeError):
        return error(406, error="Unexpected format for child dataset key list.")

    # valid JSON that is not a list (a number, an object...) is just as
    # unusable as invalid JSON
    if not isinstance(keys, list):
        return error(406, error="Unexpected format for child dataset key list.")

    children = []

    for key in keys:
        try:
            dataset = DataSet(key=key, db=db)
        except TypeError:
            continue

        genealogy = dataset.get_genealogy()
        if len(genealogy) < 2:
            # a dataset without a parent is not a child dataset; indexing
            # genealogy[-2] would raise an IndexError for it
            continue

        parent = genealogy[-2]
        top_parent = genealogy[0]

        children.append({
            "key": dataset.key,
            "finished": dataset.is_finished(),
            "html": render_template("result-child.html", child=dataset, dataset=parent,
                                    query=top_parent, parent_key=top_parent.key,
                                    processors=backend.all_modules.processors),
            "resultrow_html": render_template("result-result-row.html", dataset=top_parent),
            "url": "/result/" + dataset.data["result_file"]
        })

    return jsonify(children)
def api_board_page(datasource, board, page):
    """
    Emulate 4chan API /[board]/[page].json endpoint

    :param str datasource:  Data source ID
    :param str board:  Board to get index for
    :param int page:  Page to show; pages are numbered from 1 and contain 15
    threads each
    :return:  A page containing a list of `threads`, each thread a list of
    `posts`.

    :return-schema:{type=object,properties={
        threads={type=array,items={type=object,properties={
            posts={type=array,items={type=object,additionalProperties={}}}
        }}}
    }}

    :return-error 404:  If the board does not exist for the given datasource,
    or the page number is not a positive integer.
    """
    if datasource not in config.DATASOURCES:
        return error(404, error="Invalid data source")

    try:
        page = int(page)
    except ValueError:
        return error(404, error="Invalid page number")

    if page < 1:
        # page 0 or lower would result in a negative OFFSET in the query
        return error(404, error="Invalid page number")

    # `page` is an integer at this point, so this can safely be formatted
    # into the query
    limit = "LIMIT 15 OFFSET %i" % ((page - 1) * 15)

    threads = db.fetchall(
        "SELECT * FROM threads_" + datasource + " WHERE board = %s ORDER BY is_sticky DESC, timestamp_modified DESC " + limit,
        (board,))

    if not threads:
        return error(404, error="No threads available for this datasource")

    response = {
        "threads": [
            get_thread(datasource, board, thread, db) for thread in threads
        ]
    }

    return jsonify(response)
def request_token():
    """
    Request an access token

    Requires that the user is currently logged in to 4CAT. If the user has a
    token that has not expired yet, that token is returned. Otherwise any
    tokens stored for the user are deleted and a new one, expiring after 365
    days, is generated and stored.

    :request-param str ?forward:  If set, redirect to the access token
    overview page instead of returning JSON

    :return: An object with the `token`, the `name` of the user it belongs
    to, and the timestamp at which it `expires`
    :return-schema: {type=object,properties={token={type=string}}}

    :return-error 403:  If the user is logged in with an anonymous account.
    """
    import secrets

    user_id = current_user.get_id()

    if user_id == "autologin":
        # access tokens are only for 'real' users so we can keep track of who
        # (ab)uses them
        return error(403, error="Anonymous users may not request access tokens.")

    now = int(time.time())
    existing = db.fetchone(
        "SELECT * FROM access_tokens WHERE name = %s AND (expires = 0 OR expires > %s)",
        (user_id, now))

    if existing:
        # return the same object shape as for a new token, rather than a bare
        # string
        token = {
            "name": existing["name"],
            "token": existing["token"],
            "expires": existing["expires"]
        }
    else:
        token = {
            "name": user_id,
            # a user name plus a timestamp is guessable; use the CSPRNG
            # instead. 32 random bytes give 64 hex characters, the same
            # length as the SHA-256 hex digest used for tokens before
            "token": secrets.token_hex(32),
            "expires": now + (365 * 86400)
        }

        # no valid token was found above, so whatever is still stored for
        # this user has expired and can be deleted
        db.delete("access_tokens", where={"name": user_id})

        # save new token
        db.insert("access_tokens", token)

    if request.args.get("forward"):
        # show HTML page
        return redirect(url_for("show_access_tokens"))

    # show JSON response (by default)
    return jsonify(token)
def get_archive(datasource, board):
    """
    Emulate 4chan API /[board]/archive.json endpoint

    :param str datasource:  Data source ID
    :param board:  Board to get list of archived thread IDs for
    :return:  Thread archive, a list of IDs of the threads within this board
    that have an archival timestamp, ordered by that timestamp.
    :return-schema: {type=array,items={type=integer}}

    :return-error 404: If the datasource does not exist.
    """
    if datasource not in config.DATASOURCES:
        return error(404, error="Invalid data source")

    query = "SELECT id FROM threads_" + datasource + " WHERE board = %s AND timestamp_archived > 0 ORDER BY timestamp_archived ASC"
    archived = db.fetchall(query, (board,))

    thread_ids = []
    for row in archived:
        thread_ids.append(row["id"])

    return jsonify(thread_ids)
def live_stats():
    """
    Get an overview of active workers and loaded data sources

    Only available to administrators.

    :return:  A JSON object with `workers`, containing the `id`, `name` and
    number of `active` instances of each known worker type that has at least
    one instance according to the backend, and `datasources`, containing the
    loaded data sources without their `path`.
    """
    if not current_user.is_admin():
        return error(403, message="This page is off-limits to you.")

    worker_stats = call_api("workers")["response"]

    # build copies without the `path` key rather than deleting it from the
    # shared module registry: deleting made any later request that reads the
    # path fail, including a second call to this endpoint
    datasources = {
        datasource_id: {field: value for field, value in datasource.items() if field != "path"}
        for datasource_id, datasource in all_modules.datasources.items()
    }

    workers = {}
    for worker, active in worker_stats.items():
        if worker not in all_modules.workers or active == 0:
            continue

        workers[worker] = {
            "id": worker,
            "name": all_modules.workers[worker]["name"],
            "active": active
        }

    return jsonify({"workers": workers, "datasources": datasources})
def get_boards(datasource):
    """
    Get available boards in datasource

    :param datasource:  The datasource for which to acquire the list of
    available boards.
    :return:  An object with a list of `boards`, each an object with the
    `board` ID as a string.

    :return-schema: {type=object,properties={
        boards={type=array,items={type=object,properties={
            board={type=string}
        }}}
    }}

    :return-error 404: If the datasource does not exist.
    """
    if datasource not in config.DATASOURCES:
        return error(404, error="Invalid data source")

    rows = db.fetchall("SELECT DISTINCT board FROM threads_" + datasource)

    board_list = []
    for row in rows:
        board_list.append({"board": row["board"]})

    return jsonify({"boards": board_list})
def add_user():
    """
    Create a new user

    Only available to administrators. Inserts a user with the given e-mail
    address as name and asks the user object to e-mail a registration token.

    :return:  Either an html page with a message, or a JSON response with
    `success` and `message`, depending on whether ?format == html
    """
    if not current_user.is_admin():
        return error(403, message="This page is off-limits to you.")

    outcome = {"success": False}
    email = request.form.get("email", request.args.get("email", "")).strip()

    if email and re.match(r"[^@]+\@.*?\.[a-zA-Z]+", email):
        # the e-mail address doubles as the user name
        username = email
        try:
            db.insert("users", data={"name": username, "timestamp_token": int(time.time())})

            user = User.get_by_name(username)
            if user is None:
                outcome["message"] = "User was created but could not be instantiated properly."
            else:
                try:
                    user.email_token(new=True)
                except RuntimeError as e:
                    outcome["message"] = "User was created but the registration e-mail could not be sent to them (%s)." % e
                else:
                    outcome["success"] = True
                    outcome["message"] = "An e-mail containing a link through which the registration can be completed has been sent to %s." % username

        except psycopg2.IntegrityError:
            db.rollback()
            outcome["message"] = "Error: User %s already exists." % username
    else:
        outcome["message"] = "Please provide a valid e-mail address."

    if request.args.get("format", None) != "html":
        return jsonify(outcome)

    page_title = "New account created" if outcome["success"] else "Error"
    return render_template("error.html", message=outcome["message"], title=page_title)
def cp_index():
    """
    Control panel index page

    :return:  The rendered `controlpanel/index.html` template for
    administrators, a 403 error response for anyone else.
    """
    if current_user.is_admin():
        return render_template("controlpanel/index.html")

    return error(403, message="This page is off-limits to you.")
def add_user():
    """
    Create a new user

    Sends the user an e-mail with a link through which they can set their
    password. If the user already exists, nothing is changed unless `force`
    is set, in which case the user's token timestamp is reset and the
    registration e-mail is sent again.

    :request-param str email:  E-mail address of the user; doubles as user
    name
    :request-param str ?format:  If `html`, an HTML page is returned instead
    of JSON
    :request-param str ?force:  If set, re-send the registration e-mail to an
    existing user

    :return: Either an html page with a message, or a JSON response with
    `success` and `message`, depending on whether ?format == html
    """
    from urllib.parse import urlencode

    if not current_user.is_authenticated or not current_user.is_admin():
        return error(403, message="This page is off-limits to you.")

    response = {"success": False}

    email = request.form.get("email", request.args.get("email", "")).strip()
    fmt = request.form.get("format", request.args.get("format", "")).strip()
    force = request.form.get("force", request.args.get("force", None))

    if not email or not re.match(r"[^@]+\@.*?\.[a-zA-Z]+", email):
        response["message"] = "Please provide a valid e-mail address."
    else:
        username = email
        try:
            db.insert("users", data={"name": username, "timestamp_token": int(time.time())})

            user = User.get_by_name(username)
            if user is None:
                response["message"] = "User was created but could not be instantiated properly."
            else:
                try:
                    user.email_token(new=True)
                    response["success"] = True
                    response["message"] = "An e-mail containing a link through which the registration can be completed has been sent to %s." % username
                except RuntimeError as e:
                    response["message"] = "User was created but the registration e-mail could not be sent to them (%s)." % e

        except psycopg2.IntegrityError:
            db.rollback()

            if not force:
                # the address has to be URL-encoded: characters like `+` or
                # `&` would otherwise change the meaning of the link
                retry_query = urlencode({"email": username, "force": 1, "format": fmt})
                response["message"] = 'Error: User %s already exists. If you want to re-create the user and re-send the registration e-mail, use [this link](/admin/add-user?%s).' % (username, retry_query)
            else:
                # if a user does not use their token in time, maybe you want
                # to be a benevolent admin and give them another chance,
                # without having them go through the whole signup again
                user = User.get_by_name(username)
                if user is None:
                    # same situation as right after creating a user above
                    response["message"] = "User %s already exists but could not be instantiated properly." % username
                else:
                    db.update("users", data={"timestamp_token": int(time.time())}, where={"name": username})

                    try:
                        user.email_token(new=True)
                        response["success"] = True
                        response["message"] = "A new registration e-mail has been sent to %s." % username
                    except RuntimeError as e:
                        response["message"] = "Token was reset but the registration e-mail could not be sent to them (%s)." % e

    if fmt == "html":
        return render_template("error.html", message=response["message"],
                               title=("New account created" if response["success"] else "Error"))
    else:
        return jsonify(response)
def reject_user():
    """
    (Politely) reject an account request

    Sometimes, account requests need to be rejected. If you want to let the
    requester know of the rejection, this is the route to use :-)

    As long as the e-mail address, name or message is missing, the rejection
    form is shown, with the names of the missing fields passed to the
    template as `incomplete`. Once all three are given, the message is sent
    as an e-mail with a plain text and an HTML (markdown-rendered) version.

    :request-param str email:  E-mail address to send the rejection to
    :request-param str name:  Name of the person that requested the account
    :request-param str message:  Message to send, as markdown

    :return: HTML form, or message containing the e-mail send status
    """
    if not current_user.is_authenticated or not current_user.is_admin():
        return error(403, message="This page is off-limits to you.")

    email_address = request.form.get("email", request.args.get("email", "")).strip()
    name = request.form.get("name", request.args.get("name", "")).strip()
    form_message = request.form.get("message", request.args.get("message", "")).strip()

    # collect the *names* of the fields that are still empty (not their
    # values, which are empty strings by definition)
    incomplete = []
    if not email_address:
        incomplete.append("email")

    if not name:
        incomplete.append("name")

    if not form_message:
        incomplete.append("message")

    if incomplete:
        if not form_message:
            # pre-fill the message with the rejection template, if available
            form_answer = Path(config.PATH_ROOT, "webtool/templates/account/reject-template.html")
            form_message = "" if not form_answer.exists() else render_template(
                "account/reject-template.html", email=email_address, name=name)

        return render_template("account/reject.html", email=email_address, name=name,
                               message=form_message, incomplete=incomplete)

    email = MIMEMultipart("alternative")
    email["From"] = config.NOREPLY_EMAIL
    email["To"] = email_address
    email["Subject"] = "Your %s account request" % config.TOOL_NAME

    try:
        html_message = markdown.markdown(form_message)

        # plain text first, HTML second, as alternatives of the same message
        email.attach(MIMEText(form_message, "plain"))
        email.attach(MIMEText(html_message, "html"))

        with smtplib.SMTP(config.MAILHOST) as smtp:
            smtp.sendmail(config.NOREPLY_EMAIL, [email_address], email.as_string())
    except (smtplib.SMTPException, ConnectionRefusedError) as e:
        return render_template("error.html",
                               message="Could not send e-mail to %s: %s" % (email_address, e),
                               title="Error sending rejection")

    return render_template("error.html", message="Rejection sent to %s." % email_address,
                           title="Rejection sent")
def check_dataset():
    """
    Check dataset status

    Requires authentication by logging in or providing a valid access token.

    :request-param str key:  ID of the dataset for which to return the status
    :return: Dataset status, containing the `status`, `query`, number of
    `rows`, the dataset `key`, whether the dataset is `done`, the `path` of
    the result file and whether the dataset is `empty`.

    :return-schema: {
        type=object,
        properties={
            status={type=string},
            query={type=string},
            rows={type=integer},
            key={type=string},
            done={type=boolean},
            path={type=string},
            empty={type=boolean},
            is_favourite={type=boolean}
        }
    }

    :return-error 404:  If the dataset does not exist.
    """
    dataset_key = request.args.get("key")

    try:
        dataset = DataSet(key=dataset_key, db=db)
    except TypeError:
        return error(404, error="Dataset does not exist.")

    results = dataset.check_dataset_finished()

    if not results:
        path = ""
    else:
        # `results` is either the string 'empty' or something with a `name`
        # (presumably the result file path - verify against DataSet)
        path = False if results == 'empty' else results.name

        # the stored parameters are decoded in place whenever there is a
        # result, empty or not
        record = dataset.data
        record["parameters"] = json.loads(record["parameters"])

    favourite_count = db.fetchone(
        "SELECT COUNT(*) AS num FROM users_favourites WHERE name = %s AND key = %s",
        (current_user.get_id(), dataset.key))["num"]

    status = {
        "status": dataset.get_status(),
        "status_html": render_template("result-status.html", dataset=dataset),
        "label": dataset.get_label(),
        "query": dataset.data["query"],
        "rows": dataset.data["num_rows"],
        "key": dataset_key,
        "done": bool(dataset.is_finished()),
        "path": path,
        "empty": (dataset.data["num_rows"] == 0),
        "is_favourite": favourite_count > 0
    }

    return jsonify(status)
def queue_dataset():
    """
    Queue a 4CAT search query for processing into a dataset

    Requires authentication by logging in or providing a valid access token.

    Request parameters vary by data source. The ones mandated constitute the
    minimum but more may be required.

    :request-param str board:  Board ID to query
    :request-param str datasource:  Data source ID to query
    :request-param str body_match:  String to match in the post body
    :request-param str subject_match:  String to match in the post subject
    :request-param int min_date:  Timestamp marking the beginning of the match
                                  period
    :request-param int max_date:  Timestamp marking the end of the match period
    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

    :return str:  The dataset key, which may be used to later retrieve dataset
                  status and results. If the query does not pass validation,
                  a plain-text message starting with `Invalid query.` is
                  returned instead.
    :return-error 404:  If the datasource does not exist, or has no search
                        worker.
    """
    datasource_id = request.form.get("datasource", "")
    if datasource_id not in backend.all_modules.datasources:
        return error(404, message="Datasource '%s' does not exist" % datasource_id)

    # the search worker for a data source is registered as "<id>-search"
    search_worker_id = datasource_id + "-search"
    if search_worker_id not in backend.all_modules.workers:
        return error(404, message="Datasource '%s' has no search interface" % datasource_id)

    worker_info = backend.all_modules.workers[search_worker_id]
    worker_class = backend.all_modules.load_worker_class(worker_info)

    # refuse to queue anything for a data source that cannot validate its
    # own input
    if not hasattr(worker_class, "validate_query"):
        raise NotImplementedError("Data sources MUST sanitise input values with validate_query")

    try:
        # generic sanitisation against the worker's declared options first...
        sanitised_query = UserInput.parse_all(
            worker_class.options, request.form.to_dict(), silently_correct=False)

        # ...then the data source's own validation
        sanitised_query = worker_class.validate_query(sanitised_query, request, current_user)
    except QueryParametersException as e:
        return "Invalid query. %s" % e

    # bookkeeping values that are always set by the server, overriding
    # whatever validation returned for these keys
    sanitised_query["user"] = current_user.get_id()
    sanitised_query["datasource"] = datasource_id
    sanitised_query["type"] = search_worker_id
    sanitised_query["pseudonymise"] = bool(request.form.to_dict().get("pseudonymise", False))

    # workers that do not declare a result file extension produce csv
    extension = getattr(worker_class, "extension", "csv")
    dataset = DataSet(parameters=sanitised_query, db=db, type=search_worker_id,
                      extension=extension)

    # optional hook for data sources that need to act on the new dataset
    if hasattr(worker_class, "after_create"):
        worker_class.after_create(sanitised_query, dataset, request)

    # NOTE(review): `queue` is not created in this function; it is expected
    # to be available at module level
    queue.add_job(jobtype=search_worker_id, remote_id=dataset.key)

    return dataset.key
def process_standalone(processor):
    """
    Run a standalone processor

    This bypasses the usual 4CAT query-processor structure and allows running
    any available processor (see the `/api/get-standalone-processors/`
    endpoint) with one API call. The data is returned immediately and not saved
    server-side.

    Requires authentication.

    :param str processor:  ID of the processor to run on incoming data

    :request-body object data:  Data to process, a JSON-formatted list of
    objects with each object having at least the keys `id`, `thread_id`,
    `body`, and `author`.

    :request-schema data: {
        type=object,
        properties={
            id={type=string},
            thread_id={type=string},
            body={type=string},
            author={type=string}
        }
    }

    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

    :return:  A JSON object containing the processed data, with a
    processor-specific structure.

    :return-schema: {
        type=object,
        additionalProperties={}
    }

    :return-error 402: If an invalid processor is requested, or if the input is
    not properly-formatted JSON.
    :return-error 503: If too many other requests are currently being handled,
    so that the server does not have the capacity to deal with this request
    """
    processors = get_standalone_processors().get_json()

    if processor not in processors:
        return error(402, error="Processor '%s' is not available" % processor)

    if not request.is_json:
        return error(402, error="This API endpoint only accepts JSON-formatted data as input")

    # with silent=True a malformed body yields None instead of triggering
    # Flask's own bad-request handling, so the documented 402 response can
    # be sent. A literal JSON `null` body is reported the same way.
    rows = request.get_json(force=True, silent=True)
    if rows is None:
        return error(402, error="JSON decoding error")

    # check input integrity: a non-empty list of objects that each have all
    # required fields
    required = ("id", "thread_id", "body", "author")

    if not isinstance(rows, list):
        return error(402, error="Input is valid JSON, but not a list of data objects")

    if not rows:
        return error(402, error="Input is empty")

    for row in rows:
        if not isinstance(row, dict):
            return error(402, error="Input is valid JSON, but not a list of data objects")

        for field in required:
            if field not in row:
                return error(402, error="Input is valid JSON, but not a list of data objects (missing field '%s')" % field)

    # ok, valid input!
    temp_dataset = DataSet(extension="csv", type="standalone", parameters={
        "user": current_user.get_id(), "after": [processor]}, db=db)
    temp_dataset.finish(len(rows))

    # make sure the file is deleted later, whichever way this request is
    # ultimately handled
    @after_this_request
    def delete_temporary_dataset(response):
        temp_dataset.delete()  # also deletes children!
        return response

    # write the input as a csv file so it can be accessed as normal by
    # processors; newline="" leaves line endings to the csv module
    result_file = temp_dataset.get_results_path()
    with result_file.open("w", newline="") as temp_csv:
        writer = csv.DictWriter(temp_csv, fieldnames=required)
        writer.writeheader()
        for row in rows:
            # only the required fields are passed on to the processor
            writer.writerow({field: row[field] for field in required})

    # queue the postprocessor
    metadata = processors[processor]
    processed = DataSet(extension=metadata["extension"], type=processor,
                        parent=temp_dataset.key, db=db)

    queue = JobQueue(database=db, logger=log)
    job = queue.add_job(processor, {}, processed.key)
    place_in_queue = queue.get_place_in_queue(job)
    if place_in_queue > 5:
        # too busy - withdraw the job again rather than letting it linger
        job.finish()
        return error(code=503,
                     error="Your request could not be handled as there are currently %i other jobs of this type in the queue. Please try again later." % place_in_queue)

    # wait up to half a minute for the job to be taken up
    # if not, tell the user to try again later
    start = time.time()
    while True:
        if time.time() > start + 30:
            job.finish()
            return error(code=503,
                         error="The server is currently too busy to handle your request. Please try again later.")

        if queue.get_place_in_queue(job) == 0:
            break

        time.sleep(2)

    # job currently being processed, wait for it to finish
    # NOTE(review): unlike the loop above this one has no time limit - a job
    # that never finishes keeps the request open indefinitely
    while True:
        try:
            job = Job.get_by_remote_ID(job.data["remote_id"], db, processor)
        except JobNotFoundException:
            # job record is gone, which is treated as "done"
            break

        if job.is_finished:
            break

        time.sleep(2)

    # job finished, send file - temporary datasets will be cleaned up by
    # after_this_request function defined earlier. The path is passed as a
    # string, as is done for the other send_file call in this file
    return send_file(str(processed.get_results_path()), as_attachment=True)