Пример #1
0
def index():
    """Render the landing page showing how many documents are in the index.

    Runs a ``match_all`` count against the index named by the ``ES_INDEX``
    config value and passes the result to the template as ``docs``
    (0 when the response carries no ``count`` key).
    """
    client = get_db()
    index_name = current_app.config.get("ES_INDEX")
    match_everything = {"query": {"match_all": {}}}
    response = client.count(
        index=index_name,
        doc_type="_doc",
        body=match_everything,
    )
    total = response.get("count", 0)
    return render_template("index.html.j2", docs=total)
Пример #2
0
def charity_get(regno, filetype="html"):
    """Show a charity together with its accounts and indexed documents.

    Args:
        regno: Registration number of the charity to look up.
        filetype: ``"json"`` returns a dict with ``data`` and ``errors``
            keys; anything else renders ``charity.html.j2``.

    Aborts with 404 when the charity source returns no record for ``regno``.
    """
    client = get_db()
    response = client.search(
        index=current_app.config.get("ES_INDEX"),
        doc_type="_doc",
        _source_includes=["regno", "fye"],
        body={"query": {"term": {"regno": regno}}},
    )

    # Index the stored documents by the first ten characters of their "fye"
    # value; hits without an "fye" are skipped.
    documents = {}
    for hit in response.get("hits", {}).get("hits", []):
        if not hit.get("_source", {}).get("fye"):
            continue
        documents[hit["_source"]["fye"][0:10]] = {
            "doc_id": hit.get("_id"),
            "doc_url": url_for("doc.doc_get", id=hit.get("_id")),
        }

    source = get_charity_type(regno)
    accounts = {}
    for account in source.list_accounts(regno):
        accounts["{:%Y-%m-%d}".format(account.fyend)] = account

    charity = source.get_charity(regno)
    if not charity:
        abort(404)

    # Merge order matters: account fields override the finance record,
    # document fields override both, and "fyend" is always set last.
    finances = []
    for finance in charity.get("finances", []):
        year_end = finance["financialYear"]["end"][0:10]
        account = accounts.get(year_end, Account(None, regno, year_end))
        finances.append({
            **finance,
            **account._asdict(),
            **documents.get(year_end, {}),
            "fyend": year_end,
        })
    charity["finances"] = finances

    context = dict(results=accounts, charity=charity, regno=regno)
    if filetype == "json":
        return {"data": context, "errors": []}
    return render_template("charity.html.j2", **context)
Пример #3
0
def get_doc(id, q=None):
    """Fetch a single document by id, optionally highlighting a search query.

    Args:
        id: Elasticsearch ``_id`` of the document.
        q: Optional search string. When given, the whole
            ``attachment.content`` field is requested back as one highlighted
            fragment and replaces the stored content on the returned hit.

    Returns:
        The first matching hit (with ``_highlight_count`` added when a
        highlight was produced), or ``None`` when nothing matches.
    """
    highlight_class = 'data-charity-account-highlight="true"'
    es = get_db()

    body = {"query": {"terms": {"_id": [id]}}}
    if q:
        content_highlight = {
            # 0 fragments asks for the entire field instead of snippets.
            "number_of_fragments": 0,
            "pre_tags": [
                f'<em class="bg-yellow b highlight" {highlight_class}>'
            ],
            "post_tags": ["</em>"],
            "highlight_query": {
                "simple_query_string": {
                    "query": q,
                    "fields": ["attachment.content"],
                    "default_operator": "or",
                }
            },
        }
        body["highlight"] = {
            "fields": {"attachment.content": content_highlight},
            "encoder": "html",
        }

    response = es.search(
        index=current_app.config.get("ES_INDEX"),
        doc_type="_doc",
        body=body,
        _source_excludes=["filedata"],
    )

    hits = response.get("hits", {}).get("hits", [])
    if not hits:
        return None

    doc = hits[0]
    fragments = doc.get("highlight", {}).get("attachment.content")
    if fragments:
        content = Markup(fragments[0]).unescape()
        # Each highlighted term carries the marker attribute exactly once.
        doc["_highlight_count"] = content.count(highlight_class)
        doc["_source"]["attachment"]["content"] = content
    return doc
Пример #4
0
def doc_get_pdf(id):
    """Return the stored PDF for a document as an ``application/pdf`` response.

    Args:
        id: Elasticsearch ``_id`` of the document.

    Aborts with 404 when the document does not exist or when it has no
    ``filedata`` to decode.
    """
    es = get_db()
    try:
        doc = es.get(
            index=current_app.config.get("ES_INDEX"),
            doc_type="_doc",
            id=id,
            _source_includes=["filedata"],
        )
    except NotFoundError:
        abort(404, description=f"Could not find document (id: [{id}])")

    # A hit without file data would otherwise reach b64decode(None) and
    # surface as an unhandled TypeError (HTTP 500).
    filedata = doc.get("_source", {}).get("filedata")
    if not filedata:
        abort(404, description=f"Document has no PDF data (id: [{id}])")

    # No Content-Disposition header is sent with the response.
    return make_response(
        base64.b64decode(filedata),
        200,
        {"Content-type": "application/pdf"},
    )
Пример #5
0
def cli_upload(input_path, debug, skip_if_exists=False):
    """Upload one PDF, or every ``*.pdf`` below a directory, via ``upload_doc``.

    File names must look like ``<regno>_<YYYYMMDD>.<ext>``; the registration
    number and financial year end are taken from the name.

    Args:
        input_path: Path to a single file, or to a directory that is
            searched recursively for ``*.pdf`` files.
        debug: When true, echo a line before and after each successful upload.
        skip_if_exists: Passed straight through to ``upload_doc``.

    Files larger than the ``FILE_SIZE_LIMT`` config value, and files whose
    name cannot be parsed, are reported on stderr and skipped.
    """
    def echo_error(message):
        """Print ``message`` to stderr in the error colour scheme."""
        click.echo(click.style(message, fg="white", bg="red"), err=True)

    if os.path.isdir(input_path):
        files = Path(input_path).glob("**/*.pdf")
    else:
        files = [input_path]

    for filepath in tqdm(files):
        filesize = os.path.getsize(filepath)
        if filesize > current_app.config["FILE_SIZE_LIMT"]:
            echo_error(f"ERROR Filesize too big: {filepath} [{filesize}]")
            continue

        # splitext removes the extension as a unit; str.rstrip(".pdf") strips
        # any trailing run of the characters '.', 'p', 'd', 'f' instead.
        stem = os.path.splitext(os.path.basename(filepath))[0]
        try:
            regno, fyend = stem.split("_")
            fyend = datetime.date(
                int(fyend[0:4]),
                int(fyend[4:6]),
                int(fyend[6:8]),
            )
        except ValueError:
            # One badly named file should not abort the rest of the batch.
            echo_error(f"ERROR Could not parse filename: {filepath}")
            continue

        charity = {
            "regno": regno,
            "fye": fyend,
        }

        if debug:
            click.echo(f"Uploading document: {filepath}")
        with open(filepath, "rb") as pdffile:
            content = pdffile.read()
        result = upload_doc(charity,
                            content,
                            get_db(),
                            skip_if_exists=skip_if_exists)

        if result["result"] in ("created", "updated", "already exists"):
            if debug:
                click.echo(
                    click.style(
                        f"Document {result['result']}: {filepath}",
                        fg="green"))
        else:
            echo_error(f"ERROR Could not upload document: {filepath}")
            print(result)
Пример #6
0
def doc_upload(filetype="html"):
    """Handle the document upload form and its POST submission.

    A non-POST request renders ``doc_upload.html.j2``. A POST takes the PDF
    either from the uploaded ``doc`` file or by downloading ``url``, works
    out the charity number and financial year end (from the request values,
    falling back to the file name), and stores it with ``upload_doc``.

    Args:
        filetype: ``"json"`` for JSON responses; any other value is treated
            as ``"html"`` (flash message plus redirect).

    Returns:
        On success, JSON with the new document id and result, or a redirect
        to the document page. On failure, JSON with an ``errors`` list, or a
        redirect back to the current URL with a flashed error.
    """
    if filetype not in ["json", "html"]:
        filetype = "html"

    def fail(message):
        """Flash ``message`` and build the error response for ``filetype``."""
        flash(message, "error")
        if filetype == "json":
            return jsonify({
                "data": {},
                "errors": [message],
            })
        return redirect(request.url)

    es = get_db()
    if request.method != "POST":
        return render_template("doc_upload.html.j2")

    regno = request.values.get("regno")
    fye = request.values.get("fye")

    doc = request.files.get("doc")
    url = request.values.get("url")
    if doc:
        filename = secure_filename(doc.filename)
        content = doc.read()
    elif url:
        # NOTE(review): the URL is caller-supplied and fetched server-side
        # without a timeout or host restriction - confirm this is acceptable.
        try:
            r = requests.get(url)
        except requests.RequestException:
            return fail("Couldn't load from URL: {}".format(url))
        if r.status_code != requests.codes.ok:
            return fail("Couldn't load from URL: {}".format(url))
        content = r.content

        # Prefer the server-provided name, but a Content-Disposition header
        # without a filename must not raise IndexError.
        filename = None
        found = re.findall("filename=(.+)",
                           r.headers.get("Content-Disposition") or "")
        if found:
            filename = found[0].strip('"')
        if not filename:
            if regno and fye:
                # fye arrives as a "YYYY-MM-DD" string, so a strftime-style
                # format spec cannot be applied to it; drop the dashes.
                filename = "{}_{}.pdf".format(regno, fye.replace("-", ""))
            else:
                filename = "annual_accounts.pdf"
    else:
        return fail("No file found")

    if not filename.lower().endswith(".pdf"):
        return fail("File must be a PDF")

    charity = {
        "regno": regno,
        "fye": fye,
        "name": request.values.get("name"),
        "income": request.values.get("income"),
        "spending": request.values.get("spending"),
        "assets": request.values.get("assets"),
    }

    if not charity["regno"] or not charity["fye"]:
        # Fall back to the details encoded in the file name.
        nameparse = re.match(CC_ACCOUNT_FILENAME, filename, re.IGNORECASE)
        if not nameparse:
            return fail("Must provide charity number and financial year end")
        charity["regno"] = nameparse.group(1).lstrip("0")
        charity["fye"] = "{}-{}-{}".format(
            nameparse.group(2),
            nameparse.group(3),
            nameparse.group(4),
        )

    try:
        charity["fye"] = datetime.datetime.strptime(charity["fye"],
                                                    "%Y-%m-%d")
    except ValueError:
        return fail("Financial year end must be a valid date (YYYY-MM-DD)")

    result = upload_doc(charity, content, es)
    flash('Uploaded "{}"'.format(filename), "message")
    if filetype == "json":
        return jsonify({
            "data": {
                "id": result.get("_id"),
                "result": result.get("result"),
            },
            "errors": [],
        })
    return redirect(url_for("doc.doc_get", id=result.get("_id")))
Пример #7
0
def doc_all_docs(filetype="html"):
    """List every indexed document, paginated, or stream them all as CSV.

    Args:
        filetype: ``"csv"`` streams one CSV row per document in the index;
            any other value renders ``doc_all_docs.html.j2`` with ten
            results per page.

    The page number is read from the ``p`` request value; missing,
    non-numeric or non-positive values fall back to page 1.
    """
    es = get_db()

    try:
        # Clamp to 1 so the search offset below can never be negative.
        p = max(int(request.values.get("p", 1)), 1)
    except ValueError:
        p = 1
    limit = 10
    skip = limit * (p - 1)

    if filetype == "csv":
        doc = scan(
            es,
            index=current_app.config.get("ES_INDEX"),
            doc_type="_doc",
            _source_excludes=["filedata", "attachment"],
            request_timeout=1000,
        )
        fields = [
            "regno",
            "fye",
            "filename",
            "name",
            "income",
            "spending",
            "assets",
        ]

        def generate_csv():
            """Yield the CSV header, then one line per scanned document."""
            buffer = io.StringIO()
            # Ignore keys outside `fields`; an unexpected key in _source
            # would otherwise raise and cut the stream off part-way.
            writer = csv.DictWriter(buffer,
                                    fieldnames=fields,
                                    extrasaction="ignore")
            writer.writeheader()
            yield buffer.getvalue()

            for result in doc:
                # Reuse one buffer: reset it before writing each row.
                buffer.seek(0)
                buffer.truncate(0)
                writer.writerow(result["_source"])
                yield buffer.getvalue()

        return Response(
            generate_csv(),
            mimetype="text/csv",
            headers={
                "Content-Disposition": "attachment; filename=all_accounts.csv",
                "Content-type": "text/csv",
            },
        )

    res = es.search(
        index=current_app.config.get("ES_INDEX"),
        doc_type="_doc",
        _source_excludes=["filedata", "attachment"],
        body={
            "size": limit,
            "from": skip,
            "query": {
                "match_all": {}
            }
        },
    )
    # "total" is either a plain number or a dict carrying a "value" key.
    resultCount = res.get("hits", {}).get("total", 0)
    if isinstance(resultCount, dict):
        resultCount = resultCount.get("value")

    nav = get_nav(
        p,
        limit,
        resultCount,
        "doc.doc_all_docs",
        dict(),
    )
    results = res.get("hits", {}).get("hits", [])
    return render_template(
        "doc_all_docs.html.j2",
        results=results,
        resultCount=resultCount,
        nav=nav,
        downloadUrl=url_for("doc.doc_all_docs", filetype="csv"),
    )
Пример #8
0
def doc_search(filetype="html"):
    """Full-text search over document content, as a page or a CSV download.

    The search string comes from the ``q`` request value and is matched
    against ``attachment.content`` with a ``simple_query_string`` query.

    Args:
        filetype: ``"csv"`` returns every match as a CSV attachment; any
            other value renders ``doc_search.html.j2`` with ten highlighted
            results per page.

    The page number is read from the ``p`` request value; missing,
    non-numeric or non-positive values fall back to page 1. Without a
    query the template is rendered with no results.
    """
    es = get_db()
    q = request.values.get("q")

    try:
        # Clamp to 1 so the search offset below can never be negative.
        p = max(int(request.values.get("p", 1)), 1)
    except ValueError:
        p = 1
    limit = 10
    skip = limit * (p - 1)

    results = None
    resultCount = 0
    nav = {}
    if q:
        params = dict(
            index=current_app.config.get("ES_INDEX"),
            doc_type="_doc",
            _source_excludes=["filedata", "attachment.content"],
            body={
                "query": {
                    "simple_query_string": {
                        "query": q,
                        "fields": ["attachment.content"],
                        "default_operator": "or",
                    }
                }
            },
        )
        if filetype == "csv":
            # scan() takes the request body under the name "query".
            params["query"] = params.pop("body")
            doc = scan(
                es,
                request_timeout=10000,
                **params,
            )
            buffer = io.StringIO()
            fields = [
                "regno",
                "fye",
                "filename",
                "name",
                "income",
                "spending",
                "assets",
                "search term",
            ]
            writer = csv.DictWriter(buffer, fieldnames=fields)
            writer.writeheader()
            for result in doc:
                # Keep only the known columns from the stored document.
                row = {
                    "search term": q,
                    **{
                        key: value
                        for key, value in result["_source"].items()
                        if key in fields
                    },
                }
                writer.writerow(row)
            output = make_response(buffer.getvalue())
            output.headers[
                "Content-Disposition"] = f"attachment; filename=account_search_{slugify(q, separator='_')}.csv"
            output.headers["Content-type"] = "text/csv"
            return output

        params["body"]["highlight"] = {
            "fields": {
                "attachment.content": {
                    "fragment_size": 150,
                    "number_of_fragments": 3,
                    "pre_tags": ['<em class="bg-yellow b highlight">'],
                    "post_tags": ["</em>"],
                }
            },
            "encoder": "html",
        }
        doc = es.search(
            **params,
            from_=skip,
            size=limit,
        )
        # "total" is either a plain number or a dict carrying a "value" key.
        resultCount = doc.get("hits", {}).get("total", 0)
        if isinstance(resultCount, dict):
            resultCount = resultCount.get("value")

        nav = get_nav(
            p,
            limit,
            resultCount,
            "doc.doc_search",
            dict(q=q),
        )
        results = doc.get("hits", {}).get("hits", [])
        for r in results:
            if r.get("highlight", {}).get("attachment.content"):
                # NOTE(review): unescape() reverses the HTML encoding applied
                # by the highlighter; confirm how the template renders these
                # fragments before treating them as safe markup.
                r["highlight"]["attachment.content"] = [
                    Markup(s).unescape()
                    for s in r["highlight"]["attachment.content"]
                ]
    return render_template(
        "doc_search.html.j2",
        results=results,
        q=q,
        resultCount=resultCount,
        nav=nav,
        downloadUrl=url_for("doc.doc_search", q=q, filetype="csv"),
    )