示例#1
0
文件: notify.py 项目: 01-/aleph
def notify_role(role, subject, html):
    """Send an HTML e-mail to the address stored on ``role``.

    If the role has no e-mail address an error is logged and nothing is
    sent. The sender is built from the ``APP_TITLE`` and ``MAIL_FROM``
    settings, and the subject is prefixed with ``[APP_TITLE]``.
    """
    if role.email is None:
        log.error("Role does not have E-Mail: %r", role)
        return

    app_title = get_config('APP_TITLE')
    mail_from = get_config('MAIL_FROM')
    sender = '%s <%s>' % (app_title, mail_from)
    prefixed_subject = '[%s] %s' % (get_config('APP_TITLE'), subject)

    message = Message(subject=prefixed_subject, sender=sender,
                      recipients=[role.email])
    message.html = html
    mail.send(message)
示例#2
0
def graph_metadata():
    """Describe the graph for clients.

    Returns ``{'active': False}`` when no graph is available. Otherwise
    returns the node labels (minus 'Collection' and ``BASE_NODE``), the
    relationship types (minus 'PART_OF') and the configured icons and
    colors.
    """
    graph = get_graph()
    if graph is None:
        return {'active': False}

    hidden_labels = ['Collection', BASE_NODE]
    labels = []
    for label in graph.node_labels:
        if label not in hidden_labels:
            labels.append(label)

    types = []
    for rel_type in graph.relationship_types:
        if rel_type != 'PART_OF':
            types.append(rel_type)

    info = {'active': True, 'labels': labels, 'types': types}
    info['icons'] = get_config('GRAPH_ICONS')
    info['colors'] = get_config('GRAPH_COLORS')
    return info
示例#3
0
 def session(self):
     """Return a logged-in ``requests`` session, creating it on first use.

     The session is stored on ``self._session`` and reused afterwards.
     Logging in fetches the login page, then posts the configured
     credentials together with the CSRF token taken from the cookies.
     """
     if hasattr(self, '_session'):
         return self._session

     username = get_config('ID_USERNAME')
     password = get_config('ID_PASSWORD')
     http = requests.Session()
     login_page = http.get(urljoin(self.host, '/accounts/login/'))
     form = {'csrfmiddlewaretoken': http.cookies['csrftoken']}
     form['username'] = username
     form['password'] = password
     referer = {'Referer': login_page.url}
     http.post(login_page.url, data=form, headers=referer)
     self._session = http
     return self._session
示例#4
0
def metadata():
    """Return application metadata as a JSON response.

    The payload contains app title/URL/sample searches, facet fields,
    collection categories, country and language names, and a summary of
    every schema in the resolver store keyed by its '#'-terminated id.
    """
    enable_cache(server_side=False)

    schemata = {}
    for raw_id, schema in resolver.store.items():
        # Schema ids are normalised to always end in '#'.
        schema_id = raw_id if raw_id.endswith('#') else raw_id + '#'
        summary = {'id': schema_id}
        summary['title'] = schema.get('title')
        summary['faIcon'] = schema.get('faIcon')
        # The plural falls back to the title when not given.
        summary['plural'] = schema.get('plural', schema.get('title'))
        summary['description'] = schema.get('description')
        summary['inline'] = schema.get('inline', False)
        schemata[schema_id] = summary

    app = {
        'title': get_app_title(),
        'url': get_app_url(),
        'samples': get_config('SAMPLE_SEARCHES')
    }
    payload = {
        'status': 'ok',
        'app': app,
        'fields': Metadata.facets(),
        'categories': Collection.CATEGORIES,
        'countries': COUNTRY_NAMES,
        'languages': LANGUAGE_NAMES,
        'schemata': schemata
    }
    return jsonify(payload)
示例#5
0
def extract_image_data(data, languages=None):
    """Extract text from a binary string of image data using OCR.

    The result is looked up in, and written back to, the OCR cache using
    the data and the resolved language list.

    Returns the extracted text, or None when the image cannot be opened
    (too large or unknown format).
    Raises IngestorException when TESSDATA_PREFIX is not configured.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")
    languages = get_languages_iso3(languages)
    text = Cache.get_ocr(data, languages)
    if text is not None:
        return text
    try:
        img = Image.open(StringIO(data))
    except DecompressionBombWarning as dce:
        # A bare "%" is not a valid format directive; use %r so the
        # message can actually be rendered by the logging module.
        log.debug("Image too large: %r", dce)
        return None
    except IOError as ioe:
        log.info("Unknown image format: %r", ioe)
        return None
    # TODO: play with contrast and sharpening the images.
    extractor = Tesseract(tessdata_prefix, lang=languages)
    try:
        extractor.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
        text = extractor.ocr_image(img)
    finally:
        # Clear the extractor even when OCR itself raises.
        extractor.clear()
    log.debug('OCR done: %s, %s characters extracted',
              languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text
示例#6
0
def get_languages():
    """Return a code -> label dict of the languages enabled in config.

    Codes from the ``LANGUAGES`` setting are lower-cased and stripped
    before being matched against ``LANGUAGE_NAMES``.
    """
    enabled = [entry.lower().strip() for entry in get_config("LANGUAGES")]
    return dict((code, label)
                for code, label in LANGUAGE_NAMES.items()
                if code in enabled)
示例#7
0
def _extract_image_page(pdf_file, page, languages=None):
    # Workaround for formats and compression schemes that pdfminer does
    # not support: run the configured pdftoppm binary on the given page
    # and pass whatever it prints to the OCR routine.
    command = [get_config('PDFTOPPM_BIN'), pdf_file, '-singlefile', '-gray']
    command.extend(['-f', str(page)])
    rendered = subprocess.check_output(command)
    return extract_image_data(rendered, languages=languages)
示例#8
0
文件: html.py 项目: adamchainz/aleph
 def generate_pdf_version(self, html_path):
     """Render an HTML file to a temporary PDF with wkhtmltopdf.

     Returns the path of the temporary '.pdf' file; the exit status of
     the converter is not checked.
     """
     handle, pdf_path = mkstemp(suffix='.pdf')
     os.close(handle)
     command = [get_config('WKHTMLTOPDF_BIN')]
     command += ['--disable-javascript', '--no-outline',
                 '--no-images', '--quiet']
     command += [html_path, pdf_path]
     subprocess.call(command)
     return pdf_path
示例#9
0
 def generate_pdf_alternative(self, meta, local_path):
     """Convert a DjVu book to PDF with the configured ddjvu binary.

     Returns the path of the temporary output file; the exit status of
     the converter is not checked.
     """
     pdf_path = make_tempfile(meta.file_name, suffix='pdf')
     command = [get_config('DDJVU_BIN'), '-format=pdf', '-quality=85',
                '-skip']
     command.append(local_path)
     command.append(pdf_path)
     log.debug('Converting DJVU book: %r', ' '.join(command))
     subprocess.call(command, stderr=subprocess.STDOUT)
     return pdf_path
示例#10
0
文件: html.py 项目: nivertech/aleph
 def handle_html(self, meta, html_path):
     """Convert an HTML file to PDF via wkhtmltopdf and process the PDF.

     The PDF is written to a temporary file, handed to
     ``self.extract_pdf_alternative`` and removed again afterwards,
     whether or not the conversion succeeded.

     Raises IngestorException when wkhtmltopdf produced no output file.
     """
     out_path = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
         args = [wkhtmltopdf, '--disable-javascript', '--no-outline',
                 '--no-images', '--quiet', html_path, out_path]
         subprocess.call(args)
         if not os.path.isfile(out_path):
             # Interpolate the metadata into the message; passing it as a
             # second positional argument leaves the "%r" unformatted.
             raise IngestorException(
                 "Could not convert document: %r" % (meta,))
         self.extract_pdf_alternative(meta, out_path)
     finally:
         remove_tempfile(out_path)
示例#11
0
def angular_templates():
    """Collect template files from the static folder and custom dirs.

    Walks the 'templates' and 'help' sub-directories of each base
    directory and returns ``(relative_path, text)`` items, where the
    path is relative to the base directory and the text is the file
    content decoded as UTF-8. Later directories override earlier ones
    for the same relative path.
    """
    search_roots = [current_app.static_folder]
    search_roots.extend(get_config('CUSTOM_TEMPLATES_DIR'))

    collected = {}
    for base_dir in search_roots:
        # Strip the base directory plus its trailing separator from keys.
        prefix_len = len(base_dir) + 1
        for section in ['templates', 'help']:
            section_dir = os.path.join(base_dir, section)
            for (root, _dirs, names) in os.walk(section_dir):
                for name in names:
                    full_path = os.path.join(root, name)
                    with open(full_path, 'rb') as fh:
                        relative = full_path[prefix_len:]
                        collected[relative] = fh.read().decode('utf-8')
    return collected.items()
示例#12
0
文件: __init__.py 项目: 01-/aleph
def check_role_alerts(role):
    """Run every alert of ``role`` and e-mail it about new results.

    For each alert the documents query is restricted to items newer than
    ``alert.notified_at``. Alerts with hits are updated and a rendered
    'alert.html' message is sent; failures while rendering or sending
    are logged and do not stop the remaining alerts. The database
    session is committed once at the end.
    """
    alerts = Alert.by_role(role).all()
    if len(alerts) == 0:
        return

    log.info('Alerting %r, %d alerts...', role, len(alerts))
    for alert in alerts:
        query = documents_query(alert.query, newer_than=alert.notified_at)
        results = execute_documents_alert_query(alert.query, query)
        total = results['total']
        if total == 0:
            continue

        log.info('Found: %d new results for: %r', results['total'],
                 alert.query)
        alert.update()
        try:
            subject = '%s (%s new results)' % (alert.label, results['total'])
            context = dict(alert=alert, results=results, role=role,
                           qs=make_document_query(alert),
                           app_title=get_config('APP_TITLE'),
                           app_url=get_config('APP_BASEURL'))
            html = render_template('alert.html', **context)
            notify_role(role, subject, html)
        except Exception as ex:
            log.exception(ex)
    db.session.commit()
示例#13
0
 def handle_html(self, meta, html_path):
     """Convert an HTML file to PDF via wkhtmltopdf and process the PDF.

     The PDF is written to a temporary file, handed to
     ``self.extract_pdf_alternative`` and removed again afterwards,
     whether or not the conversion succeeded.

     Raises IngestorException when wkhtmltopdf produced no output file.
     """
     out_path = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
         args = [
             wkhtmltopdf, '--disable-javascript', '--no-outline',
             '--no-images', '--quiet', html_path, out_path
         ]
         subprocess.call(args)
         if not os.path.isfile(out_path):
             # Interpolate the metadata into the message; passing it as a
             # second positional argument leaves the "%r" unformatted.
             raise IngestorException(
                 "Could not convert document: %r" % (meta,))
         self.extract_pdf_alternative(meta, out_path)
     finally:
         remove_tempfile(out_path)
示例#14
0
文件: cache.py 项目: 01-/aleph
def enable_cache(vary_user=False, vary=None, server_side=True):
    """Set up HTTP caching state on the current request.

    An ETag is computed from the request's query arguments, the ``vary``
    value and, when ``vary_user`` is true, the request's auth roles. The
    ETag, the ``CACHE`` setting and ``server_side`` are stored on the
    request as ``_http_etag``, ``_http_cache`` and ``_http_server``.

    Raises NotModified when the request's If-None-Match header matches
    the computed ETag.
    """
    args = sorted(set(request.args.items()))
    # Drop the '_' query parameter (presumably jQuery's cache-buster) so
    # it does not influence the ETag. A comprehension is used instead of
    # a tuple-unpacking lambda, which is not valid syntax on Python 3.
    args = [(k, v) for (k, v) in args if k != '_']

    cache_parts = [args, vary]

    if vary_user:
        cache_parts.extend(request.auth_roles)

    request._http_cache = get_config('CACHE')
    request._http_etag = cache_hash(*cache_parts)
    request._http_server = server_side

    if request.if_none_match == request._http_etag:
        raise NotModified()
示例#15
0
 def ingest(self, meta, local_path):
     """Convert an image to a temporary PDF, then store and extract it.

     The document title is set to the file name. The temporary PDF is
     deleted afterwards, whether or not processing succeeded.

     Raises IngestorException when the converter produced no PDF file.
     """
     # Create the temp file before entering the try block so that
     # ``pdf_path`` is always bound when the cleanup code runs.
     fh, pdf_path = mkstemp(suffix='.pdf')
     try:
         os.close(fh)
         meta.title = meta.file_name
         convert = get_config('CONVERT_BIN')
         args = [convert, local_path, '-density', '300', '-define',
                 'pdf:fit-page=A4', pdf_path]
         subprocess.call(args)
         if not os.path.isfile(pdf_path):
             raise IngestorException("Could not convert image: %r" % meta)
         self.store_pdf(meta, pdf_path)
         self.extract_pdf(meta, pdf_path)
     finally:
         if os.path.isfile(pdf_path):
             os.unlink(pdf_path)
示例#16
0
文件: image.py 项目: nivertech/aleph
 def ingest(self, meta, local_path):
     """Convert an image to a temporary PDF, then store and extract it.

     Nothing is converted when ``self.check_image_size`` rejects the
     image. The temporary file is removed in every case.

     Raises ImageIngestorException when no PDF file was produced.
     """
     pdf_path = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         meta.title = meta.file_name
         if self.check_image_size(meta, local_path):
             command = [get_config('CONVERT_BIN'), local_path]
             command += ['-density', '300']
             command += ['-define', 'pdf:fit-page=A4']
             command.append(pdf_path)
             subprocess.call(command)
             if not os.path.isfile(pdf_path):
                 raise ImageIngestorException(
                     "Could not convert image: %r" % meta)
             self.store_pdf(meta, pdf_path)
             self.extract_pdf(meta, pdf_path)
     finally:
         remove_tempfile(pdf_path)
示例#17
0
 def generate_pdf_alternative(self, meta, local_path):
     """Convert a document to PDF with the configured soffice binary.

     Output goes to a fresh temporary directory. Returns the path of the
     first entry found there, or None when the directory is empty. The
     temporary user-installation directory is always removed; the output
     directory is left in place.
     """
     work_dir = mkdtemp()
     instance_dir = mkdtemp()
     try:
         # NOTE(review): the double quotes are part of the argument that
         # is passed to the binary -- confirm that this is intended.
         instance_path = '"-env:UserInstallation=file://%s"' % instance_dir
         command = [get_config('SOFFICE_BIN'), '--convert-to', 'pdf',
                    '--nofirststartwizard', instance_path]
         command += ['--norestore', '--nologo', '--nodefault',
                     '--nolockcheck', '--invisible']
         command += ['--outdir', work_dir, '--headless', local_path]
         log.debug('Converting document: %r', ' '.join(command))
         subprocess.call(command)
         produced = os.listdir(work_dir)
         if produced:
             return os.path.join(work_dir, produced[0])
     finally:
         shutil.rmtree(instance_dir)
示例#18
0
def extract_image_data(data, languages=None):
    """Run OCR over a binary string of image data and return the text.

    A cached result for the same data and languages is returned when
    available; otherwise the fresh result is written to the cache.
    Raises IngestorException when TESSDATA_PREFIX is not configured.
    """
    prefix = get_config('TESSDATA_PREFIX')
    if prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")

    languages = get_languages_iso3(languages)
    cached = Cache.get_ocr(data, languages)
    if cached is not None:
        return cached

    image = Image.open(StringIO(data))
    # TODO: play with contrast and sharpening the images.
    ocr = Tesseract(prefix, lang=languages)
    ocr.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    extracted = ocr.ocr_image(image)
    log.debug('OCR done: %s, %s characters extracted',
              languages, len(extracted))
    Cache.set_ocr(data, languages, extracted)
    return extracted
示例#19
0
def _convert_page(interpreter, page, device, page_no, path, languages):
    """Return the stripped text of one PDF page, using OCR if needed.

    Text is gathered from the text boxes and lines of the page layout.
    OCR is requested when an image covers more than 70% of the page or
    when processing the page raises; it only runs if the
    ``OCR_PDF_PAGES`` setting is truthy.
    """
    # If this returns None or an empty string, it'll trigger OCR.
    text_content = []
    ocr_required = False
    try:
        interpreter.process_page(page)
        layout = device.get_result()

        for text_obj in _find_objects(layout._objs, (LTTextBox, LTTextLine)):
            text = text_obj.get_text()
            if text is None:
                continue
            text = text.strip()
            if len(text):
                text_content.append(text)

        # Generous try/catch because pdfminers image support is
        # horrible.
        page_area = float(layout.width * layout.height)
        for image_obj in _find_objects(layout._objs, LTImage):
            image_area = float(image_obj.width * image_obj.height)
            page_portion = image_area / page_area
            # Go for OCR if an image makes up more than 70% of the page.
            if page_portion > 0.7:
                ocr_required = True

    except Exception as ex:
        log.exception(ex)
        ocr_required = True

    if ocr_required and get_config("OCR_PDF_PAGES"):
        log.info("Using OCR for %r, p.%s", path, page_no)
        ocr_text = _extract_image_page(path, page_no, languages)
        # The OCR helper may yield None (e.g. when the rendered image
        # cannot be opened); joining None below would raise a TypeError.
        if ocr_text is not None:
            text_content.append(ocr_text)

    text = "\n".join(text_content)
    log.debug("Extracted %d characters of text from %r, p.%s", len(text), path, page_no)
    return text.strip()
示例#20
0
 def iter_table(self, local_path, table_name):
     """Yield the rows of one table as dicts.

     Runs the configured ``MDB_EXPORT_BIN`` binary on the file and table
     and parses its standard output with ``DictReader``.
     """
     mdb_export = get_config('MDB_EXPORT_BIN')
     args = [mdb_export, '-b', 'strip', local_path, table_name]
     proc = subprocess.Popen(args, stdout=subprocess.PIPE)
     try:
         for row in DictReader(proc.stdout):
             yield row
     finally:
         # Close the pipe and reap the child, also when the consumer
         # stops iterating early, so no descriptor or zombie is left.
         proc.stdout.close()
         proc.wait()
示例#21
0
def get_language_whitelist():
    """Return the configured ``LANGUAGES`` codes, lower-cased and stripped."""
    whitelist = []
    for code in get_config('LANGUAGES'):
        whitelist.append(code.lower().strip())
    return whitelist
示例#22
0
 def unpack(self, meta, local_path, temp_dir):
     """Extract an archive with the configured ``SEVENZ_BIN`` binary.

     The command line is printed before it is run; the exit status of
     the extractor is not checked.
     """
     # NOTE(review): the '-oc:' prefix looks like a leftover drive
     # letter in the output switch -- confirm the intended target dir.
     args = [get_config('SEVENZ_BIN'), 'x', local_path, '-y', '-r',
             '-bb0', '-bd', '-oc:%s' % temp_dir]
     # Parenthesised form prints the same text on Python 2 and is also
     # valid syntax on Python 3, unlike the bare print statement.
     print(' '.join(args))
     subprocess.call(args, stderr=subprocess.STDOUT)
示例#23
0
 def get_tables(self, local_path):
     """List table names reported by the ``MDB_TABLES_BIN`` binary.

     The command output is split on single spaces; each piece is
     stripped and empty pieces are dropped.
     """
     listing = subprocess.check_output([get_config('MDB_TABLES_BIN'),
                                        local_path])
     tables = []
     for piece in listing.split(' '):
         name = piece.strip()
         if len(name):
             tables.append(name)
     return tables
示例#24
0
 def host(self):
     """Return the ``ID_HOST`` setting, with a built-in default URL."""
     default_host = 'https://investigativedashboard.org/'
     return get_config('ID_HOST', default_host)
示例#25
0
 def get_languages(self, meta):
     """Return the de-duplicated languages of ``meta`` plus the default.

     The configured ``DEFAULT_LANGUAGE`` is always included. The order
     of the returned list is not defined, as it comes from a set.
     """
     fallback = get_config('DEFAULT_LANGUAGE')
     candidates = meta.languages + [fallback]
     unique = set(candidates)
     return list(unique)