Example #1
def cli(ctx, index, corrections, types, indexname):
    from idb.helpers.logging import idblogger
    logger = idblogger.getChild('indexing')
    from idb import config
    from .indexer import ElasticSearchIndexer
    from idb.corrections.record_corrector import RecordCorrector

    if not types:
        types = config.config["elasticsearch"]["types"]
    if indexname is None:
        indexname = config.config["elasticsearch"]["indexname"]
    serverlist = config.config["elasticsearch"]["servers"]

    if config.ENV == 'beta':
        logger.info("Enabling beta configuration")
        indexname = "2.5.0"
        serverlist = [
            "c17node52.acis.ufl.edu",
            "c17node53.acis.ufl.edu",
            "c17node54.acis.ufl.edu",
            "c17node55.acis.ufl.edu",
            "c17node56.acis.ufl.edu"
        ]

    if not index:
        logger.info("Enabling no-index dry run mode")

    # These are the parameters that are common to every indexing
    # function
    ctx.obj = {
        'ei': lambda: ElasticSearchIndexer(indexname, types, serverlist=serverlist),
        'rc': lambda: RecordCorrector(reload=corrections),
        'no_index': lambda: not index
    }
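The lambdas stored in ctx.obj defer construction of the indexer and corrector until a subcommand actually needs them. Below is a minimal sketch of a consuming subcommand, assuming cli is registered as a click group (its decorators are not shown in this excerpt); the subcommand name and body are illustrative only.

import click

@cli.command("full-index")
@click.pass_context
def full_index(ctx):
    ei = ctx.obj['ei']()    # build the ElasticSearchIndexer on demand
    rc = ctx.obj['rc']()    # build the RecordCorrector on demand
    if ctx.obj['no_index']():
        print("dry run: nothing will be written to elasticsearch")
    # hand ei/rc to the actual indexing routine here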
Example #2
from pyquery import PyQuery as pq

from idb.data_tables.rights_strings import acceptable_licenses_trans
from idb.helpers.logging import idblogger

logger = idblogger.getChild('eml')


def getElement(root, name):
    return root.find(name)


def parseEml(id, emlText):
    "Returns a dictionary of the supplied emlText"

    # dump the full eml/xml for debugging
    #logger.debug(emlText)

    # If the target eml document is not XML, the eml object will not be created due to XMLSyntaxError or other
    # pyquery exception.  This is known to occur when a link to eml results in a 404 error page containing HTML.
    # For example:  http://xbiod.osu.edu/ipt/eml.do?r=osum-fish
    # It is possible we could trap this ahead of time by checking the raw emlText for key xml features
    # or HTML document features.

    eml = pq(emlText, parser='xml')

### The eml().text() function returns an empty string instead of None if the location does not exist in the eml
### (if there is "no text node" according to the release notes at https://pypi.python.org/pypi/pyquery)

    collection = {}
    collection["id"] = id
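As the comment above notes, non-XML input (for example an HTML 404 page) makes pq(emlText, parser='xml') raise before the eml object exists. A hedged sketch of trapping that failure at the call site follows; parse_eml_safely is a hypothetical wrapper, not part of the original module.

from lxml.etree import XMLSyntaxError

def parse_eml_safely(id, emlText):
    try:
        return parseEml(id, emlText)
    except XMLSyntaxError:
        logger.error("eml for %s is not well-formed XML, skipping", id)
        return None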
Example #3
def run_server(info, host, port, reload, debugger, eager_loading, debug, wsgi):
    """Runs a local development server for the Flask application.
    This local server is recommended for development purposes only but it
    can also be used for simple intranet deployments.  By default it will
    not support any sort of concurrency at all to simplify debugging.

    The reloader and debugger are enabled by default if Flask's debug
    flag is enabled, and disabled otherwise.

    This is very similar to flask.cli.run_command, with the main
    addition of the --wsgi flag.

    """
    info.app_import_path = 'idb.data_api.api:app'
    info.debug = debug
    from idb import config

    if reload is None:
        reload = info.debug
    if debugger is None:
        debugger = info.debug
    if eager_loading is None:
        eager_loading = not reload

    if wsgi is None:
        if (debug or debugger or reload or config.ENV in ('dev', )):
            wsgi = 'werkzeug'
        else:
            wsgi = 'gevent'

    if wsgi == 'werkzeug':
        from werkzeug.serving import run_simple
        from flask.cli import DispatchingApp

        app = DispatchingApp(info.load_app, use_eager_loading=eager_loading)

        # Extra startup messages.  This depends a bit on Werkzeug internals to
        # avoid double execution when the reloader kicks in.
        if os.environ.get('WERKZEUG_RUN_MAIN') != 'true':
            # If we have an import path we can print it out now which can help
            # people understand what's being served.  If we do not have an
            # import path because the app was loaded through a callback then
            # we won't print anything.
            if info.app_import_path is not None:
                print("Werkzeug server @ http://{0}:{1}/ ENV={2}".format(
                    host, port, config.ENV),
                      file=sys.stderr)
            if info.debug is not None:
                print(' * Forcing debug %s' % (info.debug and 'on' or 'off'))

        run_simple(host,
                   port,
                   app,
                   use_reloader=reload,
                   use_debugger=debugger,
                   threaded=False,
                   passthrough_errors=True)

    elif wsgi == 'gevent':
        from gevent.pool import Pool
        from gevent.wsgi import WSGIServer
        from idb.helpers.logging import idblogger
        from requestlogger import WSGILogger, ApacheFormatter
        logger = idblogger.getChild('api')

        from werkzeug.contrib.fixers import ProxyFix
        logger.info("gevent server @ http://%s:%s/ ENV=%s", host, port,
                    config.ENV)
        app = info.load_app()
        app = WSGILogger(app, [], ApacheFormatter())
        app.logger = logger.getChild('r')
        app = ProxyFix(app)
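        # NB: the gevent server binds all interfaces on the fixed port 19197,
        # regardless of the host/port values logged above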
        http_server = WSGIServer(('', 19197), app, spawn=Pool(1000), log=None)
        http_server.serve_forever()
    else:
        raise ValueError('Unknown wsgi backend type', wsgi)
Example #4
import elasticsearch.helpers
import unicodecsv as csv

from atomicfile import AtomicFile

# idb imports
from idb.helpers.conversions import index_field_to_longname
from idb.indexing.indexer import get_connection, get_indexname
from idb.helpers.logging import idblogger

# local imports
from .query_shim import queryFromShim
from .meta_xml import make_meta, make_file_block
from .identification import identifiy_locality, identifiy_scientificname

logger = idblogger.getChild('download')

indexName = get_indexname()

# 0: Current Year
# 1: Query Text
# 2: Total Number of Records
# 3: Access Datetime
# 4: Number of recordsets
# 5: List of recordset IDs and counts
citation_format = """http://www.idigbio.org/portal ({0}),
Query: {1},
{2} records, accessed on {3},
contributed by {4} Recordsets, Recordset identifiers:
{5}"""
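For illustration, this is how the six positional fields of the template line up when it is filled in; every value below is made up.

citation = citation_format.format(
    2019,                                    # 0: current year
    '{"scientificname": "puma concolor"}',   # 1: query text
    1234,                                    # 2: total number of records
    "2019-02-17T17:03:38+00:00",             # 3: access datetime
    2,                                       # 4: number of recordsets
    "uuid-aaaa (1000)\nuuid-bbbb (234)")     # 5: recordset IDs and counts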
Example #5
    def __init__(self,
                 fh,
                 encoding="utf8",
                 delimiter=",",
                 fieldenc="\"",
                 header=None,
                 rowtype=None,
                 logname=None):
        super(DelimitedFile, self).__init__()

        self.encoding = encoding
        self.fieldenc = fieldenc
        self.delimiter = delimiter
        self.rowtype = rowtype
        self.lineCount = 0
        self.lineLength = None

        if isinstance(fh, str) or isinstance(fh, unicode):
            self.name = fh
        else:
            self.name = fh.name
        self.filehandle = io.open(fh,
                                  "r",
                                  encoding=encoding,
                                  errors="flag_error")

        if logname is None:
            self.logger = idblogger.getChild('df')
        else:
            self.logger = getLogger(logname)

        encoded_lines = (l.encode("utf-8") for l in self.filehandle)
        if self.fieldenc is None or self.fieldenc == "":
            self._reader = csv.reader(encoded_lines,
                                      encoding="utf-8",
                                      delimiter=self.delimiter,
                                      quoting=csv.QUOTE_NONE)
        else:
            self._reader = csv.reader(encoded_lines,
                                      encoding="utf-8",
                                      delimiter=self.delimiter,
                                      quotechar=self.fieldenc)

        t = defaultdict(int)
        if header is not None:
            self.fields = header
            for k, v in header.items():
                cn = get_canonical_name(v)
                t[cn[1]] += 1
        else:
            headerline = self._reader.next()
            self.lineLength = len(headerline)
            self.fields = {}
            for k, v in enumerate(headerline):
                cn = get_canonical_name(v)
                if cn[0] is not None:
                    t[cn[1]] += 1
                    self.fields[k] = cn[0]

        if self.rowtype is None:
            items = t.items()
            items.sort(key=lambda item: (item[1], item[0]), reverse=True)
            self.rowtype = items[0][0]
            self.logger.info("Setting row type to %s", self.rowtype)
        elif self.rowtype in types:
            self.rowtype = types[self.rowtype]["shortname"]
        else:
            raise TypeError("{} not mapped to short name".format(self.rowtype))
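A hypothetical construction of the class above, assuming a tab-delimited occurrence file with a header row; the file name is made up.

df = DelimitedFile("occurrence.txt", delimiter="\t", fieldenc=None)
df.logger.info("detected row type %s with %s columns", df.rowtype, df.lineLength)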
Example #6
from idigbio_ingestion.lib.eml import parseEml

#### disabling warnings per https://urllib3.readthedocs.org/en/latest/security.html#disabling-warnings
## Would rather have warnings go to log but could not get logging.captureWarnings(True) to work.
## There is no urllib3.enable_warnings method. Is it possible to disable_warnings and then re-enable them later?
####### The disable_warnings method did not prevent warnings from being printed. Commenting out for now...
#import urllib3
#assert urllib3.__version__ >= "1.13"
#urllib3.disable_warnings()
####

# uuid '872733a2-67a3-4c54-aa76-862735a5f334' is the idigbio root entity,
# the parent of all publishers.
IDIGBIO_ROOT_UUID = "872733a2-67a3-4c54-aa76-862735a5f334"

logger = idblogger.getChild('upr')


def struct_to_datetime(s):
    """
    Convert a Struct representation of a time to a datetime
    timestamp.

    Parameters
    ----------
    s : time.struct_time
        Timestamp in struct_time representation, a 9-tuple such as
        (2019, 2, 17, 17, 3, 38, 1, 48, 0)

    Returns
    -------
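The excerpt is cut off above. A conversion along these lines would satisfy the docstring, assuming the struct represents UTC; it is not necessarily the original body.

import calendar
from datetime import datetime

def struct_to_datetime_sketch(s):
    # treat the struct_time as UTC and round-trip through a POSIX timestamp
    return datetime.utcfromtimestamp(calendar.timegm(s))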
Example #7
from collections import namedtuple

from PIL import Image
from boto.exception import BotoServerError, BotoClientError, S3DataError

from idb.helpers import first, gipcpool, ilen, grouper
from idb.helpers.memoize import memoized
from idb import config
from idb.helpers.storage import IDigBioStorage
from idb.postgres_backend import apidbpool, NamedTupleCursor
from idb.helpers.logging import idblogger

WIDTHS = {'thumbnail': 260, 'webview': 600}

POOLSIZE = 50
DTYPES = ('thumbnail', 'fullsize', 'webview')

logger = idblogger.getChild('deriv')

CheckItem = namedtuple('CheckItem', ['etag', 'bucket', 'media', 'keys'])

GenerateResult = namedtuple('GenerateResult', ['etag', 'items'])
GenerateItem = namedtuple('GenerateItem', ['key', 'data'])
CopyItem = namedtuple('CopyItem', ['key', 'data'])


class BadImageError(Exception):
    etag = None
    inner = None

    def __init__(self, message, inner=None):
        self.message = message
        self.inner = inner
Example #8
from __future__ import division, absolute_import, print_function

from datetime import datetime

from idb import config
from idb.postgres_backend import apidbpool
from idb.helpers.logging import idblogger

logger = idblogger.getChild('migrate')


def migrate():
    """Migrate objects from the old media API

    Specifically, moves rows from `idb_object_keys` into the new `media` and `objects` tables.
    """
    t1 = datetime.now()
    logger.info("Checking for objects in the old media api")
    try:
        sql = """INSERT INTO objects (bucket, etag)
              (SELECT DISTINCT
                type,
                etag
              FROM idb_object_keys
              LEFT JOIN objects USING (etag)
              WHERE objects.etag IS NULL
                AND idb_object_keys.user_uuid <> %s);
        """
        rc = apidbpool.execute(sql, (config.IDB_UUID, ))
        logger.info("Objects Migrated: %s", rc)
        sql = """INSERT INTO media (url, type, owner, last_status, last_check)
Example #9
from __future__ import division, absolute_import, print_function

import cStringIO
import math
import os
import time

import boto
import boto.s3.connection
from boto.exception import BotoServerError, BotoClientError, S3DataError

from idb import config
from idb.helpers.logging import idblogger
from idb.postgres_backend.db import MediaObject

logger = idblogger.getChild('storage')

private_buckets = {"debugfile"}


class IDigBioStorage(object):
    """
        Class to abstract out the iDigBio S3 storage.

        Note:
            You must either set access_key and secret_key when
            initializing the object, or (preferred) set the
            IDB_STORAGE_ACCESS_KEY and IDB_STORAGE_SECRET_KEY
            environment variables.
    """
    def __init__(self, host=None, access_key=None, secret_key=None):
Example #10
import requests

from flask import jsonify, current_app
from werkzeug.exceptions import default_exceptions

# Find the stack on which we want to store the database connection.
# Starting with Flask 0.9, the _app_ctx_stack is the correct one,
# before that we need to use the _request_ctx_stack.
try:
    from flask import _app_ctx_stack as stack
except ImportError:
    from flask import _request_ctx_stack as stack

from idb.postgres_backend.db import PostgresDB
from idb.helpers.logging import idblogger


logger = idblogger.getChild('api')
s = requests.Session()


def json_error(status_code, message=None):
    if message is None:
        if status_code in default_exceptions:
            message = default_exceptions[status_code].description
    resp = jsonify({"error": message})
    resp.status_code = status_code
    return resp


def get_idb():
    return PostgresDB(pool=current_app.config['DB'])
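A minimal sketch of wiring json_error up as a catch-all JSON error handler; register_json_errors is a hypothetical helper, not part of the original module.

def register_json_errors(app):
    # emit JSON instead of HTML for every standard HTTP error code
    for code in default_exceptions:
        app.register_error_handler(code, lambda exc, code=code: json_error(code))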
Example #11
from __future__ import division, absolute_import
from __future__ import print_function

import functools
import itertools
import logging
import os
import sys

import click
import idb

from idb.helpers.logging import configure_app_log, idblogger

clilog = idblogger.getChild('cli')


def get_std_options():
    return [
        click.Option(
            ['--verbose', '-v'],
            count=True,
            help="Output more log messages, repeat for increased verbosity"),
        click.Option(
            ['--config'],
            type=click.Path(exists=True, dir_okay=False, resolve_path=True),
            help="JSON config file to load. config value precedence: "
            "default config path < PATH < environment values < command line"),
        click.Option(['--env'],
                     envvar="ENV",
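The option list above is truncated in this excerpt. A sketch of attaching the shared options to a command after it is defined; my_command and its body are assumptions.

@click.command()
def my_command(**kwargs):
    clilog.info("invoked with %r", kwargs)

# append the common --verbose/--config/--env options to the command
my_command.params.extend(get_std_options())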
Example #12
import re

from idb import stats
from idb.postgres_backend import apidbpool, NamedTupleCursor
from idb.postgres_backend.db import PostgresDB, RecordSet
from idb.helpers import ilen
from idb.helpers.etags import calcEtag, calcFileHash
from idb.helpers.logging import idblogger, LoggingContext
from idb.helpers.storage import IDigBioStorage
from idb.helpers import gipcpool

from idigbio_ingestion.lib.dwca import Dwca
from idigbio_ingestion.lib.delimited import DelimitedFile

bad_chars = u"\ufeff"
bad_char_re = re.compile("[%s]" % re.escape(bad_chars))

logger = idblogger.getChild("db-check")

uuid_re = re.compile(
    "([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})"
)

s = IDigBioStorage()


class RecordException(Exception):
    pass


def getrslogger(rsid):
    return logger.getChild(rsid)
Example #13
from __future__ import division, absolute_import
from __future__ import print_function

import logging

from uuid import UUID

from idb.helpers.logging import idblogger, configure

from idb.postgres_backend.db import tombstone_etag
from idb.postgres_backend.db import PostgresDB

logger = idblogger.getChild('ingestion')
configure(logger=logger, stderr_level=logging.INFO)


def check_uuid(uuid):
    """
    Check to see if a string is a valid uuid representation.
    """
    try:
        UUID(uuid)
        return True
    except (ValueError, AttributeError, TypeError):
        logger.error("'%s' does not appear to be a UUID.", uuid)
        return False


def delete_recordset(uuid, db):
    """
    Deletes a recordset and all child records by marking them deleted in the database
Example #14
from __future__ import absolute_import

from functools import wraps
from flask import request, jsonify, current_app
import os

from .encryption import _encrypt
from idb.helpers.logging import idblogger
from idb import config

logger = idblogger.getChild('authn')


def check_auth(username, password):
    """This function is called to check if a username /
    password combination is valid.
    """
    try:
        corrections = "/v2/corrections" in request.url
        annotations = "/v2/annotations" in request.url
        objects = "/v2/media" in request.url
        if corrections:
            sql = (
                "SELECT * FROM idb_api_keys WHERE user_uuid=%s and apikey=%s and corrections_allowed=true",
                (username, _encrypt(password, config.IDB_CRYPT_KEY)))
        elif annotations:
            sql = (
                "SELECT * FROM idb_api_keys WHERE user_uuid=%s and apikey=%s and annotations_allowed=true",
                (username, _encrypt(password, config.IDB_CRYPT_KEY)))
        elif objects:
            sql = (
Example #15
from __future__ import division, absolute_import, print_function

from psycopg2.extensions import cursor

from idb.postgres_backend import apidbpool
from idb.helpers.storage import IDigBioStorage
from idb.helpers.conversions import get_accessuri, get_media_type
from idb.helpers.logging import idblogger
from . import IGNORE_PREFIXES

logger = idblogger.getChild('mediaing')


def check_ignore_media(url):
    for p in IGNORE_PREFIXES:
        if url.startswith(p):
            return True
    return False


def updatedb(prefix=None, since=None):
    """Runs the process of finding new urls

    Records that are imported don't go directly into the media table;
    instead, we periodically run this to look for new URLs in
    mediarecords data.

    """
    media_urls = existing_media_urls(prefix)
    to_insert, to_update = find_new_urls(media_urls, prefix, since)
    write_urls_to_db(to_insert, to_update)
Example #16
from __future__ import absolute_import
import contextlib
import uuid
import sys

import psycopg2.extensions
import psycopg2.pool

import gevent
import gevent.lock
from gevent.queue import Queue
from gevent.socket import wait_read, wait_write

from idb.helpers.logging import idblogger

logger = idblogger.getChild('gevent_helpers')


def gevent_wait_callback(conn, timeout=None):
    """A wait callback useful to allow gevent to work with Psycopg."""
    while 1:
        state = conn.poll()
        if state == psycopg2.extensions.POLL_OK:
            break
        elif state == psycopg2.extensions.POLL_READ:
            wait_read(conn.fileno(), timeout=timeout)
        elif state == psycopg2.extensions.POLL_WRITE:
            wait_write(conn.fileno(), timeout=timeout)
        else:
            raise psycopg2.OperationalError("Bad result from poll: %r" % state)
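A callback like this only takes effect once it is registered with psycopg2, typically as a single module-level call:

# make every psycopg2 connection yield to the gevent hub instead of blocking
psycopg2.extensions.set_wait_callback(gevent_wait_callback)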
Example #17
from __future__ import division, absolute_import
from __future__ import print_function

from datetime import datetime
from contextlib import contextmanager

from signal import (  # noqa
    SIG_DFL, SIG_IGN, SIGABRT, SIGHUP,
    SIGINT, SIGQUIT, SIGUSR1, SIGUSR2, SIGTERM)
from signal import signal as _signal


from idb.helpers.logging import idblogger

logger = idblogger.getChild('sig')


@contextmanager
def ignored(signalnum):
    """Ignore the specified signal for the duration of this contextmanager

    The original signal handler will be restored at the end.
    """
    with signalcm(signalnum, SIG_IGN, call_original=False):
        yield


@contextmanager
def signalcm(signalnum, handler, call_original=True):
    """Install a new signal handler
Example #18
import os
from collections import Counter

import logging
logging.basicConfig()
from idb.helpers.logging import idblogger, configure
logger = idblogger.getChild("taxon_index")
configure(logger=logger)

from idigbio_ingestion.lib.dwca import Dwca

import elasticsearch.helpers
from elasticsearch import Elasticsearch

index = "taxonnames-20170619"

es = Elasticsearch([
    "c18node2.acis.ufl.edu",
    "c18node6.acis.ufl.edu",
    "c18node10.acis.ufl.edu",
    "c18node12.acis.ufl.edu",
    "c18node14.acis.ufl.edu"
], sniff_on_start=False, sniff_on_connection_fail=False, retry_on_timeout=True, max_retries=10, timeout=10)

def bulk_formater(tups):
    for t, i in tups:
        meta = {
            "_index": index,
            "_type": t,
            "_id": "gbif_" + i["dwc:taxonID"],
            "_source": i,
Example #19
from __future__ import division, absolute_import, print_function
import os.path
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email.MIMEText import MIMEText
from email.Utils import COMMASPACE, formatdate
from email import Encoders

from idb.helpers.logging import idblogger

logger = idblogger.getChild("mailer")


def send_mail(send_from, send_to, subject, text, files=None):
    smtp = smtplib.SMTP("smtp.ufl.edu")
    files = files if files is not None else []
    assert isinstance(send_to, list)
    assert isinstance(files, list)

    msg = MIMEMultipart()
    msg['From'] = send_from
    msg['To'] = COMMASPACE.join(send_to)
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = subject

    msg.attach(MIMEText(text, 'plain'))

    for f in files:
        part = MIMEBase('application', "octet-stream")
        part.set_payload(open(f, "rb").read())
        Encoders.encode_base64(part)
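A hypothetical call of send_mail; the addresses and attachment path below are made up.

send_mail("idigbio-noreply@example.org", ["curator@example.org"],
          "Download ready", "Your query download has finished.",
          files=["/tmp/download.zip"])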
Example #20
from __future__ import division, absolute_import, print_function

from pytz import timezone

import elasticsearch
import elasticsearch.helpers

from idb import config
from idb.helpers.logging import idblogger
from idb.helpers.conversions import fields, custom_mappings

local_tz = timezone('US/Eastern')
logger = idblogger.getChild('indexing')

# Try using smaller batches.
INDEX_CHUNK_SIZE = 1000


def get_connection(**kwargs):
    """
    Build a connection to Elasticsearch from the JSON config, overriding defaults with any supplied kwargs.

    Returns
    -------
    elasticsearch.Elasticsearch
        An elasticsearch connection object
    """
    kwargs.setdefault('hosts', config.config["elasticsearch"]["servers"])
    kwargs.setdefault('retry_on_timeout', True)  # this isn't valid until >=1.3
    kwargs.setdefault('sniff_on_start', False)
    kwargs.setdefault('sniff_on_connection_fail', False)
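Illustrative calls: with no arguments the config-derived defaults apply, and any keyword overrides them, for example pointing at a single local node for testing (the host below is made up).

es = get_connection()                       # servers taken from the json config
es_local = get_connection(hosts=["localhost:9200"], timeout=30)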
Example #21
    def __init__(self,filedict,fh,logname=None):
        """
            Construct a DwcaRecordFile from an XML tree pointer to the <location> tag containing the data file name,
            and a file handle or string pointing to the data file.
        """

        # Avoid Setting attributes on self that conflict with attributes in DelimitedFile to enforce namespace separation
        if isinstance(filedict["files"]["location"], list):
            for l in filedict["files"]["location"]:
                if fh.endswith(l):
                    self.name = l
                    break
            else:
                raise Exception("Name not found.")
        else:
            self.name = filedict['files']['location']

        if logname:
            logbase = getLogger(logname)
        else:
            logbase = idblogger.getChild('dwca')
        self.logger = logbase.getChild(self.name.split(".")[0])

        fields = {}
        self.linebuf = deque()

        idtag = "id"
        idfld = None
        if 'id' in filedict:
            self.filetype = "core"
            idfld = filedict["id"]
        elif "coreid" in filedict:
            idtag = "coreid"
            idfld = filedict["coreid"]
            self.filetype = "extension"
        else:
            self.filetype = "core"

        if idfld is not None:
            fields[int(idfld['#index'])] = idtag

        rowtype = filedict["#rowType"]
        encoding = filedict.get("#encoding", "UTF-8")
        linesplit = filedict["#linesTerminatedBy"].decode('string_escape')
        fieldsplit = filedict["#fieldsTerminatedBy"].decode('string_escape')
        fieldenc = filedict["#fieldsEnclosedBy"].decode('string_escape')
        ignoreheader = int(filedict.get("#ignoreHeaderLines","0"))

        self.defaults = {}
        if "field" not in filedict:
            filedict["field"] = []
        elif not isinstance(filedict['field'],list):
            filedict['field'] = [filedict['field']]
        for fld in filedict['field']:
            # drop any extra quote characters
            term = fld['#term'].replace("\"","")

            # map full term namespaces into short prefix form (xxx:fieldName), longest namespaces first
            for ns in sorted(namespaces.keys(),key=lambda x: len(x), reverse=True):
                if term.startswith(ns):
                    term = term.replace(ns,namespaces[ns]+":")
                    break
            if '#index' in fld:
                if int(fld['#index']) not in fields:
                    fields[int(fld['#index'])] = term
                else:
                    self.logger.error("Duplicate field index ignored {0}".format(str(fld)))
            if '#default' in fld:
                self.defaults[term] = fld['#default']
        # print self.defaults

        super(DwcaRecordFile,self).__init__(
            fh,encoding=encoding,delimiter=fieldsplit,fieldenc=fieldenc,header=fields,rowtype=rowtype,
            logname=self.logger.name)

        while ignoreheader > 0:
            self._reader.next()
            ignoreheader -= 1
Example #22
import uuid
import json

import gevent
import udatetime
from udatetime.rfc3339 import from_rfc3339_string
import pytest
from flask import url_for


from idb.data_api import v2_download
from idb.data_api.v2_download import DOWNLOADER_TASK_PREFIX
from idb.helpers.logging import idblogger


logger = idblogger.getChild('test.api.downloads')

@pytest.fixture()
def fakeredcli(request):
    import fakeredis
    fsr = fakeredis.FakeStrictRedis()
    v2_download.get_redis_conn = lambda: fsr
    request.addfinalizer(fsr.flushall)
    return fsr


class FakeAsyncResult(object):
    id = None
    status = "PENDING"
    result = None
Example #23
    def __init__(self,name="dwca.zip",skipeml=False,logname=None):
        self.path = name.split(".")[0]
        if self.path == name:
            self.path += "_extracted"

        if logname:
            logbase = getLogger(logname)
        else:
            logbase = idblogger.getChild('dwca')
        self.logger = logbase.getChild(name.split("/")[-1].split(".")[0])

        try:
            self.archive = zipfile.ZipFile(name, 'r')
            self.archive.extractall(self.path)
        except zipfile.BadZipfile:
            self.logger.fatal("Couldn't extract '%s'", name)
            raise

        root = None
        meta_filename = self.path + "/" + archiveFile(self.archive,"meta.xml")
        try:
            schema_parser = etree.XMLParser(no_network=False)
            # unclear what is going on here; see https://redmine.idigbio.org/issues/3042
            schema = etree.XMLSchema(etree.parse(DWC_SCHEMA_URL, parser=schema_parser))
            parser = etree.XMLParser(schema=schema, no_network=False)

            with open(meta_filename,'r') as meta:
                try:
                    root = etree.parse(meta, parser=parser).getroot()
                except:
                    self.logger.info("Schema validation failed against '%s', continuing unvalidated.", DWC_SCHEMA_URL)
                    self.logger.debug(traceback.format_exc())
                    meta.seek(0)
                    # print meta.read()
                    # meta.seek(0)
                    root = etree.parse(meta).getroot()
        except:
            self.logger.info("Failed to fetch schema '%s', continuing unvalidated.", DWC_SCHEMA_URL)
            self.logger.debug(traceback.format_exc())
            with open(meta_filename,'r') as meta:
                root = etree.parse(meta).getroot()
        rdict = xml2d(root)

        self.archdict = rdict["archive"]

        if not skipeml and "#metadata" in self.archdict:
            metadata = archiveFile(self.archive,self.archdict["#metadata"])
            with open(self.path + "/" + metadata,'r') as mf:
                mdtree = etree.parse(mf).getroot()
                self.metadata = xml2d(mdtree)
        else:
            self.metadata = None

        corefile = archiveFile(self.archive,self.archdict["core"]["files"]["location"])
        self.core = DwcaRecordFile(self.archdict["core"],
                                   self.path + "/" + corefile,
                                   logname=self.logger.name)

        self.extensions = []
        if "extension" in self.archdict:
            if isinstance(self.archdict["extension"],list):
                for x in self.archdict["extension"]:
                    if isinstance(x["files"]["location"], list):
                        for loc in x["files"]["location"]:
                            extfile = archiveFile(self.archive,loc)
                            print(extfile)
                            try:
                                self.extensions.append(
                                    DwcaRecordFile(x,
                                                   self.path + "/" + extfile,
                                                   logname=self.logger.name))
                            except:
                                traceback.print_exc()
                    else:
                        extfile = archiveFile(self.archive,x["files"]["location"])
                        try:
                            self.extensions.append(
                                DwcaRecordFile(x,
                                               self.path + "/" + extfile,
                                               logname=self.logger.name))
                        except:
                            pass
            else:
                extfile = archiveFile(self.archive,self.archdict["extension"]["files"]["location"])
                self.extensions.append(
                    DwcaRecordFile(self.archdict["extension"],
                                   self.path + "/" + extfile,
                                   logname=self.logger.name))
Example #24
result object back, just the process's return code.

"""

from __future__ import division, absolute_import, print_function

import functools
import multiprocessing

import gevent.pool
import gipc
import greenlet

from idb.helpers.logging import idblogger

logger = idblogger.getChild('gipc')


def spawn(target, *args, **kwargs):
    daemon = kwargs.pop('daemon', False)
    try:
        p = gipc.start_process(target, args, kwargs, daemon=daemon)
        p.join()
        return p.exitcode
    except (KeyboardInterrupt, greenlet.GreenletExit) as e:
        logger.debug("Killing child proc %s on %r", p, e)
        p.terminate()
        p.join()
        raise
    except:
        logger.exception("Failed on child %s", p)
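Hypothetical usage of spawn: run a worker function in a separate process and inspect its exit code; reindex_worker is an assumed name, not part of the original module.

exitcode = spawn(reindex_worker, "recordset-uuid", daemon=False)
if exitcode != 0:
    logger.error("child worker exited with code %s", exitcode)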
Example #25
from __future__ import division, absolute_import, print_function

from flask import Flask, jsonify, request, abort, url_for
from flask_uuid import FlaskUUID

from idb import __version__
from idb.helpers.logging import idblogger
from idb.helpers.cors import crossdomain
from idb.postgres_backend import apidbpool
from idb.data_api.common import idbmodel

logger = idblogger.getChild("api")

app = Flask(__name__)
FlaskUUID(app)

app.config.from_object('idb.data_api.config')

app.url_map.strict_slashes = False

app.config["DB"] = apidbpool
idbmodel.init_app(app)

from .v1 import this_version as v1
from .v2 import this_version as v2
from .v2_download import this_version as v2_download
from .v2_media import this_version as v2_media

app.register_blueprint(v1, url_prefix="/v1")
app.register_blueprint(v2, url_prefix="/v2")
app.register_blueprint(v2_download, url_prefix="/v2")