Example No. 1
class Json2Redshift(object):
    @use_settings
    def __init__(
            self,
            host,
            user,
            password,
            table,
            meta,  # REDSHIFT COPY COMMAND REQUIRES A BUCKET TO HOLD PARAMETERS
            database=None,
            port=5439,
            settings=None):
        self.settings = settings
        self.db = Redshift(settings)
        INDEX_CACHE[settings.table] = wrap(
            {"name": settings.table})  # HACK TO GET parse_columns TO WORK
        columns = parse_columns(settings.table,
                                settings.mapping.test_result.properties)
        nested = [c.name for c in columns if c.type == "nested"]
        self.columns = wrap([
            c for c in columns if c.type not in ["object"] and not any(
                c.name.startswith(n + ".") for n in nested)
        ])

        try:
            self.db.execute(
                """
                CREATE TABLE {{table_name}} (
                    "_id" character varying UNIQUE,
                    {{columns}}
                )""",
                {
                    "table_name": self.db.quote_column(settings.table),
                    "columns": SQL(",\n".join(
                        self.db.quote_column(c.name) + " " + self.db.es_type2pg_type(c.type)
                        for c in self.columns
                    )),
                },
                retry=False,
            )
        except Exception as e:
            if "already exists" in str(e):
                Log.alert("Table {{table}} exists in Redshift", table=settings.table)
            else:
                Log.error("Could not make table", e)

        # MAKE jsonpaths FOR COPY COMMAND
        jsonpaths = {
            "jsonpaths": [
                "$" + "".join("[" + convert.string2quote(p) + "]"
                              for p in split_field(c.name))
                for c in self.columns
            ]
        }
        content = convert.value2json(jsonpaths)
        content = content.replace("\\\"", "'")
        # PUSH TO S3
        s3.Bucket(meta).write(meta.jsonspath, content)
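
The constructor only creates the table and uploads the jsonpaths manifest; the load itself is left to Redshift's COPY command. A minimal sketch of such a COPY, assuming a plain psycopg2 connection; host, credentials, table, bucket, and IAM role below are placeholders, not values from the project:

# Not from the project: a rough COPY that would consume the jsonpaths file above.
# Host, credentials, table, bucket, and IAM role are placeholders.
import psycopg2

conn = psycopg2.connect(host="example.redshift.amazonaws.com", port=5439,
                        dbname="dev", user="admin", password="...")
with conn, conn.cursor() as cur:
    cur.execute("""
        COPY test_result
        FROM 's3://my-bucket/data.json'
        IAM_ROLE 'arn:aws:iam::123456789012:role/MyRedshiftRole'
        JSON 's3://my-bucket/jsonpaths.json'
    """)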
Example No. 2
    def __init__(
            self,
            bucket,  # NAME OF THE BUCKET
            aws_access_key_id=None,  # CREDENTIAL
            aws_secret_access_key=None,  # CREDENTIAL
            region=None,  # NAME OF AWS REGION, REQUIRED FOR SOME BUCKETS
            public=False,
            debug=False,
            settings=None):
        # A DECORATOR SUCH AS @use_settings (SEE EXAMPLE NO. 1) IS ASSUMED TO PACK
        # THE NAMED ARGUMENTS INTO settings BEFORE THIS BODY RUNS
        self.bucket = s3.Bucket(settings)
        self.settings = settings
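
The wrapper above simply forwards its credentials to s3.Bucket; for comparison, the same parameters map directly onto boto3. A rough equivalent, where the region and bucket name are placeholders rather than project values:

# Rough boto3 equivalent of the constructor above; all values are placeholders.
import boto3

s3_resource = boto3.resource(
    "s3",
    region_name="us-west-2",
    aws_access_key_id="...",
    aws_secret_access_key="...",
)
bucket = s3_resource.Bucket("my-bucket")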
Example No. 3
    def __init__(self, backing, database, kwargs=None):
        if backing.directory:
            self.backing = DirectoryBacking(kwargs=backing)
        else:
            self.backing = s3.Bucket(kwargs=backing)
        self.db = Sqlite(database)

        # ENSURE DATABASE IS SETUP
        if not self.db.about(VERSION_TABLE):
            schema.setup(self)
        self.next_id = id_generator(db=self.db, version_table=VERSION_TABLE)
        self.queues = []
        self.please_stop = Signal()
        self.cleaner = Thread.run("cleaner", self._cleaner)
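
The backing is chosen once in the constructor, so the rest of the class can write blobs without knowing whether they land on disk or in S3. A compact, self-contained sketch of that pattern; DirBacking and S3Backing here are illustrative placeholders, not the project's DirectoryBacking or s3.Bucket:

# Illustrative only; mirrors the `if backing.directory` branch above.
import os
import boto3

class DirBacking(object):
    def __init__(self, directory):
        self.directory = directory

    def write(self, key, content):
        with open(os.path.join(self.directory, key), "wb") as f:
            f.write(content)

class S3Backing(object):
    def __init__(self, bucket_name):
        self.bucket = boto3.resource("s3").Bucket(bucket_name)

    def write(self, key, content):
        self.bucket.put_object(Key=key, Body=content)

def make_backing(config):
    # SAME DECISION AS THE CONSTRUCTOR ABOVE: DIRECTORY IF CONFIGURED, ELSE S3
    if config.get("directory"):
        return DirBacking(config["directory"])
    return S3Backing(config["bucket"])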
Example No. 4
def get_all_s3(in_es, settings):
    # EVERYTHING FROM S3
    bucket = s3.Bucket(settings.source)
    prefixes = [
        p.name.rstrip(":") for p in bucket.list(prefix="", delimiter=":")
    ]
    in_s3 = []
    for i, p in enumerate(prefixes):
        if i % 1000 == 0:
            Log.note("Scrubbed {{p|percent(decimal=1)}}", p=i / len(prefixes))
        try:
            if int(p) not in in_es:
                in_s3.append(int(p))
        except Exception:
            Log.note("delete key {{key}}", key=p)
            bucket.delete_key(strip_extension(p))
    return in_s3
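
The call bucket.list(prefix="", delimiter=":") groups keys by everything before the first colon, which is why each prefix can be treated as an integer id. A rough boto3 equivalent of that listing, with a placeholder bucket name:

# Rough boto3 equivalent of bucket.list(prefix="", delimiter=":") above.
import boto3

client = boto3.client("s3")
prefixes = []
paginator = client.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket="my-bucket", Prefix="", Delimiter=":"):
    for cp in page.get("CommonPrefixes", []):
        prefixes.append(cp["Prefix"].rstrip(":"))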
Example No. 5
def copy2es(es, settings, work_queue, please_stop=None):
    # EVERYTHING FROM ELASTICSEARCH
    bucket = s3.Bucket(settings.source)

    for key in iter(work_queue.pop, ""):
        if please_stop:
            return
        if key is None:
            continue

        key = unicode(key)
        extend_time = Timer("insert", silent=True)
        Log.note("Indexing {{key}}", key=key)
        with extend_time:
            if settings.sample_only:
                sample_filter = {"terms": {"build.branch": settings.sample_only}}
            elif settings.sample_size:
                sample_filter = True
            else:
                sample_filter = None

            if key.find(":") >= 0:
                more_keys = bucket.keys(prefix=key)
            else:
                more_keys = bucket.keys(prefix=key + ":")
            num_keys = es.copy(more_keys, bucket, sample_filter,
                               settings.sample_size)

        if num_keys > 1:
            Log.note(
                "Added {{num}} keys from {{key}} block in {{duration}} ({{rate|round(places=3)}} keys/second)",
                num=num_keys,
                key=key,
                duration=extend_time.duration,
                rate=num_keys / Math.max(extend_time.duration.seconds, 0.01))

        work_queue.commit()
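
The loop `for key in iter(work_queue.pop, "")` uses the two-argument form of iter(): it calls work_queue.pop repeatedly and stops as soon as the sentinel "" comes back. A standalone illustration of the same idiom with the standard-library queue (the keys are made up):

# Standalone illustration of the iter(callable, sentinel) idiom used above.
from queue import Queue

work_queue = Queue()
for k in ["build:1", "build:2", ""]:  # "" is the stop sentinel
    work_queue.put(k)

for key in iter(work_queue.get, ""):
    print("processing", key)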
Example No. 6
    def __init__(self, kwargs=None):
        self.settings = kwargs
        self.schema = SnowflakeSchema(self.settings.snowflake)
        self._extract = extract = kwargs.extract

        # SOME PREP
        get_git_revision()

        # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF
        with MySQL(**kwargs.snowflake.database) as db:
            processes = None
            try:
                processes = jx.filter(
                    db.query("show processlist"),
                    {"and": [
                        {"neq": {"Command": "Sleep"}},
                        {"neq": {"Info": "show processlist"}}
                    ]}
                )
            except Exception as e:
                Log.warning("no database", cause=e)

            if processes:
                if DEBUG:
                    Log.warning("Processes are running\n{{list|json}}",
                                list=processes)
                else:
                    Log.error("Processes are running\n{{list|json}}",
                              list=processes)

        extract.type = listwrap(extract.type)
        extract.start = listwrap(extract.start)
        extract.batch = listwrap(extract.batch)
        extract.field = listwrap(extract.field)
        if any(
                len(extract.type) != len(other)
                for other in [extract.start, extract.batch, extract.field]):
            Log.error(
                "Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object"
            )
        for i, t in enumerate(extract.type):
            if t == "time":
                extract.start[i] = Date(extract.start[i])
                extract.batch[i] = Duration(extract.batch[i])
            elif t == "number":
                pass
            else:
                Log.error('Expecting `extract.type` to be "number" or "time"')

        extract.threads = coalesce(extract.threads, 1)
        self.done_pulling = Signal()
        self.queue = Queue("all batches",
                           max=2 * coalesce(extract.threads, 1),
                           silent=True)

        self.bucket = s3.Bucket(self.settings.destination)
        self.notify = aws.Queue(self.settings.notify)
        Thread.run("get records", self.pull_all_remaining)