Example #1
def find_dataset(dataset, client):
    esc_name = escape_name(dataset)

    datasets = list(client.list_datasets())
    for _dataset in datasets:
        if ApiName(_dataset.dataset_id) == esc_name:
            return _dataset.reference
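
A minimal usage sketch, assuming the jx-bigquery helpers (escape_name, ApiName) are importable and application-default credentials are configured; the project and dataset names are hypothetical:

from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # hypothetical project id
ref = find_dataset("testing", client)           # DatasetReference, or None when absent
if ref is None:
    print("dataset 'testing' does not exist")
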
Example #2
    def __init__(self, dataset, account_info, kwargs):
        self.client = connect(account_info)
        self.short_name = dataset
        esc_name = escape_name(dataset)
        self.full_name = ApiName(account_info.project_id) + esc_name

        self.dataset = find_dataset(dataset, self.client)
        if not self.dataset:
            self.dataset = create_dataset(account_info.project_id, dataset,
                                          self.client)
Example #3
        def _schema_to_bq_schema(jx_path, es_path, schema):
            output = []
            nt = schema.get(NESTED_TYPE)
            if nt:
                schema = {NESTED_TYPE: nt}
            for t, sub_schema in jx.sort(schema.items(), 0):
                bqt = typed_to_bq_type.get(
                    t, {"field_type": "RECORD", "mode": "NULLABLE"}
                )
                full_name = es_path + escape_name(t)
                top_field = self._top_level_fields.get(text(full_name))
                if is_text(sub_schema):
                    new_field_type = json_type_to_bq_type.get(sub_schema, sub_schema)
                    if new_field_type != bqt["field_type"]:
                        # OVERRIDE TYPE
                        bqt = bqt.copy()
                        bqt["field_type"] = new_field_type
                    fields = ()
                else:
                    fields = _schema_to_bq_schema(jx_path + (t,), full_name, sub_schema)

                if top_field:
                    if fields:
                        Log.error("not expecting a structure")
                    if self._partition.field == top_field:
                        if bqt["field_type"] != "TIMESTAMP":
                            Log.error("Partition field must be of time type")
                    struct = SchemaField(name=top_field, fields=fields, **bqt)
                    top_fields.append(struct)
                elif not fields and bqt["field_type"] == "RECORD":
                    # THIS CAN HAPPEN WHEN WE MOVE A PRIMITIVE FIELD TO top_fields
                    pass
                else:
                    struct = SchemaField(
                        name=text(escape_name(t)), fields=fields, **bqt
                    )
                    output.append(struct)
            return output
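
For reference, the objects the recursion above emits are standard google-cloud-bigquery SchemaField values; a hand-built equivalent of one nested leaf might look like the following (field names are hypothetical):

from google.cloud.bigquery import SchemaField

leaf = SchemaField("value", "STRING", mode="NULLABLE")                        # a primitive column
record = SchemaField("payload", "RECORD", mode="REPEATED", fields=(leaf,))    # its repeated parent
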
Example #4
    def __init__(self, dataset, account_info, kwargs):
        creds = service_account.Credentials.from_service_account_info(
            info=account_info)
        self.client = bigquery.Client(project=account_info.project_id,
                                      credentials=creds)
        self.short_name = dataset
        esc_name = escape_name(dataset)
        self.full_name = ApiName(account_info.project_id) + esc_name

        datasets = list(self.client.list_datasets())
        for _dataset in datasets:
            if ApiName(_dataset.dataset_id) == esc_name:
                self.dataset = _dataset.reference
                break
        else:
            _dataset = bigquery.Dataset(text(self.full_name))
            _dataset.location = "US"
            self.dataset = self.client.create_dataset(_dataset)
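
The credential handling uses the stock google-auth API; a stand-alone sketch of the same setup, assuming the service-account JSON sits on disk at a hypothetical path:

import json
from google.oauth2 import service_account
from google.cloud import bigquery

info = json.load(open("service-account.json"))   # hypothetical path
creds = service_account.Credentials.from_service_account_info(info)
client = bigquery.Client(project=info["project_id"], credentials=creds)
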
Example #5
    def delete_table(self, name):
        # DELETE THE VIEW AND ALL OF ITS SUFFIXED SHARDS
        api_name = escape_name(name)

        tables = list(self.client.list_tables(self.dataset))
        for table_item in tables:
            table = table_item.reference
            table_api_name = ApiName(table.table_id)
            if text(table_api_name).startswith(text(api_name)):
                if table_api_name == api_name:
                    if table_item.table_type != "VIEW":
                        Log.error("expecting {{table}} to be a view",
                                  table=api_name)
                    self.client.delete_table(table)
                elif SUFFIX_PATTERN.match(
                        text(table_api_name)[len(text(api_name)):]):
                    try:
                        self.client.delete_table(table)
                    except Exception as e:
                        Log.warning("could not delete table {{table}}",
                                    table=table,
                                    cause=e)
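
SUFFIX_PATTERN is not shown in this snippet. Example #12 builds shard names by appending "_" plus 20 random characters to the base name, so a hypothetical stand-in for the pattern could be:

import re

# hypothetical stand-in; the real SUFFIX_PATTERN may differ once escape_name is applied
SUFFIX_PATTERN = re.compile(r"^_\w{20}$")
assert SUFFIX_PATTERN.match("_abcdefghij0123456789")
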
Example #6
 def to_bq(self, schema, not_null=False, boolean=False, many=True):
     var_name = self.var
     if var_name == GUID:
         return BQLScript(data_type=STRING,
                          expr=quote_column(escape_name(GUID)),
                          frum=self,
                          miss=FALSE,
                          many=False,
                          schema=schema)
     cols = schema.leaves(var_name)
     if not cols:
         # DOES NOT EXIST
         return BQLScript(data_type=OBJECT,
                          expr=SQL_NULL,
                          frum=self,
                          miss=TRUE,
                          many=False,
                          schema=schema)
     elif len(cols) == 1:
         col = first(cols)
         return BQLScript(data_type=col.jx_type,
                          expr=quote_column(
                              ApiName(*split_field(col.es_column))),
                          frum=self,
                          miss=MissingOp(self),
                          many=False,
                          schema=schema)
     else:
         coalesce = []
         for col in cols:
             rel_path = untype_path(relative_field(col.name, var_name))
             if rel_path == '.':
                 coalesce.append(Variable(col.name))
             else:
                 Log.error("structure not supported")
         return CoalesceOp(coalesce).to_bq(schema)
Example #7
 def parse_schema(schema, tops, es_type_info, jx_path, nested_path, es_path):
     if is_text(schema):
         json_type = schema
         expected_es_type = json_type_to_bq_type[json_type]
         if es_type_info and es_type_info != expected_es_type:
             Log.error(
                 "expecting {{path}} to be of type {{expected_type}} not of type {{observed_type}}",
                 path=jx_path,
                 expected_type=expected_es_type,
                 observed_type=es_type_info
             )
         c = jx_base.Column(
             name=join_field(jx_path),
             es_column=coalesce(tops, text(es_path)),
             es_index=self.es_index,
             es_type=coalesce(es_type_info, expected_es_type),
             jx_type=json_type,
             nested_path=nested_path,
             last_updated=now,
         )
         columns.append(c)
     else:
         c = jx_base.Column(
             name=join_field(jx_path),
             es_column=text(es_path),
             es_index=self.es_index,
             es_type="RECORD",
             jx_type=OBJECT,
             nested_path=nested_path,
             last_updated=now,
         )
         columns.append(c)
         count = len(columns)
         for k, s in schema.items():
             if k == NESTED_TYPE:
                 c.jx_type = NESTED
                 parse_schema(
                     s,
                     tops if is_text(tops) else tops[k],
                     es_type_info
                     if is_text(es_type_info)
                     else es_type_info[k],
                     jx_path + (k,),
                     (jx_path,) + nested_path,
                     es_path + escape_name(k),
                 )
             else:
                 parse_schema(
                     s,
                     tops if is_text(tops) else tops[k],
                     es_type_info
                     if is_text(es_type_info)
                     else es_type_info[k],
                     jx_path + (k,),
                     nested_path,
                     es_path + escape_name(k),
                 )
         if is_text(tops) and len(columns) > count + 1:
             Log.error(
                 "too many top level fields at {{field}}:",
                 field=join_field(jx_path),
             )
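
Note: this is the same inner function shown in full context in Example #8, where the free names self, columns and now are bound by the enclosing columns() method.
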
Example #8
    def columns(self):
        if not self._columns:
            now = Date.now()
            columns = []

            def parse_schema(schema, tops, es_type_info, jx_path, nested_path, es_path):
                if is_text(schema):
                    json_type = schema
                    expected_es_type = json_type_to_bq_type[json_type]
                    if es_type_info and es_type_info != expected_es_type:
                        Log.error(
                            "expecting {{path}} to be of type {{expected_type}} not of type {{observed_type}}",
                            path=jx_path,
                            expected_type=expected_es_type,
                            observed_type=es_type_info,
                        )
                    c = jx_base.Column(
                        name=join_field(jx_path),
                        es_column=coalesce(tops, text(es_path)),
                        es_index=self.es_index,
                        es_type=coalesce(es_type_info, expected_es_type),
                        jx_type=json_type,
                        nested_path=nested_path,
                        last_updated=now,
                    )
                    columns.append(c)
                else:
                    c = jx_base.Column(
                        name=join_field(jx_path),
                        es_column=text(es_path),
                        es_index=self.es_index,
                        es_type="RECORD",
                        jx_type=OBJECT,
                        cardinality=1,
                        nested_path=nested_path,
                        last_updated=now,
                    )
                    columns.append(c)
                    count = len(columns)
                    for k, s in schema.items():
                        if k == NESTED_TYPE:
                            c.jx_type = NESTED
                            parse_schema(
                                s,
                                tops if is_text(tops) else tops[k],
                                es_type_info
                                if is_text(es_type_info)
                                else es_type_info[k],
                                jx_path + (k,),
                                (jx_path,) + nested_path,
                                es_path + escape_name(k),
                            )
                        else:
                            parse_schema(
                                s,
                                tops if is_text(tops) else tops[k],
                                es_type_info
                                if is_text(es_type_info)
                                else es_type_info[k],
                                jx_path + (k,),
                                nested_path,
                                es_path + escape_name(k),
                            )
                    if is_text(tops) and len(columns) > count + 1:
                        Log.error(
                            "too many top level fields at {{field}}:",
                            field=join_field(jx_path),
                        )

            parse_schema(
                self.schema,
                self.top_level_fields,
                self._es_type_info,
                (),
                (".",),
                ApiName(),
            )
            self._columns = columns

            self._top_level_fields = OrderedDict()  # FORCE ORDERING
            for path, field in jx.sort(wrap(self.top_level_fields).leaves(), 0):
                leaves = self.leaves(path)
                if not leaves:
                    continue
                if len(leaves) > 1:
                    Log.error(
                        "expecting {{path}} to have just one primitive value", path=path
                    )
                specific_path = first(leaves).name
                self._top_level_fields[
                    ".".join(text(escape_name(step)) for step in split_field(specific_path))
                ] = field
            self._partition = Partition(kwargs=self.partition, flake=self)

        return self._columns
Example #9
def create_dataset(project_id, dataset, client):
    full_name = ApiName(project_id) + escape_name(dataset)

    _dataset = bigquery.Dataset(text(full_name))
    _dataset.location = "US"
    return client.create_dataset(_dataset)
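
The same operation with the plain google-cloud-bigquery client, assuming the escaped name resolves to ordinary identifiers (the ids below are hypothetical):

from google.cloud import bigquery

client = bigquery.Client()
dataset = bigquery.Dataset("my-project.testing")            # "<project>.<dataset>" form
dataset.location = "US"
dataset = client.create_dataset(dataset, exists_ok=True)    # exists_ok avoids a Conflict on re-runs
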
Example #10
    def merge_shards(self):
        shards = []
        tables = list(self.container.client.list_tables(
            self.container.dataset))
        current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
        primary_shard_name = None  # PRIMARY SHARD
        api_name = escape_name(self.short_name)

        for table_item in tables:
            table = table_item.reference
            table_api_name = ApiName(table.table_id)
            if text(table_api_name).startswith(text(api_name)):
                if table_api_name == api_name:
                    if table_item.table_type != "VIEW":
                        Log.error("expecting {{table}} to be a view",
                                  table=api_name)
                    current_view = self.container.client.get_table(table)
                    view_sql = current_view.view_query
                    primary_shard_name = _extract_primary_shard_name(view_sql)
                elif SUFFIX_PATTERN.match(
                        text(table_api_name)[len(text(api_name)):]):
                    try:
                        known_table = self.container.client.get_table(table)
                        shards.append(known_table)
                    except Exception as e:
                        Log.warning("could not merge table {{table}}",
                                    table=table,
                                    cause=e)

        if not current_view:
            Log.error("expecting {{table}} to be a view pointing to a table",
                      table=api_name)

        shard_flakes = [
            Snowflake.parse(
                big_query_schema=shard.schema,
                es_index=text(self.container.full_name +
                              ApiName(shard.table_id)),
                top_level_fields=self.top_level_fields,
                partition=self.partition,
            ) for shard in shards
        ]
        total_flake = snowflakes.merge(
            shard_flakes,
            es_index=text(self.full_name),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )

        for i, s in enumerate(shards):
            if ApiName(s.table_id) == primary_shard_name:
                if total_flake == shard_flakes[i]:
                    # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                    del shards[i]
                    del shard_flakes[i]
                    break
        else:
            name = self.short_name + "_" + "".join(Random.sample(ALLOWED, 20))
            primary_shard_name = escape_name(name)
            self.container.create_table(
                table=name,
                schema=total_flake.schema,
                sharded=False,
                read_only=False,
                kwargs=self.config,
            )

        primary_full_name = self.container.full_name + primary_shard_name

        selects = []
        for flake, table in zip(shard_flakes, shards):
            q = ConcatSQL(
                SQL_SELECT,
                JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR),
                        gen_select(total_flake, flake)),
                SQL_FROM,
                quote_column(ApiName(table.dataset_id, table.table_id)),
            )
            selects.append(q)

        Log.note("inserting into table {{table}}",
                 table=text(primary_shard_name))
        matched = []
        unmatched = []
        for sel, shard, flake in zip(selects, shards, shard_flakes):
            if flake == total_flake:
                matched.append((sel, shard, flake))
            else:
                unmatched.append((sel, shard, flake))

        # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
        if matched:
            for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
                command = ConcatSQL(
                    SQL_INSERT,
                    quote_column(primary_full_name),
                    JoinSQL(
                        SQL_UNION_ALL,
                        (sql_query({
                            "from":
                            self.container.full_name + ApiName(shard.table_id)
                        }) for _, shard, _ in merge_chunk),
                    ),
                )
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                Log.note("job {{id}} state = {{state}}",
                         id=job.job_id,
                         state=job.state)

                if job.errors:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )
                for _, shard, _ in merge_chunk:
                    self.container.client.delete_table(shard)

        # ALL OTHER SCHEMAS MISMATCH
        for s, shard, _ in unmatched:
            try:
                command = ConcatSQL(SQL_INSERT,
                                    quote_column(primary_full_name), s)
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                Log.note(
                    "from {{shard}}, job {{id}}, state {{state}}",
                    id=job.job_id,
                    shard=shard.table_id,
                    state=job.state,
                )

                if job.errors:
                    if all(" does not have a schema." in m
                           for m in wrap(job.errors).message):
                        pass  # NOTHING TO DO
                    else:
                        Log.error(
                            "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                            sql=command.sql,
                            reason=job.errors,
                        )

                self.container.client.delete_table(shard)
            except Exception as e:
                Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

        # REMOVE OLD VIEW
        view_full_name = self.container.full_name + api_name
        if current_view:
            self.container.client.delete_table(current_view)

        # CREATE NEW VIEW
        self.container.create_view(view_full_name, primary_full_name)
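
In outline, merge_shards locates the view and every suffixed shard, parses each shard's schema into a Snowflake, merges those into total_flake, copies shards whose schema already matches using a simple UNION ALL insert, copies mismatched shards column by column (compare _gen_select in Example #15), deletes each shard once copied, and finally re-creates the view pointing at the surviving primary shard.
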
Example #11
    def __init__(
        self,
        table,
        typed,
        read_only,
        sharded,
        container,
        id=Null,
        partition=Null,
        cluster=Null,
        top_level_fields=Null,
        kwargs=None,
    ):
        self.short_name = table
        self.typed = typed
        self.read_only = read_only
        self.cluster = cluster
        self.id = id
        self.top_level_fields = top_level_fields
        self.config = Data(  # USED TO REPLICATE THIS
            typed=typed,
            read_only=read_only,
            sharded=sharded,
            id=id,
            partition=partition,
            cluster=cluster,
            top_level_fields=top_level_fields,
        )

        esc_name = escape_name(table)
        self.full_name = container.full_name + esc_name
        self.alias_view = alias_view = container.client.get_table(
            text(self.full_name))
        self.partition = partition
        self.container = container

        if not sharded:
            if not read_only and alias_view.table_type == "VIEW":
                Log.error("Expecting a table, not a view")
            self.shard = alias_view
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
        else:
            if alias_view.table_type != "VIEW":
                Log.error("Sharded tables require a view")
            current_view = container.client.get_table(text(self.full_name))
            view_sql = current_view.view_query
            shard_name = _extract_primary_shard_name(view_sql)
            try:
                self.shard = container.client.get_table(
                    text(container.full_name + shard_name))
                self._flake = Snowflake.parse(
                    alias_view.schema,
                    text(self.full_name),
                    self.top_level_fields,
                    partition,
                )
            except Exception as e:
                Log.warning("view {{name}} is invalid",
                            name=shard_name,
                            cause=e)
                self._flake = Snowflake.parse(
                    alias_view.schema,
                    text(self.full_name),
                    self.top_level_fields,
                    partition,
                )
                # REMOVE STALE VIEW
                container.client.delete_table(current_view)

                # MAKE NEW VIEW POINTING TO NEW SHARD
                self._create_new_shard()
                container.create_view(
                    self.full_name,
                    self.container.full_name + ApiName(self.shard.table_id),
                )

        self.last_extend = Date.now() - EXTEND_LIMIT
Example #12
    def create_table(
        self,
        table,
        schema=None,
        typed=True,
        read_only=True,  # TO PREVENT ACCIDENTAL WRITING
        sharded=False,
        partition=Null,  # PARTITION RULES
        cluster=None,  # TUPLE OF FIELDS TO SORT DATA
        top_level_fields=Null,
        kwargs=None,
    ):
        if kwargs.lookup != None or kwargs.flake != None:
            Log.error("expecting schema, not lookup")
        full_name = self.full_name + escape_name(table)
        if not schema:
            # WE MUST HAVE SOMETHING
            if typed:
                schema = copy(DEFAULT_TYPED_SCHEMA)
            else:
                schema = copy(DEFAULT_SCHEMA)

        flake = Snowflake(text(full_name),
                          top_level_fields,
                          partition,
                          schema=schema)

        if read_only:
            Log.error("Can not create a table for read-only use")

        if sharded:
            shard_name = escape_name(table + "_" +
                                     "".join(Random.sample(ALLOWED, 20)))
            shard_api_name = self.full_name + shard_name
            _shard = bigquery.Table(text(shard_api_name),
                                    schema=flake.to_bq_schema())
            _shard.time_partitioning = unwrap(
                flake._partition.bq_time_partitioning)
            _shard.clustering_fields = [
                c.es_column for f in listwrap(cluster)
                for c in [first(flake.leaves(f))] if c
            ] or None
            self.shard = self.client.create_table(_shard)
            self.create_view(full_name, shard_api_name)
        else:
            _table = bigquery.Table(text(full_name),
                                    schema=flake.to_bq_schema())
            _table.time_partitioning = unwrap(
                flake._partition.bq_time_partitioning)
            _table.clustering_fields = [
                l.es_column for f in listwrap(cluster) for l in flake.leaves(f)
            ] or None
            self.client.create_table(_table)
            Log.note("created table {{table}}", table=_table.table_id)

        return Table(
            table=table,
            typed=typed,
            read_only=read_only,
            sharded=sharded,
            partition=partition,
            top_level_fields=top_level_fields,
            kwargs=kwargs,
            container=self,
        )
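
The partitioning and clustering settings map directly onto the plain client API; a minimal stand-alone sketch with a hypothetical table id and schema:

from google.cloud import bigquery

client = bigquery.Client()
table = bigquery.Table(
    "my-project.testing.logs",   # hypothetical "<project>.<dataset>.<table>" id
    schema=[
        bigquery.SchemaField("timestamp", "TIMESTAMP"),
        bigquery.SchemaField("message", "STRING"),
    ],
)
table.time_partitioning = bigquery.TimePartitioning(field="timestamp")
table.clustering_fields = ["message"]
client.create_table(table)
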
Example #13
def _typed_encode(value, schema):
    if is_many(value):
        output = []
        update = {}
        nest_added = False
        child_schema = schema.get(NESTED_TYPE)
        if not child_schema:
            child_schema = schema[NESTED_TYPE] = {}

        for r in value:
            v, m, n = _typed_encode(r, child_schema)
            output.append(v)
            update.update(m)
            nest_added |= n

        if update:
            return {text(REPEATED): output}, {NESTED_TYPE: update}, True
        else:
            return {text(REPEATED): output}, None, nest_added
    elif NESTED_TYPE in schema:
        if not value:
            return {text(REPEATED): []}, None, False
        else:
            return _typed_encode([value], schema)
    elif is_data(value):
        output = {}
        update = {}
        nest_added = False
        for k, v in value.items():
            child_schema = schema.get(k)
            if not child_schema:
                child_schema = schema[k] = {}
            result, more_update, n = _typed_encode(v, child_schema)
            output[text(escape_name(k))] = result
            if more_update:
                update.update({k: more_update})
                nest_added |= n
        return output, update, nest_added
    elif is_text(schema):
        v, inserter_type, json_type = schema_type(value)
        if schema != json_type:
            Log.error(
                "Can not convert {{existing_type}} to {{expected_type}}",
                existing_type=json_type,
                expected_type=schema,
            )
        return v, None, False
    elif value is None:
        return {text(escape_name(t)): None
                for t, child_schema in schema.items()}, None, False
    else:
        v, inserter_type, json_type = schema_type(value)
        child_schema = schema.get(inserter_type)
        update = None
        if not child_schema:
            if schema.get(TIME_TYPE):
                # ATTEMPT TO CONVERT TO TIME, IF EXPECTING TIME
                try:
                    v = parse(v).format(TIMESTAMP_FORMAT)
                    return {text(escape_name(TIME_TYPE)): v}, update, False
                except Exception as e:
                    Log.warning(
                        "Failed attempt to convert {{value}} to TIMESTAMP string",
                        value=v,
                        cause=e)

            schema[inserter_type] = json_type
            update = {inserter_type: json_type}
        return {text(escape_name(inserter_type)): v}, update, False
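
Example #16 below is a later revision of the same encoder: it adds a docstring, drops empty lists, merges schema updates with set_default, and falls back to value.__data__() when schema_type fails.
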
Example #14
typed_to_bq_type = {
    BOOLEAN_TYPE: {
        "field_type": "BOOLEAN",
        "mode": "NULLABLE"
    },
    NUMBER_TYPE: {
        "field_type": "FLOAT64",
        "mode": "NULLABLE"
    },
    INTEGER_TYPE: {
        "field_type": "INT64",
        "mode": "NULLABLE"
    },
    TIME_TYPE: {
        "field_type": "TIMESTAMP",
        "mode": "NULLABLE"
    },
    STRING_TYPE: {
        "field_type": "STRING",
        "mode": "NULLABLE"
    },
    NESTED_TYPE: {
        "field_type": "RECORD",
        "mode": "REPEATED"
    },
}

REPEATED = escape_name(NESTED_TYPE)
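
The mapping is keyed by the typed markers and is looked up with a RECORD/NULLABLE default in Example #3; for instance:

# NESTED_TYPE is the typed marker used above; unknown keys fall back to a plain nullable record
bqt = typed_to_bq_type.get(NESTED_TYPE, {"field_type": "RECORD", "mode": "NULLABLE"})
assert bqt == {"field_type": "RECORD", "mode": "REPEATED"}
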
Example #15
    def _gen_select(source_path, source_tops, source_flake, total_path,
                    total_tops, total_flake):
        if total_flake == source_flake and not total_tops:
            return [
                quote_column(source_path + escape_name(k))
                for k in jx.sort(total_flake.keys())
            ]

        if NESTED_TYPE in total_flake:
            # PROMOTE EVERYTHING TO REPEATED
            v = source_flake.get(NESTED_TYPE)
            t = total_flake.get(NESTED_TYPE)

            if not v:
                # CONVERT INNER OBJECT TO ARRAY OF ONE STRUCT
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(
                            ConcatSQL(SQL_COMMA, SQL_CR),
                            _gen_select(
                                source_path,
                                Null,
                                source_flake,
                                total_path + REPEATED,
                                Null,
                                t,
                            ),
                        ),
                    )
                ]
            else:
                row_name = "row" + text(len(source_path.values))
                ord_name = "ordering" + text(len(source_path.values))
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(
                            ConcatSQL(SQL_COMMA, SQL_CR),
                            _gen_select(ApiName(row_name), Null, v,
                                        ApiName(row_name), Null, t),
                        ),
                        SQL_FROM,
                        sql_call("UNNEST",
                                 quote_column(source_path + REPEATED)),
                        SQL_AS,
                        SQL(row_name),
                        SQL(" WITH OFFSET AS "),
                        SQL(ord_name),
                        SQL_ORDERBY,
                        SQL(ord_name),
                    )
                ]

            return [sql_alias(sql_call("ARRAY", *inner), REPEATED)]

        selection = []
        for k, t in jx.sort(total_flake.items(), 0):
            k_total_tops = total_tops if is_text(total_tops) else total_tops[k]
            k_tops = source_tops if is_text(source_tops) else source_tops[k]
            v = source_flake.get(k)
            if is_text(k_total_tops):
                # DO NOT INCLUDE TOP_LEVEL_FIELDS
                pass
            elif t == v and not k_total_tops and not k_tops:
                selection.append(
                    ConcatSQL(
                        quote_column(source_path + escape_name(k)),
                        SQL_AS,
                        quote_column(escape_name(k)),
                    ))
            elif is_data(t):
                if not v:
                    selects = _gen_select(
                        source_path + escape_name(k),
                        source_tops,
                        {},
                        total_path + escape_name(k),
                        k_total_tops,
                        t,
                    )
                elif is_data(v):
                    selects = _gen_select(
                        source_path + escape_name(k),
                        source_tops,
                        v,
                        total_path + escape_name(k),
                        k_total_tops,
                        t,
                    )
                else:
                    raise Log.error(
                        "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                        field=join_field(source_path + escape_name(k)),
                        type=v,
                        main=t,
                    )
                if selects:
                    inner = [
                        ConcatSQL(
                            SQL_SELECT_AS_STRUCT,
                            JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), selects),
                        )
                    ]
                    selection.append(
                        sql_alias(sql_call("", *inner), escape_name(k)))
            elif is_text(t):
                if is_text(k_tops):
                    # THE SOURCE HAS THIS PROPERTY AS A TOP_LEVEL_FIELD
                    selection.append(
                        ConcatSQL(SQL(k_tops), SQL_AS,
                                  quote_column(escape_name(k))))
                elif v == t:
                    selection.append(
                        ConcatSQL(
                            quote_column(total_path + escape_name(k)),
                            SQL_AS,
                            quote_column(escape_name(k)),
                        ))
                else:
                    if v:
                        Log.note(
                            "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                            field=join_field(source_path + escape_name(k)),
                            type=v,
                            main=t,
                        )
                    selection.append(
                        ConcatSQL(
                            sql_call(
                                "CAST",
                                ConcatSQL(SQL_NULL, SQL_AS,
                                          SQL(json_type_to_bq_type[t])),
                            ),
                            SQL_AS,
                            quote_column(escape_name(k)),
                        ))
            else:
                Log.error("not expected")
        return selection
Example #16
def _typed_encode(value, schema):
    """
    RETURN TRIPLE
    output - THE ENCODED VALUE
    update - THE ADDITIONAL SCHEMA OVER schema PROVIDED
    nested - True IF NESTING IS REQUIRED (CONSIDERED SERIOUS SCHEMA CHANGE)
    """
    if is_many(value):
        if len(value) == 0:
            return None, None, False
        output = []
        update = {}
        nest_added = False
        child_schema = schema.get(NESTED_TYPE)
        if not child_schema:
            nest_added = True
            child_schema = schema[NESTED_TYPE] = {}

        for r in value:
            v, m, n = _typed_encode(r, child_schema)
            output.append(v)
            set_default(update, m)
            nest_added |= n

        if update:
            return {text(REPEATED): output}, {NESTED_TYPE: update}, nest_added
        else:
            return {text(REPEATED): output}, None, nest_added
    elif NESTED_TYPE in schema:
        if not value:
            return {text(REPEATED): []}, None, False
        else:
            return _typed_encode([value], schema)
    elif is_data(value):
        output = {}
        update = {}
        nest_added = False
        for k, v in value.items():
            child_schema = schema.get(k)
            if not child_schema:
                child_schema = schema[k] = {}
            result, more_update, n = _typed_encode(v, child_schema)
            if result != None:
                output[text(escape_name(k))] = result
            set_default(update, {k: more_update})
            nest_added |= n
        return output, update or None, nest_added
    elif is_text(schema):
        v, inserter_type, json_type = schema_type(value)
        if schema != json_type:
            Log.error(
                "Can not convert {{existing_type}} to {{expected_type}}",
                existing_type=json_type,
                expected_type=schema,
            )
        return v, None, False
    elif value == None:
        return {
            text(escape_name(t)): None
            for t, child_schema in schema.items()
        } or None, None, False
    else:
        try:
            v, inserter_type, json_type = schema_type(value)
        except Exception as e:
            # LAST DESPERATE ATTEMPT
            return _typed_encode(value.__data__(), schema)
        child_schema = schema.get(inserter_type)
        update = None
        if not child_schema:
            if schema.get(TIME_TYPE):
                # ATTEMPT TO CONVERT TO TIME, IF EXPECTING TIME
                try:
                    v = parse(v).format(TIMESTAMP_FORMAT)
                    return {text(escape_name(TIME_TYPE)): v}, update, False
                except Exception as e:
                    Log.warning(
                        "Failed attempt to convert {{value}} to TIMESTAMP string",
                        value=v,
                        cause=e,
                    )

            schema[inserter_type] = json_type
            update = {inserter_type: json_type}
        return {text(escape_name(inserter_type)): v}, update, False
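
One case can be read directly off the code above: an empty list is dropped and contributes no schema, so the returned triple is (None, None, False).

assert _typed_encode([], {}) == (None, None, False)
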
Example #17
    def delete_table(self, name):
        full_name = self.full_name + escape_name(name)
        self.client.delete_table(full_name)
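
The underlying call is the standard client method; with the plain API (and a hypothetical table id) the same delete looks like:

from google.cloud import bigquery

client = bigquery.Client()
client.delete_table("my-project.testing.old_table", not_found_ok=True)  # ignore the 404 if it is already gone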