def typed_prop_to_binary(prop_val, prop_type):
    """Pack a single CSV property value into its binary wire format.

    The result is a struct-packed bytes object whose first byte is the
    property's Type enum value, followed by the encoded value itself.

    Args:
        prop_val: raw string value read from the CSV field.
        prop_type: the declared Type of the column.

    Returns:
        bytes: the packed property.

    Raises:
        SchemaError: if prop_val cannot be parsed as prop_type.
    """
    # All format strings start with an unsigned char to represent our prop_type enum
    format_str = "=B"

    # Remove leading and trailing whitespace
    prop_val = prop_val.strip()

    if prop_val == "":
        # An empty string indicates a NULL property.
        # TODO This is not allowed in Cypher, consider how to handle it here rather than in-module.
        return struct.pack(format_str, 0)

    # TODO allow ID type specification
    if prop_type == Type.LONG:
        try:
            return struct.pack(format_str + "q", Type.LONG.value,
                               int(prop_val))
        except (ValueError, struct.error):
            # NOTE(review): the original re-checked prop_type here, but it is
            # always LONG in this branch; the guard was dead code.
            raise SchemaError("Could not parse '%s' as a long" % prop_val)

    elif prop_type == Type.DOUBLE:
        try:
            numeric_prop = float(prop_val)
            # Don't accept non-finite values; NaN/Inf fall through to the
            # generic failure at the bottom of the function (original behavior).
            if math.isfinite(numeric_prop):
                return struct.pack(format_str + "d", Type.DOUBLE.value,
                                   numeric_prop)
        except (ValueError, struct.error):
            raise SchemaError("Could not parse '%s' as a double" % prop_val)

    elif prop_type == Type.BOOL:
        # If field is 'false' or 'true', it is a boolean
        if prop_val.lower() == 'false':
            return struct.pack(format_str + '?', Type.BOOL.value, False)
        elif prop_val.lower() == 'true':
            return struct.pack(format_str + '?', Type.BOOL.value, True)
        else:
            raise SchemaError("Could not parse '%s' as a boolean" % prop_val)

    elif prop_type == Type.ID or prop_type == Type.STRING:
        # If we've reached this point, the property is a string.
        # struct.pack requires bytes objects as arguments.
        encoded_str = str.encode(prop_val)
        # Encoding len+1 adds a null terminator to the string
        format_str += "%ds" % (len(encoded_str) + 1)
        return struct.pack(format_str, Type.STRING.value, encoded_str)

    elif prop_type == Type.ARRAY:
        # prop_val is known non-empty here, so indexing is safe.
        if prop_val[0] != '[' or prop_val[-1] != ']':
            raise SchemaError("Could not parse '%s' as an array" % prop_val)
        return array_prop_to_binary(format_str, prop_val)

    # If it hasn't returned by this point, it is trying to set it to a type that it can't adopt
    raise SchemaError("unable to parse [" + prop_val + "] with type [" +
                      repr(prop_type) + "]")
    def convert_header_with_schema(self, header):
        """Parse a 'name:TYPE' header row, populating self.types and self.column_names.

        Args:
            header: list of raw header fields from the CSV.

        Raises:
            CSVError: if a field contains more than one colon.
            SchemaError: if a field is not a valid colon-separated pair or
                names a column whose type requires a name but has none.
        """
        self.types = [None] * self.column_count  # Value type of every column.
        for idx, field in enumerate(header):
            pair = field.split(':')

            # Multiple colons found in column name, emit error.
            # TODO might need to check for backtick escapes
            if len(pair) > 2:
                # BUG FIX: report the number of colons (len(pair) - 1),
                # not len(field), which is the field's character count.
                raise CSVError("%s: Field '%s' had %d colons" %
                               (self.infile.name, field, len(pair) - 1))

            # A field with no colon at all previously crashed with an
            # IndexError below; surface a schema error instead.
            if len(pair) < 2:
                raise SchemaError(
                    "%s: Each property in the header should be a colon-separated pair"
                    % (self.infile.name))

            # Convert the column type.
            col_type = convert_schema_type(pair[1].upper().strip())

            # If the column did not have a name but the type requires one, emit an error.
            if len(pair[0]) == 0 and col_type not in (Type.ID, Type.START_ID,
                                                      Type.END_ID,
                                                      Type.IGNORE):
                raise SchemaError(
                    "%s: Each property in the header should be a colon-separated pair"
                    % (self.infile.name))
            else:
                # We have a column name and a type.
                # Only store the name if the column's values should be added as properties.
                if len(pair[0]) > 0 and col_type not in (Type.START_ID,
                                                         Type.END_ID,
                                                         Type.IGNORE):
                    column_name = pair[0].strip()
                    self.column_names[idx] = column_name

            # Store the column type.
            self.types[idx] = col_type
# Example #3
    def post_process_header_with_schema(self, header):
        """Validate the START_ID/END_ID columns and record optional namespaces.

        A relation file must declare exactly one START_ID and one END_ID
        column; their offsets and any "(Namespace)" suffixes are captured.

        Raises:
            SchemaError: if either endpoint column is missing or duplicated.
        """
        # Can interleave these tasks if preferred.
        if self.types.count(Type.START_ID) != 1:
            raise SchemaError(
                "Relation file '%s' should have exactly one START_ID column." %
                (self.infile.name))
        if self.types.count(Type.END_ID) != 1:
            raise SchemaError(
                "Relation file '%s' should have exactly one END_ID column." %
                (self.infile.name))

        self.start_id = self.types.index(Type.START_ID)
        self.end_id = self.types.index(Type.END_ID)

        # Headers may carry a namespace in parentheses, e.g. "col:START_ID(Person)".
        namespace_pattern = re.compile(r"\((\w+)\)")
        source_match = namespace_pattern.search(header[self.start_id])
        if source_match is not None:
            self.start_namespace = source_match.group(1)
        destination_match = namespace_pattern.search(header[self.end_id])
        if destination_match is not None:
            self.end_namespace = destination_match.group(1)
    def process_entities(self):
        """Stream every CSV row as a packed relation, flushing buffers as they fill.

        For each row: resolve the source/destination node identifiers
        (prefixed with their namespaces when set), pack them as two 8-byte
        unsigned ints followed by the row's properties, and accumulate the
        binary into self.binary_entities. When the pending binary would
        exceed the token or buffer size limits, the current reltype token is
        sent and the partial binary is reset. Closes self.infile when done.
        """
        entities_created = 0
        with click.progressbar(self.reader,
                               length=self.entities_count,
                               label=self.entity_str) as reader:
            for row in reader:
                self.validate_row(row)
                try:
                    start_id = row[self.start_id]
                    if self.start_namespace:
                        start_id = self.start_namespace + '.' + str(start_id)
                    end_id = row[self.end_id]
                    if self.end_namespace:
                        end_id = self.end_namespace + '.' + str(end_id)

                    # KeyError here means the endpoint was never loaded as a node.
                    src = self.query_buffer.nodes[start_id]
                    dest = self.query_buffer.nodes[end_id]
                except KeyError as e:
                    print(
                        "%s:%d Relationship specified a non-existent identifier. src: %s; dest: %s"
                        % (self.infile.name, self.reader.line_num - 1,
                           row[self.start_id], row[self.end_id]))
                    # Either abort the load or skip just this edge, per config.
                    if self.config.skip_invalid_edges is False:
                        raise e
                    continue
                fmt = "=QQ"  # 8-byte unsigned ints for src and dest
                try:
                    row_binary = struct.pack(fmt, src,
                                             dest) + self.pack_props(row)
                except SchemaError as e:
                    # Re-raise with file/line context for the user.
                    raise SchemaError(
                        "%s:%d %s" %
                        (self.infile.name, self.reader.line_num, str(e)))
                row_binary_len = len(row_binary)
                # If the addition of this entity will make the binary token grow too large,
                # send the buffer now.
                added_size = self.binary_size + row_binary_len
                if added_size >= self.config.max_token_size or self.query_buffer.buffer_size + added_size >= self.config.max_buffer_size:
                    self.query_buffer.reltypes.append(self.to_binary())
                    self.query_buffer.send_buffer()
                    self.reset_partial_binary()
                    # Push the reltype onto the query buffer again, as there are more entities to process.
                    self.query_buffer.reltypes.append(self.to_binary())

                self.query_buffer.relation_count += 1
                entities_created += 1
                self.binary_size += row_binary_len
                self.binary_entities.append(row_binary)
            # Flush whatever remains after the final row.
            self.query_buffer.reltypes.append(self.to_binary())
        self.infile.close()
        print("%d relations created for type '%s'" %
              (entities_created, self.entity_str))
def convert_schema_type(in_type):
    """Map an upper-cased header type token to its Type enum member.

    Namespaced ID tokens such as "ID(Person)" are not direct enum member
    names and are resolved by prefix instead.

    Raises:
        SchemaError: if the token matches neither an enum member nor an
            ID-style prefix.
    """
    try:
        return Type[in_type]
    except KeyError:
        # Handling for ID namespaces
        # TODO think of better alternatives
        for prefix, resolved in (('ID(', Type.ID),
                                 ('START_ID(', Type.START_ID),
                                 ('END_ID(', Type.END_ID)):
            if in_type.startswith(prefix):
                return resolved
        raise SchemaError("Encountered invalid field type '%s'" % in_type)
# Example #6
    def post_process_header_with_schema(self, header):
        """Locate the node ID column, when identifiers are stored, and its namespace.

        Raises:
            SchemaError: if identifiers are stored but the header does not
                contain exactly one ID column.
        """
        # No ID field is required if we're only inserting nodes.
        if self.config.store_node_identifiers is False:
            return

        # Verify that exactly one field is labeled ID.
        id_columns = self.types.count(Type.ID)
        if id_columns != 1:
            raise SchemaError(
                "Node file '%s' should have exactly one ID column." %
                (self.infile.name))

        # Track the offset containing the node ID.
        self.id = self.types.index(Type.ID)

        # Headers like "val:ID(NAMESPACE)" carry an ID namespace in parentheses.
        namespace_match = re.search(r"\((\w+)\)", header[self.id])
        if namespace_match is not None:
            self.id_namespace = namespace_match.group(1)
# Example #7
    def process_entities(self):
        """Stream every CSV row as a packed node, flushing buffers as they fill.

        For each row: optionally register the node's identifier (prefixed
        with the ID namespace when set), pack the row's properties, and
        accumulate the binary into self.binary_entities. When the pending
        binary would exceed the token or buffer size limits, the current
        label token is sent and the partial binary is reset. Closes
        self.infile when done.
        """
        entities_created = 0
        with click.progressbar(self.reader,
                               length=self.entities_count,
                               label=self.entity_str) as reader:
            for row in reader:
                self.validate_row(row)

                # Update the node identifier dictionary if necessary
                if self.config.store_node_identifiers:
                    id_field = row[self.id]
                    if self.id_namespace is not None:
                        id_field = self.id_namespace + '.' + str(id_field)
                    self.update_node_dictionary(id_field)

                try:
                    row_binary = self.pack_props(row)
                except SchemaError as e:
                    # Re-raise with file/line context for the user.
                    # TODO why is line_num off by one?
                    raise SchemaError(
                        "%s:%d %s" %
                        (self.infile.name, self.reader.line_num - 1, str(e)))
                row_binary_len = len(row_binary)
                # If the addition of this entity will make the binary token grow too large,
                # send the buffer now.
                # TODO how much of this can be made uniform w/ relations and moved to Querybuffer?
                added_size = self.binary_size + row_binary_len
                if added_size >= self.config.max_token_size or self.query_buffer.buffer_size + added_size >= self.config.max_buffer_size:
                    self.query_buffer.labels.append(self.to_binary())
                    self.query_buffer.send_buffer()
                    self.reset_partial_binary()
                    # Push the label onto the query buffer again, as there are more entities to process.
                    self.query_buffer.labels.append(self.to_binary())

                self.query_buffer.node_count += 1
                entities_created += 1
                self.binary_size += row_binary_len
                self.binary_entities.append(row_binary)
            # Flush whatever remains after the final row.
            self.query_buffer.labels.append(self.to_binary())
        self.infile.close()
        print("%d nodes created with label '%s'" %
              (entities_created, self.entity_str))
# Example #8
 def check_schema(cls, schema):
     """Validate *schema* against cls.META_SCHEMA, raising on the first violation."""
     # Build a validator over the meta-schema; the first reported error is
     # converted to a SchemaError and raised immediately.
     for error in cls(cls.META_SCHEMA).iter_errors(schema):
         raise SchemaError.create_from(error)