    def test_set_shard_explodes(self) -> None:
        expected = get_shard()
        with self.assertRaisesRegex(
                AssertionError,
                'can only shard_set_explicitly if it has not been used yet.'):
            shard_set_explicitly('x')
        actual = get_shard()
        self.assertEqual(expected, actual)
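
This test (together with test_set_shard_works in the last example) pins down the contract of the shard helpers: get_shard marks the shard as used, and shard_set_explicitly refuses to overwrite it afterwards. Below is a minimal sketch of a module satisfying that contract; only the two function names and the assertion message come from these examples, the module-level state is an assumption:

# a minimal sketch of the shard helpers these tests exercise; the internals
# are an assumption, not the library's actual implementation
from typing import Optional

_shard: Optional[str] = None
_used: bool = False

def get_shard() -> Optional[str]:
    # reading the shard freezes it: it may no longer be overridden afterwards
    global _used
    _used = True
    return _shard

def shard_set_explicitly(shard: str) -> None:
    # this is the message test_set_shard_explodes asserts on
    assert not _used, 'can only shard_set_explicitly if it has not been used yet.'
    global _shard
    _shard = shard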
Example No. 2
    def construct_type(cls: Type["VertexType"], **kwargs: Any) -> "VertexType":
        defaults: Dict[str, Hashable] = dict()
        properties: Set[Property] = set(kwargs.pop('properties', []))
        properties.update({
            MagicProperties.LABEL.value, MagicProperties.ID.value,
            WellKnownProperties.Key.value
        })

        # (magically) insinuate the shard identifier into the vertex id format
        shard = get_shard()
        id_format = kwargs.pop('id_format', VertexTypeIdFormats.DEFAULT.value)
        if shard:
            properties.update({WellKnownProperties.TestShard.value})
            defaults.update({WellKnownProperties.TestShard.value.name: shard})
            # prepend if it's not in the format already (which would be pretty weird, but anyway)
            if '{shard}' not in id_format:
                id_format = '{shard}:' + id_format

        parameters = _discover_parameters(id_format)
        property_names = {p.name for p in properties}
        assert all(p in property_names for p in parameters), \
            f'id_format: {id_format} has parameters: {parameters} not found in our properties {property_names}'

        return cls(properties=tuple(properties),
                   defaults=tuple(defaults.items()),
                   id_format=id_format,
                   **kwargs)
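
_discover_parameters is not shown in any of these examples. One plausible sketch uses the standard library's string.Formatter to pull the {shard}-style field names out of id_format; this is an assumption, not necessarily the library's actual implementation:

# a sketch of what _discover_parameters might do, using string.Formatter
from string import Formatter
from typing import Set

def _discover_parameters(id_format: str) -> Set[str]:
    # Formatter().parse yields (literal, field_name, spec, conversion) tuples;
    # the non-empty field names are the format parameters
    return {field for _, field, _, _ in Formatter().parse(id_format) if field}

With id_format '{shard}:{label}:{key}' this yields {'shard', 'label', 'key'}, which the assert above then checks against the property names.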
Example No. 3
    def drop(self) -> None:
        test_shard = get_shard()
        g = self.g.V()
        if test_shard:
            g = g.has(WellKnownProperties.TestShard.value.name, test_shard)
        g = g.drop()
        LOGGER.warning('DROPPING ALL NODES')
        self.query_executor()(query=g, get=FromResultSet.iterate)
        # we seem to mess this up easily, so sweep again by shard-prefixed id
        leftover = self.query_executor()(query=self.g.V().hasId(TextP.startingWith(test_shard)).id(),
                                         get=FromResultSet.toList)
        self.query_executor()(query=self.g.V().hasId(TextP.startingWith(test_shard)).drop(),
                              get=FromResultSet.iterate)
        assert not leftover, f'we have some leftover: {leftover}'
        LOGGER.warning('COMPLETED DROP OF ALL NODES')
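
For context, roughly the same shard-scoped drop written directly against gremlinpython looks like the sketch below; the websocket endpoint, the 'shard' property key, and the shard value are assumptions (and where recent gremlinpython spells the id step id_(), older releases spell it id(), as the example above does):

# a standalone sketch of the shard-scoped drop; endpoint and property key are assumptions
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.traversal import TextP

conn = DriverRemoteConnection('ws://localhost:8182/gremlin', 'g')
g = traversal().withRemote(conn)

shard = 'test-shard-1'
# drop everything carrying the shard property...
g.V().has('shard', shard).drop().iterate()
# ...then sweep by id prefix, since id_format makes vertex ids shard-prefixed
leftover = g.V().hasId(TextP.startingWith(shard)).id_().toList()
g.V().hasId(TextP.startingWith(shard)).drop().iterate()
assert not leftover, f'we have some leftover: {leftover}'
conn.close()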
Example No. 4
    def bulk_load_entities(
            self,
            *,
            entities: Mapping[GraphEntityType, Mapping[str, GraphEntity]],
            object_prefix: Optional[str] = None,
            polling_period: int = 10,
            raise_if_failed: bool = False) -> Mapping[str, Mapping[str, Any]]:
        """
        :param entities:  The entities being bulk loaded.  They will be partitioned at least by vertex vs edge, but
        possibly by conflicting property type (though the latter is unusual), and written to files in S3, then loaded
        by Neptune.
        :param object_prefix: (optional)   The string is treated like a format string, and 'now' and 'shard' (if
        get_shard() is truthy) are the available parameters. Defaults to '{now}/{shard}' or '{now}'.
        :param polling_period: (optional) defaults to 10 (seconds).  The period at which the status will be polled.
        :param raise_if_failed: (optional) defaults to False.  If True, will raise if any of the loads failed, otherwise
        log a warning and return the status.  True would be useful for testing or other situations where you would
        always expect the load to succeed.
        :return:
        """
        format_args = dict(now=datetime.datetime.now().isoformat(
            timespec='milliseconds').replace(':', '-').replace('.', '-'))
        shard = get_shard()
        if shard:
            format_args.update(shard=shard)
        if not object_prefix:
            object_prefix = '{now}/{shard}' if 'shard' in format_args else '{now}'
        object_prefix = object_prefix.format(**format_args)

        assert isinstance(object_prefix, str) and ':' not in object_prefix, \
            f'object_prefix is going to break S3: {object_prefix}'

        vertexes, edges = group_by_class(entities)

        # TODO: write these to tmp? stream them in?
        vertex_csvs: List[bytes] = []
        for types in partition_properties(vertexes.keys()):
            with StringIO() as w:
                write_entities_as_csv(w, {t: vertexes[t] for t in types})
                vertex_csvs.append(w.getvalue().encode('utf-8'))

        edge_csvs: List[bytes] = []
        for types in partition_properties(edges.keys()):
            with StringIO() as w:
                write_entities_as_csv(w, {t: edges[t] for t in types})
                edge_csvs.append(w.getvalue().encode('utf-8'))

        csvs: List[Tuple[str, bytes]] = [
            (f'{object_prefix}/vertex{i}.csv', v)
            for i, v in enumerate(vertex_csvs)
        ] + [(f'{object_prefix}/edge{i}.csv', v)
             for i, v in enumerate(edge_csvs)]

        todo: List[str] = []
        for s3_object_key, v in csvs:
            # upload to s3
            with BytesIO(v) as r:
                self.upload(f=r, s3_object_key=s3_object_key)

            # now poke Neptune and tell it to load that file
            # TODO: dependencies? endpoint doesn't seem to like the way we pass these
            response = self.load(s3_object_key=s3_object_key)

            # TODO: retry?
            assert 'payload' in response and 'loadId' in response['payload'], \
                f'failed to submit load for {s3_object_key}: {response}'
            todo.append(response['payload']['loadId'])

        status_by_load_id: Dict[str, Mapping[str, Any]] = dict()

        while todo:
            status_by_load_id.update([
                (load_id,
                 self.load_status(load_id=load_id, errors=True,
                                  errors_per_page=30)['payload'])
                for load_id in todo
            ])
            todo = [
                load_id
                for load_id, overall_status in status_by_load_id.items()
                if overall_status['overallStatus']['status'] not in (
                    'LOAD_COMPLETED', 'LOAD_FAILED')
            ]
            # only sleep if there is still something to poll
            if todo:
                time.sleep(polling_period)

        # TODO: timeout and parse errors
        assert not todo
        failed = {
            load_id: overall_status
            for load_id, overall_status in status_by_load_id.items()
            if overall_status['overallStatus']['status'] != 'LOAD_COMPLETED'
        }
        if failed:
            LOGGER.warning(
                f'some loads failed: {list(failed)}: bulk_loader_details={failed}'
            )
            if raise_if_failed:
                raise AssertionError(f'some loads failed: {list(failed)}')

        return status_by_load_id
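
A hedged usage sketch for the loader above; client stands in for whatever object the surrounding codebase constructs, and the entities mapping is assumed to have been built elsewhere:

# usage sketch; client and entities are placeholders, not a documented API
status = client.bulk_load_entities(
    entities=entities,      # Mapping[GraphEntityType, Mapping[str, GraphEntity]]
    polling_period=5,       # poll the Neptune loader status every 5 seconds
    raise_if_failed=True,   # fail fast in tests instead of just warning
)
for load_id, payload in status.items():
    print(load_id, payload['overallStatus']['status'])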
Example No. 5
    def test_set_shard_works(self) -> None:
        expected = Fixtures.next_string()
        shard_set_explicitly(expected)
        actual = get_shard()
        self.assertEqual(expected, actual)