Example #1
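The handle() method of a Django management command: when index filters are present in the options, only the matching indices are processed; otherwise index_builder.build() rebuilds every registered index.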
    def handle(self, *args, **options):
        """Command handle."""
        if self.has_filter(options):
            self.filter_indices(options)
        else:
            # Process all indices.
            index_builder.build()
Example #2
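The same command extended with a verbosity option, which is parsed from the options and passed on to filter_indices().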
    def handle(self, *args, **options):
        """Command handle."""
        verbosity = int(options["verbosity"])

        if self.has_filter(options):
            self.filter_indices(options, verbosity)
        else:
            # Process all indices.
            index_builder.build()
Example #3
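A test-case setUp() for a collection viewset; after the parent setUp() completes, index_builder.build() reindexes the objects loaded from fixtures.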
    def setUp(self):
        self.collection1 = Collection.objects.get(pk=1)

        self.resource_name = 'collection'
        self.viewset = CollectionViewSet

        self.post_data = {
            'name': 'Test collection',
            'slug': 'test_collection',
        }

        super().setUp()

        # Reindex data objects as they are loaded in fixtures.
        # TODO: Remove this when we get rid of fixtures.
        from resolwe.elastic.builder import index_builder
        index_builder.build()
Example #4
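An analogous setUp() for a data viewset, preparing a payload that references a collection and a process before rebuilding the index.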
    def setUp(self):
        self.data1 = Data.objects.get(pk=1)

        self.resource_name = 'data'
        self.viewset = DataViewSet

        self.data = {
            'name': 'New data',
            'slug': 'new_data',
            'collections': ['1'],
            'process': 'test_process',
        }

        super().setUp()

        # Reindex data objects as they are loaded in fixtures.
        # TODO: Remove this when we get rid of fixtures.
        from resolwe.elastic.builder import index_builder
        index_builder.build()
Example #5
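A later variant of the same data-viewset setup, where the payload references the collection by id and the process by slug as nested objects.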
    def setUp(self):
        self.data1 = Data.objects.get(pk=1)

        self.resource_name = "data"
        self.viewset = DataViewSet

        self.data = {
            "name": "New data",
            "slug": "new_data",
            "collection": {
                "id": 1
            },
            "process": {
                "slug": "test_process"
            },
        }

        super().setUp()

        # Reindex data objects as they are loaded in fixtures.
        # TODO: Remove this when we get rid of fixtures.
        from resolwe.elastic.builder import index_builder

        index_builder.build()
Example #6
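A test of bulk indexing: the index is deleted and then rebuilt from an empty queryset, a one-object queryset, a single object, and finally the full queryset defined by the index, with the document count asserted after each step.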
    def test_bulk_indexing(self):
        from .test_app.models import TestModel
        from .test_app.elastic_indexes import TestSearchDocument

        first_obj = TestModel.objects.create(name='First name', number=42)
        TestModel.objects.create(name='Second name', number=43)

        # Delete whole index
        index_builder.delete()
        es_objects = TestSearchDocument.search().execute()
        self.assertEqual(len(es_objects), 0)

        # Build empty queryset
        index_builder.build(queryset=TestModel.objects.none())
        es_objects = TestSearchDocument.search().execute()
        self.assertEqual(len(es_objects), 0)

        # Build only the subset of queryset defined in index
        index_builder.build(queryset=TestModel.objects.filter(pk=first_obj.pk))
        es_objects = TestSearchDocument.search().execute()
        self.assertEqual(len(es_objects), 1)

        # Delete whole index
        index_builder.delete()
        es_objects = TestSearchDocument.search().execute()
        self.assertEqual(len(es_objects), 0)

        # Build only object
        index_builder.build(obj=first_obj)
        es_objects = TestSearchDocument.search().execute()
        self.assertEqual(len(es_objects), 1)

        # Delete whole index
        index_builder.delete()
        es_objects = TestSearchDocument.search().execute()
        self.assertEqual(len(es_objects), 0)

        # Build whole queryset defined in index
        index_builder.build()
        es_objects = TestSearchDocument.search().execute()
        self.assertEqual(len(es_objects), 2)
Example #7
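A command that bulk-imports Mapping rows from tab-separated files: rows are validated and deduplicated in memory, inserted in a single INSERT backed by a LEFT JOIN anti-join that skips rows already present, and the newly inserted rows are reindexed.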
    def handle(self, *args, **options):
        """Command handle."""
        count_total, count_inserted = 0, 0
        to_index = []

        relation_type_choices = list(zip(*Mapping.RELATION_TYPE_CHOICES))[0]

        for tab_file_name, _, tab_file in decompress(options['file_name']):
            logger.info(__("Importing mappings from \"{}\"...", tab_file_name))

            mappings = set()
            for row in csv.DictReader(tab_file, delimiter=str('\t')):
                if row['relation_type'] not in relation_type_choices:
                    raise ValidationError("Unknown relation type: {}".format(
                        row['relation_type']))

                # NOTE: For performance reasons this is a tuple instead of a dict.
                #       A tuple can be hashed, so it can be used in an `in`
                #       operation, and it serializes to a JSON list.
                #       Make sure that any changes are also reflected in the SQL
                #       query below.
                mapping = (
                    row['relation_type'],
                    row['source_db'],
                    row['source_id'],
                    row['source_species'],
                    row['target_db'],
                    row['target_id'],
                    row['target_species'],
                )

                if mapping in mappings:
                    raise ValidationError(
                        "Duplicated mapping (relation type: '{}', source db: '{}', source id: "
                        "'{}', source species: {}, target db: '{}', target id: '{}', "
                        "target species: {}) found in '{}'".format(
                            row['relation_type'], row['source_db'],
                            row['source_id'], row['source_species'],
                            row['target_db'], row['target_id'],
                            row['target_species'], tab_file_name))

                mappings.add(mapping)

            with connection.cursor() as cursor:
                cursor.execute(
                    """
                    WITH tmp AS (
                        INSERT INTO {table_name} (
                            relation_type, source_db, source_id, source_species,
                            target_db, target_id, target_species
                        )
                        SELECT
                            value->>0, value->>1, value->>2, value->>3,
                            value->>4, value->>5, value->>6
                        FROM json_array_elements(%s)
                        LEFT JOIN {table_name}
                            ON value->>0 = {table_name}.relation_type
                            AND value->>1 = {table_name}.source_db
                            AND value->>2 = {table_name}.source_id
                            AND value->>3 = {table_name}.source_species
                            AND value->>4 = {table_name}.target_db
                            AND value->>5 = {table_name}.target_id
                            AND value->>6 = {table_name}.target_species
                        WHERE {table_name}.relation_type IS NULL
                        RETURNING id
                    )
                    SELECT
                        COALESCE(array_agg(id), ARRAY[]::INTEGER[]) AS ids,
                        COUNT(*) AS count_inserted
                    FROM tmp;
                    """.format(
                        table_name=Mapping._meta.db_table,  # pylint: disable=no-member,protected-access
                    ),
                    params=[json.dumps(list(mappings))])
                result = cursor.fetchone()

            to_index.extend(result[0])

            count_total += len(mappings)
            count_inserted += result[1]

        index_builder.build(queryset=Mapping.objects.filter(id__in=to_index))

        logger.info(  # pylint: disable=logging-not-lazy
            "Total mappings: %d. Inserted %d, unchanged %d." %
            (count_total, count_inserted, count_total - count_inserted))
Example #8
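A command that bulk-imports Feature rows using a PostgreSQL upsert (ON CONFLICT ... DO UPDATE); the xmax system column distinguishes inserts (xmax = 0) from updates, and only changed rows are reindexed.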
    def handle(self, *args, **options):
        """Command handle."""
        count_total, count_inserted, count_updated = 0, 0, 0
        to_index = []

        type_choices = list(zip(*Feature.TYPE_CHOICES))[0]
        subtype_choices = list(zip(*Feature.SUBTYPE_CHOICES))[0]

        for tab_file_name, tab_file in decompress(options['file_name']):
            logger.info(__("Importing features from \"{}\"...", tab_file_name))

            features = []
            unique_features = set()
            for row in csv.DictReader(tab_file, delimiter=str('\t')):
                sub_type = SUBTYPE_MAP.get(row['Gene type'], 'other')

                if row['Type'] not in type_choices:
                    raise ValidationError("Unknown type: {}".format(
                        row['Type']))
                if sub_type not in subtype_choices:
                    raise ValidationError(
                        "Unknown subtype: {}".format(sub_type))

                aliases_text = row['Aliases'].strip()
                aliases = []
                if aliases_text and aliases_text != '-':
                    aliases = aliases_text.split(',')

                if (row['Source'], row['ID']) in unique_features:
                    raise ValidationError(
                        "Duplicated feature (source: '{}', id: '{}') found in '{}'"
                        .format(row['Source'], row['ID'], tab_file_name))

                # NOTE: For performance reasons this is a list instead of a dict.
                #       Make sure that any changes are also reflected in the SQL
                #       query below.
                features.append([
                    row['Source'],
                    row['ID'],
                    row['Species'],
                    row['Type'],
                    sub_type,
                    row['Name'],
                    row['Full name'],
                    row['Description'],
                    aliases,
                ])
                unique_features.add((row['Source'], row['ID']))

            with connection.cursor() as cursor:
                cursor.execute(
                    """
                    WITH tmp AS (
                        INSERT INTO {table_name} (
                            source, feature_id, species, type,
                            sub_type, name, full_name, description,
                            aliases
                        )
                        SELECT
                            value->>0, value->>1, value->>2, value->>3,
                            value->>4, value->>5, value->>6, value->>7,
                            ARRAY(SELECT json_array_elements_text(value->8))
                        FROM json_array_elements(%s)
                        ON CONFLICT (species, source, feature_id) DO UPDATE
                        SET
                            type = EXCLUDED.type,
                            sub_type = EXCLUDED.sub_type,
                            name = EXCLUDED.name,
                            full_name = EXCLUDED.full_name,
                            description = EXCLUDED.description,
                            aliases = EXCLUDED.aliases
                        WHERE (
                            {table_name}.type, {table_name}.sub_type, {table_name}.name,
                            {table_name}.full_name, {table_name}.description, {table_name}.aliases
                        ) IS DISTINCT FROM (
                            EXCLUDED.type, EXCLUDED.sub_type, EXCLUDED.name,
                            EXCLUDED.full_name, EXCLUDED.description, EXCLUDED.aliases
                        )
                        RETURNING id, xmax
                    )
                    SELECT
                        COALESCE(array_agg(id), ARRAY[]::INTEGER[]) AS ids,
                        COUNT(CASE WHEN xmax = 0 THEN 1 END) AS count_inserted,
                        COUNT(CASE WHEN xmax != 0 THEN 1 END) AS count_updated
                    FROM tmp;
                    """.format(
                        table_name=Feature._meta.db_table,  # pylint: disable=no-member,protected-access
                    ),
                    params=[json.dumps(features)])
                result = cursor.fetchone()

            to_index.extend(result[0])

            count_total += len(features)
            count_inserted += result[1]
            count_updated += result[2]

        index_builder.build(queryset=Feature.objects.filter(id__in=to_index))

        logger.info(  # pylint: disable=logging-not-lazy
            "Total features: %d. Inserted %d, updated %d, unchanged %d." %
            (count_total, count_inserted, count_updated,
             count_total - count_inserted - count_updated))
Example #9
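The Mapping import rewritten to rely on ON CONFLICT DO NOTHING instead of the LEFT JOIN anti-join from Example #7.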
    def handle(self, *args, **options):
        """Command handle."""
        count_total, count_inserted = 0, 0
        to_index = []

        relation_type_choices = list(zip(*Mapping.RELATION_TYPE_CHOICES))[0]

        for tab_file_name, tab_file in decompress(options['file_name']):
            logger.info(__("Importing mappings from \"{}\"...", tab_file_name))

            mappings = set()
            for row in csv.DictReader(tab_file, delimiter=str('\t')):
                if row['relation_type'] not in relation_type_choices:
                    raise ValidationError(
                        "Unknown relation type: {}".format(row['relation_type'])
                    )

                # NOTE: For performance reasons this is a tuple instead of a dict.
                #       A tuple can be hashed, so it can be used in an `in`
                #       operation, and it serializes to a JSON list.
                #       Make sure that any changes are also reflected in the SQL
                #       query below.
                mapping = (
                    row['relation_type'],
                    row['source_db'],
                    row['source_id'],
                    row['source_species'],
                    row['target_db'],
                    row['target_id'],
                    row['target_species'],
                )

                if mapping in mappings:
                    raise ValidationError(
                        "Duplicated mapping (relation type: '{}', source db: '{}', source id: "
                        "'{}', source species: {}, target db: '{}', target id: '{}', "
                        "target species: {}) found in '{}'".format(
                            row['relation_type'], row['source_db'], row['source_id'],
                            row['source_species'], row['target_db'], row['target_id'],
                            row['target_species'], tab_file_name
                        )
                    )

                mappings.add(mapping)

            with connection.cursor() as cursor:
                cursor.execute(
                    """
                    WITH tmp AS (
                        INSERT INTO {table_name} (
                            relation_type, source_db, source_id, source_species,
                            target_db, target_id, target_species
                        )
                        SELECT
                            value->>0, value->>1, value->>2, value->>3,
                            value->>4, value->>5, value->>6
                        FROM json_array_elements(%s)
                        ON CONFLICT DO NOTHING -- conflict means that mapping is already present
                        RETURNING id
                    )
                    SELECT
                        COALESCE(array_agg(id), ARRAY[]::INTEGER[]) AS ids,
                        COUNT(*) AS count_inserted
                    FROM tmp;
                    """.format(
                        table_name=Mapping._meta.db_table,  # pylint: disable=no-member,protected-access
                    ),
                    params=[json.dumps(list(mappings))]
                )
                result = cursor.fetchone()

            to_index.extend(result[0])

            count_total += len(mappings)
            count_inserted += result[1]

        index_builder.build(queryset=Mapping.objects.filter(id__in=to_index))

        logger.info(  # pylint: disable=logging-not-lazy
            "Total mappings: %d. Inserted %d, unchanged %d." %
            (count_total, count_inserted, count_total - count_inserted)
        )
Example #10
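The simplest rebuild command: build all indices without pushing, then push everything to Elasticsearch in a single step.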
    def handle(self, *args, **options):
        """Command handle."""
        index_builder.build(push=False)
        index_builder.push()
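Taken together, the examples above cover the entire index_builder surface they use: build() with optional queryset, obj, and push arguments, plus delete() and push(). Below is a minimal sketch of a custom rebuild command composed from only those calls; the myapp module, MyModel, and its active field are hypothetical placeholders, not part of resolwe.

    from django.core.management.base import BaseCommand

    from resolwe.elastic.builder import index_builder

    from myapp.models import MyModel  # hypothetical model with a registered index


    class Command(BaseCommand):
        """Rebuild the search index for a subset of objects (illustrative sketch)."""

        def handle(self, *args, **options):
            """Command handle."""
            # Drop all existing documents first, as in Example #6.
            index_builder.delete()
            # Rebuild documents for a filtered queryset only; `active` is a
            # hypothetical field used purely for illustration.
            index_builder.build(queryset=MyModel.objects.filter(active=True))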