Example #1
    def index_mapping(self, **kwargs):
        cherrypy.log(str(kwargs))
        output = {'success': False}
        index = kwargs.get('index')
        if index is None:
            msg = "Error: 'index' argument is required"
            output['msg'] = msg
            return output
        idx = Index(index)
        if not idx.exists():
            msg = "Error: index {} does not exist".format(index)
            output['msg'] = msg
            return output

        try:
            mapping = idx.get_mapping()
        except Exception as e:
            msg = "Error: index mapping: reason {}".format(e)
            output['msg'] = msg
            return output

        output['success'] = True
        output['mapping'] = mapping
        return output
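The handler above is a thin wrapper around Index.get_mapping(); a minimal standalone sketch of the same call, assuming a local cluster and a hypothetical index name:

from elasticsearch_dsl import Index, connections

connections.create_connection(hosts=['localhost:9200'])  # assumed local cluster

idx = Index('my-index')  # hypothetical index name
if idx.exists():
    # Returns the raw mapping dict, the same value placed into output['mapping'] above.
    print(idx.get_mapping())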
Example #2
            '_type': 'document',
            'path': f,
            'text': text
        })

    client = Elasticsearch()

    # Tokenizers: whitespace classic standard letter
    my_analyzer = analyzer('default',
                           type='custom',
                           tokenizer=tokenizer(args.token),
                           filter=args.filter)

    try:
        # Drop index if it exists
        ind = Index(index, using=client)
        ind.delete()
    except NotFoundError:
        pass
    # then create it
    ind.settings(number_of_shards=1)
    ind.create()

    ind = Index(index, using=client)

    # configure default analyzer
    ind.close()  # index must be closed for configuring analyzer
    ind.analyzer(my_analyzer)

    # configure the path field so it is not tokenized and we can do exact match search
    client.indices.put_mapping(
Example #3
    def construct_query(self, system=None, file_path=None, **kwargs):
        project_query_fields = [
            "projectId", "title", "description", "doi", "publications", "pis",
            "name"
        ]
        published_index_name = list(
            Index(settings.ES_INDEX_PREFIX.format(
                'publications')).get_alias().keys())[0]
        legacy_index_name = list(
            Index(settings.ES_INDEX_PREFIX.format(
                'publications-legacy')).get_alias().keys())[0]
        filter_queries = []
        if kwargs.get('type_filters'):
            for type_filter in kwargs['type_filters']:
                if type_filter == 'nees':
                    type_query = Q({'term': {'_index': legacy_index_name}})
                else:
                    type_query = Q(
                        'term',
                        **{'project.value.projectType._exact': type_filter})
                filter_queries.append(type_query)

        ds_user_query = Q({
            "nested": {
                "path": "users",
                "ignore_unmapped": True,
                "query": {
                    "query_string": {
                        "query":
                        self.query_string,
                        "fields": [
                            "users.first_name", "users.last_name",
                            "user.username"
                        ],
                        "lenient":
                        True
                    }
                }
            }
        })
        nees_pi_query = Q({
            "nested": {
                "path": "pis",
                "ignore_unmapped": True,
                "query": {
                    "query_string": {
                        "query": self.query_string,
                        "fields": ["pis.firstName", "pis.lastName"],
                        "lenient": True
                    }
                }
            }
        })
        pub_query = Q('query_string',
                      query=self.query_string,
                      default_operator='and',
                      fields=project_query_fields)

        published_query = Q(
            'bool',
            must=[
                Q('bool', should=[ds_user_query, nees_pi_query, pub_query]),
                Q({'term': {
                    '_index': legacy_index_name
                }}),
            ],
            must_not=[
                Q('term', status='unpublished'),
                Q('term', status='saved')
            ])

        return published_query
Example #4
def exists():
    return Index(APIDoc.Index.name).exists()
Example #5
def delete():
    Index(APIDoc.Index.name).delete()
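Examples #4 and #5 appear to come from the same module; a hedged convenience sketch that combines them so the delete is skipped when the index is missing:

def reset():
    # Hypothetical helper: delete only when the index exists,
    # so Elasticsearch never raises a not-found error.
    if exists():
        delete()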
Example #6
# coding:utf-8
'''
@author = super_fazai
@File    : search.py
@Time    : 2017/8/11 10:41
@connect : [email protected]
'''

from elasticsearch_dsl import (
    DocType,
    Index,
)
from scrapy import Field


class Post():
    id = Field()


posts = Index('posts')


@posts.doc_type
class PostDocument(DocType):
    class Meta:
        model = Post

        fields = [
            'id',
        ]
Example #7
File: guesser.py Project: Pinafore/qb-api
def delete():
    try:
        Index(INDEX_NAME).delete()
    except elasticsearch.exceptions.NotFoundError:
        log.info(
            'Could not delete non-existent index, creating new index...')
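A hedged sketch of the companion create step this guard hints at; recreate() is hypothetical and assumes the same INDEX_NAME and default connection:

def recreate():
    # Start from a clean index: drop whatever exists, then create it again.
    delete()
    Index(INDEX_NAME).create()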
Example #8
import os
from elasticsearch_dsl import (Index, tokenizer, analyzer)
from pprint import pprint

movie_index: Index = Index(os.environ.get('ES_INDEX', 'moovie'))

movie_index.settings(number_of_shards=5, number_of_replicas=1)

completion_analyzer = analyzer('completion_analyzer',
                               tokenizer=tokenizer('trigram',
                                                   'nGram',
                                                   min_gram=3,
                                                   max_gram=3),
                               filter=['lowercase'])

normalization_analyzer = analyzer('normalization_analyzer',
                                  tokenizer="standard",
                                  filter=["lowercase", "stop", "snowball"],
                                  char_filter=["html_strip"])

movie_index.analyzer(normalization_analyzer)


def init_index():
    if not movie_index.exists():
        movie_index.create()


def destroy_index():
    if movie_index.exists():
        movie_index.delete(ignore=404)
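A hedged usage sketch for the helpers above, assuming a default connection is registered first (the host is illustrative):

from elasticsearch_dsl import connections

connections.create_connection(hosts=['localhost:9200'])  # assumed local cluster

init_index()     # creates the index with the analyzer and shard settings registered above
# ... index movie documents here ...
destroy_index()  # drops the index; ignore=404 keeps the call idempotent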
Example #9
def get_fields_meta_index():
    fields_meta = Index(constants.FIELDS_INDEX, using=ElasticInstance.get())

    add_analyzer(fields_meta)
    return fields_meta
Example #10
from django.template.loader import render_to_string
from elasticsearch_dsl import Keyword
from django.urls import reverse

from abstract.elastic_models import (
    BASIC_INDEX_SETTINGS,
    AbstractDatasetMapping,
    namesAutocompleteAnalyzer,
    namesAutocompleteSearchAnalyzer,
    ukrainianAddressesStopwordsAnalyzer,
)
from elasticsearch_dsl import DocType, Index

LETS_PARTY_INDEX = "ragoogle_lets_party"
lets_party_idx = Index(LETS_PARTY_INDEX)
lets_party_idx.settings(**BASIC_INDEX_SETTINGS)

lets_party_idx.analyzer(namesAutocompleteAnalyzer)
lets_party_idx.analyzer(namesAutocompleteSearchAnalyzer)
lets_party_idx.analyzer(ukrainianAddressesStopwordsAnalyzer)


@lets_party_idx.doc_type
class ElasticLetsPartyModel(AbstractDatasetMapping):
    start_date = Keyword()
    end_date = Keyword()

    def render_infocard(self):
        from .apps import LetsPartyConfig as AppConfig

        return render_to_string(
Example #11
def index(name):
    if settings.ELASTIC_SEARCH["index_prefix"]:
        name = "%s-%s" % (settings.ELASTIC_SEARCH["index_prefix"], name)
    return Index(name)
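Hypothetical usage of the prefixing helper, assuming settings.ELASTIC_SEARCH = {"index_prefix": "myapp"}:

idx = index('documents')
print(idx._name)  # -> 'myapp-documents' under the assumed settings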
Example #12
def elasticsearch_status(request):
    client = get_es_client()

    disk_fields = ['node', 'disk.avail', 'disk.used', 'disk.percent']
    disk_status = [{
        _to_camel_case(field.replace('.', '_')): disk[field]
        for field in disk_fields
    } for disk in client.cat.allocation(format="json", h=','.join(disk_fields))
                   ]

    index_fields = [
        'index', 'docs.count', 'store.size', 'creation.date.string'
    ]
    indices = [{
        _to_camel_case(field.replace('.', '_')): index[field]
        for field in index_fields
    } for index in client.cat.indices(format="json", h=','.join(index_fields))
               if all(not index['index'].startswith(omit_prefix)
                      for omit_prefix in ['.', 'index_operations_log'])]

    aliases = defaultdict(list)
    for alias in client.cat.aliases(format="json", h='alias,index'):
        aliases[alias['alias']].append(alias['index'])

    mappings = Index('_all', using=client).get_mapping(doc_type='variant')

    active_samples = Sample.objects.filter(
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        is_active=True,
        elasticsearch_index__isnull=False,
    ).prefetch_related('individual', 'individual__family')
    prefetch_related_objects(active_samples, 'individual__family__project')
    seqr_index_projects = defaultdict(lambda: defaultdict(set))
    es_projects = set()
    for sample in active_samples:
        for index_name in sample.elasticsearch_index.split(','):
            project = sample.individual.family.project
            es_projects.add(project)
            if index_name in aliases:
                for aliased_index_name in aliases[index_name]:
                    seqr_index_projects[aliased_index_name][project].add(
                        sample.individual.guid)
            else:
                seqr_index_projects[index_name.rstrip('*')][project].add(
                    sample.individual.guid)

    for index in indices:
        index_name = index['index']
        index_mapping = mappings[index_name]['mappings']['variant']
        index.update(index_mapping.get('_meta', {}))

        projects_for_index = []
        for index_prefix in seqr_index_projects.keys():
            if index_name.startswith(index_prefix):
                projects_for_index += seqr_index_projects.pop(
                    index_prefix).keys()
        index['projects'] = [{
            'projectGuid': project.guid,
            'projectName': project.name
        } for project in projects_for_index]

    errors = [
        '{} does not exist and is used by project(s) {}'.format(
            index, ', '.join([
                '{} ({} samples)'.format(p.name, len(indivs))
                for p, indivs in project_individuals.items()
            ])) for index, project_individuals in seqr_index_projects.items()
        if project_individuals
    ]

    return create_json_response({
        'indices': indices,
        'diskStats': disk_status,
        'elasticsearchHost': ELASTICSEARCH_SERVER,
        'errors': errors,
    })
Example #13
def setup_teardown(es_url, es_object):
    os.environ["ES_URL"] = es_url
    yield
    Index("store", using=es_object.connection).delete()
Example #14
def main():
    # Disable warning about not verifying certificates
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    args = parse_args()
    es = Elasticsearch([args.es],
                        verify_certs=args.verify_certs,
                        timeout=35)
    if args.es_out:
        es_out = Elasticsearch([args.es_out],
                        verify_certs=args.verify_certs,
                        timeout=35)
    else:
        es_out = es

    es_index = args.es_index
    es_index_out = args.es_index_out

    index = Index(name=es_index, using=es)
    mappings = index.get_field_mapping(doc_type='items',
                        fields='is_git_commit,is_gerrit_review')
    for mapping in mappings.values():
        fields = mapping['mappings']['items']
        if 'is_git_commit' in fields:
            src = 'git'
        elif 'is_gerrit_review' in fields:
            src = 'gerrit'
        else:
            print("I couldn't identify data source for index, exiting")
            exit(0)
    print("Identified data source for index: " + src)

    src_fields = {
        'git': {
            'date': 'author_date',
            'repo': 'repo_name',
            'cardinal': 'hash'
            },
        'gerrit': {
            'date': 'opened',
            'repo': 'repository',
            'cardinal': 'number'
            }
        }

    def get_authors(es, es_index, src):

        date_field = src_fields[src]['date']
        repo_field = src_fields[src]['repo']
        cardinal_field = src_fields[src]['cardinal']
        # Buckets by author name, finding first commit for each of them
        s = Search(using=es, index=es_index)
        s.aggs.bucket('by_authors', 'terms', field='author_name', size=100000) \
            .bucket('repos', 'terms', field=repo_field, size=10000) \
            .metric('first_item', 'top_hits',
                _source=[date_field, 'author_org_name',
                    'author_uuid', 'project'],
                size=1, sort=[{date_field: {"order": "asc"}}]) \
            .metric('last_item', 'max', field=date_field) \
            .metric('contribs', 'cardinality', field=cardinal_field)
        s = s.sort(date_field)
        result = s.execute()

        # Get a dataframe with each author and their first commit
        buckets_result = result['aggregations']['by_authors']['buckets']
        buckets = []
        for bucket_author in buckets_result:
            author = bucket_author['key']
            first_all = None
            last_all = None
            buckets_author = []
            for bucket_repo in bucket_author['repos']['buckets']:
                first_item = bucket_repo['first_item']['hits']['hits'][0]
                first_date = first_item['sort'][0]/1000
                first = datetime.utcfromtimestamp(first_date)
                if first_all is None or first < first_all:
                    first_all = first
                last_date = bucket_repo['last_item']['value']/1000
                last = datetime.utcfromtimestamp(last_date)
                if last_all is None or last > last_all:
                    last_all = last
                contribs = bucket_repo['contribs']['value']
                org_name = first_item['_source']['author_org_name']
                project = first_item['_source']['project']
                uuid = first_item['_source']['author_uuid']
                buckets_author.append(
                    {'first': first,
                    'last': last,
                    'author_name': author,
                    'contribs': contribs,
                    'uuid': uuid,
                    'author_org_name': org_name,
                    'repo_name': bucket_repo['key'],
                    'project': project}
                )
            for bucket in buckets_author:
                bucket['first_all'] = first_all
                bucket['last_all'] = last_all
                buckets.append(bucket)
        authors_repos = pd.DataFrame.from_records(buckets)
        authors_repos.sort_values(by='first', ascending=False,
                                inplace=True)
        return(authors_repos)

    authors_repos = get_authors(es, es_index, src=src)
    authors = authors_repos.groupby('author_name').last().reset_index()
    authors.sort_values(by='first', ascending=False, inplace=True)

    print("Creating CSV for first date for authors: " + new_file)
    authors.to_csv(new_file,
                columns=['first', 'last',
                         'author_name', 'contribs', 'author_org_name',
                         'repo_name', 'project'],
                index=False)

    def mapping_es (es, es_index):

        mapping = Mapping('items')
        mapping.field('author_name', String(index='not_analyzed'))
        mapping.field('first', Date())
        mapping.field('last', Date())
        mapping.field('first_all', Date())
        mapping.field('last_all', Date())
        mapping.field('contribs', 'integer')
        mapping.field('author_org_name', String(index='not_analyzed'))
        mapping.field('repo_name', String(index='not_analyzed'))
        mapping.field('project', String(index='not_analyzed'))
        mapping.field('uuid', String(index='not_analyzed'))
        print("Uploading mapping to ElasticSearch")
        mapping.save(es_index, using=es)

    def upload_es (es, es_index, df, columns):

        es_type = 'items'
        actions = []
        for row in df[columns].to_dict(orient='records'):
            id_src = row['uuid']+row['repo_name']
            id = hashlib.sha1(id_src.encode('utf-8', errors='surrogateescape')).hexdigest()
            to_write = {
                '_op_type': 'index',
                '_index': es_index,
                '_type': es_type,
                '_id': id
            }
            to_write.update(row)
            actions.append(to_write)
        print("Uploading to ElasticSearch")
        result = elasticsearch.helpers.bulk(es, actions,
                                            raise_on_error=True,
                                            stats_only=True)
        print("Bulk upload result (succesful / errors): ", result)

    mapping_es(es_out, es_index_out)
    upload_es(es_out, es_index_out, authors_repos,
                ['first', 'last', 'first_all', 'last_all',
                 'author_name', 'uuid', 'contribs', 'author_org_name',
                 'repo_name', 'project'])
Example #15
def test_conflicting_analyzer_raises_error():
    i = Index('i')
    i.analyzer('my_analyzer', tokenizer='whitespace', filter=['lowercase', 'stop'])

    with raises(ValueError):
        i.analyzer('my_analyzer', tokenizer='keyword', filter=['lowercase', 'stop'])
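A hedged follow-up sketch (separate from the test): the first registration is carried in the body that Index.create() would send, which to_dict() exposes:

i = Index('i')
i.analyzer('my_analyzer', tokenizer='whitespace', filter=['lowercase', 'stop'])
# The analyzer appears under settings -> analysis -> analyzer -> my_analyzer.
print(i.to_dict())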
Example #16
def __init__(self):
    movies = Index('imdb', using=es)
    movies.doc_type(Movie)
    movies.delete(ignore=404)
    movies.create()
Example #17
from django.conf import settings

edge_ngram_analyzer = analyzer('edge_ngram_analyzer',
                               type='custom',
                               tokenizer='standard',
                               filter=[
                                   'lowercase',
                                   token_filter('edge_ngram_filter',
                                                type='edgeNGram',
                                                min_gram=1,
                                                max_gram=20)
                               ])


class TitleDoc(DocType):
    id = Keyword()
    domain = Keyword(required=True)
    url = Keyword(required=True, index=False)
    title = Text(required=True,
                 analyzer=edge_ngram_analyzer,
                 search_analyzer='standard')
    popularity = Float()
    group = Keyword()


# create an index and register the doc types
index = Index(settings.ES_INDEX)
index.settings(**settings.ES_INDEX_SETTINGS)
index.doc_type(TitleDoc)
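A hedged search sketch against the registered index, assuming it has been created and populated; the query text is illustrative:

response = index.search().query('match', title='pyth').execute()
for hit in response:
    print(hit.title, hit.popularity)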
Example #18
File: models.py Project: rpkilby/yurika
def index(self):
    return Index(self.index_name)
Example #19
def index_doc(doc, index_name="wiki-dumps"):
    global es
    index = Index(index_name)
    index.create()
Example #20
                if address.get("streetNumber"):
                    address["streetAddress"] += ", " + address["streetNumber"]
                    address["fullAddress"] += ", " + address["streetNumber"]

                    del address["streetNumber"]
        except (ValueError, KeyError, IndexError) as e:
            print(e)
            return address

        return address

    class Meta:
        index = ADDRESSES_INDEX


addresses_idx = Index(ADDRESSES_INDEX)

addresses_idx.settings(number_of_shards=settings.NUM_THREADS, number_of_replicas=0)

addresses_idx.doc_type(Address)

shingle_analyzer = analyzer(
    "shingleAnalyzer",
    tokenizer=tokenizer(
        "ukrainianTokenizer", type="pattern", pattern="[А-ЯЄІЇҐа-яєіїґA-Za-z0-9']+"
    ),
    filter=[
        token_filter(
            "shingleFilter",
            type="shingle",
            max_shingle_size=5,
Example #21
File: guesser.py Project: Pinafore/qb-api
def exists():
    return Index(INDEX_NAME).exists()
Example #22
import faker
from elasticsearch_dsl import Index, Search
from django.test import TestCase
from django_datajsonar.tasks import read_datajson
from django_datajsonar.models import ReadDataJsonTask, Node, Field as datajsonar_Field
from elasticsearch_dsl.connections import connections

from series_tiempo_ar_api.apps.metadata.indexer.catalog_meta_indexer import CatalogMetadataIndexer
from series_tiempo_ar_api.apps.metadata.indexer.index import add_analyzer
from series_tiempo_ar_api.apps.metadata.models import IndexMetadataTask
from series_tiempo_ar_api.apps.management import meta_keys
SAMPLES_DIR = os.path.join(os.path.dirname(__file__), 'samples')

fake = faker.Faker()

fake_index = Index(fake.pystr(max_chars=50).lower())
add_analyzer(fake_index)


class IndexerTests(TestCase):

    def setUp(self):
        self.task = ReadDataJsonTask.objects.create()
        self.meta_task = IndexMetadataTask.objects.create()

    def test_index(self):
        index_ok = self._index(catalog_id='test_catalog', catalog_url='single_distribution.json')
        search = Search(
            index=fake_index._name,
        ).filter('term',
                 catalog_id='test_catalog')
Example #23
        Should return list of family member names
        """
        family = getattr(self.general, "family", None)
        if family:
            for member in family:
                if hasattr(member, "family_name"):
                    yield member.family_name
        else:
            for member in parse_raw_family_string(
                getattr(self.general, "family_raw", "")
            ):
                if "family_name" in member:
                    yield member["family_name"]


declarations_idx = Index(OLD_DECLARATION_INDEX)
declarations_idx.settings(
    number_of_shards=NUMBER_OF_SHARDS, number_of_replicas=NUMBER_OF_REPLICAS
)
declarations_idx.analyzer(namesAutocompleteAnalyzer)
declarations_idx.analyzer(namesAutocompleteSearchAnalyzer)


@declarations_idx.doc_type
class Declaration(DocType, AbstractDeclaration):
    """Declaration document.
    Assumes there's a dynamic mapping with all fields not indexed by default."""

    persons = Text(analyzer="ukrainian", copy_to="all")
    countries = Text(analyzer="ukrainian", copy_to="all")
    companies = Text(analyzer="ukrainian", copy_to="all")
Example #24
    def setUp(self):
        m = get_user_model()
        self.user1 = m.objects.create_user('test1', '*****@*****.**', 'super_password')
        self.user2 = m.objects.create_user('test2', '*****@*****.**', 'super_password')
        self.user1_actions = [
            CustomAction.objects.create(
                owner=self.user1,
                name="Custom action %s" % i,
                description="Custom action %s description" % i
            ) for i in range(0, 10)
            ]

        self.user2_actions = [
            CustomAction.objects.create(
                owner=self.user2,
                name="Custom action %s" % i,
                description="Custom action %s description" % i
            ) for i in range(0, 25)
            ]

        self.user1_campaigns = [
            Campaign.objects.create(
                owner=self.user1,
                name="Campaign %s" % i,
                description="Campaign %s description" % i,
                start=timezone.now()
            ) for i in range(0, 10)
            ]

        self.user2_campaigns = [
            Campaign.objects.create(
                owner=self.user2,
                name="Campaign %s" % i,
                description="Campaign %s description" % i,
                start=timezone.now()
            ) for i in range(0, 25)
            ]

        self.other_users = [
            m.objects.create_user('other_test_%s' % k, '*****@*****.**' % k, 'super_password') for k in
            range(0, 10)
            ]

        self.es = Elasticsearch(settings.ES_CLUSTER)

        c_a_idx_data = [
            {
                'user_id': u.id,
                'page_id': 0,
                'subscription_uuid': self.user1_actions[0].uuid,
                'start': None,
                'end': None,
                'created': timezone.now(),
                'modified': timezone.now(),
                'payload': {
                    'data': 'Super-awesome'
                },
                'status': 'active'
            } for u in self.other_users
            ]

        c_idx_data = [
            {
                'user_id': u.id,
                'page_id': 0,
                'subscription_uuid': self.user1_campaigns[0].uuid,
                'start': None,
                'end': None,
                'created': timezone.now(),
                'modified': timezone.now(),
                'payload': {
                    'data': 'Super-awesome'
                },
                'status': 'active'
            } for u in self.other_users
            ]

        self.campaign_idx = Index(settings.ES_CAMPAIGN_SUBSCRIBERS_IDX, using=self.es)
        self.custom_action_idx = Index(settings.ES_CUSTOM_ACTION_SUBSCRIBERS_IDX, using=self.es)
        self.campaign_idx.doc_type(CampaignSubscriptionObj)
        self.custom_action_idx.doc_type(CustomActionSubscriptionObj)

        for u in self.other_users:
            c = CampaignSubscriptionObj(
                user_id=u.id,
                page_id=0,
                subscription_uuid=self.user1_campaigns[0].uuid,
                start=None,
                end=None,
                created=timezone.now(),
                modified=timezone.now(),
                status='active'
            )
            c.save(using=self.es)

            c = CustomActionSubscriptionObj(
                user_id=u.id,
                page_id=0,
                subscription_uuid=self.user1_actions[0].uuid,
                start=None,
                end=None,
                created=timezone.now(),
                modified=timezone.now(),
                status='active'
            )
            c.save(using=self.es)
            sleep(1)
Example #25
def refresh():

    index = Index(APIDoc.Index.name)
    index.refresh()
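An explicit refresh makes just-saved documents searchable without waiting for the refresh interval; a hedged sketch (APIDoc's fields are not shown in this excerpt, so the document is left empty):

doc = APIDoc()  # hypothetical document instance; real fields depend on the mapping
doc.save()
refresh()       # the document saved above is now visible to searches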
Example #26
from urllib import parse
from os.path import splitext, basename

res = requests.get('http://localhost:9200')
print(res.content)
es = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
client = Elasticsearch()

fimgin = open('./image_data.json', "r")
img_list = json.load(fimgin)

m = Mapping()
m.field('image_id', 'long')

idx1 = Index('idx1')
idx1.mapping(m)

for CDict in img_list:
    image_id = CDict["image_id"]
    url = CDict["url"]
    imagefile = os.path.basename(url)
    CDict['imagefile'] = imagefile
    print("Imagefile= ", imagefile)

    img_str = json.dumps(CDict)
    es.index(index='idx1', body=json.loads(img_str))


def get_imgfile(img_id):
    s = Search(index='idx1').query('match', image_id=img_id)
Example #27
def test_search_is_limited_to_index_name():
    i = Index('my-index')
    s = i.search()

    assert s._index == ['my-index']
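A related hedged sketch: the index restriction persists as the Search object is chained further.

s2 = Index('my-index').search().query('match_all')
assert s2._index == ['my-index']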
Example #28
from elasticsearch_dsl import DocType, Index, Integer
from elasticsearch_dsl.connections import connections

from paginatify_elasticsearch_dsl import Pagination

conn = connections.create_connection(hosts=['localhost'])

index = Index('test-paginatify-elasticsearch-dsl')


@index.doc_type
class Item(DocType):
    id = Integer(index='not_analyzed')


def paginate(count, page=1):
    if conn.indices.exists(index._name):
        index.delete()
    index.create()
    try:
        Item.init()
        for i in range(1, count + 1):
            Item(id=i, meta={'id': i}).save(refresh=True)
        return Pagination(Item.search().sort('id'),
                          page=page,
                          map_=lambda x: x.id,
                          per_page=3,
                          per_nav=3)
    finally:
        index.delete()
Example #29
File: views.py Project: owaisj/portal
    def get(self, request):
        """GET handler."""
        q = request.GET.get('query_string')
        offset = int(request.GET.get('offset', 0))
        limit = int(request.GET.get('limit', 10))
        if limit > 500:
            return HttpResponseBadRequest("limit must not exceed 500")
        type_filter = request.GET.get('type_filter', 'all')

        doc_type_map = {
            list(Index(settings.ES_INDEX_PREFIX.format(
                'publications')).get_alias().keys())[0]: 'publication',
            list(Index(settings.ES_INDEX_PREFIX.format(
                'publications-legacy')).get_alias().keys())[0]: 'publication',
            list(Index(settings.ES_INDEX_PREFIX.format(
                'files')).get_alias().keys())[0]: 'file',
            list(Index(settings.ES_INDEX_PREFIX.format(
                'cms')).get_alias().keys())[0]: 'modelresult'
        }

        public_files_query = CommunityDataSearchManager(
            request).construct_query() | PublishedDataSearchManager(
                request).construct_query()
        publications_query = PublicationsSearchManager(
            request).construct_query()
        cms_query = es_query = CMSSearchManager(request).construct_query()

        if type_filter == 'public_files':
            es_query = Search().query(public_files_query)
        elif type_filter == 'published':
            es_query = Search().query(publications_query)
        elif type_filter == 'cms':
            es_query = Search().query(cms_query).highlight(
                'body',
                fragment_size=100).highlight_options(pre_tags=["<b>"],
                                                     post_tags=["</b>"],
                                                     require_field_match=False)
        elif type_filter == 'all':
            es_query = Search().query(public_files_query | publications_query
                                      | cms_query).highlight(
                                          'body',
                                          fragment_size=100).highlight_options(
                                              pre_tags=["<b>"],
                                              post_tags=["</b>"],
                                              require_field_match=False)

        try:
            res = es_query.execute()
        except (TransportError, ConnectionTimeout) as err:
            if getattr(err, 'status_code', 500) == 404:
                raise
            res = es_query.execute()

        out = {}
        hits = []

        for r in res:
            d = r.to_dict()
            d["doc_type"] = doc_type_map[r.meta.index]
            if hasattr(r.meta, 'highlight'):
                highlight = r.meta.highlight.to_dict()
                d["highlight"] = highlight
            if r.meta.doc_type == 'publication' and hasattr(r, 'users'):
                users = r.users
                pi = r.project.value.pi
                pi_user = [x for x in users if x.username == pi][0]
                d["piLabel"] = "{}, {}".format(pi_user.last_name,
                                               pi_user.first_name)
            hits.append(d)

        out['total_hits'] = res.hits.total.value
        out['hits'] = hits
        out['all_total'] = Search().query(public_files_query
                                          | publications_query
                                          | cms_query).count()
        out['public_files_total'] = Search().query(public_files_query).count()
        out['published_total'] = Search().query(publications_query).count()
        out['cms_total'] = Search().query(cms_query).count()

        return JsonResponse(out, safe=False)
Example #30
def update_record_statistics(start_date=None, end_date=None):
    """Update "_stats" field of affected records."""
    start_date = dateutil_parse(start_date) if start_date else None
    end_date = dateutil_parse(end_date) if end_date else None
    aggr_configs = {}

    if not start_date and not end_date:
        start_date = datetime.utcnow()
        end_date = datetime.utcnow()

        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(name=aggr_cfg.name,
                                             **aggr_cfg.aggregator_config)

            if not Index(aggr.aggregation_alias, using=aggr.client).exists():
                if not Index(aggr.event_index, using=aggr.client).exists():
                    start_date = min(start_date, datetime.utcnow())
                else:
                    start_date = min(start_date,
                                     aggr._get_oldest_event_timestamp())

            # Retrieve the last two bookmarks
            bookmarks = Search(using=aggr.client,
                               index=aggr.aggregation_alias,
                               doc_type=aggr.bookmark_doc_type)[0:2].sort({
                                   'date': {
                                       'order': 'desc'
                                   }
                               }).execute()

            if len(bookmarks) >= 1:
                end_date = max(
                    end_date,
                    datetime.strptime(bookmarks[0].date, aggr.doc_id_suffix))
            if len(bookmarks) == 2:
                start_date = min(
                    start_date,
                    datetime.strptime(bookmarks[1].date, aggr.doc_id_suffix))

            aggr_configs[aggr.aggregation_alias] = aggr
    elif start_date and end_date:
        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(name=aggr_cfg.name,
                                             **aggr_cfg.aggregator_config)
            aggr_configs[aggr.aggregation_alias] = aggr
    else:
        return

    # Get conceptrecids for all the affected records between the two dates
    conceptrecids = set()
    for aggr_alias, aggr in aggr_configs.items():
        query = Search(
            using=aggr.client,
            index=aggr.aggregation_alias,
            doc_type=aggr.aggregation_doc_type,
        ).filter('range',
                 timestamp={
                     'gte':
                     start_date.replace(microsecond=0).isoformat() + '||/d',
                     'lte':
                     end_date.replace(microsecond=0).isoformat() + '||/d'
                 }).extra(_source=False)
        query.aggs.bucket('ids', 'terms', field='conceptrecid', size=0)
        conceptrecids |= {
            b.key
            for b in query.execute().aggregations.ids.buckets
        }

    indexer = RecordIndexer()
    for concpetrecid_val in conceptrecids:
        conceptrecid = PersistentIdentifier.get('recid', concpetrecid_val)
        pv = PIDVersioning(parent=conceptrecid)
        children_recids = pv.children.all()
        indexer.bulk_index([str(p.object_uuid) for p in children_recids])