Exemplo n.º 1
0
class TopicModellingIndex(es.Document):
    """Metadata and quality metrics for one topic-modelling run over a corpus."""

    # --- Source corpus / run identification ---
    corpus = es.Keyword()
    source = es.Keyword()
    number_of_documents = es.Integer()
    # Lifecycle flags: training finished / per-topic info has been filled in.
    is_ready = es.Boolean()
    has_topic_info = es.Boolean()
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()

    # Date range of the documents included in this run.
    datetime_from = es.Date()
    datetime_to = es.Date()

    # --- Model configuration ---
    algorithm = es.Keyword()
    number_of_topics = es.Integer()
    hierarchical = es.Boolean()
    meta_parameters = es.Object()

    # --- Model quality metrics (presumably BigARTM-style scores — confirm) ---
    perplexity = es.Float()
    purity = es.Float()
    contrast = es.Float()
    coherence = es.Float()

    # Regularizer weights (tau) for the trained model.
    tau_smooth_sparse_theta = es.Float()
    tau_smooth_sparse_phi = es.Float()
    tau_decorrelator_phi = es.Float()
    tau_coherence_phi = es.Float()

    # Per-topic details; see the Topic inner-doc declared elsewhere in the file.
    topics = es.Nested(Topic)

    is_actualizable = es.Boolean()

    class Index:
        # Index name / client come from project-level configuration constants.
        name = ES_INDEX_TOPIC_MODELLING
        using = ES_CLIENT
Exemplo n.º 2
0
class Job(es.DocType):
    """Elasticsearch document for a crawled (French-language) job offer.

    Declares the custom French text analyzers and the technology-synonym
    analyzer inline, then maps the offer fields.

    NOTE(review): this uses the legacy elasticsearch-dsl API (``DocType``,
    ``es.String``, ``index='not_analyzed'``/``'no'``), which implies an
    Elasticsearch 2.x-era mapping — confirm the target cluster version.
    """

    class Meta:
        index = 'jobs'
        doc_type = 'job-offer'

    # Strip French elision articles (l', d', qu', ...) before stemming.
    french_elision = es.token_filter('french_elision',
                                     type='elision',
                                     articles_case=True,
                                     articles=[
                                         'l', 'm', 't', 'qu', 'n', 's', 'j',
                                         'd', 'c', 'jusqu', 'quoiqu', 'lorsqu',
                                         'puisqu'
                                     ])

    # Built-in French stopword list.
    french_stopwords = es.token_filter('french_stopwords',
                                       type='stop',
                                       stopwords='_french_')

    # Do not include this filter if keywords is empty
    french_keywords = es.token_filter('french_keywords',
                                      type='keyword_marker',
                                      keywords=[])

    # Light French stemmer (less aggressive than the default 'french').
    french_stemmer = es.token_filter('french_stemmer',
                                     type='stemmer',
                                     language='light_french')

    # Full-text analyzer for French prose; also strips embedded HTML.
    french_analyzer = es.analyzer(
        'french_analyzer',
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            french_elision,
            french_stopwords,
            # french_keywords,
            french_stemmer
        ],
        char_filter=['html_strip'])

    # Split technology lists on spaces and/or commas.
    technologies_tokenizer = es.tokenizer('comma_tokenizer',
                                          type='pattern',
                                          pattern=' |,|, ')

    # Normalize language-name spellings (c++/cpp, c#/c♯, .net, ...) to
    # canonical tokens so searches match all variants.
    technologies_synonyms_filter = es.token_filter(
        'technologies_synonyms',
        type='synonym',
        synonyms=[
            'c => c_language', 'c++, cpp => cpp_language',
            'c/c++, c/cpp => c_language', 'c/c++, c/cpp => cpp_language',
            'c#, c♯, csharp => csharp_language',
            'f#, f♯, fsharp => fsharp_language', 'c#, c♯, csharp => dotnet',
            'f#, f♯, fsharp => dotnet', '.net => dotnet'
        ])

    technologies_analyzer = es.analyzer(
        'technologies_analyzer',
        tokenizer=technologies_tokenizer,
        filter=['lowercase', 'asciifolding', technologies_synonyms_filter])

    company_name_analyzer = es.analyzer('company_name_analyzer',
                                        tokenizer='standard',
                                        filter=['lowercase', 'asciifolding'])

    id = es.Integer()

    # URLs are stored but never searched; source is an exact-match facet.
    url = es.String(index='no')
    source = es.String(index='not_analyzed')

    # Title/description get a French full-text analysis plus a 'technologies'
    # sub-field analyzed with the synonym-normalizing analyzer.
    title = es.String(
        analyzer=french_analyzer,
        fields={'technologies': es.String(analyzer=technologies_analyzer)})

    description = es.String(
        analyzer=french_analyzer,
        fields={'technologies': es.String(analyzer=technologies_analyzer)})

    company = es.String(analyzer=company_name_analyzer)

    company_url = es.String(index='no')

    address = es.String(analyzer=french_analyzer)
    address_is_valid = es.Boolean()

    # Weighted tags; Tag is a project-declared doc class (defined elsewhere).
    tags = es.Nested(doc_class=Tag,
                     properties=dict(tag=es.String(index='not_analyzed'),
                                     weight=es.Integer()))

    publication_datetime = es.Date()
    # True when the crawler had to invent the publication date.
    publication_datetime_is_fake = es.Boolean()

    crawl_datetime = es.Date()

    geolocation = es.GeoPoint()
    geolocation_is_valid = es.Boolean()

    def __init__(self, meta=None, **kwargs):
        """Create the document and rewrite the index to its computed name.

        compute_index_name is a project helper (opaque here) — presumably it
        derives e.g. a time-suffixed index name from the base name.
        """
        super(Job, self).__init__(meta, **kwargs)
        self._doc_type.index = compute_index_name(self.index)

    @property
    def index(self):
        # Effective index name (after the __init__ rewrite above).
        return self._doc_type.index

    @property
    def doc_type(self):
        # Mapping type name ('job-offer').
        return self._doc_type.name

    @property
    def published(self):
        # Locale-formatted publication date for display.
        return format_date(self.publication_datetime, locale='FR_fr')

    @property
    def published_in_days(self):
        # TODO: bugfix — naive datetime.now() minus publication_datetime may
        # mix naive/aware datetimes or rely on fake dates; confirm and fix.
        delta = datetime.now() - self.publication_datetime  # TODO: bugfix
        return format_timedelta(delta, granularity='day', locale='en_US')

    @property
    def alltags(self):
        """Display tags that are NOT in the module-level condition_tags set."""
        tags = []
        if self.tags:
            for tag in self.tags:
                # 'condition_tags' here resolves to the module-level name
                # (class attributes are not in scope inside method bodies).
                if tag['tag'] not in condition_tags:
                    tags.append(Tag2(tag['tag'], tag['weight']))
        return tags

    @property
    def condition_tags(self):
        """Display tags that ARE in the module-level condition_tags set."""
        tags = []
        if self.tags:
            for tag in self.tags:
                if tag['tag'] in condition_tags:
                    # Condition tags additionally carry a CSS class.
                    tag = Tag2(tag['tag'], tag['weight'],
                               Tag2.get_css(tag['tag']))
                    tags.append(tag)
        return tags
Exemplo n.º 3
0
class Nummeraanduiding(es.DocType):
    """
    All BAG objects should have one or more addresses.

    A "nummeraanduiding" (number designation), colloquially also called an
    address, is a designation assigned as such by the competent municipal
    body to a "verblijfsobject" (residential object), "standplaats" (pitch)
    or "ligplaats" (berth).

    [Stelselpedia](http://www.amsterdam.nl/stelselpedia/bag-index/catalogus-bag/objectklasse-2/)
    """
    # Street name: project 'adres' analyzer plus exact-match ('raw') and
    # edge-ngram autocomplete sub-fields.
    straatnaam = es.Text(analyzer=analyzers.adres,
                         fields={
                             'raw':
                             es.Keyword(),
                             'ngram_edge':
                             es.Text(analyzer=analyzers.autocomplete,
                                     search_analyzer='standard')
                         })

    straatnaam_keyword = es.Keyword()

    # NEN-standardized street-name variant, same sub-field layout.
    straatnaam_nen = es.Text(analyzer=analyzers.adres,
                             fields={
                                 'raw':
                                 es.Keyword(),
                                 'ngram_edge':
                                 es.Text(analyzer=analyzers.autocomplete,
                                         search_analyzer='standard')
                             })

    straatnaam_nen_keyword = es.Keyword()

    # PTT (postal) street-name variant; also gets a lowercased keyword.
    straatnaam_ptt = es.Text(analyzer=analyzers.adres,
                             fields={
                                 'raw':
                                 es.Keyword(),
                                 'ngram_edge':
                                 es.Text(analyzer=analyzers.autocomplete,
                                         search_analyzer='standard'),
                                 'keyword':
                                 es.Keyword(normalizer=analyzers.lowercase),
                             })

    straatnaam_ptt_keyword = es.Keyword()

    adres = es.Text(analyzer=analyzers.adres,
                    fields={
                        'raw':
                        es.Keyword(),
                        'ngram_edge':
                        es.Text(analyzer=analyzers.autocomplete,
                                search_analyzer='standard'),
                    })

    # Composed full-address variants (default / NEN / PTT / with postcode);
    # note these use a plain 'ngram' sub-field rather than 'ngram_edge'.
    comp_address = es.Text(analyzer=analyzers.adres,
                           fields={
                               'raw':
                               es.Keyword(),
                               'ngram':
                               es.Text(analyzer=analyzers.autocomplete,
                                       search_analyzer='standard')
                           })
    comp_address_nen = es.Text(analyzer=analyzers.adres,
                               fields={
                                   'raw':
                                   es.Keyword(),
                                   'ngram':
                                   es.Text(analyzer=analyzers.autocomplete,
                                           search_analyzer='standard')
                               })
    comp_address_ptt = es.Text(analyzer=analyzers.adres,
                               fields={
                                   'raw':
                                   es.Keyword(),
                                   'ngram':
                                   es.Text(analyzer=analyzers.autocomplete,
                                           search_analyzer='standard')
                               })
    comp_address_pcode = es.Text(analyzer=analyzers.adres,
                                 fields={
                                     'raw':
                                     es.Keyword(),
                                     'ngram':
                                     es.Text(analyzer=analyzers.autocomplete,
                                             search_analyzer='standard')
                                 })

    # House number, with a text variation sub-field for fuzzy matching.
    huisnummer = es.Integer(
        fields={'variation': es.Text(analyzer=analyzers.huisnummer)})

    # House-number suffix ("toevoeging"), e.g. "3 hs" / "3 A".
    toevoeging = es.Text(analyzer=analyzers.toevoeging,
                         fields={'keyword': es.Keyword()})

    # to return official bag fields
    bag_toevoeging = es.Keyword()
    bag_huisletter = es.Keyword()
    woonplaats = es.Keyword()

    # 'postcode_fields' is a module-level sub-field mapping (defined elsewhere).
    postcode = es.Text(
        analyzer=analyzers.postcode,
        fields=postcode_fields,
    )

    order = es.Integer()

    # True for the main address of the object ("hoofdadres").
    hoofdadres = es.Boolean()
    # Status of the nummeraanduiding itself (code + description).
    status = es.Nested(
        properties={
            'code': es.Keyword(normalizer=analyzers.lowercase),
            'omschrijving': es.Text()
        })

    # Status of the linked verblijfsobject.
    vbo_status = es.Nested(
        properties={
            'code': es.Keyword(normalizer=analyzers.lowercase),
            'omschrijving': es.Text()
        })

    subtype = es.Keyword()
    _display = es.Keyword()

    # National BAG id; 'nozero' strips leading zeros for lenient lookups.
    landelijk_id = es.Text(analyzer=analyzers.autocomplete,
                           fields={
                               'raw': es.Keyword(),
                               'nozero': es.Text(analyzer=analyzers.nozero)
                           })
    adresseerbaar_object_id = es.Text(  # Is landelijk_id for related verblijfsobject, ligplaats of standplaats
        analyzer=analyzers.autocomplete,
        fields={
            'raw': es.Keyword(),
            'nozero': es.Text(analyzer=analyzers.nozero)
        })

    class Index:
        name = settings.ELASTIC_INDICES['NUMMERAANDUIDING']
Exemplo n.º 4
0
class Inschrijving(es.DocType):
    """Search document for a trade-register entry ("inschrijving").

    NOTE(review): field names suggest the Dutch KvK (Chamber of Commerce)
    register — confirm against the indexing pipeline.
    """

    _display = es.Keyword()

    _kvk_display = es.Keyword()

    doctype = es.Keyword()

    # Registration number; 'nozero' strips leading zeros for lenient lookups.
    kvk_nummer = es.Text(analyzer=analyzers.autocomplete,
                         fields={
                             'raw': es.Keyword(),
                             'nozero': es.Text(analyzer=analyzers.nozero)
                         })

    # Establishment number, additionally indexed as an integer.
    vestigingsnummer = es.Text(analyzer=analyzers.autocomplete,
                               fields={
                                   'raw': es.Keyword(),
                                   'nozero':
                                   es.Text(analyzer=analyzers.nozero),
                                   'int': es.Integer()
                               })

    hoofdvestiging = es.Boolean()

    # SBI activity codes (code + description), code autocompletable.
    sbi = es.Nested(
        properties={
            'code':
            es.Text(analyzer=analyzers.autocomplete,
                    fields={'raw': es.Keyword()}),
            'omschrijving':
            es.Text(),
        })

    # Primary name; raw keyword plus ngram autocomplete sub-fields.
    naam = es.Text(analyzer=analyzers.adres,
                   fields={
                       'raw':
                       es.Keyword(),
                       'ngram':
                       es.Text(analyzer=analyzers.autocomplete,
                               search_analyzer='standard')
                   })

    # Trade names, same sub-field layout as 'naam'.
    handelsnamen = es.Nested(
        properties={
            'naam':
            es.Text(analyzer=analyzers.adres,
                    fields={
                        'raw':
                        es.Keyword(),
                        'ngram':
                        es.Text(analyzer=analyzers.autocomplete,
                                search_analyzer='standard')
                    })
        })

    # Postal address.
    postadres = es.Text(analyzer=analyzers.adres,
                        fields={
                            'raw':
                            es.Keyword(),
                            'ngram':
                            es.Text(analyzer=analyzers.autocomplete,
                                    search_analyzer='standard')
                        })

    # Visiting address.
    bezoekadres = es.Text(analyzer=analyzers.adres,
                          fields={
                              'raw':
                              es.Keyword(),
                              'ngram':
                              es.Text(analyzer=analyzers.autocomplete,
                                      search_analyzer='standard')
                          })

    # True when the visiting address was corrected during import.
    bezoekadres_correctie = es.Boolean()

    # hoofdvestiging

    centroid = es.GeoPoint()

    class Index:
        name = settings.ELASTIC_INDICES['HR']
Exemplo n.º 5
0
class PublicationDoc(DocType):
    """Search document mirroring a publication and its related records."""

    all_data = edsl.Text()
    id = edsl.Integer()
    title = edsl.Text(copy_to=ALL_DATA_FIELD)
    date_published = edsl.Date()
    last_modified = edsl.Date()
    code_archive_url = edsl.Keyword()
    doi = edsl.Keyword()
    contact_email = edsl.Keyword(copy_to=ALL_DATA_FIELD)
    container = edsl.Object(ContainerInnerDoc)
    tags = edsl.Nested(RelatedInnerDoc)
    sponsors = edsl.Nested(RelatedInnerDoc)
    platforms = edsl.Nested(RelatedInnerDoc)
    model_documentation = edsl.Keyword()
    authors = edsl.Nested(AuthorInnerDoc)

    @classmethod
    def from_instance(cls, publication):
        """Serialize *publication* into an index-ready dict (meta included)."""

        def to_related(relation):
            # Tags, sponsors and platforms all share the id/name inner-doc shape.
            return [RelatedInnerDoc(id=item.id, name=item.name)
                    for item in relation.all()]

        container = publication.container
        doc = cls(
            meta={'id': publication.id},
            id=publication.id,
            title=publication.title,
            date_published=publication.date_published,
            last_modified=publication.date_modified,
            code_archive_url=publication.code_archive_url,
            contact_email=publication.contact_email,
            container=ContainerInnerDoc(id=container.id,
                                        name=container.name,
                                        issn=container.issn),
            doi=publication.doi,
            tags=to_related(publication.tags),
            sponsors=to_related(publication.sponsors),
            platforms=to_related(publication.platforms),
            model_documentation=[
                md.name for md in publication.model_documentation.all()
            ],
            authors=[
                AuthorInnerDoc(id=author.id,
                               name=author.name,
                               orcid=author.orcid,
                               researcherid=author.researcherid,
                               email=author.email)
                for author in publication.creators.all()
            ])
        return doc.to_dict(include_meta=True)

    def get_public_detail_url(self):
        """Return the public detail-page URL for this indexed publication."""
        pk = self.meta.id
        return reverse('core:public-publication-detail', kwargs={'pk': pk})

    @classmethod
    def get_breadcrumb_data(cls):
        """Return the breadcrumb trail shown on public publication pages."""
        home = {'link': reverse('core:public-home'), 'text': 'Home'}
        current = {'text': 'Publications'}
        return {'breadcrumb_trail': [home, current]}

    @classmethod
    def get_public_list_url(cls, search=None):
        """Return the public search URL, with ``?search=...`` when given."""
        location = reverse('core:public-search')
        if search:
            return '{}?{}'.format(location, urlencode({'search': search}))
        return location

    class Index:
        name = 'publication'
        settings = {'number_of_shards': 1}
Exemplo n.º 6
0
class Topic(es.InnerDoc):
    """One topic from a topic model, embedded inside a parent document."""

    id = es.Keyword()
    name = es.Keyword()
    # Top words of the topic; TopicWord is declared elsewhere in the file.
    topic_words = es.Nested(TopicWord)
    topic_size = es.Integer()
    topic_weight = es.Float()
Exemplo n.º 7
0
def genbibres(stream, estype='bibliographicResource'):
    """Build the ES mapping for a bibliographic resource and pretty-print it.

    The mapping is written to *stream* as a Python dict; nothing is sent to
    the cluster here.

    :param stream: writable stream the pretty-printed mapping is emitted to
    :param estype: Name of ES type (defaults to 'bibliographicResource')
    """
    mapping = dsl.Mapping(estype)
    # 'strict' makes ES reject documents with fields not declared below.
    mapping.properties.dynamic = 'strict'
    # Shared JSON-LD context fields come from the project-level helper.
    mapping = gencontext(mapping)
    mapping = mapping.field('@id', 'string', index='not_analyzed')
    mapping = mapping.field('@type', 'string', index='no')
    mapping = mapping.field('bibo:edition', 'string', index='analyzed')
    mapping = mapping.field('bibo:isbn10', 'string', index='not_analyzed')
    mapping = mapping.field('bibo:isbn13', 'string', index='not_analyzed')
    mapping = mapping.field('bibo:issn', 'string', index='not_analyzed')
    mapping = mapping.field(
        'dbp:originalLanguage',
        dsl.Object().property('@id', 'string', index='not_analyzed'))

    # Nested contributor (person) sub-document.
    contributor = dsl.Nested()
    contributor = contributor.property('@id', dsl.String(index='no'))
    contributor = contributor.property('@type', dsl.String(index='no'))
    contributor = contributor.property('dbp:birthYear',
                                       dsl.String(index='not_analyzed'))
    contributor = contributor.property('dbp:deathYear',
                                       dsl.String(index='not_analyzed'))
    contributor = contributor.property('foaf:firstName',
                                       dsl.String(index='analyzed'))
    contributor = contributor.property('foaf:lastName',
                                       dsl.String(index='analyzed'))
    contributor = contributor.property('foaf:name',
                                       dsl.String(index='analyzed'))
    contributor = contributor.property('rdfs:label',
                                       dsl.String(index='analyzed'))
    contributor = contributor.property('skos:note',
                                       dsl.String(index='analyzed'))
    mapping = mapping.field('dc:contributor', contributor)

    mapping = mapping.field('dc:format', 'string', index='analyzed')
    # Titles get an extra accent/case-folded sub-field for lenient search.
    mapping = mapping.field(
        'dct:alternative',
        'string',
        index='analyzed',
        fields={'folded': dsl.String(analyzer='text_folded')})
    mapping = mapping.field('dct:bibliographicCitation',
                            'string',
                            index='analyzed',
                            analyzer='standard')
    mapping = mapping.field('dct:hasPart', 'string', index='analyzed')
    mapping = mapping.field(
        'dct:isPartOf',
        dsl.Object().property('@id', 'string', index='not_analyzed'))
    mapping = mapping.field('dct:issued', 'string', index='analyzed')
    mapping = mapping.field(
        'dct:language',
        dsl.Object().property('@id', 'string', index='not_analyzed'))
    mapping = mapping.field(
        'dct:subject',
        dsl.Object().property('@id', 'string', index='not_analyzed'))
    mapping = mapping.field(
        'dct:title',
        'string',
        index='analyzed',
        fields={'folded': dsl.String(analyzer='text_folded')})
    mapping = mapping.field(
        'rdau:contentType',
        dsl.Object().property('@id', 'string', index='not_analyzed'))
    mapping = mapping.field('rdau:dissertationOrThesisInformation',
                            'string',
                            index='analyzed')
    mapping = mapping.field(
        'rdau:mediaType',
        dsl.Object().property('@id', 'string', index='not_analyzed'))
    mapping = mapping.field(
        'rdau:modeOfIssuance',
        dsl.Object().property('@id', 'string', index='not_analyzed'))
    mapping = mapping.field('rdau:noteOnResource', 'string',
                            index='not_analyzed')
    mapping = mapping.field(
        'rdau:placeOfPublication',
        dsl.Object().property('@id', 'string', index='not_analyzed'))
    mapping = mapping.field('rdau:publicationStatement', 'string',
                            index='analyzed')
    mapping = mapping.field(
        'rdfs:isDefinedBy',
        dsl.Object().property('@id',
                              'string',
                              index='analyzed',
                              analyzer='extr_id'))
    # Emit the mapping dict (the caller decides what to do with it).
    pprint(mapping.to_dict(), stream=stream)