def get_search_query(phrase):
    """Build a full-text talk search whose score is scaled by view count.

    The phrase is matched with a ``multi_match`` query against the ``name``,
    ``description``, ``speaker`` and ``transcript`` fields. That query is
    wrapped in a ``function_score`` query whose single function is a
    ``field_value_factor`` on ``number_of_views``.

    :param phrase: text to search for
    :return: the ``TalkDocument`` search with the query applied (not executed)
    """
    searchable_fields = ['name', 'description', 'speaker', 'transcript']
    text_match = MultiMatch(fields=searchable_fields, query=phrase)
    views_factor = SF('field_value_factor', field='number_of_views')
    scored_query = Q('function_score',
                     query=text_match,
                     functions=[views_factor])
    return TalkDocument.search().query(scored_query)
예제 #2
0
    def search_close(self, origin_timestamp, channel, qterm, number_results):
        """
        Return the log entries of one channel that score closest in time to a reference timestamp.

        Documents are scored with an exponential decay on ``@timestamp`` centred on
        ``origin_timestamp``; the search term only contributes highlighting. The selected
        entries are returned sorted by ``@timestamp`` in ascending order.

        :param origin_timestamp: origin timestamp to find logs around
        :param channel: value the ``channel.keyword`` field must equal
        :param qterm: term to be highlighted in ``msg``, ``username`` and ``channel``
        :param number_results: maximum number of entries to fetch
        :return: hits of the Elasticsearch response, sorted by ``@timestamp``
        :rtype: ``list``
        """
        search = DslSearch(using=self._es, index=self._index_prefix.format('*'))

        # The term clauses exist only so the highlighter has something to mark;
        # a vanishingly small boost keeps them from influencing the ranking.
        highlight_boost = 1e-15

        def faint_match():
            # A fresh dict per clause, exactly as if each were written out literally.
            return {'query': qterm, 'boost': highlight_boost}

        term_clauses = (
            MatchPhrase(msg=faint_match())
            | Match(**{'username': faint_match()})
            | Match(channel=faint_match())
            | Match(msg=faint_match())
        )
        # match_all keeps entries that do not contain the term at all.
        candidates = term_clauses | Q('match_all')

        # Score decays exponentially with distance from the origin timestamp.
        time_decay = SF('exp', **{
            '@timestamp': {
                "origin": origin_timestamp,
                "scale": "1m",
                "decay": 0.999,
            },
        })
        proximity_query = Q('function_score',
                            query=candidates,
                            functions=[time_decay])

        # Score by proximity, restrict to the channel, cap the result count.
        search = search.query(proximity_query).filter(
            'term', **{'channel.keyword': channel})
        search = search[0:number_results]

        # number_of_fragments=0 asks for the whole msg field instead of fragments.
        search = (
            search.highlight_options(order='score')
            .highlight('msg', number_of_fragments=0)
            .highlight('username')
            .highlight('channel')
        )

        hits = search.execute()

        # The response arrives ordered by score; hand it back ordered by time.
        return sorted(hits, key=lambda entry: entry['@timestamp'])
예제 #3
0
def build_search_company_query(params):
    """Build the company search query from a mapping of filters.

    Every key left in ``params`` after ``'term'`` is removed is treated as a
    field name mapped to an iterable of accepted values. Values of the same
    field are OR-ed together and the fields are AND-ed, e.g.
    ``(NORTH_EAST OR NORTH_WEST) AND (AEROSPACE OR AIRPORTS)``. Each
    alternative is wrapped in ``ConstantScore`` so sibling filters score
    equally.

    When a truthy ``term`` is supplied, at least one of the term clauses must
    match as well, and the result is a ``function_score`` query that adds a
    weight of 5 (``boost_mode='sum'``) to documents whose ``name`` matches
    the term. Otherwise a plain ``bool`` query over the filters is returned.

    Note: ``'term'`` is popped from ``params``, so the caller's mapping is
    modified in place.

    :param params: mutable mapping of field name -> iterable of values, with
        an optional ``'term'`` entry holding the free-text search term
    :return: the query object described above
    """
    term = params.pop('term', None)

    # One bool clause per filter group; any value of the group may match.
    group_clauses = [
        Q('bool',
          should=[
              ConstantScore(filter=Q('term', **{field: option}))
              for option in options
          ],
          minimum_should_match=1)
        for field, options in params.items()
    ]

    if not term:
        # Filters only: nothing optional to match.
        return Q('bool',
                 must=group_clauses,
                 should=[],
                 minimum_should_match=0)

    term_matchers = [
        Q('term', keyword_wildcard=term),
        Q('match_phrase', wildcard=term),
        Q('match', wildcard=term),
        Q('match_phrase', casestudy_wildcard=term),
        Q('match', casestudy_wildcard=term),
    ]
    term_clause = Q(
        'bool',
        should=[ConstantScore(filter=matcher) for matcher in term_matchers],
        minimum_should_match=1)

    filtered = Q('bool',
                 must=group_clauses,
                 should=[term_clause],
                 minimum_should_match=1)

    # Extra weight for documents whose name itself matches the term.
    name_matches = Q('match_phrase', name=term) | Q('match', name=term)
    return Q('function_score',
             query=filtered,
             functions=[SF({'weight': 5, 'filter': name_matches})],
             boost_mode='sum')
예제 #4
0
 def run(self):
     """Check a random sample of untagged filings for breached e-mail addresses.

     Selects documents that have a ``contact_email`` but no
     ``analysis.breached`` field (restricted to ``self.source`` when it is
     set), scored with a time-seeded ``random_score``. The search is sliced
     to ``self.limit`` and every distinct, non-empty address is checked once
     with ``self.is_breached``. The addresses are then tagged via
     ``self.tag_by_email`` and the resulting documents are written with
     ``lib.bulk_update``. A failure of the bulk update is printed, not raised.
     """
     breached_emails = set()
     clean_emails = set()

     # The document must have a contact e-mail ...
     required = [Q('exists', field='contact_email')]
     # ... and come from the requested source, when one is configured.
     if self.source:
         required.append(Q({'term': {'analysis.source': self.source}}))

     # Skip anything that already carries a breach verdict.
     untagged = Q('bool',
                  must=required,
                  must_not=[Q('exists', field='analysis.breached')])
     randomized = FunctionScore(
         query=untagged,
         functions=[SF('random_score', seed=int(time.time()))])
     s = Search(using=self.es).query(randomized).source(['contact_email'])

     print('%s breached: source=%s limit=%s' %
           (datetime.now().isoformat(), self.source, self.limit))
     print('query=\n%s' % json.dumps(s.to_dict()))

     for filing in s[:self.limit]:
         email = filing['contact_email']
         already_checked = email in breached_emails or email in clean_emails
         if not email or already_checked:
             continue
         if self.is_breached(email):
             breached_emails.add(email)
         else:
             clean_emails.add(email)

     docs = []
     print('done source=%s' % self.source)
     if breached_emails:
         docs.extend(self.tag_by_email(list(breached_emails), True))
     if clean_emails:
         docs.extend(self.tag_by_email(list(clean_emails), False))

     # Best effort: report an indexing failure without aborting the caller.
     try:
         lib.bulk_update(self.es, docs)
     except Exception as e:
         print('error indexing: %s' % e)
예제 #5
0
    def get_search_results(term, page, size):
        """Return one page of companies matching a search term.

        The term is matched against the ``_all`` field. The match is wrapped
        in a ``function_score`` query whose single function is a
        ``field_value_factor`` on ``has_single_sector``.

        Arguments:
            term {str} -- Search term to match on
            page {int} -- Page number to query; the offset is computed as
                ``(page - 1) * size``, i.e. page 1 starts at the first hit
            size {int} -- Number of results per page

        Returns:
            dict -- The executed search response converted with ``to_dict()``

        """
        offset = (page - 1) * size
        sector_factor = SF('field_value_factor', field='has_single_sector')
        scored_query = Q('function_score',
                         query=Q('match', _all=term),
                         functions=[sector_factor])
        company_search = search.CompanyDocType.search().query(scored_query)
        page_of_hits = company_search[offset:offset + size]
        return page_of_hits.execute().to_dict()
예제 #6
0
def browse(request):
    """Render ``listing.html`` with a random sample of comments and summary stats.

    The search is scored with a time-seeded ``random_score`` and optionally
    narrowed by one query-string parameter:

    * ``source`` -- keep documents whose ``analysis.source`` equals the value
    * ``titleii`` -- ``pro`` / ``anti`` keep documents whose
      ``analysis.titleii`` is ``True`` / ``False``; ``unknown`` keeps
      documents without that field. Any other value applies no filter.

    ``source`` takes precedence when both parameters are present. The first
    50 hits are fetched together with the aggregations, and the total is
    obtained with a separate ``count()`` call.

    :param request: the incoming request; only ``request.GET`` is read
    :return: the response produced by ``render``
    """
    search = Search(using=es)
    description = None

    # Wrap the search's current query so that every hit gets a random score.
    search.query = FunctionScore(
        query=search.query,
        functions=[SF('random_score', seed=int(time.time()))])

    params = request.GET
    if 'source' in params:
        source = params['source']
        search = search.filter('terms', **{'analysis.source': [source]})
        # Fall back to the raw source key when no display name is known.
        description = SOURCE_MAP.get(source, {}).get('name') or source
    elif 'titleii' in params:
        stance = params['titleii']
        if stance == 'pro':
            description = "Pro Title II"
            search = search.filter('terms', **{'analysis.titleii': [True]})
        elif stance == 'anti':
            description = 'Anti Title II'
            search = search.filter('terms', **{'analysis.titleii': [False]})
        elif stance == 'unknown':
            description = 'Uncategorized'
            search = search.exclude('exists', field='analysis.titleii')

    # Aggregations that feed the statistics panel.
    for agg_name, field in (('address', 'analysis.fulladdress'),
                            ('site', 'analysis.onsite')):
        search.aggs.bucket(agg_name, A('terms', field=field))
    search.aggs.bucket(
        'email_confirmation',
        A('filters',
          filters={
              flag: {'term': {'emailConfirmation': flag}}
              for flag in ('true', 'false')
          }))
    search.aggs.bucket('unique_emails',
                       A('cardinality', field='contact_email.raw'))

    stats = OrderedDict({
        'Comment Form': {'On-site': 0, 'Off-site': 0},
        'Emails': {'Unique': 0},
        'Address': {'Full Address': 0, 'Partial Address': 0},
        'Email Confirmation': {'True': 0, 'False': 0, 'Missing': 0},
    })

    def tally_flag_buckets(buckets, section, label_for_one, label_for_zero):
        # Copy the doc counts of the buckets keyed 1 and 0 into a stats section.
        for bucket in buckets:
            if bucket.key == 1:
                section[label_for_one] = bucket.doc_count
            elif bucket.key == 0:
                section[label_for_zero] = bucket.doc_count

    response = search[:50].execute()
    total = search.count()

    tally_flag_buckets(response.aggregations.address.buckets,
                       stats['Address'], 'Full Address', 'Partial Address')
    tally_flag_buckets(response.aggregations.site.buckets,
                       stats['Comment Form'], 'On-site', 'Off-site')

    stats['Emails']['Unique'] = response.aggregations.unique_emails.value

    confirmation = stats['Email Confirmation']
    confirmation_buckets = (
        response.aggs.email_confirmation.to_dict()['buckets'])
    for flag, label in (('true', 'True'), ('false', 'False')):
        if flag in confirmation_buckets:
            confirmation[label] = confirmation_buckets[flag]['doc_count']
    # Whatever is neither confirmed nor unconfirmed is counted as missing.
    confirmation['Missing'] = (
        total - confirmation['True'] - confirmation['False'])

    return render(request, 'listing.html', {
        'description': description,
        'stats': stats,
        'results': response,
        'comment_count': total,
    })
예제 #7
0
def browse(request, sentiment=None, group=None):
    """Render ``listing.html`` with a random sample of comments and summary stats.

    Documents of the ``fcc-comments`` index are scored with a time-seeded
    ``random_score`` and optionally narrowed by exactly one of the arguments
    (``group`` wins when both are given):

    * ``group`` -- keep documents whose ``analysis.source.keyword`` equals
      the value; description, details and url are looked up in ``SOURCE_MAP``.
    * ``sentiment`` -- ``'pro'`` / ``'anti'`` keep documents whose
      ``analysis.titleii`` is ``True`` / ``False``; ``'unknown'`` keeps
      documents without that field. Any other value applies no filter.

    The first 50 hits are fetched together with the aggregations, and the
    total is obtained with a separate ``count()`` call.

    :param request: the incoming request, passed through to ``render``
    :param sentiment: optional Title II stance (``'pro'``, ``'anti'``,
        ``'unknown'``)
    :param group: optional source key to filter on
    :return: the response produced by ``render``
    """
    s = Search(using=es, index="fcc-comments")
    description = None
    # Defaults for the unfiltered listing. These used to be bound only inside
    # the group/sentiment branches, so calling the view with neither argument
    # raised UnboundLocalError when the template context was built.
    details, url = "", None

    # Wrap the search's current query so that every hit gets a random score.
    s.query = FunctionScore(
        query=s.query, functions=[SF('random_score', seed=int(time.time()))]
    )

    if group:
        source = group
        s = s.filter('terms', **{'analysis.source.keyword': [source]})
        source_info = SOURCE_MAP.get(source, {})
        # Fall back to the raw source key when no display name is known.
        description = source_info.get('name') or source
        details = source_info.get('details') or ""
        url = source_info.get('url') or ""

    elif sentiment:
        title_ii = sentiment
        if title_ii == 'pro':
            s = s.filter('terms', **{'analysis.titleii': [True]})
            description = "Pro Title II"
        elif title_ii == 'anti':
            description = 'Anti Title II'
            s = s.filter('terms', **{'analysis.titleii': [False]})
        elif title_ii == 'unknown':
            description = 'Uncategorized'
            s = s.exclude('exists', field='analysis.titleii')

    # Aggregations that feed the statistics panel.
    s.aggs.bucket("date", A('date_histogram', field='date_submission', interval='month'))
    s.aggs.bucket('address', A('terms', field='analysis.fulladdress'))
    s.aggs.bucket('email_domain', A('terms', field='analysis.throwawayemail'))
    s.aggs.bucket('site', A('terms', field='analysis.onsite'))
    s.aggs.bucket('ingestion', A('terms', field='analysis.ingestion_method.keyword'))
    s.aggs.bucket('email_confirmation', A('filters', filters={
        'true': {'term': {'emailConfirmation': 'true'}},
        'false': {'term': {'emailConfirmation': 'false'}}
    }))

    stats = OrderedDict({
        'Comment Form': {
            'On-site': 0,
            'Off-site': 0
        },
        'Throwaway Email': {
            'True': 0,
            'False': 0
        },
        'Address': {
            'Full Address': 0,
            'Partial Address': 0,
        },
        'Email Confirmation': {
            'True': 0,
            'False': 0,
            'Missing': 0
        },
        'Filing Method': {
            'API': 0,
            'Spreadsheet': 0,
            'Direct': 0
        },
        'Filing Dates': OrderedDict({

        })
    })

    response = s[:50].execute()
    total = s.count()

    for bucket in response.aggregations.date.buckets:
        # bucket.key is divided by 1000 before being read as a POSIX timestamp.
        # NOTE(review): the 14400 s (4 h) shift presumably compensates for a
        # UTC-4 offset, and the "/17" in the title hard-codes the year -- confirm.
        d = datetime.fromtimestamp((bucket.key/1000.) + 14400)
        title = "%s/17 - %s" % (d.strftime("%m"), d.strftime("%B"))
        stats['Filing Dates'][title] = bucket.doc_count

    for bucket in response.aggregations.address.buckets:
        if bucket.key == 1:
            stats['Address']['Full Address'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Address']['Partial Address'] = bucket.doc_count

    for bucket in response.aggregations.email_domain.buckets:
        if bucket.key == 1:
            stats['Throwaway Email']['True'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Throwaway Email']['False'] = bucket.doc_count

    for bucket in response.aggregations.ingestion.buckets:
        if bucket.key == "api":
            stats['Filing Method']['API'] = bucket.doc_count
        elif bucket.key == "csv":
            stats['Filing Method']['Spreadsheet'] = bucket.doc_count
        elif bucket.key == "direct":
            stats['Filing Method']['Direct'] = bucket.doc_count

    for bucket in response.aggregations.site.buckets:
        if bucket.key == 1:
            stats['Comment Form']['On-site'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Comment Form']['Off-site'] = bucket.doc_count

    for bucket, value in response.aggs.email_confirmation.to_dict()['buckets'].items():
        if bucket == 'true':
            stats['Email Confirmation']['True'] = value['doc_count']
        elif bucket == 'false':
            stats['Email Confirmation']['False'] = value['doc_count']
    # Whatever is neither confirmed nor unconfirmed is counted as missing.
    stats['Email Confirmation']['Missing'] = (
        total - stats['Email Confirmation']['True'] - stats['Email Confirmation']['False']
    )

    context = {
        'description': description,
        'details': details,
        'url': url,
        'stats': stats,
        'results': response,
        'comment_count': total
    }

    return render(request, 'listing.html', context)
예제 #8
0
    def get_sort_popularity(self, request):
        """Build the scoring query used to sort by popularity.

        The base score is the OR of two ``FunctionScore`` queries: one summing
        a ``status_score`` field factor with a ``gauss`` decay on ``created``,
        and one multiplying a ``contribution_count`` field factor with a
        ``gauss`` decay on ``contributions``.

        For an authenticated user, further ``FunctionScore`` queries are OR-ed
        in: one for documents whose nested ``expertise`` ids match the user's
        skills, one for documents whose nested ``theme`` ids match the user's
        favourite themes, and one ``gauss`` decay on ``position`` around the
        position of the user's location (or, failing that, of the user's
        place).

        :param request: the current request; only ``request.user`` is read
        :return: the combined query object
        """
        status_and_age = FunctionScore(
            score_mode='sum',
            functions=[
                SF('field_value_factor',
                   field='status_score',
                   weight=10,
                   factor=10),
                SF('gauss', weight=0.1, created={'scale': "365d"}),
            ]
        )
        contribution_activity = FunctionScore(
            score_mode='multiply',
            functions=[
                SF('field_value_factor',
                   field='contribution_count',
                   missing=0),
                SF('gauss',
                   weight=0.1,
                   multi_value_mode='avg',
                   contributions={'scale': '5d'}),
            ]
        )
        score = status_and_age | contribution_activity

        user = request.user
        if not user.is_authenticated:
            return score

        def first_matching(primary_function):
            # score_mode='first': use the primary function where its filter
            # matches, otherwise fall through to the weight-0 function.
            return FunctionScore(
                score_mode='first',
                functions=[SF(primary_function), SF({'weight': 0})]
            )

        if user.skills:
            skill_ids = [skill.pk for skill in user.skills.all()]
            score = score | first_matching({
                'filter': Nested(
                    path='expertise',
                    query=Q('terms', expertise__id=skill_ids)
                ),
                'weight': 1,
            })

        if user.favourite_themes:
            theme_ids = [theme.pk for theme in user.favourite_themes.all()]
            score = score | first_matching({
                'filter': Nested(
                    path='theme',
                    query=Q('terms', theme__id=theme_ids)
                ),
                'weight': 1,
            })

        # Prefer the position of the user's location, then of the user's place.
        if user.location and user.location.position:
            anchor = user.location.position
        elif user.place and user.place.position:
            anchor = user.place.position
        else:
            anchor = None

        position = None
        if anchor is not None:
            position = {'lat': anchor.latitude, 'lon': anchor.longitude}

        if position:
            score = score | first_matching({
                'filter': {'exists': {'field': 'position'}},
                'weight': 1,
                'gauss': {
                    'position': {
                        'origin': position,
                        'scale': "100km"
                    },
                    'multi_value_mode': 'max',
                },
            })

        return score
예제 #9
0
def get_search_query(phrase):
    """Build a full-text book search whose score is scaled by view count.

    The phrase is matched with a ``multi_match`` query against the ``title``,
    ``author`` and ``publisher`` fields. That query is wrapped in a
    ``function_score`` query whose single function is a
    ``field_value_factor`` on ``number_of_views``.

    :param phrase: text to search for
    :return: the ``BooksIndex`` search with the query applied (not executed)
    """
    searchable_fields = ['title', 'author', 'publisher']
    text_match = MultiMatch(fields=searchable_fields, query=phrase)
    views_factor = SF('field_value_factor', field='number_of_views')
    scored_query = Q('function_score',
                     query=text_match,
                     functions=[views_factor])
    return BooksIndex.search().query(scored_query)