Exemplo n.º 1
0
    def more_like_this(self, request, pk=None, id=None):
        """List the documents most similar to the one identified by pk/id.

        The "more like this" query is applied on top of the view's current
        (filtered) queryset and the outcome is rendered the same way a
        standard list view renders it.

        :param request: Incoming request; the view is looked up in its
            ``parser_context``.
        :param pk: Identifier of the reference document (used when truthy).
        :param id: Identifier used when ``pk`` is falsy.
        :return: Paginated or plain response with the serialized similar
            documents; ``None`` when the parser context holds no ``view``.
        """
        if 'view' not in request.parser_context:
            return None

        view = request.parser_context['view']
        mlt_options = copy.copy(getattr(view, 'more_like_this_options', {}))
        document_id = pk if pk else id

        # Start from the queryset the view would normally list.
        queryset = self.filter_queryset(self.get_queryset())

        # ``fields`` is intentionally not derived from the serializer. Per
        # the original author's note, Elasticsearch falls back to ``_all``
        # when no ``fields`` value is sent, so an empty value is simply left
        # out of the query; a non-empty one is passed through unchanged.
        fields = mlt_options.pop('fields', [])
        if fields:
            mlt_options['fields'] = fields

        reference_document = {
            '_id': "{}".format(document_id),
            '_index': "{}".format(self.index),
            '_type': "{}".format(self.mapping)
        }
        similar = MoreLikeThis(like=reference_document, **mlt_options)
        queryset = queryset.query(similar).sort('_score')

        # Standard list-view rendering.
        page = self.paginate_queryset(queryset)
        if page is None:
            serializer = self.get_serializer(queryset, many=True)
            return Response(serializer.data)

        serializer = self.get_serializer(page, many=True)
        return self.get_paginated_response(serializer.data)
Exemplo n.º 2
0
    def more_like_this(self, item, max_hits=3):
        """Search the index for documents that resemble ``item``.

        The similarity query is seeded with the item's indexable content and
        its category names instead of a reference to a stored document.

        :param item: Object providing ``indexable_content()`` and
            ``category_names()``.
        :param max_hits: Maximum number of hits to request.
        :return: The result of executing the search.
        """
        seed_texts = [item.indexable_content(), item.category_names()]
        compared_fields = [
            'title', 'content', 'description', 'location', 'category',
            'organization_name', 'website'
        ]
        similarity = MoreLikeThis(
            like=seed_texts,
            min_term_freq=1,
            min_doc_freq=2,
            max_query_terms=12,
            fields=compared_fields)

        request = Search(index=self.index_name)
        request = request.doc_type(StarDocument)
        request = request.query(similarity)
        request = request[0:max_hits]

        # Apply the instance's default filter clauses as a bool/should
        # filter (according to the original comment this drops past events).
        request = request.filter('bool', should=self._default_filter())

        return request.execute()
    def more_like_this(self, request, pk=None, id=None):
        """Return the raw search response for documents similar to pk/id.

        Unlike a list view, the executed search response is returned as a
        plain dict without serialization or pagination.

        :param request: Incoming request; the view is looked up in its
            ``parser_context``.
        :param pk: Identifier of the reference document (used when truthy).
        :param id: Identifier used when ``pk`` is falsy.
        :return: Response wrapping the search result dict; ``None`` when the
            parser context holds no ``view``.
        """
        if 'view' not in request.parser_context:
            return None

        view = request.parser_context['view']
        options = copy.copy(getattr(view, 'more_like_this_options', {}))
        document_id = pk if pk else id
        queryset = self.filter_queryset(self.get_queryset())

        # Without configured fields, compare on a copy of the fields declared
        # by the serializer's Meta.
        fields = options.pop('fields', {})
        if not fields:
            fields = self.get_serializer_class().Meta.fields[:]

        reference_document = {
            '_id': "{}".format(document_id),
            '_index': "{}".format(self.index),
            '_type': "{}".format(self.mapping)
        }
        similar = MoreLikeThis(
            fields=fields, like=reference_document, **options)
        result = queryset.query(similar).sort('_score').execute()
        return Response(result.to_dict())
Exemplo n.º 4
0
def find_related_documents(*, user, post_id):
    """
    Run a "more like this" query to find posts related to a specific post

    Args:
        user (User): The user executing the search
        post_id (str): The id of the post to find related posts for

    Returns:
        dict: The Elasticsearch response dict
    """
    search = _apply_general_query_filters(
        Search(index=get_default_alias_name(ALIAS_ALL_INDICES)), user)

    related_query = MoreLikeThis(
        like={
            "_id": gen_post_id(post_id),
            "_type": GLOBAL_DOC_TYPE
        },
        fields=RELATED_POST_RELEVANT_FIELDS,
        min_term_freq=1,
        min_doc_freq=1,
    )

    # Cap the number of hits at the value configured in settings
    limit = settings.OPEN_DISCUSSIONS_RELATED_POST_COUNT
    return search.query(related_query)[0:limit].execute().to_dict()
Exemplo n.º 5
0
def search_more_like_this(talk):
    """Find talks similar to ``talk`` in the "vtalks" index.

    Returns a ``(total, ids)`` tuple: the hit total reported by the response
    and the ``id`` attribute of every returned hit.
    """
    es_settings = settings.ELASTICSEARCH['default']
    client = Elasticsearch([{
        'host': es_settings['HOSTNAME'],
        'port': es_settings['PORT'],
    }])

    similar = MoreLikeThis(
        like={"_index": "vtalks", "_type": "talk", "_id": talk.id},
        fields=['title', 'description', 'tags'])

    search = (
        Search(using=client, index="vtalks")
        .query(similar)
        # Highest score first
        .sort({"_score": {"order": "desc"}})
        # Only the ``id`` field is requested from the document source
        .source(['id'])
    )
    response = search.execute()

    return response.hits.total, [hit.id for hit in response.hits]
Exemplo n.º 6
0
def find_similar_resources(*, user, value_doc):
    """
    Run a "more like this" query to find learning resources similar to the given document.

    Args:
        user (User): The user executing the search
        value_doc (dict):
            a document representing the data fields we want to search with

    Returns:
        list of dict:
            the hits converted to dicts, without the hit whose id and object_type both
            match value_doc, capped at settings.OPEN_DISCUSSIONS_SIMILAR_RESOURCES_COUNT
    """
    search = Search(index=get_default_alias_name(ALIAS_ALL_INDICES))
    search = _apply_general_query_filters(search, user)
    search = search.filter(Q("terms", object_type=LEARNING_RESOURCE_TYPES))

    similar = MoreLikeThis(
        like={
            "doc": value_doc,
            "fields": list(value_doc.keys())
        },
        fields=SIMILAR_RESOURCE_RELEVANT_FIELDS,
        min_term_freq=settings.OPEN_RESOURCES_MIN_TERM_FREQ,
        min_doc_freq=settings.OPEN_RESOURCES_MIN_DOC_FREQ,
    )
    response = search.query(similar).execute()

    own_id = value_doc.get("id")
    own_type = value_doc.get("object_type")

    similar_resources = []
    for hit in response.hits:
        # Skip the resource the query was built from (same id AND same type)
        is_same_resource = (hit["id"] == own_id
                            and hit["object_type"] == own_type)
        if not is_same_resource:
            similar_resources.append(hit.to_dict())

    return similar_resources[0:settings.OPEN_DISCUSSIONS_SIMILAR_RESOURCES_COUNT]
Exemplo n.º 7
0
def get_more_like_this(s, query_text):
    """Run a "more like this" query for ``query_text`` on search ``s``.

    The title, abstract and body fields are compared (body boosted by 3) and
    the stop words returned by ``get_stop_words()`` are passed to the query.
    The response is handed to ``_extract_response`` and its result returned.
    """
    similar = MoreLikeThis(like=query_text,
                           fields=['title', 'abstract', 'body^3'],
                           stop_words=get_stop_words())
    # Request ten hits; the 1:11 slice starts at the second-ranked hit, so
    # the top-ranked hit (index 0) is not part of the page.
    response = s.query(similar)[1:11].execute()
    return _extract_response(response)
Exemplo n.º 8
0
    def get_related_items(self, obj):
        """Get items related to ``obj`` through a "more like this" query.

        The query compares the ``title_en.natural`` and
        ``description_en.natural`` fields against the indexed document whose
        id is ``obj.id``.

        :param obj: Object whose ``id`` identifies the reference document.
        :return: List of ``OrderedDict`` entries holding the id, image URLs
            and the English and Dutch title/description of each hit.
        """
        # ``min_term_freq`` is deliberately not part of the options: it was
        # already disabled before, so the request sent to Elasticsearch is
        # unchanged.
        mlt_options = {
            'max_query_terms': 25,
            'min_doc_freq': 1,
            'max_doc_freq': 25,
        }

        reference_document = {
            '_id': "{}".format(obj.id),
            '_index': "{}".format(INDEX._name),
            # The first mapping declared on the index is used as the type.
            '_type': "{}".format(list(INDEX._mappings.keys())[0])
        }

        search = CollectionItemDocument().search().query(
            MoreLikeThis(
                fields=(
                    'title_en.natural',
                    'description_en.natural',
                ),
                like=reference_document,
                **mlt_options
            )
        )

        # ``_l_`` is read from each multi-valued attribute so plain values
        # end up in the result (presumably the wrapped list — verify against
        # the elasticsearch-dsl version in use).
        return [
            OrderedDict([
                ('id', int(hit.meta.id)),
                ('images_urls', hit.images_urls._l_),
                # English
                ('title_en', hit.title_en._l_),
                ('description_en', hit.description_en._l_),
                # Dutch
                ('title_nl', hit.title_nl._l_),
                ('description_nl', hit.description_nl._l_),
            ])
            for hit in search
        ]
Exemplo n.º 9
0
def more_like_this(doc):
    """Search ``LetterDocument`` for documents similar to ``doc``.

    ``doc`` is passed through ``serialize_document`` and the result is used
    as the ``like`` value; title, body and content are compared. Returns the
    executed search response.
    """
    similar = MoreLikeThis(
        like=serialize_document(doc),
        fields=["title", "body", "content"],
        min_term_freq=1,
        min_doc_freq=1,
    )
    return LetterDocument.search().query(similar).execute()
Exemplo n.º 10
0
def build_query_body(item_uri, media_type=None, max_duration=None, published_after=None,
                     region=None, similarity_method=None, limit=constants.DEFAULT_QUERY_LIMIT,
                     offset=constants.DEFAULT_QUERY_OFFSET):
    """Build the query dict, ready for an Elasticsearch search, that retrieves items similar to the
    item identified by ``item_uri``.

    ``media_type`` (iterable of values) and ``max_duration`` add filters when truthy; ``limit`` and
    ``offset`` set the requested slice. Raises ``NotImplementedError`` when ``published_after``,
    ``region`` or ``similarity_method`` is given.
    """
    not_implemented = (
        (published_after, 'The parameter `publishedAfter` is not yet implemented.'),
        (region, 'The parameter `region` is not yet implemented.'),
        (similarity_method, 'The parameter `similarityMethod` is not yet implemented for ES.'),
    )
    for value, message in not_implemented:
        if value is not None:
            raise NotImplementedError(message)

    # TODO: the original author reports this slice shows up in the built query but has no effect.
    search = Search(index='pips')[offset:offset + limit]

    # Each media type becomes its own term filter.
    for media in media_type or ():
        search = search.filter('term', mediaType=media)

    if max_duration:
        search = search.filter('range', duration={'lte': max_duration})

    # Similarity on title-level fields
    similar_by_title = MoreLikeThis(
        like={'_index': 'pips', '_type': 'clip', '_id': item_uri},
        fields=['title', 'masterBrand.mid', 'mediaType'],
        min_term_freq=1,
        min_doc_freq=1
    )
    # Similarity on the nested genre keys
    similar_by_genre = Q(
        'nested',
        path='genres',
        query=MoreLikeThis(
            fields=['genres.key'],
            like={'_index': 'pips', '_type': 'clip', '_id': item_uri},
            min_term_freq=1,
            min_doc_freq=1
        )
    )

    search = search.query('bool', should=[similar_by_title, similar_by_genre])
    return search.to_dict()
Exemplo n.º 11
0
def create_mlt_with_id(document_id, position, index):
    """Build a nested "more like this" search seeded by an indexed document.

    :param document_id: Id of the indexed document to find similar ones for.
    :param position: Not used by this function; kept so existing callers
        keep working.
    :param index: Index to search; the reference document is looked up in
        the same index.
    :return: The unexecuted search.
    """
    s = Search(using=client, index=index)
    # Search methods return a modified copy, so the result must be kept for
    # the source filtering to take effect.
    s = s.source(includes=['*'], excludes=["body"])
    # Reference the stored document by index and id (previously the builtin
    # ``id`` function was passed here instead of the document id).
    mlt_match = MoreLikeThis(fields=["body.content"],
                             like=[{'_index': index, '_id': document_id}],
                             min_term_freq=1,
                             min_doc_freq=1)
    nested_query = Nested(path='body', inner_hits={}, query=mlt_match)
    return s.query(nested_query)
Exemplo n.º 12
0
def create_mlt_with_id(document_id, index, size=20):
    """Build a "more like this" search on ``content`` seeded by document id(s).

    :param document_id: A single document id, or a list of ids.
    :param index: Index to search and to look the reference document(s) up in.
    :param size: Upper bound of the result slice.
    :return: The unexecuted search.
    """
    s = Search(using=client, index=index)

    extra_options = {}
    if isinstance(document_id, list):
        like = [{'_index': index, '_id': item} for item in document_id]
    else:
        like = {
            '_index': index,
            '_id': document_id
        }
        # Only the single-document form sets ``minimum_should_match``.
        extra_options['minimum_should_match'] = '5%'

    mlt_match = MoreLikeThis(fields=["content"],
                             like=like,
                             min_term_freq=1,
                             min_doc_freq=1,
                             analyzer='stop',
                             **extra_options)
    return s.query(mlt_match)[:size]
Exemplo n.º 13
0
def recommend_mlt(request):
    """Recommend two more-like-this jokes for inclusion on another page.

    Returns ``{'jokes': <query>}`` when the search yields two ids; raises
    ``HTTPNotFound`` otherwise, including when a ``ConnectionError`` occurs.
    """
    try:
        reference = MoreLikeThis(like={'_id': request.matchdict['jid']})
        results = Search(index='toja_jokes').query(reference)[0:2].execute()
        joke_ids = [joke.meta.id for joke in results]
        if len(joke_ids) >= 2:
            return {
                'jokes': request.dbsession.query(Image).filter(
                    Image.id.in_(joke_ids))
            }
    except ConnectionError:
        # Best effort: a failed connection is reported as "not found" below.
        pass
    raise HTTPNotFound()
Exemplo n.º 14
0
    def more_like_this(self, item, max_hits=3):
        """Search the index for documents that resemble ``item``.

        The similarity query is seeded with the item's indexable content and
        its category names. Hits are filtered to documents whose ``date`` is
        not before the current UTC time or that have no ``date`` field.

        :param item: Object providing ``indexable_content()`` and
            ``category_names()``.
        :param max_hits: Maximum number of hits to request.
        :return: The result of executing the search.
        """
        seed_texts = [item.indexable_content(), item.category_names()]
        compared_fields = [
            'title', 'content', 'description', 'location', 'category',
            'organization_name', 'website'
        ]
        similarity = MoreLikeThis(
            like=seed_texts,
            min_term_freq=1,
            min_doc_freq=2,
            max_query_terms=12,
            fields=compared_fields)

        request = Search(index=self.index_name)
        request = request.doc_type(StarDocument)
        request = request.query(similarity)
        request = request[0:max_hits]

        # Filter out past events: keep future events OR undated documents.
        in_the_future = {
            "range": {"date": {"gte": datetime.datetime.utcnow()}}
        }
        without_date = {
            "bool": {"must_not": {"exists": {"field": "date"}}}
        }
        request = request.filter(
            'bool', should=[in_the_future, without_date])

        return request.execute()
Exemplo n.º 15
0
    def get(self, request, format=None):
        """Recommend restaurants similar to the requested one.

        The ``restaurant`` GET parameter is used as the id of the reference
        document; similarity is computed on the ``descriptions`` field. The
        JSON payload has ``status`` set to ``True`` and ``data`` holding one
        ``{'title': ...}`` entry per hit.
        """
        reference_id = request.GET.get('restaurant', '')
        reference = {'_index': 'restaurant', '_type': '_doc', '_id': reference_id}

        similar = MoreLikeThis(like=reference,
                               fields=['descriptions'],
                               min_term_freq=1,
                               max_query_terms=5)
        query = RestaurantDocument.search().query(similar)
        query = query.source(excludes=['descriptions', 'created_time'])

        pages = [{'title': hit.restaurant} for hit in query.execute()]

        return JsonResponse({'status': True, 'data': pages})
Exemplo n.º 16
0
def get_similar_topics(value_doc, num_topics, min_term_freq, min_doc_freq):
    """
    Collect the most common topics among courses similar to the given text values

    Args:
        value_doc (dict):
            a document representing the data fields we want to search with
        num_topics (int):
            number of topics to return
        min_term_freq (int):
            minimum times a term needs to show up in input
        min_doc_freq (int):
            minimum times a term needs to show up in docs

    Returns:
        list of str:
            list of topic values, most frequent first
    """
    similar = MoreLikeThis(
        like=[{
            "doc": value_doc,
            "fields": list(value_doc.keys())
        }],
        fields=[
            "course_id", "title", "short_description", "full_description"
        ],
        min_term_freq=min_term_freq,
        min_doc_freq=min_doc_freq,
    )

    search = (
        Search(index=get_default_alias_name(ALIAS_ALL_INDICES))
        .filter(Q("terms", object_type=[COURSE_TYPE]))
        .query(similar)
        # Only the topics are needed from each hit
        .source(includes="topics")
    )
    response = search.execute()

    topic_counts = Counter(
        topic for hit in response.hits for topic in hit.topics)

    return [topic for topic, _ in topic_counts.most_common(num_topics)]
Exemplo n.º 17
0
    def more_like_this(self, doc_id):
        """Query the index for courses similar to the document ``doc_id``.

        Similarity is computed on the course short description, title and
        provider. The instance's ``search`` attribute is updated along the
        way; the executed response is logged and returned.
        """
        compared_fields = [
            "Course.CourseShortDescription", "Course.CourseTitle",
            "Course.CourseProvider"
        ]
        reference = [{"_index": self.index, "_id": doc_id}]

        # Match only on the fields listed above
        similar = MoreLikeThis(like=reference, fields=compared_fields)
        self.search = self.search.query(similar)
        self.user_organization_filtering()

        # Only fetch the first 6 results
        # TODO: make the size configurable
        self.search = self.search[0:6]
        result = self.search.execute()
        logger.info(result)

        return result
Exemplo n.º 18
0
def search(request):
    """Render the search page for the ``q`` GET parameter.

    The template context carries two searches over ``NoteDocument``: a
    "more like this" search (``more``) and a multi-match search (``page``),
    both on title and body.
    """
    query = request.GET.get('q')

    similar_notes = NoteDocument.search().query(
        MoreLikeThis(like=query, fields=['title', 'body']))
    matching_notes = NoteDocument.search().query(
        'multi_match', query=query, fields=['title', 'body'])

    data = {
        'query': True,
        'more': similar_notes,
        'page': matching_notes,
    }

    # Debug output: print body and title of every similar note.
    for note in similar_notes:
        print(note.body)
        print(note.title)

    return render(request, 'search/search.html', context=data)
Exemplo n.º 19
0
def more_like_this(obj,
                   fields,
                   max_query_terms=25,
                   min_term_freq=2,
                   min_doc_freq=5,
                   max_doc_freq=0,
                   query=None):
    """Build a "more like this" search for a Django model instance.

    :param obj: Model instance whose ``pk`` identifies the reference document.
    :param fields: Fields to compare.
    :param max_query_terms: Passed to the query unless ``None``.
    :param min_term_freq: Passed to the query unless ``None``.
    :param min_doc_freq: Passed to the query unless ``None``.
    :param max_doc_freq: Passed to the query unless ``None``.
    :param query: Optional query applied before the similarity query.
    :return: The unexecuted search, or ``None`` when no index is known for
        the model.
    """
    _index, _mapping = get_index_and_mapping_for_model(obj._meta.model)

    if _index is None:
        return None

    search = Search(using=connections.get_connection(), index=_index)

    if query is not None:
        search = search.query(query)

    # Only forward the tuning options that were actually given.
    tuning = (
        ('max_query_terms', max_query_terms),
        ('min_term_freq', min_term_freq),
        ('min_doc_freq', min_doc_freq),
        ('max_doc_freq', max_doc_freq),
    )
    options = {name: value for name, value in tuning if value is not None}

    reference_document = {
        '_id': "{}".format(obj.pk),
        '_index': "{}".format(_index),
        '_type': "{}".format(_mapping)
    }
    return search.query(
        MoreLikeThis(fields=fields, like=reference_document, **options))
Exemplo n.º 20
0
    def more_like_this(elastic_url, fields: list, like: list, size: int, filters: list, aggregations: list, include: bool, if_agg_only: bool, dataset: Dataset, return_fields=None):
        """Run a "more like this" search with optional filters and aggregations.

        :param elastic_url: Passed to ``Elasticsearch(...)`` to create the client.
        :param fields: Fields handed to the MLT query.
        :param like: ``like`` value of the MLT query. For non-composite
            aggregations ``like[0]["_index"]`` is read, so in that case it must
            be a non-empty sequence whose first element has an ``"_index"`` key.
        :param size: Upper bound of the result slice.
        :param filters: Iterable of filter definitions, each wrapped in ``Q``.
        :param aggregations: Iterable of dicts with at least ``"agg_type"``;
            non-composite ones also need ``"field"`` and ``"bucket_name"``,
            composite ones may carry ``"after_key"``.
        :param include: Forwarded as the MLT ``include`` option.
        :param if_agg_only: When truthy, only the aggregations are returned.
        :param dataset: Provides ``index`` and ``mapping`` for the search.
        :param return_fields: Optional source field selection.
        :return: The aggregations dict when ``if_agg_only`` is truthy;
            otherwise ``{"hits": [...]}`` plus an ``"aggregations"`` key when
            the response has aggregations; or ``{"elasticsearch": [message]}``
            when executing the full search raises ``ElasticsearchException``.
        """
        # Create the base search bound to the ES client, index and doc type.
        search = Search(using=Elasticsearch(elastic_url)).index(dataset.index).doc_type(dataset.mapping)
        mlt = MoreLikeThis(like=like, fields=fields, min_term_freq=1, max_query_terms=12, include=include)  # Prepare the MLT part of the query.

        paginated_search = search[0:size]  # Set how many documents to return.
        limited_search = paginated_search.source(return_fields) if return_fields else paginated_search  # If given, choose which fields to return.
        finished_search = limited_search.query(mlt)  # Add the prepared MLT query.

        # Apply all user-set filters; with an empty list the loop does nothing.
        for filter_dict in filters:
            finished_search = finished_search.filter(Q(filter_dict))

        # Apply all user-set aggregations; with an empty list the loop does nothing.
        for aggregation_dict in aggregations:
            # aggs.bucket() does not return a Search object but changes it instead.
            if aggregation_dict["agg_type"] == "composite":
                after = aggregation_dict.get("after_key", None)
                # NOTE(review): the helper receives the search as a dict and its
                # result replaces ``finished_search``; the code below still calls
                # Search methods on it, so it presumably returns a Search — confirm.
                finished_search = ES_Manager.handle_composition_aggregation(finished_search.to_dict(), aggregation_dict, after)
            else:
                field_name = aggregation_dict["field"]
                index = like[0]["_index"]
                # Fields reported as text fields are aggregated on their ".keyword" sub-field.
                field = "{}.keyword".format(field_name) if ES_Manager.is_field_text_field(field_name=field_name, index_name=index) else field_name
                finished_search.aggs.bucket(name=aggregation_dict["bucket_name"], agg_type=aggregation_dict["agg_type"], field=field)

        # Optionally return only the aggregations in {"bucket_name": {results...}} format.
        # Note that this path is not covered by the exception handling below.
        if if_agg_only:
            finished_search = finished_search.params(size=0)
            response = finished_search.execute()
            return response.aggs.to_dict()

        try:
            response = finished_search.execute()
            result = {"hits": [hit.to_dict() for hit in response]}  # Drop all metadata and keep only the documents.
            if response.aggs: result.update({"aggregations": response.aggs.to_dict()})  # Add the "aggregations" key only when the response carries aggregations.
            return result

        except ElasticsearchException as e:
            # Log the failure and report the message to the caller instead of raising.
            logging.getLogger(ERROR_LOGGER).exception(e)
            return {"elasticsearch": [str(e)]}
Exemplo n.º 21
0
# Stop words taken from nltk's list of stop words. Every entry needs its own
# trailing comma: adjacent string literals are silently concatenated.
SIMILAR_CLIPS_STOP_WORDS = (
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
    "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as",
    "until", "while", "of", "at", "by", "for", "with", "about", "into", "through", "now", "should",
    "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
    "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not",
    "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don",
)


def get_similar_clips(clip_id):
    """Return the clips deemed most similar to the clip ``clip_id``.

    Similarity is computed on title, caption and categories, ignoring the
    stop words above. No explicit result size is set, so the number of clips
    returned is whatever the search backend returns by default (10 according
    to the original docstring).

    Raises:
        ClipNotFoundException: when ``get_clip_by_id`` returns ``None``.
    """
    clip = get_clip_by_id(clip_id)
    if clip is None:
        raise ClipNotFoundException(["clip_id searched for not in index."])

    dsl_search = Clip.search()
    similar_clips = dsl_search.query(MoreLikeThis(like={'_id': clip.meta.id},
                                                  fields=['title', 'caption', 'categories'],
                                                  min_term_freq=1,
                                                  stop_words=list(SIMILAR_CLIPS_STOP_WORDS),
                                                  min_doc_freq=5, minimum_should_match=0)).execute()
    return similar_clips
Exemplo n.º 22
0
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import RequestError
from elasticsearch_dsl import Search, Index, Document, connections, \
    Keyword, Date, Text, Integer, MetaField, Nested, InnerDoc
from elasticsearch_dsl.query import Nested
from elasticsearch_dsl.query import Match, Nested, Term, MoreLikeThis
from tabulate import tabulate

# Client created with the library's default connection settings.
client = Elasticsearch()

# "More like this" query on the nested body content, seeded with free text.
# It is only constructed here; the lines below do not attach it to a search.
mlt_match = MoreLikeThis(fields=["body.content"],
                         like=["you owe me"],
                         min_term_freq=1,
                         min_doc_freq=1)
# Match query on body.content ('__' in the keyword stands for the '.').
innerMatch = Match(body__content='stock')
# The match query wrapped as a nested query on the 'body' path (also only
# constructed here).
nestedMatch = Nested(path='body', query=innerMatch)

# Retrieve all documents of the 'enron' index containing "stock" in their body.
s = Search(using=client, index='enron') \
        .query("match", body="stock")
"""
in order to change the size of return:
s= s[0:0] will create a size 0 request.
It's all done with python slicing.
"""
Exemplo n.º 23
0
# Summarize the hits of the previous search (``res``).
print("Got %d Hits:" % res['hits']['total']['value'])
for hit in res['hits']['hits']:
    print(hit['_source']['url'])

print('\n')
# NOTE: the first hit is used as the reference document below, so this part
# still requires ``res`` to contain at least one hit.
print('Lets try to find more like: ' +
      str(res['hits']['hits'][0]['_source']['url']))

# Now look for something similar to that first hit.

# A bit hacky: from here on the DSL flavour of the Elasticsearch client is
# used. It is supposed to be a bit easier to use, so this checks whether a
# query is easier to define with its syntax (compared to above).

from elasticsearch_dsl.query import MoreLikeThis
from elasticsearch_dsl import Search

# The URL of the first hit, with '/' and '-' turned into spaces, serves as
# the text to compare against. (Alternative: use the hit's 'html' field.)
my_text = str(res['hits']['hits'][0]['_source']['url'].replace(
    '/', ' ').replace('-', ' '))

s = Search(using=es)
s = s.query(MoreLikeThis(like=my_text, fields=['url', 'html', 'title']))
# Fields can also be excluded from the result to make the response quicker:
# s = s.source(exclude=["sentences", "text"])
response = s.execute().to_dict()
print('There are ' + str(response['hits']['total']['value']) + ' results:\n')
# Show at most five results; slicing (instead of indexing 0..4) avoids an
# IndexError when fewer than five hits come back.
for similar_hit in response['hits']['hits'][:5]:
    print(similar_hit['_source']['url'])

# The recommendation probably performs poorly because html tags are compared too.
Exemplo n.º 24
0
def more_like_this(obj,
                   fields,
                   max_query_terms=25,
                   min_term_freq=2,
                   min_doc_freq=5,
                   max_doc_freq=0,
                   query=None):
    """Build a "more like this" search for a Django model instance.

    https://www.elastic.co/guide/en/elasticsearch/reference/current/
    query-dsl-mlt-query.html

    :param obj: Django model instance for which similar objects shall be
        found; its ``pk`` identifies the reference document.
    :param fields: Fields to search in.
    :param max_query_terms: Forwarded to the query unless ``None``.
    :param min_term_freq: Forwarded to the query unless ``None``.
    :param min_doc_freq: Forwarded to the query unless ``None``.
    :param max_doc_freq: Forwarded to the query unless ``None``.
    :param query: Optional query applied before the similarity query.
    :type obj: Instance of `django.db.models.Model` (sub-classed) model.
    :type fields: list
    :type max_query_terms: int
    :type min_term_freq: int
    :type min_doc_freq: int
    :type max_doc_freq: int
    :type query: elasticsearch_dsl.query.Q
    :return: Unexecuted search for similar objects, or ``None`` when no
        index is known for the model.
    :rtype: elasticsearch_dsl.search.Search

    Example:

        >>> from django_elasticsearch_dsl_drf.helpers import more_like_this
        >>> from books.models import Book
        >>> book = Book.objects.first()
        >>> similar_books = more_like_this(
        ...     book,
        ...     ['title', 'description', 'summary']
        ... )
    """
    _index, _mapping = get_index_and_mapping_for_model(obj._meta.model)
    if _index is None:
        return None

    search = Search(using=connections.get_connection(), index=_index)
    if query is not None:
        search = search.query(query)

    # Only forward the tuning options that were actually given.
    tuning = (
        ('max_query_terms', max_query_terms),
        ('min_term_freq', min_term_freq),
        ('min_doc_freq', min_doc_freq),
        ('max_doc_freq', max_doc_freq),
    )
    options = {name: value for name, value in tuning if value is not None}

    reference_document = {
        '_id': "{}".format(obj.pk),
        '_index': "{}".format(_index),
    }
    # The document type is only sent when the flag says Elasticsearch is
    # older than 7.0.
    if not ELASTICSEARCH_GTE_7_0:
        reference_document['_type'] = "{}".format(_mapping)

    similar = MoreLikeThis(fields=fields, like=reference_document, **options)
    return search.query(similar)
Exemplo n.º 25
0
def ejercicio2():
    """Exercise 2: expand a user query with significant terms and save the results.

    Steps:
      1. Ask the user for a term/phrase and run a "more like this" query on
         the "reddit-mentalhealth" index with three ``significant_terms``
         aggregations (selftext, subreddit, title), using the statistic
         chosen through ``select_estadistico()``.
      2. Collect the aggregation keys, dropping stop words (read from
         ``stop.txt``) and duplicates.
      3. Search the index again with the collected words joined by ``OR``
         and serialize selftext/title/subreddit of every hit to
         ``Ejercicio2.json``; a message is printed when there is nothing
         to save.
    """
    es = config()

    # ``raw_input`` only exists on Python 2; fall back to ``input`` elsewhere.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    query = read_line("Introduzca un termino/frase a buscar >> ")
    print()

    util.install_and_import("elasticsearch-dsl", "elasticsearch_dsl")
    from elasticsearch_dsl import Search
    from elasticsearch_dsl.query import MoreLikeThis

    properties = select_estadistico()
    est = properties[0]
    properties_est = properties[1]

    number = 25
    index_name = "reddit-mentalhealth"

    s = Search(using=es, index=index_name)
    s = s.query(
        MoreLikeThis(
            like=query,
            fields=['selftext', 'title', 'subreddit'],
            min_term_freq=1,
            max_query_terms=number,
        ))

    # One significant_terms aggregation per analysed field (bucket, field).
    aggregated_fields = (
        ("Text", "selftext"),
        ("Subreddit", "subreddit"),
        ("Title", "title"),
    )
    for bucket_name, field in aggregated_fields:
        s.aggs.bucket(bucket_name, {
            "significant_terms": {
                "field": field,
                "size": number,
                est: properties_est
            }
        })

    results = s.execute().to_dict()

    # Load the stop words: the first space-delimited token of every line.
    # Stripping removes the trailing newline of lines that contain no space,
    # which otherwise would keep such words from ever matching a key.
    stop_words = set()
    with open("stop.txt") as f:
        for line in f:
            token = line.split(" ", 1)[0].strip()
            if token:
                stop_words.add(token)

    # Collect the aggregation keys in order, without stop words or repeats.
    words = []
    seen = set()
    for bucket_name in ["Subreddit", "Text", "Title"]:
        for bucket in results["aggregations"][bucket_name]["buckets"]:
            key = bucket["key"]
            if key not in stop_words and key not in seen:
                seen.add(key)
                words.append(key)

    results = es.search(
        index=index_name,
        body={"query": {
            "query_string": {
                "query": ' OR '.join(words),
            }
        }})

    json_data = [
        {
            'selftext': hit['_source']['selftext'],
            'title': hit['_source']['title'],
            'subreddit': hit['_source']['subreddit'],
        }
        for hit in results['hits']['hits']
    ]

    if json_data:
        util.serializer(json_data, 'Ejercicio2.json')
    else:
        print("No hay datos para guardar.\n")