def get_subjects_for_doc_ids(doc_ids, dataset):
        """Look up the subject of every email identified by ``doc_ids``.

        Each id is fetched on its own through ``Emails.get_email_from_solr``.

        Args:
            doc_ids: iterable of document ids to resolve.
            dataset: dataset handed on to ``Emails.get_email_from_solr``.

        Returns:
            A list with one ``{'subject': ..., 'doc_id': ...}`` dict per id, in
            input order. The subject is ``'NO THREAD DATA FOUND'`` when Solr
            reports ``numFound == 0`` for that id.
        """
        subjects = []

        for current_id in doc_ids:
            parsed = parse_solr_result(Emails.get_email_from_solr(dataset, current_id))

            if parsed['response']['numFound'] == 0:
                subject = 'NO THREAD DATA FOUND'
            else:
                # Only the first parsed document is used for the subject.
                first_email = parse_email_list(parsed['response']['docs'])[0]
                subject = first_email['header']['subject']

            subjects.append({'subject': subject, 'doc_id': current_id})

        return subjects
    def search():
        """Run a fuzzy email search and attach topic distributions to the hits.

        Request arguments (read through ``Controller.get_arg``): ``dataset``
        plus the optional ``limit``, ``offset``, ``sort`` and ``filters``.
        ``filters`` is a JSON object; its ``searchTerm`` entry (default ``''``)
        is the term handed to ``build_fuzzy_solr_query``.

        Returns:
            A dict with the keys ``results`` (parsed emails), ``searchTopics``
            (``main``: topic distribution aggregated over all hits, ``singles``:
            one distribution per hit), ``numFound`` and ``searchTerm``. When the
            search has no hits, both topic lists are empty and no topic query
            is sent.
        """
        dataset = Controller.get_arg('dataset')
        topics_core = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
        limit = Controller.get_arg('limit', arg_type=int, required=False)
        offset = Controller.get_arg('offset', arg_type=int, required=False)
        sort = Controller.get_arg('sort', arg_type=str, required=False)

        raw_filters = Controller.get_arg('filters', arg_type=str, default='{}', required=False)
        filters = json.loads(raw_filters)
        filter_query = build_filter_query(filters, core_type=topics_core)
        term = filters.get('searchTerm', '')

        email_query = QueryBuilder(
            dataset=dataset,
            query=build_fuzzy_solr_query(term),
            limit=limit,
            offset=offset,
            fq=filter_query,
            sort=sort
        )
        response = parse_solr_result(email_query.send())['response']
        emails = parse_email_list(response['docs'])

        def build_response(main_topics, single_topics):
            # Both the empty and the populated answer share this exact layout.
            return {
                'results': emails,
                'searchTopics': {
                    'main': {
                        'topics': main_topics
                    },
                    'singles': single_topics,
                },
                'numFound': response['numFound'],
                'searchTerm': term
            }

        if not emails:
            return build_response([], [])

        # Restrict the topic query to exactly the emails found above.
        doc_id_query = ' OR '.join('doc_id:' + email['doc_id'] for email in emails)

        facet_query = {
            'facet_topic_id': {
                'type': 'terms',
                'field': 'topic_id',
                'facet': {
                    'sum_of_confs_for_topic': 'sum(topic_conf)',
                    'facet_terms': {
                        'type': 'terms',
                        'field': 'terms',
                        'limit': 1
                    }
                },
                'sort': 'index asc',
                'limit': FACET_LIMIT,
                'refine': True
            }
        }

        topic_query = QueryBuilder(
            dataset=dataset,
            query={
                'q': doc_id_query,
                'group': 'true',
                'group.field': 'doc_id',
                'group.limit': '100',
                'json.facet': json.dumps(facet_query)
            },
            limit=SOLR_MAX_INT,
            core_type='Core-Topics'
        )
        topic_result = topic_query.send()

        # One group per email: the basis for the per-email distributions.
        groups = topic_result['grouped']['doc_id']['groups']
        single_distributions = Search.parse_grouped_topic_distributions(groups)

        # The facet buckets carry the sums over all emails; the parser is
        # parameterised with the number of groups.
        parse_bucket = Topics.parse_topic_closure_wrapper(len(groups))
        aggregated_distribution = [
            parse_bucket(bucket)
            for bucket in topic_result['facets']['facet_topic_id']['buckets']
        ]

        all_topics = Topics.get_all_topics(dataset)

        for distribution in single_distributions:
            completed = Topics.complete_distribution_and_add_ranks(distribution['topics'], all_topics)
            distribution['topics'] = Topics.remove_words(completed)

        aggregated_distribution = Topics.complete_distribution_and_add_ranks(
            aggregated_distribution, all_topics)

        return build_response(aggregated_distribution, single_distributions)
    def get_email_by_doc_id():
        """Return one email enriched with topic and thread information.

        Request arguments (read through ``Controller.get_arg``): ``dataset``
        and ``doc_id``.

        Returns:
            * the parsed Solr result unchanged when ``numFound`` is 0;
            * otherwise a dict with ``email``, ``numFound`` and ``searchTerm``
              (the requested ``doc_id``). The email gains a ``topics`` entry
              (``main``: its own distribution, ``singles``: one distribution per
              similar email, tagged with ``highlightId``), and ``predecessor``
              / ``successor`` are replaced by ``{'subject', 'doc_id'}`` dicts;
            * a dict with only ``numFound`` and ``searchTerm`` if the first
              parsed document is falsy.
        """
        dataset = Controller.get_arg('dataset')
        doc_id = Controller.get_arg('doc_id')

        raw_result = Emails.get_email_from_solr(dataset, doc_id, True)
        parsed_result = parse_solr_result(raw_result)
        response = parsed_result['response']

        if response['numFound'] == 0:
            return parsed_result

        email = parse_email_list(response['docs'])[0]

        # The moreLikeThis section is keyed by the Solr id of the main document.
        more_like_this = raw_result['moreLikeThis']
        similar_docs = more_like_this[raw_result['response']['docs'][0]['id']]['docs']
        similar_ids = [similar['doc_id'] for similar in similar_docs]

        # Recipients and keyphrases arrive as string-encoded Python literals
        # unless they hold the respective "not found" placeholder.
        header = email['header']
        if header['recipients'][0] != 'NO RECIPIENTS FOUND':
            header['recipients'] = [literal_eval(encoded) for encoded in header['recipients']]

        if email['keyphrases'][0] != 'NO KEYPHRASES FOUND':
            email['keyphrases'] = [literal_eval(encoded)[0] for encoded in email['keyphrases']]

        if not response['docs'][0]:
            return {
                'numFound': response['numFound'],
                'searchTerm': doc_id
            }

        main_topics = Emails.parse_topics(Emails.get_topic_distribution_for_email(dataset, doc_id))
        all_topics = parse_all_topics(Emails.get_all_topics(dataset)['response']['docs'])
        main_topics = Topics.complete_distribution_and_add_ranks(main_topics, all_topics)

        # Fetch every similar email's distribution first, then complete them;
        # both lists stay empty when there are no similar emails.
        similar_distributions = [
            Emails.parse_topics(Emails.get_topic_distribution_for_email(dataset, similar_id))
            for similar_id in similar_ids
        ]
        similar_topics = [
            {
                'topics': Topics.remove_words(
                    Topics.complete_distribution_and_add_ranks(distribution, all_topics)),
                'highlightId': similar_id
            }
            for distribution, similar_id in zip(similar_distributions, similar_ids)
        ]

        email['topics'] = {
            'main': {
                'topics': main_topics
            },
            'singles': similar_topics
        }

        predecessor = email['predecessor']
        if predecessor == 'NO THREAD DATA FOUND':
            email['predecessor'] = {
                'subject': predecessor,
                'doc_id': ''
            }
        else:
            email['predecessor'] = Emails.get_subjects_for_doc_ids([predecessor], dataset)[0]

        successors = email['successor']
        if successors[0] == 'NO THREAD DATA FOUND':
            # Only the placeholder entry is rewritten, in place.
            successors[0] = {
                'subject': successors[0],
                'doc_id': ''
            }
        else:
            email['successor'] = Emails.get_subjects_for_doc_ids(successors, dataset)

        return {
            'email': email,
            'numFound': response['numFound'],
            'searchTerm': doc_id
        }
    def get_similar_emails_by_doc_id():
        """Return the emails similar to one email plus per-day category counts.

        Request arguments (read through ``Controller.get_arg``): ``dataset``
        and ``doc_id``.

        Returns:
            ``[]`` when the email is not found or Solr reports no similar
            documents for it. Otherwise a dict with ``docs`` (the parsed similar
            emails) and ``dates``; ``dates['day']`` holds one entry per day
            between the earliest and latest dated email (main email included),
            each with ``business`` / ``personal`` / ``spam`` counters, sorted by
            date. The main email's day additionally carries ``'this email': 1``.
            Emails without a date are left out of the day list; if no email has
            a date at all, the day list is empty.

        Raises:
            KeyError: if a similar email has a category other than
                ``business``, ``personal`` or ``spam``.
        """
        dataset = Controller.get_arg('dataset')

        doc_id = Controller.get_arg('doc_id')

        solr_result = Emails.get_email_from_solr(dataset, doc_id, more_like_this=True)

        if solr_result['response']['numFound'] == 0 or \
                solr_result['moreLikeThis'][solr_result['response']['docs'][0]['id']]['numFound'] == 0:
            return []

        def day_of(mail):
            # Keep the part of the date before the "T"; None marks an undated mail.
            timestamp = mail['header']['date']
            return timestamp.split("T")[0] if timestamp != 'NO DATE FOUND' else None

        def empty_bucket(day):
            return {
                'date': day,
                'business': 0,
                'personal': 0,
                'spam': 0
            }

        parsed_solr_result = parse_solr_result(solr_result)
        main_email = parse_email_list(parsed_solr_result['response']['docs'])[0]

        # Wrap the similar documents in the response layout so that they can go
        # through the same parsing helpers as a regular result.
        similar_result = {
            'response': {
                'docs': solr_result['moreLikeThis'][main_email['id']]['docs']
            }
        }
        parsed_similar_result = parse_solr_result(similar_result)
        parsed_similar_mails = parse_email_list(parsed_similar_result['response']['docs'])

        main_day = day_of(main_email)
        similar_dates = {main_day: empty_bucket(main_day)}
        similar_dates[main_day]['this email'] = 1

        for mail in parsed_similar_mails:
            day = day_of(mail)
            if day not in similar_dates:
                similar_dates[day] = empty_bucket(day)
            similar_dates[day][mail['category']] += 1

        dated_days = [day for day in similar_dates if day is not None]

        # min()/max() raise ValueError on an empty sequence, so the gap filling
        # only runs when at least one email actually has a date.
        if dated_days:
            start_date = datetime.datetime.strptime(min(dated_days), '%Y-%m-%d')
            end_date = datetime.datetime.strptime(max(dated_days), '%Y-%m-%d')

            # The last day always has a bucket already, so the range may stop
            # one day short of it.
            for offset in range((end_date - start_date).days):
                day = (start_date + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
                if day not in similar_dates:
                    similar_dates[day] = empty_bucket(day)

        daily_counts = sorted(
            (bucket for bucket in similar_dates.values() if bucket['date'] is not None),
            key=lambda bucket: bucket['date']
        )
        for bucket in daily_counts:
            bucket['date'] = Dates.format_date_for_axis(bucket['date'], 'day')

        return {
            'docs': parsed_similar_mails,
            'dates': {
                'month': [],
                'week': [],
                'day': daily_counts
            }
        }
    # Example #5
    # 0
    def get_sender_recipient_email_list():
        """List the emails matching a sender and/or a recipient.

        Request arguments (read through ``Controller.get_arg``):
            dataset: dataset to query.
            sender, recipient: each defaults to ``'*'``.
            sender_or_recipient: optional; when given it replaces both
                ``sender`` and ``recipient`` and the two conditions are joined
                with ``OR`` instead of ``AND``.
            limit, offset: default to ``DEFAULT_LIMIT`` / ``DEFAULT_OFFSET``.
            filters: JSON object; its ``searchTerm`` entry is appended to the
                query through ``build_fuzzy_solr_query``.

        Returns:
            A dict with ``results`` (parsed emails), ``numFound``, ``query``
            (the query string that was sent) and ``senderEmail`` /
            ``recipientEmail`` (the values as originally requested).

        Raises:
            SyntaxError: if ``sender`` and ``recipient`` are both ``'*'`` and
                ``sender_or_recipient`` is not set.
        """
        dataset = Controller.get_arg('dataset')
        topics_core = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
        sender = Controller.get_arg('sender', default='*')
        recipient = Controller.get_arg('recipient', default='*')
        either_party = Controller.get_arg('sender_or_recipient', required=False)
        current_app.logger.debug(
            '########## %s ###### %s ###### %s ###########', sender, recipient, either_party)
        limit = Controller.get_arg('limit', int, default=DEFAULT_LIMIT)
        offset = Controller.get_arg('offset', int, default=DEFAULT_OFFSET)

        raw_filters = Controller.get_arg('filters', arg_type=str, default='{}', required=False)
        filters = json.loads(raw_filters)
        filter_query = build_filter_query(filters, False, core_type=topics_core)

        if not either_party and sender == '*' and recipient == '*':
            raise SyntaxError(
                'Please provide sender or recipient or both or sender_or_recipient.'
            )

        # Remember what was asked for before the values are escaped for Solr.
        requested_sender, requested_recipient = sender, recipient

        if either_party:
            sender = recipient = either_party
            operator = 'OR'
        else:
            operator = 'AND'

        if sender != '*':
            sender = re.escape(sender)
        if recipient != '*':
            # Every non-alphanumeric character has to be escaped so that Solr
            # matches only the identifying_name part: name, email and
            # identifying_name are all stored in one big 'recipients' string,
            # so without naming 'identifying_name' here the other two would be
            # searched as well.
            recipient = '*"\'identifying_name\': \'{}\'"*'.format(re.escape(recipient))

        term_query = build_fuzzy_solr_query(filters.get('searchTerm', ''))
        q = '(header.sender.identifying_name:{} {} header.recipients:{}) AND {}'.format(
            sender, operator, recipient, term_query)

        solr_result = QueryBuilder(
            dataset=dataset,
            query=q,
            fq=filter_query,
            limit=limit,
            offset=offset,
            sort='Newest first'
        ).send()
        response = parse_solr_result(solr_result)['response']

        return {
            'results': parse_email_list(response['docs']),
            'numFound': response['numFound'],
            'query': q,
            'senderEmail': requested_sender,
            'recipientEmail': requested_recipient
        }