def get_classes_for_correspondent():
    dataset = Controller.get_arg('dataset')
    core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    identifying_name = re.escape(Controller.get_arg('identifying_name'))

    filter_string = Controller.get_arg('filters', arg_type=str, default='{}', required=False)
    filter_object = json.loads(filter_string)
    filter_query = build_filter_query(filter_object, False, core_type=core_topics_name)

    query_builder = QueryBuilder(
        dataset=dataset,
        query={
            'q': 'header.sender.identifying_name:' + identifying_name +
                 ' AND ' + build_fuzzy_solr_query(filter_object.get('searchTerm', '')),
            'group': 'true',
            'group.field': 'category.top_subcategory'
        },
        fq=filter_query,
        fl='groupValue'
    )
    solr_result = query_builder.send()

    grouped_result = solr_result['grouped']['category.top_subcategory']
    groups = grouped_result['groups']
    num = grouped_result['matches']

    if num == 0:
        return []

    return [{
        'key': group['groupValue'],
        'num': group['doclist']['numFound'],
        'share': round(group['doclist']['numFound'] / num, 4)
    } for group in groups]
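# For reference, a minimal sketch of the grouped Solr response consumed above;
# the keys mirror what get_classes_for_correspondent() accesses, the values are
# purely illustrative:
#
#   {
#       'grouped': {
#           'category.top_subcategory': {
#               'matches': 120,
#               'groups': [
#                   {'groupValue': 'business', 'doclist': {'numFound': 90}},
#                   {'groupValue': 'personal', 'doclist': {'numFound': 30}}
#               ]
#           }
#       }
#   }
#
# which the function reduces to entries such as
# {'key': 'business', 'num': 90, 'share': 0.75}.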
def get_correspondents_for_search():
    dataset = Controller.get_arg('dataset')
    core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    sort = Controller.get_arg('sort', arg_type=str, required=False)

    filter_string = Controller.get_arg('filters', arg_type=str, default='{}', required=False)
    filter_object = json.loads(filter_string)
    filter_query = build_filter_query(filter_object, core_type=core_topics_name)
    term = filter_object.get('searchTerm', '')

    query_builder = QueryBuilder(
        dataset=dataset,
        query={
            'q': build_fuzzy_solr_query(term),
            'facet': 'true',
            'facet.mincount': '1',
            'facet.limit': str(FACET_LIMIT),
            # faceting on the sender field groups the hits by correspondent
            'facet.field': 'header.sender.identifying_name'
        },
        fq=filter_query,
        limit=0
    )
    solr_result = query_builder.send()

    return Correspondents.build_correspondents_for_search_result(solr_result, dataset, sort)
def get_matrix_highlighting():
    dataset = Controller.get_arg('dataset')
    filter_string = Controller.get_arg('filters', arg_type=str, default='{}', required=False)

    correspondences = Matrix.search_correspondences_for_term(dataset, filter_string)
    return correspondences
def get_correspondents_for_correspondent():
    dataset = Controller.get_arg('dataset')
    identifying_name = Controller.get_arg('identifying_name')
    limit = Controller.get_arg('limit', int, default=DEFAULT_LIMIT)
    sort = Controller.get_arg('sort', arg_type=str, required=False)

    filter_string = Controller.get_arg('filters', arg_type=str, default='{}', required=False)
    filter_object = json.loads(filter_string)

    start_date = filter_object.get('startDate')
    start_stamp = time.mktime(datetime.datetime.strptime(
        start_date, '%Y-%m-%d').timetuple()) if start_date else 0
    end_date = filter_object.get('endDate')
    end_stamp = time.mktime(datetime.datetime.strptime(
        end_date, '%Y-%m-%d').timetuple()) if end_date else time.time()

    neo4j_requester = Neo4jRequester(dataset)
    result = {}

    all_with_duplicates = default_network_analysis(
        neo4j_requester.get_all_correspondents_for_identifying_name(
            identifying_name, start_time=start_stamp, end_time=end_stamp
        )
    )

    # merge duplicate correspondents by identifying_name, summing their counts
    all_deduplicated = []
    for new_correspondent in all_with_duplicates:
        found = False
        for existing_correspondent in all_deduplicated:
            if new_correspondent['identifying_name'] == existing_correspondent['identifying_name']:
                existing_correspondent['count'] += new_correspondent['count']
                found = True
        if not found:
            all_deduplicated.append(new_correspondent)

    sort_key = 'hierarchy' if sort == HIERARCHY_SCORE_LABEL else 'count'

    result['all'] = sorted(
        all_deduplicated,
        key=lambda correspondent: correspondent[sort_key],
        reverse=True)[0:limit]

    result['from'] = default_network_analysis(
        neo4j_requester.get_sending_correspondents_for_identifying_name(
            identifying_name, start_time=start_stamp, end_time=end_stamp
        )
    )
    result['from'] = sorted(
        result['from'],
        key=lambda correspondent: correspondent[sort_key],
        reverse=True)[0:limit]

    result['to'] = default_network_analysis(
        neo4j_requester.get_receiving_correspondents_for_identifying_name(
            identifying_name, start_time=start_stamp, end_time=end_stamp
        )
    )
    result['to'] = sorted(
        result['to'],
        key=lambda correspondent: correspondent[sort_key],
        reverse=True)[0:limit]

    return result
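# The nested dedup loop above is quadratic in the number of correspondents. A
# dict keyed by identifying_name performs the same merge in linear time; a
# minimal sketch (illustrative only, not wired into the handler above):
def _deduplicate_by_identifying_name(correspondents):
    """Merge correspondents sharing an identifying_name, summing their counts."""
    merged = {}
    for correspondent in correspondents:
        key = correspondent['identifying_name']
        if key in merged:
            merged[key]['count'] += correspondent['count']
        else:
            merged[key] = correspondent
    return list(merged.values())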
def get_topics_for_correspondent():
    dataset = Controller.get_arg('dataset')
    core_name = get_config(dataset)['SOLR_CONNECTION']['Core']
    core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    identifying_name = re.escape(Controller.get_arg('identifying_name'))
    join_string = '{!join from=doc_id fromIndex=' + core_name + ' to=doc_id}'

    filter_string = Controller.get_arg('filters', arg_type=str, default='{}', required=False)
    filter_object = json.loads(filter_string)
    filter_query = build_filter_query(filter_object, False, True, join_string,
                                      core_type=core_topics_name)
    filter_query.append(join_string + 'header.sender.identifying_name:' + identifying_name +
                        ' AND ' + build_fuzzy_solr_query(filter_object.get('searchTerm', '')))
    join_query = filter_query

    aggregated_topics_for_correspondent = Topics.get_aggregated_distribution(
        dataset, core_topics_name, identifying_name, filter_object, join_query)
    aggregated_distribution = {
        'topics': aggregated_topics_for_correspondent
    }

    all_topics = Topics.get_all_topics(dataset)
    mail_topic_distributions = Topics.get_distributions_for_mails(dataset, join_query)

    all_topic_distributions = {
        'main': aggregated_distribution,
        'singles': mail_topic_distributions
    }

    for distribution in all_topic_distributions['singles']:
        topics = Topics.complete_distribution_and_add_ranks(
            distribution['topics'], all_topics)
        topics = Topics.remove_words(topics)
        distribution['topics'] = topics

    all_topic_distributions['main']['topics'] = Topics.complete_distribution_and_add_ranks(
        all_topic_distributions['main']['topics'], all_topics)

    return all_topic_distributions
def get_correspondent_information():
    dataset = Controller.get_arg('dataset')
    identifying_name = Controller.get_arg('identifying_name')

    neo4j_requester = Neo4jRequester(dataset)
    results = default_network_analysis(
        neo4j_requester.get_information_for_identifying_name(identifying_name))

    if len(results) == 0:
        return {
            'numFound': 0,
            'identifying_name': identifying_name
        }
    elif len(results) > 1:
        raise Exception('More than one matching correspondent found for identifying_name '
                        + identifying_name)

    result = dict(results[0])
    result['numFound'] = 1
    result['identifying_name'] = identifying_name
    return result
def get_keyphrases_for_correspondent():
    dataset = Controller.get_arg('dataset')
    core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    identifying_name = re.escape(Controller.get_arg('identifying_name'))

    filter_string = Controller.get_arg('filters', arg_type=str, default='{}', required=False)
    filter_object = json.loads(filter_string)
    filter_query = build_filter_query(filter_object, core_type=core_topics_name)

    query_builder = QueryBuilder(
        dataset=dataset,
        query={
            'q': 'header.sender.identifying_name:' + identifying_name +
                 ' AND ' + build_fuzzy_solr_query(filter_object.get('searchTerm', '')),
            'facet': 'true',
            'facet.field': 'keyphrases',
            'facet.mincount': '1'
        },
        fq=filter_query,
        limit=0,
    )
    solr_result = query_builder.send()

    parsed_solr_result = parse_solr_result(solr_result)
    results = parsed_solr_result['facet_counts']['facet_fields']['keyphrases']

    if len(results) == 0:
        return results

    aggregated_keyphrases = Keyphrases.parse_keyphrases(results)
    return aggregated_keyphrases
def get_dates_for_correspondent():
    dataset = Controller.get_arg('dataset')
    core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']

    filter_string = Controller.get_arg('filters', arg_type=str, default='{}', required=False)
    filter_object = json.loads(filter_string)
    filter_query = build_filter_query(filter_object, core_type=core_topics_name)
    term = filter_object.get('searchTerm', '*')

    identifying_name = re.escape(Controller.get_arg('identifying_name'))
    # match the correspondent either as sender or anywhere inside the
    # stringified recipients field
    identifying_name_filter = '*' + re.escape(
        "'identifying_name': '" + identifying_name + "'") + '*'
    identifying_name_query = (
        'header.sender.identifying_name:{0} OR header.recipients:{1}'
    ).format(identifying_name, identifying_name_filter)

    start_range = Dates.get_date_range_border(dataset, 'start')
    end_range = Dates.get_date_range_border(dataset, 'end')
    start_date_filter = filter_object.get('startDate')
    start_date = (start_date_filter + 'T00:00:00Z') if start_date_filter else start_range
    end_date_filter = filter_object.get('endDate')
    end_date = (end_date_filter + 'T23:59:59Z') if end_date_filter else end_range

    obj = {}
    for period in ['day', 'week', 'month']:
        result = {}
        for category in ['business', 'personal', 'spam']:
            r = Dates.get_date_facet_result(
                dataset, filter_query, term, identifying_name_query,
                start_date, end_date, period, category)
            result['dates'] = [entry['date'] for entry in r]
            result[category] = [entry['count'] for entry in r]
        obj[period] = Dates.transform_category_frequencies_over_time(result)
    return obj
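# Illustrative rendering of identifying_name_query as assembled above, for a
# hypothetical identifying_name of 'jane.doe' (escaping simplified; the exact
# backslashes depend on re.escape):
#
#   header.sender.identifying_name:jane\.doe OR
#       header.recipients:*"'identifying_name': 'jane\.doe'"*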
def get_matrix():
    identifying_names = Controller.get_arg_list('identifying_name')
    dataset = Controller.get_arg('dataset')

    neo4j_requester = Neo4jRequester(dataset)
    relations = neo4j_requester.get_matrix_for_identifying_names(identifying_names)
    community_count = neo4j_requester.get_feature_count('community')
    role_count = neo4j_requester.get_feature_count('role')

    matrix = Matrix.build_matrix(relations, community_count, role_count)
    return matrix
def search_correspondents():
    dataset = Controller.get_arg('dataset')
    search_phrase = Controller.get_arg('search_phrase')
    match_exact = Controller.get_arg('match_exact', arg_type=bool, default=False, required=False)
    offset = Controller.get_arg('offset', arg_type=int, default=0, required=False)
    limit = Controller.get_arg('limit', arg_type=int, default=10, required=False)
    search_fields = Controller.get_arg_list(
        'search_field', default=['identifying_name'], required=False
    )

    allowed_search_field_values = {'identifying_name', 'email_addresses', 'aliases'}
    if not set(search_fields).issubset(allowed_search_field_values):
        raise Exception('Allowed values for arg search_field are '
                        + str(allowed_search_field_values))

    neo4j_requester = Neo4jRequester(dataset)
    results, num_found = neo4j_requester.get_correspondents_for_search_phrase(
        search_phrase, match_exact, search_fields, offset, limit
    )
    return {
        'results': [dict(result) for result in default_network_analysis(results)],
        'numFound': num_found
    }
def get_graph():
    dataset = Controller.get_arg('dataset')
    is_correspondent_view = Controller.get_arg('is_correspondent_view', required=False)
    identifying_names = Controller.get_arg_list('identifying_name')
    neo4j_requester = Neo4jRequester(dataset)

    start_date = Controller.get_arg('start_date', required=False)
    start_stamp = time.mktime(datetime.datetime.strptime(
        start_date, '%Y-%m-%d').timetuple()) if start_date else 0
    end_date = Controller.get_arg('end_date', required=False)
    end_stamp = time.mktime(datetime.datetime.strptime(
        end_date, '%Y-%m-%d').timetuple()) if end_date else time.time()

    graph = {'nodes': [], 'links': []}
    visited_nodes = []

    for node in neo4j_requester.get_nodes_for_identifying_names(identifying_names):
        if node['id'] not in visited_nodes:
            visited_nodes.append(node['id'])
            graph['nodes'].append(build_node(node['id'], node['identifying_name']))

        if is_correspondent_view == 'true':
            for neighbour in neo4j_requester.get_neighbours_for_node(
                    node['id'], start_stamp, end_stamp):
                if neighbour['id'] not in visited_nodes:
                    visited_nodes.append(neighbour['id'])
                    graph['nodes'].append(
                        build_node(neighbour['id'], neighbour['identifying_name']))

    for relation in neo4j_requester.get_relations_for_nodes(
            visited_nodes, start_stamp, end_stamp):
        graph['links'].append(
            build_edge(relation['relation_id'], relation['source_id'],
                       relation['target_id']))

    # add hops to connect lonely nodes with other nodes in graph
    if is_correspondent_view == 'false':
        # in correspondent view, no nodes without links should appear
        nodes = list(graph['nodes'])
        links = list(graph['links'])
        for node in nodes:
            has_links = False
            for link in links:
                if (node['id'] == link['source']) or (node['id'] == link['target']):
                    if link['source'] != link['target']:
                        has_links = True
            if has_links:
                continue

            other_nodes = list(visited_nodes)
            other_nodes.remove(node['id'])
            for hop in neo4j_requester.get_path_between_nodes(node['id'], other_nodes):
                if hop['hop_id'] not in visited_nodes:
                    visited_nodes.append(hop['hop_id'])
                    graph['nodes'].append(
                        build_node(hop['hop_id'], hop['hop_identifying_name']))
                graph['links'].append(
                    build_edge(hop['r1_id'], hop['source_id'], hop['hop_id']))
                graph['links'].append(
                    build_edge(hop['r2_id'], hop['hop_id'], hop['target_id']))

    return graph
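# Shape of the value returned by get_graph(); 'id', 'source' and 'target' are
# the keys the lonely-node pass above actually reads, any further keys on nodes
# and links depend on build_node()/build_edge() (values illustrative):
#
#   {
#       'nodes': [{'id': 17, ...}, {'id': 23, ...}],
#       'links': [{'source': 17, 'target': 23, ...}]
#   }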
def search():
    dataset = Controller.get_arg('dataset')
    core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    limit = Controller.get_arg('limit', arg_type=int, required=False)
    offset = Controller.get_arg('offset', arg_type=int, required=False)
    sort = Controller.get_arg('sort', arg_type=str, required=False)

    filter_string = Controller.get_arg('filters', arg_type=str, default='{}', required=False)
    filter_object = json.loads(filter_string)
    filter_query = build_filter_query(filter_object, core_type=core_topics_name)
    term = filter_object.get('searchTerm', '')

    query = build_fuzzy_solr_query(term)
    query_builder = QueryBuilder(
        dataset=dataset,
        query=query,
        limit=limit,
        offset=offset,
        fq=filter_query,
        sort=sort
    )
    solr_result = query_builder.send()
    parsed_solr_result = parse_solr_result(solr_result)
    results = parse_email_list(parsed_solr_result['response']['docs'])

    if len(results) == 0:
        return {
            'results': results,
            'searchTopics': {
                'main': {
                    'topics': []
                },
                'singles': [],
            },
            'numFound': parsed_solr_result['response']['numFound'],
            'searchTerm': term
        }

    # restrict the topic query to the doc_ids of the result page just fetched
    conditions = map(lambda result_element: 'doc_id:' + result_element['doc_id'], results)
    doc_id_filter_query = reduce(
        lambda condition_1, condition_2: condition_1 + ' OR ' + condition_2, conditions)

    facet_query = {
        'facet_topic_id': {
            'type': 'terms',
            'field': 'topic_id',
            'facet': {
                'sum_of_confs_for_topic': 'sum(topic_conf)',
                'facet_terms': {
                    'type': 'terms',
                    'field': 'terms',
                    'limit': 1
                }
            },
            'sort': 'index asc',
            'limit': FACET_LIMIT,
            'refine': True
        }
    }

    query_builder = QueryBuilder(
        dataset=dataset,
        query={
            'q': doc_id_filter_query,
            'group': 'true',
            'group.field': 'doc_id',
            'group.limit': '100',
            'json.facet': json.dumps(facet_query)
        },
        limit=SOLR_MAX_INT,
        core_type='Core-Topics'
    )
    solr_topic_result = query_builder.send()

    topic_dists_for_emails = solr_topic_result['grouped']['doc_id']['groups']
    topic_dists_for_emails_parsed = Search.parse_grouped_topic_distributions(
        topic_dists_for_emails)

    aggregated_topic_dist_parsed = list(map(
        Topics.parse_topic_closure_wrapper(len(topic_dists_for_emails)),
        solr_topic_result['facets']['facet_topic_id']['buckets']
    ))

    all_topics = Topics.get_all_topics(dataset)
    for distribution in topic_dists_for_emails_parsed:
        topics = Topics.complete_distribution_and_add_ranks(
            distribution['topics'], all_topics)
        topics = Topics.remove_words(topics)
        distribution['topics'] = topics

    aggregated_topic_dist_parsed = Topics.complete_distribution_and_add_ranks(
        aggregated_topic_dist_parsed, all_topics)

    return {
        'results': results,
        'searchTopics': {
            'main': {
                'topics': aggregated_topic_dist_parsed
            },
            'singles': topic_dists_for_emails_parsed,
        },
        'numFound': parsed_solr_result['response']['numFound'],
        'searchTerm': term
    }
def get_email_by_doc_id():
    dataset = Controller.get_arg('dataset')
    doc_id = Controller.get_arg('doc_id')

    solr_result = Emails.get_email_from_solr(dataset, doc_id, more_like_this=True)
    parsed_solr_result = parse_solr_result(solr_result)

    if parsed_solr_result['response']['numFound'] == 0:
        return parsed_solr_result

    email = parse_email_list(parsed_solr_result['response']['docs'])[0]
    similars = solr_result['moreLikeThis'][solr_result['response']['docs'][0]['id']]['docs']
    similar_ids = list(map(lambda x: x['doc_id'], similars))

    # recipients and keyphrases are stored as stringified Python literals in Solr
    if email['header']['recipients'][0] != 'NO RECIPIENTS FOUND':
        email['header']['recipients'] = [literal_eval(recipient)
                                         for recipient in email['header']['recipients']]
    if email['keyphrases'][0] != 'NO KEYPHRASES FOUND':
        email['keyphrases'] = [literal_eval(keyphrase)[0] for keyphrase in email['keyphrases']]

    if parsed_solr_result['response']['docs'][0]:
        request_results = Emails.get_topic_distribution_for_email(dataset, doc_id)
        topics_as_objects = Emails.parse_topics(request_results)

        solr_result_all_topics = Emails.get_all_topics(dataset)
        all_topics_parsed = parse_all_topics(solr_result_all_topics['response']['docs'])

        topics_as_objects = Topics.complete_distribution_and_add_ranks(
            topics_as_objects, all_topics_parsed)

        completed_dists = []
        if similar_ids:
            dists = [Emails.parse_topics(
                Emails.get_topic_distribution_for_email(dataset, similar_id))
                for similar_id in similar_ids]
            completed_dists = [
                {
                    'topics': Topics.remove_words(
                        Topics.complete_distribution_and_add_ranks(dist, all_topics_parsed))
                } for dist in dists
            ]
            for dist, similar_id in zip(completed_dists, similar_ids):
                dist['highlightId'] = similar_id

        email['topics'] = {
            'main': {
                'topics': topics_as_objects
            },
            'singles': completed_dists if similar_ids else []
        }

        if email['predecessor'] == 'NO THREAD DATA FOUND':
            email['predecessor'] = {
                'subject': email['predecessor'],
                'doc_id': ''
            }
        else:
            email['predecessor'] = Emails.get_subjects_for_doc_ids(
                [email['predecessor']], dataset)[0]

        if email['successor'][0] == 'NO THREAD DATA FOUND':
            email['successor'][0] = {
                'subject': email['successor'][0],
                'doc_id': ''
            }
        else:
            email['successor'] = Emails.get_subjects_for_doc_ids(email['successor'], dataset)

        return {
            'email': email,
            'numFound': parsed_solr_result['response']['numFound'],
            'searchTerm': doc_id
        }
    else:
        return {
            'numFound': parsed_solr_result['response']['numFound'],
            'searchTerm': doc_id
        }
def get_similar_emails_by_doc_id():
    dataset = Controller.get_arg('dataset')
    doc_id = Controller.get_arg('doc_id')

    solr_result = Emails.get_email_from_solr(dataset, doc_id, more_like_this=True)
    if solr_result['response']['numFound'] == 0 or \
            solr_result['moreLikeThis'][solr_result['response']['docs'][0]['id']]['numFound'] == 0:
        return []

    result = {
        'response': {
            'docs': []
        }
    }
    parsed_solr_result = parse_solr_result(solr_result)
    main_email = parse_email_list(parsed_solr_result['response']['docs'])[0]
    result['response']['docs'] = solr_result['moreLikeThis'][main_email['id']]['docs']
    parsed_similar_result = parse_solr_result(result)
    parsed_similar_mails = parse_email_list(parsed_similar_result['response']['docs'])

    date = main_email['header']['date'].split('T')[0] \
        if main_email['header']['date'] != 'NO DATE FOUND' else None
    similar_dates = {
        date: {
            'date': date,
            'business': 0,
            'personal': 0,
            'spam': 0,
            'this email': 1
        }
    }

    # count similar mails per day and category
    for mail in parsed_similar_mails:
        date = mail['header']['date'].split('T')[0] \
            if mail['header']['date'] != 'NO DATE FOUND' else None
        if date not in similar_dates:
            similar_dates[date] = {
                'date': date,
                'business': 0,
                'personal': 0,
                'spam': 0
            }
        similar_dates[date][mail['category']] += 1

    # fill gaps so the timeline has an entry for every day in the range
    dates = [x['date'] for x in similar_dates.values() if x['date'] is not None]
    start_date = datetime.datetime.strptime(min(dates), '%Y-%m-%d')
    end_date = datetime.datetime.strptime(max(dates), '%Y-%m-%d')
    for offset in range((end_date - start_date).days):
        date = (start_date + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
        if date not in similar_dates:
            similar_dates[date] = {
                'date': date,
                'business': 0,
                'personal': 0,
                'spam': 0
            }

    similar_dates = sorted(
        filter(lambda x: x['date'] is not None, similar_dates.values()),
        key=lambda k: k['date'])
    for i, entry in enumerate(similar_dates):
        similar_dates[i]['date'] = Dates.format_date_for_axis(entry['date'], 'day')

    return {
        'docs': parsed_similar_mails,
        'dates': {
            'month': [],
            'week': [],
            'day': similar_dates
        }
    }
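# Example of the gap filling above (dates hypothetical): if the similar mails
# span 2001-05-01 to 2001-05-04 but nothing arrived on 05-02 and 05-03, those
# two days still get zeroed {'business': 0, 'personal': 0, 'spam': 0} entries,
# so the day-level chart axis stays dense.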
def get_sender_recipient_email_list():
    dataset = Controller.get_arg('dataset')
    core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    sender = Controller.get_arg('sender', default='*')
    recipient = Controller.get_arg('recipient', default='*')
    sender_or_recipient = Controller.get_arg('sender_or_recipient', required=False)
    current_app.logger.debug('########## %s ###### %s ###### %s ###########',
                             sender, recipient, sender_or_recipient)
    limit = Controller.get_arg('limit', int, default=DEFAULT_LIMIT)
    offset = Controller.get_arg('offset', int, default=DEFAULT_OFFSET)

    filter_string = Controller.get_arg('filters', arg_type=str, default='{}', required=False)
    filter_object = json.loads(filter_string)
    filter_query = build_filter_query(filter_object, False, core_type=core_topics_name)

    if sender == '*' and recipient == '*' and not sender_or_recipient:
        raise SyntaxError(
            'Please provide sender, recipient, both, or sender_or_recipient.'
        )

    original_sender = sender
    original_recipient = recipient
    if sender_or_recipient:
        sender = recipient = sender_or_recipient

    if sender != '*':
        sender = re.escape(sender)
    if recipient != '*':
        # all non-alphanumerics must be escaped in order for Solr to match only
        # the identifying_name field-part: if we DIDN'T specify
        # 'identifying_name' for 'recipients' here, 'name' and 'email' would be
        # searched as well, because all three attributes are stored in one big
        # 'recipients' string in Solr!
        recipient = '*"\'identifying_name\': \'{}\'"*'.format(re.escape(recipient))

    operator = 'OR' if sender_or_recipient else 'AND'
    q = '(header.sender.identifying_name:{} {} header.recipients:{}) AND {}'.format(
        sender, operator, recipient,
        build_fuzzy_solr_query(filter_object.get('searchTerm', '')))

    query_builder = QueryBuilder(
        dataset=dataset,
        query=q,
        fq=filter_query,
        limit=limit,
        offset=offset,
        sort='Newest first'
    )
    solr_result = query_builder.send()
    parsed_solr_result = parse_solr_result(solr_result)

    return {
        'results': parse_email_list(parsed_solr_result['response']['docs']),
        'numFound': parsed_solr_result['response']['numFound'],
        'query': q,
        'senderEmail': original_sender,
        'recipientEmail': original_recipient
    }
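# Illustrative rendering of q for sender='alice', recipient='bob' and no
# sender_or_recipient (names hypothetical; the trailing clause is whatever
# build_fuzzy_solr_query() returns for the current searchTerm):
#
#   (header.sender.identifying_name:alice AND
#    header.recipients:*"'identifying_name': 'bob'"*) AND <fuzzy term query>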