Example #1
File: views.py Project: cbentes/texta
def get_all_matched_rows(query, request, inclusive_instructions):
    buffer_ = StringIO()
    writer = csv.writer(buffer_)

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    features = sorted([field['path'] for field in es_m.get_mapped_fields()])

    query['size'] = ES_SCROLL_BATCH

    writer.writerow(features)

    request_url = os.path.join(es_url, ds.get_index(), ds.get_mapping(), '_search?scroll=1m')
    response = requests.get(request_url, data=json.dumps(query)).json()

    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    scroll_payload = json.dumps({'scroll':'1m', 'scroll_id':scroll_id})
    while hits:
        for hit in hits:
            # Collect feature values for this hit, keyed by mapped field path
            feature_dict = {}

            row = []
            for feature_name in features:
                feature_path = feature_name.split('.')
                parent_source = hit['_source']
                for path_component in feature_path:
                    if path_component in parent_source:
                        parent_source = parent_source[path_component]
                    else:
                        parent_source = ""
                        break

                content = parent_source
                row.append(content)
                feature_dict[feature_name] = content

            layer_dict = matcher.LayerDict(feature_dict)
            if inclusive_instructions.match(layer_dict):
                writer.writerow([element.encode('utf-8') if isinstance(element,unicode) else element for element in row])

        buffer_.seek(0)
        data = buffer_.read()
        buffer_.seek(0)
        buffer_.truncate()
        yield data

        response = requests.get(os.path.join(es_url,'_search','scroll'), data=scroll_payload).json()
        hits = response['hits']['hits']
        scroll_id = response['_scroll_id']
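The CSV export in Example #1 pages through Elasticsearch with the scroll API using plain requests calls. Below is a minimal sketch of that scroll loop in isolation, assuming an older Elasticsearch version whose scroll endpoints match the URLs used above; the function name fetch_all_hits and its parameters are illustrative, not part of the texta project.

import json
import os
import requests

def fetch_all_hits(es_url, index, mapping, query, scroll='1m'):
    # Illustrative helper mirroring the scroll flow above (older ES scroll endpoints).
    url = os.path.join(es_url, index, mapping, '_search?scroll=' + scroll)
    response = requests.get(url, data=json.dumps(query)).json()
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']
    while hits:
        for hit in hits:
            yield hit['_source']
        payload = json.dumps({'scroll': scroll, 'scroll_id': scroll_id})
        response = requests.get(os.path.join(es_url, '_search', 'scroll'),
                                data=payload).json()
        scroll_id = response['_scroll_id']
        hits = response['hits']['hits']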
Example #2
def index(request):
    # Define selected mapping
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    es_m = ds.build_manager(ES_Manager)

    fields = get_fields(es_m)

    searches = [{
        'id': search.pk,
        'desc': search.description
    } for search in Search.objects.filter(
        author=request.user, dataset__index=dataset, dataset__mapping=mapping)]

    datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(
        task_type=TaskTypes.TRAIN_MODEL.value).filter(
            status=Task.STATUS_COMPLETED).order_by('-pk')

    template = loader.get_template('grammar_builder.html')
    return HttpResponse(
        template.render(
            {
                'STATIC_URL': STATIC_URL,
                'searches': searches,
                'features': fields,
                'language_models': language_models,
                'allowed_datasets': datasets
            }, request))
Example #3
def save_grammar(request):
    grammar_dict = json.loads(request.POST['json'])

    grammar_id = grammar_dict[0]['id']

    if grammar_id == 'new':
        name = grammar_dict[0]['text']

        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()

        grammar = Grammar(name=name,
                          json='',
                          author=request.user,
                          dataset=Dataset.objects.filter(index=dataset,
                                                         mapping=mapping)[0])
        grammar.save()

        grammar_dict[0]['id'] = grammar.id
    else:
        grammar = Grammar.objects.get(id=grammar_id)

    grammar.json = json.dumps(grammar_dict)
    grammar.save()

    return HttpResponse(json.dumps({'id': grammar.id}))
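save_grammar reads a POST field named json containing a JSON array whose first element carries the grammar id ('new' for a grammar that has not been saved yet) and its display text, and it answers with the stored grammar's id. A hedged client-side sketch of calling it might look like the following; the URL and the absence of authentication are illustrative assumptions, since routing and login handling are not shown in these examples.

import json
import requests

# Hypothetical endpoint; the real URL prefix and session/auth handling are not shown here.
payload = {'json': json.dumps([{'id': 'new', 'text': 'My grammar'}])}
response = requests.post('http://localhost:8000/grammar_builder/save_grammar', data=payload)
print(response.json())  # expected shape: {'id': <grammar primary key>}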
Example #4
File: views.py Project: cbentes/texta
def get_example_texts(request, field, value):
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    query = json.dumps({
        "size": 10,
        "highlight": {
            "fields": {
                field: {}
            }
        },
        "query": {
            "match": {
                field: value
            }
        }
    })
    response = ES_Manager.plain_scroll(es_url, dataset, mapping, query)

    matched_sentences = []
    for hit in response['hits']['hits']:
        for match in hit['highlight'].values():
            matched_sentences.append(match[0])

    return matched_sentences
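get_example_texts relies on Elasticsearch highlighting: every hit carries a highlight object keyed by field name, holding a list of highlighted fragments, and the view keeps the first fragment per field. The short sketch below walks the same loop over a hard-coded response of that shape; the field name and fragment text are invented for illustration.

# Illustrative response shape for the highlight query above; values are invented.
response = {
    'hits': {
        'hits': [
            {'highlight': {'comment_text': ['the matched <em>value</em> in context']}},
            {'highlight': {'comment_text': ['another <em>value</em> fragment']}},
        ]
    }
}

matched_sentences = []
for hit in response['hits']['hits']:
    for match in hit['highlight'].values():
        matched_sentences.append(match[0])
# matched_sentences now holds one highlighted fragment per hit.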
Example #5
File: views.py Project: cbentes/texta
def get_grammar_listing(request):
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    grammars = Grammar.objects.filter(author=request.user, dataset__index=dataset, dataset__mapping=mapping).order_by('-last_modified')
    grammar_json = json.dumps([{'id':grammar.id, 'name':grammar.name, 'last_modified':grammar.last_modified.strftime("%d/%m/%y %H:%M:%S")} for grammar in grammars])

    return HttpResponse(grammar_json)
Example #6
    def parse_request(self,request):

        self.lookup_types = request.POST['lookup_types'].split(',')
        self.key_constraints = request.POST['key_constraints'].split(',')
        self.content = request.POST['content'].split('\n')[-1].strip()
        print(self.content)
        ds = Datasets().activate_dataset(request.session)
        self.dataset = ds.get_index()
        self.mapping = ds.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        self.user = request.user
Example #7
    def parse_request(self, request):

        self.lookup_types = request.POST['lookup_types'].split(',')
        self.key_constraints = request.POST['key_constraints'].split(',')
        self.content = request.POST['content'].split('\n')[-1].strip()
        print(self.content)
        ds = Datasets().activate_dataset(request.session)
        self.dataset = ds.get_index()
        self.mapping = ds.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        self.user = request.user
Example #8
File: views.py Project: cbentes/texta
def get_example_texts(request, field, value):
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    query = json.dumps({ "size":10, "highlight": {"fields": {field: {}}}, "query": {"match": {field: value}}})
    response = ES_Manager.plain_scroll(es_url, dataset, mapping, query)

    matched_sentences = []
    for hit in response['hits']['hits']:
        for match in hit['highlight'].values():
            matched_sentences.append(match[0])

    return matched_sentences
Example #9
File: views.py Project: cbentes/texta
def get_table_header(request):
    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    # get columns names from ES mapping
    fields = es_m.get_column_names()
    template_params = {'STATIC_URL': STATIC_URL,
                       'URL_PREFIX': URL_PREFIX,
                       'fields': fields,
                       'searches': Search.objects.filter(author=request.user),
                       'columns': [{'index':index, 'name':field_name} for index, field_name in enumerate(fields)],
                       'dataset': ds.get_index(),
                       'mapping': ds.get_mapping()}
    template = loader.get_template('searcher_results.html')
    return HttpResponse(template.render(template_params, request))
Example #10
def get_grammar_listing(request):
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    grammars = Grammar.objects.filter(
        author=request.user, dataset__index=dataset,
        dataset__mapping=mapping).order_by('-last_modified')
    grammar_json = json.dumps([{
        'id': grammar.id,
        'name': grammar.name,
        'last_modified': grammar.last_modified.strftime("%d/%m/%y %H:%M:%S")
    } for grammar in grammars])

    return HttpResponse(grammar_json)
Example #11
    def __init__(self,request):
        ds = Datasets().activate_dataset(request.session)
        self.dataset = ds.get_index()
        self.mapping = ds.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        # PREPARE AGGREGATION
        self.es_params = request.POST
        interval = self.es_params["interval_1"]


        self.daterange = self._get_daterange(self.es_params)
        
        self.ranges,self.date_labels = self._get_date_intervals(self.daterange,interval)
        self.agg_query = self.prepare_agg_query()
        # EXECUTE AGGREGATION
        agg_results = self.aggregate()

        # PARSE RESPONSES INTO JSON OBJECT
        self.agg_data = self.parse_responses(agg_results)
Example #12
    def __init__(self, request):
        ds = Datasets().activate_dataset(request.session)
        self.dataset = ds.get_index()
        self.mapping = ds.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        # PREPARE AGGREGATION
        self.es_params = request.POST
        interval = self.es_params["interval_1"]

        self.daterange = self._get_daterange(self.es_params)

        self.ranges, self.date_labels = self._get_date_intervals(
            self.daterange, interval)
        self.agg_query = self.prepare_agg_query()
        # EXECUTE AGGREGATION
        agg_results = self.aggregate()

        # PARSE RESPONSES INTO JSON OBJECT
        self.agg_data = self.parse_responses(agg_results)
Example #13
File: views.py Project: cbentes/texta
def index(request):
    # Define selected mapping
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    es_m = ds.build_manager(ES_Manager)

    fields = get_fields(es_m)

    searches = [{'id':search.pk,'desc':search.description} for search in
                Search.objects.filter(author=request.user, dataset__index=dataset, dataset__mapping=mapping)]

    datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type='train_model').filter(status__iexact='completed').order_by('-pk')

    template = loader.get_template('grammar_builder.html')
    return HttpResponse(template.render({'STATIC_URL':STATIC_URL,
                                         'searches':searches,
                                         'features':fields,
                                         'language_models': language_models, 
                                         'allowed_datasets': datasets},request))
Example #14
File: views.py Project: cbentes/texta
def save_grammar(request):
    grammar_dict = json.loads(request.POST['json'])

    grammar_id = grammar_dict[0]['id']

    if grammar_id == 'new':
        name = grammar_dict[0]['text']

        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()

        grammar = Grammar(name=name, json='', author=request.user, dataset=Dataset.objects.filter(index=dataset, mapping=mapping)[0])
        grammar.save()

        grammar_dict[0]['id'] = grammar.id
    else:
        grammar = Grammar.objects.get(id=grammar_id)

    grammar.json = json.dumps(grammar_dict)
    grammar.save()

    return HttpResponse(json.dumps({'id':grammar.id}))
Example #15
File: views.py Project: cbentes/texta
def get_table_data(request):
    query_data = {}

    query_data['search_id'] = request.GET['search_id']
    query_data['polarity'] = request.GET['polarity']
    query_data['requested_page'] = int(request.GET['iDisplayStart'])/int(request.GET['iDisplayLength'])+1
    query_data['page_length'] = int(request.GET['iDisplayLength'])

    if request.GET['is_test'] == 'true':
        query_data['inclusive_metaquery'] = json.loads(request.GET['inclusive_test_grammar'])

        query_data['inclusive_grammar_id'] = -1
        query_data['exclusive_grammar_id'] = -1

        query_data['features'] = sorted(extract_layers(query_data['inclusive_metaquery']))

    else:
        query_data['inclusive_grammar_id'] = request.GET['inclusive_grammar_id']
        query_data['exclusive_grammar_id'] = request.GET['exclusive_grammar_id']

        query_data['inclusive_metaquery'] = generate_metaquery_dict(int(query_data['inclusive_grammar_id']), request.user, component={})
        query_data['exclusive_metaquery'] = generate_metaquery_dict(int(query_data['exclusive_grammar_id']), request.user, component={})

        query_data['features'] = sorted(extract_layers(query_data['inclusive_metaquery']) | extract_layers(query_data['exclusive_metaquery']))


    GrammarPageMapping.objects.filter(search_id=query_data['search_id'],
                                    inclusive_grammar=query_data['inclusive_grammar_id'],
                                    exclusive_grammar=query_data['exclusive_grammar_id'],
                                    polarity=query_data['polarity'], author=request.user).delete()


    ds = Datasets().activate_dataset(request.session)

    query_data['dataset'] = ds.get_index()
    query_data['mapping'] = ds.get_mapping()

    component_query = ElasticGrammarQuery(query_data['inclusive_metaquery'], None).generate()

    es_m = ds.build_manager(ES_Manager)
    if query_data['search_id'] != '-1':
        saved_query = json.loads(Search.objects.get(pk=query_data['search_id']).query)
        es_m.load_combined_query(saved_query)

        if query_data['polarity'] == 'positive':
            es_m.merge_combined_query_with_query_dict(component_query)
    else:
        #es_m.combined_query = {"main": {"query": {"bool": {"should": [{"match_all":{}}], "must": [], "must_not": []}}},
                                #"facts": {"include": [], 'total_include': 0,
                                     #"exclude": [], 'total_exclude': 0}}
        es_m.combined_query = {"main": {"query":{"match_all":{}}}}
        if query_data['polarity'] == 'positive':
            es_m.combined_query = component_query

    # Add paging data to the query
    #es_m.set_query_parameter('from', request.session['grammar_'+polarity+'_cursor'])
    es_m.set_query_parameter('size', request.GET['iDisplayLength'])
    es_m.set_query_parameter('_source', query_data['features'])

    query_data['inclusive_instructions'] = generate_instructions(query_data['inclusive_metaquery'])
    query_data['exclusive_instructions'] = {} #generate_instructions(query_data['exclusive_metaquery'])

    data = scroll_data(es_m.combined_query['main'], request, query_data)
    data['sEcho'] = request.GET['sEcho']

    return HttpResponse(json.dumps(data,ensure_ascii=False))
Example #16
File: views.py Project: cbentes/texta
def facts_agg(es_params, request):
    logger = LogManager(__name__, 'FACTS AGGREGATION')

    distinct_values = []
    query_results = []
    lexicon = []
    aggregation_data = es_params['aggregate_over']
    aggregation_data = json.loads(aggregation_data)
    original_aggregation_field = aggregation_data['path']
    aggregation_field = 'texta_link.facts'

    try:
        aggregation_size = 50
        aggregations = {"strings": {es_params['sort_by']: {"field": aggregation_field, 'size': 0}},
                        "distinct_values": {"cardinality": {"field": aggregation_field}}}

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()
        date_range = ds.get_date_range()
        es_m = ES_Manager(dataset, mapping, date_range)

        for item in es_params:
            if 'saved_search' in item:
                s = Search.objects.get(pk=es_params[item])
                name = s.description
                saved_query = json.loads(s.query)
                es_m.load_combined_query(saved_query)
                es_m.set_query_parameter('aggs', aggregations)
                response = es_m.search()

                # Filter response
                bucket_filter = '{0}.'.format(original_aggregation_field.lower())
                final_bucket = []
                for b in response['aggregations']['strings']['buckets']:
                    if bucket_filter in b['key']:
                        fact_name = b['key'].split('.')[-1]
                        b['key'] = fact_name
                        final_bucket.append(b)
                final_bucket = final_bucket[:aggregation_size]
                response['aggregations']['distinct_values']['value'] = len(final_bucket)
                response['aggregations']['strings']['buckets'] = final_bucket

                normalised_counts,labels = normalise_agg(response, es_m, es_params, 'strings')
                lexicon = list(set(lexicon+labels))
                query_results.append({'name':name,'data':normalised_counts,'labels':labels})
                distinct_values.append({'name':name,'data':response['aggregations']['distinct_values']['value']})


        es_m.build(es_params)
        # FIXME
        # this is confusing for the user
        if not es_m.is_combined_query_empty():
            es_m.set_query_parameter('aggs', aggregations)
            response = es_m.search()

            # Filter response
            bucket_filter = '{0}.'.format(original_aggregation_field.lower())
            final_bucket = []
            for b in response['aggregations']['strings']['buckets']:
                if bucket_filter in b['key']:
                    fact_name = b['key'].split('.')[-1]
                    b['key'] = fact_name
                    final_bucket.append(b)
            final_bucket = final_bucket[:aggregation_size]
            response['aggregations']['distinct_values']['value'] = len(final_bucket)
            response['aggregations']['strings']['buckets'] = final_bucket

            normalised_counts,labels = normalise_agg(response, es_m, es_params, 'strings')
            lexicon = list(set(lexicon+labels))
            query_results.append({'name':'Query','data':normalised_counts,'labels':labels})
            distinct_values.append({'name':'Query','data':response['aggregations']['distinct_values']['value']})

        data = [a+zero_list(len(query_results)) for a in map(list, zip(*[lexicon]))]
        data = [['Word']+[query_result['name'] for query_result in query_results]]+data

        for i,word in enumerate(lexicon):
            for j,query_result in enumerate(query_results):
                for k,label in enumerate(query_result['labels']):
                    if word == label:
                        data[i+1][j+1] = query_result['data'][k]

        logger.set_context('user_name', request.user.username)
        logger.info('facts_aggregation_queried')

    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.exception('facts_aggregation_query_failed')

    table_height = len(data)*15
    table_height = table_height if table_height > 500 else 500
    return {'data':[data[0]]+sorted(data[1:], key=lambda x: sum(x[1:]), reverse=True),'height':table_height,'type':'bar','distinct_values':json.dumps(distinct_values)}
Example #17
def get_all_matched_rows(query, request, inclusive_instructions):
    buffer_ = StringIO()
    writer = csv.writer(buffer_)

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    features = sorted([field['path'] for field in es_m.get_mapped_fields()])

    query['size'] = ES_SCROLL_BATCH

    writer.writerow(features)

    request_url = os.path.join(es_url, ds.get_index(), ds.get_mapping(),
                               '_search?scroll=1m')
    response = requests.get(request_url, data=json.dumps(query)).json()

    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    scroll_payload = json.dumps({'scroll': '1m', 'scroll_id': scroll_id})
    while hits:
        for hit in hits:
            # Collect feature values for this hit, keyed by mapped field path
            feature_dict = {}

            row = []
            for feature_name in features:
                feature_path = feature_name.split('.')
                parent_source = hit['_source']
                for path_component in feature_path:
                    if path_component in parent_source:
                        parent_source = parent_source[path_component]
                    else:
                        parent_source = ""
                        break

                content = parent_source
                row.append(content)
                feature_dict[feature_name] = content

            layer_dict = matcher.LayerDict(feature_dict)
            if inclusive_instructions.match(layer_dict):
                writer.writerow([
                    element.encode('utf-8')
                    if isinstance(element, unicode) else element
                    for element in row
                ])

        buffer_.seek(0)
        data = buffer_.read()
        buffer_.seek(0)
        buffer_.truncate()
        yield data

        response = requests.get(os.path.join(es_url, '_search', 'scroll'),
                                data=scroll_payload).json()
        hits = response['hits']['hits']
        scroll_id = response['_scroll_id']
Example #18
def get_table_data(request):
    query_data = {}

    query_data['search_id'] = request.GET['search_id']
    query_data['polarity'] = request.GET['polarity']
    query_data['requested_page'] = int(request.GET['iDisplayStart']) / int(
        request.GET['iDisplayLength']) + 1
    query_data['page_length'] = int(request.GET['iDisplayLength'])

    if request.GET['is_test'] == 'true':
        query_data['inclusive_metaquery'] = json.loads(
            request.GET['inclusive_test_grammar'])

        query_data['inclusive_grammar_id'] = -1
        query_data['exclusive_grammar_id'] = -1

        query_data['features'] = sorted(
            extract_layers(query_data['inclusive_metaquery']))

    else:
        query_data['inclusive_grammar_id'] = request.GET[
            'inclusive_grammar_id']
        query_data['exclusive_grammar_id'] = request.GET[
            'exclusive_grammar_id']

        query_data['inclusive_metaquery'] = generate_metaquery_dict(
            int(query_data['inclusive_grammar_id']),
            request.user,
            component={})
        query_data['exclusive_metaquery'] = generate_metaquery_dict(
            int(query_data['exclusive_grammar_id']),
            request.user,
            component={})

        query_data['features'] = sorted(
            extract_layers(query_data['inclusive_metaquery'])
            | extract_layers(query_data['exclusive_metaquery']))

    GrammarPageMapping.objects.filter(
        search_id=query_data['search_id'],
        inclusive_grammar=query_data['inclusive_grammar_id'],
        exclusive_grammar=query_data['exclusive_grammar_id'],
        polarity=query_data['polarity'],
        author=request.user).delete()

    ds = Datasets().activate_dataset(request.session)

    query_data['dataset'] = ds.get_index()
    query_data['mapping'] = ds.get_mapping()

    component_query = ElasticGrammarQuery(query_data['inclusive_metaquery'],
                                          None).generate()

    es_m = ds.build_manager(ES_Manager)
    if query_data['search_id'] != '-1':
        saved_query = json.loads(
            Search.objects.get(pk=query_data['search_id']).query)
        es_m.load_combined_query(saved_query)

        if query_data['polarity'] == 'positive':
            es_m.merge_combined_query_with_query_dict(component_query)
    else:
        #es_m.combined_query = {"main": {"query": {"bool": {"should": [{"match_all":{}}], "must": [], "must_not": []}}},
        #"facts": {"include": [], 'total_include': 0,
        #"exclude": [], 'total_exclude': 0}}
        es_m.combined_query = {"main": {"query": {"match_all": {}}}}
        if query_data['polarity'] == 'positive':
            es_m.combined_query = component_query

    # Add paging data to the query
    #es_m.set_query_parameter('from', request.session['grammar_'+polarity+'_cursor'])
    es_m.set_query_parameter('size', request.GET['iDisplayLength'])
    es_m.set_query_parameter('_source', query_data['features'])

    query_data['inclusive_instructions'] = generate_instructions(
        query_data['inclusive_metaquery'])
    query_data['exclusive_instructions'] = {
    }  #generate_instructions(query_data['exclusive_metaquery'])

    data = scroll_data(es_m.combined_query['main'], request, query_data)
    data['sEcho'] = request.GET['sEcho']

    return HttpResponse(json.dumps(data, ensure_ascii=False))
Example #19
File: views.py Project: cbentes/texta
def find_mappings(request):
    try:
        slop = int(request.POST['slop'])
        max_len = int(request.POST['max_len'])
        min_len = int(request.POST['min_len'])
        min_freq = int(request.POST['min_freq'])
        match_field = request.POST['match_field']
        description = request.POST['description']

        batch_size = 50

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()

        lexicon = []
        word_index = {}
        num_lexicons = 0
        for i, lexicon_id in enumerate(request.POST.getlist('lexicons[]')):
            num_lexicons += 1
            for word in Word.objects.filter(lexicon=lexicon_id):
                word = word.wrd
                lexicon.append(word)
                if word not in word_index:
                    word_index[word] = []
                word_index[word].append(i)
        lexicon = list(set(lexicon))
        if min_len > num_lexicons:
            min_len = num_lexicons
        mwe_counter = 0
        group_counter = 0
        phrases = []
        final = {}
        data = []
        new_run = Run(minimum_frequency=min_freq,
                      maximum_length=max_len,
                      minimum_length=min_len,
                      run_status='running',
                      run_started=datetime.now(),
                      run_completed=None,
                      user=request.user,
                      description=description)
        new_run.save()
        logging.getLogger(INFO_LOGGER).info(
            json.dumps({
                'process': 'MINE MWEs',
                'event': 'mwe_mining_started',
                'args': {
                    'user_name': request.user.username,
                    'run_id': new_run.id,
                    'slop': slop,
                    'min_len': min_len,
                    'max_len': max_len,
                    'min_freq': min_freq,
                    'match_field': match_field,
                    'desc': description
                }
            }))
        for i in range(min_len, max_len + 1):
            print('Permutation len:', i)
            for permutation in itertools.permutations(lexicon, i):
                word_indices = list(
                    flatten([word_index[word] for word in permutation]))
                if len(word_indices) == len(set(word_indices)):
                    permutation = ' '.join(permutation)
                    if slop > 0:
                        query = {
                            "query": {
                                "match_phrase": {
                                    match_field: {
                                        "query": permutation,
                                        "slop": slop
                                    }
                                }
                            }
                        }
                    else:
                        query = {
                            "query": {
                                "match_phrase": {
                                    match_field: {
                                        "query": permutation
                                    }
                                }
                            }
                        }
                    data.append(
                        json.dumps({
                            "index": dataset,
                            "mapping": mapping
                        }) + '\n' + json.dumps(query))
                    phrases.append(permutation)
                    if len(data) == batch_size:
                        for j, response in enumerate(
                                ES_Manager.plain_multisearch(
                                    es_url, dataset, mapping, data)):
                            try:
                                if response['hits']['total'] >= min_freq:
                                    sorted_phrase = ' '.join(
                                        sorted(phrases[j].split(' ')))
                                    sorted_conceptualised_phrase = conceptualise_phrase(
                                        sorted_phrase, request.user)
                                    if sorted_conceptualised_phrase not in final:
                                        final[sorted_conceptualised_phrase] = {
                                            'total_freq': 0,
                                            'mwes': [],
                                            'display_name': {
                                                'freq': 0,
                                                'label': False
                                            },
                                            'id': group_counter
                                        }
                                        group_counter += 1
                                    final[sorted_conceptualised_phrase][
                                        'total_freq'] += response['hits'][
                                            'total']
                                    final[sorted_conceptualised_phrase][
                                        'mwes'].append({
                                            'mwe':
                                            phrases[j],
                                            'freq':
                                            response['hits']['total'],
                                            'accepted':
                                            False,
                                            'id':
                                            mwe_counter
                                        })
                                    mwe_counter += 1
                                    final[sorted_conceptualised_phrase][
                                        'mwes'].sort(reverse=True,
                                                     key=lambda k: k['freq'])
                                    if response['hits']['total'] > final[
                                            sorted_conceptualised_phrase][
                                                'display_name']['freq']:
                                        final[sorted_conceptualised_phrase][
                                            'display_name']['freq'] = response[
                                                'hits']['total']
                                        final[sorted_conceptualised_phrase][
                                            'display_name']['label'] = phrases[
                                                j]
                            except KeyError as e:
                                raise e
                        data = []
                        phrases = []
            logging.getLogger(INFO_LOGGER).info(
                json.dumps({
                    'process': 'MINE MWEs',
                    'event': 'mwe_mining_progress',
                    'args': {
                        'user_name': request.user.username,
                        'run_id': new_run.id
                    },
                    'data': {
                        'permutations_processed': i + 1 - min_len,
                        'total_permutations': max_len - min_len + 1
                    }
                }))

        m_response = ES_Manager.plain_multisearch(es_url, dataset, mapping,
                                                  data)

        for j, response in enumerate(m_response):
            try:
                if response['hits']['total'] >= min_freq:
                    sorted_phrase = ' '.join(sorted(phrases[j].split(' ')))
                    sorted_conceptualised_phrase = conceptualise_phrase(
                        sorted_phrase, request.user)
                    if sorted_conceptualised_phrase not in final:
                        final[sorted_conceptualised_phrase] = {
                            'total_freq': 0,
                            'mwes': [],
                            'display_name': {
                                'freq': 0,
                                'label': False
                            },
                            'id': group_counter
                        }
                        group_counter += 1
                    final[sorted_conceptualised_phrase][
                        'total_freq'] += response['hits']['total']
                    final[sorted_conceptualised_phrase]['mwes'].append({
                        'mwe':
                        phrases[j],
                        'freq':
                        response['hits']['total'],
                        'accepted':
                        False,
                        'id':
                        mwe_counter
                    })
                    mwe_counter += 1
                    final[sorted_conceptualised_phrase]['mwes'].sort(
                        reverse=True, key=lambda k: k['freq'])
                    if response['hits']['total'] > final[
                            sorted_conceptualised_phrase]['display_name'][
                                'freq']:
                        final[sorted_conceptualised_phrase]['display_name'][
                            'freq'] = response['hits']['total']
                        final[sorted_conceptualised_phrase]['display_name'][
                            'label'] = phrases[j]
            except KeyError as e:
                raise e
        for key in final:
            final[key]['concept_name'] = {'freq': -1, 'label': ''}
        r = Run.objects.get(pk=new_run.pk)
        r.run_completed = datetime.now()
        r.run_status = 'completed'
        r.results = json.dumps(final)
        r.save()
        logging.getLogger(INFO_LOGGER).info(
            json.dumps({
                'process': 'MINE MWEs',
                'event': 'mwe_mining_completed',
                'args': {
                    'user_name': request.user.username,
                    'run_id': new_run.id
                }
            }))
    except Exception as e:
        print(e)
        logging.getLogger(ERROR_LOGGER).error(json.dumps({
            'process': 'MINE MWEs',
            'event': 'mwe_mining_failed',
            'args': {
                'user_name': request.user.username,
                'run_id': new_run.id
            }
        }),
                                              exc_info=True)
Example #20
File: views.py Project: cbentes/texta
def find_mappings(request):
    try:
        slop     = int(request.POST['slop'])
        max_len  = int(request.POST['max_len'])
        min_len  = int(request.POST['min_len'])
        min_freq = int(request.POST['min_freq'])
        match_field = request.POST['match_field']
        description = request.POST['description']

        batch_size = 50

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()

        lexicon = []
        word_index = {}
        num_lexicons = 0
        for i,lexicon_id in enumerate(request.POST.getlist('lexicons[]')):
            num_lexicons +=1
            for word in Word.objects.filter(lexicon=lexicon_id):
                word = word.wrd
                lexicon.append(word)
                if word not in word_index:
                    word_index[word] = []
                word_index[word].append(i)
        lexicon = list(set(lexicon))
        if min_len > num_lexicons:
            min_len = num_lexicons
        mwe_counter = 0
        group_counter = 0
        phrases = []
        final   = {}
        data = []
        new_run = Run(minimum_frequency=min_freq,maximum_length=max_len,minimum_length=min_len,run_status='running',run_started=datetime.now(),run_completed=None,user=request.user,description=description)
        new_run.save()
        logging.getLogger(INFO_LOGGER).info(json.dumps({'process':'MINE MWEs','event':'mwe_mining_started','args':{'user_name':request.user.username,'run_id':new_run.id,'slop':slop,'min_len':min_len,'max_len':max_len,'min_freq':min_freq,'match_field':match_field,'desc':description}}))
        for i in range(min_len,max_len+1):
            print('Permutation len:',i)
            for permutation in itertools.permutations(lexicon,i):
                word_indices = list(flatten([word_index[word] for word in permutation])) 
                if len(word_indices) == len(set(word_indices)):
                    permutation = ' '.join(permutation)
                    if slop > 0:
                        query = {"query": {"match_phrase": {match_field: {"query": permutation,"slop": slop}}}}
                    else:
                        query = {"query": {"match_phrase": {match_field: {"query": permutation}}}}
                    data.append(json.dumps({"index":dataset,"mapping":mapping})+'\n'+json.dumps(query))
                    phrases.append(permutation)
                    if len(data) == batch_size:
                        for j,response in enumerate(ES_Manager.plain_multisearch(es_url, dataset, mapping, data)):
                            try:
                                if response['hits']['total'] >= min_freq:
                                    sorted_phrase = ' '.join(sorted(phrases[j].split(' ')))
                                    sorted_conceptualised_phrase = conceptualise_phrase(sorted_phrase,request.user)
                                    if sorted_conceptualised_phrase not in final:
                                        final[sorted_conceptualised_phrase] = {'total_freq':0,'mwes':[],'display_name':{'freq':0,'label':False},'id':group_counter}
                                        group_counter+=1
                                    final[sorted_conceptualised_phrase]['total_freq']+=response['hits']['total']
                                    final[sorted_conceptualised_phrase]['mwes'].append({'mwe':phrases[j],'freq':response['hits']['total'],'accepted':False,'id':mwe_counter})
                                    mwe_counter+=1
                                    final[sorted_conceptualised_phrase]['mwes'].sort(reverse=True,key=lambda k: k['freq'])
                                    if response['hits']['total'] > final[sorted_conceptualised_phrase]['display_name']['freq']:
                                        final[sorted_conceptualised_phrase]['display_name']['freq'] = response['hits']['total']
                                        final[sorted_conceptualised_phrase]['display_name']['label'] = phrases[j]
                            except KeyError as e:
                                raise e
                        data = []
                        phrases = []
            logging.getLogger(INFO_LOGGER).info(json.dumps({'process':'MINE MWEs','event':'mwe_mining_progress','args':{'user_name':request.user.username,'run_id':new_run.id},'data':{'permutations_processed':i+1-min_len,'total_permutations':max_len-min_len+1}}))
        
        m_response = ES_Manager.plain_multisearch(es_url, dataset, mapping, data)
        
        for j,response in enumerate(m_response):
            try:
                if response['hits']['total'] >= min_freq:
                    sorted_phrase = ' '.join(sorted(phrases[j].split(' ')))
                    sorted_conceptualised_phrase = conceptualise_phrase(sorted_phrase,request.user)
                    if sorted_conceptualised_phrase not in final:
                        final[sorted_conceptualised_phrase] = {'total_freq':0,'mwes':[],'display_name':{'freq':0,'label':False},'id':group_counter}
                        group_counter+=1
                    final[sorted_conceptualised_phrase]['total_freq']+=response['hits']['total']
                    final[sorted_conceptualised_phrase]['mwes'].append({'mwe':phrases[j],'freq':response['hits']['total'],'accepted':False,'id':mwe_counter})
                    mwe_counter+=1
                    final[sorted_conceptualised_phrase]['mwes'].sort(reverse=True,key=lambda k: k['freq'])
                    if response['hits']['total'] > final[sorted_conceptualised_phrase]['display_name']['freq']:
                        final[sorted_conceptualised_phrase]['display_name']['freq'] = response['hits']['total']
                        final[sorted_conceptualised_phrase]['display_name']['label'] = phrases[j]
            except KeyError as e:       
                raise e
        for key in final:
            final[key]['concept_name'] = {'freq':-1,'label':''}
        r = Run.objects.get(pk=new_run.pk)
        r.run_completed = datetime.now()
        r.run_status = 'completed'
        r.results =json.dumps(final)
        r.save()
        logging.getLogger(INFO_LOGGER).info(json.dumps({'process':'MINE MWEs','event':'mwe_mining_completed','args':{'user_name':request.user.username,'run_id':new_run.id}}))
    except Exception as e:
        print(e)
        logging.getLogger(ERROR_LOGGER).error(json.dumps({'process':'MINE MWEs','event':'mwe_mining_failed','args':{'user_name':request.user.username,'run_id':new_run.id}}),exc_info=True)
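Both versions of find_mappings batch their phrase queries and delegate to ES_Manager.plain_multisearch, which is not shown in this listing. Presumably it wraps Elasticsearch's _msearch API, where each query is preceded by a one-line header and the body is newline-delimited; the sketch below illustrates that request format under that assumption, with the helper name plain_msearch and the endpoint details chosen purely for illustration.

import json
import requests

def plain_msearch(es_url, index, queries):
    # Illustrative _msearch call: one JSON header line plus one JSON query line
    # per search, newline-delimited, with a trailing newline.
    lines = []
    for query in queries:
        lines.append(json.dumps({'index': index}))
        lines.append(json.dumps(query))
    body = '\n'.join(lines) + '\n'
    response = requests.post('{0}/_msearch'.format(es_url), data=body,
                             headers={'Content-Type': 'application/json'})
    return response.json()['responses']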
Example #21
class FactManager:
    """ Manage Searcher facts, like deleting/storing, adding facts.
    """
    def __init__(self, request):
        self.es_params = request.POST
        self.ds = Datasets().activate_dataset(request.session)
        self.index = self.ds.get_index()
        self.mapping = self.ds.get_mapping()
        self.es_m = ES_Manager(self.index, self.mapping)
        self.field = 'texta_facts'

    def remove_facts_from_document(self, rm_facts_dict, bs=7500):
        '''remove a certain fact from all documents given a [str]key and [str]val'''
        logger = LogManager(__name__, 'FACT MANAGER REMOVE FACTS')

        try:
            # Clears readonly block just in case the index has been set to read only
            self.es_m.clear_readonly_block()

            query = self._fact_deletion_query(rm_facts_dict)
            self.es_m.load_combined_query(query)
            response = self.es_m.scroll(size=bs, field_scroll=self.field)
            scroll_id = response['_scroll_id']
            total_docs = response['hits']['total']
            docs_left = total_docs  # DEBUG
            print('Starting.. Total docs - ', total_docs)  # DEBUG
            batch = 0
            while total_docs > 0:
                print('Docs left:', docs_left)  # DEBUG
                data = ''
                for document in response['hits']['hits']:
                    new_field = []  # The new facts field
                    for fact in document['_source'][self.field]:
                        # If the fact name is in rm_facts_dict keys
                        if fact["fact"] in rm_facts_dict:
                            # If the fact value is not in the delete key values
                            if fact['str_val'] not in rm_facts_dict.getlist(
                                    fact["fact"]):
                                new_field.append(fact)
                        else:
                            new_field.append(fact)
                    # Update dataset
                    data += json.dumps({
                        "update": {
                            "_id": document['_id'],
                            "_type": document['_type'],
                            "_index": document['_index']
                        }
                    }) + '\n'
                    document = {'doc': {self.field: new_field}}
                    data += json.dumps(document) + '\n'
                response = self.es_m.scroll(scroll_id=scroll_id,
                                            size=bs,
                                            field_scroll=self.field)
                total_docs = len(response['hits']['hits'])
                docs_left -= bs  # DEBUG
                scroll_id = response['_scroll_id']
                self.es_m.plain_post_bulk(self.es_m.es_url, data)
            print('DONE')  # DEBUG

            logger.set_context('docs_left', total_docs)
            logger.set_context('batch', batch)
            logger.info('remove_facts_from_document')
        except:
            print(traceback.format_exc())
            logger.set_context('es_params', self.es_params)
            logger.exception('remove_facts_from_document_failed')

    def tag_documents_with_fact(self, es_params, tag_name, tag_value,
                                tag_field):
        '''Used to tag all documents in the current search with a certain fact'''

        self.es_m.build(es_params)
        self.es_m.load_combined_query(self.es_m.combined_query)

        response = self.es_m.scroll()

        data = ''
        for document in response['hits']['hits']:
            if 'mlp' in tag_field:
                split_field = tag_field.split('.')
                span = [
                    0,
                    len(document['_source'][split_field[0]][split_field[1]])
                ]
            else:
                span = [0, len(document['_source'][tag_field].strip())]
            document['_source'][self.field].append({
                "str_val": tag_value,
                "spans": str([span]),
                "fact": tag_name,
                "doc_path": tag_field
            })

            data += json.dumps({
                "update": {
                    "_id": document['_id'],
                    "_type": document['_type'],
                    "_index": document['_index']
                }
            }) + '\n'
            document = {'doc': {self.field: document['_source'][self.field]}}
            data += json.dumps(document) + '\n'
        self.es_m.plain_post_bulk(self.es_m.es_url, data)
        response = requests.post(
            '{0}/{1}/_update_by_query?refresh&conflicts=proceed'.format(
                self.es_m.es_url, self.index),
            headers=self.es_m.HEADERS)

    def count_cooccurrences(self, fact_pairs):
        """Finds the counts of cooccuring facts

        Arguments:
            fact_pairs {list of tuples of tuples} -- Example:[(('ORG', 'Riigikohus'),('PER', 'Jaan')), (('ORG', 'Riigikohus'),('PER', 'Peeter'))]

        Returns:
            [int list] -- Occurrences of the given facts
        """
        queries = []
        for fact_pair in fact_pairs:
            fact_constraints = []

            for fact in fact_pair:
                constraint = {
                    "nested": {
                        "path": "texta_facts",
                        "query": {
                            "bool": {
                                "must": [{
                                    "term": {
                                        "texta_facts.fact": fact[0]
                                    }
                                }, {
                                    "term": {
                                        "texta_facts.str_val": fact[1]
                                    }
                                }]
                            }
                        }
                    }
                }
                fact_constraints.append(constraint)

            query = {"query": {"bool": {"must": fact_constraints}}, "size": 0}
            queries.append(json.dumps(query))

        header = json.dumps({"index": self.index})
        data = "\n".join(["{0}\n{1}".format(header, q)
                          for q in queries]) + "\n"

        responses = requests.post("{0}/{1}/_msearch".format(
            self.es_m.es_url, self.index),
                                  data=data,
                                  headers={"Content-Type": "application/json"})
        counts = [
            response["hits"]["total"]
            for response in responses.json()['responses']
        ]

        return counts

    def facts_via_aggregation(self, size=15):
        """Finds all facts from current search.
        Parameters:
            size - [int=15] -- Amount of fact values per fact name to search in query
        Returns:
            facts - [dict] -- Details for each fact, ex: {'PER - kostja': {'id': 0, 'name': 'PER', 'value': 'kostja', 'doc_count': 44}}
            fact_combinations - [list of tuples] -- All possible combinations of all facts: [(('FIRST_FACTNAME', 'FIRST_FACTVAL'), ('SECOND_FACTNAME', 'SECOND_FACTVAL'))]
            unique_fact_names - [list of string] -- All unique fact names
        """

        aggs = {
            "facts": {
                "nested": {
                    "path": "texta_facts"
                },
                "aggs": {
                    "fact_names": {
                        "terms": {
                            "field": "texta_facts.fact"
                        },
                        "aggs": {
                            "fact_values": {
                                "terms": {
                                    "field": "texta_facts.str_val",
                                    "size": size
                                }
                            }
                        }
                    }
                }
            }
        }
        self.es_m.build(self.es_params)
        self.es_m.set_query_parameter('aggs', aggs)

        response = self.es_m.search()

        response_aggs = response['aggregations']['facts']['fact_names'][
            'buckets']

        facts = {}
        fact_combinations = []
        fact_count = 0
        unique_fact_names = []
        for bucket in response_aggs:
            unique_fact_names.append(bucket['key'])
            for fact in bucket['fact_values']['buckets']:
                facts[bucket['key'] + " - " + fact['key']] = {
                    'id': fact_count,
                    'name': bucket['key'],
                    'value': fact['key'],
                    'doc_count': fact['doc_count']
                }
                fact_combinations.append((bucket['key'], fact['key']))
                fact_count += 1

        fact_combinations = [
            x for x in itertools.combinations(fact_combinations, 2)
        ]
        return (facts, fact_combinations, unique_fact_names)

    def fact_graph(self, search_size):
        facts, fact_combinations, unique_fact_names = self.facts_via_aggregation(
            size=search_size)
        # Get cooccurrences and remove values with 0
        fact_combinations = {
            k: v
            for k, v in dict(
                zip(fact_combinations,
                    self.count_cooccurrences(fact_combinations))).items()
            if v != 0
        }
        shapes = [
            "circle", "cross", "diamond", "square", "triangle-down",
            "triangle-up"
        ]
        types = dict(zip(unique_fact_names, itertools.cycle(shapes)))

        nodes = []
        for i, fact in enumerate(facts):
            nodes.append({
                "source": facts[fact]['id'],
                "size": facts[fact]['doc_count'],
                "score": facts[fact]['doc_count'],
                "name": facts[fact]['name'],
                "id": facts[fact]['value'],
                "type": types[facts[fact]['name']]
            })
            # Track max/min count
            count = facts[fact]['doc_count']
            if i == 0:
                max_node_size = count
                min_node_size = count
            max_node_size = max(max_node_size, count)
            min_node_size = min(min_node_size, count)

        links = []
        max_link_size = 0
        for fact in fact_combinations.keys():
            max_link_size = max(max_link_size, fact_combinations[fact])
            links.append({
                "source": facts[fact[0][0] + " - " + fact[0][1]]['id'],
                "target": facts[fact[1][0] + " - " + fact[1][1]]['id'],
                "count": fact_combinations[fact]
            })

        graph_data = json.dumps({"nodes": nodes, "links": links})
        return (graph_data, unique_fact_names, max_node_size, max_link_size,
                min_node_size)

    def _fact_deletion_query(self, rm_facts_dict):
        '''Creates the query for fact deletion based on dict of facts {name: val}'''
        fact_queries = []
        for key in rm_facts_dict:
            for val in rm_facts_dict.getlist(key):
                fact_queries.append({
                    "bool": {
                        "must": [{
                            "match": {
                                self.field + ".fact": key
                            }
                        }, {
                            "match": {
                                self.field + ".str_val": val
                            }
                        }]
                    }
                })

        query = {
            "main": {
                "query": {
                    "nested": {
                        "path": self.field,
                        "query": {
                            "bool": {
                                "should": fact_queries
                            }
                        }
                    }
                },
                "_source": [self.field]
            }
        }

        return query
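FactManager is constructed straight from a Django request and resolves the active dataset itself, so a caller only has to instantiate it and pick a method. A hedged sketch of wiring fact_graph into a view follows; the view name, the search_size value, and returning the graph JSON directly are illustrative choices, not taken from the project.

from django.http import HttpResponse

def fact_graph_view(request):
    # Illustrative view: build the manager from the request and return the graph JSON.
    fact_m = FactManager(request)
    graph_data, fact_names, max_node, max_link, min_node = fact_m.fact_graph(search_size=15)
    return HttpResponse(graph_data, content_type='application/json')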