Example #1
    def start_requests(self):
        base_url = 'http://www.elpais.com.uy/ediciones-anteriores/{}'

        response = es.search(
            index=settings.ES_INDEX,
            doc_type=settings.ES_DOCTYPE,
            body=LAST_DAYS_QUERY,
            search_type='count',
        )

        start_urls = []

        buckets = response['aggregations']['counts_per_day']['buckets']
        for bucket in buckets:
            if bucket['doc_count'] == 0:
                logger.info('missing_date=%s', bucket['key_as_string'])
                missing_date = bucket['key_as_string']
                start_urls.append(Request(base_url.format(missing_date)))

        if not buckets:
            # If no buckets were returned from Elasticsearch, add yesterday's
            # date to bootstrap the spider.
            yesterday = datetime.now() - timedelta(days=1)
            date_str = '{}/{}/{}'.format(
                yesterday.year, yesterday.month, yesterday.day)
            start_urls.append(Request(base_url.format(date_str)))

        return start_urls
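
`LAST_DAYS_QUERY` is defined elsewhere in the project. Judging by the `counts_per_day` buckets the spider reads back, it is presumably a `date_histogram` aggregation with `min_doc_count: 0`, so that days with no scraped documents still show up as empty buckets. A hypothetical reconstruction, borrowing the `entry.date_scraped` field and the ten-day window from Example #8:

# Hypothetical sketch of LAST_DAYS_QUERY; the real definition is not shown
# in these examples. The aggregation name must match the 'counts_per_day'
# key read from the response above.
LAST_DAYS_QUERY = {
    'query': {
        'range': {'entry.date_scraped': {'gte': 'now-10d/d'}}
    },
    'aggs': {
        'counts_per_day': {
            'date_histogram': {
                'field': 'entry.date_scraped',
                'interval': '1d',
                'format': 'yyyy/M/d',  # assumed; must match the URL date format
                'min_doc_count': 0,    # keep empty days so gaps are visible
            }
        }
    }
}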
Example #2
def get_corpus_size_by_source():
    query = {
        'aggs': {
            'per_source': {
                'terms': {
                    'field': 'data_source',
                    'size': 0,
                },
                'aggs': {
                    'words': {
                        'sum': {
                            'field': 'word_count'
                        }
                    }
                }
            }
        }
    }

    response = es.search(index=settings.ES_INDEX,
                         doc_type=settings.ES_DOCTYPE,
                         search_type='count',
                         body=query)

    result = []
    for bucket in response['aggregations']['per_source']['buckets']:
        result.append({
            'source': bucket['key'],
            'size': bucket['words']['value'],
        })

    return result
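
Both `search_type='count'` and the `'size': 0` setting on the terms aggregation (meaning "return all buckets") are Elasticsearch 1.x idioms that were removed in later releases. On a modern cluster the same request would look roughly like this sketch, with a top-level `'size': 0` replacing the count search type and an explicit bucket limit on the terms aggregation:

# Rough ES 5+ equivalent; the bucket limit of 1000 is an assumed upper
# bound on the number of distinct data sources.
query = {
    'size': 0,  # no hits, aggregations only; replaces search_type='count'
    'aggs': {
        'per_source': {
            'terms': {'field': 'data_source', 'size': 1000},
            'aggs': {'words': {'sum': {'field': 'word_count'}}},
        }
    }
}
response = es.search(index=settings.ES_INDEX, body=query)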
Example #3
def get_corpus_size():
    query = {'aggs': {'words': {'sum': {'field': 'word_count'}}}}
    response = es.search(index=settings.ES_INDEX,
                         doc_type=settings.ES_DOCTYPE,
                         search_type='count',
                         body=query)
    return response['aggregations']['words']['value']
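
A `sum` aggregation reports its value as a float, so callers that want an integer word count need to cast. Minimal usage:

total_words = get_corpus_size()
print('corpus size: {:,} words'.format(int(total_words)))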
Example #4
def search():
    offset = int(request.args.get('offset', 0))
    user_query = request.get_json(force=True)['query']

    highlight = {
        'fields': {
            'content': {
                'fragment_size': 100,
                'number_of_fragments': 1
            }
        }
    }

    query = {
        'query': user_query,
        'aggs': {
            'words': {
                'sum': {
                    'field': 'word_count'
                }
            }
        },
        'highlight': highlight,
        'from': offset,
        'size': 25,
    }

    try:
        response = es.search(index=settings.ES_INDEX,
                             doc_type=settings.ES_DOCTYPE,
                             body=query)
    except ElasticsearchException as e:
        return jsonify(message=e.error, error='Bad Request'), 400

    word_count = response['aggregations']['words']['value']
    hits = [clean_doc(hit) for hit in response['hits']['hits']]
    download_link = url_for('.download_search', query=json.dumps(user_query))

    data = {
        'word_count': word_count,
        'download_link': download_link,
        'hits': hits
    }

    meta = {
        'count': response['hits']['total'],
        'offset': offset,
    }

    return jsonify(meta=meta, data=data)
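
This is a Flask view: the client POSTs a raw Elasticsearch query object under the `query` key of the JSON body, with pagination driven by the `offset` query-string parameter. A hypothetical client call (the `/search` route and host are assumptions, since the URL rule is not shown):

import requests

payload = {'query': {'match': {'content': 'economía'}}}
resp = requests.post('http://localhost:5000/search?offset=25', json=payload)
body = resp.json()
print(body['meta']['count'], 'hits,', body['data']['word_count'], 'words')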
Example #5
def sentence_generator(query, preprocessing_params, report=None):
    """
    Generator returning all the documents that match `query`.

    Receives additional parameters on `preprocessing_params` indicating how to
    process the documents when turning them into sentences.

    Optionally, a `report` callback may be specified; it will be called with
    the completion fraction roughly 1000 times while the corpus is iterated.
    """
    preprocessor = build_token_preprocessor(preprocessing_params)
    word_tokenizer = build_word_tokenizer(preprocessing_params)
    sentence_tokenizer = build_sentence_tokenizer(preprocessing_params)

    if report:
        # Get the approximate number of results.
        result = es.search(index=settings.ES_INDEX,
                           doc_type=settings.ES_DOCTYPE,
                           search_type='count',
                           body={'query': query})
        count = result['hits']['total']
        # If there aren't even 1000 results, report for every document.
        step = int(count / 1000) if count > 1000 else 1

    documents = scan(es,
                     index=settings.ES_INDEX,
                     doc_type=settings.ES_DOCTYPE,
                     scroll='30m',
                     fields='content',
                     query={'query': query})

    processed = 0
    for document in documents:
        processed += 1

        content = preprocessor(document['fields']['content'][0])
        for sentence in sentence_tokenizer(content):
            yield word_tokenizer(sentence)

        # Report how many documents have been processed, if necessary.
        if report and processed % step == 0:
            report(processed / count)
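
A minimal consumption sketch; the empty `preprocessing_params` dict is an assumption (what `build_token_preprocessor` and friends actually expect is not shown), and the `report` callback just prints progress:

def print_progress(fraction):
    print('{:.1%} of documents processed'.format(fraction))

query = {'match_all': {}}
for tokens in sentence_generator(query, {}, report=print_progress):
    pass  # each `tokens` is one sentence as a list of words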
Example #6
def get_corpus_size(context):
    """
    Calculate the word count for a document.
    """
    corpus_query = context.current_parameters['query']
    query = {
        'query': corpus_query,
        'aggs': {
            'words': {
                'sum': {
                    'field': 'word_count'
                }
            }
        }
    }
    response = es.search(index=settings.ES_INDEX,
                         doc_type=settings.ES_DOCTYPE,
                         search_type='count',
                         body=query)
    return response['aggregations']['words']['value']
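
The `context.current_parameters` access indicates this function is used as a context-sensitive SQLAlchemy column default: SQLAlchemy passes the execution context to callable defaults that accept one positional argument. A sketch of how it might be wired up (the model and column names are assumptions):

from sqlalchemy import Column, Integer, Text

class Corpus(Base):  # hypothetical model
    __tablename__ = 'corpora'
    id = Column(Integer, primary_key=True)
    query = Column(Text)  # read via context.current_parameters['query']
    word_count = Column(Integer, default=get_corpus_size)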
Example #7
    def start_requests(self):
        base_url = "http://elpais.com/tag/fecha/{}"

        response = es.search(
            index=settings.ES_INDEX,
            doc_type=settings.ES_DOCTYPE,
            body=LAST_DAYS_QUERY,
            search_type='count',
        )

        start_urls = []
        buckets = response['aggregations']['counts_per_day']['buckets']
        for bucket in buckets:
            if bucket['doc_count'] == 0:
                logger.info("missing_date=%s", bucket['key_as_string'])
                missing_date = bucket['key_as_string'].replace('-', '')

                request = Request(base_url.format(missing_date),
                                  callback=self.parse_tag_page)
                start_urls.append(request)

        # If no documents at all were returned from Elasticsearch, queue a
        # request for every date back to 1976-05-04 to bootstrap the spider.
        if not sum(b['doc_count'] for b in buckets):
            start_urls = []
            logger.info("missing all dates, starting from beginning")
            first_date = date(1976, 5, 4)
            last_date = date.today()

            current_date = first_date
            while current_date < last_date:
                date_str = current_date.isoformat().replace('-', '')
                request = Request(base_url.format(date_str),
                                  callback=self.parse_tag_page)
                start_urls.append(request)
                current_date = current_date + timedelta(days=1)

        return start_urls
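
Both `start_requests` implementations (here and in Example #1) belong inside Scrapy spiders; Scrapy calls `start_requests` to obtain the initial `Request` objects to schedule. A skeletal spider showing where the method and the `parse_tag_page` callback live (class and spider names are assumptions):

import scrapy

class ElPaisSpider(scrapy.Spider):  # hypothetical class name
    name = 'elpais'

    # The start_requests() shown above is defined here.

    def parse_tag_page(self, response):
        # Placeholder: the real extraction logic for a tag page is not
        # shown in these examples.
        pass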
Example #8
def get_corpus_size_increase():
    query = {
        'query': {
            'range': {
                'entry.date_scraped': {
                    'gte': 'now-10d/d'
                }
            }
        },
        'aggs': {
            'over_time': {
                'terms': {
                    'field': 'data_source',
                    'min_doc_count': 0,
                    'size': 0,
                },
                'aggs': {
                    'over_time': {
                        'date_histogram': {
                            'field': 'entry.date_scraped',
                            'format': 'yyyy-MM-dd',
                            'interval': '1d',
                            'min_doc_count': 0,
                        },
                        'aggs': {
                            'word_counts': {
                                'sum': {
                                    'field': 'word_count'
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    response = es.search(index=settings.ES_INDEX,
                         doc_type=settings.ES_DOCTYPE,
                         search_type='count',
                         body=query)

    days = set()
    result = []
    for outer_bucket in response['aggregations']['over_time']['buckets']:
        on_day = []
        for inner_bucket in outer_bucket['over_time']['buckets']:
            day = inner_bucket['key_as_string']
            days.add(day)
            on_day.append({
                'day': day,
                'value': inner_bucket['word_counts']['value'],
            })
        result.append({
            'source': outer_bucket['key'],
            'values': on_day,
        })

    # Fill up the missing days with zeros.
    for source in result:
        missing_days = days - set(map(lambda d: d['day'], source['values']))
        for day in missing_days:
            source['values'].append({
                'day': day,
                'value': 0,
            })
        source['values'].sort(key=lambda d: d['day'])
        source['values'] = [v['value'] for v in source['values']]

    return result
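
After the zero-fill, each source's `values` list holds one word-count value per day in the common `days` set, aligned chronologically, which makes the result straightforward to chart. Minimal usage:

increase = get_corpus_size_increase()
for source in increase:
    # One word-count value per day over the 10-day window, sorted by date.
    print(source['source'], source['values'])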