def start_requests(self):
    base_url = 'http://www.elpais.com.uy/ediciones-anteriores/{}'
    response = es.search(
        index=settings.ES_INDEX,
        doc_type=settings.ES_DOCTYPE,
        body=LAST_DAYS_QUERY,
        search_type='count',
    )

    start_urls = []
    buckets = response['aggregations']['counts_per_day']['buckets']
    for bucket in buckets:
        if bucket['doc_count'] == 0:
            logger.info('missing_date=%s', bucket['key_as_string'])
            missing_date = bucket['key_as_string']
            start_urls.append(Request(base_url.format(missing_date)))

    if not buckets:
        # If no buckets were returned from Elasticsearch, add yesterday's date
        # to bootstrap the spider.
        now = datetime.now() - timedelta(days=1)
        strnow = '{}/{}/{}'.format(now.year, now.month, now.day)
        start_urls.append(Request(base_url.format(strnow)))

    return start_urls
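# LAST_DAYS_QUERY is defined elsewhere in the project and is not shown here.
# A minimal sketch of what could produce the `counts_per_day` buckets the
# spider reads above: a date histogram over the last few days with empty
# buckets kept, so days with doc_count == 0 show up as missing. The field
# name, the 10-day window, and the key format are assumptions; each spider
# presumably uses a date format matching its own URL scheme.
LAST_DAYS_QUERY = {
    'query': {
        'range': {'date': {'gte': 'now-10d/d'}}
    },
    'aggs': {
        'counts_per_day': {
            'date_histogram': {
                'field': 'date',
                'interval': '1d',
                'format': 'yyyy/MM/dd',
                'min_doc_count': 0,
                # extended_bounds forces buckets to exist even for days with
                # no documents at all, which the doc_count == 0 check relies on.
                'extended_bounds': {'min': 'now-10d/d', 'max': 'now/d'},
            }
        }
    }
}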
def get_corpus_size_by_source():
    query = {
        'aggs': {
            'per_source': {
                'terms': {
                    'field': 'data_source',
                    # A size of zero returns every bucket (Elasticsearch 1.x).
                    'size': 0,
                },
                'aggs': {
                    'words': {
                        'sum': {'field': 'word_count'}
                    }
                }
            }
        }
    }

    response = es.search(index=settings.ES_INDEX, doc_type=settings.ES_DOCTYPE,
                         search_type='count', body=query)

    result = []
    for bucket in response['aggregations']['per_source']['buckets']:
        result.append({
            'source': bucket['key'],
            'size': bucket['words']['value'],
        })

    return result
def get_corpus_size():
    query = {'aggs': {'words': {'sum': {'field': 'word_count'}}}}
    response = es.search(index=settings.ES_INDEX, doc_type=settings.ES_DOCTYPE,
                         search_type='count', body=query)
    return response['aggregations']['words']['value']
def search():
    offset = int(request.args.get('offset', 0))
    user_query = request.get_json(force=True)['query']

    highlight = {
        'fields': {
            'content': {
                'fragment_size': 100,
                'number_of_fragments': 1
            }
        }
    }

    query = {
        'query': user_query,
        'aggs': {
            'words': {
                'sum': {'field': 'word_count'}
            }
        },
        'highlight': highlight,
        'from': offset,
        'size': 25,
    }

    try:
        response = es.search(index=settings.ES_INDEX,
                             doc_type=settings.ES_DOCTYPE,
                             body=query)
    except ElasticsearchException as e:
        return jsonify(message=e.error, error='Bad Request'), 400

    word_count = response['aggregations']['words']['value']
    hits = [clean_doc(hit) for hit in response['hits']['hits']]
    download_link = url_for('.download_search', query=json.dumps(user_query))

    data = {
        'word_count': word_count,
        'download_link': download_link,
        'hits': hits
    }
    meta = {
        'count': response['hits']['total'],
        'offset': offset,
    }

    return jsonify(meta=meta, data=data)
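# Example client call (sketch): the view expects a JSON body with an
# Elasticsearch query under the `query` key plus an optional `offset` query
# string parameter. The host and the `/search` route are assumptions; only
# the request/response shape follows from the view above.
import requests

resp = requests.post(
    'http://localhost:5000/search',
    params={'offset': 0},
    json={'query': {'match': {'content': 'montevideo'}}},
)
payload = resp.json()
print(payload['meta']['count'])        # total number of matching documents
print(payload['data']['word_count'])   # summed word_count over all matches
for hit in payload['data']['hits']:    # first 25 hits, cleaned by clean_doc()
    print(hit)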
def sentence_generator(query, preprocessing_params, report=None):
    """
    Generator yielding the tokenized sentences of every document that matches
    `query`.

    Receives additional parameters in `preprocessing_params` indicating how to
    process the documents when turning them into sentences.

    Optionally, a `report` callback may be specified; it will be called with
    the completion ratio roughly 1000 times while the documents are processed
    (e.g. to report progress during training).
    """
    preprocessor = build_token_preprocessor(preprocessing_params)
    word_tokenizer = build_word_tokenizer(preprocessing_params)
    sentence_tokenizer = build_sentence_tokenizer(preprocessing_params)

    if report:
        # Get the approximate number of results.
        result = es.search(index=settings.ES_INDEX,
                           doc_type=settings.ES_DOCTYPE,
                           search_type='count', body={'query': query})
        count = result['hits']['total']
        # If there aren't even 1000 results, report for every document.
        step = int(count / 1000) if count > 1000 else 1

    documents = scan(es, index=settings.ES_INDEX,
                     doc_type=settings.ES_DOCTYPE,
                     scroll='30m', fields='content',
                     query={'query': query})

    processed = 0
    for document in documents:
        processed += 1

        content = preprocessor(document['fields']['content'][0])
        for sentence in sentence_tokenizer(content):
            yield word_tokenizer(sentence)

        # Report how many documents have been processed, if necessary.
        if report and processed % step == 0:
            report(processed / count)
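# Usage sketch: consuming the generator with a progress callback. The
# `preprocessing_params` keys are assumptions; the real keys are whatever the
# build_*_preprocessor/tokenizer helpers expect.
def print_progress(ratio):
    print('{:.1%} of documents processed'.format(ratio))

sentences = sentence_generator(
    query={'match_all': {}},
    preprocessing_params={'lowercase': True},  # illustrative only
    report=print_progress,
)

for tokens in sentences:
    # Each item is a list of tokens for one sentence, ready to be fed to a
    # downstream consumer such as a word-embedding trainer.
    pass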
def get_corpus_size(context):
    """
    Calculate the total word count of the documents matched by the corpus' query.
    """
    corpus_query = context.current_parameters['query']

    query = {
        'query': corpus_query,
        'aggs': {
            'words': {
                'sum': {'field': 'word_count'}
            }
        }
    }

    response = es.search(index=settings.ES_INDEX, doc_type=settings.ES_DOCTYPE,
                         search_type='count', body=query)

    return response['aggregations']['words']['value']
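# Sketch (assumption): a function with this signature is typically wired up as
# a context-sensitive column default in SQLAlchemy, which is what supplies the
# `context` argument and its `current_parameters` dict. The model and column
# names below are illustrative, not taken from the source.
from sqlalchemy import Column, Integer, PickleType
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class Corpus(Base):
    __tablename__ = 'corpus'

    id = Column(Integer, primary_key=True)
    # The Elasticsearch query defining the corpus; the column type here is
    # illustrative (the real project may serialize it differently).
    query = Column(PickleType, nullable=False)
    # On insert, SQLAlchemy calls get_corpus_size(context), with the row's
    # values available through context.current_parameters.
    word_count = Column(Integer, default=get_corpus_size)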
def start_requests(self):
    base_url = "http://elpais.com/tag/fecha/{}"
    response = es.search(
        index=settings.ES_INDEX,
        doc_type=settings.ES_DOCTYPE,
        body=LAST_DAYS_QUERY,
        search_type='count',
    )

    start_urls = []
    buckets = response['aggregations']['counts_per_day']['buckets']
    for bucket in buckets:
        if bucket['doc_count'] == 0:
            logger.info("missing_date=%s", bucket['key_as_string'])
            missing_date = bucket['key_as_string'].replace('-', '')
            request = Request(base_url.format(missing_date),
                              callback=self.parse_tag_page)
            start_urls.append(request)

    # If Elasticsearch returned no documents at all, add requests for every
    # date since 1976-05-04 to bootstrap the spider.
    if not sum([b['doc_count'] for b in buckets]):
        start_urls = []
        logger.info("missing all dates, starting from beginning")

        first_date = date(1976, 5, 4)
        last_date = date.today()

        current_date = first_date
        while current_date < last_date:
            date_str = current_date.isoformat().replace('-', '')
            request = Request(base_url.format(date_str),
                              callback=self.parse_tag_page)
            start_urls.append(request)
            current_date = current_date + timedelta(days=1)

    return start_urls
def get_corpus_size_increase():
    query = {
        'query': {
            'range': {
                'entry.date_scraped': {'gte': 'now-10d/d'}
            }
        },
        'aggs': {
            'over_time': {
                'terms': {
                    'field': 'data_source',
                    'min_doc_count': 0,
                    'size': 0,
                },
                'aggs': {
                    'over_time': {
                        'date_histogram': {
                            'field': 'entry.date_scraped',
                            'format': 'yyyy-MM-dd',
                            'interval': '1d',
                            'min_doc_count': 0,
                        },
                        'aggs': {
                            'word_counts': {
                                'sum': {'field': 'word_count'}
                            }
                        }
                    }
                }
            }
        }
    }

    response = es.search(index=settings.ES_INDEX, doc_type=settings.ES_DOCTYPE,
                         search_type='count', body=query)

    days = set()
    result = []
    for outer_bucket in response['aggregations']['over_time']['buckets']:
        on_day = []
        for inner_bucket in outer_bucket['over_time']['buckets']:
            day = inner_bucket['key_as_string']
            days.add(day)
            on_day.append({
                'day': day,
                'value': inner_bucket['word_counts']['value'],
            })
        result.append({
            'source': outer_bucket['key'],
            'values': on_day,
        })

    # Fill up the missing days with zeros.
    for source in result:
        missing_days = days - set(map(lambda d: d['day'], source['values']))
        for day in missing_days:
            source['values'].append({
                'day': day,
                'value': 0,
            })
        source['values'].sort(key=lambda d: d['day'])
        source['values'] = [v['value'] for v in source['values']]

    return result