Exemplo n.º 1
0
 def handle(self, *args, **options):
     """Fetch daily CREC ``mods.xml`` files from S3 and load their records.

     Walks one day at a time from ``options['start_date']`` (truncated to
     midnight) up to, but not including, ``options['end_date']``. For each
     day the ``mods.xml`` object is requested from
     ``options['source_bucket']`` and parsed with ``extract_crecs_from_mods``.
     Every record that is not skippable is either logged (when
     ``options['to_stdout']`` is truthy) or saved as an elasticsearch
     document. A failure on one day is logged and the loop moves on to the
     next day.

     Args:
         *args: Unused positional arguments.
         **options: Command options. The keys read here are ``start_date``,
             ``end_date``, ``source_bucket``, ``to_stdout`` and, only when
             ``to_stdout`` is falsy, ``es_url``.
     """
     s3 = boto3.resource(
         's3',
         aws_access_key_id=settings.AWS_ACCESS_KEY,
         aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
     )
     dt = options['start_date'].replace(hour=0, minute=0, second=0, microsecond=0)
     if not options['to_stdout']:
         # The elasticsearch connection is only needed when documents are saved.
         connections.create_connection(
             hosts=[options['es_url']], **settings.ES_CONNECTION_PARAMS
         )
         CRECDoc.init()
     while dt < options['end_date']:
         logger.info('Processing files for {0}.'.format(dt))
         try:
             response = s3.Object(
                 options['source_bucket'],
                 crec_s3_key('mods.xml', dt)
             ).get()
         except botocore.exceptions.ClientError as e:
             # Only a missing key means "no file for this day". Other client
             # errors (permissions, throttling, ...) are still skipped, but
             # are reported as such instead of being mislabelled as missing.
             error_code = e.response.get('Error', {}).get('Code')
             if error_code in ('NoSuchKey', '404'):
                 logger.info('Could not find mods file for {0}.'.format(dt))
             else:
                 logger.warning(
                     'Could not fetch mods file for {0} ({1}).'.format(
                         dt, error_code))
             response = None
         if response is not None and response.get('Body'):
             try:
                 crecs = extract_crecs_from_mods(response['Body'])
                 logger.info('Found {0} new records.'.format(len(crecs)))
                 if options['to_stdout']:
                     logger.info('Using stdout:')
                 for crec in crecs:
                     if not crec.is_skippable():
                         if options['to_stdout']:
                             logger.info(crec.to_es_doc())
                         else:
                             es_doc = crec.to_es_doc()
                             es_doc.save()
                         # NOTE(review): this also runs in to_stdout mode;
                         # confirm that is intended.
                         upload_speaker_word_counts(crec)
             except Exception:
                 # Best effort: log the traceback and continue with the next day.
                 logger.exception('Error processing data for {0}.'.format(dt.strftime('%Y-%m-%d')))
         dt += timedelta(days=1)
Exemplo n.º 2
0
 def setUp(self):
     """Create and save twenty CRECDoc fixtures and a test client.

     The fixtures are titled '0' through '19', share the same content and
     have issue dates cycling over 2017-01-01 .. 2017-01-05.
     """
     self.es_conn = connections.get_connection()
     self.test_crecs = [
         CRECDoc(
             title=str(n),
             content='foo bar baz Foo',
             date_issued=datetime(2017, 1, n % 5 + 1),
         )
         for n in range(20)
     ]
     self.index = Index(settings.ES_CW_INDEX)
     CRECDoc.init()
     for doc in self.test_crecs:
         doc.save(refresh=True)
     self.client = Client()
Exemplo n.º 3
0
def get_text_search_results(start_date, end_date, terms, size=10, offset=0):
    """Run a phrase "match" query for every field given in ``terms``.

    One ``Match`` query of type ``phrase`` is added per entry in ``terms``,
    the hits are restricted to ``start_date <= date_issued <= end_date``
    and the ``[offset:offset + size]`` slice of the result is fetched.

    NOTE(review): earlier documentation said multiple fields are or'd
    together, but each query is added with a chained ``.query()`` call,
    which elasticsearch-dsl appears to combine with AND -- confirm which
    behaviour is intended.

    Args:
        start_date (datetime): Start of date range (inclusive).
        end_date (datetime): End of date range (inclusive).
        terms (dict): A dict mapping field name to search term.
        size (int): The number of results to retrieve, defaults to 10.
        offset (int): The offset from the highest search result to return
            items from (for pagination).

    Returns:
        list: The matching CREC documents as dicts, each with
            ``date_issued`` formatted as ``YYYY-MM-DD`` and an added
            ``score`` key, ordered from highest to lowest score.
    """
    def as_row(hit):
        # Flatten one hit into a plain dict carrying its search score.
        row = hit.to_dict()
        row['date_issued'] = hit.date_issued.strftime('%Y-%m-%d')
        row['score'] = hit.meta.score
        return row

    query = CRECDoc.search()
    for field, search_term in terms.items():
        phrase = {'query': search_term, 'type': 'phrase'}
        query = query.query(Match(**{field: phrase}))
    date_window = {'gte': start_date, 'lte': end_date}
    query = query.filter('range', date_issued=date_window).sort('_score')
    page = query[offset:offset + size]
    rows = [as_row(hit) for hit in page.execute()]
    return sorted(rows, key=lambda row: -row['score'])
Exemplo n.º 4
0
 def setUp(self):
     """Save twenty CRECDoc fixtures and prepare a test client.

     Titles run from '0' to '19'; the issue date of fixture ``n`` is day
     ``n % 5 + 1`` of January 2017.
     """
     self.es_conn = connections.get_connection()
     fixtures = []
     for number in range(20):
         fixture = CRECDoc(title=str(number),
                           content='foo bar baz Foo',
                           date_issued=datetime(2017, 1, number % 5 + 1))
         fixtures.append(fixture)
     self.test_crecs = fixtures
     self.index = Index(settings.ES_CW_INDEX)
     CRECDoc.init()
     for fixture in self.test_crecs:
         fixture.save(refresh=True)
     self.client = Client()
Exemplo n.º 5
0
 def test_search_by_content(self):
     """Searching by ``content`` returns the one document holding the term.

     Saves a document whose content is 'blah', queries ``/cwapi/search/``
     for that content over 2017-01-01 .. 2017-01-30 and expects exactly
     that document back.
     """
     c = CRECDoc(title='foo',
                 content='blah',
                 date_issued=datetime(2017, 1, 1))
     c.save(refresh=True)
     start_date = datetime(2017, 1, 1)
     end_date = datetime(2017, 1, 30)
     query_args = {
         'start_date': start_date.strftime('%Y-%m-%d'),
         'end_date': end_date.strftime('%Y-%m-%d'),
         'content': 'blah',
     }
     response = self.client.get('/cwapi/search/', query_args)
     response_content = response.json()
     results = response_content['data']
     # assertEqual replaces the deprecated assertEquals alias, which no
     # longer exists on Python 3.12+.
     self.assertEqual(1, len(results))
     self.assertEqual('foo', results[0]['title'])
     self.assertEqual('blah', results[0]['content'])
Exemplo n.º 6
0
 def test_search_by_content(self):
     """A content search returns only the document containing the term.

     Saves a document whose content is 'blah', requests ``/cwapi/search/``
     with that content and the date range 2017-01-01 .. 2017-01-30, and
     checks that the single result is that document.
     """
     c = CRECDoc(
         title='foo',
         content='blah',
         date_issued=datetime(2017, 1, 1)
     )
     c.save(refresh=True)
     start_date = datetime(2017, 1, 1)
     end_date = datetime(2017, 1, 30)
     query_args = {
         'start_date': start_date.strftime('%Y-%m-%d'),
         'end_date': end_date.strftime('%Y-%m-%d'),
         'content': 'blah',
     }
     response = self.client.get('/cwapi/search/', query_args)
     response_content = response.json()
     results = response_content['data']
     # The assertEquals alias is deprecated and was removed in Python 3.12;
     # assertEqual is the supported spelling.
     self.assertEqual(1, len(results))
     self.assertEqual('foo', results[0]['title'])
     self.assertEqual('blah', results[0]['content'])
Exemplo n.º 7
0
 def handle(self, *args, **options):
     """Load CREC records from daily ``mods.xml`` files stored in S3.

     Starting at midnight of ``options['start_date']`` and advancing one
     day at a time while the date is before ``options['end_date']``, the
     day's ``mods.xml`` object is read from ``options['source_bucket']``
     and parsed with ``extract_crecs_from_mods``. Records that are not
     skippable are logged when ``options['to_stdout']`` is truthy and
     saved to elasticsearch otherwise. Errors for a single day are logged
     and do not stop later days from being processed.

     Args:
         *args: Unused positional arguments.
         **options: Command options. The keys read here are ``start_date``,
             ``end_date``, ``source_bucket``, ``to_stdout`` and, only when
             ``to_stdout`` is falsy, ``es_url``.
     """
     s3 = boto3.resource(
         's3',
         aws_access_key_id=settings.AWS_ACCESS_KEY,
         aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
     dt = options['start_date'].replace(hour=0,
                                        minute=0,
                                        second=0,
                                        microsecond=0)
     if not options['to_stdout']:
         # An elasticsearch connection is only set up when saving documents.
         connections.create_connection(hosts=[options['es_url']],
                                       **settings.ES_CONNECTION_PARAMS)
         CRECDoc.init()
     while dt < options['end_date']:
         logger.info('Processing files for {0}.'.format(dt))
         try:
             response = s3.Object(options['source_bucket'],
                                  crec_s3_key('mods.xml', dt)).get()
         except botocore.exceptions.ClientError as e:
             # Distinguish "no file for this day" from other S3 failures so
             # that permission or throttling problems are not reported as a
             # missing file. The day is skipped in both cases.
             error_code = e.response.get('Error', {}).get('Code')
             if error_code in ('NoSuchKey', '404'):
                 logger.info('Could not find mods file for {0}.'.format(dt))
             else:
                 logger.warning(
                     'Could not fetch mods file for {0} ({1}).'.format(
                         dt, error_code))
             response = None
         if response is not None and response.get('Body'):
             try:
                 crecs = extract_crecs_from_mods(response['Body'])
                 logger.info('Found {0} new records.'.format(len(crecs)))
                 if options['to_stdout']:
                     logger.info('Using stdout:')
                 for crec in crecs:
                     if not crec.is_skippable():
                         if options['to_stdout']:
                             logger.info(crec.to_es_doc())
                         else:
                             es_doc = crec.to_es_doc()
                             es_doc.save()
                         # NOTE(review): word counts are uploaded in
                         # to_stdout mode too; confirm that is intended.
                         upload_speaker_word_counts(crec)
             except Exception:
                 # Best effort: record the traceback and carry on.
                 logger.exception('Error processing data for {0}.'.format(
                     dt.strftime('%Y-%m-%d')))
         dt += timedelta(days=1)
Exemplo n.º 8
0
 def to_es_doc(self):
     """Build a CRECDoc populated from this object's attributes.

     ``self.id`` is stored under ``crec_id`` and ``self.speakers`` is
     joined into a single comma-separated string; the remaining fields are
     copied across under the same names.

     Returns:
         CRECDoc: A document built from this object's fields.
     """
     doc_fields = {
         'title': self.title,
         'title_part': self.title_part,
         'date_issued': self.date_issued,
         'content': self.content,
         'crec_id': self.id,
         'pdf_url': self.pdf_url,
         'html_url': self.html_url,
         'page_start': self.page_start,
         'page_end': self.page_end,
         'speakers': ','.join(self.speakers),
         'segments': self.segments,
     }
     return CRECDoc(**doc_fields)
Exemplo n.º 9
0
def get_text_search_results(start_date, end_date, terms, size=10, offset=0):
    """Search CREC documents with one phrase "match" query per given field.

    Each ``field -> term`` pair in ``terms`` becomes a ``Match`` query of
    type ``phrase``. Results are limited to documents whose ``date_issued``
    lies between ``start_date`` and ``end_date`` (both inclusive) and only
    the ``[offset:offset + size]`` window is retrieved.

    NOTE(review): the previous docstring claimed multiple fields are or'd
    together; the queries are added through successive ``.query()`` calls,
    which elasticsearch-dsl appears to AND together -- confirm the intent.

    Args:
        start_date (datetime): Start of date range.
        end_date (datetime): End of date range.
        terms (dict): A dict mapping field name to search term.
        size (int): The number of results to retrieve, defaults to 10.
        offset (int): The offset from the highest search result to return
            items from (for pagination).

    Returns:
        list: CREC documents as dicts with ``date_issued`` rendered as
            ``YYYY-MM-DD`` and the search ``score`` added, sorted by
            descending score.
    """
    request = CRECDoc.search()
    for field, search_term in terms.items():
        match_spec = {field: {'query': search_term, 'type': 'phrase'}}
        request = request.query(Match(**match_spec))
    request = request.filter(
        'range', date_issued={'gte': start_date, 'lte': end_date}
    )
    request = request.sort('_score')
    hits = request[offset:offset + size].execute()

    docs = []
    for hit in hits:
        doc = hit.to_dict()
        doc['date_issued'] = hit.date_issued.strftime('%Y-%m-%d')
        doc['score'] = hit.meta.score
        docs.append(doc)
    # Negating the score yields highest-first order.
    docs.sort(key=lambda doc: -doc['score'])
    return docs