示例#1
0
def update_contexts(incremental=True):
    """Compute and store the location-grouped ``contexts`` list of each story.

    For every story that has an ``entities`` field, the person and company
    contexts of each entity are collected (via ``find_person_contexts`` and
    ``find_company_contexts``), grouped by their ``location`` value, and
    written to ``story['contexts']`` as a list of
    ``{'location': ..., 'entities': [...]}`` dicts. The story is then saved
    back to ``_db.stories``.

    :param incremental: when true, only stories that do not yet have a
        ``contexts`` field are processed; otherwise every story with
        entities is recomputed.
    """
    # Lookup tables handed to the context finders: person name -> name, and
    # cleaned company name -> original name ('' for unknown keys, because of
    # defaultdict(str)).
    person_names = dict(
        (person['name'], person['name']) for person in _db.persons.find()
    )
    company_names = collections.defaultdict(
        str,
        ((clean_company_name(company['name']), company['name'])
         for company in _db.companies.find())
    )

    story_criteria = {'entities': {'$exists': True}}
    if incremental:
        story_criteria['contexts'] = {'$exists': False}

    def location_of(context):
        return context['location']

    for story in _db.stories.find(story_criteria):
        contexts = []
        for entity in story['entities']:
            contexts.extend(find_person_contexts(entity, person_names))
            contexts.extend(find_company_contexts(entity, company_names))

        # Collapse by location. groupby only merges adjacent items, hence the
        # sort on the same key first.
        contexts.sort(key=location_of)
        # Each group is materialised into a real list right away: groupby's
        # sub-iterators become invalid as soon as the outer iterator advances,
        # and a lazy map object (Python 3) would not hold the entities.
        story['contexts'] = [
            {
                'location': location,
                'entities': [context['entity'] for context in group],
            }
            for location, group in itertools.groupby(contexts, key=location_of)
        ]

        _db.stories.save(story)
示例#2
0
def transform_headlines_blurbs(incremental=True):
    """Rewrite story headlines, blurbs and entity instances with local context.

    For each context of each selected story, every entity of type ``Person``
    or ``Company`` has its name wrapped in a ``<span class="context">``
    snippet mentioning the context's location (city if set, otherwise state).
    The rewritten title goes to ``context['headline']``, the rewritten content
    to ``context['blurb']``, and the rewritten text instances to
    ``entity['instances']``. The story is then saved back to ``_db.stories``.

    When a context has several entities, the headline/blurb of the last
    entity that produced a truthy result wins, since each rewrite starts from
    the original title/content.

    :param incremental: when true, only stories for which
        ``contexts.headline`` does not exist yet are processed.
    """
    PERSON_PATTERN = '<span class="context">%s native %s</span>'
    COMPANY_PATTERN = '<span class="context">%s-based %s</span>'

    story_criteria = {
        'entities': {'$exists': True},
        'contexts': {'$exists': True, '$ne': []},
    }
    if incremental:
        story_criteria['contexts.headline'] = {'$exists': False}

    for story in _db.stories.find(story_criteria):
        for context in story['contexts']:
            context['headline'] = None
            context['blurb'] = None

            location = context['location']
            display_location = location['city'] if location['city'] else location['state']

            # The location is fixed for the whole context, so fill it in once;
            # the remaining '%s' is left for search_and_replace_text.
            person_markup = PERSON_PATTERN % (display_location, '%s')
            company_markup = COMPANY_PATTERN % (display_location, '%s')

            for entity in context['entities']:
                name = entity['name']

                if entity['type'] == 'Person':
                    markup = person_markup
                    # Headlines often use the surname only.
                    headline_terms = [name, name.split()[-1]]
                elif entity['type'] == 'Company':
                    markup = company_markup
                    headline_terms = [name, clean_company_name(name, robust=True)]
                else:
                    # Unknown entity type: nothing to rewrite. Previously this
                    # hit unbound (or stale, from the previous entity)
                    # new_headline/new_blurb variables.
                    markup = None

                # Transform headlines
                if markup is not None:
                    new_headline = search_and_replace_text(
                        story['titleNoFormatting'], headline_terms, markup)
                    new_blurb = search_and_replace_text(
                        story['content'], name, markup)

                    if new_headline:
                        logging.debug(new_headline)
                        context['headline'] = new_headline

                    if new_blurb:
                        context['blurb'] = new_blurb

                # Transform "blurbs": take the instances of the first story
                # entity whose text equals this entity's name.
                # NOTE(review): falls back to no instances when nothing
                # matches instead of raising StopIteration - confirm that an
                # empty list is the desired outcome for unmatched names.
                instances = next(
                    (e['instances'] for e in story['entities'] if e['text'] == name),
                    [],
                )
                if markup is None:
                    entity['instances'] = list(instances)
                else:
                    entity['instances'] = [
                        search_and_replace_text(instance, name, markup)
                        for instance in instances
                    ]

        _db.stories.save(story)