def update_address():
    """Copy ``gs_data.full_address`` into ``address`` on NGO documents.

    Selects documents of type ``ngo`` that have ``gs_data.full_address``
    and do not yet have an ``address_updated`` field. Each selected hit
    gets an update that sets ``address`` to the GuideStar full address and
    ``address_updated`` to ``True``; the ``must_not`` clause on
    ``address_updated`` therefore excludes documents already processed.
    """

    def get_action(hit):
        """Build the update action for a single search hit."""
        address = hit['_source']['gs_data']['full_address']
        return batch.get_update_action(hit, {
            'address': address,
            'address_updated': True
        })

    batch.run(
        {
            '_source': ['gs_data.full_address'],
            'query': {
                'bool': {
                    # The type filter has to be a clause of the bool query:
                    # a query object holds exactly one query type, so a
                    # bare 'type' key next to 'bool' is not valid Query DSL.
                    # This is the same clause the other jobs in this file use.
                    'must': [
                        {'type': {'value': 'ngo'}},
                        {'exists': {'field': 'gs_data.full_address'}},
                    ],
                    'must_not': [
                        {'exists': {'field': 'address_updated'}},
                    ]
                }
            }
        },
        get_action)
def update_city():
    """Rewrite ``state`` on NGO documents from previously saved geocoding.

    Reads ``../data/location.jsonl`` (one JSON object per line, each with
    ``_id`` and ``geocoding_data``) into an id -> geocoding lookup, then
    runs a batch over ``ngo`` documents that have a ``location`` field.

    Despite the function name, the only field written is ``state``: it is
    replaced with the ``administrative_area_level_1`` component of the
    first geocoding entry, and only when the document's current ``state``
    is longer than two characters.
    """
    with open('../data/location.jsonl', 'r') as f:
        loc_dict = {}
        for line in f:
            record = json.loads(line)
            # A later line with the same _id overwrites an earlier one.
            loc_dict[record['_id']] = record['geocoding_data']

    def get_action(hit):
        """Return a one-element action list, or ``[]`` to skip the hit."""
        source = hit['_source']
        candidates = loc_dict.get(hit['_id'], None)

        # No saved geocoding for this document: nothing to do.
        if not candidates or len(candidates) == 0:
            return []
        # Only states longer than two characters are rewritten.
        if len(source.get('state') or '') <= 2:
            return []

        new_state = get_component(candidates[0],
                                  'administrative_area_level_1')
        return [batch.get_update_action(hit, {'state': new_state})]

    selection = {
        'bool': {
            'must': [
                {'type': {'value': 'ngo'}},
                {'exists': {'field': 'location'}},
            ]
        }
    }
    batch.run({'_source': ['state'], 'query': selection}, get_action)
def copy_fb_location():
    """Fill a missing ``location`` from ``fb_data.location``.

    Runs a batch over ``ngo`` documents that have ``fb_data.location`` but
    no ``location`` field, and emits an update that sets ``location`` to
    the value found at ``_source.fb_data.location``.
    """
    selection = {
        'bool': {
            'must': [
                {'type': {'value': 'ngo'}},
                {'exists': {'field': 'fb_data.location'}},
            ],
            'must_not': [
                {'exists': {'field': 'location'}},
            ]
        }
    }

    def get_action(hit):
        """Build the update action that copies the Facebook location."""
        fb_location = deep_get(hit, '_source.fb_data.location')
        return batch.get_update_action(hit, {'location': fb_location})

    batch.run(
        {'_source': ['fb_data.location'], 'query': selection},
        get_action)
def update_source():
    """Derive the ``source`` field of NGO documents from ``source_link``.

    Runs a batch over ``ngo`` documents that have no ``source`` field. For
    each hit, ``source`` is set to ``'guidestar'`` or ``'unodc'`` when that
    name occurs in ``source_link`` (checked in that order), and to ``None``
    otherwise.

    The query does not require ``source_link`` to exist, so a hit may lack
    it. Such hits are printed for inspection and treated like an
    unrecognised link (``source`` stays ``None``) instead of raising
    ``KeyError`` and aborting the batch.
    """
    known_sources = ('guidestar', 'unodc')

    def get_action(hit):
        """Build the update action carrying the derived ``source``."""
        # Missing or null link -> empty string, so the membership tests
        # below are always performed on a string.
        source_link = hit['_source'].get('source_link') or ''
        if not source_link:
            # Single-argument print(...) emits the same text under the
            # Python 2 print statement used elsewhere in this file.
            print(hit)

        source = None
        for name in known_sources:
            if name in source_link:
                source = name
                break
        return batch.get_update_action(hit, {'source': source})

    batch.run(
        {
            '_source': ['source_link'],
            'query': {
                'bool': {
                    'must': [{'type': {'value': 'ngo'}}],
                    'must_not': [{'exists': {'field': 'source'}}]
                }
            }
        },
        get_action)
def copy_data():
    """Emit an ``ngos``/``ngo`` action for every document in the batch.

    Runs a ``match_all`` batch and, for each hit, builds an action that
    targets index ``ngos`` and type ``ngo`` while keeping the hit's own
    ``_id`` and ``_source``. What ``batch.run`` does with the returned
    action is not visible in this file section.
    """

    def get_action(hit):
        """Re-target one hit at the ``ngos`` index, body unchanged."""
        action = {'_index': 'ngos', '_type': 'ngo'}
        action['_id'] = hit['_id']
        action['_source'] = hit['_source']
        return action

    everything = {'query': {'match_all': {}}}
    batch.run(everything, get_action)
def update_guidestar():
    """Re-fetch GuideStar data for documents flagged as not found.

    Runs a batch over documents that have ``source_data.organization_id``
    and match ``gs_data.not_found: True``. For each hit the organization
    is looked up again through ``guidestar.get`` with a bounded number of
    attempts, and ``gs_data`` is overwritten with the result (``None`` when
    every attempt failed).
    """
    # Same budget as the original ``retry_count <= 10`` loop: 0..10.
    max_attempts = 11
    retry_delay_seconds = 5

    def get_action(hit):
        """Look up one organization and build its update action."""
        org_id = hit['_source']['source_data']['organization_id']
        gs_data = None

        # Count every attempt, including those where get() returns a falsy
        # value without raising, so the loop is always bounded.
        for attempt in range(max_attempts):
            try:
                gs_data = guidestar.get(org_id)
            except Exception as ex:
                # Best effort: report the failure and try again.
                # Single-argument print(...) emits the same text as the
                # Python 2 statement ``print org_id, 'Error:', ex``.
                print('%s Error: %s' % (org_id, ex))
            if gs_data:
                break
            print('%s Not Found' % (org_id,))
            # No point in waiting once the attempt budget is used up.
            if attempt < max_attempts - 1:
                time.sleep(retry_delay_seconds)

        print('%s %s' % (org_id, 'Found' if gs_data else 'Really Not Found'))
        # NOTE(review): every other get_update_action call in this file
        # passes the hit itself; the original passed hit['_id'] here.
        # Confirm against batch.get_update_action's signature.
        return batch.get_update_action(hit, {'gs_data': gs_data})

    batch.run(
        {
            '_source': ['source_data.organization_id'],
            'query': {
                'bool': {
                    'must': [
                        {'exists': {'field': 'source_data.organization_id'}},
                        {'match': {"gs_data.not_found": {"query": True}}},
                    ]
                },
            }
        },
        get_action)
def clean_address():
    """Trim scraped boilerplate from ``address`` on ``charity_nav`` NGOs.

    Runs a batch over ``ngo`` documents whose ``source`` term is
    ``charity_nav``. The address is reduced to the portion the pattern
    captures ahead of a ``tel:`` or ``EIN:`` label, passed through
    ``clean_text`` and written back to ``address``.

    The query does not exclude addresses that were already trimmed, so an
    address without either label can be selected. In that case the whole
    address is passed to ``clean_text`` instead of failing on a missing
    regex match.
    """

    def get_action(hit):
        """Build the update action carrying the cleaned address."""
        raw_address = hit['_source']['address']
        m = re.search(r'.*[\r\n\t\s]+([\w\W]+?)(tel|EIN):', raw_address, re.M)
        # re.search returns None when no label is present; fall back to
        # the unmodified text rather than calling .group() on None.
        address = clean_text(m.group(1) if m else raw_address)
        return batch.get_update_action(hit, {'address': address})

    batch.run(
        {
            '_source': ['address'],
            'query': {
                'bool': {
                    'must': [
                        {'type': {'value': 'ngo'}},
                        {'term': {'source': 'charity_nav'}},
                    ]
                }
            }
        },
        get_action)
def update_coords():
    """Geocode NGO addresses and store the coordinates on the document.

    Runs a batch over ``ngo`` documents that match the wildcard
    ``address: *`` and have neither ``location`` nor ``location_error``.
    For each hit the cleaned address is sent to ``geocoding.get`` and the
    update records:

    * ``address`` - the cleaned address;
    * ``geocoding_data`` - the raw result list (``None`` if the call raised);
    * ``formatted_address`` and ``location`` (``lat``/``lon``) - taken from
      the first result, when there is one;
    * ``location_error`` - ``'NotFound'`` when the call raised or returned
      no result, ``'Ambiguous'`` when it returned more than one.
    """

    def get_action(hit):
        """Geocode one hit and build its update action."""
        address = clean_text(hit['_source']['address'])

        try:
            results = geocoding.get(address)
        except Exception as ex:
            # Single-argument print(...) emits the same text as the
            # Python 2 statement ``print address, 'Error:', ex``.
            print('%s Error: %s' % (address, ex))
            failure = {
                'address': address,
                'geocoding_data': None,
                'location_error': 'NotFound'
            }
            return batch.get_update_action(hit, failure)

        count = len(results)
        doc = {}

        if count > 0:
            # The first result is used even when several were returned.
            best = results[0]
            doc['formatted_address'] = best['formatted_address']
            doc['location'] = {
                'lat': best['geometry']['location']['lat'],
                'lon': best['geometry']['location']['lng']
            }

        if count == 0:
            doc['location_error'] = 'NotFound'
        elif count != 1:
            doc['location_error'] = 'Ambiguous'

        doc['geocoding_data'] = results
        doc['address'] = address
        return batch.get_update_action(hit, doc)

    selection = {
        'bool': {
            'must': [
                {'type': {'value': 'ngo'}},
                {'wildcard': {'address': '*'}},
            ],
            'must_not': [
                {'exists': {'field': 'location'}},
                {'exists': {'field': 'location_error'}},
            ]
        }
    }
    batch.run({'query': selection}, get_action)