# Some entries in art_data are null, remove them before writing final json null_art_indexes = [] # art without a match, for manual inspection unmatched_art = [] # First, merge camps-2012.json data into playaevents-camps-2012.json if scraper_art_file: for scraper_art in scraper_json: max_match = 0 max_match_playa_art_index = -1 if scraper_art != None: for index, playa_art in enumerate(playa_json): if scraper_art != None: match = Levenshtein.ratio( cleanString(playa_art['name']), cleanString(scraper_art['title'])) if match > max_match: max_match = match max_match_playa_art_index = index #print "Best match for " + camp['name'] + " : " + max_match_location['name'] + " (confidence: " + str(max_match) + ")" if max_match > MATCH_THRESHOLD: # Match found. Merge scraper data into playa data if 'description' in scraper_art: playa_json[max_match_playa_art_index][ 'description'] = scraper_art['description'] if 'url' in scraper_art: playa_json[max_match_playa_art_index]['url'] = scraper_art[ 'url'] if 'image_url' in scraper_art: playa_json[max_match_playa_art_index][
events_json = json.loads(events_file.read())

# Some entries in event_data are null, remove them before writing final json
null_camp_indexes = []
# camps without a match, for manual inspection
unmatched_camps = []

# Match the "name" field of each camp against the camp name embedded in each
# event ("hosted_by_camp"), using Levenshtein ratio for fuzzy comparison.
for index, camp in enumerate(camp_json):
    max_match = 0
    max_match_event = ""
    if camp is not None and "name" in camp:
        # Scan every event for the closest-named hosting camp.
        for event in events_json:
            if "hosted_by_camp" in event:
                match = Levenshtein.ratio(cleanString(camp["name"]),
                                          cleanString(event["hosted_by_camp"]["name"]))
                if match > max_match:
                    max_match = match
                    max_match_event = event
        # print "Best match for " + event['name'] + " : " + max_match_camp['name'] + " (confidence: " + str(max_match) + ")"
        if max_match > MATCH_THRESHOLD:
            # Match found: copy the canonical camp id from the event data.
            camp["id"] = max_match_event["hosted_by_camp"]["id"]
        else:
            unmatched_camps.append(camp)
    else:
        # BUG FIX: the original `elif not "name" in camp:` raised
        # TypeError ("in" on NoneType) whenever camp was None — exactly the
        # null entries this list is meant to collect. A plain else records
        # both None camps and camps lacking a "name" for later removal.
        null_camp_indexes.append(index)

# To remove null entries from list, we must move in reverse
# to preserve list order as we remove
null_camp_indexes.reverse()
# Bookkeeping lists: null entries to drop, plus match/no-match buckets
# kept for manual inspection.
null_camp_indexes = []
# camps without a match, for manual inspection
unmatched_camps = []
matched_camps = []

# First, merge camps-2012.json data into playaevents-camps-2012.json
if scraper_file:
    for scraper_camp in scraper_json:
        max_match = 0
        max_match_playa_camp_index = -1
        if scraper_camp != None:
            # Fuzzy-match this scraped camp name against every playa camp,
            # remembering the index of the best-scoring candidate.
            for index, playa_camp in enumerate(playa_json):
                if playa_camp != None:
                    match = Levenshtein.ratio(cleanString(playa_camp['name']),
                                              cleanString(scraper_camp['name']))
                    if match > max_match:
                        max_match = match
                        max_match_playa_camp_index = index
            matchDict = None
            if max_match > MATCH_THRESHOLD:
                # Confident match: merge scraped fields into the existing entry.
                matchDict = playa_json[max_match_playa_camp_index]
            else:
                #print "Best match for " + camp['name'] + " : " + max_match_location['name'] + " (confidence: " + str(max_match) + ")"
                # No confident match: append the scraped camp as a new entry.
                matchDict = {}
                matchDict['name'] = scraper_camp['name']
                playa_json.append(matchDict)
            if 'description' in scraper_camp:
                matchDict['description'] = scraper_camp['description']
# Some entries are null, remove them before writing final json
null_camp_indexes = []
# camps without a match, for manual inspection
unmatched_camps = []
matched_camps = []

# match name fields between entries in two files: each camp's "name" is
# fuzzy-compared (Levenshtein ratio) to every event's hosting-camp name.
for index, camp in enumerate(camp_json):
    max_match = 0
    max_match_event = ''
    if camp is not None and 'name' in camp:
        for event in events_json:
            if 'hosted_by_camp' in event:
                match = Levenshtein.ratio(cleanString(camp['name']),
                                          cleanString(event['hosted_by_camp']['name']))
                if match > max_match:
                    max_match = match
                    max_match_event = event
        #print "Best match for " + event['name'] + " : " + max_match_camp['name'] + " (confidence: " + str(max_match) + ")"
        if max_match > MATCH_THRESHOLD:
            # Match found: adopt the canonical camp id from the event data.
            camp['id'] = max_match_event['hosted_by_camp']['id']
            matched_camps.append(camp)
        else:
            unmatched_camps.append(camp)
    else:
        # BUG FIX: the original `elif not 'name' in camp:` raised
        # TypeError ("in" on NoneType) for None camps. A plain else records
        # both None camps and nameless camps for removal.
        null_camp_indexes.append(index)

# To remove null entries from list, we must move in reverse
# Some entries in art_data are null, remove them before writing final json null_art_indexes = [] # art without a match, for manual inspection unmatched_art = [] # First, merge camps-2012.json data into playaevents-camps-2012.json if scraper_art_file: for scraper_art in scraper_json: max_match = 0 max_match_playa_art_index = -1 if scraper_art != None: for index, playa_art in enumerate(playa_json): if scraper_art != None: match = Levenshtein.ratio(cleanString(playa_art['name']), cleanString(scraper_art['title'])) if match > max_match: max_match = match max_match_playa_art_index = index #print "Best match for " + camp['name'] + " : " + max_match_location['name'] + " (confidence: " + str(max_match) + ")" if max_match > MATCH_THRESHOLD: # Match found. Merge scraper data into playa data if 'description' in scraper_art: playa_json[max_match_playa_art_index]['description'] = scraper_art['description'] if 'url' in scraper_art: playa_json[max_match_playa_art_index]['url'] = scraper_art['url'] if 'image_url' in scraper_art: playa_json[max_match_playa_art_index]['image_url'] = scraper_art['image_url'] if 'artists' in scraper_art: playa_json[max_match_playa_art_index]['artist'] = scraper_art['artists'] if 'artist_location' in scraper_art:
events_json = json.loads(events_file.read()) # Some entries in event_data are null, remove them before writing final json null_event_indexes = [] # events without a match, for manual inspection unmatched_events = [] matched_events = [] # match name fields between entries in two files for index, event in enumerate(events_json): max_match = 0 max_match_location = '' if event != None and 'hosted_by_camp' in event: for location in location_json: match = Levenshtein.ratio(cleanString(location['name']), cleanString(event['hosted_by_camp']['name'])) if match > max_match: max_match = match max_match_location = location #print "Best match for " + event['name'] + " : " + max_match_location['name'] + " (confidence: " + str(max_match) + ")" if max_match > MATCH_THRESHOLD: # Match found if 'latitude' in max_match_location and max_match_location['latitude'] != "": event['latitude'] = max_match_location['latitude'] event['longitude'] = max_match_location['longitude'] #event['location'] = max_match_location['location'] event['matched_name'] = max_match_location['name'] matched_events.append(event) else: unmatched_events.append(event) elif not 'hosted_by_camp' in event:
# Some entries in event_data are null, remove them before writing final json null_event_indexes = [] # events without a match, for manual inspection unmatched_events = [] matched_events = [] # match name fields between entries in two files for index, event in enumerate(events_json): max_match = 0 max_match_location = '' if event != None and 'hosted_by_camp' in event: for location in location_json: match = Levenshtein.ratio( cleanString(location['name']), cleanString(event['hosted_by_camp']['name'])) if match > max_match: max_match = match max_match_location = location #print "Best match for " + event['name'] + " : " + max_match_location['name'] + " (confidence: " + str(max_match) + ")" if max_match > MATCH_THRESHOLD: # Match found if 'latitude' in max_match_location and max_match_location[ 'latitude'] != "": event['latitude'] = max_match_location['latitude'] event['longitude'] = max_match_location['longitude'] #event['location'] = max_match_location['location'] event['matched_name'] = max_match_location['name'] matched_events.append(event) else: