예제 #1
0
# Some entries in art_data are null, remove them before writing final json
null_art_indexes = []

# art without a match, for manual inspection
unmatched_art = []

# Merge the scraper's art records into the playa art records: for every
# scraper entry, find the playa entry whose cleaned name is most similar to
# the scraper entry's cleaned title and, when the similarity exceeds
# MATCH_THRESHOLD, copy the scraper's fields onto that playa entry.
if scraper_art_file:
    for scraper_art in scraper_json:
        max_match = 0
        max_match_playa_art_index = -1
        if scraper_art is not None:
            for index, playa_art in enumerate(playa_json):
                # Skip null playa entries. This guard previously re-tested
                # scraper_art (already known to be non-null at this point),
                # so a null playa entry crashed on playa_art['name'].
                if playa_art is not None:
                    match = Levenshtein.ratio(
                        cleanString(playa_art['name']),
                        cleanString(scraper_art['title']))
                    # Strict '>' keeps the first entry among equal scores
                    if match > max_match:
                        max_match = match
                        max_match_playa_art_index = index
            if max_match > MATCH_THRESHOLD:
                # Match found. Merge scraper data into playa data
                if 'description' in scraper_art:
                    playa_json[max_match_playa_art_index][
                        'description'] = scraper_art['description']
                if 'url' in scraper_art:
                    playa_json[max_match_playa_art_index]['url'] = scraper_art[
                        'url']
                if 'image_url' in scraper_art:
                    playa_json[max_match_playa_art_index][
events_json = json.loads(events_file.read())

# Indexes of camp_json entries that are null or have no name; collected so
# they can be removed before writing final json
null_camp_indexes = []

# camps without a match, for manual inspection
unmatched_camps = []

# Match each camp against the "hosted_by_camp" names found in the events and
# copy the id of the best-matching host camp onto the camp entry.
for index, camp in enumerate(camp_json):
    max_match = 0
    max_match_event = ""

    # Null or nameless entries cannot be matched; remember their position.
    # The previous `elif not "name" in camp` raised TypeError for a null
    # entry, because a membership test on None is not allowed.
    if camp is None or "name" not in camp:
        null_camp_indexes.append(index)
        continue

    for event in events_json:
        if "hosted_by_camp" in event:
            match = Levenshtein.ratio(
                cleanString(camp["name"]),
                cleanString(event["hosted_by_camp"]["name"]))
            # Strict '>' keeps the first event among equal scores
            if match > max_match:
                max_match = match
                max_match_event = event

    if max_match > MATCH_THRESHOLD:
        # Match found
        camp["id"] = max_match_event["hosted_by_camp"]["id"]
    else:
        unmatched_camps.append(camp)

# To remove null entries from list, we must move in reverse
# to preserve list order as we remove
null_camp_indexes.reverse()
예제 #3
0
# Indexes of null camp entries
null_camp_indexes = []

# Camps without a match, for manual inspection
unmatched_camps = []

matched_camps = []

# Fold the scraper's camp records into the playa camp list. A scraper camp
# whose best name similarity exceeds MATCH_THRESHOLD enriches the matching
# playa entry; otherwise it is appended to the playa list as a new entry.
if scraper_file:
    for scraper_camp in scraper_json:
        max_match = 0
        max_match_playa_camp_index = -1
        if scraper_camp is None:
            continue

        # Find the playa camp whose cleaned name is closest to this one
        for index, playa_camp in enumerate(playa_json):
            if playa_camp is None:
                continue
            match = Levenshtein.ratio(
                cleanString(playa_camp['name']),
                cleanString(scraper_camp['name']))
            if match > max_match:
                max_match, max_match_playa_camp_index = match, index

        if max_match > MATCH_THRESHOLD:
            # Confident match: update the existing playa entry in place
            matchDict = playa_json[max_match_playa_camp_index]
        else:
            # No confident match: register the scraper camp as a new entry
            matchDict = {'name': scraper_camp['name']}
            playa_json.append(matchDict)

        if 'description' in scraper_camp:
            matchDict['description'] = scraper_camp['description']
# Indexes of camp_json entries that are null or have no name
null_camp_indexes = []

# camps without a match, for manual inspection
unmatched_camps = []

matched_camps = []

# Match name fields between entries in two files: each named camp is compared
# with the 'hosted_by_camp' name of every event, and the id of the most
# similar host camp is copied onto the camp when the similarity exceeds
# MATCH_THRESHOLD.
for index, camp in enumerate(camp_json):
    max_match = 0
    max_match_event = ''
    if camp is not None and 'name' in camp:
        for event in events_json:
            if 'hosted_by_camp' in event:
                match = Levenshtein.ratio(
                    cleanString(camp['name']),
                    cleanString(event['hosted_by_camp']['name']))
                # Strict '>' keeps the first event among equal scores
                if match > max_match:
                    max_match = match
                    max_match_event = event
        if max_match > MATCH_THRESHOLD:
            # Match found
            camp['id'] = max_match_event['hosted_by_camp']['id']
            matched_camps.append(camp)
        else:
            unmatched_camps.append(camp)
    else:
        # Null or nameless entry. A plain `else` replaces the former
        # `elif not 'name' in camp`, which raised TypeError when camp was
        # None because a membership test on None is not allowed.
        null_camp_indexes.append(index)

# To remove null entries from list, we must move in reverse
예제 #5
0
# Some entries in art_data are null, remove them before writing final json
null_art_indexes = []

# art without a match, for manual inspection
unmatched_art = []

# Merge the scraper's art records into the playa art records: for every
# scraper entry, find the playa entry whose cleaned name is most similar to
# the scraper entry's cleaned title and, when the similarity exceeds
# MATCH_THRESHOLD, copy the scraper's fields onto that playa entry.
if scraper_art_file:
    for scraper_art in scraper_json:
        max_match = 0
        max_match_playa_art_index = -1
        if scraper_art is not None:
            for index, playa_art in enumerate(playa_json):
                # Skip null playa entries. This guard previously re-tested
                # scraper_art (already known to be non-null at this point),
                # so a null playa entry crashed on playa_art['name'].
                if playa_art is not None:
                    match = Levenshtein.ratio(
                        cleanString(playa_art['name']),
                        cleanString(scraper_art['title']))
                    # Strict '>' keeps the first entry among equal scores
                    if match > max_match:
                        max_match = match
                        max_match_playa_art_index = index
            if max_match > MATCH_THRESHOLD:
                # Match found. Merge scraper data into playa data.
                # Pairs are (scraper key, playa key); note that the scraper's
                # plural 'artists' is stored under the singular 'artist'.
                matched_playa_art = playa_json[max_match_playa_art_index]
                for scraper_key, playa_key in (
                        ('description', 'description'),
                        ('url', 'url'),
                        ('image_url', 'image_url'),
                        ('artists', 'artist')):
                    if scraper_key in scraper_art:
                        matched_playa_art[playa_key] = scraper_art[scraper_key]
                if 'artist_location' in scraper_art:
# Parse the whole events file into a list of event entries
events_json = json.loads(events_file.read())

# Some entries in event_data are null, remove them before writing final json
null_event_indexes = []

# events without a match, for manual inspection
unmatched_events = []
# events for which a location was found
matched_events = []

# match name fields between entries in two files:
# each event's host camp name is compared with every location name, and the
# most similar location supplies the event's coordinates.
for index, event in enumerate(events_json):
    # Best similarity seen so far for this event, and the location that
    # produced it ('' until some location scores above 0)
    max_match = 0
    max_match_location = ''
    # NOTE(review): the `event != None` test implies events_json can hold null
    # entries; for such an entry the following
    # `elif not 'hosted_by_camp' in event` would raise TypeError - confirm.
    if event != None and 'hosted_by_camp' in event:
        for location in location_json:
                # Similarity between the cleaned location name and the cleaned
                # host camp name
                match = Levenshtein.ratio(cleanString(location['name']), cleanString(event['hosted_by_camp']['name']))
                # Strict '>' keeps the first location among equal scores
                if match > max_match:
                    max_match = match
                    max_match_location = location
        #print "Best match for " + event['name'] + " : " + max_match_location['name'] + " (confidence: " + str(max_match) + ")"
        # NOTE(review): max_match_location is still '' when nothing scored
        # above 0; this branch is only safe for that case if MATCH_THRESHOLD
        # is >= 0 - confirm.
        if max_match > MATCH_THRESHOLD:
            # Match found
            # Copy coordinates only when the location has a non-empty latitude
            if 'latitude' in max_match_location and max_match_location['latitude'] != "":
                event['latitude'] = max_match_location['latitude']
                event['longitude'] = max_match_location['longitude']
            #event['location'] = max_match_location['location']
            # Record which location name the event was matched to
            event['matched_name'] = max_match_location['name']
            matched_events.append(event)
        else:
            # Best similarity did not exceed the threshold
            unmatched_events.append(event)
    elif not 'hosted_by_camp' in event:
예제 #7
0
# Some entries in event_data are null, remove them before writing final json
null_event_indexes = []

# events without a match, for manual inspection
unmatched_events = []
# events for which a location was found
matched_events = []

# match name fields between entries in two files:
# each event's host camp name is compared with every location name, and the
# most similar location supplies the event's coordinates.
for index, event in enumerate(events_json):
    # Best similarity seen so far for this event, and the location that
    # produced it ('' until some location scores above 0)
    max_match = 0
    max_match_location = ''
    # Only non-null events that name a hosting camp can be matched
    if event != None and 'hosted_by_camp' in event:
        for location in location_json:
            # Similarity between the cleaned location name and the cleaned
            # host camp name
            match = Levenshtein.ratio(
                cleanString(location['name']),
                cleanString(event['hosted_by_camp']['name']))
            # Strict '>' keeps the first location among equal scores
            if match > max_match:
                max_match = match
                max_match_location = location
        #print "Best match for " + event['name'] + " : " + max_match_location['name'] + " (confidence: " + str(max_match) + ")"
        # NOTE(review): max_match_location is still '' when nothing scored
        # above 0; this branch is only safe for that case if MATCH_THRESHOLD
        # is >= 0 - confirm.
        if max_match > MATCH_THRESHOLD:
            # Match found
            # Copy coordinates only when the location has a non-empty latitude
            if 'latitude' in max_match_location and max_match_location[
                    'latitude'] != "":
                event['latitude'] = max_match_location['latitude']
                event['longitude'] = max_match_location['longitude']
            #event['location'] = max_match_location['location']
            # Record which location name the event was matched to
            event['matched_name'] = max_match_location['name']
            matched_events.append(event)
        else: