def calculate_musicbrainz_works_and_recordings_count(inputfile, output_compositions): # an input file contains mapped author data with musicbrainz author IDs summary = summarize.read_csv_summary(inputfile) outputfile = glob.glob(output_compositions) if not outputfile: #if not output_compositions: with codecs.open(output_compositions, 'w') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=common.musicbrainz_compositions_count_fieldnames, lineterminator='\n') writer.writeheader() for row in summary[1:]: # ignore first row, which is a header try: mapping_musicbrainz_id = row[common.MUSICBRAINZ_ID_COL] author_name = row[common.AUTHOR_NAME_COL] print 'author name:', author_name, 'mapping musicbrainz id:', mapping_musicbrainz_id # an input file contains author compositions count with musicbrainz author IDs and names author_compositions = summarize.read_csv_summary(output_compositions) isStored = False for count_row in author_compositions[1:]: musicbrainz_author_id = count_row[0] if mapping_musicbrainz_id == musicbrainz_author_id: isStored = True print 'is already stored.' break if isStored == False and mapping_musicbrainz_id: calculate_musicbrainz_works_and_recordings_by_id(mapping_musicbrainz_id.split(' ')[0], author_name, output_compositions) except: print ''
def retrieve_wikidata_compositions_by_freebase_id(inputfile): summary = summarize.read_csv_summary(inputfile) for row in summary[1:]: # ignore first row, which is a header FREEBASE_ID_COL = 1 print row[FREEBASE_ID_COL] wikidata_composition_response = retrieve_wikidata_composition_by_freebase_id( row[FREEBASE_ID_COL]) print wikidata_composition_response try: wikidata_composition_response_json = json.loads( wikidata_composition_response.content) items = wikidata_composition_response_json[ITEMS_JSON] if len(items) > 0: wikidata_composition_id = items[0] print 'wikidata_composition_id:', wikidata_composition_id composition_response_json = common.is_stored_as_json_file( WIKIDATA_API_URL + ITEMS_JSON + '[' + str(wikidata_composition_id) + ']&' + PROPS_JSON + '=*') if (composition_response_json == None): #inputfile = glob.glob(WIKIDATA_COMPOSITION_DATA_DIR + SLASH + str(wikidata_composition_id)) #if not inputfile: print 'composition data not exists for composition:', wikidata_composition_id #composition_response_json = retrieve_wikidata_composition_data(wikidata_composition_id) print 'composition json:', composition_response_json store_wikidata_composition_data(wikidata_composition_id, composition_response_json) # store_wikidata_composition_data(wikidata_composition_id, composition_response_json.content) except KeyError as ke: print 'no composition items found:', row[FREEBASE_ID_COL], ke
def retrieve_wikidata_compositions_by_musicbrainz_id(inputfile, outputfile): with codecs.open(outputfile, 'w') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=common.map_compositions_fieldnames, lineterminator='\n') writer.writeheader() summary = summarize.read_csv_summary(inputfile) for row in summary[1:]: # ignore first row, which is a header MUSICBRAINZ_ID_COL = 0 MUSICBRAINZ_AUTHOR_NAME_COL = 1 MUSICBRAINZ_TITLE_COL = 2 musicbrainz_id = row[MUSICBRAINZ_ID_COL] print musicbrainz_id wikidata_composition_response = retrieve_wikidata_composition_by_musicbrainz_id( musicbrainz_id) print wikidata_composition_response try: wikidata_composition_response_json = json.loads( wikidata_composition_response.content) items = wikidata_composition_response_json[ITEMS_JSON] viaf_id = 0 wikidata_composition_id = 0 if len(items) > 0: wikidata_composition_id = items[0] print 'wikidata_composition_id:', wikidata_composition_id composition_response_json = common.is_stored_as_json_file( WIKIDATA_API_URL + ITEMS_JSON + '[' + str(wikidata_composition_id) + ']&' + PROPS_JSON + '=*') if (composition_response_json == None): print 'composition data not exists for composition:', wikidata_composition_id print 'composition json:', composition_response_json store_wikidata_composition_data( wikidata_composition_id, composition_response_json) wikidata_composition_viaf_response = retrieve_wikidata_composition_viaf_id_by_wikidata_id( wikidata_composition_id) try: #wikidata_composition_viaf_response_json = wikidata_composition_viaf_response.json() wikidata_composition_viaf_response_json = json.loads( wikidata_composition_viaf_response.content) #items = wikidata_composition_response_json[ITEMS_JSON] viaf_id = extract_viaf_id_from_wikidata_composition_id( wikidata_composition_viaf_response_json) except: print 'No VIAF id found for composition ID:', wikidata_composition_id print 'viaf id:', viaf_id entry = build_composition_mapping_entry( row[MUSICBRAINZ_TITLE_COL], row[MUSICBRAINZ_AUTHOR_NAME_COL], wikidata_composition_id, viaf_id, musicbrainz_id) writer.writerow(entry) except KeyError as ke: print 'no composition items found:', row[ MUSICBRAINZ_ID_COL], ke
def retrieve_wikidata_objects_by_internet_archive_id(inputfile, outputfile): with codecs.open(outputfile, 'w') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=common.map_band_fieldnames, lineterminator='\n') writer.writeheader() summary = summarize.read_csv_summary(inputfile) for row in summary[1:]: # ignore first row, which is a header INTERNET_ARCHIVE_ID_COL = 0 BAND_NAME_COL = 1 internet_archive_id_path = row[INTERNET_ARCHIVE_ID_COL] internet_archive_id = internet_archive_id_path.split("/")[-1] print "internet_archive_id:", internet_archive_id wikidata_object_response = retrieve_wikidata_object_by_internet_archive_id( internet_archive_id) print wikidata_object_response try: wikidata_object_response_json = json.loads( wikidata_object_response.content) items = wikidata_object_response_json[ITEMS_JSON] wikidata_band_id = 0 musicbrainz_id = 0 if len(items) > 0: wikidata_band_id = items[0] print 'wikidata_band_id:', wikidata_band_id wikidata_band_response_json = common.is_stored_as_json_file( WIKIDATA_API_URL + ITEMS_JSON + '[' + str(wikidata_band_id) + ']&' + PROPS_JSON + '=*') if (wikidata_band_response_json == None): print 'band data not exists for id:', wikidata_band_id band_data_response = retrieve_wikidata_band_data( wikidata_band_id) wikidata_band_data_response_json = common.validate_response_json( band_data_response) store_wikidata_band_data(wikidata_band_id, wikidata_band_data_response_json) print 'band json:', wikidata_band_data_response_json try: musicbrainz_id = extract_property_value( wikidata_band_data_response_json, MUSIC_BRAINZ_ARTIST_ID_PROP) except: print 'No musicbrainz id found for band ID:', wikidata_band_id print 'musicbrainz id:', musicbrainz_id entry = build_band_mapping_entry(row[BAND_NAME_COL], wikidata_band_id, internet_archive_id, musicbrainz_id) writer.writerow(entry) except KeyError as ke: print 'no composition items found:', row[ INTERNET_ARCHIVE_ID_COL], ke
def retrieve_viaf_composition_data(inputfile): # an input file contains work titles from the VIAF repository summary = summarize.read_csv_summary(inputfile) for row in summary[1:]: # ignore first row, which is a header author_name = row[common.VIAF_COMPOSITIONS_CSV_AUTHOR_ID_COL] composition_title = row[common.VIAF_COMPOSITIONS_CSV_COMPOSITION_TITLE_COL] viaf_id = row[common.VIAF_COMPOSITIONS_CSV_VIAF_WORK_ID_COL].replace('VIAF|','') print 'author name:', author_name, 'composition title:', composition_title, 'viaf ID:', viaf_id retrieve_viaf_compositions_by_id(viaf_id)
def retrieve_authors_data_by_viaf_id(inputfile, outputfile): with codecs.open(outputfile, 'w') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=common.viaf_compositions_fieldnames, lineterminator='\n') writer.writeheader() summary = summarize.read_csv_summary(inputfile) for row in summary[1:]: # ignore first row, which is a header author_name = row[common.AUTHOR_NAME_COL] viaf_id = row[common.VIAF_ID_COL] print 'author name:', author_name, 'viaf ID:', viaf_id for id in viaf_id.split(common.BLANK): retrieve_viaf_compositions_by_author_id(author_name, id, outputfile)
def retrieve_musicbrainz_composition_data(inputfile): # an input file contains work titles from the VIAF repository summary = summarize.read_csv_summary(inputfile) if not os.path.exists(inputfile): with codecs.open(VIAF_MUSICBRAINZ_COMPOSITION_MAPPING_FILE, 'w') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=common.viaf_musicbrainz_compositions_mapping_fieldnames, lineterminator='\n') writer.writeheader() for row in summary[1:]: # ignore first row, which is a header author_name = row[common.VIAF_COMPOSITIONS_CSV_AUTHOR_ID_COL] composition_title = row[common.VIAF_COMPOSITIONS_CSV_COMPOSITION_TITLE_COL] viaf_id = row[common.VIAF_COMPOSITIONS_CSV_VIAF_WORK_ID_COL].replace('VIAF|','') print 'author name:', author_name, 'composition title:', composition_title, 'viaf ID:', viaf_id retrieve_musicbrainz_compositions_by_title(composition_title, viaf_id)
def retrieve_musicbrainz_works_and_recordings(inputfile, output_works, output_recordings): # an input file contains mapped author data with musicbrainz author IDs summary = summarize.read_csv_summary(inputfile) with codecs.open(output_works, 'w') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=common.musicbrainz_works_and_recordings_fieldnames, lineterminator='\n') writer.writeheader() with codecs.open(output_recordings, 'w') as csvfile: writer2 = csv.DictWriter(csvfile, delimiter=';', fieldnames=common.musicbrainz_works_and_recordings_fieldnames, lineterminator='\n') writer2.writeheader() for row in summary[1:]: # ignore first row, which is a header try: musicbrainz_id = row[common.MUSICBRAINZ_ID_COL] author_name = row[common.AUTHOR_NAME_COL] print 'author name:', author_name, 'musicbrainz id:', musicbrainz_id retrieve_musicbrainz_works_and_recordings_by_id(musicbrainz_id, author_name, output_works, output_recordings) except: print ''