def summarize_titles_data(data, fieldnames): global link_person_gnd link_person_gnd = [] global titles titles = [] global authors authors = [] global labels labels = [] if(PERSONS_JSON in data): for person in data[PERSONS_JSON]: name = common.toByteStr(person[NAME_JSON]) authors.append(name) if(SAMEAS_JSON in person): links = [link for link in person[SAMEAS_JSON]] link_person_gnd_tmp = [link for link in links if GND_JSON in link] link_person_gnd.append(' '.join(link_person_gnd_tmp)) if('terms' in data): for subject in data['terms']: if('labels' in subject): labels = [label for label in subject['labels']] if('notes' in data): titles = data['notes'] print data['notes'] print 'author: ', ' '.join(map(str,authors)) print 'labels: ', ' '.join(map(str,labels)) titles_res = '' str_list = [] for elem in titles: byte_str = common.toByteStr(elem) titles_res = titles_res + byte_str + ' ' str_list.append(byte_str) print 'titles out: ', titles_res print 'en translation: ', gs.translate(titles_res, 'en') #entry = { # 'gnd': ' '.join(map(str,link_person_gnd)), # 'author': ' '.join(map(str,authors)), # 'subject': ' '.join(map(str,labels)), # 'title': titles_res #} values = [ ' '.join(map(str,link_person_gnd)) , ' '.join(map(str,authors)) , ' '.join(map(str,labels)) , titles_res ] entry = dict(zip(fieldnames, values)) return entry
def summarize_titles_data(data, fieldnames): global link_person_gnd link_person_gnd = [] global titles titles = [] global authors authors = [] global labels labels = [] if (PERSONS_JSON in data): for person in data[PERSONS_JSON]: name = common.toByteStr(person[NAME_JSON]) authors.append(name) if (SAMEAS_JSON in person): links = [link for link in person[SAMEAS_JSON]] link_person_gnd_tmp = [ link for link in links if GND_JSON in link ] link_person_gnd.append(' '.join(link_person_gnd_tmp)) if ('terms' in data): for subject in data['terms']: if ('labels' in subject): labels = [label for label in subject['labels']] if ('notes' in data): titles = data['notes'] print data['notes'] print 'author: ', ' '.join(map(str, authors)) print 'labels: ', ' '.join(map(str, labels)) titles_res = '' str_list = [] for elem in titles: byte_str = common.toByteStr(elem) titles_res = titles_res + byte_str + ' ' str_list.append(byte_str) print 'titles out: ', titles_res print 'en translation: ', gs.translate(titles_res, 'en') #entry = { # 'gnd': ' '.join(map(str,link_person_gnd)), # 'author': ' '.join(map(str,authors)), # 'subject': ' '.join(map(str,labels)), # 'title': titles_res #} values = [ ' '.join(map(str, link_person_gnd)), ' '.join(map(str, authors)), ' '.join(map(str, labels)), titles_res ] entry = dict(zip(fieldnames, values)) return entry
def calculate_musicbrainz_works_and_recordings_by_id(id, author, output_file): try: # query_work = MUSICBRAINZ_API_URL + 'artist/' + id + '?inc=aliases%20works%20recordings&fmt=json' query_work = MUSICBRAINZ_API_URL + 'work?artist=' + id + '&inc=aliases&fmt=json' print 'query compositions:', query_work work_response = common.process_http_query(query_work) print 'musicbrainz composition:', work_response musicbrainz_composition_response_json = json.loads(work_response.content) # works_count = len(musicbrainz_composition_response_json[common.WORKS_JSON]) compositions_count = str(musicbrainz_composition_response_json[common.WORK_COUNT_JSON]) #recordings_count = len(musicbrainz_composition_response_json[common.RECORDINGS_JSON]) #compositions_count = works_count + recordings_count print 'musicbrainz #composition:', compositions_count values = [ id , common.toByteStr(author) , str(compositions_count) ] entry = dict(zip(common.musicbrainz_compositions_count_fieldnames, values)) with open(output_file, 'ab') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=common.musicbrainz_compositions_count_fieldnames, lineterminator='\n') writer.writerow(entry) except ValueError as ve: print 'Could not find JSON for given Musicbrainz composition.', id, ve.message except Exception as e: print 'Could not find Musicbrainz composition.', id, e.message return work_response
def aggregate_compositions_data(): with codecs.open(COMPOSITIONS_DATA_FILE, 'w') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=composition_data_fieldnames, lineterminator='\n') writer.writeheader() for inputfile in glob.glob(FREEBASE_COMPOSITIONS_DIR + common.SLASH + '*'): print inputfile compositions_content_json = common.read_json_file(inputfile) composition_json_list = compositions_content_json['result'][0][ 'compositions'] composition_list = get_composition_id_list_from_json_list( composition_json_list) if len(composition_list) > 0: for index, composition_id in enumerate(composition_list): composition_data = retrieve_compositions_data( composition_id) if composition_data: try: mid = composition_data['result'][0]['mid'] name = composition_data['result'][0]['name'] entry = build_composition_data_entry( composition_id, mid, common.toByteStr(name)) writer.writerow(entry) except: print 'Composition values mid and/or name is empty.'
def summarize_sameas_data(data, fieldnames): entries = [] onb_id = '' if(DOC_ID_JSON in data): onb_id = data[DOC_ID_JSON] print data[DOC_ID_JSON] if(PERSONS_JSON in data): for person in data[PERSONS_JSON]: author = person[NAME_JSON] links = [] link_person_gnd = [] if(SAMEAS_JSON in person): links = [link for link in person[SAMEAS_JSON]] link_person_gnd = [link for link in links if GND_JSON in link] gnd = '' if(len(link_person_gnd) > 0): gnd = link_person_gnd[0] values = [ onb_id , common.toByteStr(author) , gnd , ' '.join(map(str,links)) ] entry = dict(zip(fieldnames, values)) entries.append(entry) return entries
def summarize_sameas_data(data, fieldnames): entries = [] onb_id = '' if (DOC_ID_JSON in data): onb_id = data[DOC_ID_JSON] print data[DOC_ID_JSON] if (PERSONS_JSON in data): for person in data[PERSONS_JSON]: author = person[NAME_JSON] links = [] link_person_gnd = [] if (SAMEAS_JSON in person): links = [link for link in person[SAMEAS_JSON]] link_person_gnd = [link for link in links if GND_JSON in link] gnd = '' if (len(link_person_gnd) > 0): gnd = link_person_gnd[0] values = [ onb_id, common.toByteStr(author), gnd, ' '.join(map(str, links)) ] entry = dict(zip(fieldnames, values)) entries.append(entry) return entries
def get_composition_id_list_from_json_list(composition_json_list):
    """Return the sorted list of lower-cased byte-string composition ids
    found in *composition_json_list* (entries with a null 'id' skipped).
    """
    composition_list = []
    for composition_json in composition_json_list:
        # fix: compare with None by identity (PEP 8), not equality
        if composition_json['id'] is not None:
            composition_str = common.toByteStr(composition_json['id']).lower()
            if composition_str is not None:
                composition_list.append(composition_str)
    return sorted(composition_list)
def summarize_authors_data(data, fieldnames): entries = [] onb_id = '' if(DOC_ID_JSON in data): onb_id = data[DOC_ID_JSON] print data[DOC_ID_JSON] if(PERSONS_JSON in data): #isFirstTime = True for person in data[PERSONS_JSON]: author = person[NAME_JSON] link_person_gnd = [] if(SAMEAS_JSON in person): links = [link for link in person[SAMEAS_JSON]] link_person_gnd = [link for link in links if GND_JSON in link] dbpedia_items = dbpedia_helper.find_dbpedia_items(author) dbpedia_id_res = '' for key, value in dbpedia_items.iteritems(): dbpedia_id = dbpedia_helper.find_dbpedia_id(key) dbpeida_id_str = common.toByteStr(dbpedia_id) print 'DBPedia ID', dbpeida_id_str dbpedia_id_res = dbpeida_id_str + ' ' + dbpedia_id_res #if(isFirstTime == False): # onb_id = '' gnd = '' if(len(link_person_gnd) > 0): gnd = link_person_gnd[0] values = [ onb_id , common.toByteStr(author) , gnd , dbpedia_id_res ] entry = dict(zip(fieldnames, values)) entries.append(entry) #isFirstTime = False return entries
def store_composition_musicbrainz(id, json_data, author, output_file):
    """Append one Musicbrainz composition row (id, author, title, alias
    names) to *output_file* via write_composition_in_csv_file.
    """
    # fix: replace manual accumulate-with-separator loop by str.join
    alias_names = ' '.join(
        name_list[common.NAME_JSON] for name_list in json_data[common.ALIASES_JSON])
    values = [
        id,
        common.toByteStr(author),
        common.toByteStr(json_data[common.TITLE_JSON]),
        common.toByteStr(alias_names)
    ]
    entry = dict(zip(common.musicbrainz_works_and_recordings_fieldnames, values))
    write_composition_in_csv_file(output_file, entry)
def build_viaf_composition_entry(author_id, author_name, composition_id, title):
    """Assemble a VIAF composition row dict keyed by
    common.viaf_compositions_fieldnames; *title* is byte-string encoded.
    """
    row = (author_id, author_name, composition_id, common.toByteStr(title))
    return dict(zip(common.viaf_compositions_fieldnames, row))
def extract_property_label(response, property): values = '' try: json_data = response values = json_data['entities']['P'+str(property)]['labels']['en']['value'] except JSONDecodeError as jde: print 'JSONDecodeError. Response property data:', response, jde except: print 'Response json:', response print 'Unexpected error:', sys.exc_info()[0] print 'property:', property, 'values:', values return common.toByteStr(values)
def summarize_authors_data(data, fieldnames): entries = [] onb_id = '' if (DOC_ID_JSON in data): onb_id = data[DOC_ID_JSON] print data[DOC_ID_JSON] if (PERSONS_JSON in data): #isFirstTime = True for person in data[PERSONS_JSON]: author = person[NAME_JSON] link_person_gnd = [] if (SAMEAS_JSON in person): links = [link for link in person[SAMEAS_JSON]] link_person_gnd = [link for link in links if GND_JSON in link] dbpedia_items = dbpedia_helper.find_dbpedia_items(author) dbpedia_id_res = '' for key, value in dbpedia_items.iteritems(): dbpedia_id = dbpedia_helper.find_dbpedia_id(key) dbpeida_id_str = common.toByteStr(dbpedia_id) print 'DBPedia ID', dbpeida_id_str dbpedia_id_res = dbpeida_id_str + ' ' + dbpedia_id_res #if(isFirstTime == False): # onb_id = '' gnd = '' if (len(link_person_gnd) > 0): gnd = link_person_gnd[0] values = [onb_id, common.toByteStr(author), gnd, dbpedia_id_res] entry = dict(zip(fieldnames, values)) entries.append(entry) #isFirstTime = False return entries
def extract_property_label(response, property): values = '' try: json_data = response values = json_data['entities']['P' + str(property)]['labels']['en']['value'] except JSONDecodeError as jde: print 'JSONDecodeError. Response property data:', response, jde except: print 'Response json:', response print 'Unexpected error:', sys.exc_info()[0] print 'property:', property, 'values:', values return common.toByteStr(values)
def save_mapping_authors_to_composition_count_in_csv(filename_authors, outputfile): reader = csv.DictReader(open(filename_authors), delimiter=';', fieldnames=common.viaf_compositions_count_fieldnames, lineterminator='\n') firstTime = True for row in reader: if not firstTime: print 'row', row author = row[common.AUTHOR_NAME] author_id = author.split('.')[0] name, length = count_compositions(author_id) if name != None: print 'author:', name, 'len compositions', length entry = build_freebase_composition_count_entry(common.toByteStr(name), length) write_composition_in_csv_file(outputfile, entry) else: firstTime = False
def analyze_compositions():
    """Group each author's composition titles under inferred 'parent'
    titles and write (name, composition, parent, main) rows to
    AUTHOR_COMPOSITIONS_FILE.

    NOTE(review): reconstructed from a collapsed one-line source; the
    parent-inference walk is order-sensitive, so the statement order is
    preserved exactly.
    """
    with codecs.open(AUTHOR_COMPOSITIONS_FILE, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=';',
                                fieldnames=author_composition_fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        for inputfile in glob.glob(FREEBASE_COMPOSITIONS_DIR + common.SLASH + '*'):
            print inputfile
            compositions_content_json = common.read_json_file(inputfile)
            name = compositions_content_json['result'][0]['name']
            composition_json_list = compositions_content_json['result'][0][
                'compositions']
            # sorted titles, so related compositions are adjacent
            composition_list = get_composition_string_list_from_json_list(
                composition_json_list)
            if len(composition_list) > 0:
                # parent = assign_parent(composition_list[0])
                parent = ''
                for index, composition in enumerate(composition_list):
                    main = composition
                    if index == 0:
                        parent = assign_parent(composition_list[0])
                    else:
                        #if parent not in composition:
                        if not composition.startswith(parent):
                            # current title does not extend the running
                            # parent — start a new parent group
                            parent = assign_parent(composition)
                        else:
                            # refine the parent to the longest prefix shared
                            # with the previous title
                            parent_new = common.find_common_substring(
                                parent, composition_list[index - 1])
                            #parent_new = common.find_common_parent(parent,composition_list[index-1])
                            # parent ending must be either ' ' or ','
                            if parent_new != '':
                                print 'parent:', parent, 'parent_new:', parent_new, 'composition:', composition
                                if (len(parent_new) <= len(composition) and composition[len(parent_new)-1] != ' ' \
                                    and composition[len(parent_new)-1] != ','):
                                    # prefix cuts a word in half — fall back
                                    # to the full title as its own parent
                                    parent_new = composition
                                parent = parent_new
                    entry = build_author_composition_entry(
                        common.toByteStr(name), composition, parent, main)
                    writer.writerow(entry)
def extract_property_value(response, property): values = '' try: json_data = response for value_list in json_data['P' + str(property)]: value = value_list['mainsnak']['datavalue']['value'] if values == '': values = value else: values = values + ' ' + value #property_data_list = json_data[PROPS_JSON][str(property)] #values = " ".join(str(value_list[VALUE_POS_IN_WIKIDATA_PROP_LIST]) for value_list in property_data_list) except JSONDecodeError as jde: print 'JSONDecodeError. Response author data:', response, jde except: print 'Response json:', response print 'Unexpected error:', sys.exc_info()[0] print 'property:', property, 'values:', values return common.toByteStr(values)
def extract_property_value(response, property): values = '' try: json_data = response for value_list in json_data['P' + str(property)]: value = value_list['mainsnak']['datavalue']['value'] if values=='': values = value else: values = values + ' ' + value #property_data_list = json_data[PROPS_JSON][str(property)] #values = " ".join(str(value_list[VALUE_POS_IN_WIKIDATA_PROP_LIST]) for value_list in property_data_list) except JSONDecodeError as jde: print 'JSONDecodeError. Response author data:', response, jde except: print 'Response json:', response print 'Unexpected error:', sys.exc_info()[0] print 'property:', property, 'values:', values return common.toByteStr(values)
def save_mapping_authors_to_composition_count_in_csv(filename_authors, outputfile): reader = csv.DictReader( open(filename_authors), delimiter=';', fieldnames=common.viaf_compositions_count_fieldnames, lineterminator='\n') firstTime = True for row in reader: if not firstTime: print 'row', row author = row[common.AUTHOR_NAME] author_id = author.split('.')[0] name, length = count_compositions(author_id) if name != None: print 'author:', name, 'len compositions', length entry = build_freebase_composition_count_entry( common.toByteStr(name), length) write_composition_in_csv_file(outputfile, entry) else: firstTime = False
def analyze_compositions():
    """Infer a 'parent' title for every composition of every author and
    write (name, composition, parent, main) rows to
    AUTHOR_COMPOSITIONS_FILE.

    NOTE(review): reconstructed from a collapsed one-line source; the
    running-parent logic depends on the sorted order of the titles, so
    all statements are kept in their original order.
    """
    with codecs.open(AUTHOR_COMPOSITIONS_FILE, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=';',
                                fieldnames=author_composition_fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        for inputfile in glob.glob(FREEBASE_COMPOSITIONS_DIR + common.SLASH + '*'):
            print inputfile
            compositions_content_json = common.read_json_file(inputfile)
            name = compositions_content_json['result'][0]['name']
            composition_json_list = compositions_content_json['result'][0]['compositions']
            composition_list = get_composition_string_list_from_json_list(composition_json_list)
            if len(composition_list) > 0:
                # parent = assign_parent(composition_list[0])
                parent = ''
                for index, composition in enumerate(composition_list):
                    main = composition
                    if index == 0:
                        parent = assign_parent(composition_list[0])
                    else:
                        #if parent not in composition:
                        if not composition.startswith(parent):
                            # title no longer shares the running parent
                            # prefix — open a new group
                            parent = assign_parent(composition)
                        else:
                            # shrink the parent to the common prefix with
                            # the previous title
                            parent_new = common.find_common_substring(parent,composition_list[index-1])
                            #parent_new = common.find_common_parent(parent,composition_list[index-1])
                            # parent ending must be either ' ' or ','
                            if parent_new != '':
                                print 'parent:', parent, 'parent_new:', parent_new, 'composition:', composition
                                if (len(parent_new) <= len(composition) and composition[len(parent_new)-1] != ' ' \
                                    and composition[len(parent_new)-1] != ','):
                                    # prefix ends mid-word — use the whole
                                    # title as its own parent instead
                                    parent_new = composition
                                parent = parent_new
                    entry = build_author_composition_entry(common.toByteStr(name), composition, parent, main)
                    writer.writerow(entry)
def extract_and_save_label_data(europeana_response_json): fields = [] try: fields = europeana_response_json[FACETS_JSON][0] with codecs.open(FACET_COLLECTION_FILE, 'w') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=facet_collection_fieldnames, lineterminator='\n') writer.writeheader() for field in fields[FIELDS_JSON]: label = field[LABEL_JSON] count = field[COUNT_JSON] with open(FACET_COLLECTION_FILE, 'ab') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=facet_collection_fieldnames, lineterminator='\n') try: id = '' if '_' in label: id = label.split('_')[0] print 'label:', label, 'id:', id, 'count:', count label_res = common.toByteStr(label) values = [str(id), label_res, str(count)] entry = dict(zip(facet_collection_fieldnames, values)) writer.writerow(entry) except UnicodeEncodeError as uee: print 'UnicodeEncodeError. Writing data in CSV for europeana facet collection. label:', label, uee except JSONDecodeError as jde: print 'JSONDecodeError. Response europeana facet collection data:', jde except: print 'Response json:', europeana_response_json print 'Unexpected error:', sys.exc_info()[0] return fields
def aggregate_compositions_data(): with codecs.open(COMPOSITIONS_DATA_FILE, 'w') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=composition_data_fieldnames, lineterminator='\n') writer.writeheader() for inputfile in glob.glob(FREEBASE_COMPOSITIONS_DIR + common.SLASH + '*'): print inputfile compositions_content_json = common.read_json_file(inputfile) composition_json_list = compositions_content_json['result'][0]['compositions'] composition_list = get_composition_id_list_from_json_list(composition_json_list) if len(composition_list) > 0: for index, composition_id in enumerate(composition_list): composition_data = retrieve_compositions_data(composition_id) if composition_data: try: mid = composition_data['result'][0]['mid'] name = composition_data['result'][0]['name'] entry = build_composition_data_entry(composition_id, mid, common.toByteStr(name)) writer.writerow(entry) except: print 'Composition values mid and/or name is empty.'