def serialize_object(self, obj):
    """Serialize a single record object to an XML string.

    The object is dumped through a fresh instance of the serializer's
    schema class and the dumped data is rendered with ``simpledc``.

    :param obj: Record instance.
    :returns: The value returned by ``simpledc.tostring`` for the dump.
    """
    schema = self._schema_cls()
    dumped = schema.dump(obj)
    return simpledc.tostring(dumped)
def serialize(self, pid, record, links_factory=None):
    """Serialize one record together with its persistent identifier.

    The record is first passed through ``self.transform_record`` and the
    transformed data is then rendered with ``simpledc``.

    :param pid: Persistent identifier instance.
    :param record: Record instance.
    :param links_factory: Factory function for record links.
    :returns: The value returned by ``simpledc.tostring``.
    """
    transformed = self.transform_record(pid, record, links_factory)
    return simpledc.tostring(transformed)
def serialize_object_list(self, obj_list):
    """Serialize a collection of records, one XML document per record.

    :param obj_list: Mapping holding the records under
        ``obj_list["hits"]["hits"]``; a missing key at either level is
        treated as an empty collection.
    :returns: The serialized records joined with newlines (an empty
        string when there are no records).
    """
    hits = obj_list.get("hits", {}).get("hits", [])
    dumped_records = self._schema_cls().dump(hits, many=True)
    # TODO: multiple records should be wrapped in a single root tag.
    rendered = [simpledc.tostring(item) for item in dumped_records]
    return "\n".join(rendered)
def dictionary2xml(dictionary):
    """Create an XML string and a file name from a flat dictionary.

    Every value is wrapped in a one-element list before being handed to
    ``simpledc.tostring``. The wrapping is done on a new dictionary, so
    the caller's mapping is left unchanged and the function can safely be
    called more than once with the same object.

    :param dictionary: Mapping of element names to single values. It must
        contain an ``'identifiers'`` entry that can be concatenated with
        ``'.xml'`` (i.e. a string).
    :returns: Tuple ``(xml, file_name)`` where ``file_name`` is the
        identifier followed by ``'.xml'``.
    :raises KeyError: If ``'identifiers'`` is missing.
    """
    file_name = dictionary['identifiers'] + '.xml'
    # Build a fresh dict instead of rewriting the argument in place: the
    # previous in-place wrapping leaked list-wrapped values back to the
    # caller and double-wrapped them on a repeated call.
    wrapped = {key: [value] for key, value in dictionary.items()}
    xml = simpledc.tostring(wrapped)
    return xml, file_name
def test_elements():
    """Check that simpledc renders every supported element."""
    # Pairs of (plural key in the input dict, singular dc element name).
    elements = [
        ('contributors', 'contributor'),
        ('coverage', 'coverage'),
        ('creators', 'creator'),
        ('dates', 'date'),
        ('descriptions', 'description'),
        ('formats', 'format'),
        ('identifiers', 'identifier'),
        ('languages', 'language'),
        ('publishers', 'publisher'),
        ('relations', 'relation'),
        ('rights', 'rights'),
        ('sources', 'source'),
        ('subjects', 'subject'),
        ('titles', 'title'),
        ('types', 'type'),
    ]
    rooted_path = '/oai_dc:dc/dc:{0}'
    anywhere_path = '//dc:{0}'

    # Each element on its own.
    for plural, singular in elements:
        # Two values must produce two elements, in input order.
        tree = simpledc.dump_etree({plural: ['value 1', 'value 2']})
        found = tree.xpath(
            rooted_path.format(singular), namespaces=simpledc.ns)
        assert len(found) == 2, singular
        assert found[0].text == 'value 1'
        assert found[1].text == 'value 2'

        # An empty list must produce no element at all.
        tree = simpledc.dump_etree({plural: []})
        found = tree.xpath(
            anywhere_path.format(singular), namespaces=simpledc.ns)
        assert len(found) == 0, singular

    # All elements in one document.
    data = {plural: ['test 1', 'test 2'] for plural, _ in elements}
    tree = simpledc.dump_etree(data)
    for _, singular in elements:
        found = tree.xpath(
            rooted_path.format(singular), namespaces=simpledc.ns)
        assert len(found) == 2, singular
        assert found[0].text == 'test 1'
        assert found[1].text == 'test 2'

    # The string output must contain an opening tag for every element.
    xml = simpledc.tostring(data)
    for _, singular in elements:
        assert '<dc:{0}>'.format(singular) in xml
def test_elements():
    """Exercise simpledc with every element, singly and combined."""
    elements = [
        ('contributors', 'contributor'),
        ('coverage', 'coverage'),
        ('creators', 'creator'),
        ('dates', 'date'),
        ('descriptions', 'description'),
        ('formats', 'format'),
        ('identifiers', 'identifier'),
        ('languages', 'language'),
        ('publishers', 'publisher'),
        ('relations', 'relation'),
        ('rights', 'rights'),
        ('sources', 'source'),
        ('subjects', 'subject'),
        ('titles', 'title'),
        ('types', 'type'),
    ]

    def direct_children(tree, tag):
        # dc:<tag> elements directly below the oai_dc:dc root.
        return tree.xpath('/oai_dc:dc/dc:{0}'.format(tag),
                          namespaces=simpledc.ns)

    # Individual elements.
    for plural, singular in elements:
        # Several values give several elements, order preserved.
        nodes = direct_children(
            simpledc.dump_etree({plural: ['value 1', 'value 2']}), singular)
        assert len(nodes) == 2, singular
        assert nodes[0].text == 'value 1'
        assert nodes[1].text == 'value 2'

        # No values give no element anywhere in the tree.
        empty_tree = simpledc.dump_etree({plural: []})
        leftovers = empty_tree.xpath('//dc:{0}'.format(singular),
                                     namespaces=simpledc.ns)
        assert len(leftovers) == 0, singular

    # Every element at once.
    data = dict((plural, ['test 1', 'test 2']) for plural, _ in elements)
    combined = simpledc.dump_etree(data)
    for _, singular in elements:
        nodes = direct_children(combined, singular)
        assert len(nodes) == 2, singular
        assert nodes[0].text == 'test 1'
        assert nodes[1].text == 'test 2'

    # String serialization mentions each element's opening tag.
    xml = simpledc.tostring(data)
    for _, singular in elements:
        assert '<dc:{0}>'.format(singular) in xml
def serialize_search(self, pid_fetcher, search_result, links=None,
                     item_links_factory=None):
    """Serialize every hit of a search result.

    Each hit is transformed with ``self.transform_search_hit`` and rendered
    with ``simpledc``; the rendered hits are joined with newlines.

    :param pid_fetcher: Persistent identifier fetcher, called with the
        hit's ``_id`` and ``_source``.
    :param search_result: Elasticsearch search result; hits are read from
        ``search_result['hits']['hits']``.
    :param links: Dictionary of links to add to response (accepted but
        not used by this method).
    :param item_links_factory: Passed to ``transform_search_hit`` as
        ``links_factory`` for each hit.
    :returns: Newline-joined serialized hits (empty string for no hits).
    """
    hits = search_result['hits']['hits']
    return "\n".join(
        simpledc.tostring(
            self.transform_search_hit(
                pid_fetcher(hit['_id'], hit['_source']),
                hit,
                links_factory=item_links_factory,
            )
        )
        for hit in hits
    )
def generate_dublin_core(pubrecord):
    """
    Turn a publication record into a simple Dublin Core XML record.

    Authors and editors are combined (authors first). The first person
    becomes the single ``creators`` entry and any remaining people become
    ``contributors``. The publication type text is mapped to a ``types``
    value, falling back to ``'text'`` for unrecognised types.

    :param pubrecord: mapping describing the publication. Read with
        ``.get``: ``authorsList``, ``editorsList``, ``publicationYear``,
        ``docAbstract``, ``doi``, ``publisher`` and ``title``.
        ``pubrecord['publicationType']['text']`` is required.
    :return: the output of ``simpledc.tostring`` with its first line
        dropped (presumably the XML declaration -- confirm against
        simpledc).
    :raises KeyError: if ``publicationType`` or its ``text`` is missing.
    """
    authors = pubrecord.get('authorsList')
    editors = pubrecord.get('editorsList')
    if authors and editors:
        people = authors + editors
    else:
        # At most one of the two is non-empty here; use whichever it is.
        people = authors or editors

    record = {
        "dates": [pubrecord.get('publicationYear')],
        "descriptions": [pubrecord.get('docAbstract')],
        "formats": ['application/pdf'],
        "identifiers": [pubrecord.get('doi')],
        "languages": ['en'],
        "publishers": [pubrecord.get('publisher')],
        "titles": [pubrecord.get('title')],
    }

    if people:
        record["creators"] = [people[0]]
        if len(people) >= 2:
            record["contributors"] = people[1:]

    # Publication type text -> Dublin Core type value.
    dc_type_by_publication_type = {
        'Book chapter': 'chapter',
        'Book': 'book',
        'Article': 'article',
        'Report': 'reports',
    }
    publication_type = pubrecord['publicationType']['text']
    record['types'] = [
        dc_type_by_publication_type.get(publication_type, 'text')
    ]

    xml_lines = simpledc.tostring(record).splitlines()
    return '\n'.join(xml_lines[1:])
# Make sure the output folder exists before any file is written.
os.makedirs("xml", exist_ok=True)

# df is a name (variable) that we use to store a DataFrame:
# we read the rows of the csv file and make a DataFrame named df
df = pd.read_csv('dc_sample.csv')

# What does the data look like? Do we need to do anything to it?
# How can we stop the code after looking at the data?

# Empty values are read in as NaN, which can be difficult to work with,
# so we turn every empty value into an empty string.
df = df.fillna('')

# Let's look at ist0977.xml: what do we notice about the subject item?
# How might we address this?
# Let's look at all the rows that have a comma in subject.
# What do we learn?
# How might we address the larger issue?
# Could we fix the data in the file using python?

# One dictionary per row of the DataFrame, keyed by column name.
list_of_dicts = df.to_dict(orient='records')

for row in list_of_dicts:
    # Wrap every cell in a one-element list (in place) before handing the
    # row to simpledc.
    row.update({field: [cell] for field, cell in row.items()})
    xml_text = simpledc.tostring(row)
    # The (now list-wrapped) identifier names the output file.
    identifier = row['identifiers'][0]
    with open('xml/' + identifier + '.xml', mode='w', encoding='utf8') as out_file:
        out_file.write(xml_text)