def write_document_results_to_file(graph):
    """Re-point Austalk document sources in ``graph`` and record file sizes.

    Two passes are made over the graph:

    1. For every (item, document) row returned by UNSOURCED_QUERY the
       document URL is moved from http://data.austalk.edu.au/ to the
       configured DOCUMENT_BASE_URL + AUSTALK location, a document subject
       is derived beneath the item's rewritten URI, and a dc:source triple
       is added for it.
    2. For every (document, source) row returned by DOCUMENT_QUERY the
       dc:source of the document is replaced with the re-pointed URL.

    In both passes a dc:extent triple holding the file size is added when
    the path component of the URL exists on the local filesystem;
    otherwise a "Missing files" line is printed.

    Note that, despite its name, this function does not write a file.

    Returns a dict mapping each original item and document URI seen in the
    first pass to its rewritten form.
    """
    def localise(remote_url):
        # Swap the public Austalk data host for the configured local base.
        return remote_url.replace(
            "http://data.austalk.edu.au/",
            configmanager.get_config("DOCUMENT_BASE_URL")
            + configmanager.get_config("AUSTALK") + "/")

    def record_extent(node, url):
        # Only the path part of the URL is checked against the local disk.
        local_path = urlparse.urlparse(url).path
        if os.path.exists(local_path):
            graph.add((node, DC.extent,
                       Literal(os.path.getsize(local_path))))
        else:
            print("Missing files - " + local_path)

    mappings = {}

    # Pass 1: documents that are listed by the "unsourced" query.
    for item, document in graph.query(UNSOURCED_QUERY):
        local_url = localise(document)
        mapped_item = item.replace(
            "http://id.austalk.edu.au/item",
            corpus_prefix_namespace(configmanager.get_config("AUSTALK")))
        doc_subject = (mapped_item + "/document/"
                       + os.path.basename(local_url))

        mappings[item] = mapped_item
        mappings[document] = doc_subject

        node = Namespace(doc_subject)
        graph.add((node, DC.source, Literal(local_url)))
        record_extent(node, local_url)

    # Pass 2: documents that already carry a source; overwrite it in place.
    for document, source in graph.query(DOCUMENT_QUERY):
        local_url = localise(source)
        node = Namespace(document)
        graph.set((node, DC.source, Literal(local_url)))
        record_extent(node, local_url)

    return mappings
def get_required_urls(self, corpus_folder_name):
    """Return the URLs needed for an upload run as a 3-tuple.

    The tuple is (login URL, RDF upload URL, corpus item upload URL). The
    login URL is taken from the LOGINURL setting as-is; the other two are
    produced by ``self.build_url`` from the ADMINUPLOADURL and
    CORPUSLOADURL settings together with ``corpus_folder_name``.
    """
    return (
        # If there are files to upload we authenticate with the server.
        configmanager.get_config("LOGINURL"),
        # 09/03/2012: Switched to the new upload form.
        self.build_url(configmanager.get_config("ADMINUPLOADURL"),
                       corpus_folder_name),
        self.build_url(configmanager.get_config('CORPUSLOADURL'),
                       corpus_folder_name),
    )
def map_data_uris(graph):
    """Serialise ``graph`` as N-Triples with data URIs re-pointed locally.

    Every occurrence of the http://data.austalk.edu.au/ prefix in the
    serialisation is replaced by the configured DOCUMENT_BASE_URL followed
    by the AUSTALK setting and a trailing slash. The graph itself is not
    modified; the rewritten serialisation is returned.
    """
    serialised = graph.serialize(format="nt")
    local_prefix = (configmanager.get_config("DOCUMENT_BASE_URL")
                    + configmanager.get_config("AUSTALK")
                    + "/")
    return serialised.replace("http://data.austalk.edu.au/", local_prefix)
def map_tuplelist(self, metadata, identify_documents):
    """Build the RDF graph for one item described by (key, value) pairs.

    ``metadata`` is iterated as a sequence of (key, value) pairs:

    * the value of the first pair whose key contains 'URI' becomes the
      item URI;
    * ``table_document*`` values are document metadata dicts; each gets a
      document node via ``self.document``, has its 'uri' entry set (the
      dict is updated in place) and, when DOCUMENT_BASE_URL is configured,
      a dc:source URL;
    * ``table_person*`` values are speaker metadata dicts linked to the
      item through their 'role' entry;
    * every other pair is converted with ``self.map``.

    ``identify_documents`` is called with the list of document metadata
    dicts and must return an (indexable_document, display_document) pair.

    Returns the populated graph, or None when no key contains 'URI'.
    """
    graph = Graph(identifier=self.corpus_uri())
    graph = bind_graph(graph)

    itemuris = [v for k, v in metadata if 'URI' in k]
    if not itemuris:
        return None
    itemuri = Namespace(itemuris[0])

    # ``metadata`` is a sequence of pairs, so a plain ``'subdir' in
    # metadata`` test can never match and ``metadata['subdir']`` cannot
    # index it; look the entry up among the pairs instead.
    subdirs = [v for k, v in metadata if k == 'subdir']
    subdir = subdirs[0] if subdirs else None

    documents = []
    for k, v in metadata:
        if k.startswith("table_document"):
            docmeta = v
            docuri = self.document(docmeta, graph)
            docmeta.update({'uri': docuri})
            documents.append(docmeta)
            graph.add((itemuri, AUSNC.document, docuri))

            # Record where the document lives only when a base URL is
            # configured.
            baseuri = configmanager.get_config("DOCUMENT_BASE_URL", "")
            if baseuri != "":
                docid = urllib.quote(docmeta['filename'])
                corpus = self.corpusID.lower()
                if subdir is not None:
                    uri = URIRef(baseuri + corpus + subdir + docid)
                elif corpus == "paradisec":
                    # Paradisec files are nested two levels deep, named
                    # after the first two dash-separated parts of the id.
                    parts = docid.split("-")
                    uri = URIRef(baseuri + corpus + "/" + parts[0] + "/"
                                 + parts[1] + "/" + docid)
                else:
                    uri = URIRef(baseuri + corpus + "/" + docid)
                graph.add((docuri, DC.source, uri))
        elif k.startswith("table_person"):
            speakermeta = v
            speakeruri = self.speaker(speakermeta, graph)
            graph.add((itemuri, speakermeta['role'], speakeruri))
        else:
            for (prop, value) in self.map(k, v):
                if prop:
                    graph.add((itemuri, prop, value))

    graph.add((itemuri, RDF.type, AUSNC.AusNCObject))
    graph.add((itemuri, DC.isPartOf, self.corpus_uri()))

    (indexable_document, display_document) = identify_documents(documents)
    if indexable_document:
        graph.add((itemuri, ALVEO.indexable_document, indexable_document))
    if display_document:
        graph.add((itemuri, ALVEO.display_document, display_document))

    self.update_schema(graph)
    return graph
def map_tuplelist(self, metadata, identify_documents):
    """Convert a list of (key, value) metadata pairs into an RDF graph.

    The item URI is the value of the first pair whose key contains 'URI';
    when there is none, None is returned. Otherwise each pair is handled
    according to its key:

    * ``table_document*``: the value is a document metadata dict. A
      document node is created with ``self.document``, stored back into
      the dict under 'uri' (the dict is modified in place) and linked to
      the item. If DOCUMENT_BASE_URL is configured, a dc:source URL is
      added for the document.
    * ``table_person*``: the value is a speaker metadata dict; the speaker
      node is linked to the item using the dict's 'role' entry.
    * anything else: every (property, value) produced by ``self.map`` with
      a truthy property is added to the item.

    ``identify_documents`` receives the collected document dicts and must
    return an (indexable_document, display_document) pair.
    """
    graph = Graph(identifier=self.corpus_uri())
    graph = bind_graph(graph)

    itemuris = [v for k, v in metadata if 'URI' in k]
    if not itemuris:
        return None
    itemuri = Namespace(itemuris[0])

    # The original ``'subdir' in metadata`` check compared the string
    # against whole (key, value) pairs and so was never true. Resolve the
    # optional sub-directory from the pairs up front instead.
    subdir = None
    for k, v in metadata:
        if k == 'subdir':
            subdir = v
            break

    documents = []
    for k, v in metadata:
        if k.startswith("table_document"):
            docmeta = v
            docuri = self.document(docmeta, graph)
            docmeta.update({'uri': docuri})
            documents.append(docmeta)
            graph.add((itemuri, AUSNC.document, docuri))

            baseuri = configmanager.get_config("DOCUMENT_BASE_URL", "")
            if baseuri != "":
                docid = urllib.quote(docmeta['filename'])
                corpus = self.corpusID.lower()
                if subdir is not None:
                    uri = URIRef(baseuri + corpus + subdir + docid)
                elif corpus == "paradisec":
                    # Two directory levels come from the first two
                    # dash-separated components of the document id.
                    pieces = docid.split("-")
                    uri = URIRef(baseuri + corpus + "/" + pieces[0] + "/"
                                 + pieces[1] + "/" + docid)
                else:
                    uri = URIRef(baseuri + corpus + "/" + docid)
                graph.add((docuri, DC.source, uri))
        elif k.startswith("table_person"):
            speakermeta = v
            speakeruri = self.speaker(speakermeta, graph)
            graph.add((itemuri, speakermeta['role'], speakeruri))
        else:
            for (prop, value) in self.map(k, v):
                if prop:
                    graph.add((itemuri, prop, value))

    corpusuri = self.corpus_uri()
    graph.add((itemuri, RDF.type, AUSNC.AusNCObject))
    graph.add((itemuri, DC.isPartOf, corpusuri))

    (indexable_document, display_document) = identify_documents(documents)
    if indexable_document:
        graph.add((itemuri, ALVEO.indexable_document, indexable_document))
    if display_document:
        graph.add((itemuri, ALVEO.display_document, display_document))

    self.update_schema(graph)
    return graph
def get_download_url(self, sampleid, collection_name):
    ''' Build the download url for a particular sample and collection.

    The configuration is (re)initialised with its defaults, then the BASEURL
    setting and the setting named by collection_name are combined with
    self.build_url; sampleid is appended to the result. '''
    configmanager.configinit()
    base_setting = configmanager.get_config("BASEURL")
    collection_setting = configmanager.get_config(collection_name)
    return self.build_url(base_setting, collection_setting) + sampleid
def map_tuplelist(self, metadata, identify_documents):
    """Convert one item's metadata dictionary into an RDF graph.

    ``metadata`` must contain 'identifier'; when it has no truthy 'URI'
    entry, None is returned. Otherwise each entry is handled by key:

    * ``table_document*``: a document metadata dict. It gets a document
      node (stored back into the dict under 'uri') and, if
      DOCUMENT_BASE_URL is configured, a dc:source URL plus a dc:extent
      with the file size when the URL's path exists on the local disk.
    * ``table_person*``: a speaker metadata dict, linked to the item via
      its 'role' entry.
    * anything else: mapped with ``self.map``; a list value is mapped
      element by element.

    ``identify_documents`` receives the collected document dicts and must
    return an (indexable_document, display_document) pair.
    """
    graph = bind_graph(Graph(identifier=self.corpus_uri()))
    documents = []
    declared_uri = metadata.get('URI', None)
    identifier = metadata['identifier']
    if not declared_uri:
        return None

    itemuri = self.item_uri(identifier)
    for key, value in metadata.items():
        if key.startswith("table_document"):
            docuri = self.document(identifier, value, graph)
            value.update({'uri': docuri})
            documents.append(value)
            graph.add((itemuri, AUSNC.document, docuri))

            baseuri = configmanager.get_config("DOCUMENT_BASE_URL", "")
            if baseuri != "":
                docid = urllib.quote(value['filename'])
                corpus = self.corpusID.lower()
                if 'subdir' in metadata:
                    source = URIRef(baseuri + corpus
                                    + metadata['subdir'] + docid)
                elif corpus.startswith("paradisec"):
                    # All paradisec corpora share one tree, nested by the
                    # first two dash-separated parts of the document id.
                    parts = docid.split("-")
                    source = URIRef(baseuri + "paradisec" + "/" + parts[0]
                                    + "/" + parts[1] + "/" + docid)
                else:
                    source = URIRef(baseuri + corpus + "/" + docid)
                graph.add((docuri, DC.source, source))

                # Only the path part of the URL is checked locally.
                file_path = urlparse.urlparse(source).path
                if os.path.exists(file_path):
                    graph.add((docuri, DC.extent,
                               Literal(os.path.getsize(file_path))))
                else:
                    print("Missing files - " + file_path)
        elif key.startswith("table_person"):
            speakeruri = self.speaker(value, graph)
            graph.add((itemuri, value['role'], speakeruri))
        else:
            # Treat a scalar as a one-element list so both shapes share
            # the same mapping loop.
            entries = value if isinstance(value, list) else [value]
            for entry in entries:
                for (prop, subject) in self.map(key, entry):
                    if prop:
                        graph.add((itemuri, prop, subject))

    corpusuri = self.corpus_uri()
    graph.add((itemuri, RDF.type, AUSNC.AusNCObject))
    graph.add((itemuri, DC.isPartOf, corpusuri))

    (indexable_document, display_document) = identify_documents(documents)
    if indexable_document:
        graph.add((itemuri, ALVEO.indexable_document, indexable_document))
    if display_document:
        graph.add((itemuri, ALVEO.display_document, display_document))

    self.update_schema(graph)
    return graph
def mapdict(self, metadata, identify_documents):
    ''' This function takes one metadata dictionary as extracted by the
    ingest module in this package, and returns a rdflib Graph instance.

    metadata must contain 'sampleid'. Entries are handled by key:

    * 'sampleid' and blank string values are not mapped as properties;
    * table_person* values are speaker dicts, linked with olac:speaker;
    * table_document* values are document dicts. Each gets a document
      node (stored back into the dict under 'uri') and, when
      DOCUMENT_BASE_URL is configured, a dc:source URL plus a dc:extent
      with the file size when the URL's path exists on the local disk;
    * any other non-empty value is converted with self.map.

    identify_documents receives the collected document dicts and must
    return an (indexable_document, display_document) pair. '''
    graph = Graph(identifier=self.corpus_uri())
    graph = bind_graph(graph)
    itemuri = self.item_uri(metadata['sampleid'])
    corpusuri = self.corpus_uri()
    documents = []

    for key, value in metadata.items():
        if key == 'sampleid':
            # recorded separately as dc:identifier below
            continue
        if isinstance(value, str) and value.strip() == "":
            # don't record empty fields
            continue

        if key.startswith("table_person"):
            # make a speaker uri
            speakeruri = self.speaker(value, graph)
            graph.add((itemuri, OLAC.speaker, speakeruri))
        elif key.startswith("table_document"):
            docmeta = value
            # make a document uri
            docuri = self.document(metadata['sampleid'], docmeta, graph)
            docmeta.update({'uri': docuri})
            documents.append(docmeta)
            graph.add((itemuri, AUSNC.document, docuri))
            # TODO: what is a document?

            # add a property recording a URI for the document if
            # we're given a DOCUMENT_BASE_URL in the configuration
            baseuri = configmanager.get_config("DOCUMENT_BASE_URL", "")
            if baseuri != "":
                docid = urllib.quote(docmeta['filename'])
                corpus = self.corpusID.lower()
                if 'subdir' in metadata:
                    uri = URIRef(baseuri + corpus
                                 + metadata['subdir'] + docid)
                # The str method is startswith(); the former starts_with()
                # call raised AttributeError whenever no 'subdir' was given.
                elif corpus.startswith("paradisec"):
                    parts = docid.split("-")
                    uri = URIRef(baseuri + "paradisec" + "/" + parts[0]
                                 + "/" + parts[1] + "/" + docid)
                else:
                    uri = URIRef(baseuri + corpus + "/" + docid)
                graph.add((docuri, DC.source, uri))

                # Only the path part of the URL is checked locally.
                file_path = urlparse.urlparse(uri).path
                if os.path.exists(file_path):
                    graph.add((docuri, DC.extent,
                               Literal(os.path.getsize(file_path))))
                else:
                    print("Missing files - " + file_path)
        elif value != '':
            # convert and add the property/value
            try:
                for (prop, mapped) in self.map(key, value):
                    if prop:
                        graph.add((itemuri, prop, mapped))
            except ValueError:
                # Deliberately best-effort: a value that cannot be
                # converted is skipped rather than failing the item.
                pass

    # type infos:
    graph.add((itemuri, RDF.type, AUSNC.AusNCObject))
    # we want to say that this item is part of its corpus
    graph.add((itemuri, DC.isPartOf, corpusuri))

    (indexable_document, display_document) = identify_documents(documents)
    # 13/03/2012 SDP: The use of dc:source is not supported as we are
    # using document instead
    if indexable_document:
        graph.add((itemuri, ALVEO.indexable_document, indexable_document))
    if display_document:
        graph.add((itemuri, ALVEO.display_document, display_document))

    # keep original item identifier as separate field
    # TODO: should we derive a new property from dc:identifier?
    graph.add((itemuri, DC.identifier, Literal(metadata['sampleid'])))

    self.update_schema(graph)
    return graph
def mapdict(self, metadata, identify_documents):
    ''' Turn one metadata dictionary, as extracted by the ingest module in
    this package, into a rdflib Graph instance.

    metadata must contain 'sampleid'. The 'sampleid' entry and blank
    string values are not mapped as properties; table_person* values are
    linked as olac:speaker; table_document* values get a document node
    (stored back into the dict under 'uri') and, when DOCUMENT_BASE_URL is
    configured, a dc:source literal holding the quoted filename; any other
    non-empty value is converted with self.map.

    identify_documents receives the collected document dicts and must
    return an (indexable_document, display_document) pair. '''
    graph = bind_graph(Graph(identifier=self.corpus_uri()))
    sample_id = metadata['sampleid']
    itemuri = self.item_uri(sample_id)
    corpusuri = self.corpus_uri()
    documents = []

    for key, value in metadata.items():
        if key == 'sampleid':
            continue
        if type(value) == str and value.strip() == "":
            # don't record empty fields
            continue

        if key.startswith("table_person"):
            # make a speaker uri
            graph.add((itemuri, OLAC.speaker, self.speaker(value, graph)))
            continue

        if key.startswith("table_document"):
            # make a document uri
            docuri = self.document(metadata['sampleid'], value, graph)
            value.update({'uri': docuri})
            documents.append(value)
            graph.add((itemuri, AUSNC.document, docuri))
            # TODO: what is a document?

            # add a property recording the document's filename if
            # we're given a DOCUMENT_BASE_URL in the configuration
            if configmanager.get_config("DOCUMENT_BASE_URL", "") != "":
                quoted_name = urllib.quote(value['filename'])
                graph.add((docuri, DC.source, Literal(quoted_name)))
            continue

        if value != '':
            # convert and add the property/value; values that cannot be
            # converted are skipped
            try:
                for (prop, mapped) in self.map(key, value):
                    if prop:
                        graph.add((itemuri, prop, mapped))
            except ValueError:
                pass

    # type infos:
    graph.add((itemuri, RDF.type, AUSNC.AusNCObject))
    # we want to say that this item is part of its corpus
    graph.add((itemuri, DC.isPartOf, corpusuri))

    # 13/03/2012 SDP: The use of dc:source is not supported as we are
    # using document instead
    (indexable_document, display_document) = identify_documents(documents)
    if indexable_document:
        graph.add((itemuri, ALVEO.indexable_document, indexable_document))
    if display_document:
        graph.add((itemuri, ALVEO.display_document, display_document))

    # keep original item identifier as separate field
    # TODO: should we derive a new property from dc:identifier?
    graph.add((itemuri, DC.identifier, Literal(metadata['sampleid'])))

    self.update_schema(graph)
    return graph
def map_data_uris(graph):
    """Return ``graph`` as N-Triples text pointing at our own data server.

    The graph is serialised first; then each http://data.austalk.edu.au/
    prefix in the text is swapped for DOCUMENT_BASE_URL + AUSTALK + "/"
    taken from the configuration. The graph object is left untouched.
    """
    ntriples = graph.serialize(format="nt")
    base_url = configmanager.get_config("DOCUMENT_BASE_URL")
    corpus_dir = configmanager.get_config("AUSTALK")
    return ntriples.replace("http://data.austalk.edu.au/",
                            base_url + corpus_dir + "/")
def main():
    ''' Primary application entry point for uploading of collection rdf files.

    Command line arguments (read from sys.argv):
      1. collection name (e.g. ace, md); must be in supported_collections
      2. name of the corpus folder on the server
      3. location of the files to upload
      4. True/False: whether corpus documents should be uploaded as well
      5. optional ini file; defaults to griffithconfig.ini

    Example: python uploader.py md mdtest /Users/Shirren/Desktop/md/S00/ True

    Prints usage and returns when fewer than four arguments are given. '''
    print("Corpora upload tool ")

    if len(sys.argv) < 5:
        print('Insufficient Parameters, please provide a collection name (e.g. ace, md etc), the name of the corpus folder, the location of the files to upload and a True or False value indicating whether corpus documents require uploading.')
        print('Example: python uploader.py cooee cooee ../output/cooee True')
        return

    collection_name = sys.argv[1].strip()
    corpus_folder_name = sys.argv[2].strip()
    folder_loc = sys.argv[3].strip()
    upload_corpus_doc = sys.argv[4].strip().lower() == 'true'

    if len(sys.argv) == 6:
        # We have been provided with an ini file to use as a reference
        ini_file = sys.argv[5].strip()
    else:
        ini_file = 'griffithconfig.ini'

    # Each message is built as one string so print behaves the same
    # whether it is a statement or a function.
    print('Parameters: Collection Name-> %s  Corpus Folder Name-> %s'
          '  Location of Upload Files-> %s  Upload Corpus Documents-> %s'
          % (collection_name, corpus_folder_name, folder_loc,
             upload_corpus_doc))

    if collection_name not in supported_collections:
        print(collection_name + " is an unsupported collection type.")
        return

    # Initialise configuration file and grab reference data for the upload.
    # This reference data also includes the files we have already uploaded
    # which comes from the log files in the tmp folder
    configmanager.configinit(ini_file)
    uploaded_files = helper.get_uploaded_files(collection_name)
    (loginUrl, uploadUrl, corpusuploadUrl) = helper.get_required_urls(corpus_folder_name)

    # Get the files we would like to upload for a particular collection
    fileHandler = FileHandler()
    fileList = sorted(fileHandler.getFiles(folder_loc, r'^.+-metadata.rdf$'))

    print('Attempting Authentication using  %s' % (loginUrl,))
    session = Session()
    session.authenticate(loginUrl,
                         configmanager.get_config("USERNAME"),
                         configmanager.get_config("PASSWORD"))

    if not session.inSession():
        # Report who failed to log in, but never echo the password.
        print('Authentication failure for user:  %s'
              % (configmanager.get_config("USERNAME"),))
        return

    # Authentication successful proceed with the upload
    print("Authenticated and pushing data up for  %s" % (collection_name,))

    # Switched to the new upload form; collection name is used for the logger
    rdfForm = FormDecorator(AdminRDFUploadForm(), collection_name)
    corpusForm = FormDecorator(CorpusItemUploadForm(), collection_name)

    for meta_file in fileList:
        meta_path = fileHandler.findFilePath(folder_loc, meta_file)
        ann_file = helper.derive_annotation_filename_from_meta_filename(meta_file)
        ann_path = fileHandler.findFilePath(folder_loc, ann_file)
        item_uri = resolver.get_item_uri(meta_path)
        source_files = resolver.get_upload_units(meta_path)

        # Now upload the meta data file and annotation file
        rdf_upload(rdfForm, session, meta_file, meta_path, uploadUrl, item_uri, uploaded_files)
        rdf_upload(rdfForm, session, ann_file, ann_path, uploadUrl, item_uri, uploaded_files)

        # Only upload corpus documents if asked to
        if upload_corpus_doc:
            # As cooee is a one-off special case at this moment a simple if
            # statement is used; if more complex character set determination
            # is required then this code can be refactored
            if collection_name == 'cooee':
                corpus_item_upload(corpusForm, session, meta_path, corpusuploadUrl,
                                   folder_loc, uploaded_files, 'ISO-8859-1')
            else:
                corpus_item_upload(corpusForm, session, meta_path, corpusuploadUrl,
                                   folder_loc, uploaded_files)