Пример #1
0
def _add_extent_if_file_exists(graph, subject, url):
    """Add a ``dc:extent`` triple (file size in bytes) for *subject*.

    The path component of *url* is used as a local filesystem path. When no
    file exists at that path a "Missing files" notice is printed and the
    graph is left unchanged.
    """
    file_path = urlparse.urlparse(url).path
    if os.path.exists(file_path):
        graph.add((subject, DC.extent, Literal(os.path.getsize(file_path))))
    else:
        print("Missing files - " + file_path)


def write_document_results_to_file(graph):
    """Add ``dc:source`` / ``dc:extent`` triples for documents in *graph*.

    Despite its name this function writes no file itself: it updates *graph*
    in place and returns a URI mapping.

    Two passes are made:

    1. Each ``(item, document)`` row of ``UNSOURCED_QUERY`` gets a new
       document subject derived from the item URI, with a ``dc:source``
       pointing at the relocated data URL.
    2. Each ``(document, source)`` row of ``DOCUMENT_QUERY`` has its
       ``dc:source`` replaced (``graph.set``) by the relocated URL.

    In both passes a ``dc:extent`` is added when the relocated URL's path
    exists on the local filesystem.

    Returns:
        dict mapping every original item URI to its corpus-prefixed URI and
        every original document URI (first pass only) to its new subject.
    """
    # These values do not change between rows, so look them up once.
    austalk = configmanager.get_config("AUSTALK")
    data_base = configmanager.get_config("DOCUMENT_BASE_URL") + austalk + "/"
    item_base = corpus_prefix_namespace(austalk)

    mappings = {}
    for item, document in graph.query(UNSOURCED_QUERY):
        obj = document.replace("http://data.austalk.edu.au/", data_base)
        mapped_item = item.replace("http://id.austalk.edu.au/item", item_base)
        subject = mapped_item + "/document/" + os.path.basename(obj)

        mappings[item] = mapped_item
        mappings[document] = subject

        node = Namespace(subject)
        graph.add((node, DC.source, Literal(obj)))
        _add_extent_if_file_exists(graph, node, obj)

    for document, source in graph.query(DOCUMENT_QUERY):
        obj = source.replace("http://data.austalk.edu.au/", data_base)
        node = Namespace(document)
        # set() rather than add(): the existing dc:source value is replaced.
        graph.set((node, DC.source, Literal(obj)))
        _add_extent_if_file_exists(graph, node, obj)

    return mappings
Пример #2
0
  def get_required_urls(self, corpus_folder_name):
    """Return the ``(login, rdf upload, corpus upload)`` URLs as a tuple.

    The login URL is the ``LOGINURL`` config value as-is; the two upload
    URLs are produced by ``self.build_url`` from the ``ADMINUPLOADURL`` and
    ``CORPUSLOADURL`` config values together with *corpus_folder_name*.
    """
    login_url = configmanager.get_config("LOGINURL")

    # 09/03/2012: Switched to the new upload form
    upload_url, corpus_upload_url = [
        self.build_url(configmanager.get_config(config_key), corpus_folder_name)
        for config_key in ("ADMINUPLOADURL", "CORPUSLOADURL")
    ]

    return (login_url, upload_url, corpus_upload_url)
Пример #3
0
def map_data_uris(graph):
    """Return *graph* serialised as N-Triples with data URIs re-hosted.

    The replacement is done on the serialised text: every occurrence of
    ``http://data.austalk.edu.au/`` becomes the configured
    ``DOCUMENT_BASE_URL`` followed by the ``AUSTALK`` config value and "/".
    """
    serialised = graph.serialize(format="nt")
    local_prefix = (configmanager.get_config("DOCUMENT_BASE_URL") +
                    configmanager.get_config("AUSTALK") + "/")
    return serialised.replace("http://data.austalk.edu.au/", local_prefix)
Пример #4
0
    def map_tuplelist(self, metadata, identify_documents):
        """Build an rdflib graph for one item from ``(key, value)`` pairs.

        Args:
            metadata: sequence of ``(key, value)`` pairs. The value of the
                first pair whose key contains ``'URI'`` is the item URI.
                Keys starting with ``table_document`` carry a document
                metadata dict (``'filename'`` is read from it), keys
                starting with ``table_person`` carry a speaker metadata dict
                (``'role'`` is read from it); every other pair is
                translated through ``self.map``.
            identify_documents: callable that receives the list of document
                metadata dicts (each extended with a ``'uri'`` entry) and
                returns ``(indexable_document, display_document)``.

        Returns:
            The populated graph, or ``None`` when no key contains ``'URI'``.
        """
        graph = bind_graph(Graph(identifier=self.corpus_uri()))

        itemuris = [v for k, v in metadata if 'URI' in k]
        if not itemuris:
            # Without an item URI there is no subject to attach triples to.
            return None
        itemuri = Namespace(itemuris[0])

        # metadata is a sequence of pairs, so the former dict-style test
        # ``'subdir' in metadata`` could never match (and indexing it by
        # name would raise TypeError). Look the value up among the pairs.
        subdirs = [v for k, v in metadata if k == 'subdir']
        subdir = subdirs[0] if subdirs else None

        documents = []
        for k, v in metadata:
            if k.startswith("table_document"):
                docuri = self.document(v, graph)
                v.update({'uri': docuri})
                documents.append(v)
                graph.add((itemuri, AUSNC.document, docuri))

                # Only record a dc:source when a base URL is configured.
                baseuri = configmanager.get_config("DOCUMENT_BASE_URL", "")
                if baseuri != "":
                    docid = urllib.quote(v['filename'])
                    corpus = self.corpusID.lower()
                    if subdir is not None:
                        # No separator is inserted here, as in the original
                        # expression; presumably subdir carries its own
                        # slashes -- verify against the ingest code.
                        location = corpus + subdir + docid
                    elif corpus == "paradisec":
                        # Files are nested under the first two "-"-separated
                        # parts of the file name.
                        parts = docid.split("-")
                        location = (corpus + "/" + parts[0] + "/" +
                                    parts[1] + "/" + docid)
                    else:
                        location = corpus + "/" + docid
                    graph.add((docuri, DC.source, URIRef(baseuri + location)))
            elif k.startswith("table_person"):
                speakeruri = self.speaker(v, graph)
                graph.add((itemuri, v['role'], speakeruri))
            else:
                for (prop, value) in self.map(k, v):
                    if prop:
                        graph.add((itemuri, prop, value))

        graph.add((itemuri, RDF.type, AUSNC.AusNCObject))
        graph.add((itemuri, DC.isPartOf, self.corpus_uri()))

        (indexable_document,
         display_document) = identify_documents(documents)

        if indexable_document:
            graph.add(
                (itemuri, ALVEO.indexable_document, indexable_document))

        if display_document:
            graph.add((itemuri, ALVEO.display_document, display_document))

        self.update_schema(graph)
        return graph
Пример #5
0
    def map_tuplelist(self, metadata, identify_documents):
        """Convert a list of ``(key, value)`` metadata pairs into a graph.

        Args:
            metadata: sequence of ``(key, value)`` pairs. The item URI is
                the value of the first pair whose key contains ``'URI'``.
                ``table_document*`` values are document metadata dicts
                (``'filename'`` is read), ``table_person*`` values are
                speaker metadata dicts (``'role'`` is read); all remaining
                pairs go through ``self.map``.
            identify_documents: callable taking the collected document
                metadata dicts (each with an added ``'uri'`` entry) and
                returning ``(indexable_document, display_document)``.

        Returns:
            The graph, or ``None`` if no key contains ``'URI'``.
        """
        graph = Graph(identifier=self.corpus_uri())
        graph = bind_graph(graph)
        documents = []

        itemuris = [v for k, v in metadata if 'URI' in k]
        if not itemuris:
            return None

        itemuri = Namespace(itemuris[0])

        # The pairs have no dict-style lookup: ``'subdir' in metadata`` was
        # always False for a list of pairs, so the subdir branch below was
        # unreachable. Fetch the value from the pairs instead.
        subdir = next((v for k, v in metadata if k == 'subdir'), None)

        for k, v in metadata:
            if k.startswith("table_document"):
                docmeta = v
                docuri = self.document(docmeta, graph)
                docmeta.update({'uri': docuri})
                documents.append(docmeta)
                graph.add((itemuri, AUSNC.document, docuri))

                baseuri = configmanager.get_config("DOCUMENT_BASE_URL", "")
                if baseuri == "":
                    # No base URL configured: no dc:source is recorded.
                    continue

                docid = urllib.quote(docmeta['filename'])
                corpus_id = self.corpusID.lower()
                if subdir is not None:
                    # Concatenated without a separator, exactly as the
                    # original expression did -- TODO confirm subdir format.
                    uri = URIRef(baseuri + corpus_id + subdir + docid)
                elif corpus_id == "paradisec":
                    # Nested by the first two "-"-separated name parts.
                    pieces = docid.split("-")
                    nested = pieces[0] + "/" + pieces[1] + "/"
                    uri = URIRef(baseuri + corpus_id + "/" + nested + docid)
                else:
                    uri = URIRef(baseuri + corpus_id + "/" + docid)
                graph.add((docuri, DC.source, uri))
            elif k.startswith("table_person"):
                speakermeta = v
                speakeruri = self.speaker(speakermeta, graph)
                graph.add((itemuri, speakermeta['role'], speakeruri))
            else:
                for (prop, value) in self.map(k, v):
                    if prop:
                        graph.add((itemuri, prop, value))

        corpusuri = self.corpus_uri()
        graph.add((itemuri, RDF.type, AUSNC.AusNCObject))
        graph.add((itemuri, DC.isPartOf, corpusuri))

        (indexable_document, display_document) = identify_documents(documents)

        if indexable_document:
            graph.add((itemuri, ALVEO.indexable_document, indexable_document))

        if display_document:
            graph.add((itemuri, ALVEO.display_document, display_document))

        self.update_schema(graph)
        return graph
Пример #6
0
 def get_download_url(self, sampleid, collection_name):
   ''' Return the download url for one sample of a collection.

   The configuration is (re)initialised, a base url is built with
   self.build_url from the BASEURL config value and the config value stored
   under collection_name, and sampleid is appended to that base.
   '''
   configmanager.configinit()
   url_builder = self.build_url
   root = configmanager.get_config("BASEURL")
   collection_part = configmanager.get_config(collection_name)
   return url_builder(root, collection_part) + sampleid
Пример #7
0
    def map_tuplelist(self, metadata, identify_documents):
        """Build an rdflib graph for one item from a metadata dict.

        Args:
            metadata: dict of item metadata. ``'identifier'`` is required
                (KeyError otherwise) and ``'URI'`` must be present and
                truthy for a graph to be produced. ``table_document*``
                values are document metadata dicts (``'filename'`` is
                read), ``table_person*`` values are speaker metadata dicts
                (``'role'`` is read); any other value, or each element of a
                list value, is translated through ``self.map``. An optional
                ``'subdir'`` entry is spliced into document source URIs.
            identify_documents: callable taking the collected document
                metadata dicts (each with an added ``'uri'`` entry) and
                returning ``(indexable_document, display_document)``.

        Returns:
            The populated graph, or ``None`` when ``metadata`` has no
            truthy ``'URI'``.
        """
        graph = bind_graph(Graph(identifier=self.corpus_uri()))
        documents = []

        item_has_uri = metadata.get('URI', None)
        identifier = metadata['identifier']
        if not item_has_uri:
            return None

        itemuri = self.item_uri(identifier)
        for k, v in metadata.items():
            if k.startswith("table_document"):
                docuri = self.document(identifier, v, graph)
                v.update({'uri': docuri})
                documents.append(v)
                graph.add((itemuri, AUSNC.document, docuri))

                # Only record source/extent when a base URL is configured.
                baseuri = configmanager.get_config("DOCUMENT_BASE_URL", "")
                if baseuri != "":
                    docid = urllib.quote(v['filename'])
                    corpus = self.corpusID.lower()
                    if 'subdir' in metadata:
                        source_uri = URIRef(baseuri + corpus +
                                            metadata['subdir'] + docid)
                    elif corpus.startswith("paradisec"):
                        # Nested by the first two "-"-separated name parts.
                        parts = docid.split("-")
                        source_uri = URIRef(baseuri + "paradisec" + "/" +
                                            parts[0] + "/" + parts[1] + "/" +
                                            docid)
                    else:
                        source_uri = URIRef(baseuri + corpus + "/" + docid)
                    graph.add((docuri, DC.source, source_uri))

                    # The URI path is percent-encoded (docid went through
                    # urllib.quote); decode it before touching the
                    # filesystem, otherwise any name containing e.g. a space
                    # is wrongly reported missing. Encoding to bytes first
                    # makes unquote return the original byte string.
                    quoted_path = urlparse.urlparse(source_uri).path
                    file_path = urllib.unquote(quoted_path.encode('utf-8'))
                    if os.path.exists(file_path):
                        graph.add((docuri, DC.extent,
                                   Literal(os.path.getsize(file_path))))
                    else:
                        print("Missing files - " + file_path)
            elif k.startswith("table_person"):
                speakeruri = self.speaker(v, graph)
                graph.add((itemuri, v['role'], speakeruri))
            else:
                # A list value contributes one mapping per element.
                values = v if isinstance(v, list) else [v]
                for value in values:
                    for (prop, subject) in self.map(k, value):
                        if prop:
                            graph.add((itemuri, prop, subject))

        graph.add((itemuri, RDF.type, AUSNC.AusNCObject))
        graph.add((itemuri, DC.isPartOf, self.corpus_uri()))

        (indexable_document, display_document) = identify_documents(documents)

        if indexable_document:
            graph.add((itemuri, ALVEO.indexable_document, indexable_document))

        if display_document:
            graph.add((itemuri, ALVEO.display_document, display_document))

        self.update_schema(graph)
        return graph
Пример #8
0
    def mapdict(self, metadata, identify_documents):
        '''
        This function takes one metadata dictionary as extracted by the ingest
        module in this package, and returns a rdflib Graph instance.

        metadata must contain 'sampleid' (used for the item URI and recorded
        as dc:identifier). Keys starting with "table_person" hold speaker
        metadata, keys starting with "table_document" hold document metadata
        dicts ('filename' is read); an optional 'subdir' entry is spliced
        into document source URIs. Other non-empty values are translated
        through self.map.

        identify_documents is called with the list of document metadata
        dicts (each extended with a 'uri' entry) and must return the pair
        (indexable_document, display_document).
        '''

        graph = Graph(identifier=self.corpus_uri())
        graph = bind_graph(graph)

        itemuri = self.item_uri(metadata['sampleid'])
        corpusuri = self.corpus_uri()
        documents = []

        for key in metadata.keys():
            value = metadata[key]
            if key == 'sampleid':
                # recorded separately as dc:identifier at the end
                pass
            elif isinstance(value, str) and value.strip() == "":
                # don't record empty fields
                pass
            elif key.startswith("table_person"):
                # make a speaker uri
                speakeruri = self.speaker(value, graph)
                graph.add((itemuri, OLAC.speaker, speakeruri))

            elif key.startswith("table_document"):
                docmeta = value
                # make a document uri
                docuri = self.document(metadata['sampleid'], docmeta, graph)
                docmeta.update({'uri': docuri})
                documents.append(docmeta)
                graph.add((itemuri, AUSNC.document, docuri))  # TODO: what is a document?

                # add a property recording a URI for the document if
                # we're given a DOCUMENT_BASE_URL in the configuration
                baseuri = configmanager.get_config("DOCUMENT_BASE_URL", "")
                if baseuri != "":
                    docid = urllib.quote(docmeta['filename'])
                    corpus = self.corpusID.lower()
                    if 'subdir' in metadata:
                        uri = URIRef(baseuri + corpus + metadata['subdir'] + docid)
                    elif corpus.startswith("paradisec"):
                        # str has no starts_with(); the previous spelling
                        # raised AttributeError whenever this branch ran.
                        parts = docid.split("-")
                        uri = URIRef(baseuri + "paradisec" + "/" + parts[0] + "/" + parts[1] + "/" + docid)
                    else:
                        uri = URIRef(baseuri + corpus + "/" + docid)
                    graph.add((docuri, DC.source, uri))

                    # The URI path is percent-encoded (docid went through
                    # urllib.quote); decode it before checking the
                    # filesystem, otherwise names containing e.g. spaces
                    # are wrongly reported missing. Encoding to bytes first
                    # makes unquote return the original byte string.
                    quoted_path = urlparse.urlparse(uri).path
                    file_path = urllib.unquote(quoted_path.encode('utf-8'))
                    if os.path.exists(file_path):
                        graph.add((docuri, DC.extent, Literal(os.path.getsize(file_path))))
                    else:
                        print("Missing files - " + file_path)

            elif value != '':
                # convert and add the property/value
                try:
                    for (prop, mapped) in self.map(key, value):
                        if prop:
                            graph.add((itemuri, prop, mapped))
                except ValueError:
                    # Best effort: a value that cannot be mapped is skipped
                    # rather than aborting the whole item.
                    pass

        # type infos:
        graph.add((itemuri, RDF.type, AUSNC.AusNCObject))

        # we want to say that this item is part of its corpus
        graph.add((itemuri, DC.isPartOf, corpusuri))

        (indexable_document, display_document) = identify_documents(documents)

        # 13/03/2012 SDP: The use of dc:source is not supported as we are using document instead
        if indexable_document:
            graph.add((itemuri, ALVEO.indexable_document, indexable_document))

        if display_document:
            graph.add((itemuri, ALVEO.display_document, display_document))

        # keep original item identifier as separate field
        # TODO: should we derive a new property from dc:identifier?
        graph.add((itemuri, DC.identifier, Literal(metadata['sampleid'])))

        self.update_schema(graph)

        return graph
Пример #9
0
    def mapdict(self, metadata, identify_documents):
        '''
        Turn one metadata dictionary (as produced by the ingest module of
        this package) into an rdflib Graph describing the item.

        'sampleid' is required: it yields the item URI and is stored as
        dc:identifier. "table_person*" entries become olac:speaker links,
        "table_document*" entries become ausnc:document links (plus a
        dc:source literal holding the quoted file name when
        DOCUMENT_BASE_URL is configured); every other non-empty value is
        translated through self.map.

        identify_documents receives the collected document metadata dicts
        and returns the pair (indexable_document, display_document).
        '''

        graph = bind_graph(Graph(identifier=self.corpus_uri()))

        sample_id = metadata['sampleid']
        itemuri = self.item_uri(sample_id)
        corpusuri = self.corpus_uri()
        documents = []

        for key in metadata.keys():
            if key == 'sampleid':
                # handled separately below
                continue

            value = metadata[key]
            if type(value) == str and value.strip() == "":
                # blank fields are not recorded
                continue

            if key.startswith("table_person"):
                # link the item to a speaker node
                graph.add((itemuri, OLAC.speaker, self.speaker(value, graph)))
                continue

            if key.startswith("table_document"):
                # link the item to a document node
                docuri = self.document(sample_id, value, graph)
                value.update({'uri': docuri})
                documents.append(value)
                graph.add((itemuri, AUSNC.document, docuri))  # TODO: what is a document?

                # the document source is only recorded when a
                # DOCUMENT_BASE_URL is present in the configuration
                if configmanager.get_config("DOCUMENT_BASE_URL", "") != "":
                    quoted_name = urllib.quote(value['filename'])
                    graph.add((docuri, DC.source, Literal(quoted_name)))
                continue

            if value != '':
                # translate the field and attach each resulting property
                try:
                    for (prop, mapped) in self.map(key, value):
                        if prop:
                            graph.add((itemuri, prop, mapped))
                except ValueError:
                    # values that cannot be mapped are skipped
                    pass

        # type information
        graph.add((itemuri, RDF.type, AUSNC.AusNCObject))

        # the item belongs to its corpus
        graph.add((itemuri, DC.isPartOf, corpusuri))

        indexable_document, display_document = identify_documents(documents)

        # 13/03/2012 SDP: The use of dc:source is not supported as we are using document instead
        for predicate, target in ((ALVEO.indexable_document, indexable_document),
                                  (ALVEO.display_document, display_document)):
            if target:
                graph.add((itemuri, predicate, target))

        # keep the original item identifier as a separate field
        # TODO: should we derive a new property from dc:identifier?
        graph.add((itemuri, DC.identifier, Literal(metadata['sampleid'])))

        self.update_schema(graph)

        return graph
Пример #10
0
def map_data_uris(graph):
    """Serialise the graph as N-Triples and point data URIs at our server.

    In the serialised text, each ``http://data.austalk.edu.au/`` prefix is
    swapped for the configured ``DOCUMENT_BASE_URL`` + ``AUSTALK`` + "/".
    The rewritten serialisation is returned.
    """
    original_prefix = "http://data.austalk.edu.au/"
    nt_text = graph.serialize(format="nt")
    base_url = configmanager.get_config("DOCUMENT_BASE_URL")
    corpus_dir = configmanager.get_config("AUSTALK")
    return nt_text.replace(original_prefix, base_url + corpus_dir + "/")
Пример #11
0
def main():
    ''' Primary application entry point for uploading of collection rdf files.

    Command line: uploader.py COLLECTION CORPUS_FOLDER FILES_DIR UPLOAD_DOCS [INI_FILE]

    Example command: python uploader.py md mdtest /Users/Shirren/Desktop/md/S00/ True

    COLLECTION    name of the collection; must be a member of supported_collections
    CORPUS_FOLDER name of the corpus folder in Plone
    FILES_DIR     location of the files to upload
    UPLOAD_DOCS   "true" (any case) to also upload the corpus documents
    INI_FILE      optional configuration file, default griffithconfig.ini

    Returns None in every case; progress and failures are reported on stdout.
    '''

    print("Corpora upload tool ")

    if len(sys.argv) < 5:
        print('Insufficient Parameters, please provide a collection name (e.g. ace, md etc), the name of the corpus folder, the location of the files to upload and a True or False value indicating whether corpus documents require uploading.')
        print('Example: python uploader.py cooee cooee ../output/cooee True')
        return

    collection_name, corpus_folder_name, folder_loc = [arg.strip() for arg in sys.argv[1:4]]
    upload_corpus_doc = sys.argv[4].strip().lower() == 'true'

    if len(sys.argv) == 6:
        # We have been provided with an ini file to use as a reference
        ini_file = sys.argv[5].strip()
    else:
        ini_file = 'griffithconfig.ini'

    print('Parameters: Collection Name-> %s  Corpus Folder Name-> %s  Location of Upload Files-> %s  Upload Corpus Documents-> %s'
          % (collection_name, corpus_folder_name, folder_loc, upload_corpus_doc))

    if collection_name not in supported_collections:
        print(collection_name + " is an unsupported collection type.")
        return

    # Initialise configuration file and grab reference data for the upload. This reference data
    # also includes the files we have already uploaded which comes from the log files in the tmp folder
    configmanager.configinit(ini_file)
    uploaded_files = helper.get_uploaded_files(collection_name)
    (loginUrl, uploadUrl, corpusuploadUrl) = helper.get_required_urls(corpus_folder_name)

    # Get the files we would like to upload for a particular collection
    fileHandler = FileHandler()
    fileList = sorted(fileHandler.getFiles(folder_loc, r'^.+-metadata.rdf$'))

    print('Attempting Authentication using  %s' % (loginUrl,))

    username = configmanager.get_config("USERNAME")
    session = Session()
    session.authenticate(loginUrl, username, configmanager.get_config("PASSWORD"))

    if not session.inSession():
        # Report the account only: the password must never be echoed to
        # the console or end up in captured logs.
        print('Authentication failure for user:  %s' % (username,))
        return

    # Authentication successful proceed with the upload
    print("Authenticated and pushing data up for  %s" % (collection_name,))

    # Switched to the new upload form
    rdfForm = FormDecorator(AdminRDFUploadForm(), collection_name)        # Collection name is used for the logger
    corpusForm = FormDecorator(CorpusItemUploadForm(), collection_name)

    # As cooee is a one-off special case at this moment a simple test is used; if more
    # complex character set determination is required then we can refactor this code
    charset_args = ('ISO-8859-1',) if collection_name == 'cooee' else ()

    for meta_file in fileList:

        meta_path = fileHandler.findFilePath(folder_loc, meta_file)
        ann_file = helper.derive_annotation_filename_from_meta_filename(meta_file)
        ann_path = fileHandler.findFilePath(folder_loc, ann_file)

        item_uri = resolver.get_item_uri(meta_path)
        # Kept from the original flow; the result is not used below.
        source_files = resolver.get_upload_units(meta_path)

        # Now upload the meta data file and annotation file
        rdf_upload(rdfForm, session, meta_file, meta_path, uploadUrl, item_uri, uploaded_files)
        rdf_upload(rdfForm, session, ann_file, ann_path, uploadUrl, item_uri, uploaded_files)

        # Only upload corpus documents if asked to
        if upload_corpus_doc:
            corpus_item_upload(corpusForm, session, meta_path, corpusuploadUrl, folder_loc, uploaded_files, *charset_args)