Example #1
0
 def update_metadata_for_all_resources(self):
     """Refresh metadata for every CSV resource in the current list.

     Iterates over the resource ids returned by
     get_csv_resource_list_current(), re-initialising the mapping and
     updating the metadata for each one.  A failure on one resource is
     logged and skipped so it does not abort the whole run.
     """
     resources_list = self.get_csv_resource_list_current()
     # NOTE(review): the first two entries are deliberately skipped —
     # presumably header/known-bad items; confirm against the list producer.
     for resource_id in resources_list[2:]:
         try:
             mapping = csv2rdf.tabular.mapping.Mapping(resource_id)
             mapping.init()
             mapping.update_metadata()
         except Exception:
             # Was `except BaseException: print str(e)` — BaseException also
             # swallows KeyboardInterrupt/SystemExit, and the bare print was
             # invisible in the log.  Log the full traceback instead.
             logging.exception("Failed to update metadata for resource %s",
                               resource_id)
Example #2
0
    def transformResourceToRdf(self, mappingName, resourceId = None, mappingResourceId = None):
        """Transform a CSV resource into RDF with Sparqlify and publish it.

        Locates the CSV file for resourceId, fetches the mapping named
        mappingName (stored under mappingResourceId), validates and
        pre-processes the CSV, runs the Sparqlify CLI to produce an .rdf
        file, updates the resource metadata and (re)loads the resulting
        graph into Virtuoso.

        :param mappingName: name of the mapping to apply.
        :param resourceId: CSV resource id; defaults to self.resourceId.
        :param mappingResourceId: resource id the mapping is attached to;
            defaults to resourceId.
        :returns: exit status of the Sparqlify subprocess.
        """
        if(not resourceId):
            resourceId = self.resourceId

        if(not mappingResourceId):
            mappingResourceId = resourceId

        logging.info("Getting the CSV filepath...")
        tabularFile = csv2rdf.tabular.tabularfile.TabularFile(resourceId)
        filePath = tabularFile.getCsvFilePathDownload()

        logging.info("Fetching the mapping...")
        mapping = csv2rdf.tabular.mapping.Mapping(mappingResourceId)
        mapping.init()
        mappingPath = mapping.get_mapping_path(mappingName)
        mappingCurrent = mapping.get_mapping_by_name(mappingName)

        # Validate the CSV to comply with XSD types.
        logging.info("Validating CSV...")
        filePath = self.validateCsv(resourceId, mappingName, mappingResourceId)
        logging.info("Validated CSV is: %s" % (filePath,))

        # Process the file based on the mapping_current options.
        processedFile = mapping.process_file(filePath, mappingCurrent)
        filePath = str(processedFile.name)
        delimiter = mappingCurrent['delimiter']

        sparqlifyCall = ["java",
                         "-cp", csv2rdf.config.config.sparqlify_jar_path,
                         "org.aksw.sparqlify.csv.CsvMapperCliMain",
                         "-f", filePath,
                         "-c", mappingPath,
                         "-s", delimiter,
                         "-h"]
        # -h - header omit
        # -d - delimiter ("")
        # -s - separator (@,;)
        # for your strange file with all the @, you could try: -s @ -d \0 -e \1 (\0 \1 - binary 0 and 1)
        # \123 or hex e.g. 0xface if you need

        logging.info(str(' '.join(sparqlifyCall)))

        rdfFile = os.path.join(csv2rdf.config.config.rdf_files_path, str(resourceId) + '_' + str(mappingName) + '.rdf')
        logging.info("rdfFile: %s" % rdfFile)

        f = open(rdfFile, 'w')
        try:
            process = subprocess.Popen(sparqlifyCall, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            # Drain stdout/stderr asynchronously so neither pipe fills up
            # and deadlocks the child process.
            stdoutQueue = Queue.Queue()
            stdoutReader = AsynchronousFileReader(process.stdout, stdoutQueue)
            stdoutReader.start()
            stderrQueue = Queue.Queue()
            stderrReader = AsynchronousFileReader(process.stderr, stderrQueue)
            stderrReader.start()

            stdoutSize = 0

            while not stdoutReader.eof() or not stderrReader.eof():
                # Write everything received on standard output to the RDF file.
                while not stdoutQueue.empty():
                    stdoutSize += 1
                    line = stdoutQueue.get()
                    f.write(line)
                    if(stdoutSize % 1000 == 0):
                        logging.info("Processed %d lines of %s" % (stdoutSize, rdfFile))

                # Log whatever arrived on standard error.
                while not stderrQueue.empty():
                    line = stderrQueue.get()
                    logging.info('Received line on standard error: ' + repr(line))

                # Sleep a bit before asking the readers again.
                time.sleep(.1)

            # Let's be tidy and join the threads we've started.
            stdoutReader.join()
            stderrReader.join()

            # Close subprocess' file descriptors.
            process.stdout.close()
            process.stderr.close()
            # BUG FIX: reap the child so returncode is populated — without
            # wait() the method always returned None instead of the real
            # Sparqlify exit status.
            process.wait()
        finally:
            # Guarantee the output file is closed even if the subprocess
            # handling above raises.
            f.close()

        # Update the resource metadata.
        logging.info("updating metadata...")
        # NOTE(review): camelCase here vs. mapping.update_metadata() used
        # elsewhere in this module — confirm both methods exist on Mapping.
        mapping.updateMetadata(resourceId, mappingName)
        logging.info("DONE")

        # Upload to the triplestore.
        logging.warn("loading resource to virtuoso!")
        virtuoso = VirtuosoLoader()
        graphUri = "http://data.publicdata.eu/%s/%s" % (str(resourceId),str(mappingName))
        virtuoso.reload(rdfFile, graphUri)

        return process.returncode