def update_metadata_for_all_resources(self):
    """Re-generate metadata for every CSV resource in the current list."""
    resources_list = self.get_csv_resource_list_current()
    # Note: the first two entries of the list are skipped.
    for resource_id in resources_list[2:]:
        try:
            mapping = csv2rdf.tabular.mapping.Mapping(resource_id)
            mapping.init()
            mapping.update_metadata()
        except BaseException as e:
            # Swallow all errors so one broken resource does not abort the batch.
            print str(e)
def transformResourceToRdf(self, mappingName, resourceId=None, mappingResourceId=None):
    """Convert a CSV resource to RDF with Sparqlify and load it into Virtuoso."""
    if not resourceId:
        resourceId = self.resourceId
    if not mappingResourceId:
        mappingResourceId = resourceId

    logging.info("Getting the CSV filepath...")
    tabularFile = csv2rdf.tabular.tabularfile.TabularFile(resourceId)
    filePath = tabularFile.getCsvFilePathDownload()

    logging.info("Fetching the mapping...")
    mapping = csv2rdf.tabular.mapping.Mapping(mappingResourceId)
    mapping.init()
    mappingPath = mapping.get_mapping_path(mappingName)
    mappingCurrent = mapping.get_mapping_by_name(mappingName)

    # Validate the CSV so its values comply with the XSD types of the mapping.
    logging.info("Validating CSV...")
    filePath = self.validateCsv(resourceId, mappingName, mappingResourceId)
    logging.info("Validated CSV is: %s" % (filePath,))

    # Pre-process the file based on the mapping_current options.
    processedFile = mapping.process_file(filePath, mappingCurrent)
    filePath = str(processedFile.name)

    delimiter = mappingCurrent['delimiter']
    sparqlifyCall = ["java", "-cp", csv2rdf.config.config.sparqlify_jar_path,
                     "org.aksw.sparqlify.csv.CsvMapperCliMain",
                     "-f", filePath,
                     "-c", mappingPath,
                     "-s", delimiter,
                     "-h"]
    # -h - omit the header row
    # -d - delimiter ("")
    # -s - separator (e.g. @ , ;)
    # For a file full of @ characters you could try: -s @ -d \0 -e \1
    # (\0 and \1 are binary 0 and 1; octal like \123 or hex like 0xface also work).
    logging.info(' '.join(sparqlifyCall))

    rdfFile = os.path.join(csv2rdf.config.config.rdf_files_path,
                           str(resourceId) + '_' + str(mappingName) + '.rdf')
    f = open(rdfFile, 'w')
    process = subprocess.Popen(sparqlifyCall, shell=False,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    logging.info("rdfFile: %s" % rdfFile)

    # Drain stdout and stderr in background threads so neither pipe buffer
    # fills up and deadlocks the subprocess.
    stdoutQueue = Queue.Queue()
    stdoutReader = AsynchronousFileReader(process.stdout, stdoutQueue)
    stdoutReader.start()
    stderrQueue = Queue.Queue()
    stderrReader = AsynchronousFileReader(process.stderr, stderrQueue)
    stderrReader.start()

    stdoutSize = 0
    while not stdoutReader.eof() or not stderrReader.eof():
        # Write what we received on standard output to the RDF file.
        while not stdoutQueue.empty():
            stdoutSize += 1
            line = stdoutQueue.get()
            f.write(line)
            if stdoutSize % 1000 == 0:
                logging.info("Processed %d lines of %s" % (stdoutSize, rdfFile))

        # Log what we received on standard error.
        while not stderrQueue.empty():
            line = stderrQueue.get()
            logging.info('Received line on standard error: ' + repr(line))

        # Sleep a bit before asking the readers again.
        time.sleep(.1)

    # Be tidy: join the reader threads and close the subprocess' file descriptors.
    stdoutReader.join()
    stderrReader.join()
    process.stdout.close()
    process.stderr.close()
    f.close()

    # Update the metadata.
    logging.info("updating metadata...")
    mapping.updateMetadata(resourceId, mappingName)
    logging.info("DONE")

    # Upload the result to the triplestore.
    logging.warn("loading resource to virtuoso!")
    virtuoso = VirtuosoLoader()
    graphUri = "http://data.publicdata.eu/%s/%s" % (str(resourceId), str(mappingName))
    virtuoso.reload(rdfFile, graphUri)

    # wait() sets returncode; without it Popen.returncode can still be None.
    process.wait()
    return process.returncode
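# Usage sketch: these methods assume a class instance that stores
# self.resourceId. The class name "Table" and the mapping name "default" below
# are hypothetical illustrations, not names confirmed by this module.
#
#   table = Table("<ckan-resource-id>")
#   returncode = table.transformResourceToRdf("default")
#   if returncode == 0:
#       logging.info("transformation and triplestore load finished successfully")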