class CSVWConverter(object):
    """
    Converter configuration object for **CSVW**-style conversion.

    Is used to set parameters for a conversion, and to initiate an actual conversion
    process (implemented in :class:`BurstConverter`).

    Takes a dataset_description (in CSVW format) and prepares:

    * An array of dictionaries for the rows to pass to the :class:`BurstConverter` (either in one go, or in parallel)
    * A nanopublication structure for publishing the converted data (using :class:`converter.util.Nanopublication`)
    """

    def __init__(self, file_name, delimiter=',', quotechar='\"', encoding='utf-8',
                 processes=4, chunksize=5000, output_format='nquads'):
        logger.info("Initializing converter for {}".format(file_name))
        self.file_name = file_name
        self.output_format = output_format
        self.target_file = self.file_name + '.' + extensions[self.output_format]
        schema_file_name = file_name + '-metadata.json'

        if not os.path.exists(schema_file_name) or not os.path.exists(file_name):
            raise Exception(
                "Could not find source or metadata file in path; make sure you called with a .csv file")

        self._processes = processes
        self._chunksize = chunksize
        logger.info("Processes: {}".format(self._processes))
        logger.info("Chunksize: {}".format(self._chunksize))

        self.np = Nanopublication(file_name)
        # self.metadata = json.load(open(schema_file_name, 'r'))
        self.metadata_graph = Graph()
        with open(schema_file_name) as f:
            try:
                self.metadata_graph.load(f, format='json-ld')
            except ValueError as err:
                err.message = err.message + " ; please check the syntax of your JSON-LD schema file"
                raise

        # Get the URI of the schema specification by looking for the subject
        # with a csvw:url property.
        (self.metadata_uri, _) = self.metadata_graph.subject_objects(CSVW.url).next()

        self.metadata = Item(self.metadata_graph, self.metadata_uri)

        # Add a prov:wasDerivedFrom between the nanopublication assertion graph
        # and the metadata_uri
        self.np.pg.add((self.np.ag.identifier,
                        PROV['wasDerivedFrom'],
                        self.metadata_uri))

        # Add an attribution relation and dc:creator relation between the
        # nanopublication, the assertion graph and the authors of the schema
        for o in self.metadata_graph.objects(self.metadata_uri, DC['creator']):
            self.np.pg.add((self.np.ag.identifier, PROV['wasAttributedTo'], o))
            self.np.add((self.np.uri, PROV['wasAttributedTo'], o))
            self.np.pig.add((self.np.ag.identifier, DC['creator'], o))

        self.schema = self.metadata.csvw_tableSchema

        # Taking defaults from init arguments
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.encoding = encoding

        # Read the CSV-specific dialect specification from the JSON structure
        if self.metadata.csvw_dialect is not None:
            if self.metadata.csvw_dialect.csvw_delimiter is not None:
                self.delimiter = str(self.metadata.csvw_dialect.csvw_delimiter)

            if self.metadata.csvw_dialect.csvw_quotechar is not None:
                self.quotechar = str(self.metadata.csvw_dialect.csvw_quoteChar)

            if self.metadata.csvw_dialect.csvw_encoding is not None:
                self.encoding = str(self.metadata.csvw_dialect.csvw_encoding)

        logger.info("Quotechar: {}".format(repr(self.quotechar)))
        logger.info("Delimiter: {}".format(repr(self.delimiter)))
        logger.info("Encoding : {}".format(repr(self.encoding)))
        logger.warning(
            "Taking encoding, quotechar and delimiter specifications into account...")

        # The metadata schema overrides the default namespace values
        # (NB: this does not affect the predefined Namespace objects!)
        # DEPRECATED
        # namespaces.update({ns: url for ns, url in self.metadata['@context'][1].items() if not ns.startswith('@')})

        # Cast the CSVW column rdf:List into an RDF collection
        self.columns = Collection(
            self.metadata_graph, BNode(self.schema.csvw_column))

    def convert_info(self):
        """Converts the CSVW JSON file to valid RDF for serializing into the Nanopublication publication info graph."""
        results = self.metadata_graph.query("""SELECT ?s ?p ?o
                                               WHERE { ?s ?p ?o .
                                                       FILTER(?p = csvw:valueUrl ||
                                                              ?p = csvw:propertyUrl ||
                                                              ?p = csvw:aboutUrl)}""")

        for (s, p, o) in results:
            # Use iribaker to escape the object into a valid IRI
            escaped_object = URIRef(iribaker.to_iri(unicode(o)))

            # If the escaped IRI of the object is different from the original,
            # update the graph.
            if escaped_object != o:
                self.metadata_graph.set((s, p, escaped_object))
                # Add the provenance of this operation.
                self.np.pg.add((escaped_object,
                                PROV.wasDerivedFrom,
                                Literal(unicode(o), datatype=XSD.string)))

        # Add the information of the schema file to the provenance graph of the
        # nanopublication
        self.np.ingest(self.metadata_graph, self.np.pg.identifier)
        return

    def convert(self):
        """Starts a conversion process (in parallel or as a single process) as defined in the arguments passed to the :class:`CSVWConverter` initialization"""
        logger.info("Starting conversion")

        # If the number of processes is set to 1, we start the 'simple' conversion (in a single thread)
        if self._processes == 1:
            self._simple()
        # Otherwise, we start the parallel processing procedure, but fall back to simple conversion
        # when it turns out that for some reason the parallel processing fails (this happens on some
        # files; the reason could not yet be determined).
        elif self._processes > 1:
            try:
                self._parallel()
            except TypeError:
                logger.info(
                    "TypeError in multiprocessing... falling back to serial conversion")
                self._simple()
            except Exception:
                logger.error(
                    "Some exception occurred, falling back to serial conversion")
                traceback.print_exc()
                self._simple()
        else:
            logger.error("Incorrect process count specification")

    def _simple(self):
        """Starts a single process for converting the file"""
        with open(self.target_file, 'w') as target_file:
            with open(self.file_name, 'rb') as csvfile:
                logger.info("Opening CSV file for reading")
                reader = csv.DictReader(csvfile,
                                        encoding=self.encoding,
                                        delimiter=self.delimiter,
                                        quotechar=self.quotechar)

                logger.info("Starting in a single process")
                c = BurstConverter(self.np.ag.identifier, self.columns,
                                   self.schema, self.metadata_graph,
                                   self.encoding, self.output_format)
                # Out will contain an N-Quads serialized representation of the
                # converted CSV
                out = c.process(0, reader, 1)
                # We then write it to the file
                target_file.write(out)

            self.convert_info()
            # Finally, write the nanopublication info to file
            target_file.write(self.np.serialize(format=self.output_format))

    def _parallel(self):
        """Starts parallel processes for converting the file.

        Each process will receive max ``chunksize`` number of rows"""
        with open(self.target_file, 'w') as target_file:
            with open(self.file_name, 'rb') as csvfile:
                logger.info("Opening CSV file for reading")
                reader = csv.DictReader(csvfile,
                                        encoding=self.encoding,
                                        delimiter=self.delimiter,
                                        quotechar=self.quotechar)

                # Initialize a pool of processes (default=4)
                pool = mp.Pool(processes=self._processes)
                logger.info("Running in {} processes".format(self._processes))

                # The _burstConvert function is partially instantiated, and will be
                # successively called with chunksize rows from the CSV file
                burstConvert_partial = partial(_burstConvert,
                                               identifier=self.np.ag.identifier,
                                               columns=self.columns,
                                               schema=self.schema,
                                               metadata_graph=self.metadata_graph,
                                               encoding=self.encoding,
                                               chunksize=self._chunksize,
                                               output_format=self.output_format)

                # The result of each chunksize run will be written to the
                # target file
                for out in pool.imap(burstConvert_partial,
                                     enumerate(grouper(self._chunksize, reader))):
                    target_file.write(out)

                # Make sure to close and join the pool once finished.
                pool.close()
                pool.join()

            self.convert_info()
            # Finally, write the nanopublication info to file
            target_file.write(self.np.serialize(format=self.output_format))
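# Note (not part of the original source): ``_parallel`` above relies on a
# ``grouper`` helper that is defined elsewhere in the module and not shown in
# this excerpt. A minimal sketch of what such a helper might look like,
# assuming the standard itertools "grouper" recipe: it yields fixed-size
# chunks from the CSV reader and pads the final chunk with a fill value.
try:
    # Python 3
    from itertools import zip_longest
except ImportError:
    # Python 2
    from itertools import izip_longest as zip_longest


def grouper(n, iterable, fillvalue=None):
    """Collect an iterable into fixed-length chunks: grouper(2, 'ABCDE') -> AB CD E<fill>."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)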
class Converter(object):
    """
    Converter configuration object for **QBer**-style conversion.

    Is used to set parameters for a conversion, and to initiate an actual conversion
    process (implemented in :class:`BurstConverter`).

    Takes a dataset_description (in QBer format) and prepares:

    * A dictionary for the :class:`BurstConverter` (either in one go, or in parallel)
    * A nanopublication structure for publishing the converted data (using :class:`converter.util.Nanopublication`)
    * A datastructure definition inside the nanopublication (using :class:`converter.util.DatastructureDefinition`)
    """

    def __init__(self, dataset, dirname, author_profile, source=None, target='output.nq'):
        """Initialization"""
        # Defaults
        self._processes = 4     # Use 4 separate processes by default for converting CSV files
        self._chunksize = 1000  # Feed each process with 1000 lines at a time
        self._delimiter = ','   # Comma is the default delimiter
        self._quotechar = '\"'  # The double quote is the default quoting character

        if source is None:
            self._source = os.path.join(dirname, dataset['file'])
        else:
            self._source = source

        self._target = target

        self._dataset = dataset
        self.dataset_name = dataset['name']
        self.dataset_uri = URIRef(dataset['uri'])

        # For efficiency, convert the QBer-style value lists into dictionaries,
        # but only for variables that actually have values (see e.g. the use of valueUrl).
        self._variables = {}
        for variable, variable_definition in dataset['variables'].items():
            self._variables[variable] = variable_definition
            if 'values' in self._variables[variable]:
                self._variables[variable]['values_dictionary'] = dict([
                    (unicode(v.get('label', '')), v) for v in variable_definition['values']
                ])

        # Initialize the nanopublication structure
        self.publication = Nanopublication(self._source)

        # Build profile information from the author profile provided
        self.addProfile(author_profile)

        # Build a datastructure definition based on the dataset description
        self.addDatastructureDefinition()

    def setDelimiter(self, delimiter):
        """Sets the delimiter for the CSV reader to ``delimiter``"""
        self._delimiter = delimiter

    def setQuotechar(self, quotechar):
        """Sets the quote character for the CSV reader to ``quotechar``"""
        self._quotechar = quotechar

    def setProcesses(self, processes):
        """Sets the number of processes to use for (parallel) conversion of the data"""
        self._processes = processes

    def setChunksize(self, chunksize):
        """Sets the number of lines to pass to each process"""
        self._chunksize = chunksize

    def setTarget(self, target):
        """Sets the output file to write the resulting RDF to (should be an N-Quads file)"""
        self._target = target

    def addProfile(self, author_profile):
        """Adds an author profile to the nanopublication"""
        print "Adding profile"
        # We add all triples from a Profile graph to the default graph of the nanopublication
        profile_graph = Profile(author_profile)
        self.publication.ingest(profile_graph)

        # We add an attribution relation between the nanopub assertion and the Profile author
        self.publication.pg.add(
            (self.publication.ag.identifier, PROV['wasAttributedTo'], profile_graph.author_uri))
        self.publication.pig.add(
            (self.publication.uri, PROV['wasAttributedTo'], profile_graph.author_uri))

    def addDatastructureDefinition(self):
        """Adds a datastructure definition to the nanopublication based on what we know about the current dataset"""
        print "Adding datastructure definition"
        # We add all triples from a DatastructureDefinition graph to the assertion graph of the nanopublication
        self.publication.ingest(
            DatastructureDefinition(self.dataset_uri, self.dataset_name, self._variables),
            self.publication.ag.identifier)

        # We link the dataset URI in the Provenance graph to the version of the
        # dataset that was used in the conversion.
        self.publication.pg.add((self.dataset_uri,
                                 PROV['wasDerivedFrom'],
                                 self.publication.dataset_version_uri))

    def convert(self):
        """Starts the conversion process based on the parameters passed to the :class:`Converter` initialization."""
        logger.info("Using {} processes".format(self._processes))

        # Open the target file
        with open(self._target, 'w') as target_file:
            # Write the nanopublication structure to the target file
            target_file.write(self.publication.serialize(format='nquads'))

            # Open the source file
            with open(self._source, 'r') as source_file:
                # Open the source file for reading as CSV
                reader = csv.reader(source_file,
                                    delimiter=self._delimiter,
                                    quotechar=self._quotechar,
                                    strict=True)
                # The headers are the first line (should correspond to variables)
                headers = reader.next()
                # TODO: Add check that headers and variables actually match!

                if self._processes > 1:
                    self._parallel(reader, headers, target_file)
                else:
                    self._simple(reader, headers, target_file)

    def _simple(self, reader, headers, target_file):
        """Starts a converter in a single process"""
        # Initialize the BurstConverter with the dataset and headers
        c = BurstConverter(self.publication.ag.identifier,
                           self._dataset, self._variables, headers)
        # Out will contain an N-Quads serialized representation of the converted CSV
        out = c.process(0, reader, 1)
        # We then write it to the file
        target_file.write(out)

    def _parallel(self, reader, headers, target_file):
        """Starts the converter using multiple processes"""
        # Initialize a pool of processes (default=4)
        pool = mp.Pool(processes=self._processes)

        # The _burstConvert function is partially instantiated, and will be
        # successively called with chunksize rows from the CSV file
        burstConvert_partial = partial(_burstConvert,
                                       graph_identifier=self.publication.ag.identifier,
                                       dataset=self._dataset,
                                       variables=self._variables,
                                       headers=headers,
                                       chunksize=self._chunksize)

        # The result of each chunksize run will be written to the target file
        for out in pool.imap(burstConvert_partial,
                             enumerate(grouper(self._chunksize, reader))):
            target_file.write(out)

        # Make sure to close and join the pool once finished.
        pool.close()
        pool.join()
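# Usage sketch (not part of the original module): one way the QBer-style
# Converter above could be driven. The dataset description keys ('file',
# 'name', 'uri', 'variables') mirror the lookups in __init__; the directory,
# target file name, delimiter override and author_profile value are
# hypothetical placeholders.
def _example_qber_conversion(dataset_description, author_profile,
                             dirname='data', target='output.nq'):
    """Illustrative only: configure a Converter and run a serial conversion."""
    converter = Converter(dataset_description, dirname, author_profile,
                          target=target)
    converter.setProcesses(1)    # force the single-process code path
    converter.setDelimiter(';')  # e.g. override the default comma delimiter
    converter.convert()
    return target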
class CSVWConverter(object):
    """
    Converter configuration object for **CSVW**-style conversion.

    Is used to set parameters for a conversion, and to initiate an actual conversion
    process (implemented in :class:`BurstConverter`).

    Takes a dataset_description (in CSVW format) and prepares:

    * An array of dictionaries for the rows to pass to the :class:`BurstConverter` (either in one go, or in parallel)
    * A nanopublication structure for publishing the converted data (using :class:`converter.util.Nanopublication`)
    """

    def __init__(self, file_name, delimiter=',', quotechar='\"', encoding='utf-8',
                 processes=4, chunksize=5000, output_format='nquads'):
        logger.info("Initializing converter for {}".format(file_name))
        self.file_name = file_name
        self.output_format = output_format
        self.target_file = self.file_name + '.' + extensions[self.output_format]
        schema_file_name = file_name + '-metadata.json'

        if not os.path.exists(schema_file_name) or not os.path.exists(file_name):
            raise Exception(
                "Could not find source or metadata file in path; make sure you called with a .csv file")

        self._processes = processes
        self._chunksize = chunksize
        logger.info("Processes: {}".format(self._processes))
        logger.info("Chunksize: {}".format(self._chunksize))

        self.np = Nanopublication(file_name)
        # self.metadata = json.load(open(schema_file_name, 'r'))
        self.metadata_graph = Graph()
        with open(schema_file_name, 'rb') as f:
            try:
                self.metadata_graph.load(f, format='json-ld')
            except ValueError as err:
                # ValueError has no .message attribute on Python 3, so extend
                # the args tuple with a hint about the likely cause instead.
                err.args = tuple(err.args) + (
                    "please check the syntax of your JSON-LD schema file",)
                raise

        # Get the URI of the schema specification by looking for the subject
        # with a csvw:url property.
        try:
            # Python 2
            (self.metadata_uri, _) = self.metadata_graph.subject_objects(CSVW.url).next()
        except AttributeError:
            # Python 3
            (self.metadata_uri, _) = next(self.metadata_graph.subject_objects(CSVW.url))

        self.metadata = Item(self.metadata_graph, self.metadata_uri)

        # Add a prov:wasDerivedFrom between the nanopublication assertion graph
        # and the metadata_uri
        self.np.pg.add((self.np.ag.identifier,
                        PROV['wasDerivedFrom'],
                        self.metadata_uri))

        # Add an attribution relation and dc:creator relation between the
        # nanopublication, the assertion graph and the authors of the schema
        for o in self.metadata_graph.objects(self.metadata_uri, DC['creator']):
            self.np.pg.add((self.np.ag.identifier, PROV['wasAttributedTo'], o))
            self.np.add((self.np.uri, PROV['wasAttributedTo'], o))
            self.np.pig.add((self.np.ag.identifier, DC['creator'], o))

        self.schema = self.metadata.csvw_tableSchema

        # Taking defaults from init arguments
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.encoding = encoding

        # Read the CSV-specific dialect specification from the JSON structure
        if self.metadata.csvw_dialect is not None:
            if self.metadata.csvw_dialect.csvw_delimiter is not None:
                self.delimiter = str(self.metadata.csvw_dialect.csvw_delimiter)

            if self.metadata.csvw_dialect.csvw_quotechar is not None:
                self.quotechar = str(self.metadata.csvw_dialect.csvw_quoteChar)

            if self.metadata.csvw_dialect.csvw_encoding is not None:
                self.encoding = str(self.metadata.csvw_dialect.csvw_encoding)

        logger.info("Quotechar: {}".format(repr(self.quotechar)))
        logger.info("Delimiter: {}".format(repr(self.delimiter)))
        logger.info("Encoding : {}".format(repr(self.encoding)))
        logger.warning(
            "Taking encoding, quotechar and delimiter specifications into account...")

        # The metadata schema overrides the default namespace values
        # (NB: this does not affect the predefined Namespace objects!)
        # DEPRECATED
        # namespaces.update({ns: url for ns, url in self.metadata['@context'][1].items() if not ns.startswith('@')})

        # Cast the CSVW column rdf:List into an RDF collection
        self.columns = Collection(
            self.metadata_graph, BNode(self.schema.csvw_column))

        # Under Python 3 the Item wrapper does not resolve the column list head
        # correctly, so fall back to collecting the rdf:first objects directly
        # from the graph.
        if not self.columns:
            self.columns = [o for s, p, o in self.metadata_graph.triples(
                (None, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#first"), None))]

    def convert_info(self):
        """Converts the CSVW JSON file to valid RDF for serializing into the Nanopublication publication info graph."""
        results = self.metadata_graph.query("""SELECT ?s ?p ?o
                                               WHERE { ?s ?p ?o .
                                                       FILTER(?p = csvw:valueUrl ||
                                                              ?p = csvw:propertyUrl ||
                                                              ?p = csvw:aboutUrl)}""")

        for (s, p, o) in results:
            # Use iribaker to escape the object into a valid IRI
            try:
                # Python 2
                escaped_object = URIRef(iribaker.to_iri(unicode(o)))
            except NameError:
                # Python 3
                escaped_object = URIRef(iribaker.to_iri(str(o)))

            # If the escaped IRI of the object is different from the original,
            # update the graph.
            if escaped_object != o:
                self.metadata_graph.set((s, p, escaped_object))
                # Add the provenance of this operation.
                try:
                    # Python 2
                    self.np.pg.add((escaped_object,
                                    PROV.wasDerivedFrom,
                                    Literal(unicode(o), datatype=XSD.string)))
                except NameError:
                    # Python 3
                    self.np.pg.add((escaped_object,
                                    PROV.wasDerivedFrom,
                                    Literal(str(o), datatype=XSD.string)))

        # Walk through the metadata graph to remove the illegal "Resource(...)"
        # blank nodes caused by the Python 3 transition.
        for s, p, o in self.metadata_graph.triples((None, None, None)):
            if s.startswith("Resource("):
                self.metadata_graph.remove((s, p, o))
                self.metadata_graph.add((BNode(str(s)[9:-1]), p, o))
                logger.debug(
                    "Removed a triple because it was not formatted correctly (started with \"Resource\")")

        # Add the information of the schema file to the provenance graph of the
        # nanopublication
        self.np.ingest(self.metadata_graph, self.np.pg.identifier)
        return

    def convert(self):
        """Starts a conversion process (in parallel or as a single process) as defined in the arguments passed to the :class:`CSVWConverter` initialization"""
        logger.info("Starting conversion")

        # If the number of processes is set to 1, we start the 'simple' conversion (in a single thread)
        if self._processes == 1:
            self._simple()
        # Otherwise, we start the parallel processing procedure, but fall back to simple conversion
        # when it turns out that for some reason the parallel processing fails (this happens on some
        # files; the reason could not yet be determined).
        elif self._processes > 1:
            try:
                self._parallel()
            except TypeError:
                logger.info(
                    "TypeError in multiprocessing... falling back to serial conversion")
                self._simple()
            except Exception:
                logger.error(
                    "Some exception occurred, falling back to serial conversion")
                traceback.print_exc()
                self._simple()
        else:
            logger.error("Incorrect process count specification")

    def _simple(self):
        """Starts a single process for converting the file"""
        with open(self.target_file, 'wb') as target_file:
            with open(self.file_name, 'rb') as csvfile:
                logger.info("Opening CSV file for reading")
                reader = csv.DictReader(csvfile,
                                        encoding=self.encoding,
                                        delimiter=self.delimiter,
                                        quotechar=self.quotechar)

                logger.info("Starting in a single process")
                c = BurstConverter(self.np.ag.identifier, self.columns,
                                   self.schema, self.metadata_graph,
                                   self.encoding, self.output_format)
                # Out will contain an N-Quads serialized representation of the
                # converted CSV
                out = c.process(0, reader, 1)
                # We then write it to the file
                try:
                    # Python 2
                    target_file.write(out)
                except TypeError:
                    # Python 3
                    target_file.write(out.decode('utf-8'))

            self.convert_info()
            # Finally, write the nanopublication info to file
            target_file.write(self.np.serialize(format=self.output_format))

    def _parallel(self):
        """Starts parallel processes for converting the file.

        Each process will receive max ``chunksize`` number of rows"""
        with open(self.target_file, 'wb') as target_file:
            with open(self.file_name, 'rb') as csvfile:
                logger.info("Opening CSV file for reading")
                reader = csv.DictReader(csvfile,
                                        encoding=self.encoding,
                                        delimiter=self.delimiter,
                                        quotechar=self.quotechar)

                # Initialize a pool of processes (default=4)
                pool = mp.Pool(processes=self._processes)
                logger.info("Running in {} processes".format(self._processes))

                # The _burstConvert function is partially instantiated, and will be
                # successively called with chunksize rows from the CSV file
                burstConvert_partial = partial(_burstConvert,
                                               identifier=self.np.ag.identifier,
                                               columns=self.columns,
                                               schema=self.schema,
                                               metadata_graph=self.metadata_graph,
                                               encoding=self.encoding,
                                               chunksize=self._chunksize,
                                               output_format=self.output_format)

                # The result of each chunksize run will be written to the
                # target file
                for out in pool.imap(burstConvert_partial,
                                     enumerate(grouper(self._chunksize, reader))):
                    target_file.write(out)

                # Make sure to close and join the pool once finished.
                pool.close()
                pool.join()

            self.convert_info()
            # Finally, write the nanopublication info to file
            target_file.write(self.np.serialize(format=self.output_format))
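# Usage sketch (not part of the original module): how the CSVW-style converter
# above is typically driven. It expects the CSV file to sit next to a
# '<file>-metadata.json' CSVW schema; 'example.csv' is a hypothetical file name.
def _example_csvw_conversion(csv_path='example.csv'):
    """Illustrative only: run a (parallel) CSVW conversion and return the target file."""
    converter = CSVWConverter(csv_path, processes=4, chunksize=5000,
                              output_format='nquads')
    converter.convert()           # falls back to serial conversion on errors
    return converter.target_file  # source path plus the output-format extension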