Example #1
    def __init__(self,
                 dataset,
                 dirname,
                 author_profile,
                 source=None,
                 target='output.nq'):
        """
        Initialization
        """

        # Defaults
        self._processes = 4  # Use 4 separate processes by default for converting CSV files
        self._chunksize = 1000  # Feed each process with 1000 lines at a time
        self._delimiter = ','  # Comma is the default delimiter
        self._quotechar = '\"'  # The double parenthesis is the default quoting character

        if source is None:
            self._source = os.path.join(dirname, dataset['file'])
        else:
            self._source = source

        self._target = target

        self._dataset = dataset

        self.dataset_name = dataset['name']
        self.dataset_uri = URIRef(dataset['uri'])

        # For efficiency, convert the QBer-style value lists into dictionaries
        # But only for variables that have values, of course (see e.g. the use of valueUrl).
        self._variables = {}
        for variable, variable_definition in dataset['variables'].items():
            self._variables[variable] = variable_definition
            if 'values' in self._variables[variable]:
                self._variables[variable]['values_dictionary'] = dict([
                    (unicode(v.get('label', '')), v)
                    for v in variable_definition['values']
                ])

        # Initialize the nanopublication structure
        self.publication = Nanopublication(self._source)

        # Build profile information from the author profile provided
        self.addProfile(author_profile)

        # Build a datastructure definition based on the dataset description
        self.addDatastructureDefinition()
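
A minimal usage sketch for this initializer, assuming a QBer-style dataset description; every value below is hypothetical, and author_profile stands in for whatever the Profile helper expects.

# Hypothetical QBer-style dataset description (structure assumed for illustration).
dataset = {
    'file': 'survey.csv',
    'name': 'survey',
    'uri': 'http://example.org/dataset/survey',
    'variables': {
        'sex': {'values': [{'label': 'male'}, {'label': 'female'}]},
    },
}
converter = Converter(dataset, dirname='/data/input',
                      author_profile=author_profile, target='survey.nq')
converter.convert()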
Example #2
    def __init__(self, file_name, delimiter=',', quotechar='\"', encoding='utf-8', processes=4, chunksize=5000, output_format='nquads'):
        logger.info("Initializing converter for {}".format(file_name))
        self.file_name = file_name
        self.output_format = output_format
        self.target_file = self.file_name + '.' + extensions[self.output_format]
        schema_file_name = file_name + '-metadata.json'

        if not os.path.exists(schema_file_name) or not os.path.exists(file_name):
            raise Exception(
                "Could not find source or metadata file in path; make sure you called with a .csv file")

        self._processes = processes
        self._chunksize = chunksize
        logger.info("Processes: {}".format(self._processes))
        logger.info("Chunksize: {}".format(self._chunksize))

        self.np = Nanopublication(file_name)
        # self.metadata = json.load(open(schema_file_name, 'r'))
        self.metadata_graph = Graph()
        with open(schema_file_name) as f:
            try:
                self.metadata_graph.load(f, format='json-ld')
            except ValueError as err:
                err.message = err.message + " ; please check the syntax of your JSON-LD schema file"
                raise

        # Get the URI of the schema specification by looking for the subject
        # with a csvw:url property.
        (self.metadata_uri, _) = self.metadata_graph.subject_objects(CSVW.url).next()

        self.metadata = Item(self.metadata_graph, self.metadata_uri)

        # Add a prov:wasDerivedFrom between the nanopublication assertion graph
        # and the metadata_uri
        self.np.pg.add((self.np.ag.identifier, PROV[
                       'wasDerivedFrom'], self.metadata_uri))
        # Add an attribution relation and dc:creator relation between the
        # nanopublication, the assertion graph and the authors of the schema
        for o in self.metadata_graph.objects(self.metadata_uri, DC['creator']):
            self.np.pg.add((self.np.ag.identifier, PROV['wasAttributedTo'], o))
            self.np.add((self.np.uri, PROV['wasAttributedTo'], o))
            self.np.pig.add((self.np.ag.identifier, DC['creator'], o))

        self.schema = self.metadata.csvw_tableSchema

        # Taking defaults from init arguments
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.encoding = encoding

        # Read the CSV-specific dialect specification from the JSON structure
        if self.metadata.csvw_dialect is not None:
            if self.metadata.csvw_dialect.csvw_delimiter is not None:
                self.delimiter = str(self.metadata.csvw_dialect.csvw_delimiter)

            if self.metadata.csvw_dialect.csvw_quoteChar is not None:
                self.quotechar = str(self.metadata.csvw_dialect.csvw_quoteChar)

            if self.metadata.csvw_dialect.csvw_encoding is not None:
                self.encoding = str(self.metadata.csvw_dialect.csvw_encoding)

        logger.info("Quotechar: {}".format(self.quotechar.__repr__()))
        logger.info("Delimiter: {}".format(self.delimiter.__repr__()))
        logger.info("Encoding : {}".format(self.encoding.__repr__()))
        logger.warning(
            "Taking encoding, quotechar and delimiter specifications into account...")

        # The metadata schema overrides the default namespace values
        # (NB: this does not affect the predefined Namespace objects!)
        # DEPRECATED
        # namespaces.update({ns: url for ns, url in self.metadata['@context'][1].items() if not ns.startswith('@')})

        # Cast the CSVW column rdf:List into an RDF collection
        self.columns = Collection(
            self.metadata_graph, BNode(self.schema.csvw_column))
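
The target_file line above indexes an extensions mapping that is defined elsewhere in the module; a plausible sketch of it (an assumption, not taken from the project) would be:

# Assumed mapping from output_format names to file extensions;
# only 'nquads' is exercised by the default arguments above.
extensions = {
    'nquads': 'nq',
    'trig': 'trig',
    'turtle': 'ttl',
}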
Example #3
class CSVWConverter(object):
    """
    Converter configuration object for **CSVW**-style conversion. It is used to set parameters for a conversion
    and to initiate an actual conversion process (implemented in :class:`BurstConverter`).

    Takes a dataset_description (in CSVW format) and prepares:

    * An array of dictionaries for the rows to pass to the :class:`BurstConverter` (either in one go, or in parallel)
    * A nanopublication structure for publishing the converted data (using :class:`converter.util.Nanopublication`)
    """

    def __init__(self, file_name, delimiter=',', quotechar='\"', encoding='utf-8', processes=4, chunksize=5000, output_format='nquads'):
        logger.info("Initializing converter for {}".format(file_name))
        self.file_name = file_name
        self.output_format = output_format
        self.target_file = self.file_name + '.' + extensions[self.output_format]
        schema_file_name = file_name + '-metadata.json'

        if not os.path.exists(schema_file_name) or not os.path.exists(file_name):
            raise Exception(
                "Could not find source or metadata file in path; make sure you called with a .csv file")

        self._processes = processes
        self._chunksize = chunksize
        logger.info("Processes: {}".format(self._processes))
        logger.info("Chunksize: {}".format(self._chunksize))

        self.np = Nanopublication(file_name)
        # self.metadata = json.load(open(schema_file_name, 'r'))
        self.metadata_graph = Graph()
        with open(schema_file_name) as f:
            try:
                self.metadata_graph.load(f, format='json-ld')
            except ValueError as err:
                err.message = err.message + " ; please check the syntax of your JSON-LD schema file"
                raise

        # Get the URI of the schema specification by looking for the subject
        # with a csvw:url property.
        (self.metadata_uri, _) = self.metadata_graph.subject_objects(CSVW.url).next()

        self.metadata = Item(self.metadata_graph, self.metadata_uri)

        # Add a prov:wasDerivedFrom between the nanopublication assertion graph
        # and the metadata_uri
        self.np.pg.add((self.np.ag.identifier, PROV[
                       'wasDerivedFrom'], self.metadata_uri))
        # Add an attribution relation and dc:creator relation between the
        # nanopublication, the assertion graph and the authors of the schema
        for o in self.metadata_graph.objects(self.metadata_uri, DC['creator']):
            self.np.pg.add((self.np.ag.identifier, PROV['wasAttributedTo'], o))
            self.np.add((self.np.uri, PROV['wasAttributedTo'], o))
            self.np.pig.add((self.np.ag.identifier, DC['creator'], o))

        self.schema = self.metadata.csvw_tableSchema

        # Taking defaults from init arguments
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.encoding = encoding

        # Read the CSV-specific dialect specification from the JSON structure
        if self.metadata.csvw_dialect is not None:
            if self.metadata.csvw_dialect.csvw_delimiter is not None:
                self.delimiter = str(self.metadata.csvw_dialect.csvw_delimiter)

            if self.metadata.csvw_dialect.csvw_quoteChar is not None:
                self.quotechar = str(self.metadata.csvw_dialect.csvw_quoteChar)

            if self.metadata.csvw_dialect.csvw_encoding is not None:
                self.encoding = str(self.metadata.csvw_dialect.csvw_encoding)

        logger.info("Quotechar: {}".format(self.quotechar.__repr__()))
        logger.info("Delimiter: {}".format(self.delimiter.__repr__()))
        logger.info("Encoding : {}".format(self.encoding.__repr__()))
        logger.warning(
            "Taking encoding, quotechar and delimiter specifications into account...")

        # The metadata schema overrides the default namespace values
        # (NB: this does not affect the predefined Namespace objects!)
        # DEPRECATED
        # namespaces.update({ns: url for ns, url in self.metadata['@context'][1].items() if not ns.startswith('@')})

        # Cast the CSVW column rdf:List into an RDF collection
        self.columns = Collection(
            self.metadata_graph, BNode(self.schema.csvw_column))

    def convert_info(self):
        """Converts the CSVW JSON file to valid RDF for serializing into the Nanopublication publication info graph."""

        results = self.metadata_graph.query("""SELECT ?s ?p ?o
                                               WHERE { ?s ?p ?o .
                                                       FILTER(?p = csvw:valueUrl ||
                                                              ?p = csvw:propertyUrl ||
                                                              ?p = csvw:aboutUrl)}""")

        for (s, p, o) in results:
            # Use iribaker
            escaped_object = URIRef(iribaker.to_iri(unicode(o)))

            # If the escaped IRI of the object is different from the original,
            # update the graph.
            if escaped_object != o:
                self.metadata_graph.set((s, p, escaped_object))
                # Add the provenance of this operation.
                self.np.pg.add((escaped_object,
                                PROV.wasDerivedFrom,
                                Literal(unicode(o), datatype=XSD.string)))

        # Add the information of the schema file to the provenance graph of the
        # nanopublication
        self.np.ingest(self.metadata_graph, self.np.pg.identifier)

        return

    def convert(self):
        """Starts a conversion process (in parallel or as a single process) as defined in the arguments passed to the :class:`CSVWConverter` initialization"""
        logger.info("Starting conversion")

        # If the number of processes is set to 1, we start the 'simple' conversion (in a single process)
        if self._processes == 1:
            self._simple()
        # Otherwise, we start the parallel processing procedure, but fall back to simple conversion
        # if the parallel processing fails for some reason (this happens for some files;
        # the cause has not yet been determined).
        elif self._processes > 1:
            try:
                self._parallel()
            except TypeError:
                logger.info(
                    "TypeError in multiprocessing... falling back to serial conversion")
                self._simple()
            except Exception:
                logger.error(
                    "Some exception occurred, falling back to serial conversion")
                traceback.print_exc()
                self._simple()
        else:
            logger.error("Incorrect process count specification")

    def _simple(self):
        """Starts a single process for converting the file"""
        with open(self.target_file, 'w') as target_file:
            with open(self.file_name, 'rb') as csvfile:
                logger.info("Opening CSV file for reading")
                reader = csv.DictReader(csvfile,
                                        encoding=self.encoding,
                                        delimiter=self.delimiter,
                                        quotechar=self.quotechar)

                logger.info("Starting in a single process")
                c = BurstConverter(self.np.ag.identifier, self.columns,
                                   self.schema, self.metadata_graph, self.encoding, self.output_format)
                # Out will contain an N-Quads serialized representation of the
                # converted CSV
                out = c.process(0, reader, 1)
                # We then write it to the file
                target_file.write(out)

            self.convert_info()
            # Finally, write the nanopublication info to file
            target_file.write(self.np.serialize(format=self.output_format))

    def _parallel(self):
        """Starts parallel processes for converting the file. Each process will receive max ``chunksize`` number of rows"""
        with open(self.target_file, 'w') as target_file:
            with open(self.file_name, 'rb') as csvfile:
                logger.info("Opening CSV file for reading")
                reader = csv.DictReader(csvfile,
                                        encoding=self.encoding,
                                        delimiter=self.delimiter,
                                        quotechar=self.quotechar)

                # Initialize a pool of processes (default=4)
                pool = mp.Pool(processes=self._processes)
                logger.info("Running in {} processes".format(self._processes))

                # The _burstConvert function is partially instantiated, and will be successively called with
                # chunksize rows from the CSV file
                burstConvert_partial = partial(_burstConvert,
                                               identifier=self.np.ag.identifier,
                                               columns=self.columns,
                                               schema=self.schema,
                                               metadata_graph=self.metadata_graph,
                                               encoding=self.encoding,
                                               chunksize=self._chunksize,
                                               output_format=self.output_format)

                # The result of each chunksize run will be written to the
                # target file
                for out in pool.imap(burstConvert_partial, enumerate(grouper(self._chunksize, reader))):
                    target_file.write(out)

                # Make sure to close and join the pool once finished.
                pool.close()
                pool.join()

            self.convert_info()
            # Finally, write the nanopublication info to file
            target_file.write(self.np.serialize(format=self.output_format))
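
A minimal sketch of how CSVWConverter might be driven, assuming data.csv and its CSVW schema data.csv-metadata.json sit next to each other; the file names are placeholders.

c = CSVWConverter('data.csv', processes=4, chunksize=5000, output_format='nquads')
c.convert()
# The converted RDF is written to 'data.csv' plus the extension looked up in the
# extensions mapping (e.g. 'data.csv.nq' for N-Quads, assuming that mapping).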
Example #4
class Converter(object):
    """
    Converter configuration object for **QBer**-style conversion. It is used to set parameters for a conversion
    and to initiate an actual conversion process (implemented in :class:`BurstConverter`).

    Takes a dataset_description (in QBer format) and prepares:

    * A dictionary for the :class:`BurstConverter` (either in one go, or in parallel)
    * A nanopublication structure for publishing the converted data (using :class:`converter.util.Nanopublication`)
    * A datastructure definition inside the nanopublication (using :class:`converter.util.DatastructureDefinition`)
    """
    def __init__(self,
                 dataset,
                 dirname,
                 author_profile,
                 source=None,
                 target='output.nq'):
        """
        Initialization
        """

        # Defaults
        self._processes = 4  # Use 4 separate processes by default for converting CSV files
        self._chunksize = 1000  # Feed each process with 1000 lines at a time
        self._delimiter = ','  # Comma is the default delimiter
        self._quotechar = '\"'  # The double parenthesis is the default quoting character

        if source is None:
            self._source = os.path.join(dirname, dataset['file'])
        else:
            self._source = source

        self._target = target

        self._dataset = dataset

        self.dataset_name = dataset['name']
        self.dataset_uri = URIRef(dataset['uri'])

        # For efficiency, convert the QBer-style value lists into dictionaries
        # But only for variables that have values, of course (see e.g. the use of valueUrl).
        self._variables = {}
        for variable, variable_definition in dataset['variables'].items():
            self._variables[variable] = variable_definition
            if 'values' in self._variables[variable]:
                self._variables[variable]['values_dictionary'] = dict([
                    (unicode(v.get('label', '')), v)
                    for v in variable_definition['values']
                ])

        # Initialize the nanopublication structure
        self.publication = Nanopublication(self._source)

        # Build profile information from the author profile provided
        self.addProfile(author_profile)

        # Build a datastructure definition based on the dataset description
        self.addDatastructureDefinition()

    def setDelimiter(self, delimiter):
        """Sets the delimiter for the CSV reader to ``delimiter`` """
        self._delimiter = delimiter

    def setQuotechar(self, quotechar):
        """Sets the quote character for the CSV reader to ``quotechar``"""
        self._quotechar = quotechar

    def setProcesses(self, processes):
        """Sets the number of processes to use for (parallel) conversion of the data"""
        self._processes = processes

    def setChunksize(self, chunksize):
        """Sets the number of lines to pass to each process"""
        self._chunksize = chunksize

    def setTarget(self, target):
        """Sets the output file to write the resulting RDF to (should be an N-Quads file)"""
        self._target = target

    def addProfile(self, author_profile):
        """Adds an author profile to the nanopublication"""

        print "Adding profile"
        # We add all triples from a Profile graph to the default graph of the nanopublication
        profile_graph = Profile(author_profile)
        self.publication.ingest(profile_graph)
        # We add an attribution relation between the nanopub assertion and the Profile author
        self.publication.pg.add(
            (self.publication.ag.identifier, PROV['wasAttributedTo'],
             profile_graph.author_uri))
        self.publication.pig.add(
            (self.publication.uri, PROV['wasAttributedTo'],
             profile_graph.author_uri))

    def addDatastructureDefinition(self):
        """Adds a datastructure definition to the nanopublication based on what we know about the current dataset"""

        print "Adding datastructure definition"
        # We add all triples from a DatastructureDefinition graph to the assertion graph of the nanopublication
        self.publication.ingest(
            DatastructureDefinition(self.dataset_uri, self.dataset_name,
                                    self._variables),
            self.publication.ag.identifier)

        # We link the dataset URI in the Provenance graph to the version of the dataset that was used in the conversion.
        self.publication.pg.add((self.dataset_uri, PROV['wasDerivedFrom'],
                                 self.publication.dataset_version_uri))

    def convert(self):
        """Starts the conversion process based on the parameters passed to the :class:``Converter`` initalization."""

        logger.info("Using {} processes".format(self._processes))

        # Open the target file
        with open(self._target, 'w') as target_file:
            # Write the nanopublication structure to the target file
            target_file.write(self.publication.serialize(format='nquads'))

            # Open the source file
            with open(self._source, 'r') as source_file:
                # Open the source file for reading as CSV
                reader = csv.reader(source_file,
                                    delimiter=self._delimiter,
                                    quotechar=self._quotechar,
                                    strict=True)

                # The headers are the first line (should correspond to variables)
                headers = reader.next()

                # TODO: Add check that headers and variables actually match!

                if self._processes > 1:
                    self._parallel(reader, headers, target_file)
                else:
                    self._simple(reader, headers, target_file)

    def _simple(self, reader, headers, target_file):
        """Starts a converter in a single process"""
        # Initialize the BurstConverter with the dataset and headers
        c = BurstConverter(self.publication.ag.identifier, self._dataset,
                           self._variables, headers)
        # Out will contain an N-Quads serialized representation of the converted CSV
        out = c.process(0, reader, 1)
        # We then write it to the file
        target_file.write(out)

    def _parallel(self, reader, headers, target_file):
        """Starts the converter using multiple processes"""

        # Initialize a pool of processes (default=4)
        pool = mp.Pool(processes=self._processes)

        # The _burstConvert function is partially instantiated, and will be successively called with
        # chunksize rows from the CSV file
        burstConvert_partial = partial(
            _burstConvert,
            graph_identifier=self.publication.ag.identifier,
            dataset=self._dataset,
            variables=self._variables,
            headers=headers,
            chunksize=self._chunksize)

        # The result of each chunksize run will be written to the target file
        for out in pool.imap(burstConvert_partial,
                             enumerate(grouper(self._chunksize, reader))):
            target_file.write(out)

        # Make sure to close and join the pool once finished.
        pool.close()
        pool.join()
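
A short sketch of the setter-based configuration flow this class exposes; the values are illustrative only and converter is assumed to be an existing Converter instance.

converter.setDelimiter(';')
converter.setQuotechar('"')
converter.setProcesses(1)   # force the single-process code path
converter.setChunksize(500)
converter.setTarget('out.nq')
converter.convert()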
Example #5
    def __init__(self, file_name, delimiter=',', quotechar='\"', encoding='utf-8', processes=4, chunksize=5000, output_format='nquads'):
        logger.info("Initializing converter for {}".format(file_name))
        self.file_name = file_name
        self.output_format = output_format
        self.target_file = self.file_name + '.' + extensions[self.output_format]
        schema_file_name = file_name + '-metadata.json'

        if not os.path.exists(schema_file_name) or not os.path.exists(file_name):
            raise Exception(
                "Could not find source or metadata file in path; make sure you called with a .csv file")

        self._processes = processes
        self._chunksize = chunksize
        logger.info("Processes: {}".format(self._processes))
        logger.info("Chunksize: {}".format(self._chunksize))

        self.np = Nanopublication(file_name)
        # self.metadata = json.load(open(schema_file_name, 'r'))
        self.metadata_graph = Graph()
        with open(schema_file_name, 'rb') as f:
            try:
                self.metadata_graph.load(f, format='json-ld')
            except ValueError as err:
                err.message = err.message + " ; please check the syntax of your JSON-LD schema file"
                raise

        # Get the URI of the schema specification by looking for the subject
        # with a csvw:url property.
        try:
            # Python 2
            (self.metadata_uri, _) = self.metadata_graph.subject_objects(CSVW.url).next()
        except AttributeError:
            # Python 3
            (self.metadata_uri, _) = next(self.metadata_graph.subject_objects(CSVW.url))


        self.metadata = Item(self.metadata_graph, self.metadata_uri)

        # Add a prov:wasDerivedFrom between the nanopublication assertion graph
        # and the metadata_uri
        self.np.pg.add((self.np.ag.identifier, PROV[
                       'wasDerivedFrom'], self.metadata_uri))
        # Add an attribution relation and dc:creator relation between the
        # nanopublication, the assertion graph and the authors of the schema
        for o in self.metadata_graph.objects(self.metadata_uri, DC['creator']):
            self.np.pg.add((self.np.ag.identifier, PROV['wasAttributedTo'], o))
            self.np.add((self.np.uri, PROV['wasAttributedTo'], o))
            self.np.pig.add((self.np.ag.identifier, DC['creator'], o))

        self.schema = self.metadata.csvw_tableSchema

        # Taking defaults from init arguments
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.encoding = encoding

        # Read the CSV-specific dialect specification from the JSON structure
        if self.metadata.csvw_dialect is not None:
            if self.metadata.csvw_dialect.csvw_delimiter is not None:
                self.delimiter = str(self.metadata.csvw_dialect.csvw_delimiter)

            if self.metadata.csvw_dialect.csvw_quoteChar is not None:
                self.quotechar = str(self.metadata.csvw_dialect.csvw_quoteChar)

            if self.metadata.csvw_dialect.csvw_encoding is not None:
                self.encoding = str(self.metadata.csvw_dialect.csvw_encoding)

        logger.info("Quotechar: {}".format(self.quotechar.__repr__()))
        logger.info("Delimiter: {}".format(self.delimiter.__repr__()))
        logger.info("Encoding : {}".format(self.encoding.__repr__()))
        logger.warning(
            "Taking encoding, quotechar and delimiter specifications into account...")

        # The metadata schema overrides the default namespace values
        # (NB: this does not affect the predefined Namespace objects!)
        # DEPRECATED
        # namespaces.update({ns: url for ns, url in self.metadata['@context'][1].items() if not ns.startswith('@')})

        # Cast the CSVW column rdf:List into an RDF collection
        self.columns = Collection(self.metadata_graph, BNode(self.schema.csvw_column))
        # Under Python 3 the Item-based cast can come back empty, so fall back
        # to collecting the rdf:first objects from the metadata graph directly.
        if not self.columns:
            self.columns = [o for s, p, o in self.metadata_graph.triples((None, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#first"), None))]
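
The fallback above gathers the rdf:first objects in whatever order the store yields them; if column order matters, an order-preserving walk over the rdf:List could look like this sketch (an assumption, not part of the example above).

from rdflib import RDF

def columns_in_list_order(graph, head):
    """Walk an rdf:List from its head node, preserving the original column order."""
    items = []
    while head and head != RDF.nil:
        items.append(graph.value(head, RDF.first))
        head = graph.value(head, RDF.rest)
    return items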
Example #6
class CSVWConverter(object):
    """
    Converter configuration object for **CSVW**-style conversion. It is used to set parameters for a conversion
    and to initiate an actual conversion process (implemented in :class:`BurstConverter`).

    Takes a dataset_description (in CSVW format) and prepares:

    * An array of dictionaries for the rows to pass to the :class:`BurstConverter` (either in one go, or in parallel)
    * A nanopublication structure for publishing the converted data (using :class:`converter.util.Nanopublication`)
    """

    def __init__(self, file_name, delimiter=',', quotechar='\"', encoding='utf-8', processes=4, chunksize=5000, output_format='nquads'):
        logger.info("Initializing converter for {}".format(file_name))
        self.file_name = file_name
        self.output_format = output_format
        self.target_file = self.file_name + '.' + extensions[self.output_format]
        schema_file_name = file_name + '-metadata.json'

        if not os.path.exists(schema_file_name) or not os.path.exists(file_name):
            raise Exception(
                "Could not find source or metadata file in path; make sure you called with a .csv file")

        self._processes = processes
        self._chunksize = chunksize
        logger.info("Processes: {}".format(self._processes))
        logger.info("Chunksize: {}".format(self._chunksize))

        self.np = Nanopublication(file_name)
        # self.metadata = json.load(open(schema_file_name, 'r'))
        self.metadata_graph = Graph()
        with open(schema_file_name, 'rb') as f:
            try:
                self.metadata_graph.load(f, format='json-ld')
            except ValueError as err:
                err.message = err.message + " ; please check the syntax of your JSON-LD schema file"
                raise

        # Get the URI of the schema specification by looking for the subject
        # with a csvw:url property.
        try:
            # Python 2
            (self.metadata_uri, _) = self.metadata_graph.subject_objects(CSVW.url).next()
        except AttributeError:
            # Python 3
            (self.metadata_uri, _) = next(self.metadata_graph.subject_objects(CSVW.url))


        self.metadata = Item(self.metadata_graph, self.metadata_uri)

        # Add a prov:wasDerivedFrom between the nanopublication assertion graph
        # and the metadata_uri
        self.np.pg.add((self.np.ag.identifier, PROV[
                       'wasDerivedFrom'], self.metadata_uri))
        # Add an attribution relation and dc:creator relation between the
        # nanopublication, the assertion graph and the authors of the schema
        for o in self.metadata_graph.objects(self.metadata_uri, DC['creator']):
            self.np.pg.add((self.np.ag.identifier, PROV['wasAttributedTo'], o))
            self.np.add((self.np.uri, PROV['wasAttributedTo'], o))
            self.np.pig.add((self.np.ag.identifier, DC['creator'], o))

        self.schema = self.metadata.csvw_tableSchema

        # Taking defaults from init arguments
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.encoding = encoding

        # Read the CSV-specific dialect specification from the JSON structure
        if self.metadata.csvw_dialect is not None:
            if self.metadata.csvw_dialect.csvw_delimiter is not None:
                self.delimiter = str(self.metadata.csvw_dialect.csvw_delimiter)

            if self.metadata.csvw_dialect.csvw_quoteChar is not None:
                self.quotechar = str(self.metadata.csvw_dialect.csvw_quoteChar)

            if self.metadata.csvw_dialect.csvw_encoding is not None:
                self.encoding = str(self.metadata.csvw_dialect.csvw_encoding)

        logger.info("Quotechar: {}".format(self.quotechar.__repr__()))
        logger.info("Delimiter: {}".format(self.delimiter.__repr__()))
        logger.info("Encoding : {}".format(self.encoding.__repr__()))
        logger.warning(
            "Taking encoding, quotechar and delimiter specifications into account...")

        # The metadata schema overrides the default namespace values
        # (NB: this does not affect the predefined Namespace objects!)
        # DEPRECATED
        # namespaces.update({ns: url for ns, url in self.metadata['@context'][1].items() if not ns.startswith('@')})

        # Cast the CSVW column rdf:List into an RDF collection
        self.columns = Collection(self.metadata_graph, BNode(self.schema.csvw_column))
        # Under Python 3 the Item-based cast can come back empty, so fall back
        # to collecting the rdf:first objects from the metadata graph directly.
        if not self.columns:
            self.columns = [o for s, p, o in self.metadata_graph.triples((None, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#first"), None))]

    def convert_info(self):
        """Converts the CSVW JSON file to valid RDF for serializing into the Nanopublication publication info graph."""

        results = self.metadata_graph.query("""SELECT ?s ?p ?o
                                               WHERE { ?s ?p ?o .
                                                       FILTER(?p = csvw:valueUrl ||
                                                              ?p = csvw:propertyUrl ||
                                                              ?p = csvw:aboutUrl)}""")

        for (s, p, o) in results:
            # Use iribaker
            try:
                # Python 2
                escaped_object = URIRef(iribaker.to_iri(unicode(o)))
            except NameError:
                # Python 3
                escaped_object = URIRef(iribaker.to_iri(str(o)))

            # If the escaped IRI of the object is different from the original,
            # update the graph.
            if escaped_object != o:
                self.metadata_graph.set((s, p, escaped_object))
                # Add the provenance of this operation.
                try:
                    # Python 2
                    self.np.pg.add((escaped_object,
                                PROV.wasDerivedFrom,
                                Literal(unicode(o), datatype=XSD.string)))
                except NameError:
                    # Python 3
                    self.np.pg.add((escaped_object,
                                PROV.wasDerivedFrom,
                                Literal(str(o), datatype=XSD.string)))

        # Walk through the metadata graph to remove illegal "Resource" blank nodes caused by the Python 3 transition.
        for s, p, o in self.metadata_graph.triples((None, None, None)):
            if s.startswith("Resource("):
                self.metadata_graph.remove((s,p,o))
                self.metadata_graph.add((BNode(str(s)[9:-1]), p, o))
                logger.debug("removed a triple because it was not formatted right. (started with \"Resource\")")

        # Add the information of the schema file to the provenance graph of the
        # nanopublication
        self.np.ingest(self.metadata_graph, self.np.pg.identifier)

        return

    def convert(self):
        """Starts a conversion process (in parallel or as a single process) as defined in the arguments passed to the :class:`CSVWConverter` initialization"""
        logger.info("Starting conversion")

        # If the number of processes is set to 1, we start the 'simple' conversion (in a single process)
        if self._processes == 1:
            self._simple()
        # Otherwise, we start the parallel processing procedure, but fall back to simple conversion
        # if the parallel processing fails for some reason (this happens for some files;
        # the cause has not yet been determined).
        elif self._processes > 1:
            try:
                self._parallel()
            except TypeError:
                logger.info(
                    "TypeError in multiprocessing... falling back to serial conversion")
                self._simple()
            except Exception:
                logger.error(
                    "Some exception occurred, falling back to serial conversion")
                traceback.print_exc()
                self._simple()
        else:
            logger.error("Incorrect process count specification")

    def _simple(self):
        """Starts a single process for converting the file"""
        with open(self.target_file, 'wb') as target_file:
            with open(self.file_name, 'rb') as csvfile:
                logger.info("Opening CSV file for reading")
                reader = csv.DictReader(csvfile,
                                        encoding=self.encoding,
                                        delimiter=self.delimiter,
                                        quotechar=self.quotechar)

                logger.info("Starting in a single process")
                c = BurstConverter(self.np.ag.identifier, self.columns,
                                   self.schema, self.metadata_graph, self.encoding, self.output_format)
                # Out will contain an N-Quads serialized representation of the
                # converted CSV
                out = c.process(0, reader, 1)
                # We then write it to the file
                try:
                    # Python 2
                    target_file.write(out)
                except TypeError:
                    # Python 3
                    target_file.write(out.decode('utf-8'))

            self.convert_info()
            # Finally, write the nanopublication info to file
            target_file.write(self.np.serialize(format=self.output_format))

    def _parallel(self):
        """Starts parallel processes for converting the file. Each process will receive max ``chunksize`` number of rows"""
        with open(self.target_file, 'wb') as target_file:
            with open(self.file_name, 'rb') as csvfile:
                logger.info("Opening CSV file for reading")
                reader = csv.DictReader(csvfile,
                                        encoding=self.encoding,
                                        delimiter=self.delimiter,
                                        quotechar=self.quotechar)

                # Initialize a pool of processes (default=4)
                pool = mp.Pool(processes=self._processes)
                logger.info("Running in {} processes".format(self._processes))

                # The _burstConvert function is partially instantiated, and will be successively called with
                # chunksize rows from the CSV file
                burstConvert_partial = partial(_burstConvert,
                                               identifier=self.np.ag.identifier,
                                               columns=self.columns,
                                               schema=self.schema,
                                               metadata_graph=self.metadata_graph,
                                               encoding=self.encoding,
                                               chunksize=self._chunksize,
                                               output_format=self.output_format)

                # The result of each chunksize run will be written to the
                # target file
                for out in pool.imap(burstConvert_partial, enumerate(grouper(self._chunksize, reader))):
                    target_file.write(out)

                # Make sure to close and join the pool once finished.
                pool.close()
                pool.join()

            self.convert_info()
            # Finally, write the nanopublication info to file
            target_file.write(self.np.serialize(format=self.output_format))
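
_parallel relies on a grouper(n, iterable) helper imported from elsewhere in the project; a sketch of the behaviour assumed here (successive chunks of at most n CSV rows, fed to pool.imap via enumerate) is:

from itertools import islice

def grouper(n, iterable):
    """Yield successive lists of at most n items from iterable (assumed behaviour of the project's helper)."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk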