Example No. 1
    def test_write_csv_fp(self):
        data = read_csv("data/buildings.txt", delimiter='\t')
        fp = open('data/buildings_out.txt', 'w')
        write_csv_fp(fp, data, delimiter='\t')
        fp.close()
        data2 = read_csv("data/buildings_out.txt", delimiter='\t')  # read back the file just written
        self.assertTrue(data == data2)
Example No. 2
def load_enum(update_def):
    """
    Find all enumerations in the update_def. For each, read the corresponding enum file and build the corresponding
    pair of enum dictionaries.

    The two columns in the tab-delimited input file must be called "short" and "vivo".  "vivo" is the value to put in
    VIVO (update) or get from VIVO.  "short" is the human-usable short form.

    The input file name appears as the 'enum' value in update_def

    :return enumeration structure.  Pairs of dictionaries, one pair for each enumeration.  short -> vivo, vivo -> short
    """
    from vivopump import read_csv
#    import os
    enum = {}
    for path in update_def['column_defs'].values():
        for step in path:
            if 'object' in step and 'enum' in step['object']:
                enum_filename = step['object']['enum']
                enum_name = enum_filename
#                enum_name = os.path.splitext(os.path.split(enum_filename)[1])[0]
                if enum_name not in enum:
                    enum[enum_name] = {}
                    enum[enum_name]['get'] = {}
                    enum[enum_name]['update'] = {}
                    enum_data = read_csv(enum_filename, delimiter='\t')
                    for enum_datum in enum_data.values():
                        enum[enum_name]['get'][enum_datum['vivo']] = enum_datum['short']
                        enum[enum_name]['update'][enum_datum['short']] = enum_datum['vivo']
    return enum
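
Read against the docstring, the structure load_enum returns is easy to pin down: one get/update pair of dictionaries per enum file. A minimal, self-contained sketch of the pairing step; make_enum_pair and the two example rows are inventions for illustration, not vivopump code:

def make_enum_pair(enum_data):
    # enum_data has the shape read_csv returns: {row_number: {'short': ..., 'vivo': ...}}
    pair = {'get': {}, 'update': {}}
    for row in enum_data.values():
        pair['get'][row['vivo']] = row['short']      # vivo -> short
        pair['update'][row['short']] = row['vivo']   # short -> vivo
    return pair

rows = {1: {'short': 'PHD', 'vivo': 'http://vivo.example.edu/individual/n1'},
        2: {'short': 'MS', 'vivo': 'http://vivo.example.edu/individual/n2'}}
pair = make_enum_pair(rows)
assert pair['get']['http://vivo.example.edu/individual/n1'] == 'PHD'
assert pair['update']['MS'] == 'http://vivo.example.edu/individual/n2'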
Example No. 3
    def update(self, filename=None, inter='\t', intra=';'):
        """
        Prepare for the update, getting graph and update_data.  Then do the update, producing triples
        """

        from vivopump import read_csv, get_graph
        from rdflib import Graph
        import logging

        self.intra = intra
        self.inter = inter

        logging.basicConfig(level=logging.INFO)
        if filename is not None:
            self.out_filename = filename

        if self.update_data is None:  # Test for injection
            self.update_data = read_csv(self.out_filename, delimiter=inter)

        # Narrow the update_def to include only columns that appear in the update_data

        new_update_columns = {}
        for name, path in self.update_def['column_defs'].items():
            if name in self.update_data[1].keys():
                new_update_columns[name] = path
        self.update_def['column_defs'] = new_update_columns

        self.enum = load_enum(self.update_def)

        if self.original_graph is None:  # Test for injection
            self.original_graph = get_graph(self.update_def, debug=self.verbose)  # Create the original graph from VIVO

        self.update_graph = Graph()
        for s, p, o in self.original_graph:
            self.update_graph.add((s, p, o))

        if self.verbose:
            print datetime.now(), 'Graphs ready for processing. Original has ', len(self.original_graph), \
                '. Update graph has', len(self.update_graph)
            print datetime.now(), 'Updates ready for processing. ', len(self.update_data), 'rows.'
            if len(self.enum) == 0:
                print datetime.now(), "No enumerations"
            else:
                for key in self.enum.keys():
                    print datetime.now(), key, "get", len(self.enum[key]['get']), "update", \
                        len(self.enum[key]['update'])

        return self.do_update()
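
The narrowing step above relies on row 1 of update_data carrying every column name read from the spreadsheet, so column_defs entries for columns the user deleted are dropped before processing. The same idea in isolation, with invented column names and placeholder paths:

update_data = {1: {'uri': 'http://vivo.example.edu/individual/n1', 'name': 'Alice'}}
column_defs = {'name': [{'object': {}}], 'phone': [{'object': {}}]}  # invented paths
narrowed = dict((name, path) for name, path in column_defs.items() if name in update_data[1])
assert sorted(narrowed.keys()) == ['name']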
Example No. 4
    def update(self):
        """
        Prepare for the update, getting graph and update_data.  Then do the update, producing triples
        :return: list(graph, graph): The add and sub graphs for performing the update
        """
        from vivopump import read_csv, get_graph
        from rdflib import Graph
        import logging
        import os.path
        import time

        logging.basicConfig(level=logging.INFO)

        if self.update_data is None:  # Test for injection
            self.update_data = read_csv(self.out_filename, delimiter=self.inter)

        # Narrow the update_def to include only columns that appear in the update_data

        new_update_columns = {}
        for name, path in self.update_def['column_defs'].items():
            if name in self.update_data[self.update_data.keys()[0]].keys():
                new_update_columns[name] = path
        self.update_def['column_defs'] = new_update_columns

        if self.original_graph is None:  # Test for injection

            # Create the original graph from VIVO

            self.original_graph = get_graph(self.update_def, self.query_parms, debug=self.verbose)

        self.update_graph = Graph()
        for s, p, o in self.original_graph:
            self.update_graph.add((s, p, o))

        if self.verbose:
            print datetime.now(), 'Graphs ready for processing. Original has ', len(self.original_graph), \
                '. Update graph has', len(self.update_graph)
            print datetime.now(), 'Updates ready for processing. ', len(self.update_data), 'rows.'
            if len(self.enum) == 0:
                print datetime.now(), "No enumerations"
            else:
                for key in self.enum.keys():
                    print datetime.now(), key, "modified", time.ctime(os.path.getmtime(key)), \
                        "get", len(self.enum[key]['get']), "update", \
                        len(self.enum[key]['update'])

        return self.__do_update()
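
update_graph starts as a triple-for-triple copy of original_graph, and __do_update() then edits the copy. Under that design, the add and sub graphs the docstring promises amount to set differences between the two graphs, which rdflib supports directly. A sketch of that relationship under that assumption; the URIs are invented:

from rdflib import Graph, URIRef

original_graph = Graph()
update_graph = Graph()
s = URIRef('http://vivo.example.edu/individual/n1')
p = URIRef('http://www.w3.org/2000/01/rdf-schema#label')
original_graph.add((s, p, URIRef('http://vivo.example.edu/individual/old')))
update_graph.add((s, p, URIRef('http://vivo.example.edu/individual/new')))

add_graph = update_graph - original_graph  # triples to add to VIVO
sub_graph = original_graph - update_graph  # triples to remove from VIVO
assert len(add_graph) == 1 and len(sub_graph) == 1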
Example No. 5
    def update(self):
        """
        Prepare for the update, getting graph and update_data.  Then do the update, producing triples
        :return: list(graph, graph): The add and sub graphs for performing the update
        """
        from vivopump import read_csv, get_graph
        from rdflib import Graph
        import os.path
        import time

        if self.update_data is None:  # Test for injection
            self.update_data = read_csv(self.out_filename, delimiter=self.inter)

        #   Narrow the update_def to include only columns that appear in the update_data

        new_update_columns = {}
        for name, path in self.update_def['column_defs'].items():
            if len(self.update_data) > 0 and name in self.update_data[self.update_data.keys()[0]].keys():
                new_update_columns[name] = path

        self.update_def['column_defs'] = new_update_columns

        if self.original_graph is None:  # Test for injection

            # Create the original graph from VIVO

            self.original_graph = get_graph(self.update_def, self.query_parms)

        self.update_graph = Graph()
        for s, p, o in self.original_graph:
            self.update_graph.add((s, p, o))

        logger.info(u'Graphs ready for processing. Original has {} triples.  Update graph has {} triples.'.format(
            len(self.original_graph), len(self.update_graph)))
        logger.info(u'Updates ready for processing. {} rows in update.'.format(len(self.update_data)))

        if len(self.enum) == 0:
            logger.info(u"No enumerations")
        else:
            for key in self.enum.keys():
                logger.info(
                    u"Enumeration {} modified {}. {} entries in get enum.  {} entries in update enum".format(
                        key, time.ctime(os.path.getmtime(key)), len(self.enum[key]['get']),
                        len(self.enum[key]['update'])))
        return self.__do_update()
Example No. 6
    def update(self):
        """
        Prepare for the update, getting graph and update_data.  Then do the update, producing triples
        :return: list(graph, graph): The add and sub graphs for performing the update
        """
        from vivopump import read_csv, get_graph
        from rdflib import Graph
        import os.path
        import time

        if self.update_data is None:  # Test for injection
            self.update_data = read_csv(self.out_filename, delimiter=self.inter)

        #   Narrow the update_def to include only columns that appear in the update_data

        new_update_columns = {}
        for name, path in self.update_def['column_defs'].items():
            if name in self.update_data[self.update_data.keys()[0]].keys():
                new_update_columns[name] = path
        self.update_def['column_defs'] = new_update_columns

        if self.original_graph is None:  # Test for injection

            # Create the original graph from VIVO

            self.original_graph = get_graph(self.update_def, self.query_parms)

        self.update_graph = Graph()
        for s, p, o in self.original_graph:
            self.update_graph.add((s, p, o))

        logger.info(u'Graphs ready for processing. Original has {} triples.  Update graph has {} triples.'.format(
            len(self.original_graph), len(self.update_graph)))
        logger.info(u'Updates ready for processing. {} rows in update.'.format(len(self.update_data)))

        if len(self.enum) == 0:
            logger.info(u"No enumerations")
        else:
            for key in self.enum.keys():
                logger.info(
                    u"Enumeration {} modified {}. {} entries in get enum.  {} entries in update enum".format(
                        key, time.ctime(os.path.getmtime(key)), len(self.enum[key]['get']),
                        len(self.enum[key]['update'])))
        return self.__do_update()
Example No. 7
    In processing of data for UF people, a previous filter (merge_filter) determines whether the person was
    in the source and/or VIVO and sets the value of the 'current' column to 'yes' if the person is current and 'no'
    otherwise.
"""

__author__ = "Michael Conlon"
__copyright__ = "Copyright 2015 (c), Michael Conlon"
__license__ = "New BSD License"
__version__ = "0.01"

from vivopump import read_csv_fp, write_csv_fp, get_vivo_types, get_parms, read_csv
import sys

parms = get_parms()
type_data = read_csv('person_types.txt', delimiter='\t')
type_enum = {type_data[row]['vivo']: type_data[row]['short'] for row in type_data}  # convert spreadsheet to dict
plan_data = read_csv('salary_plan_enum.txt', delimiter='\t')
plan_enum = {plan_data[row]['short']: plan_data[row]['vivo'] for row in plan_data}  # convert spreadsheet to dict
vivo_types = get_vivo_types("?uri a uf:UFEntity . ?uri a foaf:Person .", parms)  # must match entity_sparql
data_in = read_csv_fp(sys.stdin)
data_out = {}
for row, data in data_in.items():
    new_data = dict(data)
Example No. 8
__copyright__ = "Copyright 2015 (c) Michael Conlon"
__license__ = "BSD 3-Clause license"
__version__ = "0.3"

from datetime import datetime
from vivopump import read_csv
import shelve
import os

#   Start here

print datetime.now(), "Start"

# Contact

contact_data = read_csv('contact_data.txt')
try:
    os.remove('contact')
except OSError:
    pass
contact = shelve.open('contact')
k = 0
for row, val in contact_data.items():
    k += 1
    if k % 1000 == 0:
        print k
    contact[str(val['UFID'])] = val
print datetime.now(), 'Contact has ', len(contact), 'entries'
contact.close()
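
The shelve built above behaves like a persistent dictionary keyed by UFID string, so later steps can look people up without holding the whole contact file in memory. A usage sketch; the UFID value is made up:

import shelve

contact = shelve.open('contact')
row = contact.get('12345678')  # the stored row dict for that UFID, or None if absent
if row is not None:
    print(row['UFID'])
contact.close()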

# Deptid_exceptions
Example No. 9
    def test_read_csv_keys(self):
        data = read_csv("data/extension.txt", delimiter='\t')
        print data
        self.assertTrue(data.keys() == range(1, 74))
Example No. 10
    def test_read_csv_minimal(self):
        data = read_csv("data/minimal.txt", delimiter='|')
        data_string = "{1: {u'overview': u'None', u'uri': u'http://vivo.ufl.edu/individual/n7023304'}}"
        self.assertEqual(data_string, str(data))
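
Taken together, these two tests pin down read_csv's contract: a dict keyed by 1-based row number, each value a dict mapping column name to cell string. A minimal sketch with that shape, built on csv.DictReader; this illustrates the contract and is not the vivopump implementation (which evidently also returns unicode values):

import csv

def read_csv_sketch(filename, delimiter=','):
    # Return {1: {column: value, ...}, 2: {...}, ...} keyed by 1-based row number
    data = {}
    with open(filename) as f:
        for row_number, row in enumerate(csv.DictReader(f, delimiter=delimiter), 1):
            data[row_number] = dict(row)
    return data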
Example No. 11
__copyright__ = "Copyright 2015, University of Florida"
__license__ = "BSD 3-Clause license"
__version__ = "0.3"

from datetime import datetime
from vivopump import read_csv
import shelve
import os

#   Start here

print datetime.now(), "Start"

# Contact

contact_data = read_csv('contact_data.txt')
try:
    os.remove('contact')
except OSError:
    pass
contact = shelve.open('contact')
k = 0
for row, val in contact_data.items():
    k += 1
    if k % 1000 == 0:
        print k
    contact[str(val['UFID'])] = val
print datetime.now(), 'Contact has ', len(contact), 'entries'
contact.close()

# Deptid_exceptions
Example No. 12
          parser.  This string of six characters should be replaced everywhere
          with a single space.

    In addition, Thomson Reuters uses a series of abbreviations for journal
    names and publishers that can be improved on a case by case basis.

    This program reads a file of improvements, and a bibtex file from stdin,
    makes the improvements that need to be made, and writes an improved
    file to stdout

    Version 1.0 2012-08-25 MC
    --  Added additional publisher name corrections
    Version 1.1 2014-01-13 MC
    --  All data moved to a CSV file
    --  Conform with commenting and coding standards
"""
__author__ = "Michael Conlon"
__copyright__ = "Copyright 2014, University of Florida"
__license__ = "BSD 3-Clause license"
__version__ = "1.1"

import sys
import fileinput
from vivopump import read_csv
fix_bibtex = read_csv("fix_bibtex.csv")
for line in fileinput.input():
    for row in fix_bibtex.values():
        line = line.replace(row['original'], row['improved'])
    sys.stdout.write(line)

Example No. 13
#!/usr/bin/env python

"""
    salary_plan_filter.py -- include only people with a qualifying salary plan
"""

__author__ = "Michael Conlon"
__copyright__ = "Copyright 2015, University of Florida"
__license__ = "New BSD License"
__version__ = "0.01"

from vivopump import read_csv_fp, read_csv, write_csv_fp
import sys

plan_data = read_csv('salary_plan_enum.txt', delimiter='\t')
vivo_plans = [plan_data[x]['short'] for x in plan_data if plan_data[x]['vivo'] != "None"]  # list of qualifying plans
data_in = read_csv_fp(sys.stdin)
print >>sys.stderr, 'Data in', len(data_in)
data_out = {}
qualify = 0
do_not_qualify = 0
for row, data in data_in.items():
    new_data = dict(data)
    if new_data['SAL_ADMIN_PLAN'] in vivo_plans:
        qualify += 1
        new_data['types'] = new_data['SAL_ADMIN_PLAN']
        data_out[row] = new_data
    else:
        do_not_qualify += 1

print >>sys.stderr, 'Qualify', qualify
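
The scrape cuts the script off here. Given that write_csv_fp is imported above but never called, a plausible ending, offered as a guess rather than recovered source, would report the non-qualifying count and stream the surviving rows to stdout:

print >>sys.stderr, 'Do not qualify', do_not_qualify
write_csv_fp(sys.stdout, data_out, delimiter='\t')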
Example No. 14
    def __do_get(self):
        """
        Data is queried from VIVO and returned as a tab delimited text file suitable for
        editing using an editor or spreadsheet, and suitable for use by do_update.

        :return:  Number of rows of data
        """
        from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
        import codecs
        import sys
        from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \
            improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name, \
            improve_org_name

        #   Generate the get query, execute the query, shape the query results into the return object

        query = make_get_query(self.update_def)
        if self.verbose:
            print self.query_parms
            print query
        result_set = vivo_query(query, self.query_parms, self.verbose)
        data = make_get_data(self.update_def, result_set)

        #   Write out the file

        outfile = codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace')

        columns = ['uri'] + self.update_def['entity_def']['order']
        outfile.write(self.inter.join(columns))  # write a header using the inter field separator between column names
        outfile.write('\n')

        for uri in sorted(data.keys()):
            for name in columns:
                if name in data[uri]:

                    # Translate VIVO values via enumeration if any

                    if name in self.update_def['column_defs']:
                        path = self.update_def['column_defs'][name]

                        # Warn/correct if path is unique and VIVO is not

                        if unique_path(path) and len(data[uri][name]) > 1:
                            print "WARNING. VIVO has non-unique values for unique path:", name, "at", uri, \
                                data[uri][name]
                            data[uri][name] = {next(iter(data[uri][name]))}  # Pick one element from multi-valued set
                            print data[uri][name]

                        # Handle filters

                        if self.filter and 'filter' in path[len(path) - 1]['object']:
                            a = set()
                            for x in data[uri][name]:
                                was_string = x
                                new_string = eval(path[len(path) - 1]['object']['filter'])(x)
                                if self.verbose and was_string != new_string:
                                    print uri, name, path[len(path) - 1]['object'][
                                        'filter'], "FILTER IMPROVED", was_string, 'to', \
                                        new_string
                                a.add(new_string)
                            data[uri][name] = a

                        # Handle enumerations

                        if 'enum' in path[len(path) - 1]['object']:
                            enum_name = path[len(path) - 1]['object']['enum']
                            a = set()
                            for x in data[uri][name]:
                                val = self.enum[enum_name]['get'].get(x, '')
                                if val != '':
                                    a.add(val)
                                else:
                                    print "WARNING: Unable to find ", x, "in", enum_name, \
                                        ". Blank substituted in", self.out_filename
                            data[uri][name] = a

                    # Gather values into a delimited string

                    val = self.intra.join(data[uri][name])
                    outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '))
                if name != columns[len(columns) - 1]:
                    outfile.write(self.inter)
            outfile.write('\n')

        outfile.close()

        # Rewrite the file based on the order_by or uri if none

        sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
        data = read_csv(self.out_filename, delimiter=self.inter)
        sdata = {}
        try:
            order = sorted(data, key=lambda rown: data[rown][sort_column_name])
        except KeyError:
            print >>sys.stderr, "ERROR: ", sort_column_name, \
                "in order_by not found.  No such column name. Sorting by uri."
            order = sorted(data, key=lambda rown: data[rown]['uri'])
        row = 1
        for o in order:
            sdata[row] = data[o]
            row += 1
        write_csv(self.out_filename, sdata, delimiter=self.inter)

        return len(data)
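
The closing passage of __do_get is a small, self-contained pattern: read the rows back, sort on the order_by column (falling back to uri when the column does not exist), and renumber from 1. The same logic isolated as a sketch; sort_rows and the sample rows are invented for illustration:

def sort_rows(data, sort_column, fallback='uri'):
    # data: {row_number: {column: value}} as returned by read_csv
    try:
        order = sorted(data, key=lambda r: data[r][sort_column])
    except KeyError:
        order = sorted(data, key=lambda r: data[r][fallback])
    return dict((new, data[old]) for new, old in enumerate(order, 1))

rows = {1: {'uri': 'n2', 'name': 'b'}, 2: {'uri': 'n1', 'name': 'a'}}
assert sort_rows(rows, 'name')[1]['name'] == 'a'
assert sort_rows(rows, 'missing')[1]['uri'] == 'n1'  # falls back to sorting by uri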
Example No. 15
In addition, Thomson Reuters uses a series of abbreviations for journal
names and publishers that can be improved on a case by case basis.

This program reads a file of improvements, and a bibtex file from stdin,
makes the improvements that need to be made, and writes an improved
file to stdout.

Version 1.0 2012-08-25 MC
--  Added additional publisher name corrections
Version 1.1 2014-01-13 MC
--  All data moved to a CSV file
--  Conform with commenting and coding standards
"""

__author__ = "Michael Conlon"
__copyright__ = "Copyright 2014, University of Florida"
__license__ = "BSD 3-Clause license"
__version__ = "1.1"

import sys
import fileinput
from vivopump import read_csv

names = read_csv("filters/publisher_name_filter.csv")

for line in fileinput.input():
    for row in names.values():
        line = line.replace(row['original'], row['improved'])
    sys.stdout.write(line)
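
Everything is driven by the improvements file: each row pairs a literal string to find ('original') with its replacement ('improved'). A toy demonstration of the replacement loop; the abbreviation row is invented, not taken from the real filter file:

# Illustrative row of the kind read_csv would return
names = {1: {'original': 'AMER CHEM SOC', 'improved': 'American Chemical Society'}}

line = 'publisher = {AMER CHEM SOC},\n'
for row in names.values():
    line = line.replace(row['original'], row['improved'])
assert line == 'publisher = {American Chemical Society},\n'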
Example No. 16
    def __do_get(self):
        """
        Data is queried from VIVO and returned as a tab delimited text file suitable for
        editing using an editor or spreadsheet, and suitable for use by do_update.

        :return:  Number of rows of data
        """
        from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
        from improve.improve import improve
        import codecs

        #   Generate the get query, execute the query, shape the query results into the return object

        query = make_get_query(self.update_def)
        logger.debug(u"do_get query_parms\n{}".format(self.query_parms))
        logger.debug(u"do_get query\n{}".format(query))
        result_set = vivo_query(query, self.query_parms)
        data = make_get_data(self.update_def, result_set)

        #   Write out the file

        outfile = codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace')

        columns = ['uri'] + self.update_def['entity_def']['order']
        outfile.write(self.inter.join(columns))  # write a header using the inter field separator between column names
        outfile.write('\n')

        for uri in sorted(data.keys()):
            for name in columns:
                if name in data[uri]:

                    #   Translate VIVO values via enumeration if any

                    if name in self.update_def['column_defs']:
                        path = self.update_def['column_defs'][name]

                        #   Warn/correct if path is unique and VIVO is not

                        if unique_path(path) and len(data[uri][name]) > 1:
                            logger.warning(u"VIVO has non-unique values for unique path {} at {} values {}".
                                           format(name, uri, data[uri][name]))
                            data[uri][name] = {next(iter(data[uri][name]))}  # Pick one element from multi-valued set
                            logger.warning(u"Using {}", data[uri][name])

                        #   Handle filters

                        if self.filter and 'filter' in path[len(path) - 1]['object']:
                            a = set()
                            for x in data[uri][name]:
                                was_string = x
                                new_string = improve(path[len(path) - 1]['object']['filter'], x)
                                if was_string != new_string:
                                    logger.debug(u"{} {} {} FILTER IMPROVED {} to {}".
                                                 format(uri, name, path[len(path) - 1]['object']['filter'],
                                                        was_string, new_string))
                                a.add(new_string)
                            data[uri][name] = a

                        #   Handle enumerations

                        if 'enum' in path[len(path) - 1]['object']:
                            enum_name = path[len(path) - 1]['object']['enum']
                            a = set()
                            for x in data[uri][name]:
                                val = self.enum[enum_name]['get'].get(x, '')
                                if val != '':
                                    a.add(val)
                                else:
                                    logger.warning(u"WARNING: Unable to find {} in {}. Blank substituted in {}".
                                                   format(x, enum_name, self.out_filename))
                            data[uri][name] = a

                    #   Gather values into a delimited string

                    val = self.intra.join(data[uri][name])
                    outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '))
                if name != columns[len(columns) - 1]:
                    outfile.write(self.inter)
            outfile.write('\n')

        outfile.close()

        #   Rewrite the file based on the order_by or uri if none

        sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
        data = read_csv(self.out_filename, delimiter=self.inter)
        sdata = {}
        try:
            order = sorted(data, key=lambda rown: data[rown][sort_column_name])
        except KeyError:
            logger.error(u"{} in order_by not found.  No such column name. Sorting by uri.".
                         format(sort_column_name))
            order = sorted(data, key=lambda rown: data[rown]['uri'])
        row = 1
        for o in order:
            sdata[row] = data[o]
            row += 1
        write_csv(self.out_filename, sdata, delimiter=self.inter)

        return len(data)
Example No. 17
    If the person is in the position data, they are current; if not, they are not current.

    In processing of data for UF people, a previous filter (merge_filter) determines whether the person was
    in the source and/or VIVO and sets the value of the 'current' column to 'yes' if the person is current and 'no'
    otherwise.
"""

__author__ = "Michael Conlon"
__copyright__ = "Copyright 2015 (c), Michael Conlon"
__license__ = "New BSD License"
__version__ = "0.01"

from vivopump import read_csv_fp, write_csv_fp, get_vivo_types, read_csv
import sys

type_data = read_csv('people_types.txt', delimiter='\t')
type_enum = {type_data[row]['vivo']: type_data[row]['short'] for row in type_data}  # convert spreadsheet to dict
plan_data = read_csv('salary_plan_enum.txt', delimiter='\t')
plan_enum = {plan_data[row]['short']: plan_data[row]['vivo'] for row in plan_data}  # convert spreadsheet to dict
vivo_types = get_vivo_types("?uri a uf:UFEntity . ?uri a foaf:Person .")  # must match entity_sparql
data_in = read_csv_fp(sys.stdin)
data_out = {}
for row, data in data_in.items():
    new_data = dict(data)

    #   Convert the source type to a VIVO type.  The source has an HR code.  Convert that to a VIVO person type URI
    #   using the plan_enum.  Then convert that to the value to be stored in the type data.  Whew.

    src_type = new_data['types']
    if src_type in plan_enum:
        src_type = type_enum[plan_enum[src_type]]
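
The two-hop conversion the comment describes is easier to follow with concrete values. A worked example; the plan code and URI are invented:

plan_enum = {'TA12': 'http://vivo.example.edu/individual/n10'}     # HR code -> VIVO type URI (short -> vivo)
type_enum = {'http://vivo.example.edu/individual/n10': 'faculty'}  # VIVO type URI -> short name (vivo -> short)

src_type = 'TA12'
if src_type in plan_enum:
    src_type = type_enum[plan_enum[src_type]]
assert src_type == 'faculty'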
Example No. 18
    def test_write_csv(self):
        data = read_csv("data/buildings.txt", delimiter='\t')
        write_csv("data/buildings_out.txt", data, delimiter='\t')
        data2 = read_csv("data/buildings_out.txt", delimiter='\t')  # read back the file just written
        self.assertTrue(data == data2)