Exemplo n.º 1
0
    def __do_get(self):
        """
        Data is queried from VIVO and written to self.out_filename as a delimited text file
        suitable for editing using an editor or spreadsheet, and suitable for use by do_update.

        Columns are separated by self.inter; multiple values within one column are joined
        with self.intra.  The file is first written in uri order, then read back and rewritten
        sorted by the entity definition's 'order_by' column ('uri' if none is given or the
        named column does not exist).

        :return:  Number of rows of data
        """
        from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
        from improve.improve import improve
        import codecs

        #   Generate the get query, execute the query, shape the query results into the return object

        query = make_get_query(self.update_def)
        logger.debug(u"do_get query_parms\n{}".format(self.query_parms))
        logger.debug(u"do_get query\n{}".format(query))
        result_set = vivo_query(query, self.query_parms)
        data = make_get_data(self.update_def, result_set)

        columns = ['uri'] + self.update_def['entity_def']['order']
        column_defs = self.update_def['column_defs']

        #   Write out the file.  The with block guarantees the file is closed even if a row fails
        #   part way through.  Characters outside ascii are written as XML character references.

        with codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace') as outfile:
            outfile.write(self.inter.join(columns))  # header: column names separated by the inter separator
            outfile.write('\n')

            for uri in sorted(data.keys()):
                fields = []  # one output string per column, joined with self.inter below
                for name in columns:
                    if name not in data[uri]:
                        fields.append('')  # no value in VIVO: keep the column position, leave it empty
                        continue
                    values = data[uri][name]

                    if name in column_defs:
                        path = column_defs[name]

                        #   Warn/correct if path is unique and VIVO is not

                        if unique_path(path) and len(values) > 1:
                            logger.warning(u"VIVO has non-unique values for unique path {} at {} values {}".
                                           format(name, uri, values))
                            values = {next(iter(values))}  # Pick one element from multi-valued set
                            #   Pre-format the message like every other log call here.  Passing the
                            #   value as a separate argument to a "{}" template would be handed to
                            #   %-style formatting (assuming logger is a stdlib logging.Logger) and
                            #   the message would be lost.
                            logger.warning(u"Using {}".format(values))

                        #   The last step of the path carries the optional filter and enum settings
                        last_object = path[len(path) - 1]['object']

                        #   Handle filters

                        if self.filter and 'filter' in last_object:
                            filter_name = last_object['filter']
                            improved = set()
                            for was_string in values:
                                new_string = improve(filter_name, was_string)
                                if was_string != new_string:
                                    logger.debug(u"{} {} {} FILTER IMPROVED {} to {}".
                                                 format(uri, name, filter_name, was_string, new_string))
                                improved.add(new_string)
                            values = improved

                        #   Handle enumerations: translate each VIVO value through the enum's 'get'
                        #   table.  Values without a translation are dropped from the output.

                        if 'enum' in last_object:
                            enum_name = last_object['enum']
                            translated = set()
                            for x in values:
                                val = self.enum[enum_name]['get'].get(x, '')
                                if val != '':
                                    translated.add(val)
                                else:
                                    logger.warning(u"WARNING: Unable to find {} in {}. Blank substituted in {}".
                                                   format(x, enum_name, self.out_filename))
                            values = translated

                    #   Gather values into a delimited string.  Line breaks and tabs inside a value
                    #   are replaced by spaces so they cannot break the row/column layout.

                    val = self.intra.join(values)
                    fields.append(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '))

                #   Joining by position (rather than comparing each name with the last column name)
                #   keeps the separators correct even if a column name is repeated in the order.
                outfile.write(self.inter.join(fields))
                outfile.write('\n')

        #   Rewrite the file based on the order_by or uri if none

        sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
        data = read_csv(self.out_filename, delimiter=self.inter)
        try:
            order = sorted(data, key=lambda rown: data[rown][sort_column_name])
        except KeyError:
            logger.error(u"{} in order_by not found.  No such column name. Sorting by uri.".
                         format(sort_column_name))
            order = sorted(data, key=lambda rown: data[rown]['uri'])

        #   Renumber the rows 1..n in sorted order before writing them back
        sdata = {row: data[key] for row, key in enumerate(order, start=1)}
        write_csv(self.out_filename, sdata, delimiter=self.inter)

        return len(data)
Exemplo n.º 2
0
    def __do_get(self):
        """
        Data is queried from VIVO and returned as a tab delimited text file suitable for
        editing using an editor or spreadsheet, and suitable for use by do_update.

        :return:  Number of rows of data
        """
        from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
        import codecs
        import sys
        from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \
            improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name, \
            improve_org_name

        #   Generate the get query, execute the query, shape the query results into the return object

        query = make_get_query(self.update_def)
        if self.verbose:
            print self.query_parms
            print query
        result_set = vivo_query(query, self.query_parms, self.verbose)
        data = make_get_data(self.update_def, result_set)

        #   Write out the file

        outfile = codecs.open(self.out_filename,
                              mode='w',
                              encoding='ascii',
                              errors='xmlcharrefreplace')

        columns = ['uri'] + self.update_def['entity_def']['order']
        outfile.write(
            self.inter.join(columns)
        )  # write a header using the inter field separator between column names
        outfile.write('\n')

        for uri in sorted(data.keys()):
            for name in columns:
                if name in data[uri]:

                    # Translate VIVO values via enumeration if any

                    if name in self.update_def['column_defs']:
                        path = self.update_def['column_defs'][name]

                        # Warn/correct if path is unique and VIVO is not

                        if unique_path(path) and len(data[uri][name]) > 1:
                            print "WARNING. VIVO has non-unique values for unique path:", name, "at", uri, \
                                data[uri][name]
                            data[uri][name] = {
                                next(iter(data[uri][name]))
                            }  # Pick one element from multi-valued set
                            print data[uri][name]

                        # Handle filters

                        if self.filter and 'filter' in path[len(path) -
                                                            1]['object']:
                            a = set()
                            for x in data[uri][name]:
                                was_string = x
                                new_string = eval(
                                    path[len(path) - 1]['object']['filter'])(x)
                                if self.verbose and was_string != new_string:
                                    print uri, name, path[len(path) - 1]['object'][
                                        'filter'], "FILTER IMPROVED", was_string, 'to', \
                                        new_string
                                a.add(new_string)
                            data[uri][name] = a

                        # Handle enumerations

                        if 'enum' in path[len(path) - 1]['object']:
                            enum_name = path[len(path) - 1]['object']['enum']
                            a = set()
                            for x in data[uri][name]:
                                val = self.enum[enum_name]['get'].get(x, '')
                                if val != '':
                                    a.add(val)
                                else:
                                    print "WARNING: Unable to find ", x, "in", enum_name, \
                                        ". Blank substituted in", self.out_filename
                            data[uri][name] = a

                    # Gather values into a delimited string

                    val = self.intra.join(data[uri][name])
                    outfile.write(
                        val.replace('\r', ' ').replace('\n',
                                                       ' ').replace('\t', ' '))
                if name != columns[len(columns) - 1]:
                    outfile.write(self.inter)
            outfile.write('\n')

        outfile.close()

        # Rewrite the file based on the order_by or uri if none

        sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
        data = read_csv(self.out_filename, delimiter=self.inter)
        sdata = {}
        try:
            order = sorted(data, key=lambda rown: data[rown][sort_column_name])
        except KeyError:
            print >>sys.stderr, "ERROR: ", sort_column_name, \
                "in order_by not found.  No such column name. Sorting by uri."
            order = sorted(data, key=lambda rown: data[rown]['uri'])
        row = 1
        for o in order:
            sdata[row] = data[o]
            row += 1
        write_csv(self.out_filename, sdata, delimiter=self.inter)

        return len(data)
Exemplo n.º 3
0
 def test_write_csv(self):
     """
     Round-trip check for write_csv: data read from the buildings file is written to a
     second file, and reading that second file back must give the same data.
     """
     data = read_csv("data/buildings.txt", delimiter='\t')
     write_csv("data/buildings_out.txt", data, delimiter='\t')
     # Read back the file that was just written.  Re-reading the input file would make the
     # comparison trivially true and leave write_csv untested.
     data2 = read_csv("data/buildings_out.txt", delimiter='\t')
     # assertEqual reports the differing content on failure, unlike assertTrue(a == b)
     self.assertEqual(data, data2)
Exemplo n.º 4
0
    def __do_get(self):
        """
        Data is queried from VIVO and written to self.out_filename as a delimited text file
        suitable for editing using an editor or spreadsheet, and suitable for use by do_update.

        Columns are separated by self.inter; multiple values within one column are joined
        with self.intra.  The file is first written in uri order, then read back and rewritten
        sorted by the entity definition's 'order_by' column ('uri' if none is given or the
        named column does not exist).

        :return:  Number of rows of data
        """
        from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
        from improve.improve import improve
        import codecs

        #   Generate the get query, execute the query, shape the query results into the return object

        query = make_get_query(self.update_def)
        logger.debug(u"do_get query_parms\n{}".format(self.query_parms))
        logger.debug(u"do_get query\n{}".format(query))
        result_set = vivo_query(query, self.query_parms)
        data = make_get_data(self.update_def, result_set)

        columns = ['uri'] + self.update_def['entity_def']['order']
        column_defs = self.update_def['column_defs']

        #   Write out the file.  The with block guarantees the file is closed even if a row fails
        #   part way through.  Characters outside ascii are written as XML character references.

        with codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace') as outfile:
            outfile.write(self.inter.join(columns))  # header: column names separated by the inter separator
            outfile.write('\n')

            for uri in sorted(data.keys()):
                fields = []  # one output string per column, joined with self.inter below
                for name in columns:
                    if name not in data[uri]:
                        fields.append('')  # no value in VIVO: keep the column position, leave it empty
                        continue
                    values = data[uri][name]

                    if name in column_defs:
                        path = column_defs[name]

                        #   Warn/correct if path is unique and VIVO is not

                        if unique_path(path) and len(values) > 1:
                            logger.warning(u"VIVO has non-unique values for unique path {} at {} values {}".
                                           format(name, uri, values))
                            values = {next(iter(values))}  # Pick one element from multi-valued set
                            #   Pre-format the message like every other log call here.  Passing the
                            #   value as a separate argument to a "{}" template would be handed to
                            #   %-style formatting (assuming logger is a stdlib logging.Logger) and
                            #   the message would be lost.
                            logger.warning(u"Using {}".format(values))

                        #   The last step of the path carries the optional filter and enum settings
                        last_object = path[len(path) - 1]['object']

                        #   Handle filters

                        if self.filter and 'filter' in last_object:
                            filter_name = last_object['filter']
                            improved = set()
                            for was_string in values:
                                new_string = improve(filter_name, was_string)
                                if was_string != new_string:
                                    logger.debug(u"{} {} {} FILTER IMPROVED {} to {}".
                                                 format(uri, name, filter_name, was_string, new_string))
                                improved.add(new_string)
                            values = improved

                        #   Handle enumerations: translate each VIVO value through the enum's 'get'
                        #   table.  Values without a translation are dropped from the output.

                        if 'enum' in last_object:
                            enum_name = last_object['enum']
                            translated = set()
                            for x in values:
                                val = self.enum[enum_name]['get'].get(x, '')
                                if val != '':
                                    translated.add(val)
                                else:
                                    logger.warning(u"WARNING: Unable to find {} in {}. Blank substituted in {}".
                                                   format(x, enum_name, self.out_filename))
                            values = translated

                    #   Gather values into a delimited string.  Line breaks and tabs inside a value
                    #   are replaced by spaces so they cannot break the row/column layout.

                    val = self.intra.join(values)
                    fields.append(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '))

                #   Joining by position (rather than comparing each name with the last column name)
                #   keeps the separators correct even if a column name is repeated in the order.
                outfile.write(self.inter.join(fields))
                outfile.write('\n')

        #   Rewrite the file based on the order_by or uri if none

        sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
        data = read_csv(self.out_filename, delimiter=self.inter)
        try:
            order = sorted(data, key=lambda rown: data[rown][sort_column_name])
        except KeyError:
            logger.error(u"{} in order_by not found.  No such column name. Sorting by uri.".
                         format(sort_column_name))
            order = sorted(data, key=lambda rown: data[rown]['uri'])

        #   Renumber the rows 1..n in sorted order before writing them back
        sdata = {row: data[key] for row, key in enumerate(order, start=1)}
        write_csv(self.out_filename, sdata, delimiter=self.inter)

        return len(data)
Exemplo n.º 5
0
    def __do_get(self):
        """
        Data is queried from VIVO and returned as a tab delimited text file suitable for
        editing using an editor or spreadsheet, and suitable for use by do_update.

        :return:  Number of rows of data
        """
        from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
        import codecs
        import sys
        from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \
            improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name, \
            improve_org_name

        #   Generate the get query, execute the query, shape the query results into the return object

        query = make_get_query(self.update_def)
        if self.verbose:
            print self.query_parms
            print query
        result_set = vivo_query(query, self.query_parms, self.verbose)
        data = make_get_data(self.update_def, result_set)

        #   Write out the file

        outfile = codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace')

        columns = ['uri'] + self.update_def['entity_def']['order']
        outfile.write(self.inter.join(columns))  # write a header using the inter field separator between column names
        outfile.write('\n')

        for uri in sorted(data.keys()):
            for name in columns:
                if name in data[uri]:

                    # Translate VIVO values via enumeration if any

                    if name in self.update_def['column_defs']:
                        path = self.update_def['column_defs'][name]

                        # Warn/correct if path is unique and VIVO is not

                        if unique_path(path) and len(data[uri][name]) > 1:
                            print "WARNING. VIVO has non-unique values for unique path:", name, "at", uri, \
                                data[uri][name]
                            data[uri][name] = {next(iter(data[uri][name]))}  # Pick one element from multi-valued set
                            print data[uri][name]

                        # Handle filters

                        if self.filter and 'filter' in path[len(path) - 1]['object']:
                            a = set()
                            for x in data[uri][name]:
                                was_string = x
                                new_string = eval(path[len(path) - 1]['object']['filter'])(x)
                                if self.verbose and was_string != new_string:
                                    print uri, name, path[len(path) - 1]['object'][
                                        'filter'], "FILTER IMPROVED", was_string, 'to', \
                                        new_string
                                a.add(new_string)
                            data[uri][name] = a

                        # Handle enumerations

                        if 'enum' in path[len(path) - 1]['object']:
                            enum_name = path[len(path) - 1]['object']['enum']
                            a = set()
                            for x in data[uri][name]:
                                val = self.enum[enum_name]['get'].get(x, '')
                                if val != '':
                                    a.add(val)
                                else:
                                    print "WARNING: Unable to find ", x, "in", enum_name, \
                                        ". Blank substituted in", self.out_filename
                            data[uri][name] = a

                    # Gather values into a delimited string

                    val = self.intra.join(data[uri][name])
                    outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '))
                if name != columns[len(columns) - 1]:
                    outfile.write(self.inter)
            outfile.write('\n')

        outfile.close()

        # Rewrite the file based on the order_by or uri if none

        sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
        data = read_csv(self.out_filename, delimiter=self.inter)
        sdata = {}
        try:
            order = sorted(data, key=lambda rown: data[rown][sort_column_name])
        except KeyError:
            print >>sys.stderr, "ERROR: ", sort_column_name, \
                "in order_by not found.  No such column name. Sorting by uri."
            order = sorted(data, key=lambda rown: data[rown]['uri'])
        row = 1
        for o in order:
            sdata[row] = data[o]
            row += 1
        write_csv(self.out_filename, sdata, delimiter=self.inter)

        return len(data)