Example #1
0
File: pump.py  Project: senrabc/vivo-pump
def get_step_triples(update_graph, uri, step_def, debug=True):
    """
    Return the triples matching the criteria defined in the current step of an update
    :param update_graph: the update graph
    :param uri: uri of the entity currently the subject of an update
    :param step_def: step definition from update_def
    :return:  Graph containing one or more triples that match the criteria for the step
    """
    from rdflib import Graph
    from vivopump import vivo_query, add_qualifiers, make_rdf_term
    if 'qualifier' not in step_def['object']:
        g = update_graph.triples((uri, step_def['predicate']['ref'], None))
    else:
        q = 'select (?' + step_def['object']['name'] +' as ?o) where { <' + str(uri) + '> <' + \
            str(step_def['predicate']['ref']) + '> ?' + step_def['object']['name'] + ' .\n' + \
            add_qualifiers([step_def]) + ' }\n'
        if debug:
            print "\nStep Triples Query\n", q
        result_set = vivo_query(q)
        g = Graph()
        for binding in result_set['results']['bindings']:
            o = make_rdf_term(binding['o'])
            g.add((uri, step_def['predicate']['ref'], o))
        if debug:
            print "Step Triples", len(g)
    return g
Example #2
0
def get_step_triples(update_graph, uri, step_def, debug=True):
    """
    Return the triples matching the criteria defined in the current step of an update
    :param update_graph: the update graph
    :param uri: uri of the entity currently the subject of an update
    :param step_def: step definition from update_def
    :return:  Graph containing one or more triples that match the criteria for the step
    """
    from rdflib import Graph
    from vivopump import vivo_query, add_qualifiers, make_rdf_term
    if 'qualifier' not in step_def['object']:
        g = update_graph.triples((uri, step_def['predicate']['ref'], None))
    else:
        q = 'select (?' + step_def['object']['name'] +' as ?o) where { <' + str(uri) + '> <' + \
            str(step_def['predicate']['ref']) + '> ?' + step_def['object']['name'] + ' .\n' + \
            add_qualifiers([step_def]) + ' }\n'
        if debug:
            print "\nStep Triples Query\n", q
        result_set = vivo_query(q)
        g = Graph()
        for binding in result_set['results']['bindings']:
            o = make_rdf_term(binding['o'])
            g.add((uri, step_def['predicate']['ref'], o))
        if debug:
            print "Step Triples", len(g)
    return g
Example #3
0
 def test_vivo_query(self):
     result = vivo_query("""
     SELECT ?label
     WHERE { <http://vivo.school.edu/individual/n1133> rdfs:label ?label }
     """, debug=True)
     print result
     self.assertTrue(len(result) > 0)
Example #4
0
 def test_bad_request(self):
     from SPARQLWrapper import SPARQLExceptions
     with self.assertRaises(SPARQLExceptions.QueryBadFormed):
         result = vivo_query("""
         SEWECT ?label
         WHERE { <http://vivo.ufl.edu/individual/n25562> rdfs:label ?label }
         """, debug=True)
         print result
Example #5
0
def find_author(author):
    """
    Given an author object with name parts, return the smallest set of uris
    that match the author in VIVO.  Could be an empty set, could be a singleton,
    could be a set requiring further disambiguation
    """
    from vivopump import vivo_query
    case = author_case(author)
    queries = author_queries(case, author)
    author_uri_set = set([])
    for query in queries:
        result = vivo_query(query.encode('utf-8'))
        count = len(result['results']['bindings'])
        if count == 1:
            #   Exactly one match: unambiguous, stop searching
            author_uri_set = set([result['results']['bindings'][0] \
                                      ['uri']['value']])
            break
        elif 1 < count < len(author_uri_set):
            #   Keep a multi-match result only when it shrinks a previously
            #   collected set.
            #   NOTE(review): while author_uri_set is still empty, len() is 0 and
            #   this branch can never fire, so the first multi-match result is
            #   always discarded.  Presumably the queries are ordered broad to
            #   narrow and this is intended -- confirm against author_queries.
            author_uri_set = set([])
            for row in result['results']['bindings']:
                author_uri_set.add(row['uri']['value'])
    return author_uri_set
Example #6
0
def get_vivo_academic_articles(parms):
    """
    Query VIVO and return a list of all the academic articles.
    @see uf_examples/publications/filters/pub_match_filter.py
    @see https://wiki.duraspace.org/display/VIVO/VIVO-ISF+1.6+relationship+diagrams%3A+Authorship

    :param: parms: vivo_query params
    :return: dictionary of uri keyed by DOI
    """
    query = """
    SELECT
    ?uri ?doi
    WHERE {
        ?uri a vivo:InformationResource .
        ?uri bibo:doi ?doi .
    }
    """
    results = vivo_query(query, parms)
    #   One entry per DOI; if a DOI appears twice the last uri seen wins,
    #   exactly as dict(zip(doi_list, uri_list)) would behave
    return {binding['doi']['value']: binding['uri']['value']
            for binding in results['results']['bindings']}
Example #7
0
File: pubmed.py  Project: ctsit/vivo-pump
def get_person_vivo_pmids(uri, query_parms):
    """
    Given the uri of a person, query VIVO to get a list of the person's publications with pmids
    :param uri: uri of the person in VIVO
    :param query_parms: vivo_query connection parameters
    :return: a dictionary keyed by pmid with uris of the pubs for each pmid
    """
    from pump.vivopump import vivo_query
    query = """SELECT (MAX(?paper_uri) AS ?puri) ?pmid
    WHERE {
        <{}> vivo:relatedBy ?a .
        ?a a vivo:Authorship .
        ?a vivo:relates ?paper_uri .
        ?paper_uri a bibo:AcademicArticle .
        ?paper_uri bibo:pmid ?pmid .
    }
    GROUP BY ?pmid
    """
    #   str.replace, not str.format: the SPARQL braces in the template would be
    #   parsed by format() as malformed replacement fields and raise ValueError
    query = query.replace('{}', uri)
    a = vivo_query(query, query_parms)
    #   GROUP BY ?pmid guarantees one row per pmid, so zip produces unique keys
    pmid = [x['pmid']['value'] for x in a['results']['bindings']]
    puri = [x['puri']['value'] for x in a['results']['bindings']]
    return dict(zip(pmid, puri))
Example #8
0
File: pubmed.py  Project: ctsit/vivo-pump
def get_person_catalyst_pmids(uri, query_parms):
    """
    Given a person uri, collect the attributes needed to call get_pmids and return two lists:
    a list of pubs for the person found in VIVO, and a list of pubs for the person found by
    the catalyst service
    :param uri: the uri of a person in VIVO
    :param query_parms: vivo_query connection parameters
    :return: A dictionary of two lists, the vivo_pmids and the catalyst_pmids
    """
    from vivopump import vivo_query
    query = """
    SELECT ?first ?middle ?last ?email ?affiliation
    WHERE {
      <{}>
    }
    """
    #   str.replace, not str.format: format() raises ValueError on the SPARQL
    #   braces in the template (they parse as malformed replacement fields
    #   before the uri is ever substituted).  Same idiom as get_person_vivo_pmids.
    query = query.replace('{}', uri)
    a = vivo_query(query, query_parms)
    first = a['results']['bindings'][0]['first']['value']
    #   NOTE(review): middle/last/emails/affiliations are stubbed to None and the
    #   WHERE clause has no triple patterns yet -- this function looks unfinished
    middle = None
    last = None
    emails = None
    affiliations = None
    return get_catalyst_pmids(first, middle, last, emails, affiliations)
Example #9
0
    def _get_step_triples(self, uri, step_def):
        """
        Return the triples matching the criteria defined in the current step of an update
        :param uri: uri of the entity currently the subject of an update
        :param step_def: step definition from update_def
        :return:  Graph containing zero or more triples that match the criteria for the step
        """
        from rdflib import Graph, RDF
        from vivopump import add_qualifiers, vivo_query, make_rdf_term

        def step_graph(uris, pred, otype=None, graph=self.update_graph):
            """
            Given a list of uri, a pred and a type, return a graph of the update_graph triples satisfying
                uri pred any   <- these are the returned triples
                any a type
            :param uris: list of uris.
            :param pred: the predicate to use in selecting triples for the step_graph
            :param otype: the object type to use.  default in None, and no type selection will be done.
            :param graph: default is update_graph. Closure sieve requires original_graph
            :return: graph
            """
            sg = Graph()
            for suri in uris:
                for obj in graph.objects(suri, pred):
                    if otype is None:
                        sg.add((suri, pred, obj))
                    # Type filtering is always against update_graph, even when
                    # graph=original_graph was passed for the closure sieve
                    elif (obj, RDF.type, otype) in self.update_graph:
                        sg.add((suri, pred, obj))

            return sg

        def sieve_triples(sgc, column_name):
            """
            Given a step graph of triples from a closure (sgc), and the current column_name,
            select the triples from the closure graph that have a path from the entity_uri to
            one or more objects in the closure.  If there is no path, return an empty graph.
            :param sgc:  the step closure graph to be "sieved"
            :param column_name: the name of the column to use
            :return: the sieved closure graph
            """

            # NOTE(review): Python 2 debug print statements below appear to be
            # development leftovers; the rest of the method logs via logger
            print "\nBeginning Closure Graph for", column_name
            for (s, p, o) in sgc.triples((None, None, None)):
                print s, p, o

            if len(sgc) == 0:
                return sgc  # Nothing to sieve
            else:
                # Walk the column's path from the entity uri through original_graph,
                # one step_def at a time, collecting the reachable objects
                pred = self.update_def['column_defs'][column_name][0]['predicate']['ref']
                otype = self.update_def['column_defs'][column_name][0]['object'].get('type', None)
                sg = step_graph([self.entity_uri], pred, otype, graph=self.original_graph)
                if len(sg) == 0 or len(self.update_def['column_defs'][column_name]) == 1:
                    return sg
                print "step 0 graph"
                for (s, p, o) in sg.triples((None, None, None)):
                    print s, p, o
                for step in self.update_def['column_defs'][column_name][1:]:
                    # objects(None, None) yields every object in sg -- the frontier
                    # for the next step of the path
                    sg = step_graph([y for y in sg.objects(None, None)], step['predicate']['ref'],
                                    step['object'].get('type', None), graph=self.original_graph)
                    print "next step graph"
                    for (s, p, o) in sg.triples((None, None, None)):
                        print s, p, o
                if len(sg) == 0:
                    return sg  # column path is empty, so nothing in the closure can match

                #   Wait for it .... Here's the sieve.  Return triples in the closure graph that have
                #   objects on the column graph

                sgr = Graph()
                for (sgcs, sgcp, sgco) in sgc.triples((None, None, None)):
                    if sgco in sg.objects(None, None):
                        sgr.add((sgcs, sgcp, sgco))

                print "reduced step graph"
                for (s, p, o) in sgr.triples((None, None, None)):
                    print s, p, o

            return sgr

        if 'qualifier' not in step_def['object']:

            g = step_graph([uri], step_def['predicate']['ref'], step_def['object'].get('type', None))

            # print "\nStep_triples for", step_def['column_name'], [uri],
            # step_def['predicate']['ref'], step_def['object'].get('type', None)

            for (s, p, o) in g.triples((None, None, None)):
                print unicode(s), unicode(p), unicode(o)

            #   If the step_def is in a closure, and its the last step in the closure, then the
            #   closure triples must be sieved against the objects defined by the column.

            if step_def['closure'] and step_def['last']:

                g = sieve_triples(g, step_def['column_name'])
        else:

            #   Handle non-specific predicates qualified by SPARQL (a rare case for VIVO-ISF)

            q = 'select (?' + step_def['object']['name'] + ' as ?o) where { <' + str(uri) + '> <' + \
                str(step_def['predicate']['ref']) + '> ?' + step_def['object']['name'] + ' . \n' + \
                add_qualifiers([step_def]) + ' }\n'
            logger.debug(u"Qualified Step Triples Query {}".format(q))
            result_set = vivo_query(q, self.query_parms)  # SLOW
            g = Graph()
            for binding in result_set['results']['bindings']:
                o = make_rdf_term(binding['o'])
                g.add((uri, step_def['predicate']['ref'], o))
        logger.debug(u"Step Triples {}".format(g.serialize(format='nt')))
        return g
Example #10
0
    def __do_get(self):
        """
        Data is queried from VIVO and returned as a tab delimited text file suitable for
        editing using an editor or spreadsheet, and suitable for use by do_update.

        :return:  Number of rows of data
        """
        from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
        from improve.improve import improve
        import codecs

        #   Generate the get query, execute the query, shape the query results into the return object

        query = make_get_query(self.update_def)
        logger.debug(u"do_get query_parms\n{}".format(self.query_parms))
        logger.debug(u"do_get query\n{}".format(query))
        result_set = vivo_query(query, self.query_parms)
        data = make_get_data(self.update_def, result_set)

        #   Write out the file.  ascii + xmlcharrefreplace keeps the output safe
        #   for spreadsheet round-trips; non-ascii becomes character references.

        outfile = codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace')

        columns = ['uri'] + self.update_def['entity_def']['order']
        outfile.write(self.inter.join(columns))  # write a header using the inter field separator between column names
        outfile.write('\n')

        for uri in sorted(data.keys()):
            for name in columns:
                if name in data[uri]:

                    #   Translate VIVO values via enumeration if any

                    if name in self.update_def['column_defs']:
                        path = self.update_def['column_defs'][name]

                        #   Warn/correct if path is unique and VIVO is not

                        if unique_path(path) and len(data[uri][name]) > 1:
                            logger.warning(u"VIVO has non-unique values for unique path {} at {} values {}".
                                           format(name, uri, data[uri][name]))
                            data[uri][name] = {next(iter(data[uri][name]))}  # Pick one element from multi-valued set
                            #   BUG FIX: was logger.warning(u"Using {}", value) which mixes
                            #   {}-style text with logging's lazy %-style args -- the value
                            #   was never interpolated.  Use .format like the call above.
                            logger.warning(u"Using {}".format(data[uri][name]))

                        #   Handle filters

                        if self.filter and 'filter' in path[len(path) - 1]['object']:
                            a = set()
                            for x in data[uri][name]:
                                was_string = x
                                new_string = improve(path[len(path) - 1]['object']['filter'], x)
                                if was_string != new_string:
                                    logger.debug(u"{} {} {} FILTER IMPROVED {} to {}".
                                                 format(uri, name, path[len(path) - 1]['object']['filter'],
                                                        was_string, new_string))
                                a.add(new_string)
                            data[uri][name] = a

                        #   Handle enumerations

                        if 'enum' in path[len(path) - 1]['object']:
                            enum_name = path[len(path) - 1]['object']['enum']
                            a = set()
                            for x in data[uri][name]:
                                val = self.enum[enum_name]['get'].get(x, '')
                                if val != '':
                                    a.add(val)
                                else:
                                    logger.warning(u"WARNING: Unable to find {} in {}. Blank substituted in {}".
                                                   format(x, enum_name, self.out_filename))
                            data[uri][name] = a

                    #   Gather values into a delimited string; strip separators from values

                    val = self.intra.join(data[uri][name])
                    outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '))
                if name != columns[len(columns) - 1]:
                    outfile.write(self.inter)
            outfile.write('\n')

        outfile.close()

        #   Rewrite the file based on the order_by or uri if none

        sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
        data = read_csv(self.out_filename, delimiter=self.inter)
        sdata = {}
        try:
            order = sorted(data, key=lambda rown: data[rown][sort_column_name])
        except KeyError:
            logger.error(u"{} in order_by not found.  No such column name. Sorting by uri.".
                         format(sort_column_name))
            order = sorted(data, key=lambda rown: data[rown]['uri'])
        row = 1
        for o in order:
            sdata[row] = data[o]
            row += 1
        write_csv(self.out_filename, sdata, delimiter=self.inter)

        return len(data)
Example #11
0
def do_get(update_def, enum, filename, inter='\t', intra=';', do_filter=True, debug=True):
    """
    Data is queried from VIVO and returned as a tab delimited text file suitable for
    editing using an editor or spreadsheet, and suitable for use by do_update.

    :param update_def: the update definition describing columns and paths
    :param enum: enumeration tables keyed by enumeration name
    :param filename: Tab delimited file of data from VIVO
    :param inter: separator between columns
    :param intra: separator between multiple values within a column
    :param: do_filter: boolean if True do the filters, otherwise do not apply filters
    :param debug: if True, print the query and filter activity
    :return:  Number of rows of data
    """
    from vivopump import vivo_query
    import codecs
    #   The improve_* functions appear unused, but the eval() below resolves
    #   filter names from update_def against this namespace -- do not remove
    from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \
        improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name

    query = make_get_query(update_def)
    if debug:
        print query
    result_set = vivo_query(query, debug=debug)
    data = make_get_data(update_def, result_set)

    # Write out the file.  ascii + xmlcharrefreplace keeps the output
    # spreadsheet-safe; non-ascii becomes character references

    outfile = codecs.open(filename, mode='w', encoding='ascii', errors='xmlcharrefreplace')

    columns = ['uri'] + update_def['entity_def']['order']
    outfile.write(inter.join(columns))
    outfile.write('\n')

    for uri in sorted(data.keys()):
        for name in columns:
            if name in data[uri]:

                # Translate VIVO values via enumeration if any

                if name in update_def['column_defs']:
                    path = update_def['column_defs'][name]

                    # Warn/correct if path is unique and VIVO is not

                    if unique_path(path) and len(data[uri][name]) > 1:
                        print "WARNING. VIVO has non-unique values for unique path:", name, "at", uri, data[uri][name]
                        data[uri][name] = {next(iter(data[uri][name]))}  # Pick one element from the multi-valued set
                        print data[uri][name]

                    # Handle filters

                    if do_filter and 'filter' in path[len(path) - 1]['object']:
                        a = set()
                        for x in data[uri][name]:
                            was_string = x
                            # SECURITY NOTE(review): eval() turns the filter name
                            # from update_def into a callable -- safe only if
                            # update_def files are trusted input
                            new_string = eval(path[len(path) - 1]['object']['filter'])(x)
                            if debug and was_string != new_string:
                                print uri, name, path[len(path) - 1]['object'][
                                    'filter'], "FILTER IMPROVED", was_string, 'to', \
                                    new_string
                            a.add(new_string)
                        data[uri][name] = a

                    # Handle enumerations

                    if 'enum' in path[len(path) - 1]['object']:
                        enum_name = path[len(path) - 1]['object']['enum']
                        a = set()
                        for x in data[uri][name]:
                            a.add(enum[enum_name]['get'].get(x, x))  # if we can't find the value in the
                            # enumeration, just return the value
                        data[uri][name] = a

                # Gather values into a delimited string; strip separators from values

                val = intra.join(data[uri][name])
                outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '))
            if name != columns[len(columns) - 1]:
                outfile.write(inter)
        outfile.write('\n')

    outfile.close()

    return len(data)
Example #12
0
    def __do_get(self):
        """
        Data is queried from VIVO and returned as a tab delimited text file suitable for
        editing using an editor or spreadsheet, and suitable for use by do_update.

        :return:  Number of rows of data
        """
        from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
        import codecs
        import sys
        #   The improve_* functions appear unused, but the eval() below resolves
        #   filter names from update_def against this namespace -- do not remove
        from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \
            improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name, \
            improve_org_name

        #   Generate the get query, execute the query, shape the query results into the return object

        query = make_get_query(self.update_def)
        if self.verbose:
            print self.query_parms
            print query
        result_set = vivo_query(query, self.query_parms, self.verbose)
        data = make_get_data(self.update_def, result_set)

        #   Write out the file.  ascii + xmlcharrefreplace keeps the output
        #   spreadsheet-safe; non-ascii becomes character references

        outfile = codecs.open(self.out_filename,
                              mode='w',
                              encoding='ascii',
                              errors='xmlcharrefreplace')

        columns = ['uri'] + self.update_def['entity_def']['order']
        outfile.write(
            self.inter.join(columns)
        )  # write a header using the inter field separator between column names
        outfile.write('\n')

        for uri in sorted(data.keys()):
            for name in columns:
                if name in data[uri]:

                    # Translate VIVO values via enumeration if any

                    if name in self.update_def['column_defs']:
                        path = self.update_def['column_defs'][name]

                        # Warn/correct if path is unique and VIVO is not

                        if unique_path(path) and len(data[uri][name]) > 1:
                            print "WARNING. VIVO has non-unique values for unique path:", name, "at", uri, \
                                data[uri][name]
                            data[uri][name] = {
                                next(iter(data[uri][name]))
                            }  # Pick one element from multi-valued set
                            print data[uri][name]

                        # Handle filters

                        if self.filter and 'filter' in path[len(path) -
                                                            1]['object']:
                            a = set()
                            for x in data[uri][name]:
                                was_string = x
                                # SECURITY NOTE(review): eval() turns the filter
                                # name from update_def into a callable -- safe
                                # only if update_def files are trusted input
                                new_string = eval(
                                    path[len(path) - 1]['object']['filter'])(x)
                                if self.verbose and was_string != new_string:
                                    print uri, name, path[len(path) - 1]['object'][
                                        'filter'], "FILTER IMPROVED", was_string, 'to', \
                                        new_string
                                a.add(new_string)
                            data[uri][name] = a

                        # Handle enumerations

                        if 'enum' in path[len(path) - 1]['object']:
                            enum_name = path[len(path) - 1]['object']['enum']
                            a = set()
                            for x in data[uri][name]:
                                val = self.enum[enum_name]['get'].get(x, '')
                                if val != '':
                                    a.add(val)
                                else:
                                    print "WARNING: Unable to find ", x, "in", enum_name, \
                                        ". Blank substituted in", self.out_filename
                            data[uri][name] = a

                    # Gather values into a delimited string; strip separators from values

                    val = self.intra.join(data[uri][name])
                    outfile.write(
                        val.replace('\r', ' ').replace('\n',
                                                       ' ').replace('\t', ' '))
                if name != columns[len(columns) - 1]:
                    outfile.write(self.inter)
            outfile.write('\n')

        outfile.close()

        # Rewrite the file based on the order_by or uri if none

        sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
        data = read_csv(self.out_filename, delimiter=self.inter)
        sdata = {}
        try:
            order = sorted(data, key=lambda rown: data[rown][sort_column_name])
        except KeyError:
            print >>sys.stderr, "ERROR: ", sort_column_name, \
                "in order_by not found.  No such column name. Sorting by uri."
            order = sorted(data, key=lambda rown: data[rown]['uri'])
        row = 1
        for o in order:
            sdata[row] = data[o]
            row += 1
        write_csv(self.out_filename, sdata, delimiter=self.inter)

        return len(data)
Example #13
0
File: pump.py  Project: ctsit/vivo-pump
    def __do_get(self):
        """
        Data is queried from VIVO and returned as a tab delimited text file suitable for
        editing using an editor or spreadsheet, and suitable for use by do_update.

        :return:  Number of rows of data
        """
        from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
        from improve.improve import improve
        import codecs

        #   Generate the get query, execute the query, shape the query results into the return object

        query = make_get_query(self.update_def)
        logger.debug(u"do_get query_parms\n{}".format(self.query_parms))
        logger.debug(u"do_get query\n{}".format(query))
        result_set = vivo_query(query, self.query_parms)
        data = make_get_data(self.update_def, result_set)

        #   Write out the file.  ascii + xmlcharrefreplace keeps the output safe
        #   for spreadsheet round-trips; non-ascii becomes character references.

        outfile = codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace')

        columns = ['uri'] + self.update_def['entity_def']['order']
        outfile.write(self.inter.join(columns))  # write a header using the inter field separator between column names
        outfile.write('\n')

        for uri in sorted(data.keys()):
            for name in columns:
                if name in data[uri]:

                    #   Translate VIVO values via enumeration if any

                    if name in self.update_def['column_defs']:
                        path = self.update_def['column_defs'][name]

                        #   Warn/correct if path is unique and VIVO is not

                        if unique_path(path) and len(data[uri][name]) > 1:
                            logger.warning(u"VIVO has non-unique values for unique path {} at {} values {}".
                                           format(name, uri, data[uri][name]))
                            data[uri][name] = {next(iter(data[uri][name]))}  # Pick one element from multi-valued set
                            #   BUG FIX: was logger.warning(u"Using {}", value) which mixes
                            #   {}-style text with logging's lazy %-style args -- the value
                            #   was never interpolated.  Use .format like the call above.
                            logger.warning(u"Using {}".format(data[uri][name]))

                        #   Handle filters

                        if self.filter and 'filter' in path[len(path) - 1]['object']:
                            a = set()
                            for x in data[uri][name]:
                                was_string = x
                                new_string = improve(path[len(path) - 1]['object']['filter'], x)
                                if was_string != new_string:
                                    logger.debug(u"{} {} {} FILTER IMPROVED {} to {}".
                                                 format(uri, name, path[len(path) - 1]['object']['filter'],
                                                        was_string, new_string))
                                a.add(new_string)
                            data[uri][name] = a

                        #   Handle enumerations

                        if 'enum' in path[len(path) - 1]['object']:
                            enum_name = path[len(path) - 1]['object']['enum']
                            a = set()
                            for x in data[uri][name]:
                                val = self.enum[enum_name]['get'].get(x, '')
                                if val != '':
                                    a.add(val)
                                else:
                                    logger.warning(u"WARNING: Unable to find {} in {}. Blank substituted in {}".
                                                   format(x, enum_name, self.out_filename))
                            data[uri][name] = a

                    #   Gather values into a delimited string; strip separators from values

                    val = self.intra.join(data[uri][name])
                    outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '))
                if name != columns[len(columns) - 1]:
                    outfile.write(self.inter)
            outfile.write('\n')

        outfile.close()

        #   Rewrite the file based on the order_by or uri if none

        sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
        data = read_csv(self.out_filename, delimiter=self.inter)
        sdata = {}
        try:
            order = sorted(data, key=lambda rown: data[rown][sort_column_name])
        except KeyError:
            logger.error(u"{} in order_by not found.  No such column name. Sorting by uri.".
                         format(sort_column_name))
            order = sorted(data, key=lambda rown: data[rown]['uri'])
        row = 1
        for o in order:
            sdata[row] = data[o]
            row += 1
        write_csv(self.out_filename, sdata, delimiter=self.inter)

        return len(data)
Example #14
0
File: pump.py  Project: senrabc/vivo-pump
def do_get(update_def,
           enum,
           filename,
           inter='\t',
           intra=';',
           do_filter=True,
           debug=True):
    """
    Data is queried from VIVO and returned as a tab delimited text file suitable for
    editing using an editor or spreadsheet, and suitable for use by do_update.

    :param filename: Tab delimited file of data from VIVO
    :param: do_filter: boolean if True do the filters, otherwise do not apply filters
    :return:  Number of rows of data
    """
    from vivopump import vivo_query
    import codecs
    from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \
        improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name

    query = make_get_query(update_def)
    if debug:
        print query
    result_set = vivo_query(query, debug=debug)
    data = make_get_data(update_def, result_set)

    # Write out the file

    outfile = codecs.open(filename,
                          mode='w',
                          encoding='ascii',
                          errors='xmlcharrefreplace')

    columns = ['uri'] + update_def['entity_def']['order']
    outfile.write(inter.join(columns))
    outfile.write('\n')

    for uri in sorted(data.keys()):
        for name in columns:
            if name in data[uri]:

                # Translate VIVO values via enumeration if any

                if name in update_def['column_defs']:
                    path = update_def['column_defs'][name]

                    # Warn/correct if path is unique and VIVO is not

                    if unique_path(path) and len(data[uri][name]) > 1:
                        print "WARNING. VIVO has non-unique values for unique path:", name, "at", uri, data[
                            uri][name]
                        data[uri][name] = {
                            next(iter(data[uri][name]))
                        }  # Pick one element from the multi-valued set
                        print data[uri][name]

                    # Handle filters

                    if do_filter and 'filter' in path[len(path) - 1]['object']:
                        a = set()
                        for x in data[uri][name]:
                            was_string = x
                            new_string = eval(path[len(path) -
                                                   1]['object']['filter'])(x)
                            if debug and was_string != new_string:
                                print uri, name, path[len(path) - 1]['object'][
                                    'filter'], "FILTER IMPROVED", was_string, 'to', \
                                    new_string
                            a.add(new_string)
                        data[uri][name] = a

                    # Handle enumerations

                    if 'enum' in path[len(path) - 1]['object']:
                        enum_name = path[len(path) - 1]['object']['enum']
                        a = set()
                        for x in data[uri][name]:
                            a.add(enum[enum_name]['get'].get(
                                x, x))  # if we can't find the value in the
                            # enumeration, just return the value
                        data[uri][name] = a

                # Gather values into a delimited string

                val = intra.join(data[uri][name])
                outfile.write(
                    val.replace('\r', ' ').replace('\n',
                                                   ' ').replace('\t', ' '))
            if name != columns[len(columns) - 1]:
                outfile.write(inter)
        outfile.write('\n')

    outfile.close()

    return len(data)
# Example #15 (votes: 0)
    def __do_get(self):
        """
        Data is queried from VIVO and returned as a tab delimited text file suitable for
        editing using an editor or spreadsheet, and suitable for use by do_update.

        Reads instance state: update_def, enum, query_parms, inter, intra, filter,
        verbose, out_filename.  Writes the delimited file self.out_filename (header
        row plus one row per uri), then re-reads and rewrites it sorted by the
        entity_def 'order_by' column (falling back to 'uri').

        :return:  Number of rows of data
        """
        from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
        import codecs
        import sys
        # The improve_* functions are not called directly: filter names in
        # update_def are resolved to these functions via eval() below.
        from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \
            improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name, \
            improve_org_name

        #   Generate the get query, execute the query, shape the query results into the return object

        query = make_get_query(self.update_def)
        if self.verbose:
            print self.query_parms
            print query
        result_set = vivo_query(query, self.query_parms, self.verbose)
        data = make_get_data(self.update_def, result_set)

        #   Write out the file.  ascii + xmlcharrefreplace keeps the output editable
        #   anywhere; non-ascii characters become XML character references.

        outfile = codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace')

        columns = ['uri'] + self.update_def['entity_def']['order']
        outfile.write(self.inter.join(columns))  # write a header using the inter field separator between column names
        outfile.write('\n')

        for uri in sorted(data.keys()):
            for name in columns:
                if name in data[uri]:

                    # Translate VIVO values via enumeration if any

                    if name in self.update_def['column_defs']:
                        path = self.update_def['column_defs'][name]

                        # Warn/correct if path is unique and VIVO is not

                        if unique_path(path) and len(data[uri][name]) > 1:
                            print "WARNING. VIVO has non-unique values for unique path:", name, "at", uri, \
                                data[uri][name]
                            data[uri][name] = {next(iter(data[uri][name]))}  # Pick one element from multi-valued set
                            print data[uri][name]

                        # Handle filters
                        # SECURITY NOTE(review): eval() resolves the filter name taken
                        # from update_def; update_def must come from a trusted source.

                        if self.filter and 'filter' in path[len(path) - 1]['object']:
                            a = set()
                            for x in data[uri][name]:
                                was_string = x
                                new_string = eval(path[len(path) - 1]['object']['filter'])(x)
                                if self.verbose and was_string != new_string:
                                    print uri, name, path[len(path) - 1]['object'][
                                        'filter'], "FILTER IMPROVED", was_string, 'to', \
                                        new_string
                                a.add(new_string)
                            data[uri][name] = a

                        # Handle enumerations.  Values missing from the enumeration
                        # are dropped (blank substituted) with a warning.

                        if 'enum' in path[len(path) - 1]['object']:
                            enum_name = path[len(path) - 1]['object']['enum']
                            a = set()
                            for x in data[uri][name]:
                                val = self.enum[enum_name]['get'].get(x, '')
                                if val != '':
                                    a.add(val)
                                else:
                                    print "WARNING: Unable to find ", x, "in", enum_name, \
                                        ". Blank substituted in", self.out_filename
                            data[uri][name] = a

                    # Gather values into a delimited string; strip delimiter
                    # characters from values so they cannot corrupt the row structure

                    val = self.intra.join(data[uri][name])
                    outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '))
                if name != columns[len(columns) - 1]:
                    outfile.write(self.inter)
            outfile.write('\n')

        outfile.close()

        # Rewrite the file based on the order_by or uri if none

        sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
        data = read_csv(self.out_filename, delimiter=self.inter)  # re-read what was just written
        sdata = {}
        try:
            order = sorted(data, key=lambda rown: data[rown][sort_column_name])
        except KeyError:
            # order_by named a column that does not exist; warn and fall back to uri
            print >>sys.stderr, "ERROR: ", sort_column_name, \
                "in order_by not found.  No such column name. Sorting by uri."
            order = sorted(data, key=lambda rown: data[rown]['uri'])
        row = 1
        for o in order:
            sdata[row] = data[o]  # renumber rows 1..n in sorted order
            row += 1
        write_csv(self.out_filename, sdata, delimiter=self.inter)

        # Row count as read back from the written file
        return len(data)