예제 #1
0
def dump_articles():
    """
    Upload every document of the ``crawling`` collection to the crawling POST endpoint.

    Connects to the MongoDB server on localhost:27017, reads all documents of
    ``PTEST_BACKUP.crawling`` and POSTs each of them, serialized as JSON, to
    ``_CRAWLING_POST['local']`` together with ``_TEMP_SECRET``.

    The connection is always closed, even when reading or uploading fails.

    :return: None
    """
    from scripts.remote.remote import post_curling
    import json

    connection = MongoClient('localhost', 27017)
    try:
        db = connection.PTEST_BACKUP

        # '_id' is projected out of the results
        # (presumably because an ObjectId cannot be serialized by json.dumps).
        results = db.crawling.find({}, {'_id': False})

        # Sample of a stored document:
        # {
        #     "_id" : ObjectId("54dd29d2b396811764a01330"),
        #     "url" : "http://www.nasa.gov/pdf/55395main_12%20Earth%20Science.pdf",
        #     "home" : "NASA",
        #     "abstract" : "The mission of NASA's Earth Science ... and help answer
        #                   questions concerning many related aspects of ... forecasters
        #                   in assessing particulate pollutio ...",
        #     "title" : "Earth Science - NASA",
        #     "keyword" : "aerosols+(pollution+aspects)",
        #     "stored" : true,
        #     "complete" : false,
        #     "key" : "aerosols (pollution aspects)",
        #     "hashed" : "aHR0cDovL3d3dy5uYXNhLmdvdi9wZGYvNTUzOTVtYWluXzEyJTIwRWFydGglMjBTY2llbmNlLnBkZg=="
        # }

        # upload via POST endpoint
        for record in results:
            post_curling(
                _CRAWLING_POST['local'],
                {'resource': json.dumps(record), 'pwd': _TEMP_SECRET},
                display=True
            )
    finally:
        # close the connection to MongoDB, also when an upload raised
        connection.close()
    def test_post_to_component_endpoint(self):
        """
        POST the sample component to the local COTS store endpoint.

        WARNING: this stores the json-ld found in testdata.component in the
        local datastore.
        :return: None
        """
        from scripts.testdata.component import component
        from scripts.remote.remote import post_curling

        endpoint = 'http://localhost:8080/database/cots/store'
        payload = {'pwd': _TEMP_SECRET, 'data': component}

        post_curling(url=endpoint, params=payload, display=True)
    def test_dumping_pipe_full(cls):
        """
        Run the full pipeline: python instances > JSON-LD dump > N-TRIPLES translation.

        One instance is generated per entry of ``tech_constrains``; each instance is
        converted to JSON-LD, the whole list is printed and then posted to the
        RDF translator service.
        :return: None
        """
        print(" --- full test for generating components and dumping them in JSON-LD")
        instances = [
            SubSystem.generate_py_instance(name, spec)
            for name, spec in tech_constrains.items()
        ]
        documents = [SubSystem.generate_jsonld(instance) for instance in instances]

        pretty_dump = json.dumps(documents, indent=4)
        print(pretty_dump)

        print(" --- Translating the component into ntriples via RDFtranslator")
        from scripts.remote.remote import post_curling
        translator_url = 'http://rdf-translator.appspot.com/convert/json-ld/nt/content'
        payload = {'content': json.dumps(documents)}
        post_curling(translator_url, payload, display=True)
    def set_typeof_and_ingraph_properties(self, bookmark='start'):
        """
        Walk the datastore index page by page and set the 'type_of' and 'in_graph'
        properties on every web resource that has no 'type_of' yet.

        The type is guessed from the stored resource:
          * title parseable as an integer           -> 'tweet'
          * empty title, url ends with a media ext. -> 'media'
          * empty title, any other url              -> 'link'
          * any other title                         -> 'feed'
        'in_graph' is always set to False.

        :param bookmark: bookmark of the first index page to process;
                         'start' stands for the first page
        :return: None
        """
        service = _ENV[self.test_env]['_SERVICE']
        media_extensions = ('jpg', 'jpeg', 'png', 'mp3', 'mp4')

        # Pages are processed in a loop. The original code continued with
        # fetch_and_dump_webresources(), so only the first page was ever updated.
        while True:
            print("Fetching page: " + bookmark)
            articles = get_curling(service + '/datastore/index',
                                   {'token': _CLIENT_TOKEN,
                                    'bookmark': bookmark if bookmark != 'start' else ''})
            articles = json.loads(articles)

            # fetch single article
            for a in articles['articles']:
                res = get_curling(
                    service + '/datastore/webresource',
                    {
                        'retrieve': a,
                        'token': _CLIENT_TOKEN
                    }
                )
                res = json.loads(res)

                # Test for the key itself: res['type_of'] would raise KeyError
                # exactly for the resources that still need the property.
                if 'type_of' in res:
                    continue

                title = res['title']
                try:
                    int(title)
                    type_of = 'tweet'
                except (TypeError, ValueError):
                    # the title is not a number
                    if title != '':
                        type_of = 'feed'
                    elif res['url'].endswith(media_extensions):
                        type_of = 'media'
                    else:
                        type_of = 'link'

                update = {'type_of': type_of, 'in_graph': False}
                print(update)
                post_curling(
                    service + '/datastore/webresource',
                    {
                        'token': _CLIENT_TOKEN,
                        'update': a,
                        'properties': json.dumps(update)
                    },
                    display=True
                )

            if not articles['next']:
                return None

            bookmark = articles['next']
    def dump_concepts_to_graph(self, list_of_ids, url):
        """
        Build a graph with the concept triples of the given ids, serialize it as
        N-Triples and post the result to the concepts graph of the shard.

        :param list_of_ids: a list() of numerical ids from the datastore
        :param url: the url of the sparql endpoint of the shard
        :return: None
        """
        concepts_graph = Graph()

        for resource_id in list_of_ids:
            for triple in self.create_concepts_triples(resource_id):
                concepts_graph.add(triple)

        serialized = concepts_graph.serialize(format='nt')
        payload = {
            'token': _CLIENT_TOKEN,
            'triple': serialized,
            'graph_id': _CONCEPTS_GRAPH_ID
        }
        post_curling(url, payload, display=True)
    def dump_webresources_to_graph(self, list_of_ids, url):
        """
        Build a graph with one web resource triple per given id, serialize it as
        N-Triples and post the result to the web resources graph of the shard.

        Each id is printed before its triple is added.

        :param list_of_ids: a list() of numerical ids from the datastore
        :param url: the url of the sparql endpoint of the shard
        :return: None
        """
        webres_graph = Graph()

        for resource_id in list_of_ids:
            print(resource_id)
            triple = self.create_webresource_triple(resource_id)
            webres_graph.add(triple)

        serialized = webres_graph.serialize(format='nt')
        payload = {
            'token': _CLIENT_TOKEN,
            'triple': serialized,
            'graph_id': _WEBRES_GRAPH_ID
        }
        post_curling(url, payload, display=True)
예제 #7
0
def dump_articles():
    """
    Upload every document of the ``crawling`` collection to the crawling POST endpoint.

    Connects to the MongoDB server on localhost:27017, reads all documents of
    ``PTEST_BACKUP.crawling`` and POSTs each of them, serialized as JSON, to
    ``_CRAWLING_POST['local']`` together with ``_TEMP_SECRET``.

    The connection is always closed, even when reading or uploading fails.

    :return: None
    """
    from scripts.remote.remote import post_curling
    import json

    connection = MongoClient('localhost', 27017)
    try:
        db = connection.PTEST_BACKUP

        # '_id' is projected out of the results
        # (presumably because an ObjectId cannot be serialized by json.dumps).
        results = db.crawling.find({}, {'_id': False})

        # Sample of a stored document:
        # {
        #     "_id" : ObjectId("54dd29d2b396811764a01330"),
        #     "url" : "http://www.nasa.gov/pdf/55395main_12%20Earth%20Science.pdf",
        #     "home" : "NASA",
        #     "abstract" : "The mission of NASA's Earth Science ... and help answer
        #                   questions concerning many related aspects of ... forecasters
        #                   in assessing particulate pollutio ...",
        #     "title" : "Earth Science - NASA",
        #     "keyword" : "aerosols+(pollution+aspects)",
        #     "stored" : true,
        #     "complete" : false,
        #     "key" : "aerosols (pollution aspects)",
        #     "hashed" : "aHR0cDovL3d3dy5uYXNhLmdvdi9wZGYvNTUzOTVtYWluXzEyJTIwRWFydGglMjBTY2llbmNlLnBkZg=="
        # }

        # upload via POST endpoint
        for record in results:
            post_curling(
                _CRAWLING_POST['local'],
                {'resource': json.dumps(record), 'pwd': _TEMP_SECRET},
                display=True
            )
    finally:
        # close the connection to MongoDB, also when an upload raised
        connection.close()
    def test_n3_endpoints(self):
        """
        Test the N3 endpoints.

        For now this only POSTs the client token to the web resource endpoint of
        the environment under test and prints the response; nothing is asserted.
        :return: None
        """
        print("Running test_n3_endpoints")

        base_url_resource = _ENV[self.test_env]['_SERVICE'] + "/datastore/webresource"
        # TODO: the "/datastore/concept" endpoint is not exercised yet

        response = post_curling(base_url_resource, {"token": _CLIENT_TOKEN})

        print(response)