Example #1
File: test.py  Project: LouisVN/LR-Data
    def test_types(self):
        test_docs = [
            "http://www.khanacademy.org/video/dependent-probability-example-1?playlist=Probability",
            "http://illuminations.nctm.org/LessonDetail.aspx?ID=U159",
            "http://carrefour-numerique.cite-sciences.fr/ressources/flash/anims/url/cyberbase08_home.html",
            "http://www.shodor.org/interactivate/lessons/FindingRemaindersinPascal/",
            "http://www.cmhouston.org/attachments/files/1690/CaterpillarMeasure.pdf",
            "http://www2.edu.fi/materiaalipankki/showfile.php?id=158&file=158_lankarulla4.jpg",
        ]

        for doc_id in test_docs:
            try:
                print ""
                print "============== %s ==============" % doc_id


                url = "https://node01.public.learningregistry.net/obtain?"+urllib.urlencode({ "request_id": doc_id})
                print url
                request = requests.get(url)
                obtainResponse = request.json()

                # run the 'validate' task synchronously on the first metadata
                # envelope in the obtain response
                for data in obtainResponse['documents'][0]['document']:
                    if (data['resource_data_type'] == 'metadata'):
                        doc = getTaskFunction(config, 'validate')(data, config, False)

                        break

            except Exception as ex:
                print "Failed to load: "+url
                traceback.print_exc()
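
The test above is Python 2 code and leans on project helpers that are not shown (getTaskFunction, config). The sketch below is a rough, stand-alone version of just the obtain-and-filter step: the /obtain endpoint and the documents -> document -> resource_data_type layout come from the snippet itself, while the function name and everything else is illustrative.

# Hypothetical stand-alone helper, not part of the LR-Data project; it mirrors
# the obtain request and the metadata filter from the test above.
import urllib

import requests

OBTAIN_URL = "https://node01.public.learningregistry.net/obtain?"


def first_metadata_envelope(doc_id):
    """Return the first 'metadata' envelope from an obtain response, or None."""
    url = OBTAIN_URL + urllib.urlencode({"request_id": doc_id})
    response = requests.get(url)
    response.raise_for_status()
    body = response.json()

    # An obtain response wraps a list of envelopes; keep only metadata records,
    # which is the same resource_data_type check the test performs.
    for envelope in body['documents'][0]['document']:
        if envelope.get('resource_data_type') == 'metadata':
            return envelope
    return None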
Example #2
def harvestData(lrUrl, config, enqueueValidate=True):
    try:
        r = redis.StrictRedis(host=config['redis']['host'],
                              port=config['redis']['port'],
                              db=config['redis']['db'])

        print ("Harvesting with url: %s" % lrUrl)

        request = requests.get(lrUrl)

        data = request.json()

        for i in data['listrecords']:
            envelope = i['record']['resource_data']
            r.sadd("harvested_docs", envelope['doc_ID'])

            # only attempt to process metadata resource types (ignore paradata and other non-metadata records)
            if 'resource_data_type' in envelope and envelope['resource_data_type'] == 'metadata':
                validateFunc = getTaskFunction(config, 'validate')

                if enqueueValidate:
                    validateFunc.delay(envelope, config)
                else:
                    validateFunc(envelope, config)

        # follow the resumption token, if present, to harvest the next page
        if "resumption_token" in data and \
                data['resumption_token'] is not None and \
                data['resumption_token'] != "null":
            urlParts = urlparse(lrUrl)
            rawQuery = {"resumption_token": data['resumption_token']}
            newQuery = urllib.urlencode(rawQuery)
            lrUrl = urlunparse((urlParts[0],
                                urlParts[1],
                                urlParts[2],
                                urlParts[3],
                                newQuery,
                                urlParts[5]))
            harvestData.delay(lrUrl, config)
                harvestData.delay(lrUrl, config)
    except Exception as exc:
        traceback.print_exc()

        # If the request fails, retry it in 10 seconds. Most failures are a
        # JSON parse error caused by a truncated LR response.
        harvestData.retry(exc=exc, countdown=10, max_retries=5)
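
harvestData calls .delay() and .retry(), which are Celery task methods, so it is meant to run as a Celery task that re-enqueues itself for each resumption_token page and retries failed requests. Below is a minimal sketch of how such a task might be wired up and started; the app name, broker URL, start URL, and config values are assumptions, not taken from the project.

# Hypothetical Celery wiring; only the .delay()/.retry() calls above imply Celery.
import traceback

import requests
from celery import Celery

app = Celery('lr_harvest', broker='redis://localhost:6379/0')  # assumed broker


@app.task(bind=True, max_retries=5)
def harvest_data(self, lrUrl, config):
    # Same shape as harvestData above: fetch one page of records, then either
    # process it or retry the whole call after a short delay.
    try:
        data = requests.get(lrUrl).json()
        print("Fetched %d records" % len(data.get('listrecords', [])))
        # ... validate each record and follow data.get('resumption_token') here ...
    except Exception as exc:
        traceback.print_exc()
        raise self.retry(exc=exc, countdown=10)


# Kick off a harvest run; the endpoint path and config values are illustrative only.
start_url = "https://node01.public.learningregistry.net/harvest/listrecords"
harvest_data.delay(start_url, {'redis': {'host': 'localhost', 'port': 6379, 'db': 0}})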