def test_types(self): test_docs = [ "http://www.khanacademy.org/video/dependent-probability-example-1?playlist=Probability", "http://illuminations.nctm.org/LessonDetail.aspx?ID=U159", "http://carrefour-numerique.cite-sciences.fr/ressources/flash/anims/url/cyberbase08_home.html", "http://www.shodor.org/interactivate/lessons/FindingRemaindersinPascal/", "http://www.cmhouston.org/attachments/files/1690/CaterpillarMeasure.pdf", "http://www2.edu.fi/materiaalipankki/showfile.php?id=158&file=158_lankarulla4.jpg", ] for doc_id in test_docs: try: print "" print "============== %s ==============" % doc_id url = "https://node01.public.learningregistry.net/obtain?" + urllib.urlencode( {"request_id": doc_id}) print url request = requests.get(url) obtainResponse = request.json() for data in obtainResponse['documents'][0]['document']: if (data['resource_data_type'] == 'metadata'): doc = getTaskFunction(config, 'validate')(data, config, False) break except Exception as ex: print "Failed to load: " + url traceback.print_exc()
def test_types(self): test_docs = [ "http://www.khanacademy.org/video/dependent-probability-example-1?playlist=Probability", "http://illuminations.nctm.org/LessonDetail.aspx?ID=U159", "http://carrefour-numerique.cite-sciences.fr/ressources/flash/anims/url/cyberbase08_home.html", "http://www.shodor.org/interactivate/lessons/FindingRemaindersinPascal/", "http://www.cmhouston.org/attachments/files/1690/CaterpillarMeasure.pdf", "http://www2.edu.fi/materiaalipankki/showfile.php?id=158&file=158_lankarulla4.jpg", ] for doc_id in test_docs: try: print "" print "============== %s ==============" % doc_id url = "https://node01.public.learningregistry.net/obtain?"+urllib.urlencode({ "request_id": doc_id}) print url request = requests.get(url) obtainResponse = request.json() for data in obtainResponse['documents'][0]['document']: if(data['resource_data_type'] == 'metadata'): doc = getTaskFunction(config, 'validate')(data, config, False) break except Exception as ex: print "Failed to load: "+url traceback.print_exc()
def harvestData(lrUrl, config, enqueueValidate=True):
    """Harvest one page of Learning Registry records from `lrUrl`.

    Records every doc ID in the Redis set "harvested_docs", hands each
    metadata envelope to the 'validate' task (queued via Celery when
    `enqueueValidate` is True, called inline otherwise), and chains a
    follow-up harvest when the response carries a resumption token.

    Celery task: on any failure — most commonly a JSON parse error from a
    truncated LR response — the same page is retried after 10 seconds, at
    most 5 times.
    """
    try:
        store = redis.StrictRedis(host=config['redis']['host'],
                                  port=config['redis']['port'],
                                  db=config['redis']['db'])
        print ("Harvesting with url: %s" % lrUrl)
        payload = requests.get(lrUrl).json()
        for entry in payload['listrecords']:
            envelope = entry['record']['resource_data']
            # Every harvested doc ID is tracked, regardless of its type.
            store.sadd("harvested_docs", envelope['doc_ID'])
            # Only metadata envelopes go to validation; paradata and other
            # non-metadata records are ignored.
            if envelope.get('resource_data_type') == 'metadata':
                validateFunc = getTaskFunction(config, 'validate')
                if enqueueValidate:
                    validateFunc.delay(envelope, config)
                else:
                    validateFunc(envelope, config)
        # The LR node signals "more pages" with a resumption token; the
        # literal string "null" also means no continuation.
        token = payload.get('resumption_token')
        if token is not None and token != "null":
            parts = urlparse(lrUrl)
            nextQuery = urllib.urlencode({"resumption_token": token})
            nextUrl = urlunparse((parts[0], parts[1], parts[2],
                                  parts[3], nextQuery, parts[5]))
            harvestData.delay(nextUrl, config)
    except Exception as exc:
        traceback.print_exc()
        # on request fails, retry the request in 10 seconds. Most fails are due to a
        # JSON parse error caused by truncated LR requests.
        harvestData.retry(exc=exc, countdown=10, max_retries=5)
def harvestData(lrUrl, config, enqueueValidate=True):
    """Harvest one page of Learning Registry records from `lrUrl`.

    Adds every doc ID to the Redis set "harvested_docs", dispatches each
    metadata envelope to the 'validate' task (async via Celery `.delay`
    when `enqueueValidate` is True, synchronously otherwise), and chains
    the next page via `harvestData.delay` when a resumption token is
    present. On any failure the task retries the same URL after 10
    seconds, up to 5 times.
    """
    try:
        r = redis.StrictRedis(host=config['redis']['host'],
                              port=config['redis']['port'],
                              db=config['redis']['db'])
        print("Harvesting with url: %s" % lrUrl)
        request = requests.get(lrUrl)
        data = request.json()
        for i in data['listrecords']:
            envelope = i['record']['resource_data']
            # Track every harvested doc ID, whatever its resource type.
            r.sadd("harvested_docs", envelope['doc_ID'])
            # only attempt to process metadata resource types (ignore paradata and other non-metadata records)
            if 'resource_data_type' in envelope and envelope[
                    'resource_data_type'] == 'metadata':
                validateFunc = getTaskFunction(config, 'validate')
                if enqueueValidate:
                    validateFunc.delay(envelope, config)
                else:
                    validateFunc(envelope, config)
        # A usable resumption token means more pages remain; the node may
        # report "no more" as a missing key, None, or the string "null".
        if "resumption_token" in data and \
                data['resumption_token'] is not None and \
                data['resumption_token'] != "null":
            urlParts = urlparse(lrUrl)
            rawQuery = {"resumption_token": data['resumption_token']}
            newQuery = urllib.urlencode(rawQuery)
            # Rebuild the harvest URL with only the query string replaced.
            lrUrl = urlunparse((urlParts[0], urlParts[1], urlParts[2],
                                urlParts[3], newQuery, urlParts[5]))
            harvestData.delay(lrUrl, config)
    except Exception as exc:
        traceback.print_exc()
        # on request fails, retry the request in 10 seconds. Most fails are due to a
        # JSON parse error caused by truncated LR requests.
        harvestData.retry(exc=exc, countdown=10, max_retries=5)