def load(secure, hostname, url, schema, table, postdata, condition, verbose, rowcount):
    """Fetch a JSON array over HTTP(S) and insert its items into schema.table.

    The address is built as ``http(s)://<hostname><url>``. When the
    API_USERNAME environment variable is set, a Basic Authorization header
    is added (API_PASSWORD supplies the password; empty if unset).

    Existing rows are removed before inserting: only the rows matching
    `condition` when it is given, otherwise the whole table is emptied.
    Items are read incrementally with ijson. The first item is inserted on
    its own; later items are collected and written in batches of
    `rowcount` rows, with any remainder written after the last item.

    Args:
        secure: truthy selects "https://", otherwise "http://".
        hostname: host part of the address.
        url: path (and query) appended directly to `hostname`.
        schema: target database schema name.
        table: target table name.
        postdata: request body handed to urllib2.Request, or None.
        condition: SQL text placed after WHERE in the DELETE, or None/empty.
        verbose: when truthy every row is logged.
        rowcount: batch size; if it never equals the number of collected
            rows (e.g. None), everything after the first row is written
            in one final batch.

    Exits the process with status 2 if the URL cannot be opened.
    """
    show("begin "+hostname+" "+url+" "+schema+" "+table+" "+(postdata or "")+" "+(condition or ""))
    scheme = "https://" if secure else "http://"
    address = scheme+hostname+url
    show("load from "+address)

    reqheaders = {
        'Content-Type': 'application/json',
        'Caller-Id': '1.2.246.562.10.2013112012294919827487.vipunen',
    }
    # api credentials from env vars
    apiuser = os.getenv("API_USERNAME")
    if apiuser:
        show("using authentication")
        # A missing API_PASSWORD used to crash on str + None; fall back to
        # an empty password so the server can reject it with a clear status.
        apipass = os.getenv("API_PASSWORD") or ""
        reqheaders['Authorization'] = 'Basic %s' % base64.b64encode(apiuser+":"+apipass)

    # automatic POST with (post)data
    print("value used for , -r, --rowcount=", rowcount)
    request = urllib2.Request(address, data=postdata, headers=reqheaders)
    try:
        response = urllib2.urlopen(request)
    except httplib.IncompleteRead as e:
        show('IncompleteRead exception.')
        # e.partial holds the data received so far, not a number: report its size.
        show('Received: %d' % len(e.partial))
        sys.exit(2)
    except urllib2.HTTPError as e:
        show('The server couldn\'t fulfill the request.')
        show('Error code: %d' % (e.code))
        sys.exit(2)
    except urllib2.URLError as e:
        show('We failed to reach a server.')
        show('Reason: %s' % (e.reason))
        sys.exit(2)

    # everything is fine
    show("api call OK")

    # remove data conditionally, otherwise empty
    # merge operation could be considered here...
    if condition:
        show("remove from %s.%s with condition '%s'" % (schema, table, condition))
        # NOTE(review): condition is spliced into the SQL verbatim; it must
        # come from a trusted source (it cannot be bound as a parameter).
        dboperator.execute("DELETE FROM %s.%s WHERE %s" % (schema, table, condition))
    else:
        show("empty %s.%s" % (schema, table))
        dboperator.empty(schema, table)

    show("insert data")
    cnt = 0
    rows = []
    for row in ijson.items(response, 'item'):
        cnt += 1
        # show some sign of being alive
        if cnt % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        if cnt % 1000 == 0:
            show("-- %d" % (cnt))
        if verbose:
            show("%d -- %s" % (cnt, row))

        # find out which columns to use on insert
        dboperator.resetcolumns(row)

        # flatten arrays/lists: store them as their JSON text
        for col in row:
            if isinstance(row[col], list):
                row[col] = json.dumps(row[col])

        if cnt == 1:
            # The very first row is always written individually.
            dboperator.insert(address, schema, table, row)
            continue

        rows.append(row)
        if len(rows) == rowcount:
            # NOTE(review): bare insert() is not defined in this block while
            # the single-row path uses dboperator.insert -- confirm a
            # module-level insert() that accepts a list of rows exists.
            insert(address, schema, table, rows)
            rows = []

    # write whatever is left over from the last, incomplete batch
    if rows:
        insert(address, schema, table, rows)

    show("wrote %d" % (cnt))
    show("ready")
dboperator.empty(schema,table) show("insert data") cnt=0 for row in ijson.items(response,'item'): cnt+=1 # show some sign of being alive if cnt%100 == 0: sys.stdout.write('.') sys.stdout.flush() if cnt%1000 == 0: show("-- %d" % (cnt)) if verbose: show("%d -- %s"%(cnt,row)) # find out which columns to use on insert dboperator.resetcolumns(row) # flatten arrays/lists for col in row: if type(row[col]) is list: row[col] = ''.join(map(str,json.dumps(row[col]))) dboperator.insert(address,schema,table,row) show("wrote %d"%(cnt)) show("ready") def usage(): print """ usage: load.py [-s|--secure] -H|--hostname <hostname> -u|--url <url> -e|--schema <schema> -t|--table <table> [-p|--postdata] [-c|--condition <condition>] [-v|--verbose] """
def load(url, schema, table, condition):
    """Load every page of a VARDA API listing into schema.table.

    Results from VARDA-API can come in multiple pages. If that's the case,
    multiple requests are made, following the "next" link of each response
    until it is absent or empty.

    The table is emptied once, right after the first page has been fetched
    and parsed successfully, so an unreachable API does not wipe existing
    data. TODO: Delete based on condition (`condition` is currently only
    written to the log).

    When VARDA_API_KEY is set in the environment it is sent as a Token
    Authorization header.

    Args:
        url: address of the first page.
        schema: target database schema name.
        table: target table name.
        condition: only logged; see TODO above.

    Exit statuses: 1 request failed, 2 HTTP status other than 200,
    3 body is not JSON, 4 JSON has no "results" member.
    """
    first_page = True
    while True:
        show("begin " + url + " " + schema + " " + table + " " + (condition or ""))
        show("load from " + url)

        reqheaders = {
            'Content-Type': 'application/json',
            'Caller-Id': '1.2.246.562.10.2013112012294919827487.vipunen',
        }
        # api credentials from env vars
        api_key = os.getenv("VARDA_API_KEY")
        if api_key:
            show("using authentication")
            reqheaders['Authorization'] = 'Token %s' % api_key

        try:
            r = requests.get(url, headers=reqheaders)
        except requests.exceptions.RequestException as e:
            print("RequestException: ", e)
            sys.exit(1)

        if r.status_code != 200:
            print("Error! HTTP status code: " + str(r.status_code))
            sys.exit(2)

        try:
            data = json.loads(r.content)
        except ValueError as e:
            print("ValueError: ", e)
            sys.exit(3)

        # Fail with a clear status instead of a KeyError traceback below.
        if not isinstance(data, dict) or "results" not in data:
            print("Error! Received JSON-data not valid.")
            sys.exit(4)

        # everything is fine
        show("api call OK")

        if first_page:
            # Start from scratch, but only once we know the API answers.
            dboperator.execute("DELETE FROM %s.%s" % (schema, table))
            first_page = False

        # Parse URL
        # From https://varda.oso-pilot.csc.fi/api/v1/toimipisteet/
        # save in DB https://varda.oso-pilot.csc.fi
        n = url.find("/", url.find("/") + 2)  # find 3rd occurrence of "/"
        # Without a 3rd "/" find() gives -1 and url[:-1] would drop the
        # last character; keep the whole url in that case.
        address = url[:n] if n != -1 else url

        show("insert data")
        cnt = 0
        for row in data["results"]:
            cnt += 1
            # show some sign of being alive
            if cnt % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if cnt % 1000 == 0:
                show("-- %d" % (cnt))

            # Explicitly remove the keys from dict (row) that are not saved
            # in Antero DB; a key that is already absent is not an error.
            for key in ('johtaja', 'url', 'muutos_pvm', 'vaka_jarjestaja', 'toimipaikat'):
                row.pop(key, None)

            # find out which columns to use on insert
            dboperator.resetcolumns(row)

            # flatten arrays/lists: store them as their JSON text
            for col in row:
                if isinstance(row[col], list):
                    row[col] = json.dumps(row[col])

            dboperator.insert(address, schema, table, row)

        show("wrote %d" % (cnt))
        show("ready")

        next_url = data.get("next")
        if not next_url:
            break  # exit while-loop. We are done.
        url = next_url
def load(url, schema, table, condition):
    """Load every page of an ARVO API result into schema.table.

    Results from ARVO-API can come in multiple pages. If that's the case,
    multiple requests are made, following "pagination" -> "next_url" of
    each response until it is the string "null", JSON null, or missing.

    Old rows are removed only once, after the first page has been fetched
    and validated: the rows matching `condition` when it is given,
    otherwise the whole table is emptied.

    When API_USERNAME is set in the environment a Basic Authorization
    header is sent (API_PASSWORD supplies the password; empty if unset).

    Args:
        url: address of the first page.
        schema: target database schema name.
        table: target table name.
        condition: SQL text placed after WHERE in the DELETE, or None/empty.

    Exit statuses: 1 request failed, 2 HTTP status other than 200,
    3 body is not JSON, 4 JSON lacks "pagination" or "data".
    """
    # Received data e.g.
    # { "data": [ {
    #     "taustakysymykset": true, "koulutustoimija": "xxxx", "vastausid": 1111,
    #     "kyselykertaid": 123, "kysely_alkupvm": "2016-11-29T22:00:00Z",
    #     "suorituskieli": "fi", "tutkinto_fi": "xxxx", "opintoala_en": "xxxx",
    #     "valmistavan_koulutuksen_jarjestaja_en": "xxxx", "kysymys_en": "xxxx",
    #     "koulutustoimija_en": "xxxx", "tutkintotunnus": "xxxx", "numerovalinta": 2,
    #     "valmistavan_koulutuksen_oppilaitos_sv": "xxxx", "kysymys_sv": "xxxx",
    #     "kysymysjarjestys": 0, "opintoala_sv": "xxxx", "monivalintavaihtoehto": "xxxx",
    #     "kysymysid": 1234, "valmistavan_koulutuksen_oppilaitos_fi": "xxxx",
    #     "kysely_en": "xxxx", "vastaustyyppi": "xxxx", "kysymysryhma": "xxxx",
    #     "tutkinto_en": null, "kunta": null, "kysymysryhmaid": 110,
    #     "kysymysryhmajarjestys": 0, "vaihtoehto": null, "opintoala_fi": "xxxx",
    #     "kysymys_fi": "Ik", "vastaajaid": 123, "kyselyid": 111,
    #     "valmistavan_koulutuksen_jarjestaja_fi": "xxxx", "kysymysryhma_en": "xxxx",
    #     "kysely_sv": "xxxx", "kysymysryhma_fi": "xxxx", "opintoalatunnus": "xxxx",
    #     "valmistavan_koulutuksen_jarjestaja_sv": "xxxx",
    #     "vastausaika": "2017-02-05T22:00:00Z", "tunnus": "xxxx",
    #     "valmistavan_koulutuksen_jarjestaja": "xxxx", "koulutustoimija_fi": "xxxx",
    #     "kysely_loppupvm": null, "koulutusmuoto": null, "kyselykerta": "xxxx",
    #     "valmistavan_koulutuksen_oppilaitos_en": "xxxx", "valtakunnallinen": true,
    #     "tutkinto_sv": null, "koulutustoimija_sv": "xxxx",
    #     "valmistavan_koulutuksen_oppilaitos": "xxxx", "kysely_fi": "xxxx",
    #     "kysymysryhma_sv": "xxxx"
    #   } ],
    #   "pagination": { "next_url": "null" } }

    # Makes the possible DELETE operation (due to condition) happen only once.
    first_loop = True
    while True:
        show("begin " + url + " " + schema + " " + table + " " + (condition or ""))
        show("load from " + url)

        reqheaders = {'Content-Type': 'application/json'}
        # api credentials from env vars
        apiuser = os.getenv("API_USERNAME")
        if apiuser:
            show("using authentication")
            # A missing API_PASSWORD used to crash on str + None.
            apipass = os.getenv("API_PASSWORD") or ""
            reqheaders['Authorization'] = 'Basic %s' % base64.b64encode(
                apiuser + ":" + apipass)

        # Single-argument print(...) emits the same text under Python 2 and 3.
        try:
            r = requests.get(url, headers=reqheaders)
        except requests.exceptions.RequestException as e:
            print(e)
            sys.exit(1)

        if r.status_code != 200:
            print("Error! HTTP status code: " + str(r.status_code))
            sys.exit(2)

        try:
            result = json.loads(r.content)
        except ValueError as e:
            print(e)
            sys.exit(3)

        if "pagination" not in result or "data" not in result:
            print("Error! Received JSON-data not valid.")
            sys.exit(4)

        # everything is fine
        show("api call OK")

        # Save in DB only the part before ?-mark:
        # https://arvo.csc.fi/api/vipunen?alkupvm=2018-01-01&loppupvm=2018-02-01
        address = url.split("?")[0]

        # remove data conditionally, otherwise empty
        # merge operation could be considered here...
        if first_loop:
            # Done only on the first go (no matter if Arvo returns one or
            # multiple pages).
            if condition:
                show("remove from %s.%s with condition '%s'" % (schema, table, condition))
                # NOTE(review): condition is spliced into the SQL verbatim;
                # it must come from a trusted source.
                dboperator.execute("DELETE FROM %s.%s WHERE %s" % (schema, table, condition))
            else:
                show("empty %s.%s" % (schema, table))
                dboperator.empty(schema, table)
            first_loop = False  # Do not make the DELETE-operation anymore!

        show("insert data")
        cnt = 0
        for row in result["data"]:
            cnt += 1
            # show some sign of being alive
            if cnt % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if cnt % 1000 == 0:
                show("-- %d" % (cnt))

            # find out which columns to use on insert
            dboperator.resetcolumns(row)

            # flatten arrays/lists: store them as their JSON text
            for col in row:
                if isinstance(row[col], list):
                    row[col] = json.dumps(row[col])

            dboperator.insert(address, schema, table, row)

        show("wrote %d" % (cnt))
        show("ready")

        # The API signals the last page with the string "null"; also stop on
        # a real JSON null or a missing key, which previously led to a
        # request for url=None.
        next_url = result["pagination"].get("next_url")
        if next_url is None or next_url == "null":
            break  # exit while-loop. We are done.
        url = next_url
def load(url, schema, table, condition):
    """Load every page of an ARVO API result into schema.table.

    Results from ARVO-API can come in multiple pages. If that's the case,
    multiple requests are made, following "pagination" -> "next_url" of
    each response until it is the string "null", JSON null, or missing.

    Old rows are removed only once, after the first page has been fetched
    and validated: the rows matching `condition` when it is given,
    otherwise the whole table is emptied.

    When API_USERNAME is set in the environment a Basic Authorization
    header is sent (API_PASSWORD supplies the password; empty if unset).

    Args:
        url: address of the first page.
        schema: target database schema name.
        table: target table name.
        condition: SQL text placed after WHERE in the DELETE, or None/empty.

    Exit statuses: 1 request failed, 2 HTTP status other than 200,
    3 body is not JSON, 4 JSON lacks "pagination" or "data".
    """
    # Received data e.g.
    # { "data": [ {
    #     "taustakysymykset": true, "koulutustoimija": "xxxx", "vastausid": 1111,
    #     "kyselykertaid": 123, "kysely_alkupvm": "2016-11-29T22:00:00Z",
    #     "suorituskieli": "fi", "tutkinto_fi": "xxxx",
    #     "valmistavan_koulutuksen_jarjestaja_en": "xxxx", "kysymys_en": "xxxx",
    #     "koulutustoimija_en": "xxxx", "tutkintotunnus": "xxxx", "numerovalinta": 2,
    #     "valmistavan_koulutuksen_oppilaitos_sv": "xxxx", "kysymys_sv": "xxxx",
    #     "kysymysjarjestys": 0, "monivalintavaihtoehto": "xxxx", "kysymysid": 1234,
    #     "valmistavan_koulutuksen_oppilaitos_fi": "xxxx", "kysely_en": "xxxx",
    #     "vastaustyyppi": "xxxx", "kysymysryhma": "xxxx", "tutkinto_en": null,
    #     "kunta": null, "kysymysryhmaid": 110, "kysymysryhmajarjestys": 0,
    #     "vaihtoehto": null, "kysymys_fi": "Ik", "vastaajaid": 123, "kyselyid": 111,
    #     "valmistavan_koulutuksen_jarjestaja_fi": "xxxx", "kysymysryhma_en": "xxxx",
    #     "kysely_sv": "xxxx", "kysymysryhma_fi": "xxxx",
    #     "valmistavan_koulutuksen_jarjestaja_sv": "xxxx",
    #     "vastausaika": "2017-02-05T22:00:00Z", "tunnus": "xxxx",
    #     "valmistavan_koulutuksen_jarjestaja": "xxxx", "koulutustoimija_fi": "xxxx",
    #     "kysely_loppupvm": null, "koulutusmuoto": null, "kyselykerta": "xxxx",
    #     "valmistavan_koulutuksen_oppilaitos_en": "xxxx", "valtakunnallinen": true,
    #     "tutkinto_sv": null, "koulutustoimija_sv": "xxxx",
    #     "valmistavan_koulutuksen_oppilaitos": "xxxx", "kysely_fi": "xxxx",
    #     "kysymysryhma_sv": "xxxx"
    #   } ],
    #   "pagination": { "next_url": "null" } }

    # Makes the possible DELETE operation (due to condition) happen only once.
    first_loop = True
    while True:
        show("begin " + url + " " + schema + " " + table + " " + (condition or ""))
        show("load from " + url)

        reqheaders = {'Content-Type': 'application/json'}
        # api credentials from env vars
        apiuser = os.getenv("API_USERNAME")
        if apiuser:
            show("using authentication")
            # A missing API_PASSWORD used to crash on str + None.
            apipass = os.getenv("API_PASSWORD") or ""
            reqheaders['Authorization'] = 'Basic %s' % base64.b64encode(apiuser + ":" + apipass)

        # Single-argument print(...) emits the same text under Python 2 and 3.
        try:
            r = requests.get(url, headers=reqheaders)
        except requests.exceptions.RequestException as e:
            print(e)
            sys.exit(1)

        if r.status_code != 200:
            print("Error! HTTP status code: " + str(r.status_code))
            sys.exit(2)

        try:
            result = json.loads(r.content)
        except ValueError as e:
            print(e)
            sys.exit(3)

        if "pagination" not in result or "data" not in result:
            print("Error! Received JSON-data not valid.")
            sys.exit(4)

        # everything is fine
        show("api call OK")

        # Save in DB only the part before ?-mark:
        # https://arvo.csc.fi/api/vipunen?alkupvm=2018-01-01&loppupvm=2018-02-01
        address = url.split("?")[0]

        # remove data conditionally, otherwise empty
        # merge operation could be considered here...
        if first_loop:
            # Done only on the first go (no matter if Arvo returns one or
            # multiple pages).
            if condition:
                show("remove from %s.%s with condition '%s'" % (schema, table, condition))
                # NOTE(review): condition is spliced into the SQL verbatim;
                # it must come from a trusted source.
                dboperator.execute("DELETE FROM %s.%s WHERE %s" % (schema, table, condition))
            else:
                show("empty %s.%s" % (schema, table))
                dboperator.empty(schema, table)
            first_loop = False  # Do not make the DELETE-operation anymore!

        show("insert data")
        cnt = 0
        for row in result["data"]:
            cnt += 1
            # show some sign of being alive
            if cnt % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if cnt % 1000 == 0:
                show("-- %d" % (cnt))

            # find out which columns to use on insert
            dboperator.resetcolumns(row)

            # flatten arrays/lists: store them as their JSON text
            for col in row:
                if isinstance(row[col], list):
                    row[col] = json.dumps(row[col])

            dboperator.insert(address, schema, table, row)

        show("wrote %d" % (cnt))
        show("ready")

        # Last page is marked by the string "null" or a JSON null; a missing
        # key is treated the same way instead of raising KeyError.
        next_url = result["pagination"].get("next_url")
        if next_url is None or next_url == "null":
            break  # exit while-loop. We are done.
        url = next_url
dboperator.empty(schema, table) show("insert data") cnt = 0 for row in ijson.items(response, 'item'): cnt += 1 # show some sign of being alive if cnt % 100 == 0: sys.stdout.write('.') sys.stdout.flush() if cnt % 1000 == 0: show("-- %d" % (cnt)) if verbose: show("%d -- %s" % (cnt, row)) # find out which columns to use on insert dboperator.resetcolumns(row) # flatten arrays/lists for col in row: if type(row[col]) is list: row[col] = ''.join(map(str, json.dumps(row[col]))) dboperator.insert(address, schema, table, row) show("wrote %d" % (cnt)) show("ready") def usage(): print """ usage: load.py [-s|--secure] -H|--hostname <hostname> -u|--url <url> -e|--schema <schema> -t|--table <table> [-p|--postdata] [-c|--condition <condition>] [-v|--verbose]
def load(url, schema, table, condition):
    """Load every page of a VARDA API listing into schema.table.

    Results from VARDA-API can come in multiple pages. If that's the case,
    multiple requests are made, following the "next" link of each response
    until it is absent or empty.

    The table is emptied once, right after the first page has been fetched
    and parsed successfully, so an unreachable API does not wipe existing
    data. TODO: Delete based on condition (`condition` is currently only
    written to the log).

    When VARDA_API_KEY is set in the environment it is sent as a Token
    Authorization header.

    Args:
        url: address of the first page.
        schema: target database schema name.
        table: target table name.
        condition: only logged; see TODO above.

    Exit statuses: 1 request failed, 2 HTTP status other than 200,
    3 body is not JSON, 4 JSON has no "results" member.
    """
    first_page = True
    while True:
        show("begin " + url + " " + schema + " " + table + " " + (condition or ""))
        show("load from " + url)

        reqheaders = {'Content-Type': 'application/json'}
        # api credentials from env vars
        api_key = os.getenv("VARDA_API_KEY")
        if api_key:
            show("using authentication")
            reqheaders['Authorization'] = 'Token %s' % api_key

        # Single-argument print(...) emits the same text under Python 2 and 3.
        try:
            r = requests.get(url, headers=reqheaders)
        except requests.exceptions.RequestException as e:
            print(e)
            sys.exit(1)

        if r.status_code != 200:
            print("Error! HTTP status code: " + str(r.status_code))
            sys.exit(2)

        try:
            data = json.loads(r.content)
        except ValueError as e:
            print(e)
            sys.exit(3)

        # Fail with a clear status instead of a KeyError traceback below.
        if not isinstance(data, dict) or "results" not in data:
            print("Error! Received JSON-data not valid.")
            sys.exit(4)

        # everything is fine
        show("api call OK")

        if first_page:
            # Start from scratch, but only once we know the API answers.
            dboperator.execute("DELETE FROM %s.%s" % (schema, table))
            first_page = False

        # Parse URL
        # From https://varda.oso-pilot.csc.fi/api/v1/toimipisteet/
        # save in DB https://varda.oso-pilot.csc.fi
        n = url.find("/", url.find("/") + 2)  # find 3rd occurrence of "/"
        # Without a 3rd "/" find() gives -1 and url[:-1] would drop the
        # last character; keep the whole url in that case.
        address = url[:n] if n != -1 else url

        show("insert data")
        cnt = 0
        for row in data["results"]:
            cnt += 1
            # show some sign of being alive
            if cnt % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if cnt % 1000 == 0:
                show("-- %d" % (cnt))

            # Explicitly remove the keys from dict (row) that are not saved
            # in Antero DB; a key that is already absent is not an error.
            for key in ('johtaja', 'url', 'muutos_pvm', 'vaka_jarjestaja', 'toimipaikat'):
                row.pop(key, None)

            # find out which columns to use on insert
            dboperator.resetcolumns(row)

            # flatten arrays/lists: store them as their JSON text
            for col in row:
                if isinstance(row[col], list):
                    row[col] = json.dumps(row[col])

            dboperator.insert(address, schema, table, row)

        show("wrote %d" % (cnt))
        show("ready")

        next_url = data.get("next")
        if not next_url:
            break  # exit while-loop. We are done.
        url = next_url