示例#1
0
文件: load_v2.py 项目: CSCfi/antero
def load(secure,hostname,url,schema,table,postdata,condition,verbose,rowcount):
  show("begin "+hostname+" "+url+" "+schema+" "+table+" "+(postdata or "")+" "+(condition or ""))
  if secure:
    address = "https://"+hostname+url
  else:
    address = "http://"+hostname+url
  show("load from "+address)

  reqheaders = {'Content-Type': 'application/json'}
  reqheaders['Caller-Id'] = '1.2.246.562.10.2013112012294919827487.vipunen'

  # api credentials from env vars
  if os.getenv("API_USERNAME"):
    show("using authentication")
    apiuser = os.getenv("API_USERNAME")
    apipass = os.getenv("API_PASSWORD")
    reqheaders['Authorization'] = 'Basic %s' % base64.b64encode(apiuser+":"+apipass)

  # automatic POST with (post)data
  print("value used for , -r, --rowcount=", rowcount)
  request = urllib2.Request(address, data=postdata, headers=reqheaders)
  try:
    response = urllib2.urlopen(request)
  except httplib.IncompleteRead as e:
    show('IncompleteRead exception.')
    show('Received: %d'%(e.partial))
    sys.exit(2)
  except urllib2.HTTPError as e:
    show('The server couldn\'t fulfill the request.')
    show('Error code: %d'%(e.code))
    sys.exit(2)
  except urllib2.URLError as e:
    show('We failed to reach a server.')
    show('Reason: %s'%(e.reason))
    sys.exit(2)
  else:
    # everything is fine
    show("api call OK")

  # remove data conditionally, otherwise empty
  # merge operation could be considered here...
  if condition:
    show("remove from %s.%s with condition '%s'"%(schema,table,condition))
    dboperator.execute("DELETE FROM %s.%s WHERE %s"%(schema,table,condition))
  else:
    show("empty %s.%s"%(schema,table))
    dboperator.empty(schema,table)

  show("insert data")
  cnt=0
  manycount = 0
  rows = []

  for row in ijson.items(response,'item'):
        cnt+=1
        manycount+=1
        # show some sign of being alive
        if cnt%100 == 0:
          sys.stdout.write('.')
          sys.stdout.flush()
        if cnt%1000 == 0:
          show("-- %d" % (cnt))
        if verbose: show("%d -- %s"%(cnt,row))

        # find out which columns to use on insert
        dboperator.resetcolumns(row)

        # flatten arrays/lists
        for col in row:
            if type(row[col]) is list:
                row[col] = ''.join(map(str,json.dumps(row[col])))
        rows.append(row)
        if cnt == 1:
            dboperator.insert(address,schema,table,row)
            manycount = 0
            rows = []
        if cnt > 1:
            if manycount == rowcount:
                insert(address,schema,table,rows)
                manycount = 0
                rows = []
  if len(rows) <= manycount and len(rows) > 0:
      insert(address,schema,table,rows)
      rows = []
      manycount = 0

  show("wrote %d"%(cnt))
  show("ready")
示例#2
0
    dboperator.empty(schema,table)

  show("insert data")
  cnt=0
  for row in ijson.items(response,'item'):
    cnt+=1
    # show some sign of being alive
    if cnt%100 == 0:
      sys.stdout.write('.')
      sys.stdout.flush()
    if cnt%1000 == 0:
      show("-- %d" % (cnt))
    if verbose: show("%d -- %s"%(cnt,row))

    # find out which columns to use on insert
    dboperator.resetcolumns(row)

    # flatten arrays/lists
    for col in row:
      if type(row[col]) is list:
        row[col] = ''.join(map(str,json.dumps(row[col])))

    dboperator.insert(address,schema,table,row)

  show("wrote %d"%(cnt))
  show("ready")

def usage():
  print """
usage: load.py [-s|--secure] -H|--hostname <hostname> -u|--url <url> -e|--schema <schema> -t|--table <table> [-p|--postdata] [-c|--condition <condition>] [-v|--verbose]
"""
示例#3
0
def load(url, schema, table, condition):
    """
    Results from VARDA-API can come in multiple pages. If that's the case,
    we need to make multiple requests to the VARDA API, using the "next" parameter.
    """

    # First delete all from TABLE -> Start from scratch. TODO: Delete based on condition.
    dboperator.execute("DELETE FROM %s.%s" % (schema, table))

    while True:
        show("begin " + url + " " + schema + " " + table + " " +
             (condition or ""))
        show("load from " + url)

        reqheaders = {'Content-Type': 'application/json'}
        reqheaders[
            'Caller-Id'] = '1.2.246.562.10.2013112012294919827487.vipunen'
        # api credentials from env vars
        if os.getenv("VARDA_API_KEY"):
            show("using authentication")
            api_key = os.getenv("VARDA_API_KEY")
            reqheaders['Authorization'] = 'Token %s' % api_key

        try:
            r = requests.get(url, headers=reqheaders)
        except requests.exceptions.RequestException as e:
            print("RequestException: ", e)
            sys.exit(1)

        if r.status_code != 200:
            print("Error! HTTP status code: " + str(r.status_code))
            sys.exit(2)

        try:
            data = json.loads(r.content)
        except ValueError as e:
            print("ValueError: ", e)
            sys.exit(3)

        # everything is fine
        show("api call OK")

        # Parse URL
        # From https://varda.oso-pilot.csc.fi/api/v1/toimipisteet/ save in DB https://varda.oso-pilot.csc.fi
        n = url.find("/", url.find("/") + 2)  # find 3rd occurance of "/"
        address = url[:n]

        show("insert data")
        cnt = 0
        for row in data["results"]:
            cnt += 1
            # show some sign of being alive
            if cnt % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if cnt % 1000 == 0:
                show("-- %d" % (cnt))

            # Explicitly remove the keys from dict (row) that are not saved in Antero DB
            del row['johtaja']
            del row['url']
            del row['muutos_pvm']
            del row['vaka_jarjestaja']
            del row['toimipaikat']

            # find out which columns to use on insert
            dboperator.resetcolumns(row)

            # flatten arrays/lists
            for col in row:
                if type(row[col]) is list:
                    row[col] = ''.join(map(str, json.dumps(row[col])))

            dboperator.insert(address, schema, table, row)

        show("wrote %d" % (cnt))
        show("ready")

        if data["next"] is None:
            break  # exit while-loop. We are done.
        else:
            url = data["next"]
示例#4
0
def load(url, schema, table, condition):
    """
    Results from ARVO-API can come in multiple pages. If that's the case,
    we need to make multiple requests to the ARVO API, using the "next_url" parameter.
    """

    FIRST_LOOP = True  # This is used to make possible DELETE operation (due to condition) only once.

    while True:
        show("begin " + url + " " + schema + " " + table + " " +
             (condition or ""))
        show("load from " + url)

        reqheaders = {'Content-Type': 'application/json'}
        # api credentials from env vars
        if os.getenv("API_USERNAME"):
            show("using authentication")
            apiuser = os.getenv("API_USERNAME")
            apipass = os.getenv("API_PASSWORD")
            reqheaders['Authorization'] = 'Basic %s' % base64.b64encode(
                apiuser + ":" + apipass)

        try:
            r = requests.get(url, headers=reqheaders)
        except requests.exceptions.RequestException as e:
            print e
            sys.exit(1)

        if r.status_code != 200:
            print "Error! HTTP status code: " + str(r.status_code)
            sys.exit(2)

        try:
            result = json.loads(r.content)
        except ValueError as e:
            print e
            sys.exit(3)

        if "pagination" not in result or "data" not in result:
            print "Error! Received JSON-data not valid."
            sys.exit(4)

        # everything is fine
        show("api call OK")
        """
          Received data e.g.

            {
              "data": [
                {
                  "taustakysymykset": true,
                  "koulutustoimija": "xxxx",
                  "vastausid": 1111,
                  "kyselykertaid": 123,
                  "kysely_alkupvm": "2016-11-29T22:00:00Z",
                  "suorituskieli": "fi",
                  "tutkinto_fi": "xxxx",
                  "opintoala_en": "xxxx",
                  "valmistavan_koulutuksen_jarjestaja_en": "xxxx",
                  "kysymys_en": "xxxx",
                  "koulutustoimija_en": "xxxx",
                  "tutkintotunnus": "xxxx",
                  "numerovalinta": 2,
                  "valmistavan_koulutuksen_oppilaitos_sv": "xxxx",
                  "kysymys_sv": "xxxx",
                  "kysymysjarjestys": 0,
                  "opintoala_sv": "xxxx",
                  "monivalintavaihtoehto": "xxxx",
                  "kysymysid": 1234,
                  "valmistavan_koulutuksen_oppilaitos_fi": "xxxx",
                  "kysely_en": "xxxx",
                  "vastaustyyppi": "xxxx",
                  "kysymysryhma": "xxxx",
                  "tutkinto_en": null,
                  "kunta": null,
                  "kysymysryhmaid": 110,
                  "kysymysryhmajarjestys": 0,
                  "vaihtoehto": null,
                  "opintoala_fi": "xxxx",
                  "kysymys_fi": "Ik",
                  "vastaajaid": 123,
                  "kyselyid": 111,
                  "valmistavan_koulutuksen_jarjestaja_fi": "xxxx",
                  "kysymysryhma_en": "xxxx",
                  "kysely_sv": "xxxx",
                  "kysymysryhma_fi": "xxxx",
                  "opintoalatunnus": "xxxx",
                  "valmistavan_koulutuksen_jarjestaja_sv": "xxxx",
                  "vastausaika": "2017-02-05T22:00:00Z",
                  "tunnus": "xxxx",
                  "valmistavan_koulutuksen_jarjestaja": "xxxx",
                  "koulutustoimija_fi": "xxxx",
                  "kysely_loppupvm": null,
                  "koulutusmuoto": null,
                  "kyselykerta": "xxxx",
                  "valmistavan_koulutuksen_oppilaitos_en": "xxxx",
                  "valtakunnallinen": true,
                  "tutkinto_sv": null,
                  "koulutustoimija_sv": "xxxx",
                  "valmistavan_koulutuksen_oppilaitos": "xxxx",
                  "kysely_fi": "xxxx",
                  "kysymysryhma_sv": "xxxx"
                }
              ],
              "pagination": {
                "next_url": "null"
              }
            }

        """

        address = url.split(
            "?"
        )[0]  # Save in DB only the part before ?-mark: https://arvo.csc.fi/api/vipunen?alkupvm=2018-01-01&loppupvm=2018-02-01

        # remove data conditionally, otherwise empty
        # merge operation could be considered here...
        if FIRST_LOOP:  # This is done only on the first go (no matter if Arvo returns one or multiple pages)
            if condition:
                show("remove from %s.%s with condition '%s'" %
                     (schema, table, condition))
                dboperator.execute("DELETE FROM %s.%s WHERE %s" %
                                   (schema, table, condition))
            else:
                show("empty %s.%s" % (schema, table))
                dboperator.empty(schema, table)

        show("insert data")
        cnt = 0
        for row in result["data"]:
            cnt += 1
            # show some sign of being alive
            if cnt % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if cnt % 1000 == 0:
                show("-- %d" % (cnt))

            # find out which columns to use on insert
            dboperator.resetcolumns(row)

            # flatten arrays/lists
            for col in row:
                if type(row[col]) is list:
                    row[col] = ''.join(map(str, json.dumps(row[col])))

            dboperator.insert(address, schema, table, row)

        show("wrote %d" % (cnt))
        show("ready")

        if result["pagination"]["next_url"] == "null":
            break  # exit while-loop. We are done.
        else:
            url = result["pagination"]["next_url"]
            FIRST_LOOP = False  # Do not make the possible DELETE-operation anymore!
def load(url, schema, table, condition):
    """
    Results from ARVO-API can come in multiple pages. If that's the case,
    we need to make multiple requests to the ARVO API, using the "next_url" parameter.
    """

    FIRST_LOOP = True  # This is used to make possible DELETE operation (due to condition) only once.

    while True:
        show("begin " + url + " " + schema + " " + table + " " + (condition or ""))
        show("load from " + url)

        reqheaders = {'Content-Type': 'application/json'}
        # api credentials from env vars
        if os.getenv("API_USERNAME"):
            show("using authentication")
            apiuser = os.getenv("API_USERNAME")
            apipass = os.getenv("API_PASSWORD")
            reqheaders['Authorization'] = 'Basic %s' % base64.b64encode(apiuser + ":" + apipass)

        try:
            r = requests.get(url, headers=reqheaders)
        except requests.exceptions.RequestException as e:
            print e
            sys.exit(1)

        if r.status_code != 200:
            print "Error! HTTP status code: " + str(r.status_code)
            sys.exit(2)

        try:
            result = json.loads(r.content)
        except ValueError as e:
            print e
            sys.exit(3)

        if "pagination" not in result or "data" not in result:
            print "Error! Received JSON-data not valid."
            sys.exit(4)

        # everything is fine
        show("api call OK")

        """
          Received data e.g.

            {
              "data": [
                {
                  "taustakysymykset": true,
                  "koulutustoimija": "xxxx",
                  "vastausid": 1111,
                  "kyselykertaid": 123,
                  "kysely_alkupvm": "2016-11-29T22:00:00Z",
                  "suorituskieli": "fi",
                  "tutkinto_fi": "xxxx",
                  "valmistavan_koulutuksen_jarjestaja_en": "xxxx",
                  "kysymys_en": "xxxx",
                  "koulutustoimija_en": "xxxx",
                  "tutkintotunnus": "xxxx",
                  "numerovalinta": 2,
                  "valmistavan_koulutuksen_oppilaitos_sv": "xxxx",
                  "kysymys_sv": "xxxx",
                  "kysymysjarjestys": 0,
                  "monivalintavaihtoehto": "xxxx",
                  "kysymysid": 1234,
                  "valmistavan_koulutuksen_oppilaitos_fi": "xxxx",
                  "kysely_en": "xxxx",
                  "vastaustyyppi": "xxxx",
                  "kysymysryhma": "xxxx",
                  "tutkinto_en": null,
                  "kunta": null,
                  "kysymysryhmaid": 110,
                  "kysymysryhmajarjestys": 0,
                  "vaihtoehto": null,
                  "kysymys_fi": "Ik",
                  "vastaajaid": 123,
                  "kyselyid": 111,
                  "valmistavan_koulutuksen_jarjestaja_fi": "xxxx",
                  "kysymysryhma_en": "xxxx",
                  "kysely_sv": "xxxx",
                  "kysymysryhma_fi": "xxxx",
                  "valmistavan_koulutuksen_jarjestaja_sv": "xxxx",
                  "vastausaika": "2017-02-05T22:00:00Z",
                  "tunnus": "xxxx",
                  "valmistavan_koulutuksen_jarjestaja": "xxxx",
                  "koulutustoimija_fi": "xxxx",
                  "kysely_loppupvm": null,
                  "koulutusmuoto": null,
                  "kyselykerta": "xxxx",
                  "valmistavan_koulutuksen_oppilaitos_en": "xxxx",
                  "valtakunnallinen": true,
                  "tutkinto_sv": null,
                  "koulutustoimija_sv": "xxxx",
                  "valmistavan_koulutuksen_oppilaitos": "xxxx",
                  "kysely_fi": "xxxx",
                  "kysymysryhma_sv": "xxxx"
                }
              ],
              "pagination": {
                "next_url": "null"
              }
            }

        """

        address = url.split("?")[0]  # Save in DB only the part before ?-mark: https://arvo.csc.fi/api/vipunen?alkupvm=2018-01-01&loppupvm=2018-02-01

        # remove data conditionally, otherwise empty
        # merge operation could be considered here...
        if FIRST_LOOP:  # This is done only on the first go (no matter if Arvo returns one or multiple pages)
            if condition:
                show("remove from %s.%s with condition '%s'" % (schema, table, condition))
                dboperator.execute("DELETE FROM %s.%s WHERE %s" % (schema, table, condition))
            else:
                show("empty %s.%s" % (schema, table))
                dboperator.empty(schema, table)

        show("insert data")
        cnt = 0
        for row in result["data"]:
            cnt += 1
            # show some sign of being alive
            if cnt % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if cnt % 1000 == 0:
                show("-- %d" % (cnt))

            # find out which columns to use on insert
            dboperator.resetcolumns(row)

            # flatten arrays/lists
            for col in row:
                if type(row[col]) is list:
                    row[col] = ''.join(map(str, json.dumps(row[col])))

            dboperator.insert(address, schema, table, row)

        show("wrote %d" % (cnt))
        show("ready")

        if result["pagination"]["next_url"] == "null" or result["pagination"]["next_url"] == None:
            break  # exit while-loop. We are done.
        else:
            url = result["pagination"]["next_url"]
            FIRST_LOOP = False  # Do not make the possible DELETE-operation anymore!
示例#6
0
        dboperator.empty(schema, table)

    show("insert data")
    cnt = 0
    for row in ijson.items(response, 'item'):
        cnt += 1
        # show some sign of being alive
        if cnt % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        if cnt % 1000 == 0:
            show("-- %d" % (cnt))
        if verbose: show("%d -- %s" % (cnt, row))

        # find out which columns to use on insert
        dboperator.resetcolumns(row)

        # flatten arrays/lists
        for col in row:
            if type(row[col]) is list:
                row[col] = ''.join(map(str, json.dumps(row[col])))

        dboperator.insert(address, schema, table, row)

    show("wrote %d" % (cnt))
    show("ready")


def usage():
    print """
usage: load.py [-s|--secure] -H|--hostname <hostname> -u|--url <url> -e|--schema <schema> -t|--table <table> [-p|--postdata] [-c|--condition <condition>] [-v|--verbose]
def load(url, schema, table, condition):
    """
    Results from VARDA-API can come in multiple pages. If that's the case,
    we need to make multiple requests to the VARDA API, using the "next" parameter.
    """

    # First delete all from TABLE -> Start from scratch. TODO: Delete based on condition.
    dboperator.execute("DELETE FROM %s.%s" % (schema, table))

    while True:
        show("begin " + url + " " + schema + " " + table + " " + (condition or ""))
        show("load from " + url)

        reqheaders = {'Content-Type': 'application/json'}
        # api credentials from env vars
        if os.getenv("VARDA_API_KEY"):
            show("using authentication")
            api_key = os.getenv("VARDA_API_KEY")
            reqheaders['Authorization'] = 'Token %s' % api_key

        try:
            r = requests.get(url, headers=reqheaders)
        except requests.exceptions.RequestException as e:
            print e
            sys.exit(1)

        if r.status_code != 200:
            print "Error! HTTP status code: " + str(r.status_code)
            sys.exit(2)

        try:
            data = json.loads(r.content)
        except ValueError as e:
            print e
            sys.exit(3)

        # everything is fine
        show("api call OK")

        # Parse URL
        # From https://varda.oso-pilot.csc.fi/api/v1/toimipisteet/ save in DB https://varda.oso-pilot.csc.fi
        n = url.find("/", url.find("/") + 2)  # find 3rd occurance of "/"
        address = url[:n]

        show("insert data")
        cnt = 0
        for row in data["results"]:
            cnt += 1
            # show some sign of being alive
            if cnt % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if cnt % 1000 == 0:
                show("-- %d" % (cnt))

            # Explicitly remove the keys from dict (row) that are not saved in Antero DB
            del row['johtaja']
            del row['url']
            del row['muutos_pvm']
            del row['vaka_jarjestaja']
            del row['toimipaikat']

            # find out which columns to use on insert
            dboperator.resetcolumns(row)

            # flatten arrays/lists
            for col in row:
                if type(row[col]) is list:
                    row[col] = ''.join(map(str, json.dumps(row[col])))

            dboperator.insert(address, schema, table, row)

        show("wrote %d" % (cnt))
        show("ready")

        if data["next"] is None:
            break  # exit while-loop. We are done.
        else:
            url = data["next"]