def convert(compressedJSONFile, destDir=".", force=False, skipchecks=False):
    """Convert a compressed NDJSON dump into a per-day NPGEO-RKI CSV file.

    The target filename is derived from the date encoded in the dump's
    filename. Unless ``skipchecks`` is set, the row count is compared
    against the previous day's CSV to detect broken/truncated dumps and to
    decide whether an existing CSV must be rebuilt.

    :param compressedJSONFile: path to an lzma-compressed NDJSON dump
    :param destDir: directory the CSV is written to
    :param force: rebuild the CSV even if it exists and looks sane
    :param skipchecks: skip the row-count sanity checks entirely
    :returns: None; may return early without writing when the new dump
              contains fewer rows than yesterday's file
    """
    path = os.path.normpath(compressedJSONFile)
    fileName = path.split(os.sep)[-1]
    date = datetimeFromARDFilename(fileName)
    day = cd.dayFromDate(date)
    newFile = destDir + "/NPGEO-RKI-{}.csv".format(cd.dateStrYMDFromDay(day))
    redo = False
    # BUGFIX: these were previously assigned only inside the
    # "if not skipchecks" branch but are read unconditionally below,
    # so skipchecks=True raised NameError. Initialize them up front.
    # yesterDayRows == -1 means "no previous file / checks skipped".
    yesterDayRows = -1
    # day numbers where anomalies in the official data are known and accepted
    allowedShrinkageDays = [33, 68]
    allowedSameDays = [33]
    allowedJumpDays = [46, 66]
    if not skipchecks:
        # check if previous file exists and make sure the current file is not broken
        previousFile = destDir + "/NPGEO-RKI-{}.csv".format(cd.dateStrYMDFromDay(day - 1))
        if os.path.isfile(previousFile):
            yesterdayFrame = dt.fread(previousFile)
            yesterDayRows = yesterdayFrame.nrows
        else:
            print("No file for previous day {}".format(day - 1))
        if not force and os.path.isfile(newFile) and yesterDayRows >= 0:
            existingFrame = dt.fread(newFile)
            existingRows = existingFrame.nrows
            if existingRows < yesterDayRows:
                # cumulative data should never shrink, except on whitelisted days
                if day not in allowedShrinkageDays:
                    print("Existing .csv file for day {} contains less rows ({}) than previous day file ({}), redoing".format(day, existingRows, yesterDayRows))
                    redo = True
                else:
                    print("On day {} the number of rows was reduced from {} to compared to yesterday's file ({})".format(day, existingRows, yesterDayRows))
            else:
                if existingRows == yesterDayRows:
                    # identical row count usually means yesterday's data was re-served
                    if day not in allowedSameDays:
                        print("Existing .csv file for day {} contains same number of rows ({}) than previous day file ({}), redoing".format(day, existingRows, yesterDayRows))
                        redo = True
                    else:
                        print(
                            "Existing .csv file for day {} contains same number of rows ({}) than previous day file ({}) but we can't do anything about it".format(
                                day, existingRows, yesterDayRows))
                elif (existingRows > yesterDayRows * 1.1) and (existingRows - yesterDayRows > 5000) and day not in allowedJumpDays:
                    # a jump of >10% AND >5000 rows is suspicious unless whitelisted
                    print("Existing .csv file for day {} contains much more rows ({}) than previous day file ({}), redoing".format(day, existingRows, yesterDayRows))
                    redo = True
                print("Existing .csv file contains {} rows, {} more than yesterday".format(existingRows, existingRows - yesterDayRows))
    if force or redo or not os.path.isfile(newFile):
        print("Loading " + compressedJSONFile)
        #with bz2.open(compressedJSONFile, "rb") as f:
        with lzma.open(compressedJSONFile, "rb") as f:
            content = ndjson.load(f)
        frame = dt.Frame(content)
        # reject dumps that shrank versus yesterday (with yesterDayRows == -1
        # when checks were skipped, this comparison is always False)
        if frame.nrows <= yesterDayRows and day not in allowedShrinkageDays:
            print("Rejecting '{}' because it contains less rows than yesterdays file".format(compressedJSONFile))
            return
        print("Saving " + newFile)
        frame.to_csv(newFile)
    else:
        print("Skipping '{}' because '{}' already exists".format(compressedJSONFile, newFile))
def main():
    """CLI entry point: download RKI/NPGEO data and store it as a JSON dump
    and as a .csv archive file.

    Exits the process via exit(): 0 on successful API download, 1 when the
    API download failed; see NOTE(review) below for the .csv path.
    """
    parser = argparse.ArgumentParser(description='Download RKI/NPGEO data and save as json-dump and .csv')
    parser.add_argument('-j', '--json-dump-dir', dest='dumpDir', default="dumps")
    parser.add_argument('-c', '--csv-dir', dest='csvDir', default="archive_csv")
    parser.add_argument("-f","--force", dest='force', help="download even if data for day already exist", action="store_true")
    parser.add_argument("-R","--resume", dest='resume', help="download even if data for day already exist (API download only). Barely tested)", action="store_true")
    parser.add_argument('-r','--retry', dest='maxRetries',type=int, default=10, help='Number of retries before giving up on a single request; each retry waits 3 second longer')
    parser.add_argument("-F","--fetchcsv", dest='fetchcsv', help="fall back to directly download as .csv file, not using the api", action="store_true")
    args = parser.parse_args()
    print(args)
    # determine the server-side "Datenstand" (data version) either from the
    # direct .csv download or from the API record version
    if args.fetchcsv:
        datenStand = retrieveLatestCsvDate(args)
        datenStandDay = cd.dayFromDate(datenStand)
    else:
        datenStand = getRecordVersionOnServer()
        datenStandDay = cd.dayFromDatenstand(datenStand)
    # archive (json) and csv target filenames for that day
    afn = archiveFilename(datenStandDay, args.dumpDir)
    cfn = csvFilename(datenStandDay, "fullDaily", args.csvDir)
    if not os.path.isfile(afn) and not os.path.isfile(cfn):
        print("New data available, Stand: {} Tag: {}, downloading...".format(datenStand, datenStandDay))
    else:
        print("Data already locally exists, Stand: {} Tag: {}".format(datenStand, datenStandDay))
        if args.force:
            print("Forcing Download because '--force' is set")
    if (not os.path.isfile(afn) and not os.path.isfile(cfn)) or args.force:
        # timestamped raw dump filename, independent of the archive filename
        dfn = "dumps/dump-rki-" + time.strftime("%Y%m%d-%H%M%S") + "-Stand-" + cd.dateStrYMDFromDay(
            datenStandDay) + ".json"
        if not args.fetchcsv:
            allRecords = retrieveAllRecords(args)
            if allRecords is not None:
                # always save the timestamped dump; archive/csv only if missing or forced
                pmu.saveJson(dfn, allRecords)
                if not os.path.isfile(afn) or args.force:
                    pmu.saveJson(afn, allRecords)
                if not os.path.isfile(cfn) or args.force:
                    pmu.saveCsv(cfn, allRecords)
                exit(0)
            else:
                print("failed to retrieve data")
                exit(1)
        else:
            # download the .csv
            if downloadCsv(args, cfn):
                print("Successfully downloaded .csv")
                dataDict = pmu.loadCsv(cfn)
                pmu.saveJson(afn, dataDict)
                # NOTE(review): exit code 9 on the success path looks like a
                # typo for 0, and a failed downloadCsv falls through without
                # an explicit error exit — confirm intended exit codes
                exit(9)
def convert(compressedJSONFile, destDir=".", force=False):
    """Convert one lzma-compressed NDJSON dump into its per-day CSV file.

    The output name NPGEO-RKI-<date>.csv is derived from the date encoded
    in the dump's filename. Existing files are left alone unless ``force``
    is set.
    """
    baseName = os.path.normpath(compressedJSONFile).split(os.sep)[-1]
    dumpDay = cd.dayFromDate(datetimeFromARDFilename(baseName))
    target = destDir + "/NPGEO-RKI-{}.csv".format(cd.dateStrYMDFromDay(dumpDay))
    # guard clause: nothing to do when the CSV already exists and no force
    if not force and os.path.isfile(target):
        print("Skipping '{}' because '{}' already exists".format(
            compressedJSONFile, target))
        return
    print("Loading " + compressedJSONFile)
    #with bz2.open(compressedJSONFile, "rb") as f:
    with lzma.open(compressedJSONFile, "rb") as f:
        records = ndjson.load(f)
    table = dt.Frame(records)
    print("Saving " + target)
    table.to_csv(target)
def isNewData(dataFilename, daysIncluded):
    """Return True when the Datenstand day of ``dataFilename`` is not yet
    contained in ``daysIncluded``.

    Only the first row of the file is read to extract the "Datenstand"
    column; memory usage is traced via pmu at entry, middle and exit.
    """
    pmu.printMemoryUsage("begin of isNewData")
    firstRow = dt.fread(dataFilename, max_nrows=1)
    print("Checking " + dataFilename)
    standStr = firstRow[0, "Datenstand"]
    print("Datenstand", standStr)
    standDay = cd.dayFromDate(cd.datetimeFromDatenstandAny(standStr))
    pmu.printMemoryUsage("isNewData")
    fresh = standDay not in daysIncluded
    if fresh:
        print("contains new day {}".format(standDay))
    else:
        print("contains day {} already in full table".format(standDay))
    pmu.printMemoryUsage("end of isNewData")
    return fresh
def unify(table):
    """Normalize a daily RKI datatable to a common column layout.

    Adds columns missing in older dumps (DatenstandISO, Refdatum/RefdatumISO,
    IstErkrankungsbeginn, NeuGenesen/AnzahlGenesen), drops Altersgruppe2, and
    derives per-row day numbers (MeldeTag, RefTag, DatenstandTag) plus a
    FallGruppe key. Returns a new dt.Frame built from the mutated columns.

    NOTE(review): relies on module-level Census and Flaeche via
    checkLandkreisData — confirm they are initialized before calling.
    """
    dss = table[0, "Datenstand"]
    ds = cd.datetimeFromDatenstandAny(dss)
    dsdy = cd.dayFromDate(ds)
    hasRefdatum = "Refdatum" in table.names
    hasErkrankungsbeginn = "IstErkrankungsbeginn" in table.names
    #t = table.copy()
    t = table
    if "Altersgruppe2" in table.names:
        t = t[:, dt.f[:].remove(dt.f["Altersgruppe2"])]
    if not "DatenstandISO" in table.names:
        isodate = cd.dateStrYMDFromDay(dsdy)
        t = t[:, dt.f[:].extend({"DatenstandISO": isodate})]
    if not hasRefdatum:
        # fall back to day0 as Refdatum and mirror MeldedatumISO
        t = t[:, dt.f[:].extend({
            "Refdatum": str(cd.day0d),
            "RefdatumISO": dt.f.MeldedatumISO
        })]
    if not hasErkrankungsbeginn:
        t = t[:, dt.f[:].extend({"IstErkrankungsbeginn": 0})]
    if "NeuGenesen" not in table.names:
        t = t[:, dt.f[:].extend({"NeuGenesen": -9, "AnzahlGenesen": 0})]
    # derived columns filled row-by-row below
    t = t[:, dt.f[:].extend({
        "FallGruppe": "",
        "MeldeTag": nan,
        "RefTag": nan,
        "DatenstandTag": dsdy
    })]
    #t = t[:, dt.f[:].extend({"Bevoelkerung":0, "FaellePro100k":0.0, "TodesfaellePro100k":0.0, "isStadt":False})]
    #t = t[:, dt.f[:].extend({"Flaeche":0.0, "FaelleProKm2":0.0, "TodesfaelleProKm2":0.0, "Dichte":0.0})]
    #print("unified fields", t.names)
    #Bevoelkerung = loadLandkreisBeveolkerung()
    #Flaeche = loadLandkreisFlaeche()
    #Census = loadCensus()
    #pmu.printMemoryUsage("unify pre realize ")
    #t.materialize(to_memory=True)
    pmu.printMemoryUsage("unify pre dict")
    # work on plain Python lists; much faster than per-cell frame access
    d = t.to_dict()
    pmu.printMemoryUsage("unify post dict")
    print("> iterating through {} rows".format(t.nrows))
    start = time.perf_counter()
    for r in range(t.nrows):
        mds = d["Meldedatum"][r]
        # Meldedatum may be an epoch stamp (int) or a date string
        if pmu.is_int(mds):
            md = cd.datetimeFromStampStr(mds)
        else:
            md = datetimeFromDateStr3(mds)
        mdy = cd.dayFromDate(md)
        d["MeldeTag"][r] = mdy
        if not hasRefdatum:
            d["Refdatum"][r] = str(md)
            d["RefTag"][r] = mdy
        # FallGruppe key: Landkreis + age group + sex + Meldetag
        fg = str(d["IdLandkreis"]
                 [r]) + d["Altersgruppe"][r] + d["Geschlecht"][r] + str(
                     int(d["MeldeTag"][r]))
        if int(d["IstErkrankungsbeginn"][r]) == 1:
            # Refdatum is the actual onset date; append it to the key
            rds = d["Refdatum"][r]
            if pmu.is_int(rds):
                rd = cd.datetimeFromStampStr(rds)
            else:
                rd = datetimeFromDateStr3(rds)
            rdy = cd.dayFromDate(rd)
            d["RefTag"][r] = rdy
            fg = fg + ":" + str(rdy)
        d["FallGruppe"][r] = fg
        checkLandkreisData(d, r, Census, Flaeche)
    finish = time.perf_counter()
    print("< iterating through {} rows done, {:.1f} rows/sec".format(
        t.nrows, t.nrows / (finish - start)))
    pmu.printMemoryUsage("end of unify, pre frame")
    t = dt.Frame(d)
    pmu.printMemoryUsage("end of unify, post frame")
    return t
def unify(table, makeFallGruppe=False):
    """Normalize a daily RKI datatable to a common column layout
    (newer variant; shadows the earlier unify definition).

    Renames FID to ObjectId, adds columns missing in older dumps
    (DatenstandISO, Refdatum, RefdatumISO, MeldedatumISO,
    IstErkrankungsbeginn, NeuGenesen/AnzahlGenesen), converts date columns
    to ticks and derives MeldeTag/RefTag/DatenstandTag per row. When
    ``makeFallGruppe`` is True, also builds the FallGruppe key column.
    Returns a new dt.Frame built from the mutated columns.
    """
    dss = table[0, "Datenstand"]
    ds = cd.datetimeFromDatenstandAny(dss)
    if 'FID' in table.names:
        table.names = {"FID": "ObjectId"}
    dsdy = cd.dayFromDate(ds)
    dsisodate = cd.dateStrYMDFromDay(dsdy)
    hasRefdatum = "Refdatum" in table.names
    hasErkrankungsbeginn = "IstErkrankungsbeginn" in table.names
    #t = table.copy()
    t = table
    if "Altersgruppe2" in table.names:
        t = t[:, dt.f[:].remove(dt.f["Altersgruppe2"])]
    if not "DatenstandISO" in table.names:
        t = t[:, dt.f[:].extend({"DatenstandISO": dsisodate})]
    if not hasRefdatum:
        t = t[:, dt.f[:].extend({"Refdatum": 0})]
    hasRefdatumISO = "RefdatumISO" in table.names
    if not hasRefdatumISO:
        #print("t1",t.names)
        t = t[:, dt.f[:].extend({"RefdatumISO": ""})]
        #print("t2",t.names)
    hasMeldedatumISO = "MeldedatumISO" in table.names
    if not hasMeldedatumISO:
        t = t[:, dt.f[:].extend({"MeldedatumISO": ""})]
    if not hasErkrankungsbeginn:
        t = t[:, dt.f[:].extend({"IstErkrankungsbeginn": 0})]
    if "NeuGenesen" not in table.names:
        t = t[:, dt.f[:].extend({"NeuGenesen": -9, "AnzahlGenesen": 0})]
    if makeFallGruppe:
        t = t[:, dt.f[:].extend({
            "FallGruppe": "",
            "MeldeTag": nan,
            "RefTag": nan,
            "DatenstandTag": dsdy
        })]
    # NOTE(review): when makeFallGruppe is True this second extend repeats
    # MeldeTag/RefTag/DatenstandTag from the branch above — possibly a
    # missing `else`; confirm against the original source
    t = t[:, dt.f[:].extend({
        "MeldeTag": nan,
        "RefTag": nan,
        "DatenstandTag": dsdy
    })]
    pmu.printMemoryUsage("unify pre dict")
    # work on plain Python lists; much faster than per-cell frame access
    d = t.to_dict()
    pmu.printMemoryUsage("unify post dict")
    print("> iterating through {} rows".format(t.nrows))
    start = time.perf_counter()
    for r in range(t.nrows):
        mds = d["Meldedatum"][r]
        # Meldedatum may be an epoch stamp (int) or a date string
        if pmu.is_int(mds):
            md = cd.datetimeFromStampStr(mds)
        else:
            md = datetimeFromDateStr3(mds)
        d["Meldedatum"][r] = ticksFromDateTime(md)
        mdy = cd.dayFromDate(md)
        d["MeldeTag"][r] = mdy
        if not hasRefdatum:
            d["Refdatum"][r] = ticksFromDateTime(md)
            d["RefTag"][r] = mdy
        if not hasMeldedatumISO:
            d["MeldedatumISO"][r] = cd.dateStrYMDFromDay(mdy)
        if makeFallGruppe:
            # FallGruppe key: Landkreis + age group + sex + Meldetag
            fg = str(d["IdLandkreis"]
                     [r]) + d["Altersgruppe"][r] + d["Geschlecht"][r] + str(
                         int(d["MeldeTag"][r]))
        # condition deliberately disabled: Refdatum is normalized for every
        # row, not only when IstErkrankungsbeginn == 1
        #if int(d["IstErkrankungsbeginn"][r]) == 1:
        rds = d["Refdatum"][r]
        if pmu.is_int(rds):
            rd = cd.datetimeFromStampStr(rds)
        else:
            rd = datetimeFromDateStr3(rds)
        d["Refdatum"][r] = ticksFromDateTime(rd)
        rdy = cd.dayFromDate(rd)
        d["RefTag"][r] = rdy
        if not hasRefdatumISO:
            d["RefdatumISO"][r] = cd.dateStrYMDFromDay(rdy)
        if makeFallGruppe:
            fg = fg + ":" + str(rdy)
        if makeFallGruppe:
            d["FallGruppe"][r] = fg
        checkLandkreisData(d, r, Census, Flaeche)
    finish = time.perf_counter()
    print("< iterating through {} rows done, {:.1f} rows/sec".format(
        t.nrows, t.nrows / (finish - start)))
    pmu.printMemoryUsage("end of unify, pre frame")
    t = dt.Frame(d)
    pmu.printMemoryUsage("end of unify, post frame")
    return t