Example #1
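# Module-level imports assumed by these excerpts (not shown in the original
# snippets). `cd` and `pmu` are project-local helper modules (calendar-day and
# memory/print utilities); their real module names are not visible here, so
# they are only referenced by their aliases:
import os
import time
import lzma
import argparse

import ndjson           # newline-delimited JSON reader used by convert()
import datatable as dt  # dt.fread / dt.Frame used throughout
from math import nan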
def convert(compressedJSONFile, destDir=".", force = False, skipchecks=False):
    path = os.path.normpath(compressedJSONFile)
    fileName = path.split(os.sep)[-1]
    date = datetimeFromARDFilename(fileName)
    day = cd.dayFromDate(date)
    newFile = destDir+"/NPGEO-RKI-{}.csv".format(cd.dateStrYMDFromDay(day))

    redo = False
    yesterDayRows = -1  # defined up front so the row check after loading also works when skipchecks=True
    if not skipchecks:
        # check that the previous day's file exists and make sure the current file is not broken
        previousFile = destDir+"/NPGEO-RKI-{}.csv".format(cd.dateStrYMDFromDay(day-1))

        if os.path.isfile(previousFile):
            yesterdayFrame = dt.fread(previousFile)
            yesterDayRows = yesterdayFrame.nrows
        else:
            print("No file for previous day {}".format(day-1))

        allowedShrinkageDays = [33,68]
        allowedSameDays = [33]
        allowedJumpDays = [46,66]

        if not force and os.path.isfile(newFile) and yesterDayRows >= 0:
            existingFrame = dt.fread(newFile)
            existingRows = existingFrame.nrows
            if existingRows < yesterDayRows:
                if day not in allowedShrinkageDays:
                    print("Existing .csv file for day {} contains fewer rows ({}) than the previous day's file ({}), redoing".format(day, existingRows, yesterDayRows))
                    redo = True
                else:
                    print("On day {} the number of rows shrank to {} compared to yesterday's file ({}), which is allowed for this day".format(day, existingRows, yesterDayRows))
            else:
                if existingRows == yesterDayRows:
                    if day not in allowedSameDays:
                        print("Existing .csv file for day {} contains the same number of rows ({}) as the previous day's file ({}), redoing".format(day, existingRows, yesterDayRows))
                        redo = True
                    else:
                        print("Existing .csv file for day {} contains the same number of rows ({}) as the previous day's file ({}), but we can't do anything about it".format(
                                day, existingRows, yesterDayRows))
                elif (existingRows > yesterDayRows * 1.1) and (existingRows - yesterDayRows > 5000) and day not in allowedJumpDays:
                    print("Existing .csv file for day {} contains many more rows ({}) than the previous day's file ({}), redoing".format(day, existingRows, yesterDayRows))
                    redo = True

                print("Existing .csv file contains {} rows, {} more than yesterday".format(existingRows, existingRows - yesterDayRows))

    if force or redo or not os.path.isfile(newFile):
        print("Loading " + compressedJSONFile)
        #with bz2.open(compressedJSONFile, "rb") as f:
        with lzma.open(compressedJSONFile, "rb") as f:
            content = ndjson.load(f)
            frame = dt.Frame(content)
            if frame.nrows <= yesterDayRows and day not in allowedShrinkageDays:
                print("Rejecting '{}' because it does not contain more rows than yesterday's file".format(compressedJSONFile))
                return
            print("Saving " + newFile)
            frame.to_csv(newFile)
    else:
        print("Skipping '{}' because '{}' already exists".format(compressedJSONFile, newFile))
Example #2
def main():
    parser = argparse.ArgumentParser(description='Download RKI/NPGEO data and save as json-dump and .csv')
    parser.add_argument('-j', '--json-dump-dir', dest='dumpDir', default="dumps")
    parser.add_argument('-c', '--csv-dir', dest='csvDir', default="archive_csv")
    parser.add_argument("-f","--force", dest='force', help="download even if data for day already exist", action="store_true")
    parser.add_argument("-R","--resume", dest='resume', help="download even if data for day already exist (API download only). Barely tested)", action="store_true")
    parser.add_argument('-r','--retry', dest='maxRetries',type=int, default=10, help='Number of retries before giving up on a single request; each retry waits 3 second longer')
    parser.add_argument("-F","--fetchcsv", dest='fetchcsv', help="fall back to directly download as .csv file, not using the api", action="store_true")
    args = parser.parse_args()
    print(args)

    if args.fetchcsv:
        datenStand = retrieveLatestCsvDate(args)
        datenStandDay = cd.dayFromDate(datenStand)
    else:
        datenStand = getRecordVersionOnServer()
        datenStandDay = cd.dayFromDatenstand(datenStand)

    afn = archiveFilename(datenStandDay, args.dumpDir)
    cfn = csvFilename(datenStandDay, "fullDaily", args.csvDir)

    if not os.path.isfile(afn) and not os.path.isfile(cfn):
        print("New data available, Stand: {} Tag: {}, downloading...".format(datenStand, datenStandDay))
    else:
        print("Data already locally exists, Stand: {} Tag: {}".format(datenStand, datenStandDay))
        if args.force:
            print("Forcing Download because '--force' is set")

    if (not os.path.isfile(afn) and not os.path.isfile(cfn)) or args.force:
        dfn = "dumps/dump-rki-" + time.strftime("%Y%m%d-%H%M%S") + "-Stand-" + cd.dateStrYMDFromDay(
            datenStandDay) + ".json"

        if not args.fetchcsv:
            allRecords = retrieveAllRecords(args)
            if allRecords is not None:
                pmu.saveJson(dfn, allRecords)
                if not os.path.isfile(afn) or args.force:
                    pmu.saveJson(afn, allRecords)
                if not os.path.isfile(cfn) or args.force:
                    pmu.saveCsv(cfn, allRecords)
                exit(0)
            else:
                print("failed to retrieve data")
                exit(1)
        else:
            # download the .csv
            if downloadCsv(args, cfn):
                print("Successfully downloaded .csv")
                dataDict = pmu.loadCsv(cfn)
                pmu.saveJson(afn, dataDict)

    exit(9)
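# Entry-point sketch (the script name is hypothetical; the flags match the
# argparse definitions above):
#   python fetch_npgeo.py                   # API download into dumps/ and archive_csv/
#   python fetch_npgeo.py --force           # re-download even if today's files exist
#   python fetch_npgeo.py --fetchcsv -r 20  # use the .csv fallback with up to 20 retries
if __name__ == "__main__":
    main()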
Example #3
def convert(compressedJSONFile, destDir=".", force=False):
    path = os.path.normpath(compressedJSONFile)
    fileName = path.split(os.sep)[-1]
    date = datetimeFromARDFilename(fileName)
    day = cd.dayFromDate(date)
    newFile = destDir + "/NPGEO-RKI-{}.csv".format(cd.dateStrYMDFromDay(day))

    if force or not os.path.isfile(newFile):
        print("Loading " + compressedJSONFile)
        #with bz2.open(compressedJSONFile, "rb") as f:
        with lzma.open(compressedJSONFile, "rb") as f:
            content = ndjson.load(f)
            frame = dt.Frame(content)
            print("Saving " + newFile)
            frame.to_csv(newFile)
    else:
        print("Skipping '{}' because '{}' already exists".format(
            compressedJSONFile, newFile))
Example #4
def isNewData(dataFilename, daysIncluded):
    pmu.printMemoryUsage("begin of isNewData")
    peekTable = dt.fread(dataFilename, max_nrows=1)
    print("Checking " + dataFilename)
    ##print(peekTable)
    ##datenStand = peekTable[0, dt.f.DatenstandISO]
    dss = peekTable[0, "Datenstand"]
    print("Datenstand", dss)
    ds = cd.datetimeFromDatenstandAny(dss)
    dsdy = cd.dayFromDate(ds)
    pmu.printMemoryUsage("isNewData")
    isNew = dsdy not in daysIncluded
    if isNew:
        print("contains new day {}".format(dsdy))
    else:
        print("contains day {} already in full table".format(dsdy))
    pmu.printMemoryUsage("end of isNewData")

    return isNew
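# Usage sketch (hypothetical path; daysIncluded would typically hold the
# DatenstandTag day numbers already present in the accumulated full table):
daysIncluded = {120, 121, 122}  # illustrative day numbers only
if isNewData("archive_csv/NPGEO-RKI-2020-05-03.csv", daysIncluded):
    print("new day, appending to full table")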
Example #5
def unify(table):
    dss = table[0, "Datenstand"]
    ds = cd.datetimeFromDatenstandAny(dss)

    dsdy = cd.dayFromDate(ds)
    hasRefdatum = "Refdatum" in table.names
    hasErkrankungsbeginn = "IstErkrankungsbeginn" in table.names
    #t = table.copy()
    t = table
    if "Altersgruppe2" in table.names:
        t = t[:, dt.f[:].remove(dt.f["Altersgruppe2"])]
    if not "DatenstandISO" in table.names:
        isodate = cd.dateStrYMDFromDay(dsdy)
        t = t[:, dt.f[:].extend({"DatenstandISO": isodate})]
    if not hasRefdatum:
        t = t[:, dt.f[:].extend({
            "Refdatum": str(cd.day0d),
            "RefdatumISO": dt.f.MeldedatumISO
        })]
    if not hasErkrankungsbeginn:
        t = t[:, dt.f[:].extend({"IstErkrankungsbeginn": 0})]

    if "NeuGenesen" not in table.names:
        t = t[:, dt.f[:].extend({"NeuGenesen": -9, "AnzahlGenesen": 0})]

    t = t[:, dt.f[:].extend({
        "FallGruppe": "",
        "MeldeTag": nan,
        "RefTag": nan,
        "DatenstandTag": dsdy
    })]

    #t = t[:, dt.f[:].extend({"Bevoelkerung":0, "FaellePro100k":0.0, "TodesfaellePro100k":0.0, "isStadt":False})]
    #t = t[:, dt.f[:].extend({"Flaeche":0.0, "FaelleProKm2":0.0, "TodesfaelleProKm2":0.0, "Dichte":0.0})]

    #print("unified fields", t.names)

    #Bevoelkerung = loadLandkreisBeveolkerung()
    #Flaeche = loadLandkreisFlaeche()
    #Census = loadCensus()

    #pmu.printMemoryUsage("unify pre realize ")
    #t.materialize(to_memory=True)

    pmu.printMemoryUsage("unify pre dict")
    d = t.to_dict()
    pmu.printMemoryUsage("unify post dict")

    print("> iterating through {} rows".format(t.nrows))
    start = time.perf_counter()
    for r in range(t.nrows):
        mds = d["Meldedatum"][r]
        if pmu.is_int(mds):
            md = cd.datetimeFromStampStr(mds)
        else:
            md = datetimeFromDateStr3(mds)
        mdy = cd.dayFromDate(md)
        d["MeldeTag"][r] = mdy
        if not hasRefdatum:
            d["Refdatum"][r] = str(md)
            d["RefTag"][r] = mdy

        fg = str(d["IdLandkreis"]
                 [r]) + d["Altersgruppe"][r] + d["Geschlecht"][r] + str(
                     int(d["MeldeTag"][r]))

        if int(d["IstErkrankungsbeginn"][r]) == 1:
            rds = d["Refdatum"][r]
            if pmu.is_int(rds):
                rd = cd.datetimeFromStampStr(rds)
            else:
                rd = datetimeFromDateStr3(rds)
            rdy = cd.dayFromDate(rd)
            d["RefTag"][r] = rdy
            fg = fg + ":" + str(rdy)
        d["FallGruppe"][r] = fg
        checkLandkreisData(d, r, Census, Flaeche)

    finish = time.perf_counter()

    print("< iterating through {} rows done, {:.1f} rows/sec".format(
        t.nrows, t.nrows / (finish - start)))

    pmu.printMemoryUsage("end of unify, pre frame")
    t = dt.Frame(d)
    pmu.printMemoryUsage("end of unify, post frame")
    return t
Example #6
def unify(table, makeFallGruppe=False):
    dss = table[0, "Datenstand"]
    ds = cd.datetimeFromDatenstandAny(dss)

    if 'FID' in table.names:
        table.names = {"FID": "ObjectId"}

    dsdy = cd.dayFromDate(ds)
    dsisodate = cd.dateStrYMDFromDay(dsdy)
    hasRefdatum = "Refdatum" in table.names
    hasErkrankungsbeginn = "IstErkrankungsbeginn" in table.names
    #t = table.copy()
    t = table
    if "Altersgruppe2" in table.names:
        t = t[:, dt.f[:].remove(dt.f["Altersgruppe2"])]
    if not "DatenstandISO" in table.names:
        t = t[:, dt.f[:].extend({"DatenstandISO": dsisodate})]
    if not hasRefdatum:
        t = t[:, dt.f[:].extend({"Refdatum": 0})]

    hasRefdatumISO = "RefdatumISO" in table.names
    if not hasRefdatumISO:
        #print("t1",t.names)
        t = t[:, dt.f[:].extend({"RefdatumISO": ""})]
        #print("t2",t.names)

    hasMeldedatumISO = "MeldedatumISO" in table.names
    if not hasMeldedatumISO:
        t = t[:, dt.f[:].extend({"MeldedatumISO": ""})]

    if not hasErkrankungsbeginn:
        t = t[:, dt.f[:].extend({"IstErkrankungsbeginn": 0})]

    if "NeuGenesen" not in table.names:
        t = t[:, dt.f[:].extend({"NeuGenesen": -9, "AnzahlGenesen": 0})]

    if makeFallGruppe:
        t = t[:, dt.f[:].extend({
            "FallGruppe": "",
            "MeldeTag": nan,
            "RefTag": nan,
            "DatenstandTag": dsdy
        })]
    else:
        # without FallGruppe, still add the derived day columns (the original
        # extended them unconditionally, which duplicated them in the
        # makeFallGruppe case)
        t = t[:, dt.f[:].extend({
            "MeldeTag": nan,
            "RefTag": nan,
            "DatenstandTag": dsdy
        })]

    pmu.printMemoryUsage("unify pre dict")
    d = t.to_dict()
    pmu.printMemoryUsage("unify post dict")

    print("> iterating through {} rows".format(t.nrows))
    start = time.perf_counter()
    for r in range(t.nrows):
        mds = d["Meldedatum"][r]
        if pmu.is_int(mds):
            md = cd.datetimeFromStampStr(mds)
        else:
            md = datetimeFromDateStr3(mds)
            d["Meldedatum"][r] = ticksFromDateTime(md)

        mdy = cd.dayFromDate(md)
        d["MeldeTag"][r] = mdy
        if not hasRefdatum:
            d["Refdatum"][r] = ticksFromDateTime(md)
            d["RefTag"][r] = mdy
        if not hasMeldedatumISO:
            d["MeldedatumISO"][r] = cd.dateStrYMDFromDay(mdy)

        if makeFallGruppe:
            fg = (str(d["IdLandkreis"][r]) + d["Altersgruppe"][r] +
                  d["Geschlecht"][r] + str(int(d["MeldeTag"][r])))

        #if int(d["IstErkrankungsbeginn"][r]) == 1:
        rds = d["Refdatum"][r]
        if pmu.is_int(rds):
            rd = cd.datetimeFromStampStr(rds)
        else:
            rd = datetimeFromDateStr3(rds)
            d["Refdatum"][r] = ticksFromDateTime(rd)
        rdy = cd.dayFromDate(rd)
        d["RefTag"][r] = rdy
        if not hasRefdatumISO:
            d["RefdatumISO"][r] = cd.dateStrYMDFromDay(rdy)
        if makeFallGruppe:
            fg = fg + ":" + str(rdy)
            d["FallGruppe"][r] = fg
        checkLandkreisData(d, r, Census, Flaeche)

    finish = time.perf_counter()

    print("< iterating through {} rows done, {:.1f} rows/sec".format(
        t.nrows, t.nrows / (finish - start)))

    pmu.printMemoryUsage("end of unify, pre frame")
    t = dt.Frame(d)
    pmu.printMemoryUsage("end of unify, post frame")
    return t
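# Usage sketch (hypothetical path; assumes the module-level Census and
# Flaeche lookups that checkLandkreisData() relies on have been loaded):
table = dt.fread("archive_csv/NPGEO-RKI-2020-05-03.csv")
unified = unify(table, makeFallGruppe=True)
unified.to_csv("archive_csv/NPGEO-RKI-2020-05-03-unified.csv")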