def test_first_2d_dt():
    df_in = dt.Frame([[9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1],
                      [0, 1, 0, 5, 3, 8, 1, 0, 2, 5, 8, None, 1]])
    df_reduce = df_in[:, [first(f.C0), first(f.C1)], "C0"]
    assert_equals(df_reduce,
                  dt.Frame([[None, 0, 1, 2, 3, 5, 8, 9],
                            [None, 0, 1, 2, 3, 5, 8, 9],
                            [3, 0, 1, 0, 5, 2, 1, 0]],
                           names=["C0", "C1", "C2"]))
def test_first():
    assert str(dt.first(f.A)) == str(f.A.first())
    assert str(dt.first(f[:])) == str(f[:].first())
    DT = dt.Frame({'A': ['1', '1', '2', '1', '2'],
                   'B': [None, '2', '3', '4', '5'],
                   'C': [1, 2, 1, 1, 2]})
    assert_equals(DT[:, f.A.first()], DT[:, dt.first(f.A)])
    assert_equals(DT[:, f[:].first()], DT[:, dt.first(f[:])])
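# A minimal usage sketch (not part of the test suite; frame and column names
# are illustrative) of the two equivalent spellings that test_first() above
# verifies: the dt.first(...) function form and the .first() method form on
# an f-expression.
import datatable as dt
from datatable import f

DT = dt.Frame(A=["1", "1", "2"], C=[1, 2, 1])
print(DT[:, dt.first(f.A)])   # 1x1 frame holding "1"
print(DT[:, f.A.first()])     # same result, method form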
def test_first_2d_dt():
    df_in = dt.Frame([[9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1],
                      [0, 1, 0, 5, 3, 8, 1, 0, 2, 5, 8, None, 1]])
    df_reduce = df_in[:, [first(f.C0), first(f.C1)], "C0"]
    frame_integrity_check(df_reduce)
    assert df_reduce.shape == (8, 3)
    assert df_reduce.ltypes == (ltype.int, ltype.int, ltype.int)
    assert df_reduce.to_list() == [[None, 0, 1, 2, 3, 5, 8, 9],
                                   [None, 0, 1, 2, 3, 5, 8, 9],
                                   [3, 0, 1, 0, 5, 2, 1, 0]]
def test_first_dt_range():
    df_in = dt.Frame(A=range(10))[3::3, :]
    df_reduce = df_in[:, first(f.A)]
    frame_integrity_check(df_reduce)
    assert df_reduce.shape == (1, 1)
    assert df_reduce.ltypes == (ltype.int,)
    assert df_reduce.to_list() == [[3]]
def test_first_dt():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0)]
    frame_integrity_check(df_reduce)
    assert df_reduce.shape == (1, 1)
    assert df_reduce.ltypes == (ltype.int,)
    assert df_reduce.to_list() == [[9]]
def test_first_dt_groupby():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0), "C0"]
    assert_equals(
        df_reduce,
        dt.Frame([[None, 0, 1, 2, 3, 5, 8, 9],
                  [None, 0, 1, 2, 3, 5, 8, 9]],
                 names=["C0", "C1"]))
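# A minimal usage sketch (not part of the test suite; names are illustrative)
# of what the group-by tests above exercise: combined with dt.by(), first()
# returns the first value of each group.
import datatable as dt
from datatable import f, first

DT = dt.Frame(key=["a", "a", "b", "b"], val=[10, 20, 30, 40])
# One row per key, carrying the first observed val of that group.
print(DT[:, first(f.val), dt.by(f.key)])   # a -> 10, b -> 30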
def timeSeries(fullTable, fromDay, toDay, byCriteria, nameColumn, Altersgruppen, Geschlechter):
    regions = fullTable[:, [dt.first(nameColumn)], dt.by(byCriteria)]
    #regions = regions[:5,:]
    print("Creating time series for regions:")
    print(regions)
    dailysByCriteria = {}
    start = time.perf_counter()
    for i, lk in enumerate(regions[:, byCriteria].to_list()[0]):
        print("Processing Region '{}'".format(regions[i, nameColumn][0, 0]))
        start_region = time.perf_counter()
        pmu.printMemoryUsage("pre analyzeDailyAltersgruppenGeschlechter")
        dailysByCriteria[lk] = analyzeDailyAltersgruppenGeschlechter(
            fullTable,
            filterByDayAndCriteria(fromDay, toDay, (byCriteria == lk)),
            Altersgruppen, Geschlechter)
        finish = time.perf_counter()
        duration = finish - start
        print("Region took {:.2f} seconds, elapsed {:.2f} minutes, time to completion: {:.2f} minutes"
              .format(finish - start_region, duration / 60,
                      duration / (i + 1) * (regions.nrows - i) / 60))
        pmu.printMemoryUsage("post analyzeDailyAltersgruppenGeschlechter")
        print("Done {} of {}, key = {} name = {}".format(
            i + 1, regions.nrows, lk, regions[i, nameColumn][0, 0]))
        #if lk >= 0:
        #    break
    return regions, dailysByCriteria
def test_first_dt():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0)]
    df_reduce.internal.check()
    assert df_reduce.shape == (1, 1)
    assert df_reduce.ltypes == (ltype.int,)
    assert df_reduce.topython() == [[9]]
def test_first_dt_range():
    df_in = dt.Frame(A=range(10))[3::3, :]
    df_reduce = df_in[:, first(f.A)]
    df_reduce.internal.check()
    assert df_reduce.shape == (1, 1)
    assert df_reduce.ltypes == (ltype.int,)
    assert df_reduce.topython() == [[3]]
def test_first_dt_groupby():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0), "C0"]
    frame_integrity_check(df_reduce)
    assert df_reduce.shape == (8, 2)
    assert df_reduce.ltypes == (ltype.int, ltype.int)
    assert df_reduce.to_list() == [[None, 0, 1, 2, 3, 5, 8, 9],
                                   [None, 0, 1, 2, 3, 5, 8, 9]]
def test_first_dt_integer_large(numpy):
    n = 12345678
    a_in = numpy.random.randint(2**20, size=n, dtype=numpy.int32)
    df_in = dt.Frame(a_in)
    df_reduce = df_in[:, first(f.C0)]
    assert df_reduce.shape == (1, 1)
    assert df_reduce.ltypes == (ltype.int,)
    assert df_reduce.topython() == [[a_in[0]]]
def test_first_dt_groupby():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0), "C0"]
    df_reduce.internal.check()
    assert df_reduce.shape == (8, 2)
    assert df_reduce.ltypes == (ltype.int, ltype.int)
    assert df_reduce.topython() == [[None, 0, 1, 2, 3, 5, 8, 9],
                                    [None, 0, 1, 2, 3, 5, 8, 9]]
def test_first_array():
    assert first([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]) == 9
    assert first((3.5, 17.9, -4.4)) == 3.5
    assert first([]) is None
# collapse release
statusMsg('Collapsing emx-release....')
subjects['associatedRD3Releases'] = dt.Frame([
    flattenValueArray(
        array=subjects[f.subjectID == d, f.release][f.release != None, :].to_list()[0]
    )
    for d in subjects[:, f.subjectID].to_list()[0]
])

# DISTINCT RECORDS ONLY
# Since all information has been flattened and repeated by subject, it is
# possible to select only the distinct records (a toy sketch of this pattern
# follows after this snippet).
statusMsg('Complete! Selecting distinct records only....')
subjects = subjects[:, first(f[:]), dt.by(f.subjectID)]

#//////////////////////////////////////////////////////////////////////////////

# ~ 2 ~
# RESHAPE SAMPLES
# Sample metadata needs to be processed a bit differently than subject
# metadata. The idea is to list all samples horizontally by subject, i.e.
# for each subject there is a column for the samples released in DF1, DF2,
# DF3, and so on. It was done this way so that references to other tables
# can be made.
statusMsg('Summarizing sample metadata....')

# recode subjectID --- extract subject ID only (i.e., remove '_original', etc.)
samples.names = {'subject': 'subjectID'}
samples['subjectID'] = dt.Frame([
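# A minimal sketch (toy data, illustrative column names, not from the script
# above) of the "DISTINCT RECORDS ONLY" step: once values have been flattened
# and repeated per subject, grouping by subjectID and taking first(f[:])
# keeps one row per subject.
import datatable as dt
from datatable import f, first

subjects_demo = dt.Frame(subjectID=["s1", "s1", "s2"],
                         release=["DF1,DF2", "DF1,DF2", "DF1"])
deduped = subjects_demo[:, first(f[:]), dt.by(f.subjectID)]
print(deduped)   # one row for s1, one row for s2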
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(
        description='Create a unified data file from daily dumps')
    parser.add_argument('files', metavar='fileName', type=str, nargs='+',
                        help='.NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    #parser.add_argument("--flushmemfull", help="flush full table to disk for lower memory footprint",
    #                    action="store_true")
    parser.add_argument(
        "--materializeNew",
        help="materialize new table to disk for lower memory footprint",
        action="store_true")
    parser.add_argument(
        "--noMaterialize",
        help="run with higher memory footprint, or much higher memory footprint with --in-memory",
        action="store_true")
    parser.add_argument("--inMemory",
                        help="run faster but with higher memory footprint",
                        action="store_true")
    parser.add_argument(
        "--checkpoint",
        help="write checkpoint after amount of minutes elapsed",
        default=10)
    parser.add_argument(
        "--nthreads",
        help="number of concurrent threads used by python dataframes, "
             "0 = as many as cores, 1 = single-thread, -3 = 3 threads less than cores",
        default=0)
    args = parser.parse_args()
    print(args)
    print("args.inMemory", args.inMemory)
    print("args.materializeNew", args.materializeNew)
    print("args.noMaterialize", args.noMaterialize)

    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)

    fullTable = None
    jayPath = args.outputDir + "/all-data.jay"
    print(jayPath)
    pmu.printMemoryUsage("after start")
    daysIncluded = []
    if os.path.isfile(jayPath):
        print("Loading " + jayPath)
        fullTable = dt.fread(jayPath)
        pmu.printMemoryUsage("after load")
        daysIncluded = sorted(
            fullTable[:, [dt.first(dt.f.DatenstandTag)],
                      dt.by(dt.f.DatenstandTag)].to_list()[0])
        print("Days in full table:")
        print(daysIncluded)
        pmu.printMemoryUsage("after first query")

    addedData = False
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if isNewData(f, daysIncluded):
                addedData = True
                fstart = time.perf_counter()
                pmu.printMemoryUsage("after isNewData query")
                t = tableData(f)
                pmu.printMemoryUsage("after tabledata query")
                print("Hashing " + f)
                newTable = unify(t)
                pmu.printMemoryUsage("after hashing")
                save(newTable, f, args.outputDir)
                pmu.printMemoryUsage("after newTable save")
                if fullTable is None:
                    fullTable = newTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, newTable.names)
                    pmu.printMemoryUsage("after checkColumns")
                    if not args.noMaterialize:
                        fullTable.materialize(to_memory=args.inMemory)
                        pmu.printMemoryUsage("after materialize fullTable")
                    if args.materializeNew:
                        newTable.materialize(to_memory=args.inMemory)
                        pmu.printMemoryUsage("after materialize newTable")
                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(newTable)  # memory gets used here
                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("newTable rows = {}".format(newTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print("-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
                    secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(args.checkpoint) * 60:
                    #checkname = args.outputDir+"/"+"all-data.check.jay"
                    #print("Saving checkpoint: " + checkname)
                    #pmu.saveJayTable(fullTable,"all-data.check.jay",args.outputDir)
                    pmu.saveCsvTable(fullTable, "all-data.check.csv", args.outputDir)
                    fullTable = None
                    #fullTable = dt.fread(args.outputDir+"/all-data.check.csv")
                    fullTable = dt.fread(args.outputDir + "/all-data.check.jay")
                    #fullTable.to_jay(checkname)
                    #print("Saving done:" + checkname)
                    lastCheckPointTime = time.perf_counter()

    if addedData:
        pmu.printMemoryUsage("before full save")
        pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving 'all-data.jay'")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("--> Wall time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))
def loadAndProcessData(dataFilename):
    print("Loading " + dataFilename)
    fullTable = dt.fread(dataFilename)
    print("Done loading table from '" + dataFilename + "', keys:")
    print(fullTable.keys())

    cases = fullTable[:, 'AnzahlFall'].sum()[0, 0]
    dead = fullTable[:, 'AnzahlTodesfall'].sum()[0, 0]
    lastDay = fullTable[:, 'MeldeDay'].max()[0, 0]
    lastnewCaseOnDay = fullTable[:, 'newCaseOnDay'].max()[0, 0]
    print("File stats: lastDay {} lastnewCaseOnDay {} cases {} dead {}".format(
        lastDay, lastnewCaseOnDay, cases, dead))

    newTable = fullTable[:, dt.f[:].extend({"erkMeldeDelay": dt.f.MeldeDay - dt.f.RefDay})]
    #print(newTable.keys())
    #dt.by(dt.f.Bundesland)]

    # aggregate per Landkreis (a toy sketch of this pattern follows after this function)
    alldays = fullTable[:, [
        dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k),
        dt.mean(dt.f.Bevoelkerung),
        dt.max(dt.f.MeldeDay),
        dt.first(dt.f.LandkreisTyp),
        dt.first(dt.f.Bundesland)
    ], dt.by(dt.f.Landkreis)]

    last7days = fullTable[dt.f.newCaseOnDay > lastDay - 7, :][:, [
        dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k)
    ], dt.by(dt.f.Landkreis)]
    last7days.names = [
        "Landkreis", "AnzahlFallLetzte7Tage", "FaellePro100kLetzte7Tage",
        "AnzahlTodesfallLetzte7Tage", "TodesfaellePro100kLetzte7Tage"
    ]
    last7days[dt.f.AnzahlFallLetzte7Tage < 0, "AnzahlFallLetzte7Tage"] = 0
    last7days[dt.f.FaellePro100kLetzte7Tage < 0, "FaellePro100kLetzte7Tage"] = 0
    last7days[dt.f.AnzahlTodesfallLetzte7Tage < 0, "AnzahlTodesfallLetzte7Tage"] = 0
    last7days[dt.f.TodesfaellePro100kLetzte7Tage < 0, "TodesfaellePro100kLetzte7Tage"] = 0

    lastWeek7days = fullTable[(dt.f.newCaseOnDay > lastDay - 14) &
                              (dt.f.newCaseOnDay <= lastDay - 7), :][:, [
        dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k)
    ], dt.by(dt.f.Landkreis)]
    #lastWeek7days[dt.f[1:] < 0, dt.f[1:]] = 0
    lastWeek7days.names = [
        "Landkreis", "AnzahlFallLetzte7TageDavor", "FaellePro100kLetzte7TageDavor",
        "AnzahlTodesfallLetzte7TageDavor", "TodesfaellePro100kLetzte7TageDavor"
    ]
    lastWeek7days[dt.f.AnzahlFallLetzte7TageDavor < 0, "AnzahlFallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.FaellePro100kLetzte7TageDavor < 0, "FaellePro100kLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.AnzahlTodesfallLetzte7TageDavor < 0, "AnzahlTodesfallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.TodesfaellePro100kLetzte7TageDavor < 0, "TodesfaellePro100kLetzte7TageDavor"] = 0

    allDaysExt0 = merge(alldays, last7days, "Landkreis")
    allDaysExt1 = merge(allDaysExt0, lastWeek7days, "Landkreis")

    Rw = dt.f.AnzahlFallLetzte7Tage / dt.f.AnzahlFallLetzte7TageDavor
    allDaysExt2 = allDaysExt1[:, dt.f[:].extend({"AnzahlFallTrend": Rw})]
    allDaysExt3 = allDaysExt2[:, dt.f[:].extend({
        "FaellePro100kTrend":
            dt.f.FaellePro100kLetzte7Tage - dt.f.FaellePro100kLetzte7TageDavor
    })]
    allDaysExt4 = allDaysExt3[:, dt.f[:].extend({
        "TodesfaellePro100kTrend":
            dt.f.TodesfaellePro100kLetzte7Tage - dt.f.TodesfaellePro100kLetzte7TageDavor
    })]
    allDaysExt5 = allDaysExt4[:, dt.f[:].extend({
        "Kontaktrisiko":
            dt.f.Bevoelkerung / 6.25 /
            ((dt.f.AnzahlFallLetzte7Tage + dt.f.AnzahlFallLetzte7TageDavor) * Rw)
    })]
    allDaysExt6 = allDaysExt5[:, dt.f[:].extend({"LetzteMeldung": lastDay - dt.f.MeldeDay})]

    # Kontaktrisiko values satisfying x * 2 == x (zero, or infinity from a
    # division by zero) are replaced with the sentinel 999999.
    allDaysExt6[dt.f.Kontaktrisiko * 2 == dt.f.Kontaktrisiko, "Kontaktrisiko"] = 999999

    sortedByRisk = allDaysExt6.sort(["Kontaktrisiko", "LetzteMeldung", "FaellePro100k"])
    #print(sortedByRisk)
    allDaysExt = sortedByRisk[:, dt.f[:].extend({"Rang": 0})]
    allDaysExt[:, "Rang"] = np.arange(1, allDaysExt.nrows + 1)
    #print(allDaysExt)
    print("Column names frame order:", list(enumerate(allDaysExt.names)))
    data = allDaysExt.to_pandas()
    return data
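# A minimal sketch (toy frame with a small subset of the real columns) of the
# aggregation pattern used in loadAndProcessData() above: several reducers,
# including dt.first() for columns that are constant within a group,
# evaluated in a single dt.by() pass.
import datatable as dt

toy = dt.Frame(Landkreis=["A", "A", "B"],
               AnzahlFall=[3, 5, 2],
               Bundesland=["X", "X", "Y"])
agg = toy[:, [dt.sum(dt.f.AnzahlFall), dt.first(dt.f.Bundesland)],
          dt.by(dt.f.Landkreis)]
print(agg)   # A -> 8, "X"; B -> 2, "Y"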
def test_first_dt():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0)]
    assert_equals(df_reduce, dt.Frame(C0=[9]))
def test_first_dt_range():
    df_in = dt.Frame(A=range(10))[3::3, :]
    df_reduce = df_in[:, first(f.A)]
    assert_equals(df_reduce, dt.Frame(A=[3]))
def test_first_empty_frame():
    DT = dt.Frame(A=[], stype=dt.float32)
    RZ = DT[:, first(f.A)]
    assert_equals(RZ, dt.Frame(A=[None], stype=dt.float32))
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(
        description='Create a unified data file from daily dumps')
    parser.add_argument('files', metavar='fileName', type=str, nargs='+',
                        help='.NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    parser.add_argument('-t', '--temp-dir', dest='tempDir', default=".")
    parser.add_argument(
        "--flushread",
        help="flush full table and re-read it after a checkpoint for a lower memory footprint",
        action="store_true")
    parser.add_argument("--force", help="build new database anyway",
                        action="store_true")
    parser.add_argument(
        "--destructivesave",
        help="release memory gradually while saving and reload after saving",
        action="store_true")
    parser.add_argument("-v", "--verbose", help="make more noise",
                        action="store_true")
    parser.add_argument("--partitionsize", type=int,
                        help="number of records per partition",
                        default=10000000)
    parser.add_argument("--memorylimit", type=int,
                        help="maximum memory limit for a database file")
    parser.add_argument(
        "--checkpoint", type=int,
        help="write checkpoint after amount of minutes elapsed",
        default=10)
    parser.add_argument(
        "--nthreads", type=int,
        help="number of concurrent threads used by python dataframes, "
             "0 = as many as cores, 1 = single-thread, -3 = 3 threads less than cores",
        default=0)
    args = parser.parse_args()
    print(args)
    # print("args.inMemory", args.inMemory)
    # print("args.materializeNew", args.materializeNew)
    # print("args.noMaterialize", args.noMaterialize)

    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)

    fullTable = None
    jayFile = "all-data.jay"
    jayPath = os.path.join(args.outputDir, jayFile)
    print(jayPath)
    pmu.printMemoryUsage("after start")
    partitioned = False
    daysIncluded = []
    if len(pmu.getJayTablePartitions(jayPath)) > 0:
        fullTable = pmu.loadJayTablePartioned(jayPath,
                                              tempDir=args.tempDir,
                                              memoryLimit=args.memorylimit,
                                              verbose=args.verbose)
        if fullTable is None:
            print("The file {} is not a valid jay file, please remove it and retry"
                  .format(jayPath))
            exit(1)
        partitioned = True
    elif os.path.isfile(jayPath):
        print("Loading " + jayPath)
        fullTable = dt.fread(jayPath)

    if fullTable is not None:
        pmu.printMemoryUsage("after load")
        daysIncluded = sorted(
            fullTable[:, [dt.first(dt.f.DatenstandTag)],
                      dt.by(dt.f.DatenstandTag)].to_list()[0])
        print("Days in full table:")
        print(daysIncluded)
        pmu.printMemoryUsage("after first query")

    addedData = False
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if isNewData(f, daysIncluded):
                addedData = True
                fstart = time.perf_counter()
                pmu.printMemoryUsage("after isNewData query")
                t = tableData(f)
                pmu.printMemoryUsage("after tabledata query")
                print("Hashing " + f)
                newTable = unify(t)
                pmu.printMemoryUsage("after hashing")
                save(newTable, f, args.outputDir)
                pmu.printMemoryUsage("after newTable save")
                if fullTable is None:
                    fullTable = newTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, newTable.names)
                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(newTable)  # memory gets used here
                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("newTable rows = {}".format(newTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print("-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
                    secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(args.checkpoint) * 60:
                    print("Saving checkpoint @ {}".format(datetime.now()))
                    pmu.saveJayTablePartioned(fullTable, jayFile, args.outputDir,
                                              args.partitionsize, True,
                                              args.destructivesave)
                    if args.flushread or args.destructivesave:
                        print("Re-reading checkpoint @ {}".format(datetime.now()))
                        fullTable = None
                        fullTable = pmu.loadJayTablePartioned(
                            jayPath, tempDir=args.tempDir,
                            memoryLimit=args.memorylimit,
                            verbose=args.verbose)
                    lastCheckPointTime = time.perf_counter()
                    print("Checkpoint done @ {}".format(datetime.now()))

    if addedData or not partitioned:
        pmu.printMemoryUsage("before full save")
        #pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir)
        pmu.saveJayTablePartioned(fullTable, "all-data.jay", args.outputDir,
                                  args.partitionsize, True, args.destructivesave)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving 'all-data.jay'")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("--> Wall time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))
def test_first_dt_integer_large(numpy):
    n = 12345678
    a_in = numpy.random.randint(2**20, size=n, dtype=numpy.int32)
    df_in = dt.Frame(a_in)
    df_reduce = df_in[:, first(f.C0)]
    assert_equals(df_reduce, dt.Frame(C0=[a_in[0]]))
def test_first_array():
    a_in = [9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]
    a_reduce = first(a_in)
    assert a_reduce == 9
def test_first_2d_array():
    a_in = [[9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1],
            [0, 1, 0, 5, 3, 8, 1, 0, 2, 5, 8, None, 1]]
    a_reduce = first(a_in)
    assert a_reduce == [9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(
        description='Create a unified data file from daily dumps')
    parser.add_argument('files', metavar='fileName', type=str, nargs='+',
                        help='.NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    parser.add_argument('-t', '--temp-dir', dest='tempDir', default=".")
    parser.add_argument(
        "--flushread",
        help="flush full table and re-read it after a checkpoint for a lower memory footprint",
        action="store_true")
    parser.add_argument(
        "--partition",
        help="save data in partitions instead of one file; slower, but you can see "
             "progress and it may need less memory (your mileage may vary)",
        action="store_true")
    parser.add_argument("--backup",
                        help="create backup files before overwriting",
                        action="store_true")
    parser.add_argument("--resume",
                        help="read already unified .csv files first",
                        action="store_true")
    parser.add_argument(
        "--unsafe",
        help="directly overwrite output files; will corrupt the output file if killed "
             "while writing, but uses less disk space (only applies to a single .jay "
             "file in non-partition mode)",
        action="store_true")
    parser.add_argument("--force", help="build new database anyway",
                        action="store_true")
    parser.add_argument(
        "--destructivesave",
        help="release memory gradually while saving and reload after saving "
             "(experimental, untested, only applies to partitioned write)",
        action="store_true")
    #parser.add_argument("--incremental", help="only load partial data", action="store_true")
    parser.add_argument("-v", "--verbose", help="make more noise",
                        action="store_true")
    parser.add_argument("--partitionsize", type=int,
                        help="number of records per partition",
                        default=10000000)
    parser.add_argument("--memorylimit", type=int,
                        help="maximum memory limit for a database file")
    parser.add_argument(
        "--checkpoint", type=int,
        help="write checkpoint after amount of minutes elapsed",
        default=60)
    parser.add_argument(
        "--nthreads", type=int,
        help="number of concurrent threads used by python dataframes, "
             "0 = as many as cores, 1 = single-thread, -3 = 3 threads less than cores",
        default=0)
    args = parser.parse_args()
    print(args)
    # print("args.inMemory", args.inMemory)
    # print("args.materializeNew", args.materializeNew)
    # print("args.noMaterialize", args.noMaterialize)

    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)

    fullTable = None
    jayFile = "all-data.jay"
    jayPath = os.path.join(args.outputDir, jayFile)
    print(jayPath)
    pmu.printMemoryUsage("after start")
    partitioned = False
    if not args.force:
        if os.path.isfile(jayPath):
            print("Loading " + jayPath)
            fullTable = dt.fread(jayPath, tempdir=args.tempDir,
                                 memory_limit=args.memorylimit,
                                 verbose=args.verbose)
        elif len(pmu.getJayTablePartitions(jayPath)) > 0:
            fullTable = pmu.loadJayTablePartioned(jayPath,
                                                  tempdir=args.tempDir,
                                                  memory_limit=args.memorylimit,
                                                  verbose=args.verbose)
            if fullTable is None:
                print("The file {} is not a valid jay file, please remove it and retry"
                      .format(jayPath))
                exit(1)
            partitioned = True

    daysIncluded = []
    addedData = False
    version = 1
    lastversion = 0
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if fullTable is not None and version != lastversion:
                pmu.printMemoryUsage("after load")
                daysIncluded = sorted(
                    fullTable[:, [dt.first(dt.f.DatenstandTag)],
                              dt.by(dt.f.DatenstandTag)].to_list()[0])
                print("Days in full table:")
                print(daysIncluded)
                pmu.printMemoryUsage("after first query")
                lastversion = version
            if isNewData(f, daysIncluded):
                pmu.printMemoryUsage("after isNewData query")
                fstart = time.perf_counter()
                unifiedTable = None
                if args.resume:
                    unifiedTable = load(f, args.outputDir)
                addedData = True
                version = version + 1
                if unifiedTable is None:
                    t = tableData(f)
                    pmu.printMemoryUsage("after tabledata query")
                    print("Unifying " + f)
                    unifiedTable = unify(t)
                    pmu.printMemoryUsage("after hashing")
                    save(unifiedTable, f, args.outputDir)
                    pmu.printMemoryUsage("after unifiedTable save")
                if fullTable is None:
                    fullTable = unifiedTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, unifiedTable.names)
                    #print("unifiedTable.names",unifiedTable.names)
                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(unifiedTable)  # memory gets used here
                    #print("fullTable.names",fullTable.names)
                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("unifiedTable rows = {}".format(unifiedTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print("-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
                    secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(args.checkpoint) * 60:
                    print("Saving checkpoint @ {}".format(datetime.now()))
                    if args.partition:
                        pmu.saveJayTablePartioned(fullTable, jayFile, args.outputDir,
                                                  args.partitionsize, True,
                                                  args.destructivesave)
                        if args.flushread or args.destructivesave:
                            print("Re-reading checkpoint @ {}".format(datetime.now()))
                            fullTable = None
                            fullTable = pmu.loadJayTablePartioned(
                                jayPath, tempdir=args.tempDir,
                                memory_limit=args.memorylimit,
                                verbose=args.verbose)
                    else:
                        pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir,
                                         args.backup, args.unsafe)
                    lastCheckPointTime = time.perf_counter()
                    print("Checkpoint done @ {}".format(datetime.now()))

    if addedData or (args.partition != partitioned):
        pmu.printMemoryUsage("before full save")
        if args.partition:
            pmu.saveJayTablePartioned(fullTable, "all-data.jay", args.outputDir,
                                      args.partitionsize, True, args.destructivesave)
        else:
            pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir,
                             args.backup, args.unsafe)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving.")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("Finished in {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))