Example #1
def gen_mapping_BL2LK_json():
    """
    generates a mapping table of BL_Code <-> LK_ID
    dict: key1 = BC_Code -> list of LK_IDs:
    {"SH": {"BL_Name": "Schleswig-Holstein", "LK_IDs": [["01001", "Flensburg"], ["01002", "Kiel"], ..] ..}..}
    """
    global d_ref_landkreise
    d_bundeslaender = {}
    d_landkreis_id_name_mapping = {}  # lk_id -> name
    for lk_id, lk in d_ref_landkreise.items():
        d_landkreis_id_name_mapping[lk_id] = get_lk_name_from_lk_id(lk_id)
        if lk['BL_Code'] not in d_bundeslaender:
            d_bundeslaender[lk['BL_Code']] = {
                'BL_Name': lk['BL_Name'],
                'LK_IDs': [(lk_id, lk['LK_Name'])]
            }
        else:
            d_bundeslaender[lk['BL_Code']]['LK_IDs'].append(
                (lk_id, lk['LK_Name']))

    helper.write_json('data/de-districts/mapping_bundesland_landkreis.json',
                      d_bundeslaender)
    helper.write_json('data/de-districts/mapping_landkreis_ID_name.json',
                      d_landkreis_id_name_mapping)
Example #2
def export_latest_data(d_districts_data: dict):
    d_districts_latest = helper.extract_latest_data(d_ref_landkreise,
                                                    d_districts_data)
    d_for_export_V1 = d_districts_latest
    l_for_export_V2 = []
    for lk_id, d in d_districts_latest.items():
        # V1: dict (lk_id) -> dict
        # V2: list of dicts
        d["Landkreis"] = get_lk_name_from_lk_id(lk_id)
        d["Bundesland"] = d["BL_Name"]
        del d["BL_Name"]
        # DIVI data is not returned by helper.extract_latest_data and is mostly not yet available for the latest day, so fall back to the previous day
        if 'DIVI_Intensivstationen_Covid_Prozent' in d_districts_data[lk_id][
                -1]:
            d['DIVI_Intensivstationen_Covid_Prozent'] = d_districts_data[
                lk_id][-1]['DIVI_Intensivstationen_Covid_Prozent']
            d['DIVI_Intensivstationen_Betten_belegt_Prozent'] = d_districts_data[
                lk_id][-1]['DIVI_Intensivstationen_Betten_belegt_Prozent']
        elif 'DIVI_Intensivstationen_Covid_Prozent' in d_districts_data[lk_id][
                -2]:
            d['DIVI_Intensivstationen_Covid_Prozent'] = d_districts_data[
                lk_id][-2]['DIVI_Intensivstationen_Covid_Prozent']
            d['DIVI_Intensivstationen_Betten_belegt_Prozent'] = d_districts_data[
                lk_id][-2]['DIVI_Intensivstationen_Betten_belegt_Prozent']
        d['LK_ID'] = lk_id
        l_for_export_V2.append(d)

    # Export as JSON
    helper.write_json('data/de-districts/de-districts-results.json',
                      d_for_export_V1,
                      sort_keys=True)

    helper.write_json(
        filename='data/de-districts/de-districts-results-V2.json',
        d=l_for_export_V2,
        sort_keys=True)

    # Export as CSV
    with open('data/de-districts/de-districts-results.tsv',
              mode='w',
              encoding='utf-8',
              newline='\n') as fh_csv:
        csvwriter = csv.DictWriter(
            fh_csv,
            delimiter='\t',
            extrasaction='ignore',
            fieldnames=[
                'Landkreis', 'Bundesland', 'Population', 'Cases', 'Deaths',
                'Cases_Per_Million', 'Deaths_Per_Million',
                'DIVI_Intensivstationen_Covid_Prozent',
                'DIVI_Intensivstationen_Betten_belegt_Prozent'
            ])

        csvwriter.writeheader()

        for lk_id, d in d_for_export_V1.items():
            csvwriter.writerow(d)
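
The if/elif above hard-codes a look-back of exactly two days. A sketch that generalizes the fallback to the last N entries, assuming the same per-district time-series layout (the function name and the max_lookback parameter are hypothetical):

def latest_divi_values(l_time_series: list, max_lookback: int = 2) -> dict:
    """Return the most recent DIVI percentages found in the last
    max_lookback entries, or an empty dict if none carry DIVI data."""
    for entry in reversed(l_time_series[-max_lookback:]):
        if 'DIVI_Intensivstationen_Covid_Prozent' in entry:
            return {
                'DIVI_Intensivstationen_Covid_Prozent':
                    entry['DIVI_Intensivstationen_Covid_Prozent'],
                'DIVI_Intensivstationen_Betten_belegt_Prozent':
                    entry['DIVI_Intensivstationen_Betten_belegt_Prozent'],
            }
    return {}

Used in the loop above, the two branches would collapse to d.update(latest_divi_values(d_districts_data[lk_id])).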
Example #3
def write_all_ingreds(recipe_file_name, ingred_file_name):
    """Save json of all ingreds in recipe_file_name to ingred_file_name."""
    data = helper.get_json(recipe_file_name)

    ingreds = sorted({ingred for recipe in data for ingred in recipe['ingreds']})

    helper.write_json(ingreds, ingred_file_name, 'w')
    return ingreds
Example #4
def export_latest_data(d_ref_states, d_states_data: dict):
    d_states_latest = helper.extract_latest_data(d_ref_states, d_states_data)

    with open('data/de-states/de-states-latest.tsv',
              mode='w',
              encoding='utf-8',
              newline='\n') as fh:
        csvwriter = csv.DictWriter(
            fh,
            delimiter='\t',
            extrasaction='ignore',
            fieldnames=('State', 'Code', 'Population', 'Pop Density',
                        'Date_Latest', 'Cases', 'Deaths', 'Cases_New',
                        'Deaths_New', 'Cases_Per_Million',
                        'Deaths_Per_Million',
                        'DoublingTime_Cases_Last_Week_Per_100000',
                        'Slope_Cases_Last_Week_Percent',
                        'Slope_Deaths_Last_Week_Percent',
                        'Cases_Last_Week_7Day_Percent'))
        csvwriter.writeheader()
        for code in sorted(d_states_latest.keys()):
            d = d_states_latest[code]
            d['Code'] = code
            if code == 'DE-total':  # DE as last row
                d_de = dict(d)
                continue
            csvwriter.writerow(d)
        del d, code
        # write the DE total as the last row, prefixed with '#' so that
        # downstream tools (e.g. Gnuplot) treat it as a comment
        d_de['State'] = '# Deutschland'
        csvwriter.writerow(d_de)
        del d_de

    helper.write_json('data/de-states/de-states-latest.json', d_states_latest)

    l_for_export = []
    for code in sorted(d_states_latest.keys(), key=str.casefold):
        d2 = d_states_latest[code]
        d2['Code'] = code
        l_for_export.append(d2)
    helper.write_json(filename='data/de-states/de-states-latest-list.json',
                      d=l_for_export)
Example #5
def write_recipe_data_filtered(infile, outfile):
    """Filter recipes from infile and save to outfile as json."""
    data = helper.get_json(infile)
    with open('approved_ingreds', 'r', encoding="utf8") as f:
        approved_ingreds = set(f.read().splitlines())
    ingred_filters = generate_ingred_filters(approved_ingreds)

    # Remove duplicate recipes
    df = pd.DataFrame(data)
    df_unique = df[~df['title'].duplicated()]
    data = df_unique.to_dict('records')

    for recipe in data:
        filtered_ingreds = filter_naive(recipe['ingreds'], ingred_filters)
        recipe['ingreds'] = filtered_ingreds

    helper.write_json(data, outfile, 'w')
Example #6
def export_time_series_all_countries():
    for country in d_countries_timeseries.keys():
        country_code = read_country_code(country)
        if not country_code:
            continue
        l_country_data = d_countries_timeseries[country]

        helper.write_json(f'data/int/country-{country_code}.json',
                          l_country_data)

        with open(f'data/int/country-{country_code}.tsv',
                  mode='w',
                  encoding='utf-8',
                  newline='\n') as fh:
            csvwriter = csv.DictWriter(
                fh,
                delimiter='\t',
                extrasaction='ignore',
                fieldnames=[
                    'Days_Past', 'Date', 'Cases', 'Deaths', 'Cases_New',
                    'Deaths_New', 'Cases_Per_Million', 'Deaths_Per_Million',
                    'Cases_New_Per_Million', 'Deaths_New_Per_Million',
                    'Cases_Doubling_Time', 'Deaths_Doubling_Time',
                    'Cases_Change_Factor', 'Deaths_Change_Factor',
                    'Days_Since_2nd_Death', 'Cases_Last_Week_Per_Million',
                    'Deaths_Last_Week_Per_Million'
                ])
            csvwriter.writeheader()

            for d in l_country_data:
                csvwriter.writerow(d)
Example #7
def export_data(d_states_data: dict):
    # export JSON and CSV
    for code in d_states_data.keys():
        outfile = f'data/de-states/de-state-{code}.tsv'
        l_time_series = d_states_data[code]

        helper.write_json(f'data/de-states/de-state-{code}.json',
                          d=l_time_series,
                          sort_keys=True)

        with open(outfile, mode='w', encoding='utf-8', newline='\n') as fh:
            csvwriter = csv.DictWriter(
                fh,
                delimiter='\t',
                extrasaction='ignore',
                fieldnames=[
                    'Days_Past',
                    'Date',
                    'Cases',
                    'Deaths',
                    'Cases_New',
                    'Deaths_New',
                    'Cases_Last_Week',
                    'Deaths_Last_Week',
                    'Cases_Per_Million',
                    'Deaths_Per_Million',
                    'Cases_New_Per_Million',
                    'Deaths_New_Per_Million',
                    'Cases_Last_Week_Per_Million',
                    'Deaths_Last_Week_Per_Million',
                    'Cases_Last_Week_Per_100000',
                    #                'Cases_Doubling_Time', 'Deaths_Doubling_Time',
                    'DIVI_Intensivstationen_Covid_Prozent',
                    'DIVI_Intensivstationen_Betten_belegt_Prozent',
                    'Cases_Last_Week_Doubling_Time',
                    'Cases_Last_Week_7Day_Percent'
                ])
            csvwriter.writeheader()
            for d in l_time_series:
                csvwriter.writerow(d)
Example #8
    def preprocess(self):
        """
        Naive preprocessor that creates the dataset for CLAMS by cloning all repos and filtering locally.
        :return:
        """
        repos_dir = os.path.join(os.getcwd(), 'repos')
        helper.create_dir(repos_dir)

        bitbucket_client = BitBucketServerClient(
            host=self.bitbucket_host,
            is_ssh=False,
            credentials=self.bitbucket_credentials)
        repos = bitbucket_client.get_bitbucket_server_repos(self.client_repos)
        self.clone_repos(bitbucket_client, repos, repos_dir)

        for project in projects_map:
            package_name = projects_map[project]['package']

            print "Removing previous session's results..."
            directory = os.path.join(os.getcwd(), 'files', project)
            helper.delete_dir(directory)
            helper.create_dir(directory)
            print "Ready to run new session!\n"

            # use the following command to filter files
            os.system("find ./repos -iname '*.java' | xargs -n16 -P8 grep -l" +
                      " \"" + package_name + "\" > " + project + ".txt")

            fname = project + ".txt"
            files_urls = {}
            if os.path.exists(fname):
                self.process_filtered_results(directory, files_urls, fname,
                                              project)

                print("Writing files' BitBucket Server urls to file...")
                helper.write_json(files_urls, 'files_urls', directory)
                print("Files' BitBucket urls are now stored in a json file!\n")
            else:
                print('No usage examples found for ' + project)
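
Building the find | xargs | grep pipeline by string concatenation is fragile if package_name ever contains shell metacharacters. A sketch of an equivalent filter using subprocess with explicit argument lists instead of os.system, assuming GNU find and grep are on the PATH (the function name is hypothetical; package_name and project are the same variables as in preprocess above):

import subprocess


def filter_files_by_package(package_name: str, project: str, batch_size: int = 16):
    """List all .java files under ./repos and write the paths of those that
    mention package_name to <project>.txt, mirroring the shell pipeline."""
    result = subprocess.run(['find', './repos', '-iname', '*.java'],
                            capture_output=True, text=True, check=True)
    java_files = result.stdout.splitlines()
    matching = []
    for i in range(0, len(java_files), batch_size):
        batch = java_files[i:i + batch_size]
        # grep exits with 1 when nothing matches, so no check=True here
        grep = subprocess.run(['grep', '-l', package_name] + batch,
                              capture_output=True, text=True)
        matching.extend(grep.stdout.splitlines())
    with open(project + '.txt', mode='w', encoding='utf-8') as fh:
        fh.write('\n'.join(matching))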
Example #9
def write_recipe_matrix(outfile='recipe_matrix.json'):
    '''2D matrix whose rows are ingredients and whose columns are recipes.
    A 1 denotes the occurrence of an ingredient in a given recipe.'''
    ingreds = helper.get_json('all_ingreds_filtered.json')
    recipes = helper.get_json('recipe_data_filtered.json')

    titles = []
    for recipe in recipes:
        titles.append(recipe['title'])

    df = pd.DataFrame(0, ingreds, titles)

    ingreds = set(ingreds)
    for recipe in recipes:
        recipe_ingreds = set(recipe['ingreds'])
        matches = recipe_ingreds & ingreds
        if len(matches) > 0:
            df.loc[list(matches), recipe['title']] = 1

    data = df.to_numpy()
    data = data.tolist()
    helper.write_json(data, outfile, 'w')
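
Only the raw 0/1 values survive serialization, so the row and column labels have to be re-attached on load. A sketch of the round trip, assuming the same three JSON files as above (the variable names are illustrative):

import pandas as pd

ingreds = helper.get_json('all_ingreds_filtered.json')
recipes = helper.get_json('recipe_data_filtered.json')
matrix = helper.get_json('recipe_matrix.json')

titles = [recipe['title'] for recipe in recipes]
df = pd.DataFrame(matrix, index=ingreds, columns=titles)
# e.g. number of recipes each ingredient occurs in:
counts = df.sum(axis=1).sort_values(ascending=False)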
Example #10
def save_data(json_data, filename_part):
    filename_with_path = os.path.join(config.data_directory, filename_part + ".json")
    temp_filename_with_path = filename_with_path + ".temp"
    bak_filename_with_path = os.path.join(config.data_backup_directory, filename_part + ".json"
                                          + datetime.now().strftime('.%Y-%m-%d-%H-%M-%S.bak'))
    exists = os.path.isfile(filename_with_path)
    if exists:
        hlp.write_json(json_data, temp_filename_with_path)
        if os.path.getsize(filename_with_path) == os.path.getsize(temp_filename_with_path):
            os.remove(temp_filename_with_path)
            hlp.write_log("File already existed and is the same. Keeping old file: "
                          + filename_with_path, dtm_prefix=False)
        else:  # files are different
            # backup old file
            os.rename(filename_with_path, bak_filename_with_path)
            # rename temporary file
            os.rename(temp_filename_with_path, filename_with_path)
            hlp.write_log("File already exists but it's different. Old file was backed up as: " +
                          bak_filename_with_path, dtm_prefix=False)
    else:
        hlp.write_json(json_data, filename_with_path)
        hlp.write_log("File downloaded and stored as: " + filename_with_path, dtm_prefix=False)
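
save_data treats equal file size as "same content", which can miss changes that keep the byte count constant. A sketch of a stricter check by hash, as an alternative under the same temp-file scheme (files_identical is a hypothetical helper, not part of hlp):

import hashlib


def files_identical(path_a: str, path_b: str) -> bool:
    """Compare two files by SHA-256 digest instead of by size."""
    def digest(path):
        h = hashlib.sha256()
        with open(path, mode='rb') as fh:
            for chunk in iter(lambda: fh.read(65536), b''):
                h.update(chunk)
        return h.digest()
    return digest(path_a) == digest(path_b)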
Example #11
    def preprocess(self):
        """
        This method runs the preprocessor that creates the dataset for CLAMS by using Hound to filter down to specific files.
        :return:
        """
        for project in projects_map:
            package_name = projects_map[project]['package']

            print "Removing previous session's results..."
            directory = os.path.join(os.getcwd(), 'files', project)
            helper.delete_dir(directory)
            helper.create_dir(directory)
            print "Ready to run new session!\n"

            # search on Hound
            print "\nSearching on Hound..."
            hound_client = HoundClient(self.hound_host, self.hound_credentials)
            hound_query = {'q': 'import ' + package_name, 'i': 'nope', 'files': '.java', 'repos': '*'}
            json_response = hound_client.search(hound_query)
            files_urls = self.parse_hound_response(project, json_response)
            print "Search completed!\n"

            # download files from BitBucket Server
            print "Downloading files..."
            bitbucket_client = BitBucketServerClient(host=self.bitbucket_host, is_ssh=self.is_bitbucket_ssh,
                                                     credentials=self.bitbucket_credentials)
            for file_name, info in files_urls.items():
                response = bitbucket_client.download_file(info)
                helper.write_file_content(response, file_name, directory, self.is_bitbucket_ssh)
            print "Files are now stored locally!\n"

            print("Writing files' BitBucket Server urls to file...")
            helper.write_json(files_urls, 'files_urls', directory)
            print("Files' BitBucket Server urls are now stored in a json file!\n")
            # sleep for 1 s to avoid overloading Hound/BitBucket;
            # remove this if latency is not an issue for you
            time.sleep(1)
Example #12
def export_data(d_states_data: dict):
    # export JSON and CSV
    for code in d_states_data.keys():
        outfile = f'data/de-states/de-state-{code}.tsv'
        l_time_series = d_states_data[code]

        helper.write_json(
            f'data/de-states/de-state-{code}.json', l_time_series)

        with open(outfile, mode='w', encoding='utf-8', newline='\n') as fh:
            csvwriter = csv.DictWriter(fh, delimiter='\t', extrasaction='ignore', fieldnames=[
                'Days_Past', 'Date',
                'Cases', 'Deaths',
                'Cases_New', 'Deaths_New',
                'Cases_Last_Week', 'Deaths_Last_Week',
                'Cases_Per_Million', 'Deaths_Per_Million',
                'Cases_New_Per_Million', 'Deaths_New_Per_Million',
                'Cases_Last_Week_Per_Million', 'Deaths_Last_Week_Per_Million',
                'Cases_Doubling_Time', 'Deaths_Doubling_Time'
            ]
            )
            csvwriter.writeheader()
            for d in l_time_series:
                csvwriter.writerow(d)
Example #13
def export_data(d_districts_data: dict):
    for lk_id, l_time_series in d_districts_data.items():
        file_out = f'data/de-districts/de-district_timeseries-{lk_id}'
        # Export data as JSON
        helper.write_json(
            file_out+'.json', d=l_time_series, sort_keys=True)

        with open(file_out+'.tsv', mode='w', encoding='utf-8', newline='\n') as fh_csv:
            csvwriter = csv.DictWriter(fh_csv, delimiter='\t', extrasaction='ignore', fieldnames=[
                'Days_Past', 'Date',
                'Cases', 'Deaths',
                'Cases_New', 'Deaths_New',
                'Cases_Last_Week', 'Deaths_Last_Week',
                'Cases_Per_Million', 'Deaths_Per_Million',
                'Cases_New_Per_Million', 'Deaths_New_Per_Million',
                'Cases_Last_Week_Per_Million', 'Deaths_Last_Week_Per_Million',
                # 'Cases_Doubling_Time', 'Deaths_Doubling_Time',
                'DIVI_Intensivstationen_Covid_Prozent',
                'DIVI_Intensivstationen_Betten_belegt_Prozent', 'Cases_Last_Week_7Day_Percent'
            ]
            )
            csvwriter.writeheader()
            for d in l_time_series:
                csvwriter.writerow(d)
Example #14
def write_all_ingreds_lemma(infile='all_ingreds_filtered.json',
                            outfile='static/all_ingreds_lemma.json'):
    """Save json of lemmatization of ingreds in infile to outfile."""
    ingreds = helper.get_json(infile)
    ingreds = [lemmatize(ingred) for ingred in ingreds]
    helper.write_json(ingreds, outfile, 'w')
Example #15
def export_data():
    global d_data_all
    helper.write_json(filename + '.json',
                      d_data_all,
                      sort_keys=False,
                      indent=1)
Example #16
def generate_database() -> dict:
    d_database = {}
    for csv_file in glob.glob('data/de-divi/downloaded/*.csv'):
        (filepath, fileName) = os.path.split(csv_file)
        (fileBaseName, fileExtension) = os.path.splitext(fileName)
        date = fileBaseName
        del filepath, fileName, fileBaseName, fileExtension

        # file 2020-04-24.csv:
        # bundesland,kreis,anzahl_standorte,betten_frei,betten_belegt,faelle_covid_aktuell_im_bundesland
        # file 2020-04-26.csv:
        # gemeindeschluessel,anzahl_meldebereiche,faelle_covid_aktuell,faelle_covid_aktuell_beatmet,anzahl_standorte,betten_frei,betten_belegt,bundesland
        # file 2020-04-28.csv:
        # gemeindeschluessel,anzahl_meldebereiche,faelle_covid_aktuell,faelle_covid_aktuell_beatmet,anzahl_standorte,betten_frei,betten_belegt,bundesland,daten_stand
        # file 2020-06-28.csv:
        # bundesland,gemeindeschluessel,anzahl_meldebereiche,faelle_covid_aktuell,faelle_covid_aktuell_beatmet,anzahl_standorte,betten_frei,betten_belegt,daten_stand

        # -> skip files 2020-04-24.csv and 2020-04-25.csv (old column layout)
        if date in ('2020-04-24', '2020-04-25'):
            continue

        with open(csv_file, mode='r', encoding='utf-8') as f:
            csv_reader = csv.DictReader(f, delimiter=",")
            for row in csv_reader:
                assert len(row) >= 8, "Error: too few columns found"
                bl_id = row["bundesland"]
                lk_id = row["gemeindeschluessel"]
                d = {
                    # "bl_id": row["bundesland"],
                    # "lk_id": row["gemeindeschluessel"],
                    "Date": date,
                    "anzahl_meldebereiche": int(row["anzahl_meldebereiche"]),
                    "faelle_covid_aktuell": int(row["faelle_covid_aktuell"]),
                    "faelle_covid_aktuell_beatmet": int(row["faelle_covid_aktuell_beatmet"]),
                    "anzahl_standorte": int(row["anzahl_standorte"]),
                    "betten_frei": int(float(row["betten_frei"])),
                    "betten_belegt": int(float(row["betten_belegt"]))
                }
                d["betten_ges"] = d["betten_frei"] + d["betten_belegt"]
                if d["betten_ges"] > 0:
                    d["betten_belegt_proz"] = round(100 *
                                                    d["betten_belegt"] / d["betten_ges"], 1)
                    d["faelle_covid_aktuell_proz"] = round(100*d["faelle_covid_aktuell"] /
                                                           d["betten_ges"], 1)
                else:
                    d["betten_belegt_proz"] = None
                    d["faelle_covid_aktuell_proz"] = None
                if d["faelle_covid_aktuell"] > 0:
                    d["faelle_covid_aktuell_beatmet_proz"] = round(
                        100*d["faelle_covid_aktuell_beatmet"] / d["faelle_covid_aktuell"], 1)
                else:
                    d["faelle_covid_aktuell_beatmet_proz"] = 0

                # if "daten_stand" in row:
                #     d["daten_stand"] = row["daten_stand"]
                # else:
                #     d["daten_stand"] = date

                if lk_id not in d_database:
                    d_database[lk_id] = []
                d_database[lk_id].append(d)

    helper.write_json('cache/de-divi/de-divi-V3.json',
                      d_database, sort_keys=True, indent=1)
    return d_database
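
To make the derived fields concrete, a hypothetical sample row: betten_frei=20 and betten_belegt=60 give betten_ges=80 and betten_belegt_proz=75.0, and with faelle_covid_aktuell=12 and faelle_covid_aktuell_beatmet=3 the shares are 15.0 and 25.0 percent:

row = {"betten_frei": "20", "betten_belegt": "60",
       "faelle_covid_aktuell": "12", "faelle_covid_aktuell_beatmet": "3"}
betten_ges = int(float(row["betten_frei"])) + int(float(row["betten_belegt"]))  # 80
betten_belegt_proz = round(100 * int(float(row["betten_belegt"])) / betten_ges, 1)  # 75.0
faelle_covid_aktuell_proz = round(100 * int(row["faelle_covid_aktuell"]) / betten_ges, 1)  # 15.0
faelle_beatmet_proz = round(
    100 * int(row["faelle_covid_aktuell_beatmet"]) / int(row["faelle_covid_aktuell"]), 1)  # 25.0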
Example #17
def extract_latest_date_data():
    """
    for all countries in json: extract latest entry
    write to data/int/countries-latest-all.tsv and data/int/countries-latest-all.json
    """

    d_countries_latest = helper.extract_latest_data(d_countries_ref,
                                                    d_countries_timeseries)

    l_for_export = []
    with open('data/int/countries-latest-all.tsv',
              mode='w',
              encoding='utf-8',
              newline='\n') as fh:
        csvwriter = csv.DictWriter(
            fh,
            delimiter='\t',
            extrasaction='ignore',
            fieldnames=[
                'Country', 'Population', 'Date', 'Cases', 'Deaths',
                'Cases_Per_Million', 'Deaths_Per_Million',
                'Cases_Last_Week_Per_Million', 'Deaths_Last_Week_Per_Million',
                'Continent', 'Code', 'DoublingTime_Cases_Last_Week_Per_100000'
            ])
        csvwriter.writeheader()

        for country in sorted(d_countries_latest.keys(), key=str.casefold):
            d2 = d_countries_latest[country]
            d2['Country'] = country
            csvwriter.writerow(d2)
            l_for_export.append(d2)

    # JSON export
    helper.write_json(filename='data/int/countries-latest-all.json',
                      d=l_for_export)

    # for selected countries write to separate file, for Gnuplot plotting
    with open('data/int/countries-latest-selected.tsv',
              mode='w',
              encoding='utf-8',
              newline='\n') as fh:
        csvwriter = csv.DictWriter(fh,
                                   delimiter='\t',
                                   extrasaction='ignore',
                                   fieldnames=[
                                       'Country', 'Date', 'Population',
                                       'Cases', 'Deaths', 'Cases_Per_Million',
                                       'Deaths_Per_Million'
                                   ])
        csvwriter.writeheader()
        for country in sorted(d_selected_countries.keys(), key=str.casefold):
            l_time_series = d_countries_timeseries[country]
            d = l_time_series[-1]  # last entry for this country
            d["Country"] = country
            d['Population'] = d_selected_countries[country]['Population']
            csvwriter.writerow(d)
Example #18
def generate_database() -> dict:
    d_database = {}
    # Bundesländer: state IDs '01'..'16' plus a Germany-wide total
    d_database_states = {f'{i:02d}': {} for i in range(1, 17)}
    d_database_states['DE-total'] = {}
    for csv_file in glob.glob('data/de-divi/downloaded/*.csv'):
        (filepath, fileName) = os.path.split(csv_file)
        (fileBaseName, fileExtension) = os.path.splitext(fileName)
        date = fileBaseName
        del filepath, fileName, fileBaseName, fileExtension

        # file 2020-04-24.csv:
        # bundesland,kreis,anzahl_standorte,betten_frei,betten_belegt,faelle_covid_aktuell_im_bundesland
        # file 2020-04-26.csv:
        # gemeindeschluessel,anzahl_meldebereiche,faelle_covid_aktuell,faelle_covid_aktuell_beatmet,anzahl_standorte,betten_frei,betten_belegt,bundesland
        # file 2020-04-28.csv:
        # gemeindeschluessel,anzahl_meldebereiche,faelle_covid_aktuell,faelle_covid_aktuell_beatmet,anzahl_standorte,betten_frei,betten_belegt,bundesland,daten_stand
        # file 2020-06-28.csv:
        # bundesland,gemeindeschluessel,anzahl_meldebereiche,faelle_covid_aktuell,faelle_covid_aktuell_beatmet,anzahl_standorte,betten_frei,betten_belegt,daten_stand

        # -> skip files 2020-04-24.csv and 2020-04-25.csv (old column layout)
        if date in ('2020-04-24', '2020-04-25'):
            continue

        with open(csv_file, mode='r', encoding='utf-8') as f:
            csv_reader = csv.DictReader(f, delimiter=",")
            for row in csv_reader:
                assert len(row) >= 8, "Error: too few columns found"
                bl_id = row["bundesland"]
                lk_id = row["gemeindeschluessel"]
                d = {
                    # "bl_id": row["bundesland"],
                    # "lk_id": row["gemeindeschluessel"],
                    "Date": date,
                    "anzahl_meldebereiche": int(row["anzahl_meldebereiche"]),
                    "faelle_covid_aktuell": int(row["faelle_covid_aktuell"]),
                    "faelle_covid_aktuell_beatmet": int(row["faelle_covid_aktuell_beatmet"]),
                    "anzahl_standorte": int(row["anzahl_standorte"]),
                    "betten_frei": int(float(row["betten_frei"])),
                    "betten_belegt": int(float(row["betten_belegt"]))
                }
                d["betten_ges"] = d["betten_frei"] + d["betten_belegt"]
                if d["betten_ges"] > 0:
                    d["betten_belegt_proz"] = round(100 *
                                                    d["betten_belegt"] / d["betten_ges"], 1)
                    d["faelle_covid_aktuell_proz"] = round(100*d["faelle_covid_aktuell"] /
                                                           d["betten_ges"], 1)
                else:
                    d["betten_belegt_proz"] = None
                    d["faelle_covid_aktuell_proz"] = None
                if d["faelle_covid_aktuell"] > 0:
                    d["faelle_covid_aktuell_beatmet_proz"] = round(
                        100*d["faelle_covid_aktuell_beatmet"] / d["faelle_covid_aktuell"], 1)
                else:
                    d["faelle_covid_aktuell_beatmet_proz"] = 0

                # if "daten_stand" in row:
                #     d["daten_stand"] = row["daten_stand"]
                # else:
                #     d["daten_stand"] = date

                if lk_id not in d_database:
                    d_database[lk_id] = []
                d_database[lk_id].append(d)

                # calc de_states_sum
                d2 = dict(d)
                for k in ('Date', 'betten_ges', 'betten_belegt_proz',
                          'faelle_covid_aktuell_proz',
                          'faelle_covid_aktuell_beatmet_proz'):
                    del d2[k]
                if date not in d_database_states[bl_id]:
                    d_database_states[bl_id][date] = d2
                else:
                    for k in d2.keys():
                        d_database_states[bl_id][date][k] += d2[k]
                # 'DE-total': store a copy, otherwise the dict kept for the
                # first state of the day would be shared with the total and
                # its values counted twice
                if date not in d_database_states['DE-total']:
                    d_database_states['DE-total'][date] = dict(d2)
                else:
                    for k in d2.keys():
                        d_database_states['DE-total'][date][k] += d2[k]

    helper.write_json('cache/de-divi/de-divi-V3.json',
                      d_database, sort_keys=True, indent=1)

    d_database_states2 = {}
    for bl_id in d_database_states.keys():
        bl_code = d_bl_id2code[bl_id]
        d_database_states2[bl_code] = []
        for date, d in d_database_states[bl_id].items():
            d['Date'] = date
            # recompute derived fields (same logic as above):
            d["betten_ges"] = d["betten_frei"] + d["betten_belegt"]
            if d["betten_ges"] > 0:
                d["betten_belegt_proz"] = round(100 *
                                                d["betten_belegt"] / d["betten_ges"], 1)
                d["faelle_covid_aktuell_proz"] = round(100*d["faelle_covid_aktuell"] /
                                                       d["betten_ges"], 1)
            else:
                d["betten_belegt_proz"] = None
                d["faelle_covid_aktuell_proz"] = None
            if d["faelle_covid_aktuell"] > 0:
                d["faelle_covid_aktuell_beatmet_proz"] = round(
                    100*d["faelle_covid_aktuell_beatmet"] / d["faelle_covid_aktuell"], 1)
            else:
                d["faelle_covid_aktuell_beatmet_proz"] = 0

            d_database_states2[bl_code].append(d)
    del d_database_states

    helper.write_json('cache/de-divi/de-divi-V3-states.json',
                      d_database_states2, sort_keys=True, indent=1)

    return d_database
Example #19
for country_name in d_country_ref_data:
    d_country_ref_data[country_name]['Population'] = int(
        d_country_ref_data[country_name]['Population'])
    d_country_ref_data[country_name]['geonameid'] = int(
        d_country_ref_data[country_name]['geonameid'])
    d_country_ref_data[country_name]['ISO-Numeric'] = int(
        d_country_ref_data[country_name]['ISO-Numeric'])
    d_country_ref_data[country_name]['Area(in sq km)'] = float(
        d_country_ref_data[country_name]['Area(in sq km)'])

if d_country_ref_data['Eritrea']['Population'] == 0:
    d_country_ref_data['Eritrea']['Population'] = 5750433

# export as json
helper.write_json(file_JSON, d_country_ref_data)

# export as csv

l_for_export = []
for country_name in sorted(d_country_ref_data.keys()):
    d = d_country_ref_data[country_name]
    d['Country'] = country_name
    l_for_export.append(d)
del d

keys = header_row
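# note: header_row is defined elsewhere in the source script; with the default
# extrasaction='raise', csv.DictWriter raises a ValueError if a row contains
# keys that are missing from fieldnames, so header_row must cover every key
# incl. the added 'Country', or pass extrasaction='ignore' as in the examples above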
with open(file_CSV, mode='w', encoding='utf-8', newline='\n') as file:
    dict_writer = csv.DictWriter(file, keys, delimiter="\t")
    dict_writer.writeheader()
    dict_writer.writerows(l_for_export)