Example #1
def collect_bbbike_subregion_catalogue(confirmation_required=True,
                                       verbose=False):
    """
    :param confirmation_required: [bool] (default: True)
    :param verbose: [bool] (default: False)

    Testing e.g.
        confirmation_required = True
        verbose               = True
        collect_bbbike_subregion_catalogue(confirmation_required, verbose)
    """
    if confirmed("To collect BBBike subregion catalogue? ",
                 confirmation_required=confirmation_required):
        try:
            home_url = 'http://download.bbbike.org/osm/bbbike/'
            bbbike_subregion_catalogue = pd.read_html(
                home_url, header=0, parse_dates=['Last Modified'])[0].drop(0)
            bbbike_subregion_catalogue.Name = bbbike_subregion_catalogue.Name.map(
                lambda x: x.strip('/'))

            save_pickle(bbbike_subregion_catalogue,
                        cd_dat("BBBike-subregion-catalogue.pickle"),
                        verbose=verbose)

            bbbike_subregion_names = bbbike_subregion_catalogue.Name.tolist()
            save_pickle(bbbike_subregion_names,
                        cd_dat("BBBike-subregion-name-list.pickle"),
                        verbose=verbose)

        except Exception as e:
            print("Failed to get the required information ... {}.".format(e))
    else:
        print(
            "The information collection process was not activated. The existing local copy will be loaded instead."
        )
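
For reference, the core scraping step above can be reproduced on its own; a minimal sketch, assuming pandas (with lxml available to read_html) and a live connection to download.bbbike.org:

import pandas as pd

home_url = 'http://download.bbbike.org/osm/bbbike/'
# The first table on the page is the directory listing; drop its first (parent-directory) row.
catalogue = pd.read_html(home_url, header=0, parse_dates=['Last Modified'])[0].drop(0)
catalogue.Name = catalogue.Name.map(lambda x: x.strip('/'))  # e.g. 'Aachen/' -> 'Aachen'
print(catalogue.Name.tolist()[:5])
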
Example #2
def collect_continents_subregion_tables(confirmation_required=True,
                                        verbose=False):
    """
    :param confirmation_required: [bool] (default: True) whether to confirm before starting to collect the information
    :param verbose: [bool] (default: False)

    Testing e.g.
        confirmation_required = True
        verbose               = True
        collect_continents_subregion_tables(confirmation_required, verbose)
    """
    if confirmed("To collect information about subregions of each continent? ",
                 confirmation_required=confirmation_required):
        try:
            home_link = 'https://download.geofabrik.de/'
            source = requests.get(home_link)
            soup = bs4.BeautifulSoup(source.text,
                                     'lxml').find_all('td',
                                                      {'class': 'subregion'})
            source.close()
            continent_names = [td.a.text for td in soup]
            continent_links = [
                urllib.parse.urljoin(home_link, td.a['href']) for td in soup
            ]
            subregion_tbls = dict(
                zip(continent_names, [
                    get_subregion_table(url, verbose)
                    for url in continent_links
                ]))
            save_pickle(subregion_tbls,
                        cd_dat("GeoFabrik-continents-subregion-tables.pickle"),
                        verbose=verbose)
        except Exception as e:
            print(
                "Failed to collect the required information ... {}.".format(e))
    else:
        print(
            "The information collection process was not activated. The existing local copy will be loaded instead."
        )
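
The continent-scraping step can likewise be run standalone; a minimal sketch, assuming requests, bs4 (with lxml) and a live connection to download.geofabrik.de:

import urllib.parse
import bs4
import requests

home_link = 'https://download.geofabrik.de/'
source = requests.get(home_link)
cells = bs4.BeautifulSoup(source.text, 'lxml').find_all('td', {'class': 'subregion'})
source.close()
continent_names = [td.a.text for td in cells]
continent_links = [urllib.parse.urljoin(home_link, td.a['href']) for td in cells]
print(dict(zip(continent_names, continent_links)))
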
Example #3
def collect_bbbike_download_catalogue(confirmation_required=True,
                                      verbose=False):
    """
    :param confirmation_required: [bool] (default: True)
    :param verbose: [bool] (default: False)

    Testing e.g.
        confirmation_required = True
        verbose               = True
        collect_bbbike_download_catalogue(confirmation_required, verbose)
    """
    if confirmed("To collect BBBike download dictionary? ",
                 confirmation_required=confirmation_required):
        try:
            bbbike_subregion_names = fetch_bbbike_subregion_catalogue(
                "BBBike-subregion-name-list", update=True)
            download_catalogue = [
                fetch_bbbike_subregion_download_catalogue(
                    subregion_name, update=True, confirmation_required=False)
                for subregion_name in bbbike_subregion_names
            ]

            subregion_name, subregion_download_catalogue = bbbike_subregion_names[
                0], download_catalogue[0]

            # Available file formats
            file_fmt = [
                re.sub('{}|CHECKSUM'.format(subregion_name), '', f)
                for f in subregion_download_catalogue.Filename
            ]
            save_pickle(file_fmt[:-2],
                        cd_dat("BBBike-osm-file-formats.pickle"),
                        verbose=verbose)

            # Available data types
            data_typ = subregion_download_catalogue.DataType.tolist()
            save_pickle(data_typ[:-2],
                        cd_dat("BBBike-osm-data-types.pickle"),
                        verbose=verbose)

            # available_file_formats = dict(zip(file_fmt, file_ext))

            downloads_dictionary = dict(
                zip(bbbike_subregion_names, download_catalogue))
            save_pickle(downloads_dictionary,
                        cd_dat("BBBike-download-catalogue.pickle"),
                        verbose=verbose)
        except Exception as e:
            print("Failed to collect BBBike download dictionary. {}".format(e))
    else:
        print(
            "The information collection process was not activated. The existing local copy will be loaded instead."
        )
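
The file-format list is derived by stripping the subregion name (and the CHECKSUM entry) from each filename; a small illustration with hypothetical filenames:

import re

subregion_name = 'Leeds'  # hypothetical subregion
filenames = ['Leeds.osm.pbf', 'Leeds.osm.gz', 'Leeds.osm.shp.zip', 'CHECKSUM.txt']
file_fmt = [re.sub('{}|CHECKSUM'.format(subregion_name), '', f) for f in filenames]
print(file_fmt)  # ['.osm.pbf', '.osm.gz', '.osm.shp.zip', '.txt']
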
Example #4
def read_osm_pbf(subregion_name,
                 data_dir=None,
                 parsed=True,
                 file_size_limit=50,
                 fmt_other_tags=True,
                 fmt_single_geom=True,
                 fmt_multi_geom=True,
                 update=False,
                 download_confirmation_required=True,
                 pickle_it=False,
                 rm_osm_pbf=False,
                 verbose=False):
    """
    :param subregion_name: [str] e.g. 'rutland'
    :param data_dir: [str; None (default)] customised path of a .osm.pbf file
    :param parsed: [bool] (default: True)
    :param file_size_limit: [numbers.Number] (default: 50) limit of file size (in MB), e.g. 50 or 100
    :param fmt_other_tags: [bool] (default: True)
    :param fmt_single_geom: [bool] (default: True)
    :param fmt_multi_geom: [bool] (default: True)
    :param update: [bool] (default: False)
    :param download_confirmation_required: [bool] (default: True)
    :param pickle_it: [bool] (default: False)
    :param rm_osm_pbf: [bool] (default: False)
    :param verbose: [bool] (default: False)
    :return: [dict; None]

    If 'data_dir' is not specified, the default file path (based on 'subregion_name') will be used.

    Example:
        subregion_name                 = 'Rutland'
        data_dir                       = None
        parsed                         = True
        file_size_limit                = 50
        fmt_other_tags                 = True
        fmt_single_geom                = True
        fmt_multi_geom                 = True
        update                         = False
        download_confirmation_required = True
        pickle_it                      = False
        rm_osm_pbf                     = True
        verbose                        = False
        read_osm_pbf(subregion_name, data_dir, parsed, file_size_limit, fmt_other_tags, fmt_single_geom, fmt_multi_geom,
                     update, download_confirmation_required, pickle_it, rm_osm_pbf, verbose)
    """
    assert isinstance(file_size_limit, int) or file_size_limit is None

    osm_pbf_filename, path_to_osm_pbf = get_default_path_to_osm_file(
        subregion_name, ".osm.pbf", mkdir=False)
    if osm_pbf_filename and path_to_osm_pbf:
        if data_dir:  # A customised data directory is specified
            osm_pbf_dir = regulate_input_data_dir(data_dir)
            path_to_osm_pbf = os.path.join(osm_pbf_dir, osm_pbf_filename)
        # Otherwise, keep the default file path

        subregion_filename = os.path.basename(path_to_osm_pbf)

        path_to_pickle = path_to_osm_pbf.replace(
            ".osm.pbf", ".pickle" if parsed else "-raw.pickle")
        if os.path.isfile(path_to_pickle) and not update:
            osm_pbf_data = load_pickle(path_to_pickle, verbose=verbose)
        else:
            # If the target file is not available, try downloading it first.
            if not os.path.isfile(path_to_osm_pbf) or update:
                try:
                    download_subregion_osm_file(subregion_name,
                                                osm_file_format=".osm.pbf",
                                                download_dir=data_dir,
                                                download_confirmation_required=
                                                download_confirmation_required,
                                                update=update,
                                                verbose=False)
                except Exception as e:
                    print("Cancelled reading data. CAUSE: {}".format(e))
                # Note: the freshly downloaded file is not parsed in this call; re-run the function to read it
                osm_pbf_data = None

            else:
                file_size_in_mb = round(
                    os.path.getsize(path_to_osm_pbf) / (1024**2), 1)

                if file_size_limit and file_size_in_mb > file_size_limit:
                    # Parsing the '.osm.pbf' file in a chunk-wise way
                    chunks_no = math.ceil(file_size_in_mb / file_size_limit)
                else:
                    chunks_no = None

                print("\nParsing \"{}\" ... ".format(subregion_filename),
                      end="") if verbose else ""
                try:
                    osm_pbf_data = parse_osm_pbf(path_to_osm_pbf, chunks_no,
                                                 parsed, fmt_other_tags,
                                                 fmt_single_geom,
                                                 fmt_multi_geom)
                    print("Successfully.\n") if verbose else ""
                    if pickle_it:
                        save_pickle(osm_pbf_data,
                                    path_to_pickle,
                                    verbose=verbose)
                except Exception as e:
                    print("Failed. CAUSE: \"{}\"\n".format(e))
                    osm_pbf_data = None

                if rm_osm_pbf:
                    remove_subregion_osm_file(path_to_osm_pbf, verbose=verbose)

        return osm_pbf_data

    else:
        print(
            "Errors occur. Maybe check with the input \"subregion_name\" first."
        )
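
The chunk count is simply the ratio of the file size to the size limit, rounded up; for instance (values hypothetical):

import math

file_size_limit = 50     # MB
file_size_in_mb = 120.3  # hypothetical size of the downloaded .osm.pbf file
chunks_no = (math.ceil(file_size_in_mb / file_size_limit)
             if file_size_limit and file_size_in_mb > file_size_limit
             else None)
print(chunks_no)  # 3
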
Example #5
def read_shp_zip(subregion_name,
                 layer,
                 feature=None,
                 data_dir=None,
                 update=False,
                 download_confirmation_required=True,
                 pickle_it=False,
                 rm_extracts=False,
                 rm_shp_zip=False,
                 verbose=False):
    """
    :param subregion_name: [str] e.g. 'england', 'oxfordshire', or 'europe'; case-insensitive
    :param layer: [str] e.g. 'railways'
    :param feature: [str; None (default)] e.g. 'rail'; if None, all available features included
    :param data_dir: [str; None (default)]
    :param update: [bool] (default: False) whether to update the relevant file/information
    :param download_confirmation_required: [bool] (default: True)
    :param pickle_it: [bool] (default: False)
    :param rm_extracts: [bool] (default: False) whether to delete extracted files from the .shp.zip file
    :param rm_shp_zip: [bool] (default: False) whether to delete the downloaded .shp.zip file
    :param verbose: [bool] (default: False)
    :return: [gpd.GeoDataFrame]

    Example:
        subregion_name                 = 'Rutland'
        layer                          = 'railways'
        feature                        = None
        data_dir                       = cd("test_read_GeoFabrik")
        update                         = False
        download_confirmation_required = True
        pickle_it                      = False
        rm_extracts                    = True
        rm_shp_zip                     = False
        verbose                        = True
        read_shp_zip(subregion_name, layer, feature, data_dir, update, download_confirmation_required, pickle_it,
                     rm_extracts, rm_shp_zip, verbose)
    """
    shp_zip_filename, path_to_shp_zip = get_default_path_to_osm_file(
        subregion_name, ".shp.zip", mkdir=False)
    if shp_zip_filename and path_to_shp_zip:
        extract_dir = os.path.splitext(path_to_shp_zip)[0]
        if data_dir:
            shp_zip_dir = regulate_input_data_dir(data_dir)
            path_to_shp_zip = os.path.join(shp_zip_dir, shp_zip_filename)
            extract_dir = os.path.join(shp_zip_dir,
                                       os.path.basename(extract_dir))

        # Make a local path for saving a pickle file for .shp data
        sub_name = "-".join(x
                            for x in [
                                shp_zip_filename.replace(
                                    "-latest-free.shp.zip", ""), layer, feature
                            ] if x)
        path_to_shp_pickle = os.path.join(extract_dir,
                                          sub_name + ".shp.pickle")

        if os.path.isfile(path_to_shp_pickle) and not update:
            shp_data = load_pickle(path_to_shp_pickle, verbose=verbose)
        else:
            # Download the requested .shp.zip file if it is not already available locally
            if not os.path.exists(extract_dir):
                download_subregion_osm_file(shp_zip_filename,
                                            osm_file_format=".shp.zip",
                                            download_dir=data_dir,
                                            update=update,
                                            verbose=verbose,
                                            download_confirmation_required=
                                            download_confirmation_required)

            if os.path.isfile(path_to_shp_zip):
                extract_shp_zip(path_to_shp_zip,
                                extract_dir,
                                layer=layer,
                                verbose=verbose)

            path_to_shp = glob.glob(
                os.path.join(extract_dir, "*{}*.shp".format(layer)))
            if len(path_to_shp) == 0:
                shp_data = None
            elif len(path_to_shp) == 1:
                shp_data = gpd.read_file(
                    path_to_shp[0]
                )  # gpd.GeoDataFrame(read_shp_file(path_to_shp))
                if feature:
                    path_to_shp_feat = path_to_shp[0].replace(
                        layer, layer + "_" + feature)
                    shp_data = gpd.GeoDataFrame(
                        shp_data[shp_data.fclass == feature])
                    shp_data.crs = {
                        'no_defs': True,
                        'ellps': 'WGS84',
                        'datum': 'WGS84',
                        'proj': 'longlat'
                    }
                    shp_data.to_file(path_to_shp_feat, driver='ESRI Shapefile')
            else:  # len(path_to_shp) > 1:
                if not feature:
                    path_to_orig_shp = [
                        p for p in path_to_shp
                        if layer + '_a' in p or layer + '_free' in p
                    ]
                    if len(path_to_orig_shp
                           ) == 1:  # "_a*.shp" is not available
                        shp_data = gpd.read_file(path_to_orig_shp[0])
                    else:
                        shp_data = [gpd.read_file(p) for p in path_to_shp]
                        shp_data = pd.concat(shp_data,
                                             axis=0,
                                             ignore_index=True)
                else:  # a feature is specified
                    path_to_shp_feat = [
                        p for p in path_to_shp
                        if layer + "_" + feature not in p
                    ]
                    if len(path_to_shp_feat) == 1:  # "_a*.shp" does not exist
                        shp_data = gpd.read_file(path_to_shp_feat[0])
                        shp_data = shp_data[shp_data.fclass == feature]
                    else:  # both "_a*" and "_free*" .shp files for the feature are available
                        shp_data = [
                            dat[dat.fclass == feature]
                            for dat in (gpd.read_file(p)
                                        for p in path_to_shp_feat)
                        ]
                        shp_data = pd.concat(shp_data,
                                             axis=0,
                                             ignore_index=True)
                    shp_data.crs = {
                        'no_defs': True,
                        'ellps': 'WGS84',
                        'datum': 'WGS84',
                        'proj': 'longlat'
                    }
                    shp_data.to_file(path_to_shp_feat[0].replace(
                        layer, layer + "_" + feature),
                                     driver='ESRI Shapefile')

            if pickle_it:
                save_pickle(shp_data, path_to_shp_pickle, verbose=verbose)

            if os.path.exists(extract_dir) and rm_extracts:
                # import shutil; shutil.rmtree(extract_dir)
                for f in glob.glob(os.path.join(extract_dir, "gis_osm*")):
                    # if layer not in f:
                    os.remove(f)

            if os.path.isfile(path_to_shp_zip) and rm_shp_zip:
                remove_subregion_osm_file(path_to_shp_zip, verbose=verbose)

        return shp_data
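
The pickle filename is built by joining the non-empty parts of the subregion name, layer and feature; a quick illustration (the GeoFabrik-style .shp.zip filename is hypothetical):

layer, feature = 'railways', 'rail'
shp_zip_filename = 'rutland-latest-free.shp.zip'  # hypothetical GeoFabrik .shp.zip filename
sub_name = "-".join(
    x for x in [shp_zip_filename.replace("-latest-free.shp.zip", ""), layer, feature] if x)
print(sub_name + ".shp.pickle")  # rutland-railways-rail.shp.pickle
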
Example #6
def collect_bbbike_download_catalogue(confirmation_required=True,
                                      verbose=False):
    """
    :param confirmation_required: [bool] (default: True)
    :param verbose: [bool] (default: False)

    Example:
        confirmation_required = True
        verbose               = True
        collect_bbbike_download_catalogue(confirmation_required, verbose)
    """

    #
    def collect_bbbike_subregion_download_catalogue(subregion_name):
        """
        :param subregion_name: [str]

        Example:
            subregion_name = 'leeds'
            collect_bbbike_subregion_download_catalogue(subregion_name)
        """
        def parse_dlc(dlc):
            dlc_href = dlc.get('href')  # URL
            filename, download_url = dlc_href.strip(
                './'), urllib.parse.urljoin(url, dlc_href)
            if not dlc.has_attr('title'):
                file_format, file_size, last_update = 'Poly', None, None
            else:
                if len(dlc.contents) < 3:
                    file_format, file_size = 'Txt', None
                else:
                    file_format, file_size, _ = dlc.contents  # File type and size
                    file_format, file_size = file_format.strip(
                    ), file_size.text
                last_update = pd.to_datetime(dlc.get('title'))  # Date and time
            parsed_dat = [
                filename, download_url, file_format, file_size, last_update
            ]
            return parsed_dat

        subregion_name_ = regulate_bbbike_input_subregion_name(subregion_name)
        #
        try:
            print("  \"{}\" ... ".format(subregion_name_),
                  end="") if verbose else ""
            url = 'https://download.bbbike.org/osm/bbbike/{}/'.format(
                subregion_name_)

            source = urllib.request.urlopen(url)

            import bs4
            source_soup = bs4.BeautifulSoup(source, 'lxml')
            download_links_class = source_soup.find_all(
                name='a', attrs={'class': ['download_link', 'small']})

            subregion_downloads_catalogue = pd.DataFrame(
                parse_dlc(x) for x in download_links_class)
            subregion_downloads_catalogue.columns = [
                'Filename', 'URL', 'DataType', 'Size', 'LastUpdate'
            ]

            # path_to_file = cd_dat_bbbike(subregion_name_, subregion_name_ + "-download-catalogue.pickle")
            # save_pickle(subregion_downloads_catalogue, path_to_file, verbose=verbose)
            print("Done. ") if verbose else ""

        except Exception as e_:
            subregion_downloads_catalogue = None
            print("Failed. {}".format(subregion_name_, e_)) if verbose else ""

        return subregion_downloads_catalogue

    if confirmed("To collect BBBike download dictionary? ",
                 confirmation_required=confirmation_required):
        try:
            bbbike_subregion_names = fetch_bbbike_subregion_catalogue(
                "BBBike-subregion-name-list", verbose=verbose)
            print("Collecting BBBike download catalogue for: "
                  ) if verbose else ""
            download_catalogue = [
                collect_bbbike_subregion_download_catalogue(subregion_name)
                for subregion_name in bbbike_subregion_names
            ]

            sr_name, sr_download_catalogue = bbbike_subregion_names[
                0], download_catalogue[0]

            # Available file formats
            file_fmt = [
                re.sub('{}|CHECKSUM'.format(sr_name), '', f)
                for f in sr_download_catalogue.Filename
            ]
            save_pickle(file_fmt[:-2],
                        cd_dat("BBBike-osm-file-formats.pickle"),
                        verbose=verbose)

            # Available data types
            data_typ = sr_download_catalogue.DataType.tolist()
            save_pickle(data_typ[:-2],
                        cd_dat("BBBike-osm-data-types.pickle"),
                        verbose=verbose)

            # available_file_formats = dict(zip(file_fmt, file_ext))

            downloads_dictionary = dict(
                zip(bbbike_subregion_names, download_catalogue))
            save_pickle(downloads_dictionary,
                        cd_dat("BBBike-download-catalogue.pickle"),
                        verbose=verbose)

        except Exception as e:
            print("Failed to collect BBBike download dictionary. {}".format(
                e)) if verbose else ""
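
Each per-subregion catalogue is a five-column DataFrame; a sketch of its expected shape, with hypothetical row values:

import pandas as pd

rows = [['Leeds.osm.pbf', 'https://download.bbbike.org/osm/bbbike/Leeds/Leeds.osm.pbf',
         'osm.pbf', '(20M)', pd.Timestamp('2019-01-01')]]
catalogue = pd.DataFrame(rows,
                         columns=['Filename', 'URL', 'DataType', 'Size', 'LastUpdate'])
print(catalogue)
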
Example #7
def collect_region_subregion_tier(confirmation_required=True, verbose=False):
    """
    :param confirmation_required: [bool] (default: True) whether to confirm before collecting region-subregion tier
    :param verbose: [bool] (default: False)

    Testing e.g.
        confirmation_required = True
        verbose               = True
        collect_region_subregion_tier(confirmation_required, verbose)
    """

    # Find out the all regions and their subregions
    def compile_region_subregion_tier(sub_reg_tbls):
        """
        :param sub_reg_tbls: [pd.DataFrame] obtained from fetch_continents_subregion_tables()
        :return: ([dict], [list]) a dictionary of region-subregion, and a list of (sub)regions without subregions
        """
        having_subregions = copy.deepcopy(sub_reg_tbls)
        region_subregion_tiers = copy.deepcopy(sub_reg_tbls)

        non_subregions_list = []
        for k, v in sub_reg_tbls.items():
            if v is not None and isinstance(v, pd.DataFrame):
                region_subregion_tiers = update_nested_dict(
                    sub_reg_tbls, {k: set(v.Subregion)})
            else:
                non_subregions_list.append(k)

        for x in non_subregions_list:
            having_subregions.pop(x)

        having_subregions_temp = copy.deepcopy(having_subregions)

        while having_subregions_temp:

            for region_name, subregion_table in having_subregions.items():
                #
                subregion_names, subregion_links = subregion_table.Subregion, subregion_table.SubregionURL
                sub_subregion_tables = dict(
                    zip(subregion_names, [
                        get_subregion_table(link) for link in subregion_links
                    ]))

                subregion_index, without_subregion_ = compile_region_subregion_tier(
                    sub_subregion_tables)
                non_subregions_list += without_subregion_

                region_subregion_tiers.update({region_name: subregion_index})

                having_subregions_temp.pop(region_name)

        # 'Russian Federation' appears on both the Asia and Europe pages, so non_subregions_list may contain duplicates
        non_subregions_list = list(
            more_itertools.unique_everseen(non_subregions_list))
        return region_subregion_tiers, non_subregions_list

    if confirmed(
            "To compile a region-subregion tier? (Note that it may take a few minutes.) ",
            confirmation_required=confirmation_required):
        try:
            subregion_tables = fetch_continents_subregion_tables(update=True)
            region_subregion_tier, non_subregions = compile_region_subregion_tier(
                subregion_tables)
            save_pickle(region_subregion_tier,
                        cd_dat("GeoFabrik-region-subregion-tier.pickle"),
                        verbose=verbose)
            save_json(region_subregion_tier,
                      cd_dat("GeoFabrik-region-subregion-tier.json"),
                      verbose=verbose)
            save_pickle(non_subregions,
                        cd_dat("GeoFabrik-non-subregion-list.pickle"),
                        verbose=verbose)
        except Exception as e:
            print("Failed to get the required information ... {}.".format(e))
    else:
        print("The information collection process was not activated.")
Example #8
def collect_subregion_info_catalogue(confirmation_required=True,
                                     verbose=False):
    """
    :param confirmation_required: [bool] (default: True) whether to confirm before starting to collect information
    :param verbose: [bool] (default: False)

    Testing e.g.
        confirmation_required = True
        verbose               = True
        collect_subregion_info_catalogue(confirmation_required, verbose)
    """
    if confirmed(
            "To collect all available subregion links? (Note that it may take a few minutes.) ",
            confirmation_required=confirmation_required):

        home_url = 'http://download.geofabrik.de/'

        try:
            source = requests.get(home_url)
            soup = bs4.BeautifulSoup(source.text, 'lxml')
            source.close()
            avail_subregions = [
                td.a.text for td in soup.find_all('td', {'class': 'subregion'})
            ]
            avail_subregion_urls = [
                urllib.parse.urljoin(home_url, td.a['href'])
                for td in soup.find_all('td', {'class': 'subregion'})
            ]
            avail_subregion_url_tables = [
                get_subregion_table(sub_url, verbose)
                for sub_url in avail_subregion_urls
            ]
            avail_subregion_url_tables = [
                tbl for tbl in avail_subregion_url_tables if tbl is not None
            ]

            subregion_url_tables = list(avail_subregion_url_tables)

            while subregion_url_tables:

                subregion_url_tables_ = []

                for subregion_url_table in subregion_url_tables:
                    subregions = list(subregion_url_table.Subregion)
                    subregion_urls = list(subregion_url_table.SubregionURL)
                    subregion_url_tables_0 = [
                        get_subregion_table(subregion_url, verbose)
                        for subregion_url in subregion_urls
                    ]
                    subregion_url_tables_ += [
                        tbl for tbl in subregion_url_tables_0
                        if tbl is not None
                    ]

                    # (Note that 'Russian Federation' data is available in both 'Asia' and 'Europe')
                    avail_subregions += subregions
                    avail_subregion_urls += subregion_urls
                    avail_subregion_url_tables += subregion_url_tables_

                subregion_url_tables = list(subregion_url_tables_)

            # Save a list of available subregions locally
            save_pickle(avail_subregions,
                        cd_dat("GeoFabrik-subregion-name-list.pickle"),
                        verbose=verbose)

            # Subregion index - {Subregion: URL}
            subregion_url_index = dict(
                zip(avail_subregions, avail_subregion_urls))
            # Save subregion_index to local disk
            save_pickle(
                subregion_url_index,
                cd_dat("GeoFabrik-subregion-name-url-dictionary.pickle"),
                verbose=verbose)
            save_json(subregion_url_index,
                      cd_dat("GeoFabrik-subregion-name-url-dictionary.json"),
                      verbose=verbose)

            # All available URLs for downloading
            home_subregion_url_table = get_subregion_table(home_url)
            avail_subregion_url_tables.append(home_subregion_url_table)
            subregion_downloads_index = pd.DataFrame(
                pd.concat(avail_subregion_url_tables, ignore_index=True))
            subregion_downloads_index.drop_duplicates(inplace=True)
            subregion_downloads_index_json = subregion_downloads_index.set_index(
                'Subregion').to_json()

            # Save subregion_index_downloads to local disk
            save_pickle(
                subregion_downloads_index,
                cd_dat("GeoFabrik-subregion-downloads-catalogue.pickle"),
                verbose=verbose)
            save_json(subregion_downloads_index_json,
                      cd_dat("GeoFabrik-subregion-downloads-catalogue.json"),
                      verbose=verbose)

        except Exception as e:
            print("Failed to get the required information ... {}.".format(e))

    else:
        print("The information collection process was not activated.")
Example #9
def collect_bbbike_subregion_download_catalogue(subregion_name,
                                                confirmation_required=True,
                                                verbose=False):
    """
    :param subregion_name: [str]
    :param confirmation_required: [bool] (default: True)
    :param verbose: [bool] (default: False)

    Testing e.g.
        subregion_name        = 'leeds'
        confirmation_required = True
        verbose               = True
        collect_bbbike_subregion_download_catalogue(subregion_name, confirmation_required, verbose)
    """
    def parse_dlc(dlc):
        dlc_href = dlc.get('href')  # URL
        filename, download_url = dlc_href.strip('./'), urllib.parse.urljoin(
            url, dlc_href)
        if not dlc.has_attr('title'):
            file_format, file_size, last_update = 'Poly', None, None
        else:
            if len(dlc.contents) < 3:
                file_format, file_size = 'Txt', None
            else:
                file_format, file_size, _ = dlc.contents  # File type and size
                file_format, file_size = file_format.strip(), file_size.text
            last_update = pd.to_datetime(dlc.get('title'))  # Date and time
        parsed_dat = [
            filename, download_url, file_format, file_size, last_update
        ]
        return parsed_dat

    subregion_name_ = regulate_bbbike_input_subregion_name(subregion_name)
    #
    if confirmed("To collect BBBike download catalogue for \"{}\"? ".format(
            subregion_name_),
                 confirmation_required=confirmation_required):
        try:
            url = 'https://download.bbbike.org/osm/bbbike/{}/'.format(
                subregion_name_)

            source = urllib.request.urlopen(url)
            source_soup = bs4.BeautifulSoup(source, 'lxml')
            download_links_class = source_soup.find_all(
                name='a', attrs={'class': ['download_link', 'small']})

            subregion_downloads_catalogue = pd.DataFrame(
                parse_dlc(x) for x in download_links_class)
            subregion_downloads_catalogue.columns = [
                'Filename', 'URL', 'DataType', 'Size', 'LastUpdate'
            ]

            path_to_file = cd_dat_bbbike(
                subregion_name_,
                subregion_name_ + "-download-catalogue.pickle")
            save_pickle(subregion_downloads_catalogue,
                        path_to_file,
                        verbose=verbose)

        except Exception as e:
            print("Failed to collect download catalogue for \"{}\". {}".format(
                subregion_name_, e))
    else:
        print(
            "The information collection process was not activated. The existing local copy will be loaded instead."
        )
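
What parse_dlc() extracts can be seen on a hypothetical fragment of a BBBike subregion page (the markup below is illustrative, not an actual page excerpt):

import bs4
import pandas as pd

html = ('<a class="download_link" href="./Leeds.osm.pbf" '
        'title="01 Jan 2019 00:00:00 GMT">osm.pbf <span>(20M)</span>\n</a>')
dlc = bs4.BeautifulSoup(html, 'lxml').find('a')
file_format, file_size, _ = dlc.contents
print(file_format.strip(), file_size.text, pd.to_datetime(dlc.get('title')))
# osm.pbf (20M) 2019-01-01 00:00:00+00:00
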
Example #10
def collect_subregion_info_catalogue(confirmation_required=True,
                                     verbose=False):
    """
    :param confirmation_required: [bool] (default: True) whether to confirm before starting to collect information
    :param verbose: [bool] (default: False)

    Example:
        confirmation_required = True
        verbose               = True
        collect_subregion_info_catalogue(confirmation_required, verbose)
    """
    if confirmed(
            "To collect all available subregion links? (Note that it may take a few minutes.) ",
            confirmation_required=confirmation_required):

        home_url = 'http://download.geofabrik.de/'

        try:
            source = requests.get(home_url)
            soup = bs4.BeautifulSoup(source.text, 'lxml')
            source.close()
            # avail_subregions = [td.a.text for td in soup.find_all('td', {'class': 'subregion'})]
            subregion_href = soup.find_all('td', {'class': 'subregion'})
            avail_subregion_urls = (urllib.parse.urljoin(
                home_url, td.a['href']) for td in subregion_href)
            avail_subregion_url_tables_0 = (get_subregion_table(
                sub_url, verbose) for sub_url in avail_subregion_urls)
            avail_subregion_url_tables = [
                tbl for tbl in avail_subregion_url_tables_0 if tbl is not None
            ]

            subregion_url_tables = list(avail_subregion_url_tables)

            while subregion_url_tables:

                subregion_url_tables_ = []

                for subregion_url_table in subregion_url_tables:
                    # subregions = list(subregion_url_table.Subregion)
                    subregion_urls = list(subregion_url_table.SubregionURL)
                    subregion_url_tables_0 = [
                        get_subregion_table(sr_url, verbose)
                        for sr_url in subregion_urls
                    ]
                    subregion_url_tables_ += [
                        tbl for tbl in subregion_url_tables_0
                        if tbl is not None
                    ]

                    # (Note that 'Russian Federation' data is available in both 'Asia' and 'Europe')
                    # avail_subregions += subregions
                    # avail_subregion_urls += subregion_urls
                    avail_subregion_url_tables += subregion_url_tables_

                subregion_url_tables = list(subregion_url_tables_)

            # All available URLs for downloading
            home_subregion_url_table = get_subregion_table(home_url)
            avail_subregion_url_tables.append(home_subregion_url_table)
            subregion_downloads_index = pd.DataFrame(
                pd.concat(avail_subregion_url_tables, ignore_index=True))
            subregion_downloads_index.drop_duplicates(inplace=True)

            duplicated = subregion_downloads_index[
                subregion_downloads_index.Subregion.duplicated(keep=False)]
            if not duplicated.empty:
                import humanfriendly
                for i in range(0, len(duplicated), 2):
                    temp = duplicated.iloc[i:i + 2]
                    size = temp['.osm.pbf_Size'].map(
                        lambda x: humanfriendly.parse_size(
                            x.strip('(').strip(')').replace('\xa0', ' ')))
                    idx = size[size == size.min()].index
                    subregion_downloads_index.drop(idx, inplace=True)
                subregion_downloads_index.index = range(
                    len(subregion_downloads_index))

            subregion_downloads_index_json = subregion_downloads_index.set_index(
                'Subregion').to_json()

            # Save subregion_index_downloads to local disk
            save_pickle(
                subregion_downloads_index,
                cd_dat("GeoFabrik-subregion-downloads-catalogue.pickle"),
                verbose=verbose)
            save_json(subregion_downloads_index_json,
                      cd_dat("GeoFabrik-subregion-downloads-catalogue.json"),
                      verbose=verbose)

            avail_subregions = list(subregion_downloads_index.Subregion)
            avail_subregion_urls = list(subregion_downloads_index.SubregionURL)

            # Subregion index - {Subregion: URL}
            subregion_url_index = dict(
                zip(avail_subregions, avail_subregion_urls))

            # Save a list of available subregions locally
            save_pickle(avail_subregions,
                        cd_dat("GeoFabrik-subregion-name-list.pickle"),
                        verbose=verbose)
            # Save subregion_index to local disk
            save_pickle(
                subregion_url_index,
                cd_dat("GeoFabrik-subregion-name-url-dictionary.pickle"),
                verbose=verbose)
            save_json(subregion_url_index,
                      cd_dat("GeoFabrik-subregion-name-url-dictionary.json"),
                      verbose=verbose)

        except Exception as e:
            print("Failed to get the required information ... {}.".format(e))

    else:
        print("The information collection process was not activated.")