Example #1
def add_outage_longitudes_latitudes(
        df_outages,
        postcodes_fp: str = '../data/postcodes.csv',
        null_threshold=1):
    pcl = PostCodeLocator(postcodes_fp)

    s_incident_longitudes = pd.Series(index=df_outages.index, dtype='float64')
    s_incident_latitudes = pd.Series(index=df_outages.index, dtype='float64')

    for outage_idx, s_outage in track(df_outages.iterrows(),
                                      total=df_outages.shape[0]):
        incident_longitudes = []
        incident_latitudes = []

        for postcode_impacted in s_outage['postcodes_impacted']:
            longitude, latitude = pcl.get_postcode_location(postcode_impacted)

            incident_longitudes += [longitude]
            incident_latitudes += [latitude]

        s_incident_longitudes.loc[outage_idx] = pd.Series(
            incident_longitudes).mean()
        s_incident_latitudes.loc[outage_idx] = pd.Series(
            incident_latitudes).mean()

    df_outages = (df_outages.assign(longitude=s_incident_longitudes).assign(
        latitude=s_incident_latitudes))

    idxs_to_keep = (df_outages.isnull().sum(axis=1) <= null_threshold).replace(
        False, np.nan).dropna().index
    df_outages = df_outages.loc[idxs_to_keep]

    return df_outages
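
Every snippet on this page follows the same pattern: wrap the loop's iterable in track to get a progress bar, optionally passing total= when the length cannot be inferred and label= to name the bar. A minimal self-contained sketch of that pattern, assuming track is the ipypb helper imported in Examples #17 and #27 (the loop body is a stand-in):

from ipypb import track
import time

items = list(range(50))
for item in track(items, total=len(items), label='demo'):
    time.sleep(0.01)  # stand-in for per-item work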
Example #2
File: sketch.py Project: xgfs/FREDE
def sketch_with_PPR(sketchers, PPR):
    n = PPR.shape[0]
    for i in track(range(n)):
        for s in sketchers:
            s.append(PPR[i])

    return PPR
Example #3
File: predict.py Project: tzoiker/psqi
def compute_predictions(*, model, likelihood, test_X):
    t = -time()
    with torch.no_grad():
        batch = test_X[0:1]
        pred = likelihood(model(batch))
        del pred
    t += time()

    print('Predictions will take %.1f minutes' % (t * len(test_X) / 60))

    t = -time()
    u = []
    s = []
    with torch.no_grad():
        n = len(test_X)
        for i in track(range(n), total=n):
            batch = test_X[i:i + 1]
            pred = likelihood(model(batch))
            u.append(pred.mean.detach().cpu().numpy())
            s.append(pred.covariance_matrix.detach().cpu().numpy())
            del pred
    u = np.concatenate(u, axis=0)
    s = np.stack(s, axis=0)
    t += time()

    return u, s
Example #4
    def plot_worst(
            self,
            model: tf.keras.Model,
            cadastre_indeces: Collection[int] = range(0, 10),
            metric: Tuple[Callable, str] = (min, "iou"),
            number: int = 5,
    ):
        """Plot the worst predictions according to a given metric."""
        cadastre_metrics = {}
        optimizer, metric = metric

        for cadastre_index in track(cadastre_indeces):
            try:
                x, y = self[cadastre_index:cadastre_index + 1]
            except ValueError:
                continue

            if x is None or x.shape[0] > 1:
                continue

            evaluation = model.evaluate(x=x, y=y, verbose=0)
            metrics = {
                name: value
                for name, value in zip(model.metrics_names, evaluation)
            }
            cadastre_metrics[cadastre_index] = metrics

        number = min(number, len(cadastre_metrics))
        for _ in range(number):
            worst_cadastre = optimizer(
                cadastre_metrics.keys(),
                key=lambda key: cadastre_metrics[key][metric],
            )
            self.plot_prediction(model=model, cadastre_index=worst_cadastre)
            del cadastre_metrics[worst_cadastre]
Example #5
    def first_time_setup(self):
        """Initialize the cadastre cache for the first time."""
        self.directory.mkdir(parents=True, exist_ok=True)

        # Save pertinent cadastre cache metadata
        metadata = {
            "cadastre_path": str(self.cadastre_path.resolve().absolute()),
            "layer_name": self.layer_name,
        }
        self.metadata_path.write_text(json.dumps(metadata))
        metadata = json.loads(self.metadata_path.read_text())

        geometries = []
        indeces = []
        with fiona.open(
            metadata["cadastre_path"],
            layer=metadata["layer_name"],
            mode="r",
        ) as src:
            # srid = int(src.crs["init"].split(":")[0])
            crs = src.crs["init"]
            for index, obj in track(enumerate(src), total=len(src)):
                geometries.append(vector.fiona_polygon(obj))
                indeces.append(index + 1)

        self._dataframe = geopandas.GeoDataFrame(
            crs=crs,
            geometry=geopandas.GeoSeries(geometries),
        )
        self._dataframe.set_index(pd.Index(indeces), inplace=True)
        self._dataframe.to_pickle(
            self.directory / "cadastre.pkl",
            protocol=-1,
        )
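
Examples #5 and #7 rely on a vector.fiona_polygon helper that is not shown here. A plausible minimal stand-in, assuming the helper simply converts a fiona feature into a shapely geometry (an assumption about the helper, not the project's actual code):

from shapely.geometry import shape

def fiona_polygon(feature):
    # Hypothetical stand-in: build a shapely polygon from the feature's GeoJSON-like geometry
    return shape(feature["geometry"])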
Example #6
File: sketch.py Project: xgfs/FREDE
def sketch(sketchers, G, rw=True, nthreads=1):
    n = G.shape[0]
    x = np.zeros(n, dtype=np.float32)
    PPR = np.zeros((n, n), dtype=np.float32)
    inds = G.indptr
    inds = np.append(inds, n).astype(np.int32)
    degrees = G.sum(axis=0).A1.astype(np.int32)
    for i in track(range(n)):
        if rw:
            libc.ppr_row_rw(x.ctypes.data_as(POINTER(c_float)),
                            inds.ctypes.data_as(POINTER(c_int)),
                            G.indices.ctypes.data_as(POINTER(c_int)),
                            degrees.ctypes.data_as(POINTER(c_int)),
                            n, i, c_float(0.85), nthreads, 100000)
        else:
            libc.ppr_row_matmul(x.ctypes.data_as(POINTER(c_float)),
                                inds.ctypes.data_as(POINTER(c_int)),
                                G.indices.ctypes.data_as(POINTER(c_int)),
                                degrees.ctypes.data_as(POINTER(c_int)),
                                n, i, c_float(0.85), nthreads, c_float(1e-6), 100, 2048)

        row = np.zeros_like(x, dtype=np.float32)
        nnz = x.nonzero()
        row[nnz] = np.log(n * x[nnz])
        PPR[i] = row
        for s in sketchers:
            s.append(row)

    return PPR
Example #7
    def cache_mask(
        self,
        ogr_path: Path,
        layer_name: str,
        mask_name: str,
    ) -> None:
        """Cache given mask to disk."""
        mask_directory = self.directory / "mask" / mask_name
        mask_pickle = mask_directory / "mask.pkl"
        if mask_pickle.exists():
            print("Already cached!")
            return
        else:
            mask_directory.mkdir(parents=True, exist_ok=True)

        with fiona.open(ogr_path, "r", layer=layer_name) as src:
            mask = MultiPolygon(
                [vector.fiona_polygon(feature) for feature in track(src)]
            )
            print("Buffering multipolygon pickle... ", end="")
            mask = mask.buffer(0.0)
            print("Done!")

        print("Writing pickle file... ", end="")
        mask_pickle.write_bytes(
            pickle.dumps(mask, protocol=pickle.HIGHEST_PROTOCOL),
        )
        print("Done!")
Example #8
    def download_datasets(self,
                          datasets,
                          product_id='EO:EUM:DAT:MSG:MSG15-RSS',
                          download_all=True):
        """
        Downloads a set of datasets from the EUMETSAT API
        within the defined date range for the specified product

        Parameters:
            datasets: list of datasets returned by `identify_available_datasets`

        """

        # Identifying dataset ids to download
        dataset_ids = sorted([dataset['id'] for dataset in datasets])

        # Check which datasets to download
        download_ids, local_ids = self.check_if_downloaded(dataset_ids)

        # Downloading specified datasets
        if not dataset_ids:
            self.logger.info(
                'No files will be downloaded. Set DownloadManager bucket_name argument for local download'
            )
            return

        all_metadata = []

        for dataset_id in track(dataset_ids):
            dataset_link = dataset_id_to_link(dataset_id)

            # Download the raw data
            if (dataset_id in download_ids) or (download_all == True):
                try:
                    self.download_single_dataset(dataset_link)
                except:
                    self.logger.info(
                        'The EUMETSAT access token has been refreshed')
                    self.request_access_token()
                    self.download_single_dataset(dataset_link)

            # Extract and save metadata
            dataset_metadata = extract_metadata(self.data_dir,
                                                product_id=product_id)
            dataset_metadata.update({'downloaded': pd.Timestamp.now()})
            all_metadata += [dataset_metadata]
            self.metadata_table.insert(dataset_metadata)

            # Delete old metadata files
            for xml_file in ['EOPMetadata.xml', 'manifest.xml']:
                xml_filepath = f'{self.data_dir}/{xml_file}'

                if os.path.isfile(xml_filepath):
                    os.remove(xml_filepath)

        df_new_metadata = pd.DataFrame(all_metadata)

        return df_new_metadata
Example #9
def get_ssen_incidents_info(incidents_url='https://www.ssen.co.uk/Sse_Components/Views/Controls/FormControls/PowerTrackHandler.ashx'):
    raw_incidents_info = requests.get(incidents_url, verify=False).json()
    cleaned_incidents_info = dict()

    for incident in track(raw_incidents_info['Faults']):
        incident_ref = incident['Reference']
        cleaned_incidents_info[incident_ref] = clean_ssen_incident_info(incident)

    return raw_incidents_info, cleaned_incidents_info
Example #10
def convert_nbs_to_md(nbs_dir, docs_nb_img_dir, docs_dir):
    nb_files = [f for f in os.listdir(nbs_dir) if f[-6:]=='.ipynb']

    for nb_file in track(nb_files):
        nb_fp = f'{nbs_dir}/{nb_file}'
        junix.export_images(nb_fp, docs_nb_img_dir)
        convert_md(nb_fp, docs_dir, img_path=f'{docs_nb_img_dir}/', jekyll=False)

        md_fp = docs_dir + '/' + nb_file.replace('.ipynb', '') + '.md'
        encode_file_as_utf8(md_fp)
Example #11
def get_full_search_df(num_pages):
    df_search = pd.DataFrame()

    for page_num in track(range(num_pages), label='Accounts'):
        r = get_accounts_raw_search(page_num=page_num)
        df_search_page = extract_search_df(r)
        df_search = df_search.append(df_search_page)

    df_search = (df_search.reset_index(drop=True).drop_duplicates())

    return df_search
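
Several of these snippets (Examples #11, #22, #25 and #26) grow a DataFrame with df.append inside the tracked loop. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on current pandas the same pattern is usually written by collecting the pieces and concatenating once; a small self-contained illustration:

import pandas as pd

pieces = []
for page_num in range(3):  # stand-in for the paged requests above
    pieces.append(pd.DataFrame({'page': [page_num], 'value': [page_num * 10]}))

df = pd.concat(pieces, ignore_index=True).drop_duplicates()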
Example #12
    def query_API(self,
                  from_date=None,
                  to_date=None,
                  data_stream='emissions',
                  level='national'):
        """ 
        Queries the National Grid ESO Carbon Intensity API, enabling data to be easily
        retrieved for emissions and fuel generation at both a regional and national level
        
        N.b. the regional emissions data only includes forecast levels as the API
        does not have an observed regional emissions data stream at time of writing

        Parameters
        ----------
        from_date : str
            Start date for the queried data
        to_date : str
            End date for the queried data
        data_stream : str
            One of 'emissions' or 'generation'
        level : str
            spatial aggregation level, one of 
            'national' or 'regional' (DNOs)

        Returns
        -------


        """

        # Input checks
        assert data_stream in self.available_data_streams, f"Data stream must be one of: {', '.join(self.available_data_streams)}"
        assert level in self.available_levels, f"Level must be one of: {', '.join(self.available_levels)}"

        query_func = self.query_funcs[level][data_stream]

        if from_date is None or to_date is None:
            df = query_func(from_date, to_date)

        elif (pd.to_datetime(to_date) -
              pd.to_datetime(from_date)) > pd.Timedelta(weeks=2):
            df = pd.DataFrame()
            dt_rng_pairs = construct_date_range_pairs(from_date, to_date)

            for pair_from_date, pair_to_date in track(dt_rng_pairs):
                df_pair = query_func(pair_from_date, pair_to_date)
                df = df.append(df_pair)

        else:
            df = query_func(from_date, to_date)

        return df
Example #13
def extract_ukpn_multiple_incident_ids(multiple_incident_urls):
    incident_ids = []

    for multiple_incident_url in track(multiple_incident_urls, label='Multiple Ids'):
        r = requests.get(multiple_incident_url)
        soup = bs(r.text, features='lxml')

        incident_ids += [
            link['data-url'].split('incidentId=')[1]
            for link
            in soup.find('div', {'class': 'multiple-incidents--wrapper mb-4'}).findAll('a')
        ]

    return incident_ids
Example #14
def show_id_2_ratings_df(show_id):
    df_ratings = pd.DataFrame(index=range(100))
    most_recent_season = show_id_2_most_recent_season(show_id)

    for season in track(range(1, most_recent_season + 1)):
        ratings = season_2_episode_ratings(show_id, season)
        df_ratings[season] = pd.Series(ratings, dtype='float64')

    df_ratings = (df_ratings.dropna(how='all', axis=0).dropna(how='all',
                                                              axis=1))

    df_ratings.index += 1

    return df_ratings
Example #15
def collate_cleaned_incidents_info(
        dnos=['ukpn', 'ssen', 'wpd', 'sp', 'np', 'enw']):
    cleaned_incidents_info = dict()

    for dno in track(dnos, label='DNOs'):
        try:
            raw_dno_incidents_info, cleaned_dno_incidents_info = getattr(
                retrieval, f'get_{dno}_incidents_info')()
            cleaned_incidents_info[dno] = cleaned_dno_incidents_info
            retrieval.save_json_data(raw_dno_incidents_info,
                                     f'{dno}_incidents_info')
        except:
            warn(f'Failed to process {dno}')

    return cleaned_incidents_info
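
Example #15 looks up each per-DNO retrieval function by name with getattr. A minimal self-contained illustration of that dispatch pattern (the namespace and function below are made up for illustration):

import types

retrieval = types.SimpleNamespace(
    get_demo_incidents_info=lambda: ({'raw': True}, {'cleaned': True})  # stand-in for get_<dno>_incidents_info
)

dno = 'demo'
raw_info, cleaned_info = getattr(retrieval, f'get_{dno}_incidents_info')()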
Example #16
def construct_ets_unit_dfs(account_ids, owners_col_rename_map, units_col_rename_map, label=None):
    df_owners = pd.DataFrame(index=account_ids, columns=owners_col_rename_map.keys())
    df_units = pd.DataFrame(index=account_ids, columns=units_col_rename_map.keys())
    ts_dfs = {}

    for account_id in track(account_ids, label=label):
        page_info, df_ts = extract_page_info(account_id)

        df_owners.loc[account_id] = pd.Series(collate_owner_info(page_info))
        df_units.loc[account_id] = pd.Series(collate_unit_info(page_info))
        ts_dfs[account_id] = df_ts

    df_owners = df_owners.rename(columns=owners_col_rename_map)
    df_units = df_units.rename(columns=units_col_rename_map)

    return df_owners, df_units, ts_dfs
Example #17
File: estimate.py Project: naplab/DSTRF
def nonlinearities(paths,
                   reduction='none',
                   batch_size=8,
                   device='cpu',
                   verbose=0):
    nonlinearity = {
        'complexity': [],
        'gain change': [],
        'temporal hold': [],
        'shape change': []
    }

    if isinstance(paths, str) and os.path.isdir(paths):
        paths = sorted(glob.glob(os.path.join(paths, 'dSTRF-*.pt')))
        if verbose >= 1:
            print(f'Found {len(paths)} dSTRF files in specified directory.',
                  flush=True)

    if verbose >= 1:
        paths = ipypb.track(paths)

    for path in paths:
        dstrf = torch.load(path).to(device)

        nonlinearity['complexity'].append(
            complexity(dstrf, batch_size=batch_size))
        nonlinearity['gain change'].append(
            gain_change(dstrf, batch_size=batch_size))
        nonlinearity['temporal hold'].append(
            temporal_hold(dstrf, batch_size=batch_size))
        nonlinearity['shape change'].append(
            shape_change(dstrf, batch_size=batch_size, verbose=verbose))

        del dstrf

    for k in nonlinearity:
        nonlinearity[k] = torch.stack(nonlinearity[k], dim=1)
        if reduction == 'mean':
            nonlinearity[k] = nonlinearity[k].mean(dim=1)
        elif reduction == 'median':
            # Tensor.median(dim=...) returns (values, indices); keep only the values
            nonlinearity[k] = nonlinearity[k].median(dim=1).values
        elif reduction != 'none':
            raise NotImplementedError()

    return nonlinearity
Example #18
def calc_psqi(*, u, s, norms, R2, df_Y):
    # Keep only the elements whose norm upper bound is infinite
    inds = [i for i, (lower, upper) in enumerate(norms) if np.isinf(upper)]
    cols = [col for i, col in enumerate(df_Y.columns) if i in inds]

    print('PSQI: consider only %s' % cols)

    # Average R^2 across splits
    R2 = R2.mean(axis=0).round(3)

    u = u[:, inds]
    s = s[:, :, inds][:, inds, :]
    R2 = R2[inds]
    df_Y = df_Y.iloc[:, inds]
    norms = norms[inds]

    assert len(u.shape) == 2
    assert len(s.shape) == 3
    ret_ps = []
    ret_cs = []

    train_mean = df_Y.mean(axis=0)
    train_var = df_Y.var(axis=0)

    ws = softmax(np.clip(R2, 0, None))

    t = -time()
    for i, (mean, cov) in track(enumerate(zip(u, s)), total=len(u)):
        if i % 1000 == 0:
            print('PSQI: processed %d / %d' % (i, len(u)))
        ps, cs = _calc_psqi(
            mean,
            cov,
            norms,
            R2=R2,
            train_mean=train_mean,
            train_var=train_var,
        )
        ret_ps.append(ps)
        ret_cs.append(cs)
    t += time()
    print('PSQI: computations took %.1f seconds' % t)
    return np.array(ret_ps), np.array(ret_cs), ws
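
The line ws = softmax(np.clip(R2, 0, None)) above turns the clipped per-target R^2 scores into weights that sum to one; a minimal illustration, assuming softmax here is scipy.special.softmax (the import is not shown in the snippet, so this is an assumption):

import numpy as np
from scipy.special import softmax

R2 = np.array([0.8, 0.2, -0.1])
ws = softmax(np.clip(R2, 0, None))  # negative scores are clipped to zero before weighting
print(ws, ws.sum())                 # the weights sum to 1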
Example #19
def evaluate_models(X, y, models, post_pred_proc_func=None, index=None):
    model_scores = dict()

    for model_name, model in track(models.items()):
        df_pred = generate_kfold_preds(X, y, model, index=index)

        if post_pred_proc_func is not None:
            df_pred['pred'] = post_pred_proc_func(df_pred['pred'])

        model_scores[model_name] = {
            'mae': mean_absolute_error(df_pred['true'], df_pred['pred']),
            'rmse': np.sqrt(mean_squared_error(df_pred['true'], df_pred['pred']))
        }

    df_model_scores = pd.DataFrame(model_scores)

    df_model_scores.index.name = 'metric'
    df_model_scores.columns.name = 'model'

    return df_model_scores
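
Example #19 (and Example #22) depends on a generate_kfold_preds helper that is not shown. A minimal sketch of what such a helper could look like, assuming out-of-fold predictions built with scikit-learn's KFold; the signature and defaults are illustrative, not the source project's:

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def generate_kfold_preds(X, y, model, n_splits=5, index=None):
    # Hypothetical stand-in: fit on each training fold and predict the held-out fold,
    # so every sample receives exactly one out-of-fold prediction
    X, y = np.asarray(X), np.asarray(y)
    preds = np.full(len(y), np.nan, dtype=float)
    for train_idx, test_idx in KFold(n_splits=n_splits, shuffle=True, random_state=42).split(X):
        model.fit(X[train_idx], y[train_idx])
        preds[test_idx] = model.predict(X[test_idx])
    index = range(len(y)) if index is None else index
    return pd.DataFrame({'true': y, 'pred': preds}, index=index)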
Example #20
def feature_selection(x_train, y_train, groups=None, model=RandomForestRegressor(), min_num_features=1, max_num_features=None, **sfs_kwargs):
    if max_num_features is None:
        max_num_features = 1 + x_train.shape[1]

    result_features = dict()
    result_scores = dict()

    for num_features in track(range(min_num_features, max_num_features)):
        sfs = SFS(
            model,
            k_features=num_features,
            **sfs_kwargs
        )

        sfs.fit(x_train, y_train, groups=groups)

        result_features[num_features] = sfs.k_feature_names_
        result_scores[num_features] = sfs.k_score_

    return result_features, result_scores
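
A hypothetical call of the function above, assuming SFS is mlxtend's SequentialFeatureSelector and using a small synthetic regression problem (the data, estimator size and extra keyword arguments are illustrative):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
x_train = rng.normal(size=(100, 6))
y_train = x_train[:, 0] + 0.1 * rng.normal(size=100)

result_features, result_scores = feature_selection(
    x_train, y_train,
    model=RandomForestRegressor(n_estimators=10, random_state=0),
    min_num_features=1,
    max_num_features=4,
    cv=3,                                # forwarded to SFS via **sfs_kwargs
    scoring='neg_mean_absolute_error',
)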
Example #21
def get_ukpn_incidents_info(incidents_url='https://www.ukpowernetworks.co.uk/Incidents/GetIncidents'):
    r_json = requests.get(incidents_url).json()

    incident_ids = (
        extract_ukpn_single_incident_ids(r_json) +
        extract_ukpn_multiple_incident_ids(extract_ukpn_multiple_incident_urls(r_json))
    )

    raw_incidents_info = dict()
    cleaned_incidents_info = dict()

    for incident_id in track(incident_ids, label='Details'):
        try:
            incident_detail_url = get_ukpn_incident_detail_url(incident_id)
            r_json = requests.get(incident_detail_url).json()

            raw_incidents_info[incident_id] = r_json
            cleaned_incidents_info[incident_id] = extract_ukpn_relevant_info(r_json)
        except:
            warn(f'Failed to retrieve incident details for: {incident_id}')

    return raw_incidents_info, cleaned_incidents_info
Example #22
def evaluate_discharge_models(df, models, features_kwargs={}):
    df_features = construct_df_discharge_features(df, **features_kwargs)
    s_discharge = construct_discharge_s(df['demand_MW'],
                                        start_time='15:30',
                                        end_time='20:30')

    evening_datetimes = extract_evening_datetimes(df_features)

    X = df_features.loc[evening_datetimes].values
    y = s_discharge.loc[evening_datetimes].values

    model_scores = dict()
    peak_reduction_calc = construct_peak_reduction_calculator(
        s_demand=df['demand_MW'], evening_datetimes=evening_datetimes)

    for model_name, model in track(models.items()):
        df_pred = clean.generate_kfold_preds(X,
                                             y,
                                             model,
                                             index=evening_datetimes)
        df_pred['pred'] = post_pred_discharge_proc_func(df_pred['pred'])

        model_scores[model_name] = {
            'pct_optimal_reduction':
            peak_reduction_calc(df_pred['true'], df_pred['pred']),
            'optimal_discharge_mae':
            mean_absolute_error(df_pred['true'], df_pred['pred']),
            'optimal_discharge_rmse':
            np.sqrt(mean_squared_error(df_pred['true'], df_pred['pred']))
        }

    df_model_scores = pd.DataFrame(model_scores)

    df_model_scores.index.name = 'metric'
    df_model_scores.columns.name = 'model'

    return df_model_scores
Example #23
geom_f = "/data/SciBigData/radargrams/geom/"
pattern = geom_f + "/**/*_geom.lbl"
log.info(pattern)

allgeoms = glob.glob(pattern)

log.info(f"found {len(allgeoms)} radargrams geom-files")

# %%

# %%
# this is the actual loop ingesting the geom files and simplifying them
out = []
ids = []
for f in track(allgeoms):
    tab = lbl_to_geom(f)
    df = read_geom_lbl(tab)
    geometry = LineString([Point(xy) for xy in zip(df.longitude, df.latitude)])
    geometry = geometry.simplify(0.001)
    ids.append(match_sharad_id(f))
    out.append(geometry)

# %%
m

# %%
df = DataFrame()
df["track_id"] = ids

# %%
Example #24
def plot_raster_spread(cache):
    minima, mean, maxima = [], [], []
    for index in track(cache.cadastre_indeces):
        if index > 10_000:
            break
        tiles = cache.lidar_tiles(cadastre_index=index)
        masks = cache.mask_tiles(cadastre_index=index)
        for (tile, mask) in zip(tiles, masks):
            mask = mask.flatten()
            if mask.sum() <= 64:
                continue
            tile = tile.flatten()
            tile = np.ma.masked_where(tile < -3.4e38, tile)
            minima.append(tile.min())
            mean.append(tile.mean())
            maxima.append(tile.max())

    minima = np.array(minima)
    mean = np.array(mean)
    maxima = np.array(maxima)

    sorter = mean.argsort()
    minima = minima[sorter]
    mean = mean[sorter]
    maxima = maxima[sorter]

    configure_latex(width_scaler=2)
    fig, (sorted_ax, dist_ax) = plt.subplots(1, 2)
    x = np.arange(len(minima))
    sorted_ax.fill_between(
        x,
        mean,
        minima,
        label="Minima",
        color="g",
        facecolor="none",
        alpha=0.5,
    )
    sorted_ax.fill_between(
        x,
        mean,
        maxima,
        label="Maxima",
        color="r",
        facecolor="none",
        alpha=0.5,
    )
    sorted_ax.plot(x, mean, label="Mean", color="xkcd:black", linewidth=1.5)

    sorted_ax.legend(loc="upper left")
    sorted_ax.set_xlabel("Tiles sorted by mean elevation")
    sorted_ax.set_ylabel("Tile elevation [m]")

    range = maxima - minima
    dist_ax.hist(range,
                 bins=200,
                 density=True,
                 label=(r"\textbf{Mean:} "
                        f"{range.mean():.2f}m, "
                        r"\textbf{SD:} "
                        f"{range.std():.2f}m"))
    dist_ax.set_xlabel(r"Range ($\max - \min$) [m]")
    dist_ax.legend()

    dist_ax.set_ylabel(r"Frequency")
    dist_ax.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))

    sorted_ax.get_xaxis().set_ticks([])
    fig.savefig("/code/tex/img/elevation-spread.pdf")
    return fig, sorted_ax
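
The line tile = np.ma.masked_where(tile < -3.4e38, tile) above masks the float32 nodata sentinel (about -3.4e38) so it does not distort the minima, mean and maxima; a minimal self-contained illustration of that masking step:

import numpy as np

tile = np.array([1.5, -3.4028235e38, 2.0], dtype=np.float32)  # middle value is a nodata sentinel
masked = np.ma.masked_where(tile < -3.4e38, tile)
print(masked.min(), masked.mean(), masked.max())  # statistics ignore the masked entry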
Example #25
def get_all_installation_allocations_df(df_search):
    col_renaming_map = {
        'Installation ID': 'installation_id',
        'Installation Name': 'installation_name',
        'Address City': 'installation_city',
        'Account Holder Name': 'account_holder',
        'Account Status': 'account_status',
        'Permit ID': 'permit_id',
        'Status': 'status'
    }

    df_installation_allocations = pd.DataFrame()

    # Retrieving raw data
    for country in track(df_search['country'].unique()):
        df_installation_allocations_country = pd.DataFrame()
        country_installations_links = df_search.loc[
            df_search['country'] == country, 'installations_link']

        for installations_link in track(country_installations_links,
                                        label=country):
            url_root, params = get_url_root_and_params(installations_link)
            df_installation_allocations_country_phase = get_installation_allocations_df(
                url_root, params)

            if df_installation_allocations_country.size > 0:
                df_installation_allocations_country = pd.merge(
                    df_installation_allocations_country,
                    df_installation_allocations_country_phase,
                    how='outer',
                    on=list(col_renaming_map.keys()))
            else:
                df_installation_allocations_country = df_installation_allocations_country_phase

        df_installation_allocations_country['country'] = country
        df_installation_allocations = df_installation_allocations.append(
            df_installation_allocations_country)

    # Collating update datetimes
    update_cols = df_installation_allocations.columns[
        df_installation_allocations.columns.str.contains('Latest Update')]
    df_installation_allocations['latest_update'] = df_installation_allocations[
        update_cols].fillna('').max(axis=1)
    df_installation_allocations = df_installation_allocations.drop(
        columns=update_cols)

    # Renaming columns
    df_installation_allocations = (df_installation_allocations.reset_index(
        drop=True).rename(columns=col_renaming_map))

    # Sorting column order
    non_year_cols = ['country'] + list(
        col_renaming_map.values()) + ['latest_update']
    year_cols = sorted(
        list(set(df_installation_allocations.columns) - set(non_year_cols)))
    df_installation_allocations = df_installation_allocations[non_year_cols +
                                                              year_cols]

    # Dropping header rows
    idxs_to_drop = df_installation_allocations['permit_id'].str.contains(
        r'\*').replace(False, np.nan).dropna().index
    df_installation_allocations = df_installation_allocations.drop(
        idxs_to_drop)

    return df_installation_allocations
Example #26
    def query_orchestrator(self,
                           stream,
                           query_args,
                           service_type='xml',
                           track_label=None,
                           wait_time=0):
        check_date_rng_args = lambda query_args: all(
            x in query_args.keys() for x in ['start_date', 'end_date'])

        df = pd.DataFrame()

        if stream_info.streams[stream]['request_type'] == 'date_range':
            ## Dealing with date range requests - main concern is whether capping has been applied
            assert check_date_rng_args(
                query_args
            ), 'All date range queries should be provided with a "start_date" and "end_date".'

            self.capping_applied = True
            date_col = stream_info.streams[stream]['date_col']
            start_date, end_date = query_args['start_date'], query_args[
                'end_date']

            absolute_start_date = start_date

            while self.capping_applied == True:
                response = self.make_request(stream, query_args, service_type)
                df_new = self.parse_response(response, stream)

                df = df.append(df_new)

                assert self.capping_applied != None, 'Whether or not capping limits had been breached could not be found in the response metadata'
                if self.capping_applied == True:
                    start_date = pd.to_datetime(
                        df[date_col]).max().tz_localize(None)
                    warnings.warn(
                        f'Response was capped, request is rerunning for missing data from {start_date}'
                    )

                    if pd.to_datetime(start_date) >= pd.to_datetime(end_date):
                        warnings.warn(
                            f'End date ({end_date}) was earlier than start date ({start_date})\nThe start date will be set one day earlier.'
                        )
                        start_date = (
                            pd.to_datetime(end_date) -
                            pd.Timedelta(days=1)).strftime('%Y-%m-%d')

                    query_args['start_date'] = start_date

            response = self.make_request(stream, query_args, service_type)
            df_new = self.parse_response(response, stream)

            df = df.append(df_new)

        elif stream_info.streams[stream]['request_type'] == 'SP_by_SP':
            ## Dealing with SP by SP requests - main concern is handling daylight savings
            assert check_date_rng_args(
                query_args
            ), 'All date range queries should be provided with a "start_date" and "end_date".'

            start_date, end_date = query_args['start_date'], query_args[
                'end_date']

            absolute_start_date = start_date

            for key in ['start_date', 'end_date']:
                query_args.pop(key, None)

            df_dates_SPs = self.dt_rng_2_SPs(start_date, end_date)
            date_SP_tuples = list(df_dates_SPs.reset_index().itertuples(
                index=False, name=None))

            for datetime, query_date, SP in track(date_SP_tuples,
                                                  label=track_label,
                                                  total=len(date_SP_tuples)):
                query_args['query_date'] = query_date
                query_args['SP'] = SP

                # request and parse
                response = self.make_request(stream, query_args, service_type)
                df_SP = self.parse_response(response, stream)

                if df_SP.shape[0] != 0:

                    # processing
                    df_SP = self.expand_cols(df_SP)
                    df_SP['datetime'] = datetime

                    # saving
                    df = df.append(df_SP, sort=False)

                time.sleep(wait_time)

        df = df.reset_index(drop=True)

        df = df[~df.duplicated()]

        ## Assigning local_datetime for relevant dt_rng streams
        if all(col in stream_info.streams[stream].keys()
               for col in ['date_col', 'SP_col']):
            df = self.add_local_datetime(df, absolute_start_date, end_date,
                                         stream)

        return df
Example #27
def intersectGDFMP(gdf1,
                   keyfield1,
                   gdf2,
                   keyfield2,
                   areas_in=None,
                   verbosity=1,
                   area_epsg=6931,
                   apply_buffer=False,
                   threshold=None,
                   vertex_limit=10000,
                   check_inputs=True,
                   num_process=None,
                   num_block=None):
    '''Multiprocess overlap between geometries in two geodataframes
    
    Parameters
    ----------
    gdf1: GeoDataFrame (must match crs of gdf2, and have valid geometries if apply_buffer is False)
    keyfield1: column name in gdf1 which uniquely identifies each row and will be used to label the results
    gdf2: GeoDataFrame (must match crs of gdf1, and have valid geometries if apply_buffer is False)
    keyfield2: column name in gdf2 which uniquely identifies each row and will be used to label the results
    areas_in: list of lists of overlap areas between geometries in gdf1 and gdf2, with dimensions [gdf2.shape[0]][gdf1.shape[0]], intersections will be calculated only for geometry pairs with nonzero entries (default None)
    verbosity: int, detail level of reporting during execution: 0=none, 1+=announce computation sections (default 1)
    area_epsg: int, convert to this epsg for area calculation (default 6931)
    apply_buffer: bool, use .buffer(0) to coerce valid geometries in gdf1 and gdf2 (often works, sometimes corrupts geometry) (default False)
    threshold: float or None, a float value discards overlaps that are less than threshold fraction of the largest overlap for each geometry in gdf2 (default None)
    vertex_limit: int>3, limit geometry elements to at most this number of vertices when performing internal calculations (default 10000)
    check_inputs: bool, set to False to circumvent input checks (crs match, geometry validity, areas_in shape) for speedup (default True)
    num_process: int or None, number of processes to use for calculation (coerced up to 1 and down to the number of CPUs in the system, default is number of CPUs in the system minus two) (default None)
    num_block: int or None, number of rows of gdf2 to send to each process (coerced up to 1 and down to gdf2.shape[0], default is approximately gdf2.shape[0]/(3 times the number of CPUs in the system)) (default None)
    
    Returns
    -------
    GeoDataFrame: each row contains an overlap geometry between one row in gdf1 and one row in gdf2, with the following fields:
        keyfield1: entry from gdf1[keyfield1]
        keyfield1+' Index': index of row from gdf1
        keyfield2: entry from gdf2[keyfield2]
        keyfield2+' Index': index of row from gdf2
        'Overlap Geometry': intersection as shapely Polygon or MultiPolygon
        'Overlap Area': area of intersection geometry in units of area_epsg coordinate reference system
        'Overlap Ratio': fraction of area that this geometry represents out of all intersection geometries involving the source row in gdf2
    
    Notes
    -----
    Relies on multiprocess, pandas, geopandas, numpy, and time
    Returns None on input error after printing error description
    '''
    if verbosity >= 1: print('Executing intersectGDFMP')
    begin_time = time.time()

    # Input checks
    NCPU = multiprocess.cpu_count()
    N2 = gdf2.shape[0]

    if check_inputs:
        if num_process is None:
            num_process = max(
                1, NCPU - 2
            )  # empirical factor for circa 2020 multicore computers, prevents lockup by leaving processes available for the system and other applications
        else:
            try:
                num_process = int(num_process)
            except (ValueError, TypeError):
                print(
                    f'Input Error: num_process must be an integer (there are {NCPU} CPUs detected, input was: {num_process})'
                )
                return None
            if num_process < 0:
                tmp = num_process
                num_process = max(1, NCPU - num_process)
            else:
                num_process = min(NCPU, num_process)
        if verbosity >= 1:
            print(f'  num_process set to {num_process} (of {NCPU})')

        if num_block is None:
            num_block = N2
            if num_process > 1:
                num_block = max(
                    1, num_block // (3 * num_process)
                )  # empirical factor based on Canadian census CFSA and DA cartographic geometry overlap calculation
        else:
            try:
                num_block = int(num_block)
            except (ValueError, TypeError):
                print(
                    f'Input Error: num_block must be an integer (input was: {num_block})'
                )
                return None
            num_block = max(1, min(num_block, N2))
        if verbosity >= 1: print(f'  num_block set to {num_block} (of {N2})')

        # Ensure coordinate reference systems match
        if gdf1.crs != gdf2.crs:
            print(
                f"Input coordinate reference systems must be the same: gdf1.crs={gdf1.crs}, gdf2.crs={gdf2.crs}"
            )
            return None

        # Ensure geometries are valid
        n_invalid1 = gdf1.shape[0] - sum(gdf1.geometry.is_valid)
        n_invalid2 = gdf2.shape[0] - sum(gdf2.geometry.is_valid)
        if not apply_buffer and (n_invalid1 > 0 or n_invalid2 > 0):
            print(
                f"Input geometries must be valid: {n_invalid1} invalid geometries in gdf1, {n_invalid2} invalid geometries in gdf2"
            )
            return None

        # Ensure areas_in is the proper size
        if not areas_in is None:
            N2 = len(areas_in)
            if N2 != gdf2.shape[0]:
                print(
                    f"Input areas_in must have length {gdf2.shape[0]}, actual length is {N2}"
                )
                return None
            N1 = len(areas_in[0])
            irreg = not all([len(a) == N1 for a in areas_in])
            if len(areas_in) != gdf2.shape[0] or N1 != gdf1.shape[0] or irreg:
                print(
                    f"Input areas_in must have size [{gdf2.shape[0]}][{gdf1.shape[0]}], actual shape is [{N2}][{N1}]{' with irregular second dimension length' if irreg else ''}"
                )
                return None

        # Checks performed here, no need to do so in intersectGDF
        check_inputs = False

    NDIV = N2 // num_block + (0 if N2 % num_block == 0 else 1)
    ITERIND = lambda ind: slice(ind * num_block,
                                ((ind + 1) * num_block
                                 if (ind + 1) * num_block < N2 else N2))
    results = [None] * NDIV

    start_time = time.time()
    completed = 0
    with multiprocess.Pool(num_process) as pool:
        print(
            f'Generating pool, P={num_process}, N={N2}, DIV={num_block}, NDIV={NDIV}'
        )
        ret = [
            pool.apply_async(
                intersectGDF,
                (gdf1, keyfield1, gdf2.iloc[ITERIND(ind), :], keyfield2,
                 areas_in if areas_in is None else areas_in[ITERIND(ind)],
                 verbosity, area_epsg, apply_buffer, threshold, vertex_limit,
                 check_inputs)) for ind in range(NDIV)
        ]
        print('Processing pool')
        for i in ipypb.track(
                range(NDIV)
        ):  # Alternative: initialize pb and call next(pb) in loop, instead of having a while loop to process all updates since last loop
            while True:
                indb_finished = [r.ready() for r in ret]
                indb_empty = [r is None for r in results]
                indb_update = [
                    f and e for f, e in zip(indb_finished, indb_empty)
                ]
                if any(indb_update):
                    ind_update = indb_update.index(True)
                    results[ind_update] = ret[ind_update].get(
                        999)  # Set finite timeout so that it returns properly
                    completed += 1
                    if completed % 10 == 0 or completed == 1 or completed == (
                            NDIV):
                        print(
                            f'Finished {completed}/{NDIV}, wall time {time.strftime("%H:%M:%S", time.gmtime(time.time()-start_time))}'
                        )
                    break
                time.sleep(
                    1
                )  # Make the infinite loop run slower; might be improved with explicit async

    wall_time = time.time() - start_time
    processor_time = sum([r[1] for r in results])
    gdf_joined = pandas.concat([r[0] for r in results
                                ]).sort_values(keyfield2, ignore_index=True)
    print(
        f'Pool processing concluded, process count {sum([r!=None for r in results])}/{NDIV}, wall time {time.strftime("%H:%M:%S", time.gmtime(wall_time))}, processor time {time.strftime("%H:%M:%S", time.gmtime(processor_time))}, processor:wall ratio {processor_time/wall_time:.3}x'
    )
    print(
        f'Total time: {time.strftime("%H:%M:%S", time.gmtime(time.time()-begin_time))}'
    )
    return gdf_joined
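
A hypothetical call of intersectGDFMP with two tiny GeoDataFrames, assuming the module defining it (and its intersectGDF worker) is importable; the key fields, geometries and argument values are illustrative only:

import geopandas
from shapely.geometry import box

gdf1 = geopandas.GeoDataFrame(
    {'Region': ['a', 'b']},
    geometry=[box(0, 0, 2, 2), box(2, 0, 4, 2)],
    crs='EPSG:4326',
)
gdf2 = geopandas.GeoDataFrame(
    {'Zone': ['x']},
    geometry=[box(1, 1, 3, 3)],
    crs='EPSG:4326',
)

gdf_overlaps = intersectGDFMP(gdf1, 'Region', gdf2, 'Zone',
                              verbosity=1, num_process=2, num_block=1)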
Example #28
File: computeppr.py Project: xgfs/FREDE
def mkl_set_num_threads(cores):
    mkl_rt.mkl_set_num_threads(ctypes.byref(ctypes.c_int(cores)))


mkl_set_num_threads(20)
print('CPUs used now: ', mkl_get_max_threads())

NTHREADS = 20  # !

libppr = cdll.LoadLibrary('cpp/pprlib.so')
libppr.init_rand()

datapath = '/data/frededata/'

for mat in track([
        'academic_confs.mat', 'academic_coa_2014.mat', 'flickr.mat',
        'vk2016.mat'
]):
    tp.send_text(mat[:-4] + ' started ppr evaluation')
    matf = loadmat(datapath + mat)
    G = matf['network']
    n = G.shape[0]
    print(mat, 'n = ', n)
    inds = G.indptr
    inds = np.append(inds, n).astype(np.int32)
    degrees = G.sum(axis=0).A1.astype(np.int32)
    ppr = np.zeros(n * n, dtype=np.float32)
    libppr.ppr_mat_matmul(ppr.ctypes.data_as(POINTER(c_float)),
                          inds.ctypes.data_as(POINTER(c_int)),
                          G.indices.ctypes.data_as(POINTER(c_int)),
                          degrees.ctypes.data_as(POINTER(c_int)), n,
                          c_float(0.85), 1, c_double(1e-6), 100,