def relative_change(self, original: pd.core.frame.DataFrame,
                    transformed: pd.core.frame.DataFrame):
    """
    Compute the relative change between the two given sets.

    Formula: |original - transformed| / f(original, transformed);
    with f(o, t) = 1/2 * (|o| + |t|)
    """
    return (original - transformed).abs() / (
        (original.abs() + transformed.abs()) / 2)
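# Usage sketch for relative_change (names below are illustrative, not from the
# source). Since it is a method, the same formula is inlined here directly:
# identical values give 0, an equal-magnitude sign flip gives the maximum of 2.
import pandas as pd

orig_df = pd.DataFrame({"a": [10.0, -4.0]})
trans_df = pd.DataFrame({"a": [8.0, 4.0]})
# |10 - 8| / ((10 + 8) / 2) = 2 / 9 ≈ 0.222;  |-4 - 4| / ((4 + 4) / 2) = 2.0
change = (orig_df - trans_df).abs() / ((orig_df.abs() + trans_df.abs()) / 2)
print(change)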
def save_output_table(data: pd.core.frame.DataFrame, file_name: str,
                      index: bool = True, path: str = '') -> None:
    """Save the given DataFrame in the Output folder as a csv file."""
    ensure_folder_existence(f'{path}/Output/Tables')
    data.to_csv(f'{path}/Output/Tables/{file_name}.csv', index=index)
def _clean_data(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    df = df.replace('N.A.', np.nan)
    df.columns = [inflection.underscore(column) for column in df.columns]
    # TODO: using a random date to make sqlite happy. Once xl data cleaned, remove this.
    df['date_of_birth'] = df['date_of_birth'].fillna('01.01.2017')
    df['date_of_birth'] = df['date_of_birth'].apply(_parse_date)
    df['image_path'] = df['image_path'].apply(_normalize_slashes)
    return df
def export_table_csv(table: pd.core.frame.DataFrame, path: str) -> None:
    """
    Export a pandas data frame to a csv file. Can be useful if you want to
    save a copy of the data locally.

    :param table: Table which we want to export.
    :param path: Path to which we want to save the file
        (e.g. r'../local-data/newspapers.csv').
    :return: Nothing
    """
    table.to_csv(path)
def insert_hash(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """Prepend a 'hash' column holding a SHA-256 digest of each row."""
    hash_list = []
    # Note: iterrows() yields (index, Series) pairs, so the digest depends on
    # the row's index label as well as the Series string representation.
    for row in df.iterrows():
        hash_list.append(hashlib.sha256(str(row).encode()).hexdigest())
    df.insert(0, 'hash', hash_list)
    return df
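# Quick sketch for insert_hash (illustrative data; assumes the hashlib import
# used above): every digest is a 64-char hex string, and equal values on
# different index labels hash differently because the index is part of the
# hashed representation.
import pandas as pd

demo = insert_hash(pd.DataFrame({"x": [1, 2]}))
print(demo["hash"].str.len().unique())  # [64]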
def draw_graph(data_set: pd.core.frame.DataFrame):
    """
    Method for drawing the graph.

    :param data_set: data to plot
    """
    data_set.plot(x=X_NAME, y=Y_NAME, style="o")
    plt.xlabel(X_NAME)
    plt.ylabel(Y_NAME)
    plt.show()
def plot_single(single_run: pd.core.frame.DataFrame, metric: str,
                savepath: str) -> None:
    # NOTE: the `metric` argument is currently unused; the plotted columns
    # come from DEFAULT_TRAIN_METRIC and DEFAULT_VALID_METRIC.
    fig, ax = plt.subplots(1, 2, figsize=(8, 6), sharex=True, sharey='row',
                           gridspec_kw={'wspace': 0, 'hspace': 0})
    box = dict(facecolor='yellow', pad=6, alpha=0.2)
    ax[0].text(1.0, 1.0, 'BEST RUN', transform=ax[0].transAxes,
               horizontalalignment='center', verticalalignment='bottom',
               fontweight='bold')
    ax[0].text(0.5, 0.98, 'TRAINING', transform=ax[0].transAxes,
               horizontalalignment='center', verticalalignment='top', bbox=box)
    ax[1].text(0.5, 0.98, 'EVALUATION', transform=ax[1].transAxes,
               horizontalalignment='center', verticalalignment='top', bbox=box)
    train_name = DEFAULT_TRAIN_METRIC
    valid_name = DEFAULT_VALID_METRIC
    single_run.plot(x='epoch', y=train_name, ax=ax[0], legend=False)
    single_run.plot(x='epoch', y=valid_name, ax=ax[1], legend=False)
    ymin = np.min((np.min(single_run[train_name]),
                   np.min(single_run[valid_name]))) * 0.95
    ymax = np.max((np.percentile(single_run[train_name], 95),
                   np.percentile(single_run[valid_name], 95)))
    xmin = np.min(single_run['epoch']) - np.max(single_run['epoch']) * 0.01
    xmax = np.max(single_run['epoch']) * 1.01
    ax[0].set_xlim(xmin, xmax)
    ax[0].set_ylim(ymin, ymax)
    ax[0].yaxis.set_label_coords(-0.15, 0.5, transform=ax[0].transAxes)
    ax[0].set_ylabel('loss', bbox=box)
    fig.savefig(savepath, bbox_inches='tight', dpi=200, transparent=True)
def train_som(som_width: int, som_height: int, df: pd.core.frame.DataFrame,
              df_train: pd.core.frame.DataFrame,
              df_test: pd.core.frame.DataFrame, df_train_columns: list,
              n_iter: int, sigma=0.3, learning_rate=0.01):
    """
    Trains a self-organizing map and returns train and test datasets with
    predicted clusters.

    Arguments:
        som_width - width of the som map
        som_height - height of the som map
        df - initially prepared dataset
        df_train - training dataset
        df_test - testing dataset
        df_train_columns - list of columns of the training dataset
        n_iter - number of iterations during training
        sigma - sigma parameter for the model
        learning_rate - learning rate

    Returns:
        final_df_train - training dataset with predicted cluster
        final_df_test - testing dataset with predicted cluster
    """
    som = MiniSom(som_width, som_height, df_train.shape[1], sigma=sigma,
                  learning_rate=learning_rate, random_seed=0)
    som.train(df_train, n_iter)

    # converting numpy arrays to dataframes
    df_train = pd.DataFrame(df_train, columns=df_train_columns)
    df_test = pd.DataFrame(df_test, columns=df_train_columns)

    # creating a cluster column based on the model prediction
    df_train['cluster'] = df_train.apply(lambda x: som_predict(x, som), axis=1)
    df_test['cluster'] = df_test.apply(lambda x: som_predict(x, som), axis=1)

    # joining train and test dataframes with previously dropped columns, which
    # will be useful in the further part of the script. Note these are row
    # slices: the first len(df_train) rows belong to the training set, the
    # rest to the test set; join aligns on index labels.
    final_df_train = df_train.join(
        df[['Date', 'Price', 'close_plus_20_days',
            'profit']].iloc[:len(df_train)], lsuffix='_org')
    final_df_test = df_test.join(
        df[['Date', 'Price', 'close_plus_20_days',
            'profit']].iloc[len(df_train):], lsuffix='_org')
    return final_df_train, final_df_test
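# Minimal MiniSom sketch (assumes the minisom package used above; data and
# map size are illustrative) mirroring the training call inside train_som:
import numpy as np
from minisom import MiniSom

X = np.random.default_rng(0).random((100, 4))
som = MiniSom(5, 5, X.shape[1], sigma=0.3, learning_rate=0.01, random_seed=0)
som.train(X, 1000)
print(som.winner(X[0]))  # (row, col) of the best-matching unit for one sample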
def reduce_memory_usage(
        data: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """
    Reduce a DataFrame's memory footprint by downcasting each numeric column
    to the smallest dtype that can hold its value range.

    Args:
        data (pd.core.frame.DataFrame): DataFrame to optimize in place.

    Returns:
        pd.core.frame.DataFrame: The same DataFrame with downcast dtypes.
    """
    start_memory = data.memory_usage().sum() / 1024**2
    print(
        "Memory usage before optimization is: {:.4f} MB".format(start_memory))
    numerics = [
        "int16",
        "int32",
        "int64",
        "float16",
        "float32",
        "float64",
    ]
    for col in data.columns:
        col_type = data[col].dtypes
        if col_type in numerics:
            col_min = data[col].min()
            col_max = data[col].max()
            if str(col_type)[:3] == "int":
                if col_min > np.iinfo(np.int8).min and col_max < np.iinfo(
                        np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif col_min > np.iinfo(np.int16).min and col_max < np.iinfo(
                        np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif col_min > np.iinfo(np.int32).min and col_max < np.iinfo(
                        np.int32).max:
                    data[col] = data[col].astype(np.int32)
                elif col_min > np.iinfo(np.int64).min and col_max < np.iinfo(
                        np.int64).max:
                    data[col] = data[col].astype(np.int64)
            else:
                if col_min > np.finfo(np.float16).min and col_max < np.finfo(
                        np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif col_min > np.finfo(np.float32).min and col_max < np.finfo(
                        np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)
    end_memory = data.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.4f} MB".format(end_memory))
    print("Memory decreased by {:.1f}%".format(
        100 * (start_memory - end_memory) / start_memory))
    return data
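# Quick illustrative check for reduce_memory_usage (toy data): small-range
# int64/float64 columns should come back as int8/float16.
import numpy as np
import pandas as pd

toy = pd.DataFrame({"small_int": np.arange(100, dtype=np.int64),
                    "small_float": np.linspace(0.0, 1.0, 100)})
toy = reduce_memory_usage(toy)
print(toy.dtypes)  # expect small_int: int8, small_float: float16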
def pie_chart_margin(column: str, df: pandas.core.frame.DataFrame,
                     title1: str, title2: str, explode: tuple):
    """GroupBy Classification, visualization by margin."""
    df1 = pd.DataFrame(df.groupby('Classification')[column].sum())
    df3 = pd.DataFrame(
        df.groupby('Classification')['totalMonthlyNetSale'].sum())
    # df1 holds only the summed `column`, so index it generically rather than
    # hard-coding 'SellMargin'
    df4 = df1[column] / df3['totalMonthlyNetSale']
    print(df3.reset_index())
    print('\nSell Margin % means: TotalSellMargin / totalMonthlyNetSale '
          'within classification')
    labels = df1.index
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

    # plot 1: share of each classification in the summed column
    sizes1 = df1[column]
    axes[0].pie(sizes1, explode=explode, labels=labels, autopct='%1.1f%%',
                shadow=True, startangle=90)
    axes[0].axis('equal')  # Equal aspect ratio ensures the pie is a circle.
    axes[0].set_title(title1, fontsize=15)

    # plot 2: margin as a percentage of net sales per classification
    sns.barplot(x="Classification", y=df4.reset_index().columns[1],
                data=df4.reset_index(), ax=axes[1])
    axes[1].set_title(title2, fontsize=15)
    axes[1].set_ylabel(
        '% SellMargin / totalMonthlyNetSale of this Classification',
        fontsize=12)

    # plot 3: distribution of the column per classification
    axes[2] = sns.boxplot(x="Classification", y=column, data=df)
    axes[2] = sns.swarmplot(x="Classification", y=column, data=df, color=".25")
    axes[2].set_title('Sell Margin Details', fontsize=15)

    fig.tight_layout()
    plt.show()
def formatData(dataFrame: pandas.core.frame.DataFrame,
               keysToGet: List[str] = ['Open', 'High', 'Low', 'Last']):
    """Forward-fill gaps and convert the requested columns to plain lists,
    keyed by column name, with dates formatted as dd-mm-YYYY."""
    # fillna returns a new frame; assign the result so the fill takes effect
    dataFrame = dataFrame.fillna(method='ffill')
    formattedData = {'Date': dataFrame.index.strftime("%d-%m-%Y").tolist()}
    for key in keysToGet:
        formattedData.update({key: dataFrame[key].tolist()})
    return formattedData
def _create(self, data: pd.core.frame.DataFrame) -> None:
    data.drop(['SepsisLabel'], inplace=True, axis=1, errors='ignore')
    self.columns = data.columns
    patients = data['Id'].unique()
    # raw string avoids the invalid '\|' escape sequence in the spinner frames
    iterator = (tqdm(patients, desc="Creating Tensor", ascii=r' |/-\|/-\=')
                if self.verbose else patients)
    for patientId in iterator:
        patientData = data[data['Id'] == patientId].drop('Id', axis='columns')
        self.dataset.append(patientData.values)
def df_to_csv(df: pd.core.frame.DataFrame, outdir: str,
              outfile: str = "pydamage_results.csv"):
    """Write Pydamage results to disk

    Args:
        df (pandas DataFrame): Pydamage results DataFrame
        outdir (str): Path to output directory
        outfile (str): Output file name
    """
    df = df.round(3)
    if not outdir:
        outdir = "."
    df.to_csv(f"{outdir}/{outfile}")
def _output(self, output_file_name: str or bool, encoding: str,
            output_data: pd.core.frame.DataFrame):
    """
    The function groups the data frame by keys and prints the required
    values to the console or writes them to a file.

    Parameters
    ----------
    output_file_name : str or bool
        The name of the output file to open, or a bool operator
    encoding : str
        The encoding in which the files will be opened
    output_data : pd.core.frame.DataFrame
        Sorted DataFrame with the required data for output

    Returns
    -------
    None.
    """
    if output_file_name:
        with open(Path(output_file_name), encoding=encoding,
                  mode='w') as file:
            for label, g in output_data.groupby(['mm', 'dd', 'yyyy'],
                                                sort=False):
                # .iloc[0] takes the first row of the group regardless of
                # its index label
                file.write(str(g['ratio'].iloc[0]) + '\t')
                file.write(str(label[0]) + '\t')
                file.write(str(label[1]) + '\t')
                file.write(str(label[2]) + '\t')
                file.write(str(g['mean_maxt'].iloc[0]) + '\n')
                if g['kind'].iloc[0] == 1:
                    for i, j in g.sort_values(['wind', 'meteo']).iterrows():
                        file.write('\t' + str(j['avgt']) + ' ')
                        file.write(str(j['maxt']) + ' ')
                        file.write(str(j['mint']) + ' ')
                        file.write(str(j['wind']) + ' ')
                        file.write(str(j['hum']) + ' ')
                        file.write(str(j['meteo']) + '\n')
        print('OK')
    else:
        for label, g in output_data.groupby(['mm', 'dd', 'yyyy'], sort=False):
            print(g['ratio'].iloc[0], *label[0:3], g['mean_maxt'].iloc[0],
                  sep='\t')
            if g['kind'].iloc[0] == 1:
                for i, j in g.sort_values(['wind', 'meteo']).iterrows():
                    print('\t', j['avgt'], j['maxt'], j['mint'], j['wind'],
                          j['hum'], j['meteo'], sep=' ')
def clean_target(f: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """Remove rows with target values that cannot be converted to float or nan."""
    # experimental value (y) is the last column
    y = f.iloc[:, -1]
    # indices that cannot be converted to float
    nonfloatable_indices_ = nonfloatable_indices(y.values)
    # drop bad indices
    f = f.drop(f.index[nonfloatable_indices_], axis=0)
    # remove nan
    return f.dropna(axis=0)
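# Behaviour sketch for clean_target (assumes the nonfloatable_indices helper
# defined elsewhere in this module flags non-parseable strings; data is
# illustrative): the 'abc' row goes because its target is not floatable, the
# None row because dropna removes it.
import pandas as pd

raw = pd.DataFrame({"feature": [1, 2, 3], "target": ["1.5", "abc", None]})
print(clean_target(raw))  # only the first row survives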
def to_csv(frame: pd.core.frame.DataFrame, *, name: str):
    save_name = '{}.csv'.format(name)
    if not os.path.exists(save_name):
        try:
            frame.to_csv(save_name, encoding='utf-8', index=False)
            print('{} saved successfully!'.format(name))
        except Exception:
            print('{} failed to save!'.format(name))
            traceback.print_exc()
    else:
        print('{} already exists'.format(name))
def add_hydrogen_to_source_data(data: pd.core.frame.DataFrame) -> str:
    """
    Takes the dataframe containing the smiles, adds the hydrogens back in,
    and saves it in the data folder under a new name. The new filename is
    the return value of the function.
    """
    def addHs(smile):
        m = Chem.MolFromSmiles(smile)
        m = Chem.AddHs(m)
        return Chem.MolToSmiles(m)

    data['SMILES'] = data['SMILES'].map(addHs)
    output_file = 'data/data_dups_removed_with_H.csv'
    data.to_csv(output_file)
    return output_file
def frequency_std_database(data: pd.core.frame.DataFrame):
    """Computes the sampling frequency:
    - Difference between two successive rows
    - Mean of the gap, of type datetime.timedelta:
      W days, X hours, Y minutes and Z seconds
    """
    data = data.diff()
    data = data.iloc[1:]
    mean_data = data.mean()
    std_data = data.std()
    return mean_data, std_data
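# Usage sketch for frequency_std_database (illustrative timestamps): diff()
# turns a datetime column into timedeltas, whose mean/std summarize how
# regular the sampling interval is.
import pandas as pd

stamps = pd.DataFrame({"ts": pd.to_datetime(
    ["2021-01-01 00:00", "2021-01-01 00:10", "2021-01-01 00:21"])})
mean_dt, std_dt = frequency_std_database(stamps)
print(mean_dt, std_dt, sep="\n")  # mean ≈ 10.5 minutes between samples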
def save(df: pd.core.frame.DataFrame, save_fp: str):
    """Saves the specified DataFrame to save_fp.

    Arguments:
        df {pd.core.frame.DataFrame} -- DataFrame to be saved
        save_fp {str} -- Path where the DataFrame is saved

    Returns:
        None -- Prints a success message"""
    df.to_csv(path_or_buf=save_fp, index=False, mode='w')
    print("File has been saved to '{}'".format(save_fp))
def toListOfTuple(self, df: pd.core.frame.DataFrame) -> List[Tuple]:
    """Convert demand data to a list of tuples
    [[timestamp, entitytag, revisionNo, rmse, mae, mape, rmse%], ...]

    Args:
        df (pd.core.frame.DataFrame): demand data dataframe

    Returns:
        List[Tuple]: list of tuples of revision-wise error data
    """
    # replacing entity_tag with constituent names
    replace_values = {
        "WRLDCMP.SCADA1.A0047000": "WR-Total",
        "WRLDCMP.SCADA1.A0046980": "Maharastra",
        "WRLDCMP.SCADA1.A0046957": "Gujarat",
        "WRLDCMP.SCADA1.A0046978": "Madhya Pradesh",
        "WRLDCMP.SCADA1.A0046945": "Chattisgarh",
        "WRLDCMP.SCADA1.A0046962": "Goa",
        "WRLDCMP.SCADA1.A0046948": "DD",
        "WRLDCMP.SCADA1.A0046953": "DNH",
    }
    df = df.replace({"ENTITY_TAG": replace_values})
    df['DATE_KEY'] = df['DATE_KEY'].astype('str')
    records = df.to_records(index=False)
    listOfTuple = list(records)
    return listOfTuple
def glycaemic_variability(
    df: pd.core.frame.DataFrame,
    colum_name: str = "Sensor Glucose (mg/dL)",
    windows: Dict[str, int] = {"weekly": 7, "monthly": 30},
    kind: str = "NAIVE"
) -> None:
    """Plot daily glycaemic variability together with its moving averages
    and the global mean. The NAIVE metric is the coefficient of variation,
    100 * std / mean, computed per calendar day."""
    methods = {
        "NAIVE": lambda x: 100 * pd.Series.std(x) / pd.Series.mean(x)
    }
    kind = kind.upper()
    if kind in methods:
        _statistic = df.groupby(df.index.date)[colum_name].apply(methods[kind])
        # Plot the main series:
        _statistic.plot(label="daily")
        # Plot the moving averages:
        for key, value in windows.items():
            _statistic.rolling(value).mean().plot(label=key)
        _global_mean = _statistic.mean()
        plt.axhline(_global_mean, label=f"mean = {round(_global_mean, 1)}",
                    c="blue")
        plt.legend()
        plt.title(f"Glycaemic Variability, assessment method: {kind}")
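# Usage sketch for glycaemic_variability (synthetic hourly CGM data; the
# window sizes here are illustrative, not from the source):
import numpy as np
import pandas as pd

idx = pd.date_range("2021-01-01", periods=24 * 14, freq="H")
cgm = pd.DataFrame(
    {"Sensor Glucose (mg/dL)":
     np.random.default_rng(0).normal(120, 20, len(idx))},
    index=idx)
glycaemic_variability(cgm, windows={"3-day": 3, "weekly": 7})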
def get_sent_score(data_frame: pd.core.frame.DataFrame,
                   col_list: list) -> dict:
    """Calculate the average sentiment score across every row of the given
    columns of the dataframe."""
    sent_analyzer = SentimentIntensityAnalyzer()
    neg_total = 0
    pos_total = 0
    neutral_total = 0
    total = 0
    for index, row in data_frame.iterrows():
        for col in col_list:
            score = sent_analyzer.polarity_scores(str(row[col]))
            neg_total += score['neg']
            pos_total += score['pos']
            neutral_total += score['neu']
            total += 1
    # return the average sentiment score of the data
    score_dict = {}
    score_dict["neg"] = neg_total / total
    score_dict["pos"] = pos_total / total
    score_dict["neutral"] = neutral_total / total
    print(score_dict)
    return score_dict
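# Usage sketch for get_sent_score (assumes the SentimentIntensityAnalyzer
# import used above, e.g. from vaderSentiment or nltk; data is illustrative):
import pandas as pd

reviews = pd.DataFrame({"text": ["Great product, works well!",
                                 "Terrible service, very slow."]})
scores = get_sent_score(reviews, ["text"])
# scores is {'neg': ..., 'pos': ..., 'neutral': ...}, averaged over all cells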
def deduplicate_column_values(data: pd.core.frame.DataFrame,
                              reserved_cols: List[str] = [],
                              max_obs: int = 65536) -> pd.core.frame.DataFrame:
    """Delete columns with the same values as a later column.

    Args:
        data: A DataFrame.
        reserved_cols: Names of columns to exclude from deduplication.
        max_obs: The number of observations to sample if data has more than
            that many observations.

    Returns:
        A DataFrame containing only the last instance of each unique column.
    """
    comparison_data = data.drop(reserved_cols,
                                axis=1).sample(n=min(max_obs, data.shape[0]),
                                               replace=False)
    deduplicated_cols = list(
        comparison_data.T.drop_duplicates(keep="last").index)
    deduplicated_data = data[reserved_cols + deduplicated_cols]
    duplicated_cols = [
        col for col in data if col not in deduplicated_data.columns
    ]
    if duplicated_cols:
        print(f'{", ".join(duplicated_cols)} dropped for having identical ' +
              "values as another feature")
    return deduplicated_data
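# Quick check for deduplicate_column_values (illustrative data): 'a' and 'b'
# hold identical values, and keep="last" retains the later column 'b'.
import pandas as pd

frame = pd.DataFrame({"id": [1, 2], "a": [3, 4], "b": [3, 4]})
print(list(deduplicate_column_values(frame, reserved_cols=["id"]).columns))
# expect ['id', 'b'] (and a message that 'a' was dropped)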
def mean_firing_rate_by(
    df: pd.core.frame.DataFrame,
    spiketimes_col: str = "spiketimes",
    spiketrain_col: str = "spiketrain",
    t_start: float = None,
    t_stop: float = None,
):
    """
    Estimate the mean firing rate of each spiketrain.

    Firing rate calculated by summing spikes and dividing by total time.

    Args:
        df: A pandas DataFrame containing spiketimes indexed by spiketrain
        spiketimes_col: The label of the column containing spiketimes
        spiketrain_col: The label of the column identifying the spiketrain
            responsible for the spike
        t_start: Time point at which to start. Defaults to time of first
            spike in df.
        t_stop: Maximum timepoint. Defaults to last spike in df.
    Returns:
        A DataFrame containing mean firing rate by neuron
    """
    if t_start is None:
        t_start = df[spiketimes_col].min()
    # explicit None check so a caller-supplied t_stop of 0 is not overwritten
    if t_stop is None:
        t_stop = df[spiketimes_col].max()
    return (df.groupby(spiketrain_col).apply(
        lambda x: spiketimes.statistics.mean_firing_rate(
            x[spiketimes_col].values,
            t_start=t_start,
            t_stop=t_stop,
        )).reset_index().rename(columns={0: "mean_firing_rate"}))
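# Usage sketch for mean_firing_rate_by (assumes the spiketimes package
# imported above; toy data): two trains over 1 s of recording, so the rates
# are simply the per-train spike counts.
import pandas as pd

spikes = pd.DataFrame({"spiketrain": [0, 0, 0, 1, 1],
                       "spiketimes": [0.1, 0.5, 0.9, 0.2, 0.8]})
print(mean_firing_rate_by(spikes, t_start=0.0, t_stop=1.0))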
def diffmeans_test_by(
    df: pd.core.frame.DataFrame,
    n_boot: int = 1000,
    spikecount_col: str = "spike_count",
    spiketrain_col: str = "spiketrain",
    condition_col: str = "cond",
):
    """
    Calculates the difference between means of spike counts for each
    spiketrain in a data frame and tests significance using a permutation
    test.

    Args:
        df: A pandas DataFrame containing spiketimes indexed by spiketrain
        n_boot: The number of permutation replicates to draw.
        spikecount_col: The label of the column containing spikecounts
        spiketrain_col: The label of the column identifying the spiketrain
            responsible for the spike
        condition_col: A categorical column containing 0 for the baseline
            condition and 1 for the experimental condition
    Returns:
        A pandas DataFrame containing one row per spiketrain with columns
        {'spiketrain', 'diff_of_means', 'p'}
    """
    return (df.groupby(spiketrain_col).apply(lambda x: pd.Series(
        spiketimes.statistics.diffmeans_test(
            x[spikecount_col].values,
            x[condition_col].values,
            n_boot=n_boot,
        ))).reset_index().rename(columns={0: "diff_of_means", 1: "p"}))
def auc_roc_test_by(
    df: pd.core.frame.DataFrame,
    n_boot: int = 1000,
    return_distance_from_chance: bool = False,
    spikecount_col: str = "spike_count",
    spiketrain_col: str = "spiketrain",
    condition_col: str = "cond",
):
    """
    Calculates the Area Under the Receiver Operating Characteristic Curve of
    spike counts for each spiketrain.

    The AUCROC can be used as a metric of the separability of two
    distributions. Each spiketrain must have been recorded in both conditions
    during multiple trials. Significance is tested using a permutation test.

    Args:
        df: A pandas DataFrame containing spiketimes indexed by spiketrain
        n_boot: The number of permutation replicates to draw.
        spikecount_col: The label of the column containing spikecounts
        spiketrain_col: The label of the column identifying the spiketrain
            responsible for the spike
        condition_col: A categorical column containing 0 for the baseline
            condition and 1 for the experimental condition
        return_distance_from_chance: If True, returns distance from 0.5
    Returns:
        A pandas DataFrame containing one row per spiketrain with columns
        {'spiketrain', 'AUCROC', 'p'}
    """
    return (df.groupby(spiketrain_col).apply(lambda x: pd.Series(
        spiketimes.statistics.auc_roc_test(
            x[spikecount_col].values,
            x[condition_col].values,
            n_boot=n_boot,
            return_distance_from_chance=return_distance_from_chance,
        ))).reset_index().rename(columns={0: "AUCROC", 1: "p"}))
def cat2onehot_list_loop(data: pd.core.frame.DataFrame,
                         varnames: List[str]) -> pd.core.frame.DataFrame:
    """One-hot encode each column in varnames, replacing it with indicator
    columns named after the raw category values."""
    data1 = data.copy()
    for col in varnames:
        df_oh = pd.get_dummies(data1[col])
        data1.drop(col, axis=1, inplace=True)
        data1 = pd.concat((data1, df_oh), axis=1)
    return data1
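# Usage sketch for cat2onehot_list_loop (illustrative data). Note that
# get_dummies is called without a prefix, so two encoded columns sharing a
# category value would produce clashing column names.
import pandas as pd

demo = pd.DataFrame({"color": ["red", "blue"], "n": [1, 2]})
print(cat2onehot_list_loop(demo, ["color"]).columns.tolist())
# expect ['n', 'blue', 'red']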
def matchPathToMetadata(metadata: pd.core.frame.DataFrame,
                        maps_folder_path: str, filesnames_column: str):
    '''
    Match the images found in path with the metadata.

    Input(s):
        metadata: dataframe containing metadata
        maps_folder_path: path where the maps images are found
        filesnames_column: column containing the image names, in the
            metadata dataframe
    Output(s):
        df_metadata: metadata dataframe containing the image paths, when
            they were found
    '''
    maps_paths = getImagesPaths(maps_folder_path)
    maps_names = []
    for path in maps_paths:
        maps_names.append(getImageName(path))
    maps_names = np.asarray(maps_names)

    _maps_paths = []
    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for ind, filename in metadata[filesnames_column].items():
        image_name = getImageName(filename)
        match = (image_name == maps_names).astype('int')
        if np.sum(match) == 1:
            _maps_paths.append(maps_paths[np.argmax(match)])
        else:
            _maps_paths.append(np.nan)

    metadata['path'] = _maps_paths
    df_metadata = metadata.dropna(subset=['path']).reset_index(drop=True)
    return df_metadata
def create_orientations_widget(self, orientations: pd.core.frame.DataFrame) \
        -> List[vtk.vtkInteractionWidgetsPython.vtkPlaneWidget]:
    """Create a plane widget for each orientation with interactive recompute
    of the model.

    Args:
        orientations (pd.core.frame.DataFrame):

    Returns:
        List[vtkInteractionWidgetsPython.vtkPlaneWidget]:
    """
    colors = self._get_color_lot(is_faults=True, is_basement=False)
    widget_list = []
    # for index, pt, nrm in zip(i, pts, nrms):
    self._color_lot = self._get_color_lot(is_faults=True, is_basement=False,
                                          index='id')
    for index, val in orientations.iterrows():
        widget = self.p.add_plane_widget(self.call_back_plane,
                                         normal=val[['G_x', 'G_y', 'G_z']],
                                         origin=val[['X', 'Y', 'Z']],
                                         bounds=self.extent,
                                         factor=0.15,
                                         implicit=False,
                                         pass_widget=True,
                                         test_callback=False,
                                         color=colors[val['surface']])
        widget.WIDGET_INDEX = index
        widget_list.append(widget)

    return widget_list
def shape_address_col(self, df_: pd.core.frame.DataFrame):
    '''Convert the address column into a full-address string.
    '''
    assert {'prefecture_id', 'ward_city_id'}.issubset(df_.columns)
    df = df_.copy()
    self.prefecture_city_id_info['ward_city_id'] = [
        int('%d%d' % (prefecture_id, ward_city_id))
        for prefecture_id, ward_city_id in zip(
            self.prefecture_city_id_info.prefecture_id.tolist(),
            self.prefecture_city_id_info.ward_city_id.tolist())
    ]
    self.prefecture_city_id_info.index = self.prefecture_city_id_info.prefecture_id
    prefecture_dic = self.prefecture_city_id_info.to_dict()['prefecture_name']
    self.prefecture_city_id_info.index = self.prefecture_city_id_info.ward_city_id
    ward_dic = self.prefecture_city_id_info.to_dict()['ward_city_name']
    df['ward_city_id'] = [
        int('%d%d' % (prefecture_id, ward_city_id))
        for prefecture_id, ward_city_id in zip(df.prefecture_id.tolist(),
                                               df.ward_city_id.tolist())
    ]
    df['prefecture_name'] = df['prefecture_id'].map(prefecture_dic)
    df['ward_city_name'] = df['ward_city_id'].map(ward_dic)
    df = df.dropna(subset=['prefecture_name', 'ward_city_name'])
    # 'address == address' is a NaN check (NaN != NaN); fall back to the
    # registered address when the address is missing or empty
    df['address'] = [
        prefecture_name + ward_city_name + address
        if address == address and address
        else prefecture_name + ward_city_name + registered_address
        for prefecture_name, ward_city_name, address, registered_address
        in zip(df.prefecture_name.tolist(), df.ward_city_name.tolist(),
               df.address.tolist(), df.registered_address.tolist())
    ]
    return df