Example #1
def main(df_anon: DataFrame, df_orig: DataFrame, footprint: DataFrame):
    df = df_anon.copy()
    df_original_reduced = df_orig.copy()

    df = df.drop(columns=['longitude', 'latitude'])
    df_original_reduced = df_original_reduced.drop(
        columns=['longitude', 'latitude'])

    # Convert date to year-week (dt.week was removed from pandas; use isocalendar)
    df['date'] = df['date'].dt.year.astype(
        str) + "-" + df['date'].dt.isocalendar().week.astype(str)
    df_original_reduced['date'] = df_original_reduced['date'].dt.year.astype(
        str) + "-" + df_original_reduced['date'].dt.isocalendar().week.astype(str)

    # Group by user and week number
    df = df.groupby(['id', 'date']).size().reset_index(name='count')
    df_original_reduced = df_original_reduced.groupby(
        ['id', 'date']).size().reset_index(name='count')

    # Join the two Dataframes on the number of times an id is found each week
    df = pd.merge(df_original_reduced, df, on=['date', 'count'], how='left')
    df = df.drop(columns=['count'])
    df = df.groupby(['date', 'id_x'])['id_y'].apply(list)
    weeks = df.reset_index()['date'].unique().tolist()
    df = df.reset_index().set_index(['id_x', 'date']).unstack('date')
    df.columns = weeks

    # Compare the two footprints and create score
    df = (df == footprint)
    df = df.astype(int)  # convert true and falses to 1/0
    score = df.to_numpy().sum()
    values = footprint.fillna(0).astype(
        'bool').to_numpy().sum()  # count of non-NaN, non-zero entries

    return score / values
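A minimal sketch of the year-week conversion used above, on hypothetical data; isocalendar() is the current pandas API for ISO week numbers.

import pandas as pd

df = pd.DataFrame({'date': pd.to_datetime(['2021-01-04', '2021-01-11'])})
df['date'] = df['date'].dt.year.astype(str) + "-" + \
    df['date'].dt.isocalendar().week.astype(str)
print(df['date'].tolist())  # ['2021-1', '2021-2']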
Example #2
def transform_dataset_offset(
    dataset: npt.NDArray,
    dataframe: DataFrame,
    dataframe_stopcodons: DataFrame,
    sequence_raw: str,
    start_position: int,
    position_offset: int,
    stopcodons: bool,
) -> DataFrame:
    """
    Generate a dataframe with the sequence position_offset.
    """
    # Add position_offset sequence
    offset_sequence = _offset_sequence(dataset, sequence_raw, start_position,
                                       position_offset)
    df_output = dataframe_stopcodons.copy(
    ) if stopcodons is True else dataframe.copy()

    # Copy old sequence
    df_output['Sequence_old'] = df_output['Sequence']
    # Count amino acids
    aa_number = len(set(df_output['Aminoacid']))
    # Generate new position_offset sequence
    df_output['Sequence'] = np.ravel([[aa] * aa_number
                                      for aa in offset_sequence])

    # Drop rows with X
    df_output.drop(df_output.index[df_output['Sequence'] == 'X'], inplace=True)
    return df_output
Example #3
    def test_bool_uint(self):
        s0 = Series([0, 1, True], dtype=bool)
        s1 = Series([0, 1, 100], dtype=np.uint8)
        s2 = Series([0, 1, 255], dtype=np.uint8)
        s3 = Series([0, 1, 2**15 - 100], dtype=np.uint16)
        s4 = Series([0, 1, 2**16 - 1], dtype=np.uint16)
        s5 = Series([0, 1, 2**31 - 100], dtype=np.uint32)
        s6 = Series([0, 1, 2**32 - 1], dtype=np.uint32)

        original = DataFrame({
            's0': s0,
            's1': s1,
            's2': s2,
            's3': s3,
            's4': s4,
            's5': s5,
            's6': s6
        })
        original.index.name = 'index'
        expected = original.copy()
        expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32,
                          np.int32, np.float64)
        for c, t in zip(expected.columns, expected_types):
            expected[c] = expected[c].astype(t)

        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            written_and_read_again = written_and_read_again.set_index('index')
            tm.assert_frame_equal(written_and_read_again, expected)
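Stata has no unsigned integer types, so the writer promotes each unsigned column to the next signed type wide enough for its values, falling back to float64 for uint32 values beyond the int32 range. A quick round-trip sketch of that promotion, assuming only a writable temporary directory:

import os
import tempfile

import numpy as np
import pandas as pd

df = pd.DataFrame({'u8': pd.Series([0, 255], dtype=np.uint8),
                   'u32': pd.Series([0, 2**32 - 1], dtype=np.uint32)})
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'demo.dta')
    df.to_stata(path)
    back = pd.read_stata(path)
print(back.dtypes)  # u8 comes back as int16, u32 as float64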
Example #4
def select_snv(df_input: DataFrame) -> DataFrame:
    """
    Select for SNV variants in DSM dataset

    Parameters
    -----------
    df_input : pandas dataframe containing DSM data

    Returns
    --------
    Modified dataframe("Variant","Score") where "SNV?"== True. Returns copy
    """

    # Use the add_snv_boolean function
    df_input = add_snv_boolean(df_input.copy())

    # Select SNV? == True only
    df_input = df_input[df_input["SNV?"] == True].copy()  # pylint: disable=singleton-comparison

    # Select columns of interest
    df_input = df_input[["Position", "Variant", "Score", "Score_NaN"]].copy()

    # Reset index
    df_input.reset_index(drop=True, inplace=True)

    return df_input
Example #5
def condense_heatmap(df_input: DataFrame, new_order: List[str]) -> DataFrame:
    """
    Converts the np.array with stored enrichment scores into the condensed heatmap
    """
    df_input = df_input.copy()
    df_input.drop(['Position'], axis=1, inplace=True)

    # Group by sequence and aminoacid, and then pivot table
    df_grouped = df_input.groupby(['Sequence', 'Aminoacid'], sort=False).mean()
    df_pivoted = df_grouped.pivot_table(values='Score',
                                        index='Aminoacid',
                                        columns='Sequence')
    df_pivoted.reset_index(drop=False, inplace=True)

    # Sort in y axis desired order
    df_pivoted['Aminoacid'] = Categorical(df_pivoted['Aminoacid'], new_order)
    df_pivoted = df_pivoted.sort_values(by=['Aminoacid'])

    # Sort in x axis desired order
    x_order = return_common_elements(new_order, list(df_pivoted.columns))

    # Drop amino acid column
    data_dropped = df_pivoted.drop(['Aminoacid'], axis=1)

    return data_dropped[x_order]
Example #6
def calculate_correlation_by_residue(df_input: DataFrame) -> DataFrame:
    """
    Calculate correlation by position.
    """
    df_output = df_input.copy()
    df_output = df_output.pivot_table(values='Score',
                                      index='Position',
                                      columns='Aminoacid')
    return df_output.T.corr()
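A minimal sketch on toy data, assuming hypothetical 'Position', 'Aminoacid' and 'Score' columns; the result is a position-by-position correlation matrix.

import pandas as pd

df = pd.DataFrame({
    'Position': [1, 1, 2, 2, 3, 3],
    'Aminoacid': ['A', 'C', 'A', 'C', 'A', 'C'],
    'Score': [0.1, 0.3, 0.2, 0.5, 0.0, 0.4],
})
corr = calculate_correlation_by_residue(df)
print(corr.shape)  # (3, 3): one row/column per position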
Example #7
    def _group_codons_to_aa(self, df_input: DataFrame) -> DataFrame:
        """
        Group different codons that are synonymous. Returns sum of counts.
        """
        df_input = df_input.copy()
        df_input['Aminoacid'] = self.aminoacids
        # Group by amino acid and sum the counts
        return df_input.groupby(as_index=True, by='Aminoacid',
                                sort=False).sum()
Example #8
def _polishdf(df: DataFrame) -> DataFrame:
    df_mean = df.mean().to_frame()
    df_mean.reset_index(drop=False, inplace=True)
    df_mean.rename(columns={0: 'R2'}, inplace=True)
    df_mean['Combinations'] = list(
        df_mean['index'].apply(lambda x: ''.join(x)))  # pylint: disable=unnecessary-lambda
    df_mean.drop(columns=['index'], inplace=True)
    return df_mean
Example #9
def _generate_codes(df: DataFrame, cat_cols: List) -> dict:
    tmp = df.copy()
    for col in cat_cols:
        tmp[col] = tmp[col].astype("category").cat.as_ordered()

    # list of categories for each column (always a column for None)
    codes = {col: list(tmp[col].cat.categories) for col in cat_cols}

    return codes
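A short usage sketch of _generate_codes with hypothetical columns; each column maps to the sorted list of its category values.

import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue', 'red'],
                   'size': ['S', 'L', 'M']})
print(_generate_codes(df, ['color', 'size']))
# {'color': ['blue', 'red'], 'size': ['L', 'M', 'S']}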
Example #10
def _calculate_secondary(df_input: DataFrame, secondary: list) -> DataFrame:
    """
    Returns copy.
    """
    df_output: DataFrame = df_input.copy()
    df_output.insert(4, 'Secondary', secondary)
    df_output = df_output.groupby(['Secondary'], as_index=False,
                                  sort=False).mean()
    df_output = df_output[df_output['Secondary'].str.startswith(('β', 'α'))]
    return df_output.drop(['Position'], axis=1)
Example #11
def top_recommended_movies_for_user(userId: int, df: DataFrame, svd: SVD, links: DataFrame):
    movies = df.copy()

    # Map TMDB ids to MovieLens ids via the links table
    tmdb_to_movie = links.set_index('tmdbId')['movieId']
    movies['id'] = movies['id'].map(tmdb_to_movie)

    # Estimate a rating for every movie with the trained SVD model
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, x).est)
    return movies.sort_values('est', ascending=False)
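A self-contained sketch of the id-mapping step above, with hypothetical data; the SVD scoring is stubbed out so the snippet runs without a trained model.

import pandas as pd

movies = pd.DataFrame({'id': [603, 604]})  # TMDB ids
links = pd.DataFrame({'tmdbId': [603, 604], 'movieId': [2571, 6365]})
movies['id'] = movies['id'].map(links.set_index('tmdbId')['movieId'])
movies['est'] = movies['id'].apply(lambda mid: mid % 5)  # stand-in for svd.predict(userId, mid).est
print(movies.sort_values('est', ascending=False))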
Example #12
def _grou_by_secondary(df: DataFrame, secondary: List[str]) -> DataFrame:
    """
    Groups each secondary motif and makes the mean.

    Returns dataframe. Returns copy
    """
    df = df.copy()
    df.insert(4, 'Secondary', secondary)
    df = df.groupby(['Secondary', 'Aminoacid'], as_index=False).mean()
    df = df.loc[df['Secondary'].str.startswith(('β', 'α'))]
    return df
Example #13
    def test_dates_invalid_column(self):
        original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
        original.index.name = "index"
        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(InvalidColumnName):
                original.to_stata(path, {0: "tc"})

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified.columns = ["_0"]
            tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)
Example #14
def find_trendline(df_data: DataFrame, y_key: str, high_low: str = "high") -> DataFrame:
    """Attempts to find a trend line based on y_key column from a given stock ticker data frame.

    Parameters
    ----------
    df_data : DataFrame
        The stock ticker data frame with at least date_id, y_key columns.

    y_key : str
        Column name to base the trend line on.

    high_low: str, optional
        Either "high" or "low". High is the default.

    Returns
    -------
    DataFrame
        If a trend is successfully found,
            an updated pandas data frame with a {y_key}_trend column.
        If no trend was found,
            the original pandas data frame.
    """

    for iteration in [3, 4, 5, 6, 7]:
        df_temp = df_data.copy()
        while len(df_temp) > iteration:
            reg = linregress(
                x=df_temp["date_id"],
                y=df_temp[y_key],
            )

            if high_low == "high":
                df_temp = df_temp.loc[
                    df_temp[y_key] > reg[0] * df_temp["date_id"] + reg[1]
                ]
            else:
                df_temp = df_temp.loc[
                    df_temp[y_key] < reg[0] * df_temp["date_id"] + reg[1]
                ]

        if len(df_temp) > 1:
            break

    if len(df_temp) == 1:
        return df_data

    reg = linregress(
        x=df_temp["date_id"],
        y=df_temp[y_key],
    )

    df_data[f"{y_key}_trend"] = reg[0] * df_data["date_id"] + reg[1]

    return df_data
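A minimal usage sketch on synthetic data; 'date_id' and 'High' are hypothetical column names following the docstring's conventions, and linregress from scipy.stats is assumed to be imported as in the function above.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'date_id': np.arange(30)})
df['High'] = 100 + 0.5 * df['date_id'] + rng.normal(0, 1, 30)
result = find_trendline(df, 'High', 'high')
if 'High_trend' in result.columns:
    print(result[['date_id', 'High_trend']].tail())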
Example #15
def calculate_correlation(df: DataFrame,
                          order_aminoacids: List[str]) -> DataFrame:
    """
    Calculate correlation by variant.
    """
    dataset: DataFrame = df.copy()
    dataset = dataset.pivot_table(values='Score',
                                  index='Position',
                                  columns='Aminoacid')
    dataset = dataset.corr()
    dataset = dataset.reindex(index=order_aminoacids)[order_aminoacids]
    return dataset
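A minimal sketch with toy scores; order_aminoacids both orders and restricts the output matrix.

import pandas as pd

df = pd.DataFrame({
    'Position': [1, 1, 2, 2],
    'Aminoacid': ['A', 'C', 'A', 'C'],
    'Score': [0.1, 0.4, 0.3, 0.2],
})
print(calculate_correlation(df, ['A', 'C']))
# 2x2 matrix; with these numbers A and C are perfectly anti-correlated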
Example #16
    def test_dates_invalid_column(self):
        original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(InvalidColumnName):
                original.to_stata(path, {0: 'tc'})

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified.columns = ['_0']
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  modified)
Example #17
    def update_a_df_column(self, df_to_update: DataFrame,
                           df_as_source: DataFrame, unique_col: str,
                           col_to_update: str):
        '''
        Updates a DataFrame column from a source DataFrame, matching records
        on a common unique column.

        parameters:
            df_to_update: dataframe, main df to be updated
            df_as_source: dataframe, source df used to update the main df
            unique_col: str,
                common column (must have the same name in both frames) used
                to match records; its values must be unique
            col_to_update: str, name of the column whose values are updated

        returns:
            a copy of the updated DataFrame

        warning:
            index is reset during the update
        '''
        # copy df
        df = df_to_update.copy()
        source = df_as_source.copy()

        # reset index: WARNING: drops any existing index
        df.reset_index(inplace=True, drop=True)
        source.reset_index(inplace=True, drop=True)

        # set unique_col as index
        df.set_index(unique_col, inplace=True)
        source.set_index(unique_col, inplace=True)

        # update on series
        df[col_to_update].update(source[col_to_update])

        # reset index: WARNING: unique_col goes back to being the first column
        df.reset_index(inplace=True, drop=False)
        source.reset_index(inplace=True, drop=False)

        return df
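A short usage sketch, assuming helper is an instance of the class defining this method and that both frames share a unique 'sku' column.

import pandas as pd

main = pd.DataFrame({'sku': ['a1', 'b2'], 'price': [10, 20]})
src = pd.DataFrame({'sku': ['b2'], 'price': [25]})
print(helper.update_a_df_column(main, src, 'sku', 'price'))
# b2's price becomes 25; a1 keeps 10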
Example #18
def group_by_aa(df_input: DataFrame, aminoacids: List[str]) -> DataFrame:
    """
    Group different codons that are synonymous.
    """
    # copy df
    df_output = df_input.copy()

    # Set up amino acid column
    df_output['Aminoacid'] = aminoacids

    # Group by mean
    df_output = df_output.groupby(as_index=True, by='Aminoacid',
                                  sort=False).mean()
    return df_output
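A minimal sketch with hypothetical count columns; rows that share an amino acid are averaged.

import pandas as pd

df = pd.DataFrame({'count_a': [1.0, 3.0, 5.0], 'count_b': [2.0, 4.0, 6.0]})
print(group_by_aa(df, ['K', 'K', 'R']))
#            count_a  count_b
# Aminoacid
# K              2.0      3.0
# R              5.0      6.0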
Example #19
    def test_dates_invalid_column(self):
        original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always")
                # writing with an invalid column name should emit a single
                # InvalidColumnName warning
                original.to_stata(path, {0: 'tc'})
            tm.assert_equal(len(w), 1)

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified.columns = ['_0']
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  modified)
Example #20
    def add_a_col_from_a_df(self, into_df: DataFrame, from_df: DataFrame,
                            unique_col: str, col_to_add: str):
        """  
        Add a column into a dataframe from another dataframe
        
        parameters:
            into_df: dataframe, main df, which will be updated with a new column
            from_df: dataframe, source df, which has the column to add into main df
            unique_col: str, column name which is common in both dataframes
            col_to_add: str, column to be added from source dataframe
            
        returns:
            * main dataframe filled with the new column and values, where unique column matches
        
        warning:
            this method assumes no index
        """

        main = into_df.copy()
        source = from_df.copy()

        return main.merge(source[[unique_col, col_to_add]],
                          on=unique_col,
                          how="left")
Example #21
def feature_values_using_filter_and_indexes(move_data: DataFrame,
                                            id_: Union[int, Text],
                                            feature_name: Text,
                                            filter_: List,
                                            idxs: List,
                                            values: Any,
                                            inplace: Optional[bool] = True):
    """
    Changes the values of the given feature at the selected filter rows and indexes.
    Parameters
    ----------
    move_data : dataframe
       The input trajectories data.
    id_ : str
        Indicates the index to be changed.
    feature_name : str
        The name of the column that the user wants to change values for.
    filter_ : array
        Indicates the rows with the index "id_" of the "feature_name"
        that must be changed.
    idxs : array like of indexes
        Indexes at which to assign the value.
    values : any
        The new values to be set to the selected feature.
    inplace: bool, optional
        if set to true the original dataframe will be altered,
        otherwise the alteration will be made in a copy, that will be returned,
        by default True

    Returns
    -------
    DataFrame
        A copy of the original dataframe or None
    """

    if not inplace:
        move_data = move_data.copy()

    values_feature = move_data.at[id_, feature_name]
    values_feature_filter = values_feature.iloc[filter_]
    values_feature_filter.iloc[idxs] = values
    values_feature.iloc[filter_] = values_feature_filter
    move_data.at[id_, feature_name] = values_feature

    if not inplace:
        return move_data
    else:
        return None
Example #22
def feature_values_using_filter(
        move_data: DataFrame,
        id_: Union[Text, int],
        feature_name: Text,
        filter_: List,
        values: Any,
        inplace: Optional[bool] = True) -> Optional[DataFrame]:
    """
    Changes the values of the feature defined by the user.
    Parameters
    ----------
    move_data : DataFrame
       The input trajectories data.
    id_ : str
        Indicates the index to be changed.
    feature_name : str
        The name of the column that the user wants to change values for.
    filter_ : list or array
        Indicates the rows with the index "id_" of the "feature_name"
        that must be changed.
    values : any
        The new values to be set to the selected feature.
    inplace: boolean, optional(True by default)
        if set to true the original dataframe will be altered,
        otherwise the alteration will be made in a copy, that will be returned.
    Returns
    -------
    DataFrame
        A copy of the original dataframe or None

    """

    if not inplace:
        move_data = move_data.copy()

    values_feature = move_data.at[id_, feature_name]

    if filter_.shape == ():
        move_data.at[id_, feature_name] = values
    else:
        values_feature.iloc[filter_] = values
        move_data.at[id_, feature_name] = values_feature

    if not inplace:
        return move_data
    else:
        return None
Example #23
    def test_large_value_conversion(self):
        s0 = Series([1, 99], dtype=np.int8)
        s1 = Series([1, 127], dtype=np.int8)
        s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
        s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
        original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3})
        original.index.name = "index"
        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(PossiblePrecisionLoss):
                original.to_stata(path)

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified["s1"] = Series(modified["s1"], dtype=np.int16)
            modified["s2"] = Series(modified["s2"], dtype=np.int32)
            modified["s3"] = Series(modified["s3"], dtype=np.float64)
            tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)
Example #24
    def __runPortfolioDesc(self, portfolioDescs:List[APortfolioDescription], portFolioModels:List[DataFrame],
                           evaluatonTools:List[AEvalTool], histories:List[AHistory], trainRatingsDF:DataFrame, testRatingsDF:DataFrame):

        portfolios:List[APortfolio] = []

        portfolioDescI:APortfolioDescription
        historyI:AHistory
        for portfolioDescI, historyI in zip(portfolioDescs, histories):

            print("Training mode: " + str(portfolioDescI.getPortfolioID()))

            # train portfolio model
            portfolioI:APortfolio = portfolioDescI.exportPortfolio(self._batchID, historyI)
            portfolioI.train(historyI, trainRatingsDF.copy(), self._usersDF.copy(), self._itemsDF.copy())
            portfolios.append(portfolioI)

        return self.__iterateOverDataset(portfolios, portfolioDescs, portFolioModels, evaluatonTools, histories, testRatingsDF)
Example #25
def color_3d_scatter(df_input: DataFrame, mode: str, lof: float,
                     gof: float) -> DataFrame:
    """
    Color the data points by enrichment scores.

    Parameters
    -----------
    df_input : pandas dataframe
        The input is a dataframe that has columns
        ['Position', 'Aminoacid', 'Score'].

    mode : str
        Specify what enrichment scores to use. If mode = 'mean', it will
        use the mean of each position to classify the residues. If
        mode = 'A', it will use the Alanine substitution profile. Can be
        used for each amino acid. Use the one-letter code and upper case.

    lof : float
        Cutoff for determining loss of function mutations based on
        mutagenesis data.

    gof : float
        Cutoff for determining gain of function mutations based on
        mutagenesis data.

    Returns
    ---------
    df_grouped: pandas dataframe
        New dataframe with added column of ['Color'] and the ['Score']
        values of the mode you chose.
    """

    # Copy df
    df_grouped: DataFrame = df_input.copy()

    # Select grouping.
    if mode.lower() == 'mean':
        df_grouped = df_grouped.groupby(['Position'], as_index=False).mean()
    else:
        df_grouped = df_grouped.loc[df_grouped['Aminoacid'] == mode]

    # Select colors based on Score values
    df_grouped['Color'] = 'green'
    df_grouped.loc[df_grouped['Score'] < lof, 'Color'] = 'blue'
    df_grouped.loc[df_grouped['Score'] > gof, 'Color'] = 'red'
    return df_grouped
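A minimal sketch with hypothetical enrichment data; mode='A' keeps only the alanine substitutions and colors them by the lof/gof cutoffs.

import pandas as pd

df = pd.DataFrame({
    'Position': [1, 1, 2, 2],
    'Aminoacid': ['A', 'C', 'A', 'C'],
    'Score': [-2.0, 0.1, 1.5, -0.3],
})
print(color_3d_scatter(df, mode='A', lof=-1.0, gof=1.0))
# position 1 (score -2.0) is 'blue', position 2 (score 1.5) is 'red'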
Example #26
def cleanData(data: DataFrame) -> DataFrame:
    cleanedData: DataFrame = data.copy().dropna(
    )  # copying and dropping rows with NA

    # Removing whitespace from the column NAMES
    cleanedData: DataFrame = cleanedData.rename(columns=lambda x: x.strip(),
                                                inplace=False)

    # Removing whitespace from the column VALUES
    #cleanedData: DataFrame = cleanedData.applymap(lambda x: x.strip() if type(x) == str else x)
    # NOTE: the above approach ruins the dataframe printing capability (cannot show data frame as nice as it was
    # before, but instead it looks like messy string with \n values)

    for var in cleanedData.columns:
        if cleanedData[var].dtype == object:  # .str.strip() only works on string columns
            valuesNoWhitespace: Series = cleanedData[var].str.strip()
            cleanedData[var] = valuesNoWhitespace

    return cleanedData
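A short usage sketch with a hypothetical messy frame: padded column names, padded string values, and one row with a missing value.

import pandas as pd

raw = pd.DataFrame({' name ': ['  ada ', ' bob', None],
                    ' city ': ['x ', ' y', 'z']})
clean = cleanData(raw)
print(clean.columns.tolist())  # ['name', 'city']
print(clean['name'].tolist())  # ['ada', 'bob']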
Example #27
    def test_large_value_conversion(self):
        s0 = Series([1, 99], dtype=np.int8)
        s1 = Series([1, 127], dtype=np.int8)
        s2 = Series([1, 2**15 - 1], dtype=np.int16)
        s3 = Series([1, 2**63 - 1], dtype=np.int64)
        original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(PossiblePrecisionLoss):
                original.to_stata(path)

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified['s1'] = Series(modified['s1'], dtype=np.int16)
            modified['s2'] = Series(modified['s2'], dtype=np.int32)
            modified['s3'] = Series(modified['s3'], dtype=np.float64)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  modified)
Example #28
    def test_large_value_conversion(self):
        s0 = Series([1, 99], dtype=np.int8)
        s1 = Series([1, 127], dtype=np.int8)
        s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
        s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
        original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always")
                original.to_stata(path)
            # should produce a single PossiblePrecisionLoss warning
            tm.assert_equal(len(w), 1)

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified['s1'] = Series(modified['s1'], dtype=np.int16)
            modified['s2'] = Series(modified['s2'], dtype=np.int32)
            modified['s3'] = Series(modified['s3'], dtype=np.float64)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  modified)
Example #29
    def test_bool_uint(self):
        s0 = Series([0, 1, True], dtype=bool)
        s1 = Series([0, 1, 100], dtype=np.uint8)
        s2 = Series([0, 1, 255], dtype=np.uint8)
        s3 = Series([0, 1, 2 ** 15 - 100], dtype=np.uint16)
        s4 = Series([0, 1, 2 ** 16 - 1], dtype=np.uint16)
        s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32)
        s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32)

        original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3, "s4": s4, "s5": s5, "s6": s6})
        original.index.name = "index"
        expected = original.copy()
        expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32, np.int32, np.float64)
        for c, t in zip(expected.columns, expected_types):
            expected[c] = expected[c].astype(t)

        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            written_and_read_again = written_and_read_again.set_index("index")
            tm.assert_frame_equal(written_and_read_again, expected)
Example #30
    def __init__(self, ratingsDF:DataFrame):
        if type(ratingsDF) is not DataFrame:
            raise ValueError("Argument ratingsDF isn't type DataFrame.")

        ratingsCopyDF:DataFrame = ratingsDF.copy()
        ratingsCopyDF['index1'] = ratingsCopyDF.index

        #print(ratingsDF)
        #print(ratingsCopyDF)

        userIds:List[int] = list(set([rowI[Ratings.COL_USERID] for indexDFI, rowI in ratingsCopyDF.iterrows()]))

        # dictionary (key = userID, value = dict of itemID -> Item,
        # where each Item stores the dataframe index of the rating)
        self._dictionaryOfUserIDs: dict = {}

        userIdI:int
        for userIdI in userIds:
            # select ratings of userIdI
            ratingsUserIDF:DataFrame = ratingsCopyDF.loc[ratingsCopyDF[Ratings.COL_USERID] == userIdI]

            userDictI:dict = {}
            lastItemI:Item = None

            indexDFI:int
            rowI:Series
            for i, rowI in ratingsUserIDF.iterrows():

                indexDFI:int = rowI['index1']
                userIdI:int = rowI[Ratings.COL_USERID]
                itemIdI:int = rowI[Ratings.COL_MOVIEID]

                itemI:Item = Item(userIdI, indexDFI, None)
                if lastItemI is not None:
                    lastItemI.setNext(itemI)

                lastItemI = itemI
                userDictI[itemIdI] = itemI

            self._dictionaryOfUserIDs[userIdI] = userDictI
Example #31
def _aa_to_codons_df(df_input: DataFrame, namecolumn: str) -> DataFrame:
    """
    Inputs a dataframe with a column of amino acids, returns all syn for each amino acidcodons.
    Used dict_codon_to_aa() and _aa_to_codons.

    Parameters
    -----------
    df_input : pandas dataframe
    namecolumn : str
        Name of the column containing the amino acids.

    Returns
    --------
    Dataframe with a column containing all the codons that code for that amino acid. Returns copy
    """
    # Copy df_input
    df_input = df_input.copy()

    # Calculate each possible codon for every amino acid
    df_input["Codons_" + namecolumn] = df_input.apply(
        lambda x: _aa_to_codons(x[namecolumn]), axis=1)

    return df_input
Example #32
File: melt.py  Project: ygene2/pandas
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
) -> DataFrame:
    # TODO: what about the existing index?
    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, ABCMultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns,
                        ABCMultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'id_vars' are not present"
                               " in the DataFrame: {missing}"
                               "".format(missing=list(missing)))
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns,
                        ABCMultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'value_vars' are not present in"
                               " the DataFrame: {missing}"
                               "".format(missing=list(missing)))
        frame = frame.loc[:, id_vars + value_vars]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, ABCMultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [
                    "variable_{i}".format(i=i)
                    for i in range(len(frame.columns.names))
                ]
        else:
            var_name = [
                frame.columns.name
                if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            id_data = concat([id_data] * K, ignore_index=True)
        else:
            id_data = np.tile(id_data.values, K)
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    mdata[value_name] = frame.values.ravel("F")
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(
            frame.columns._get_level_values(i)).repeat(N)

    return frame._constructor(mdata, columns=mcolumns)
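A short usage sketch of melt on a plain (non-MultiIndex) frame; the output stacks value_vars column-major, as the ravel("F") above implies.

import pandas as pd

wide = pd.DataFrame({'id': [1, 2], 'x': [10, 30], 'y': [20, 40]})
long = pd.melt(wide, id_vars='id', value_vars=['x', 'y'],
               var_name='axis', value_name='coord')
print(long)
#    id axis  coord
# 0   1    x     10
# 1   2    x     30
# 2   1    y     20
# 3   2    y     40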
Example #33
def instance_crossover_augmentation(
    data: DataFrame,
    restriction: str = 'destination only',
    label_local: Text = LOCAL_LABEL,
    frac: float = 0.5,
) -> DataFrame:
    """
    Generates new data from unobserved trajectories, with a specific restriction.

    By default, the algorithm uses the same destination constraint
    as the route and inserts the points on the
    original dataframe.

    Parameters
    ----------
    data : DataFrame
        The input trajectories data
    restriction : str, optional
        Constraint used to generate new data, by default 'destination only'
    label_local : str, optional
        Label of the points sequences, by default LOCAL_LABEL
    frac : float, optional
        Represents the percentage to be exchanged, by default 0.5

    Example
    -------
    >>> from pymove.utils.data_augmentation import instance_crossover_augmentation
    >>>
    >>> df
                 id          local_label
    0     [1, 1, 1]       [85, 673, 394]
    1  [2, 2, 2, 2]  [85, 224, 623, 394]
    2     [3, 3, 3]      [263, 673, 394]
    >>>
    >>> aug_df = instance_crossover_augmentation(df)
    >>> aug_df
                 id          local_label
    0     [1, 1, 1]       [85, 673, 394]
    1  [2, 2, 2, 2]  [85, 224, 623, 394]
    2     [3, 3, 3]      [263, 673, 394]
    3     [1, 2, 2]       [85, 623, 394]
    4  [2, 2, 1, 1]  [85, 224, 673, 394]
    5  [2, 2, 3, 3]  [85, 224, 673, 394]
    6     [3, 2, 2]      [263, 623, 394]

    """
    df = data.copy()

    df[DESTINY] = df[label_local].apply(lambda x: x[-1])
    df[START] = df[label_local].apply(lambda x: x[0])

    frames = {}
    destinations = df[DESTINY].unique()
    for idx, dest in progress_bar(enumerate(destinations), total=len(destinations)):
        filter_ = df[df[DESTINY] == dest]

        if restriction == 'departure and destination':
            starts = filter_[START].unique()

            for st in progress_bar(starts, total=len(starts)):
                # filter into a new variable so each start is evaluated
                # against the full destination subset
                filter_start = filter_[filter_[START] == st]

                if filter_start.shape[0] >= 2:
                    frames[idx] = _augmentation(filter_start.iloc[:, :-2], frac=frac)
        else:
            if filter_.shape[0] >= 2:
                frames[idx] = _augmentation(filter_.iloc[:, :-2], frac=frac)

    return pd.concat([frames[i] for i in range(len(frames))], axis=0, ignore_index=True)
Example #34
def transition_graph_augmentation_all_vertex(
    traj_df: DataFrame,
    graph: DiGraph | None = None,
    min_path_size: int = 3,
    max_path_size: int = 6,
    max_sampling_source: int = 10,
    max_sampling_target: int = 10,
    source: dict | None = None,
    target: dict | None = None,
    label_local: Text = LOCAL_LABEL,
    simple_paths: bool = False,
    inplace: bool = True
) -> DataFrame:
    """
    Transition Graph Data Augmentation.

    Performs the data increase from the transition graph.

    Parameters
    ----------
    traj_df: DataFrame
        Trajectory data in sequence format
    graph: DiGraph
        Transition graph constructed from trajectory data
    min_path_size: int, optional
        Minimum number of points for the trajectory, by default 3
    max_path_size: int, optional
        Maximum number of points for the trajectory, by default 6
    max_sampling_source: int, optional
        Maximum number of paths to be returned,
        considering the observed origin, by default 10
    max_sampling_target: int, optional
        Maximum number of paths to be returned,
        considering the observed destination, by default 10
    source: dict, optional
        Degree of entry of each node in the graph, by default None
        Example: {node: degree-of-entry}
    target: dict, optional
        Degree of output of each node in the graph, by default None
        Example: {node: degree-of-output}
    label_local: str, optional
        Name of the column referring to the trajectories, by default LOCAL_LABEL
    simple_paths: boolean, optional
        If true, use the paths with the most used sections
        Otherwise, use paths with less used sections, by default False
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain the result
        of the augmentation, otherwise a copy will be returned, by default True

    Returns
    -------
    DataFrame
        Increased data set.

    Example
    -------
    >>> from pymove.utils.data_augmentation import (
            transition_graph_augmentation_all_vertex
        )
    >>>
    >>> traj_df.to_dict()
    {'id': [[1, 1, 1], [2, 2, 2, 2]],
     'datetime': [['2017-09-02 22:00:27', '2017-09-02 22:01:36',
                   '2017-09-02 22:03:08'],
                  ['2017-09-02 23:03:46', '2017-09-02 23:07:19',
                   '2017-09-02 23:07:40', '2017-09-02 23:09:10']],
     'local_label': [[85, 673, 394], [263, 224, 623, 394]],
     'lat': [[-3.8347478, -3.8235834, -3.813889],
             [-3.9067654, -3.8857223, -3.8828723, -3.9939834]],
     'lon': [[-38.592189, -38.590389, -38.5904445],
             [-38.5907723, -38.5928892, -38.5929789, -38.70409]]}
    >>>
    >>> transition_graph_augmentation_all_vertex(traj_df)
    [263.0, 224.0, 623.0]
    [224.0, 623.0, 394.0]

    """
    if inplace:
        traj_df_ = traj_df
    else:
        traj_df_ = traj_df.copy()

    if graph is None:
        graph = build_transition_graph_from_df(traj_df_)

    if source is None:
        source = dict(graph.nodes)
        source = {key: value['freq_source'] for key, value in source.items()}

    if target is None:
        target = dict(graph.nodes)
        target = {key: value['freq_target'] for key, value in target.items()}

    targets = sorted(target.items(), key=lambda x: x[1], reverse=True)
    sources = sorted(source.items(), key=lambda x: x[1], reverse=True)

    # plain nested loops instead of a side-effect-only list comprehension
    for t, _ in targets:
        for s, _ in sources:
            get_all_paths(
                traj_df_, graph, s, t, min_path_size, max_path_size,
                max_sampling_source, max_sampling_target, label_local, simple_paths
            )

    if not inplace:
        return traj_df_
Example #35
    def severDeal(self):
        # The 35 input widgets placele1..placele35 correspond, in order, to
        # the 35 four-column combinations of the columns 'a'..'g'
        # (abcd, abce, ..., defg), so the repeated per-widget blocks collapse
        # into a single loop (requires: from itertools import combinations).
        datadf = None
        for i, cols in enumerate(combinations('abcdefg', 4), start=1):
            text = getattr(self, f'placele{i}').text()
            if text == '':
                continue
            rows = [list(p) for p in text.replace('\n', '')
                                         .replace('\r', '')
                                         .replace('  ', ' ')
                                         .split(' ')]
            placedf = DataFrame(rows, columns=list(cols))
            # the first non-empty widget seeds the result; the rest are
            # inner-joined with it on their shared columns (the original
            # code's initial self-merge was redundant and is dropped)
            datadf = placedf if datadf is None else pd.merge(datadf, placedf)
        return datadf
Example #36
# Project name is passed as a command-line argument
if len(sys.argv) != 3:
    print("No argument len")
    sys.exit()

project = sys.argv[1]

# Maximum number of items to extract
max_num = int(sys.argv[2])

b = joblib.load(f"scripts/result/{project}_2.pkl")
b = DataFrame(b)

# Extract first-time users from b's requesters (one row per requester)
df_first_look = b.copy()
df_first_look.drop_duplicates(subset='requester', inplace=True)
#print(df_first_look.shape[0])
#print(df_first_look)

df_first_look = df_first_look[["predict_proba", "useful"]]

total_eval_u = 0
num_total = 0
loop_count = 1000
for i in range(loop_count):
    use_datetime = df_first_look.sample(frac=0.1)
    use_datetime = use_datetime.sort_values("predict_proba", ascending=False)

    num_data_u = len(use_datetime)