Example #1
    def test_replace_mixed(self):
        mf = self.mixed_frame
        mf.iloc[5:20, mf.columns.get_loc('foo')] = nan
        mf.iloc[-10:, mf.columns.get_loc('A')] = nan

        result = self.mixed_frame.replace(np.nan, -18)
        expected = self.mixed_frame.fillna(value=-18)
        assert_frame_equal(result, expected)
        assert_frame_equal(result.replace(-18, nan), self.mixed_frame)

        result = self.mixed_frame.replace(np.nan, -1e8)
        expected = self.mixed_frame.fillna(value=-1e8)
        assert_frame_equal(result, expected)
        assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)

        # int block upcasting
        df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                        'B': Series([0, 1], dtype='int64')})
        expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                              'B': Series([0.5, 1], dtype='float64')})
        result = df.replace(0, 0.5)
        assert_frame_equal(result, expected)

        df.replace(0, 0.5, inplace=True)
        assert_frame_equal(df, expected)

        # int block splitting
        df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                        'B': Series([0, 1], dtype='int64'),
                        'C': Series([1, 2], dtype='int64')})
        expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                              'B': Series([0.5, 1], dtype='float64'),
                              'C': Series([1, 2], dtype='int64')})
        result = df.replace(0, 0.5)
        assert_frame_equal(result, expected)

        # to object block upcasting
        df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                        'B': Series([0, 1], dtype='int64')})
        expected = DataFrame({'A': Series([1, 'foo'], dtype='object'),
                              'B': Series([0, 1], dtype='int64')})
        result = df.replace(2, 'foo')
        assert_frame_equal(result, expected)

        expected = DataFrame({'A': Series(['foo', 'bar'], dtype='object'),
                              'B': Series([0, 'foo'], dtype='object')})
        result = df.replace([1, 2], ['foo', 'bar'])
        assert_frame_equal(result, expected)

        # test case from
        df = DataFrame({'A': Series([3, 0], dtype='int64'),
                        'B': Series([0, 3], dtype='int64')})
        result = df.replace(3, df.mean().to_dict())
        expected = df.copy().astype('float64')
        m = df.mean()
        expected.iloc[0, 0] = m[0]
        expected.iloc[1, 1] = m[1]
        assert_frame_equal(result, expected)
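A minimal, self-contained sketch of the int-block behaviour the test above exercises: replacing an integer value with a float upcasts the affected int64 column to float64, while untouched int64 columns keep their dtype.

import pandas as pd

df = pd.DataFrame({'A': pd.Series([1.0, 2.0], dtype='float64'),
                   'B': pd.Series([0, 1], dtype='int64'),
                   'C': pd.Series([1, 2], dtype='int64')})

result = df.replace(0, 0.5)   # only column 'B' contains a 0
print(result.dtypes)          # 'B' is upcast to float64, 'C' stays int64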
Example #2
import pandas as pd
from pandas import DataFrame


def kurtosis(path, columns):
    # read only the requested columns
    w = pd.read_csv(path, usecols=columns)

    frame = DataFrame(w)

    h = len(w)
    print(h)

    t = frame.mean()
    d = frame.std()

    # fourth power of the standardized deviations
    e = ((w - t) / d) ** 4
    g = e.sum()

    # bias-corrected (sample) excess kurtosis
    p1 = h * (h + 1)
    p2 = float((h - 1) * (h - 2) * (h - 3))
    p3 = float(3 * ((h - 1) ** 2))
    p4 = (h - 2) * (h - 3)

    i = ((p1 / p2) * g) - (p3 / p4)

    print('kurtosis=', i)
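For comparison, a small sketch (assuming the intent above is the standard bias-corrected excess kurtosis) showing that the same formula applied to an in-memory frame matches pandas' built-in DataFrame.kurt():

from pandas import DataFrame

frame = DataFrame({'x': [1.0, 2.0, 4.0, 7.0, 11.0, 16.0]})
h = len(frame)
g = (((frame - frame.mean()) / frame.std()) ** 4).sum()
manual = ((h * (h + 1) / float((h - 1) * (h - 2) * (h - 3))) * g
          - 3.0 * (h - 1) ** 2 / ((h - 2) * (h - 3)))
print(manual)        # bias-corrected excess kurtosis computed by hand
print(frame.kurt())  # the pandas built-in gives the same value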
Example #3
File: he.py Project: edawine/fatools
def summarize_he( analytical_sets ):

    results = {}
    he = {}

    for analytical_set in analytical_sets:
        he[analytical_set.label] = calculate_he(analytical_set.allele_df)

    he_df = DataFrame( he )
    labels = list(he_df.columns)
    if len(labels) == 2:
        # use Mann-Whitney / Wilcoxon test
        results['test'] = 'Wilcoxon test (paired)'
        results['stats'] = wilcoxon( he_df[labels[0]], he_df[labels[1]])

    elif len(labels) > 2:
        # use Kruskal Wallis
        results['test'] = 'Kruskal-Wallis test'
        results['stats'] = kruskal( * [he_df[x] for x in labels])
        results['warning'] = ''

    results['data'] = he_df
    results['mean'] = he_df.mean()
    results['stddev'] = he_df.std()
    #raise RuntimeError

    return results
Example #4
import pandas as pd
from pandas import DataFrame


def mydeviate(path, columns, Deviation=0, MeanAbsDeviation=1, MeanSqDev=0):
    # read only the requested columns
    w = pd.read_csv(path, usecols=columns)

    s = DataFrame(w)
    t = s.mean()

    if Deviation == 1:
        # deviation from the column mean
        print(w - t)

    if MeanAbsDeviation == 1:
        # absolute deviation from the column mean
        print((w - t).abs())

    if MeanSqDev == 1:
        # squared deviation from the column mean
        print((w - t) ** 2)

    return
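The three branches above correspond to one-line vectorized pandas expressions; a small sketch on an in-memory frame (the numbers are made up for illustration):

import pandas as pd

w = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [10.0, 20.0, 30.0, 40.0]})
t = w.mean()

print(w - t)          # deviation from the column mean
print((w - t).abs())  # absolute deviation
print((w - t) ** 2)   # squared deviation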
Example #5
class LogAggregate:
    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    def get_median(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).median()[kwarg['key']]
        else:
            return self.dataset.median()[kwarg['key']]

    def get_average(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).mean()[kwarg['key']]
        else:
            return self.dataset.mean()[kwarg['key']]

    def get_min(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).min()[kwarg['key']]
        else:
            return self.dataset.min()[kwarg['key']]
    
    def get_max(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).max()[kwarg['key']]
        else:
            return self.dataset.max()[kwarg['key']]

    def get_count(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).count()[kwarg['key']]
        else:
            return self.dataset.count()[kwarg['key']]
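A brief usage sketch for the aggregator above; the log records and field names are made up for illustration:

records = [
    {'endpoint': '/home', 'latency_ms': 120},
    {'endpoint': '/home', 'latency_ms': 80},
    {'endpoint': '/api', 'latency_ms': 45},
    {'endpoint': '/api', 'latency_ms': 55},
]

agg = LogAggregate(records)
print(agg.get_median(group_by='endpoint', key='latency_ms'))  # per-endpoint median latency
print(agg.get_count(group_by='endpoint', key='latency_ms'))   # requests per endpoint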
Example #6
import pandas as pd
from pandas import DataFrame


def skewness(path, columns):
    # read only the requested columns
    w = pd.read_csv(path, usecols=columns)

    frame = DataFrame(w)

    h = len(w)

    t = frame.mean()
    d = frame.std()

    # third power of the standardized deviations
    e = ((w - t) / d) ** 3
    g = e.sum()

    # bias-corrected (sample) skewness
    i = (h * g) / ((h - 1) * (h - 2))

    print('skewness=', i)
Example #7
 def preprocessing(self, df: pd.DataFrame):
     # Features wrangling
     df_features = df.iloc[:, 5:]
     df_features = df_features.fillna(df.mean())
     df_features = np.array(df_features)
     df_features = np.apply_along_axis(self._normalize, 0, df_features)  # keep the normalized array
     # Labels wrangling
     df_labels = np.array(df.loc[:, "Hogwarts House"])
     return df_features, df_labels
Example #8
def summary_statistics(data_set: pd.DataFrame) -> pd.DataFrame:
    summary_data = dict()

    summary_data['mean'] = data_set.mean(numeric_only=True)
    summary_data['std'] = data_set.std(ddof=1, numeric_only=True)
    summary_data['min'] = data_set.min(numeric_only=True)
    summary_data['max'] = data_set.max(numeric_only=True)

    return pd.DataFrame(summary_data).T
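A quick sketch showing the helper next to pandas' own describe(), which exposes the same four statistics among others (the sample frame is made up):

import pandas as pd

data_set = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 40.0]})

print(summary_statistics(data_set))
print(data_set.describe().loc[['mean', 'std', 'min', 'max']])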
Example #9
class CalWeight:
    def __init__(self, step, risk_aversion):
        self.risk_aversion = risk_aversion
        if step == 0:
            self.start = '2012-01-01'
            self.end = '2014-12-31'
        elif step == 1:
            self.start = '2012-01-01'
            self.end = '2015-2-28'
        elif step == 2:
            self.start = '2012-01-01'
            self.end = '2015-4-30'
        secIDs = [
            '000300.ZICN', '000905.ZICN', '399006.ZICN', 'SPX.ZIUS',
            '000012.ZICN', '000013.ZICN'
        ]
        self.rtn_table = DataFrame()
        for secID in secIDs:
            cp = self.get_return(secID)
            cp.name = secID
            self.rtn_table = pd.concat([self.rtn_table, cp], axis=1)
        self.rtn_table.fillna(0, inplace=True)
        self.cov_mat = self.rtn_table.cov() * 250
        self.exp_rtn = self.rtn_table.mean() * 250

    def get_return(self, ticker):
        tmp_lst = []
        fname = PERFIX + 'data_' + ticker + '.csv'
        with open(fname, 'r') as f:
            reader = csv.reader(f)
            for row in reader:
                tmp_lst.append(row)
        df = pd.DataFrame(tmp_lst[1:], columns=tmp_lst[0])
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.set_index("Date")
        df = df[self.start:self.end]
        temp = df['Close'].astype('float64').pct_change().fillna(0.)
        return temp

    def get_weight(self):
        risk_aversion = self.risk_aversion
        P = risk_aversion * matrix(self.cov_mat.values)
        q = -1 * matrix(self.exp_rtn.values)
        G = matrix(
            np.vstack((np.diag(np.ones(len(self.exp_rtn))),
                       np.diag(-np.ones(len(self.exp_rtn))))))
        h = matrix(
            np.array([np.ones(len(self.exp_rtn)),
                      np.zeros(len(self.exp_rtn))
                      ]).reshape(len(self.exp_rtn) * 2, 1))
        A = matrix(np.ones(len(self.exp_rtn)), (1, len(self.exp_rtn)))
        b = matrix([1.0])
        solvers.options['show_progress'] = False
        sol = solvers.qp(P, q, G, h, A, b)
        return DataFrame(index=self.exp_rtn.index,
                         data=np.round(sol['x'], 2),
                         columns=['weight'])  # weights rounded to two decimal places
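The qp() call above solves a standard mean-variance problem: minimize (risk_aversion/2) * w'Cw - m'w subject to 0 <= w <= 1 and the weights summing to 1. A numpy-only sketch of the unconstrained solution w = (1/risk_aversion) * inv(C) * m, with toy numbers and for intuition only; the constrained cvxopt answer differs whenever the bounds bind:

import numpy as np

risk_aversion = 3.0
cov = np.array([[0.04, 0.01],
                [0.01, 0.09]])   # annualized covariance (toy numbers)
mu = np.array([0.08, 0.12])      # annualized expected returns (toy numbers)

w = np.linalg.solve(risk_aversion * cov, mu)  # unconstrained optimum
w = w / w.sum()                               # rescale so the weights sum to 1
print(np.round(w, 2))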
Example #10
def _plot_stats_attribute(stats_list: Sequence[Stats], attribute: str, label, ax=None):
    """Plot a certain attribute of a collection of histories."""
    data = np.asarray([getattr(h, attribute) for h in stats_list])
    df = DataFrame(data.T)

    df_mean = df.mean(axis=1)
    df_std = df.std(axis=1)
    sns_ax = sns.lineplot(df_mean.index, df_mean, label=label, ax=ax)
    sns_ax.fill_between(df_mean.index, df_mean - df_std, df_mean + df_std, alpha=0.3)
Example #11
def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None):
    """
    Plot the means and standard deviations of each dataset.

    :param real: DataFrame containing the real data
    :param fake: DataFrame containing the fake data
    :param ax: Axis to plot on. If none, a new figure is made.
    """
    show = ax is None
    if ax is None:
        fig, ax = plt.subplots(1, 2, figsize=(10, 5))
        fig.suptitle('Absolute Log Mean and STDs of numeric data\n', fontsize=16)

    ax[0].grid(True)
    ax[1].grid(True)
    real = real._get_numeric_data()
    fake = fake._get_numeric_data()
    real_mean = np.log(np.add(abs(real.mean()).values, 1e-5))
    fake_mean = np.log(np.add(abs(fake.mean()).values, 1e-5))
    min_mean = min(real_mean) - 1
    max_mean = max(real_mean) + 1
    line = np.arange(min_mean, max_mean)
    sns.lineplot(x=line, y=line, ax=ax[0])
    sns.scatterplot(x=real_mean,
                    y=fake_mean,
                    ax=ax[0])
    ax[0].set_title('Means of real and fake data')
    ax[0].set_xlabel('real data mean (log)')
    ax[0].set_ylabel('fake data mean (log)')

    real_std = np.log(np.add(real.std().values, 1e-5))
    fake_std = np.log(np.add(fake.std().values, 1e-5))
    min_std = min(real_std) - 1
    max_std = max(real_std) + 1
    line = np.arange(min_std, max_std)
    sns.lineplot(x=line, y=line, ax=ax[1])
    sns.scatterplot(x=real_std,
                    y=fake_std,
                    ax=ax[1])
    ax[1].set_title('Stds of real and fake data')
    ax[1].set_xlabel('real data std (log)')
    ax[1].set_ylabel('fake data std (log)')

    if show:
        plt.show()
Example #12
def variation_statistic(gene_data: pd.DataFrame) -> pd.Series:
    """
    Calculate std/mean for each gene and replace nan with 0
    :param gene_data: Expression DF with genes in rows. Calculations are performed for each row across features.
    :return: Series with statistic for each row
    """
    statistic = gene_data.std(axis=1) / gene_data.mean(axis=1)
    # statistic = gene_data.std(axis=1)
    # TODO How to deal with 0 expressed genes? Are they informative?????
    return statistic.replace(np.nan, 0)
Example #13
    def fit(self, X: pd.DataFrame, y=None):
        self._reset()

        if self.with_mean:
            self.mean_ = X.mean()

        if self.with_std:
            self.scale_ = X.std(ddof=0)

        return self
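The fit above only stores the column statistics; a self-contained sketch of the standardization they would presumably be used for (the matching transform step is not part of the original snippet):

import pandas as pd

X = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 40.0]})

mean_ = X.mean()        # what fit() stores when with_mean is True
scale_ = X.std(ddof=0)  # what fit() stores when with_std is True

print((X - mean_) / scale_)  # standardized data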
Example #14
    def test_mean_datetimelike_numeric_only_false(self):
        df = DataFrame(
            {
                "A": np.arange(3),
                "B": pd.date_range("2016-01-01", periods=3),
                "C": pd.timedelta_range("1D", periods=3),
            }
        )

        # datetime(tz) and timedelta work
        result = df.mean(numeric_only=False)
        expected = Series({"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"]})
        tm.assert_series_equal(result, expected)

        # mean of period is not allowed
        df["D"] = pd.period_range("2016", periods=3, freq="A")

        with pytest.raises(TypeError, match="mean is not implemented for Period"):
            df.mean(numeric_only=False)
Example #15
    def mean_centered(self, utility_matrix: pd.DataFrame) -> pd.DataFrame:
        """

        :param utility_matrix:
        :return:
        """
        mean_centered_utility_matrix = utility_matrix.sub(
            utility_matrix.mean())

        return self.data.similarity_matrix_cosine(mean_centered_utility_matrix)
Example #16
    def table(self, extractedData: DataFrame, name: str):
        filename = self.filename

        self.filename = filename + '_all.txt'
        self._table(extractedData, name)

        mean = extractedData.mean(axis=0)
        std = extractedData.std(axis=0)
        self.filename = filename + '_average.txt'
        self._table(mean, name, std)
Example #17
    def get_topwords(self, countries, thresh=10, tf_idf=False):
        tw = DataFrame()
        for r in range(len(self.df)):
            if self.df.loc[r, 'country_id'] in countries:
                if tf_idf:
                    tw = tw.append(self.tf_idf.loc[r, :])
                else:
                    tw = tw.append(self.df.loc[r, :])

        return tw.mean().sort_values(ascending=False)[:thresh]
Example #18
def scatter_peaks_no_peaks(
    top_eco: pd.DataFrame,
    top_naked: pd.DataFrame,
    non_top_eco: pd.DataFrame,
    non_top_naked: pd.DataFrame,
    ax: plt.Axes = None,
):
    if ax is None:
        _, ax = plt.subplots(figsize=(12, 12))
    ax.set_xlabel("Chromatin")
    ax.set_ylabel("Naked")
    ax.scatter(
        non_top_eco,
        non_top_naked,
        alpha=0.2,
        label="All Points",
    )
    ax.scatter(top_eco, top_naked, label="Open ATAC")

    ax.axvline(non_top_eco.mean(), color="C0")
    ax.axvline(top_eco.mean(), color="C1")
    ax.axhline(non_top_naked.mean(), color="C0")
    ax.axhline(top_naked.mean(), color="C1")

    ax.legend(
        loc="upper right",
        frameon=False,
        shadow=False,
    )
    # We concatenate the two DFs into a single one so that the dropna() call
    # stays synced between the two of them
    top = pd.DataFrame({"chrom": top_eco, "naked": top_naked}).dropna(axis=0)
    all_ = pd.DataFrame({
        "chrom": non_top_eco,
        "naked": non_top_naked
    }).dropna(axis=0)
    r_top, _ = scipy.stats.pearsonr(top.loc[:, "chrom"], top.loc[:, "naked"])
    r_all, _ = scipy.stats.pearsonr(all_.loc[:, "chrom"], all_.loc[:, "naked"])
    ax.text(0.01,
            0.8,
            f"R (top) = {r_top} \nR (rest) = {r_all}",
            transform=ax.transAxes)
    return ax
Example #19
def get_sharpe_ratios(df_returns: DataFrame,
                      risk_free_rate: float = 0.0,
                      periods_per_annum: int = 252) -> Series:
    """
    Helper function to calculate the (annualized) Sharpe Ratios of the financial
    instruments contained in the input dataframe.
    """
    numer = (df_returns.mean(axis=0) - risk_free_rate) * periods_per_annum
    denom = np.sqrt(df_returns.var(axis=0) * periods_per_annum)
    return numer / denom
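A short usage sketch with synthetic daily returns; the tickers and numbers are made up, and risk_free_rate is interpreted per period, as in the function above:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df_returns = pd.DataFrame(rng.normal(0.0005, 0.01, size=(252, 2)),
                          columns=['AAA', 'BBB'])  # hypothetical tickers

print(get_sharpe_ratios(df_returns))                         # zero risk-free rate
print(get_sharpe_ratios(df_returns, risk_free_rate=0.0001))  # per-period risk-free rate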
Example #20
import numpy as np
from pandas import DataFrame, Series


def pd_03():
    df = DataFrame(np.random.randn(6, 3))
    df.iloc[2:, 1] = np.nan
    df.iloc[4:, 2] = np.nan
    print(df)
    print(df.ffill())           # forward-fill missing values
    print(df.ffill(limit=2))    # forward-fill at most two consecutive rows
    data = Series([1., None, 3.5, None, 7])
    print(data.fillna(data.mean()))  # fill with the series mean
    print(df.fillna(df.mean()))      # fill each column with its mean
Example #21
def get_mean_by_bin(df: pd.DataFrame) -> pd.Series:
    """
    Takes all the sweep data from the input dataframe as returned by read_hackrf_sweep_file_and_merge
    and gets the average db for each bin.
    Returns as a pandas Series

    :param df: pd.DataFrame from experiment in question
    :return: pd.Series of average
    """
    return df.mean(axis=0)
Example #22
    def test_replace_series_dict(self):
        # from GH 3064
        df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
        result = df.replace(0, {"zero": 0.5, "one": 1.0})
        expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}})
        tm.assert_frame_equal(result, expected)

        result = df.replace(0, df.mean())
        tm.assert_frame_equal(result, expected)

        # series to series/dict
        df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
        s = Series({"zero": 0.0, "one": 2.0})
        result = df.replace(s, {"zero": 0.5, "one": 1.0})
        expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}})
        tm.assert_frame_equal(result, expected)

        result = df.replace(s, df.mean())
        tm.assert_frame_equal(result, expected)
Example #23
    def test_mean_datetimelike(self):
        # GH#24757 check that datetimelike are excluded by default, handled
        #  correctly with numeric_only=True

        df = DataFrame({
            "A": np.arange(3),
            "B": pd.date_range("2016-01-01", periods=3),
            "C": pd.timedelta_range("1D", periods=3),
            "D": pd.period_range("2016", periods=3, freq="A"),
        })
        result = df.mean(numeric_only=True)
        expected = Series({"A": 1.0})
        tm.assert_series_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning):
            # in the future datetime columns will be included
            result = df.mean()
        expected = Series({"A": 1.0, "C": df.loc[1, "C"]})
        tm.assert_series_equal(result, expected)
Example #24
 def normalize_data(self, data: pd.DataFrame, idx) -> pd.DataFrame:
     """
     All values should be normalized to range(-1,1).
     :param data: The data to normalize.
     :param idx: An id for remembering normalization values in class.
     :return: Normalized DataFrame.
     """
     self.mean[idx] = data.mean()
     self.ptp[idx] = data.max() - data.min()
     return (data - self.mean[idx]) / self.ptp[idx]
Example #25
    def test_mean_excludes_datetimes(self, tz):
        # https://github.com/pandas-dev/pandas/issues/24752
        # Our long-term desired behavior is unclear, but the behavior in
        # 0.24.0rc1 was buggy.
        df = DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2})
        with tm.assert_produces_warning(FutureWarning):
            result = df.mean()

        expected = Series(dtype=np.float64)
        tm.assert_series_equal(result, expected)
Example #26
def remove_outliers(df: pd.DataFrame, zscore: int = 3) -> pd.DataFrame:
    """
    Removes all rows from the given DataFrame containing outliers in any of the columns.

    :param df: Input DataFrame.
    :param zscore: z-score to use when calculating outliers.
    :return: The DataFrame with all outliers removed.
    """
    scores = (df - df.mean()) / df.std(ddof=0).values
    return df[(np.abs(scores) < zscore).all(axis=1)]
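A quick sketch of the helper on a frame with one obvious outlier (toy data):

import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 1.5, 2.5, 1.8, 2.2, 1.9, 2.1, 2.4, 1.6, 2.0, 100.0],
                   'b': [5.0, 6.0, 5.5, 6.5, 5.8, 6.2, 5.9, 6.1, 6.4, 5.6, 6.0, 5.2]})

print(remove_outliers(df))  # drops the row where a == 100.0 and keeps the rest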
Example #27
File: color.py Project: michael92ht/CBIC
def moments_features(path):
    if not os.path.exists(path):
        logger.error(path + " does not exist!")
        return
    im = cv2.imread(path)
    [b, g, r] = cv2.split(im)
    moments = []
    for n in [b, g, r]:
        df = DataFrame(np.array(n.flatten()))
        moments.extend(float(x) for x in [df.mean()[0], df.std()[0], df.skew()[0]])
    return moments
Example #28
 def __init__(self, df):
     scaler = MinMaxScaler(feature_range=(0, 100))
     df_scaled = DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)  # keep the columns and, above all, the index consistent
     df_corr = df_scaled.corr()
     df_stat = DataFrame([df_scaled.apply(lambda x: np.sqrt(np.var(x)))], index=['sd'])  # standard deviation
     df_stat = df_stat.append(Series(df_scaled.mean(), name='mean'))                     # mean
     df_stat = df_stat.append(Series(df_scaled.apply(self.get_entropy), name='entropy')) # entropy
     self.df_stat = df_stat.append(Series(df_corr.applymap(lambda x: 1-x).sum(), name='critic_part'))  # CRITIC component
     self.df = df
     self.scaler = scaler        # normalization scaler
     self.df_scaled = df_scaled  # normalized data, same layout as df
Example #29
def normalize(
        data: pd.DataFrame,
        normalization_type: NormalizationType = NormalizationType.STANDARD):
    if normalization_type == NormalizationType.STANDARD:
        data_mean = data.mean(axis=0)
        data_std = data.std(axis=0)
        return (data - data_mean) / data_std, data_mean, data_std
    else:
        data_min = data.min(axis=0)
        data_max = data.max(axis=0)
        return (data - data_min) / (data_max - data_min), data_min, data_max
Example #30
    def rescale(data: pd.DataFrame) -> np.ndarray:
        means = data.mean(axis=0)
        variances = data.var(axis=0)

        scaled_data = data.copy(deep=True)
        for row in range(data.shape[0]):
            for col in range(data.shape[1]):
                scaled_data.iloc[row,
                                 col] = (data.iloc[row, col] -
                                         means[col]) / np.sqrt(variances[col])
        return scaled_data, means, np.sqrt(variances)
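The double loop above standardizes cell by cell; a vectorized sketch that produces the same result (data.var() defaults to ddof=1, so np.sqrt of it is the same scale the loop divides by):

import numpy as np
import pandas as pd

data = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'y': [2.0, 4.0, 8.0]})

means = data.mean(axis=0)
stds = np.sqrt(data.var(axis=0))  # sample standard deviation, ddof=1
print((data - means) / stds)      # same values as the element-wise loop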
Example #31
def _print_full_performance(performance: pd.DataFrame,
                            verbosity='mean') -> None:
    """ Prints the accuracy, precision, recall and f-1 score
    Args:
        performance: performance of the predictor as a DataFrame
        verbosity: if 'mean' will print only the mean value(s), otherwise, will print for each label
    """
    if verbosity == 'mean':
        print(performance.mean().astype(float).round(3))
    else:
        print(performance.astype(float).round(3))
Example #32
def combine_spread(file_set, shift, drop_return_data=False):
    """
    Combine the spread of input files, return with mean and standard
    deviation calculated.

    """

    data = []
    values = {}
    for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'):
        values[val] = {}

    # Collect data from all files into dictionaries
    for i, _file in enumerate(file_set):
        data.append(Spread().read(_file))
        for val in values.keys():
            values[val][i] = Series(
                    data=data[i].spread[val]['val'],
                    index=data[i].times
                    )
        data[i].times = (np.array(data[i].times) - shift[i])

    spread = Spread()
    spread.spread['num'] = len(file_set)

    for val in values.keys():

        # Shift time as per synchronisation
        for i in values[val]:
            values[val][i].index = np.array(values[val][i].index) - shift[i]

        # Convert to DataFrame
        df = DataFrame(data=values[val])

        # If not a single file, keep only indices with at least two non-NaN
        if len(file_set) > 1:
            df = df.dropna()

        # If return data dropped, fill data here
        if drop_return_data:
            for i in df.columns:
                data[i].spread[val]['val'] = df[i].tolist()

        # Get times, mean and standard error as lists
        mean = list(df.mean(axis=1))
        std_error = list(df.std(axis=1))
        times = list(df.index)

        # Add to Spread object
        spread.spread[val]['val'] = mean
        spread.spread[val]['std'] = std_error
        spread.spread['times'] = times

    return spread, data
Example #33
def cv(df: pd.DataFrame, fill_value: Optional[float] = None) -> pd.Series:
    """
    Computes the Coefficient of variation for each column.

    Used by DataContainer objects to compute metrics.

    """
    res = df.std() / df.mean()
    if fill_value is not None:
        res = res.fillna(fill_value)
    return res
Example #34
File: ex01_df.py Project: venuur/pycli
def avg_columns(df:pd.DataFrame) -> pd.Series:
    '''Calculates mean of all columns in DataFrame.

    Args:
        df: Data frame of values to average.

    Returns:
        Series of means indexed by columns.
    '''

    return df.mean()
Example #35
def get_average_curve(input_csv: pd.DataFrame) -> pd.DataFrame:
    r"""
    Find the generalized curve to represent the class

    :param input_csv: raw class data
    :return: data points for generalized curve

    """
    average_series = input_csv.mean(axis=0)
    generalised = pd.DataFrame(average_series).transpose()
    return generalised
Example #36
 def normalize(self, signal: pd.DataFrame) -> pd.DataFrame:
     """Apply normalization
     Args:
         signal (pd.DataFrame): Raw signal
     Returns:
         signal (pd.DataFrame): Normalized signal
     """
     df_mean = signal.mean()
     df_std = signal.std()
     signal = (signal - df_mean) / df_std
     return signal
Example #37
def select_centroids_smart(df: pd.DataFrame, k: int, get_dist=get_euclidean_distances) -> np.ndarray:
    points = pd.DataFrame(df.mean(axis=0)).T
    i = 1
    while i < k:
        dists = get_dist(df, points).sum(axis=1)
        furthest = np.argmax(dists)
        next_point = pd.DataFrame(df.iloc[furthest]).T
        points = points.append(next_point)
        df = drop_df(df, df.iloc[furthest])
        i += 1
    return points.values
Example #38
def fillNan(matrix: pd.DataFrame, type: str = 'value', value: float = 0):
    """
    :param matrix:
    :param type: one of ['value', 'col_avg', 'row_avg']
    :param value: float
    :return:
    """
    filled_matrix = matrix.isna()
    result_matrix = matrix.copy()

    if type == 'value':
        result_matrix = matrix.fillna(value)
    elif type == 'col_avg':
        col_avg = matrix.mean(axis=0)
        result_matrix = matrix.fillna(col_avg)
    elif type == 'row_avg':
        row_avg = matrix.mean(axis=1)
        result_matrix = matrix.T.fillna(row_avg).T

    return result_matrix, filled_matrix
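A small usage sketch of the column and row average fills (a toy ratings matrix with missing entries):

import numpy as np
import pandas as pd

matrix = pd.DataFrame({'item1': [5.0, np.nan, 3.0],
                       'item2': [np.nan, 4.0, 2.0]})

filled_cols, mask = fillNan(matrix, type='col_avg')  # NaNs become column means
filled_rows, _ = fillNan(matrix, type='row_avg')     # NaNs become row means
print(filled_cols)
print(mask)  # True where a value was missing and has been filled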
Example #39
def _maximum_likelihood_pairs(pairings: DataFrame, ret_largest: bool = True):
    """
    Given a pairings, choose the maximum likely pairing.
    """
    pm = pairings.mean(axis=1)
    if pm.gt(0).sum() == 0:
        raise ValueError("There is no crossover between these datasets")
    elif pm.gt(0).sum() == 1 or ret_largest:
        return pm.idxmax(), pm.max()
    else:
        return pm[pm.gt(0)]
Example #40
def resumo_disciplinas(dados: pd.DataFrame) -> pd.DataFrame:
    media_disciplina = dados.mean(axis=0).round(2)

    numero_acessos = dados[dados != 0].count(axis=0)

    tabela = pd.DataFrame(
        columns=media_disciplina.index,
        data=[media_disciplina.values, numero_acessos],
        index=['Média de Acesso por Caderno', 'Número de Acesso por Caderno '])

    return tabela
Example #41
    def test_replace_series_dict(self):
        # from GH 3064
        df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
        result = df.replace(0, {'zero': 0.5, 'one': 1.0})
        expected = DataFrame(
            {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 2.0, 'b': 1.0}})
        assert_frame_equal(result, expected)

        result = df.replace(0, df.mean())
        assert_frame_equal(result, expected)

        # series to series/dict
        df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
        s = Series({'zero': 0.0, 'one': 2.0})
        result = df.replace(s, {'zero': 0.5, 'one': 1.0})
        expected = DataFrame(
            {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 1.0, 'b': 0.0}})
        assert_frame_equal(result, expected)

        result = df.replace(s, df.mean())
        assert_frame_equal(result, expected)
Example #42
def BackTestSignal(dfXAlpha, dfXReturn, XPrice, strategy, riskmgr=None, freq = 252):
    dfAlphaWeight  = strategy.GenSingleAlphaWeight(dfXAlpha)
    if riskmgr is not None:
        dfAlphaWeight = riskmgr.AdjustAlphaWeight(dfAlphaWeight)
    dfSignalReturn = GenSingleFactorReturn(dfAlphaWeight, dfXReturn)
    
    # (simple_sharpe, geo_sharpe, sim_mean * N, geo_mean * N, vol)
    sharpe = CalcSharpeRatio(dfSignalReturn['Return'], freq)
    
    # Detailed Data
    dfLongCount = DataFrame(columns=['LongCount'], data=dfAlphaWeight.apply(lambda s: s[s>0].count(), axis=1))
    dfShortCount = DataFrame(columns=['ShortCount'], data=dfAlphaWeight.apply(lambda s: s[s<0].count(), axis=1))
    dfLongExposure = DataFrame(columns=['LongExposure'], data=dfAlphaWeight.apply(lambda s: s[s>0].sum(), axis=1))
    dfShortExposure = DataFrame(columns=['ShortExposure'], data=dfAlphaWeight.apply(lambda s: s[s<0].sum(), axis=1))
    dfNetExposure = DataFrame(columns=['NetExposure'], data=dfAlphaWeight.apply(sum, axis=1))
    dfTotalDollarInvest = DataFrame(columns=['I'], data=dfAlphaWeight.apply(lambda s: abs(s).sum(), axis=1))
    dfTotalDollarTraded = DataFrame(columns=['D'], data=(dfAlphaWeight - dfAlphaWeight.shift(1)).apply(lambda s: abs(s).sum(), axis=1))
    dfSharesTraded = dfAlphaWeight / XPrice
    dfTotalSharesTraded = DataFrame(columns=['Q'], data=(dfSharesTraded - dfSharesTraded.shift(1)).apply(lambda s: abs(s).sum(), axis=1))
    
    TurnOver = dfTotalDollarTraded.mean()[0]/dfTotalDollarInvest.mean()[0]
    CentsPerShare = 100 * dfSignalReturn['Return'].iloc[1:].mean() / dfTotalSharesTraded.mean()[0]
    
    dfMetrics = DataFrame(list(sharpe)).T
    dfMetrics.columns = ['Simple Sharpe', 'Geo. Sharpe', 'Simple Mean', 'Geo. Mean', 'Annual Vol']
    dfMetrics['Turnover'] = TurnOver
    dfMetrics['CentsPerShare'] = CentsPerShare
    dfMetrics['AvgHolding'] = 1.0/TurnOver
    dfMetrics.index = [dfXAlpha.index.name]
    
    dfSignalReturn = pd.merge(dfSignalReturn, dfLongCount,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfShortCount,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfLongExposure,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfShortExposure,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfNetExposure,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfTotalDollarInvest,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfTotalDollarTraded,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfTotalSharesTraded,left_index=True,right_index=True,how='outer')    
    
    return dfMetrics, dfSignalReturn, dfAlphaWeight
Example #43
    def test_align_int_fill_bug(self):
        # GH #910
        X = np.arange(10 * 10, dtype='float64').reshape(10, 10)
        Y = np.ones((10, 1), dtype=int)

        df1 = DataFrame(X)
        df1['0.X'] = Y.squeeze()

        df2 = df1.astype(float)

        result = df1 - df1.mean()
        expected = df2 - df2.mean()
        assert_frame_equal(result, expected)
Example #44
import pandas as pd
from pandas import DataFrame


def stndize(path, columns):
    # read only the requested columns
    w = pd.read_csv(path, usecols=columns)
    frame = DataFrame(w)

    t = frame.mean()
    print(t)
    z = frame.std()
    print(z)
    # standardized (z-score) values
    print((w - t) / z)

    return
Example #45
    def test_common_start_returns(self, before, after, mean_by_date, demeaned,
                                  expected_vals):
        dr = date_range(start='2015-1-17', end='2015-2-2')
        dr.name = 'date'
        tickers = ['A', 'B', 'C', 'D']
        r1, r2, r3, r4 = (1.20, 1.40, 0.90, 0.80)
        prices = DataFrame(index=dr, columns=tickers,
                           data=[[r1**1, r2**1, r3**1, r4**1],
                                 [r1**2, r2**2, r3**2, r4**2],
                                 [r1**3, r2**3, r3**3, r4**3],
                                 [r1**4, r2**4, r3**4, r4**4],
                                 [r1**5, r2**5, r3**5, r4**5],
                                 [r1**6, r2**6, r3**6, r4**6],
                                 [r1**7, r2**7, r3**7, r4**7],
                                 [r1**8, r2**8, r3**8, r4**8],
                                 [r1**9, r2**9, r3**9, r4**9],
                                 [r1**10, r2**10, r3**10, r4**10],
                                 [r1**11, r2**11, r3**11, r4**11],
                                 [r1**12, r2**12, r3**12, r4**12],
                                 [r1**13, r2**13, r3**13, r4**13],
                                 [r1**14, r2**14, r3**14, r4**14],
                                 [r1**15, r2**15, r3**15, r4**15],
                                 [r1**16, r2**16, r3**16, r4**16],
                                 [r1**17, r2**17, r3**17, r4**17]])
        dr2 = date_range(start='2015-1-21', end='2015-1-29')
        factor = DataFrame(index=dr2, columns=tickers,
                           data=[[3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1]]).stack()
        factor.index = factor.index.set_names(['date', 'asset'])
        factor.name = 'factor'

        cmrt = common_start_returns(
            factor,
            prices,
            before,
            after,
            False,
            mean_by_date,
            factor if demeaned else None)
        cmrt = DataFrame({'mean': cmrt.mean(axis=1), 'std': cmrt.std(axis=1)})
        expected = DataFrame(index=range(-before, after + 1),
                             columns=['mean', 'std'], data=expected_vals)
        assert_frame_equal(cmrt, expected)
Example #46
def calc_kelly_leverages(securities: Set[str],
                         start_date: date,
                         end_date: date,
                         risk_free_rate: float = 0.04) -> Dict[str, float]:
    """Calculates the optimal leverages for the given securities and
    time frame. Returns a list of (security, leverage) tuple with the
    calculate optimal leverages.

    Note: risk_free_rate is annualized
    """
    f = {}
    ret = {}
    excess_return = {}

    # Download the historical prices from Yahoo Finance and calculate the
    # excess return (return of security - risk free rate) for each security.
    for symbol in securities:
        try:
            hist_prices = get_historical_data(
                symbol, start=start_date, end=end_date,
                output_format='pandas')
        except IOError as e:
            raise ValueError(f'Unable to download data for {symbol}. '
                             f'Reason: {str(e)}')

        f[symbol] = hist_prices

        ret[symbol] = hist_prices['close'].pct_change()
        # risk_free_rate is annualized
        excess_return[symbol] = (ret[symbol] - (risk_free_rate / 252))

    # Create a new DataFrame based on the Excess Returns.
    df = DataFrame(excess_return).dropna()

    # Calculate the CoVariance and Mean of the DataFrame
    C = 252 * df.cov()
    M = 252 * df.mean()

    # Calculate the Kelly-Optimal Leverages using Matrix Multiplication
    F = inv(C).dot(M)

    # Return a dict mapping each security to its optimal leverage
    return {security: leverage
            for security, leverage in zip(df.columns.values.tolist(), F)}
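The core of the calculation is the last three steps; a numpy-only sketch of the Kelly leverages F = inv(C) . M on a toy excess-return table (the tickers and numbers are made up):

import pandas as pd
from numpy.linalg import inv

# two hypothetical securities, daily excess returns
df = pd.DataFrame({'AAA': [0.001, -0.002, 0.003, 0.000],
                   'BBB': [0.002, 0.001, -0.001, 0.002]})

C = 252 * df.cov()    # annualized covariance
M = 252 * df.mean()   # annualized mean excess return
F = inv(C).dot(M)     # Kelly-optimal leverages

print(dict(zip(df.columns, F)))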
Example #47
    def test_ops(self):

        # tst ops and reversed ops in evaluation
        # GH7198

        # smaller hits python, larger hits numexpr
        for n in [4, 4000]:

            df = DataFrame(1, index=range(n), columns=list('abcd'))
            df.iloc[0] = 2
            m = df.mean()

            for op_str, op, rop in [('+', '__add__', '__radd__'),
                                    ('-', '__sub__', '__rsub__'),
                                    ('*', '__mul__', '__rmul__'),
                                    ('/', '__truediv__', '__rtruediv__')]:

                base = (DataFrame(np.tile(m.values, n)  # noqa
                                  .reshape(n, -1),
                                  columns=list('abcd')))

                expected = eval("base{op}df".format(op=op_str))

                # ops as strings
                result = eval("m{op}df".format(op=op_str))
                assert_frame_equal(result, expected)

                # these are commutative
                if op in ['+', '*']:
                    result = getattr(df, op)(m)
                    assert_frame_equal(result, expected)

                # these are not
                elif op in ['-', '/']:
                    result = getattr(df, rop)(m)
                    assert_frame_equal(result, expected)

        # GH7192
        df = DataFrame(dict(A=np.random.randn(25000)))
        df.iloc[0:5] = np.nan
        expected = (1 - np.isnan(df.iloc[0:25]))
        result = (1 - np.isnan(df)).iloc[0:25]
        assert_frame_equal(result, expected)
Example #48
def cross_validate_trades(trades, N = 20, subset_fraction = 0.7):
    
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)
    summary = DataFrame(dtype = float)

    for n in range(N):
        sample_tickers = list(random.choice(tickers, sample_size, replace = False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)

    result = DataFrame(dtype = float)
    result['Base'] = summary_report(trades)
    result['Mean'] = summary.mean(axis = 1)
    result['Std'] = summary.std(axis = 1)
    result['Median'] = summary.median(axis = 1)
    result['Max'] = summary.max(axis = 1)
    result['Min'] = summary.min(axis = 1)

    return (result, summary)
Example #49
def avg_medal_count():
    '''
    Using the dataframe's apply method, create a new Series called 
    avg_medal_count that indicates the average number of gold, silver,
    and bronze medals earned amongst countries who earned at 
    least one medal of any kind at the 2014 Sochi olympics.  Note that
    the countries list already only includes countries that have earned
    at least one medal. No additional filtering is necessary.
    
    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
    
    olympic_medal_counts = {'country_name':countries,
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}
    df = DataFrame(olympic_medal_counts)
    
    # YOUR CODE HERE
    #df['average_medal_count'] = df.mean(axis=1)
    #avg_medal_count_by_country = df[['country_name','average_medal_count']]
    avg_medal_count = df.mean()
    # Or, we could do it this way
    avg_medal_count = df[['gold', 'silver', 'bronze']].apply(numpy.mean)
    print(avg_medal_count)
    
    return avg_medal_count
Example #50
class GetGenes(object):

	def __init__(self, data):
		self.dataframe = DataFrame(data)

	# read a text file and return a data frame. Records should be separated by TAB
	# There should not be duplicate column names
	def import_file(self, filename):
		# this function use to convert string to float
		def convert(x):
			try:
				x = float(x)
			except ValueError:
				pass
			return(x)

		table = []
		for line in open(filename):
			if(line.strip()):	# If not empty line
				line = line.rstrip('\n').split('\t')
				line = list(map(convert, line))
				table.append(line)
		self.dataframe = DataFrame(table[1:],columns=table[0])
		return

	def houseKeepingGenes(self, geneNum):
		# compute the CV of data
		std = array(self.dataframe.std(axis = 1))
		mean = array(self.dataframe.mean(axis = 1))
		CV = std/mean
		CV = list(map(abs, CV))		# convert to positive number

		# get the fist N minimum value
		mins = nsmallest(geneNum, CV)
		print("The GOOD genes are:\n")
		for item in mins:
			print(self.dataframe.iloc[CV.index(item), 0])
		return
Example #51
## Axis indexes with duplicate labels
obj = Series(range(5), index=['a','a','b','b','c'])
# check whether the index values are unique
obj.index.is_unique
# an index label that maps to several values returns all of them
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],
	index=['a','b','c','d'], columns=['one','two'])
# column-wise sums
df.sum()
# row-wise sums
df.sum(axis=1)
# NA values are excluded by default; pass skipna=False to disable that
df.mean(axis=1,skipna=False)
# index of the maximum value
df.idxmax()
# cumulative sums
df.cumsum()
df.describe()
# correlation coefficients
returns.MSFT.corr(returns.IBM)
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)

## Unique values, value counts, and membership
obj = Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
# count occurrences of each value
Example #52
    def run(self,Model='svc',kernel='linear',is_cross_validation=True, cross_validationMethod='LOO', DecompositionMethod='PCA',decompositionLevel='FeatureType',n_components=30, FeatureSelection='TopExplainedVarianceComponents', n_features=10, isPerm=0,isBetweenSubjects=True,isConcatTwoLabels=False,isSaveCsv=None, isSavePickle=None, isSaveFig=None,isSelectSubFeatures=False,SubFeatures='ExpressionLevel'):       
        # -- TODO :
        # --  # Greedy selection on features + Other feature selection types...
        # --  # Make sure the best features are chosen based on train data only!!!
        # --  # Keep a list of n_train, n_test from each Label and scoring (accuracy, f1..) in each cross validation iteration
        # --  # Plot results summary (see CARS paper for desired results for Ein Gedi Poster 22-1-2015)
        # --  # remove irrelevant data using 'Tracking Success' and consider 'TimeStamps' for feature calculation
        # --  # add f feature analysis by facial part (see excel) 
        # --  # select best model (svm, otherwise ridge regression) 
        # --  # compare svc results with regression results (using LOO and different Params for regression  - params for unbalanced data, different kernels, etc.), model evaluation - http://scikit-learn.org/stable/modules/model_evaluation.html) 
        # --  # check how the model weights behave - feature selection analysis
        # --  # calc model error
        # --  # divide data to subparts for training and testing - try within/ between subject, and analyze distribution of features when data is divided
        # --  # LOO - also on bool labels (patients vs controls and mental status bool)
        # --  # add mental status rank scores (0-4)
        # --  # make sure p-val returns the right value in 'scores'
        # --  # run it over random data (permutation test) 
        # --  # continue here - check regression results - make sure regression works (not so good).. check what happens in svc for G7 (high train R, negative test R)

        ## init 
        if isSelectSubFeatures:
            print('Features : ' + SubFeatures)
            f=self.FeaturesDF.copy()
            featureNames=self.FeaturesDF.index.names
            try:
               f=f.loc[SubFeatures]
               f.index=MultiIndex.from_product([[SubFeatures],f.index], names=featureNames)
            except KeyError:
               f.index=f.index.swaplevel(0,1)
               f=f.loc[SubFeatures]
               f.index=MultiIndex.from_product([f.index,[SubFeatures]], names=featureNames)
            self.FeaturesDF=f.copy()
        else:
            SubFeatures='allFeatureTypes'

        FeatureTypeList=[j for j in tuple(self.FeaturesDF.index)]
        self.FullResults=DF()
           
        # set learning params (cross validation method, and model for learning)
        isBoolLabel=self.LabelsObject.isBoolLabel
        isBoolScores=isBoolLabel
        if DecompositionMethod==None and (FeatureSelection == 'TopExplainedVarianceComponents' or FeatureSelection == 'TopNComponents'):
            print("ERROR- feature selection method cannot be '"+ FeatureSelection +"' when X is not decomposed")
            FeatureSelection=input("Choose a different feature selection method ('RFE','f_regression','dPrime','AllFeatures'): ")

        model, isBoolModel= learningUtils.setModel(Model)
        selectFeatures =learningUtils.setFeatureSelection(FeatureSelection,n_features)
        n_components=min(n_components,n_features)  # cannot have more components than features
        decompositionTitle, decomposeFunction= learningUtils.setDecomposition(DecompositionMethod,n_components,decompositionLevel)
        isDecompose=  decompositionTitle!='noDecomposition'


        # save learning params
        self.Learningdetails={'Model':Model,'Kernel':kernel,'CrossVal':cross_validationMethod,'FeatureSelection':FeatureSelection,'Decomposition':decompositionTitle,'LabelBy':self.Details['LabelDetails'].keys()[0],'FeatureMethod':self.Details['FeatureMethod'],'PieceLength':self.Details['PieceLength']}
        print('\n------------Learning Details------------')
        print(DF.from_dict(self.Learningdetails,orient='index'))
        print('\n----' + cross_validationMethod + ' Cross validation Results:----')
        
        #define global variables over modules (to be used in myUtils)

        globalVars.transformMargins=0#lambda x:x         
        globalVars.isBoolLabel=isBoolLabel
        globalVars.isBoolModel=isBoolModel
        global trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects 
        trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects=labelUtils.initTrainTestLabels_all(self.LabelsObject)
        trainLabels_all2, testLabels_all2, TrueLabels2,isAddDroppedSubjects2=labelUtils.initTrainTestLabels_all(self.LabelsObject2)

        
        LabelingList=trainLabels_all.columns #['N1']
        self.ResultsDF=DF()
        self.BestFeatures=DF(columns=LabelingList) #dict of BestFeaturesDF according to Labeling methods
        YpredictedOverAllLabels=pandas.Panel(items=range(len(trainLabels_all)),major_axis=LabelingList,minor_axis=TrueLabels.index) #panel: items=cv_ind, major=labels, minor=#TODO 
       
                                              
        ## Create train and test sets according to LabelBy, repeat learning each time on different Labels from LabelingList
        
        isMultivarLabels=False      
        LabelingIndex=enumerate(LabelingList)
        if isMultivarLabels:
            LabelingIndex=enumerate([LabelingList])

        for label_ind, Labeling in LabelingIndex:
            """if isPerm: #TODO - fix this to work with continous / bool data
                try:
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]
                except AttributeError:
                    self.LabelsObject.permLabels()
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]"""
            #set subjects list according to labels and features
            X,SubjectsList,droppedSubjects,Xdropped=featuresUtils.initX(self.FeaturesDF,trainLabels_all,Labeling)
            X2,SubjectsList2,droppedSubjects2,Xdropped2=featuresUtils.initX(self.FeaturesDF,trainLabels_all2,Labeling,is2=1)
            
            #init train and test labels
            trainLabels, testLabels, LabelRange = labelUtils.initTrainTestLabels(Labeling,SubjectsList,trainLabels_all, testLabels_all)
            trainLabels2, testLabels2, LabelRange2 = labelUtils.initTrainTestLabels(Labeling,SubjectsList2,trainLabels_all2, testLabels_all2)
            
            #make sure only labeled subjects are used for classification
            X=X.query('subject == '+ str(list(trainLabels.index)) ) 
            X.index.get_level_values(X.index.names[0]) 
            SubjectIndex=list(set(X.index.get_level_values('subject')))

            X2=X2.query('subject == '+ str(list(trainLabels2.index)) )  
            X2.index.get_level_values(X2.index.names[0]) 
            SubjectIndex2=list(set(X2.index.get_level_values('subject')))                       
            #init vars
            if isBetweenSubjects:
                cv_param=len(SubjectIndex)
                self.Learningdetails['CrossValSubjects']='between'
                isWithinSubjects=False
            else:
                isWithinSubjects=True
                X=X.swaplevel(0,1)
                PieceIndex=list(set(X.index.get_level_values('Piece_ind')))
                cv_param=len(PieceIndex)
                self.Learningdetails['CrossValSubjects']='within'
            
            self.Learningdetails['NumOfFeatures']=n_features
            
            
            try:
                print('\n**' + Labeling + '**')
            except TypeError:
                print('\n*******')
                print(Labeling)
            
            cv, crossValScores= learningUtils.setCrossValidation(cross_validationMethod,cv_param,trainLabels,isWithinSubjects) 
            
            ## Learning - feature selection for different scoring types, with cross validation - 

            BestFeaturesForLabel=self.BestFeaturesForLabel(FeatureTypeList,LabelingList,n_features) #saves dataframe with best features for each label, for later analysis
            cv_ind=0
            # used for transforming the margins returned from svm into continuous labels (e.g. PANSS)
            trainScores=DF()
            test_index=X.index
            testScores=concat([DF(index=test_index),DF(index=['std_train_err'])])
            testScores2=concat([DF(index=testLabels.index),DF(index=['std_train_err'])]) 
            testProbas=DF(index=X.index)
            testProbas2=DF(index=SubjectIndex)

            #impt=Imputer(missing_values='NaN', strategy='median', axis=0)

            globalVars.LabelRange=LabelRange

            ModelWeights1=DF(columns=range(len(cv)),index=X.columns)
            Components=pandas.Panel(items=range(len(cv)),major_axis=X.columns,minor_axis=range(n_features)) #todo fix this for 1st and second learning
            ExplainedVar=DF(columns=range(len(cv)))
            ModelWeights2=DF(columns=range(len(cv)))
            bestNfeaturesPanel=Panel(items=LabelingList,minor_axis=range(len(cv)),major_axis=range(n_features))
            
            #bestNfeaturesPanel=Panel(items=LabelingList,major_axis=range(len(cv)),minor_axis=MultiIndex.from_tuples(('a','b')))
            

            for train, test in cv:

                if not is_cross_validation:
                   train=np.append(train,test)
                   #test=np.append(train,test)
                   self.Learningdetails['CrossVal']='NONE'
                   #if cv_ind>0:
                    #    break

                if isBetweenSubjects:
                    #set X and Y
                    train_subjects=trainLabels.iloc[train].index
                    test_subjects=testLabels.iloc[test].index 
                    Xtrain,Xtest, Ytrain, YtrainTrue, Ytest=learningUtils.setXYTrainXYTest(X,Labeling,trainLabels,testLabels,TrueLabels,train_subjects,test_subjects)
                    Xtrain2,Xtest2, Ytrain2, YtrainTrue2, Ytest2=learningUtils.setXYTrainXYTest(X2,Labeling,trainLabels2,testLabels2,TrueLabels2,train_subjects,test_subjects)

                    
                    if isConcatTwoLabels: #used when there is more than one doctor
                        Xtrain=concat([Xtrain,Xtrain2])
                        Xtest=concat([Xtest,Xtest2])
                        Ytrain=concat([Ytrain,Ytrain2])
                        YtrainTrue=concat([YtrainTrue,YtrainTrue2])
                        Ytest=concat([Ytest,Ytest2])
                        Xdropped=concat([Xdropped,Xdropped2])
                        SubjectsList=list(set(SubjectsList).intersection(set(SubjectsList2)))
                        droppedSubjects=list(set(droppedSubjects).union(set(droppedSubjects2)).difference(set(SubjectsList)))#diff from SubjectsList to make sure no subjects are both in train and test.
                 

                    #select N best features:
                    Xtrain, Xtest, bestNfeatures, components, explainedVar = learningUtils.decomposeAndSelectBestNfeatures(Xtrain,Xtest,Ytrain,n_features,selectFeatures,decomposeFunction)
                    BestFeaturesForLabel.add(bestNfeatures) #todo - delete this??  
                    bestNfeaturesPanel[Labeling][cv_ind]=bestNfeatures   
                    """for  feature_ind,feature_name in enumerate(bestNfeatures):
                         
                         try:
                            bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind]=feature_ind
                        except KeyError:
                            bestNfeaturesPanel[Labeling].columns=bestNfeaturesPanel[Labeling].columns.append(feature_name)#continue here!! use 
                            bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind]=feature_ind



                    [bestNfeatures].iloc[cv_ind]=range(len(bestNfeatures))"""
                    #train 1 
                    TrainModel=model
                    TrainModel.fit(Xtrain.sort_index(),Ytrain.T.sort_index())
                    """try:
                        #Components[cv_ind]=components.T
                        #ExplainedVar[cv_ind]=explainedVar
                        isDecompose=True"""
                    if cv_ind==0:
                        ModelWeights1=DF(columns=range(len(cv)),index=range(len(bestNfeatures)))    
                    ModelWeights1[cv_ind]=TrainModel.coef_.flatten()
                  
                    #get ROC scores without cross validation:
                                           
                    #train 2
                    if isBoolLabel:
                       PiecePrediction_train=DF(TrainModel.predict_proba(Xtrain).T[1],index=Xtrain.index,columns=['prediction'])
                       TrainModel2=svm.SVC(kernel='linear', probability=True,class_weight={0:1,1:1})
                    else:
                       PiecePrediction_train=DF(TrainModel.decision_function(Xtrain),index=Xtrain.index,columns=['prediction'])
                       TrainModel2=linear_model.LinearRegression()

                    Xtrain2, Ytrain2, YtrainTrue2=learningUtils.getX2Y2(Xtrain,Ytrain,YtrainTrue,PiecePrediction_train, isBoolLabel)                 
                    TrainModel2.fit(Xtrain2, Ytrain2)
                    if cv_ind==0:
                        ModelWeights2=DF(columns=range(len(cv)),index= Xtrain2.columns)
                    ModelWeights2[cv_ind]=TrainModel2.coef_.flatten()         

                              
                    #test 1
                    if isAddDroppedSubjects: #take test subjects from cv + subjects that were dropped for labeling used for test
                        if isDecompose:
                            dXdropped=DF(decomposeFunction(Xdropped).values,index=Xdropped.index)
                        XtestDropped=dXdropped[bestNfeatures]
                        YtestDropped=Series(XtestDropped.copy().iloc[:, 0])
                        #YTrueDropped=Series(Xdropped.copy().icol(0))
                        for subject in droppedSubjects:
                            YtestDropped[subject]=testLabels_all[Labeling].loc[subject]
                            #YTrueAll.loc[subject]=TrueLabels[Labeling].loc[subject]
                        Ytest=concat([Ytest,YtestDropped]).sort_index()
                        Xtest=concat([Xtest,XtestDropped]).sort_index()


                    if isPerm: #TODO- Check this!!
                        Ytest=y_perms.loc[Ytest.index]
                    Xtest=Xtest.fillna(0.)
                    
                    
                elif isWithinSubjects:
                    #train 1
                    train_pieces=PieceIndex[train]
                    test_pieces=PieceIndex[test] #TODO - make sure that if test/train > piece index, it ignores it and repeats the process
                    
                    XtrainAllFeatures=X.query('Piece_ind == '+ str(list(train_pieces)))
                    Ytrain=Series(index=X.index)
                    Ytest=Series(index=X.index)
                    YtrainTrue=Series(index=X.index)
                    
                    for subject in PieceIndex: 
                        for piece in train_pieces:
                            Ytrain.loc[piece].loc[subject]=trainLabels[subject]
                            YtrainTrue.loc[piece].loc[subject]=TrueLabels[Labeling].loc[subject] 
                            Ytest.loc[piece].loc[subject]=testLabels[subject]   
                    Ytrain=Ytrain.dropna()
                    YtrainTrue=YtrainTrue.dropna() 
                    for subject in test_subjects:
                        Ytest.loc[piece].loc[subject]=testLabels[subject]
                #train scores 1       
                if cv_ind==0:
                    trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel)
                    plt.figure(1)
                    if len(LabelingList)>1:
                        plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                    if isBoolLabel:
                        testScores,testProbas=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                    else:
                        testScores[cv_ind],testProbas=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                        plt.title(Labeling,fontsize=10)
                else:
                    plt.figure(3)
                    new_trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel)
                    trainScores=concat([trainScores,new_trainScores],axis=1)
                #test 1   
                    testScores[cv_ind],testProbas_new=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                    testProbas=concat([testProbas,testProbas_new])
                
                #train2

                if isBoolLabel:
                    PiecePrediction_test=DF(TrainModel.predict_proba(Xtest).T[1],index=Xtest.index,columns=['prediction'])
                else:
                    PiecePrediction_test=DF(TrainModel.decision_function(Xtest),index=Xtest.index,columns=['prediction'])
                Xtest2, Ytest2 , YtestTrue2 =learningUtils.getX2Y2(Xtest,Ytest,Ytest,PiecePrediction_test,isBoolLabel)
                
                if cv_ind==0:
                    trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2)
                    YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2
                    #plt.figure(1)
                    #if len(LabelingList)>1:
                        #plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                #test2
                    if isBoolLabel:
                        testScores2,testProbas2=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)
                    else:
                        testScores2[cv_ind],testProbas2=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)
                    #plt.title(Labeling,fontsize=10)
                else:
                    new_trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2)
                    YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2
                    trainScores2=concat([trainScores2,new_trainScores2],axis=1)
                    if len(Xtest2)>0: # if there is more than one segment for the subject
                        testScores2[cv_ind],testProbas2_new=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)     
                        testProbas2=concat([testProbas2,testProbas2_new])
                cv_ind+=1

                #crossValScores=crossValScores.append(CVscoresDF,ignore_index=True) #information about entire train test data. 
            fig2=plt.figure(2)
            if len(LabelingList)>1:
                plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
            #if isAddDroppedSubjects:
            #    testLabelsSummary=testLabels_all[Labeling].loc[AllSubjects]
            #else:
            #    testLabelsSummary=testLabels
            scoresSummary,rocDF = learningUtils.getScoresSummary(trainScores2,testScores2,testProbas2,TrueLabels[Labeling])

            # reset global vars
            globalVars.fitYscale='notDefined'
            globalVars.beta=DF()

            plt.title(Labeling,fontsize=10)
            plt.xlabel('Ytrue',fontsize=8)
            plt.ylabel('Ypredicted',fontsize=8)
            plt.tick_params(labelsize=6)
            #print(crossValScores.T)    
            scores=scoresSummary.fillna(0.)
            
            #analyze feature weights             
            ModelWeights1=ModelWeights1.dropna(how='all')
            WeightedFeatures1_index0=analysisUtils.getFeaturesWeights(0,bestNfeaturesPanel[Labeling],ModelWeights1) #FeatureAnalysisIndex=0 for featureType, 1= au's (if not decomposed) or component rank (if decomposed)
            WeightedFeatures1_index1=analysisUtils.getFeaturesWeights(1,bestNfeaturesPanel[Labeling],ModelWeights1)
            WeightedFeatures1=concat([DF(index=['-------(A) Index0-------']),WeightedFeatures1_index0,DF(index=['-------(B) Index1 -------']),WeightedFeatures1_index1])
            
            WeightedFeatures2=DF(ModelWeights2.mean(axis=1)).fillna(0)
            #WeightedFeatures2=DF([ModelWeights2.mean(axis=1),ModelWeights2.std(axis=1)],index=['mean','std']).T.fillna(0)
            BestFeatures=concat([DF(index=['------------- Learning 1 -------------']),WeightedFeatures1,DF(index=['------------- Learning 2 -------------']),WeightedFeatures2])
            self.BestFeatures[Labeling]=Series(BestFeatures.values.flatten(),index=BestFeatures.index)

            #analyze decomposition
            if isDecompose:
                Components_mean = Components.mean(axis=0)
                Components_std = Components.std(axis=0)
                normalize=lambda df:DF(StandardScaler().fit_transform(df.T).T,index=df.index,columns=df.columns) 

                """#componentsMeanFeatureType=normalize(Components.mean(axis=1,level='FeatureType'))
                #componentsMeanFeatureTypeABS=normalize(componentsDF.abs().mean(axis=1,level='FeatureType'))
                #componentsMeanFSsignal=normalize(componentsDF.mean(axis=1,level='fs-signal'))
                #componentsMeanFSsignalABS=normalize(componentsDF.abs().mean(axis=1,level='fs-signal'))
                #ExplainedVar_mean = DF(ExplainedVar.mean(axis=1)).T#todo- check!
                #ExplainedVar_mean.index=['ExplainedVar_mean']
                #ExplainedVar_std = DF(ExplainedVar.std(axis=1)).T#todo- check!
                #ExplainedVar_std.index=['ExplainedVar_std']
                #componentsToCSV=concat([DF(index='---meanFeatureType----'),componentsMeanFeatureType,DF(index='---meanFeatureType - abs ----'),componentsMeanFeatureTypeABS,DF(index='---mean fs-signal ----'),componentsMeanFSsignal,DF(index='---mean fs-signal - abs ----'),componentsMeanFSsignalABS])
                try:
                    self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])
                except AttributeError:
                    self.LabelComponents=dict.fromkeys(LabelingList)
                    self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])"""

                """print(Components_mean)
                print(ExplainedVar_mean)
                print(WeightedFeatures1)"""

                        
            #BestFeaturesForLabel.analyze(ByLevel=0) #TODO change to regression coeff
            LabelFullResults=concat([DF(index=[Labeling]),scores]) 
  
            self.FullResults=concat([self.FullResults,LabelFullResults])            
            self.ResultsDF=concat([self.ResultsDF,DF(scores[0],columns=[Labeling])],axis=1)

            #self.BestFeatures[Labeling]=BestFeaturesForLabel.WeightedMean

            #plt.savefig('C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\'+Labeling+'png')
        testScores3=pandas.Panel(items=range(len(X2.index))) #for each cv score...
        FullSubjectsList=YpredictedOverAllLabels[0].columns
        YdroppNans=YpredictedOverAllLabels.dropna(axis=0,how='all')
        YdroppNans=YdroppNans.dropna(axis=1,how='all')
        YpredictedOverAllLabels=YdroppNans.dropna(axis=2,how='all')
        notNans_cv_ind=YpredictedOverAllLabels.items
        notNans_trainSubjects=YpredictedOverAllLabels.minor_axis
        notNans_LabelsList=YpredictedOverAllLabels.major_axis
        notNans_TrueLabels=TrueLabels.T[notNans_trainSubjects].loc[notNans_LabelsList]
        cv_ind=0
        for train, test in cv:
            if cv_ind in notNans_cv_ind:
                print(test)
                train=list(set(FullSubjectsList[train]).intersection(set(notNans_trainSubjects)))
                test=list(set(FullSubjectsList[test]).intersection(set(notNans_trainSubjects)))
                if len(train)>0 and len(test)>0: 
                    AllLabelsYTrainPredicted=YpredictedOverAllLabels[cv_ind][train]
                    AllLabelsYTrainPredicted=AllLabelsYTrainPredicted.fillna(0)
                    AllLabelsYTrainTrue=notNans_TrueLabels[train]
                    AllLabelsYTestPredicted=YpredictedOverAllLabels[cv_ind][test]
                    AllLabelsYTestTrue=notNans_TrueLabels[test]

                    pseudoInverse_AllLabelsYTrainTrue=DF(np.linalg.pinv(AllLabelsYTrainTrue),columns=AllLabelsYTrainTrue.index,index=AllLabelsYTrainTrue.columns)
                    global AllLabelsTransformationMatrix
                    AllLabelsTransformationMatrix=DF(AllLabelsYTrainPredicted.dot(pseudoInverse_AllLabelsYTrainTrue),columns=pseudoInverse_AllLabelsYTrainTrue.columns)#change to real code!!
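                    # AllLabelsTransformationMatrix is the least-squares mapping M with
                    # M.dot(AllLabelsYTrainTrue) ~ AllLabelsYTrainPredicted, obtained via the
                    # Moore-Penrose pseudo-inverse of the true-label matrix; TrainModel3 below
                    # applies this learned mapping to a new matrix of per-label predictions.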
                TrainModel3=lambda y: y.T.dot(AllLabelsTransformationMatrix)
                #testscores3[cv_ind]=learningUtils.getTestScores(AllLabelsYTrainTrue,AllLabelsYTrainPredicted,TrainModel3)
            cv_ind+=1

        self.BestNFeaturesAll=bestNfeaturesPanel 
        self.ResultsDF=self.ResultsDF.fillna(0.)  
        
        ## Print and save results  
        print('\n')
        print(self.ResultsDF)
        print('\n')
        D=self.Learningdetails 
        savePath=resultsPath+'\\'+D['Model']+'_'+D['CrossVal']+'_LabelBy'+D['LabelBy']+'_FSelection'+FeatureSelection+'_Decomposition'+D['Decomposition']+'_PieceSize'+D['PieceLength']+'_'+SubFeatures
        if isPerm:
            savePath=savePath+'_PERMStest'
        saveName=savePath+'\\'+str(n_features)+'_features'        
        self.Learningdetails['saveDir']=savePath
        dir=os.path.dirname(saveName)
        if not os.path.exists(dir):
            os.makedirs(dir)
        if isSavePickle is None:
            isSavePickle=int(raw_input('Save results to pickle? '))
        if isSaveCsv is None:
            isSaveCsv=int(raw_input('Save results to csv? '))
        if isSaveFig is None:
            isSaveFig=int(raw_input('Save results to figure? '))

       
        if isSavePickle:        
            self.ResultsDF.to_pickle(saveName+'.pickle')
            self.BestFeatures.to_pickle(saveName+'_bestFeatures.pickle')
                
        if isSaveCsv:
            DetailsDF=DF.from_dict(self.Learningdetails,orient='index')
            ResultsCSV=concat([self.ResultsDF,DF(index=['-------Label Details-------']),self.N,DF(index=['-------Learning Details-------']),DetailsDF,DF(index=['-------Selected Features Analysis------']),self.BestFeatures])
            ResultsCSV.to_csv(saveName+'.csv')
            if isBoolLabel:
                ROCfig=learningUtils.save_plotROC(rocDF,isSave=True,saveName=saveName,title=SubFeatures)

        if isSaveCsv or isSavePickle:
            print('successfully saved as:\n' + saveName)
        
        if isSaveFig:
            plt.figure(1)
            plt.savefig(saveName + 'Train.png')
            plt.figure(2)
            plt.savefig(saveName + 'Test.png')
        plt.close()
        plt.close()
    sys.stdout = Logger(title)
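For orientation, the two-stage flow above ('learning 1' fits a model on the selected features, learningUtils.getX2Y2 then folds the stage-1 predictions into a second feature matrix for 'learning 2') is essentially a stacked setup. The sketch below shows that pattern with plain numpy/scikit-learn only; the synthetic data, the simple column-stacking used in place of getX2Y2, and the model choices are illustrative assumptions, not the project's actual pipeline.

import numpy as np
from sklearn import svm

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = (X[:, 0] + 0.5 * X[:, 1] + 0.1 * rng.randn(200) > 0).astype(int)
Xtr, Xte = X[:150], X[150:]
ytr, yte = y[:150], y[150:]

# stage 1: base classifier on the raw features
model1 = svm.SVC(kernel='linear', probability=True, class_weight={0: 1, 1: 1})
model1.fit(Xtr, ytr)

# stage 2: append the stage-1 probability as an extra feature column and refit
Xtr2 = np.column_stack([Xtr, model1.predict_proba(Xtr)[:, 1]])
Xte2 = np.column_stack([Xte, model1.predict_proba(Xte)[:, 1]])
model2 = svm.SVC(kernel='linear', probability=True)
model2.fit(Xtr2, ytr)
print(model2.score(Xte2, yte))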

if t == 2:
    authors = y_options.get(0)[1]
    recipients = y_options.get(1)[1]
    accuracies = []
    results = {}
    clf, author_score = load_classifier(clfs.get(7), X, y_options.get(0))
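    # For each author, a separate classifier is then trained to predict the recipient
    # among that author's messages; the final score below combines the author-level
    # score with the mean of the per-author recipient accuracies.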

    for a in np.unique(authors):
        s_targets = list(compress(recipients, authors == a))
        s_data = list(compress(X, authors == a))
        _, score = classify(clfs.get(c), ("Recipients of {}".format(class_labels[a]), s_targets), s_data)
        accuracies.append(np.mean(score))
        results[a] = dict(zip(np.unique(s_targets), np.atleast_1d(score)))

    df = DataFrame(results).T
    plot_accuracy_matrix(df, class_labels[np.unique(recipients)], class_labels[np.unique(authors)], title)

    df = df.T.fillna(df.mean(axis=1)).T
    plot_accuracy_matrix(df, class_labels[np.unique(recipients)], class_labels[np.unique(authors)], title+" (filled)")
    print("##################################")
    print("Mean Recipient Score: {}".format(np.mean(accuracies)))
    print("##################################")
    print("FINAL SCORE: {}".format(np.mean(author_score * np.mean(accuracies))))
    print("##################################")

else:
    classify(clfs.get(c), y, X, save=save_results if t == 0 else False)

Example #54
0
# reductions or summary statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum() # columns sum
df.sum(axis=1) # sum row by row
df
(7.10 - 4.5)/2 # hand check of df.mean(axis=1) for row 'b'
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum() # accumulation
df.describe() # multiple summary statistics in one shot.
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
Example #55
0
BPbin= []
for y in frame2.sbp:
    BPbin.append(int((y - min(frame2.sbp))/3))
frame2['bin'] = BPbin
frame2 = frame2[:(len(frame2))-2] #removes trailing incomplete cardiac cycle
print frame2

groupedRR = frame2['RR'].groupby(frame2['bin'])
RRarray = groupedRR.mean() 

groupedSBP = frame2['sbp'].groupby(frame2['bin'])
SBParray = np.asarray(groupedSBP.mean())
print SBParray

bin_weight = groupedSBP.size()/frame2['hb'].max()
frame3 = frame2.mean()


#linear regression
#RR vs SBP
slope, intercept, r_value, p_value, std_err = linregress(SBParray, RRarray)
frame3['BRS slope'] = slope
frame3['R^2'] = r_value**2
print frame3
bestfit = [(i*slope)+intercept for i in SBParray] # use the fitted slope rather than a hard-coded value

#plots plots plots plots plots plots plots plots plots plots plots
fig = plt.figure()

#ECG plot
ax1 = fig.add_subplot(2, 1, 1)
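# A rough, illustrative sketch (reusing SBParray, RRarray and bestfit computed
# above) of the RR-vs-SBP regression panel this section appears to build;
# the subplot position, labels and styling are assumptions:
ax2 = fig.add_subplot(2, 1, 2)
ax2.plot(SBParray, RRarray, 'o', label='binned means')
ax2.plot(SBParray, bestfit, '-', label='linear fit')
ax2.set_xlabel('SBP')
ax2.set_ylabel('RR')
ax2.legend()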
# 'Panel' objects are 3D.

wp = Panel( {
    'Item1' : DataFrame(randn(4, 3)),
    'Item2' : DataFrame(randn(4, 2))
} )
pprint( wp )

# There are also 'TimeSeries', 'SparseSeries', and 'SparsePanel' objects.
# In newer versions, there is experimental support for higher-dimensional
# panels.
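# For instance (assuming this older pandas API, and that Series is available
# like DataFrame above), a date-indexed series and a sparse series are built
# much like a regular Series:
from datetime import datetime
ts = Series( randn( 3 ), index=[ datetime( 2000, 1, d ) for d in ( 1, 2, 3 ) ] )
sp = Series( [ 0.0, 0.0, 1.0 ] ).to_sparse( fill_value=0.0 )
pprint( ts )
pprint( sp )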

# Stats can also be performed on Pandas objects.
df = DataFrame( randn( 6, 4 ), columns=[ 'A', 'B', 'C', 'D' ] )
pprint( df )

# You can choose which axis number to perform the operation along.
pprint( df.mean( 0 ) )
pprint( df.mean( 1 ) )

# Much more to Pandas, but that's the basic idea.

# For more information, see:
#   http://pandas.pydata.org/pandas-docs/stable/index.html
# Also, definitely have a look at StatsModels:
#   http://statsmodels.sourceforge.net/
#   http://statsmodels.sourceforge.net/stable/

# <demo> --- stop ---
Example #57
0
# Returns all repos data from a given user
def getRepos(user):
    myrepos=requests.get("https://api.github.com/users/"+ user +"/repos", \
    	headers={'Authorization': 'token 5218551eb082bffa572318de0c2de10d255170b1'}).json()
    return myrepos

# Getting number of stars
data = DataFrame()
i = 0
for user in topGitUsers:
    userRepos = getRepos(user)
    i += 1
    print i #check progress
    if len(userRepos) > 0:
        stars = []
        listUserStars = [('', 0)]
        for repo in userRepos:
            #print repo['stargazers_count']
            stars.append(repo['stargazers_count'])
        userStars = DataFrame(stars)
        userMeanSt = userStars.mean(axis=0)
        listUserStars.append((user, userMeanSt))
        #print user + str(userMeanSt[0])
        result = DataFrame({'userId': user, 'Mean of stars': userMeanSt})
        data = data.append(result)
    else:
        print user + ': No repos found for this user'

data.to_csv('gitTopUsersMean.csv')

def main():
    """
    Calculation and aggregation of summary statistics
    """

    # Summary statistics
    # these reductions return pandas objects (Series), not ndarrays
    df = DataFrame([[1.4, np.nan],
                    [7.1, -4.5],
                    [np.nan, np.nan],
                    [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1) # exclude nan
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()
    # non-numeric values
    obj = Series(list('aabc') * 4)
    print obj.describe()


    methods = ['count', 'min', 'max', # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']

    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correlation and Covariance
    all_data = {}
    lst = []  # e.g. ['AAPL', 'IBM', 'MSFT', 'GOOG']; left empty since the Yahoo! download fails
    for ticket in lst:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})
    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # unique values, value counts, membership
    print '',''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]

    data = DataFrame({
        'Qu1' : [1,3,4,3,4],
        'Qu2' : [2,3,1,2,3],
        'Qu3' : [1,5,2,4,4],
    })
    print data
    print data.apply(pd.value_counts).fillna(0)
        if False:
            kw = dict(method='time')
            df = df.reindex(index).interpolate(**kw).ix[index]
        dfs.update({model: df})

dfs = Panel.fromDict(dfs).swapaxes(0, 2)


# In[ ]:

from pandas import DataFrame

means = dict()
for station, df in dfs.iteritems():
    df.dropna(axis=1, how='all', inplace=True)
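    # df.mean() gives per-column means; each model column is re-centred on the
    # observed ('OBS_DATA') mean, and the per-model bias recorded in `means` is
    # the observed mean minus the model mean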
    mean = df.mean()
    df = df - mean + mean['OBS_DATA']
    means.update({station: mean['OBS_DATA'] - mean.drop('OBS_DATA')})

bias = DataFrame.from_dict(means).dropna(axis=1, how='all')
bias = bias.applymap('{:.2f}'.format).replace('nan', '--')

columns = {station: get_coops_longname(station)
           for station in bias.columns.values}

bias.rename(columns=columns, inplace=True)

to_html(bias.T, 'style.css')

Example #60
0
import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

df=DataFrame(abs(np.random.randn(30).reshape(6,5))*100)
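# the bar heights below are the per-column means of this 6x5 frame; the error
# bars show one standard deviation per column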

plt.bar(np.arange(len(df.mean())), df.mean(),
        align='center',
        color='white',
        linewidth=1.5)
plt.hold(True)
plt.errorbar(np.arange(len(df.mean())),df.mean(),df.std(),
             elinewidth=1.2,
             capsize=7.5,
             fmt=None)  # newer matplotlib expects fmt='none' here


plt.show()