Example #1
    def test_include_na(self, sparse, dtype):
        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=sparse, dtype=dtype)
        exp = DataFrame({'a': [1, 0, 0],
                         'b': [0, 1, 0]},
                        dtype=self.effective_dtype(dtype))
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see GH 8822
        res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
        exp_na = DataFrame({nan: [0, 0, 1],
                            'a': [1, 0, 0],
                            'b': [0, 1, 0]},
                           dtype=self.effective_dtype(dtype))
        exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        if sparse:
            exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True,
                                  sparse=sparse, dtype=dtype)
        exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                                dtype=self.effective_dtype(dtype))
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
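For quick reference, a minimal standalone sketch of the behavior this test pins down, using plain pd.get_dummies without the sparse/dtype parametrization:

import numpy as np
import pandas as pd

s = ['a', 'b', np.nan]
print(pd.get_dummies(s))                 # NaN is dropped by default
print(pd.get_dummies(s, dummy_na=True))  # dummy_na=True adds a NaN indicator column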
Example #2
 def test_frequency_is_original(self, num_cols):
     # GH 22150
     index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"])
     original = index.copy()
     df = DataFrame(1, index=index, columns=range(num_cols))
     df.apply(lambda x: x)
     assert index.freq == original.freq
Example #3
    def test_apply(self, float_frame):
        with np.errstate(all='ignore'):
            # ufunc
            applied = float_frame.apply(np.sqrt)
            tm.assert_series_equal(np.sqrt(float_frame['A']), applied['A'])

            # aggregator
            applied = float_frame.apply(np.mean)
            assert applied['A'] == np.mean(float_frame['A'])

            d = float_frame.index[0]
            applied = float_frame.apply(np.mean, axis=1)
            assert applied[d] == np.mean(float_frame.xs(d))
            assert applied.index is float_frame.index  # want this

        # invalid axis
        df = DataFrame(
            [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c'])
        with pytest.raises(ValueError):
            df.apply(lambda x: x, 2)

        # GH 9573
        df = DataFrame({'c0': ['A', 'A', 'B', 'B'],
                        'c1': ['C', 'C', 'D', 'D']})
        df = df.apply(lambda ts: ts.astype('category'))

        assert df.shape == (4, 2)
        assert isinstance(df['c0'].dtype, CategoricalDtype)
        assert isinstance(df['c1'].dtype, CategoricalDtype)
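A compact sketch of the three apply flavors exercised above (toy frame with invented values):

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [1.0, 4.0], 'B': [9.0, 16.0]})
print(df.apply(np.sqrt))          # element-wise ufunc: shape is preserved
print(df.apply(np.mean))          # reducer: one scalar per column
print(df.apply(np.mean, axis=1))  # axis=1: one scalar per row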
Example #4
    def test_apply_modify_traceback(self):
        data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                'bar', 'bar', 'bar', 'bar',
                                'foo', 'foo', 'foo'],
                          'B': ['one', 'one', 'one', 'two',
                                'one', 'one', 'one', 'two',
                                'two', 'two', 'one'],
                          'C': ['dull', 'dull', 'shiny', 'dull',
                                'dull', 'shiny', 'shiny', 'dull',
                                'shiny', 'shiny', 'shiny'],
                          'D': np.random.randn(11),
                          'E': np.random.randn(11),
                          'F': np.random.randn(11)})

        data.loc[4, 'C'] = np.nan

        def transform(row):
            if row['C'].startswith('shin') and row['A'] == 'foo':
                row['D'] = 7
            return row

        def transform2(row):
            if (notna(row['C']) and row['C'].startswith('shin') and
                    row['A'] == 'foo'):
                row['D'] = 7
            return row

        try:
            data.apply(transform, axis=1)
        except AttributeError as e:
            assert len(e.args) == 2
            assert e.args[1] == 'occurred at index 4'
            assert e.args[0] == "'float' object has no attribute 'startswith'"
Example #5
    def test_with_dictlike_columns(self):
        # GH 17602
        df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1)
        expected = Series([{'s': 3} for t in df.itertuples()])
        assert_series_equal(result, expected)

        df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
                    pd.Timestamp('2017-05-02 00:00:00')]
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1)
        assert_series_equal(result, expected)

        # compose a series
        result = (df['a'] + df['b']).apply(lambda x: {'s': x})
        expected = Series([{'s': 3}, {'s': 3}])
        assert_series_equal(result, expected)

        # GH 18775
        df = DataFrame()
        df["author"] = ["X", "Y", "Z"]
        df["publisher"] = ["BBC", "NBC", "N24"]
        df["date"] = pd.to_datetime(['17-10-2010 07:15:30',
                                     '13-05-2011 08:20:35',
                                     '15-01-2013 09:09:09'])
        result = df.apply(lambda x: {}, axis=1)
        expected = Series([{}, {}, {}])
        assert_series_equal(result, expected)
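A minimal sketch contrasting the default dict-per-row result with result_type='expand' (same toy frame as the test):

import pandas as pd

df = pd.DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
# default: each returned dict is kept whole, giving a Series of dicts
print(df.apply(lambda x: {'s': x['a'] + x['b']}, axis=1))
# result_type='expand' turns the dict keys into columns instead
print(df.apply(lambda x: {'s': x['a'] + x['b']}, axis=1, result_type='expand'))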
Example #6
    def test_apply_differently_indexed(self):
        df = DataFrame(np.random.randn(20, 10))

        result0 = df.apply(Series.describe, axis=0)
        expected0 = DataFrame(dict((i, v.describe())
                                   for i, v in compat.iteritems(df)),
                              columns=df.columns)
        assert_frame_equal(result0, expected0)

        result1 = df.apply(Series.describe, axis=1)
        expected1 = DataFrame(dict((i, v.describe())
                                   for i, v in compat.iteritems(df.T)),
                              columns=df.index).T
        assert_frame_equal(result1, expected1)
Example #7
    def test_result_type_error(self, result_type):
        # allowed result_type
        df = DataFrame(
            np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1,
            columns=['A', 'B', 'C'])

        with pytest.raises(ValueError):
            df.apply(lambda x: [1, 2, 3],
                     axis=1,
                     result_type=result_type)
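For context, result_type only accepts 'expand', 'reduce', 'broadcast', or None; a small sketch that triggers the same ValueError with an invented value:

import pandas as pd

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
try:
    df.apply(lambda x: [1, 2], axis=1, result_type='bogus')
except ValueError as err:
    print(err)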
Example #8
    def test_apply_non_numpy_dtype(self):
        df = DataFrame({"dt": pd.date_range("2015-01-01", periods=3, tz="Europe/Brussels")})
        result = df.apply(lambda x: x)
        assert_frame_equal(result, df)

        result = df.apply(lambda x: x + pd.Timedelta("1day"))
        expected = DataFrame({"dt": pd.date_range("2015-01-02", periods=3, tz="Europe/Brussels")})
        assert_frame_equal(result, expected)

        df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category")
        result = df.apply(lambda x: x)
        assert_frame_equal(result, df)
Example #9
def applyDataFrame():
    df = DataFrame(np.arange(12).reshape(4, 3), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    print(df)

    # apply() works on whole columns (axis=0) or whole rows (axis=1)
    f = lambda x: x.max() - x.min()
    func1 = df.apply(f, axis=0)
    func2 = df.apply(f, axis=1)
    print(func1)
    print(func2)

    # applymap() formats element-wise and returns a new DataFrame
    f2 = lambda x: '%.2f' % x
    print(df.applymap(f2))
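Aside: for simple reductions like this, pandas' built-in column-wise methods give the same answer without apply; a sketch on the same toy data:

import numpy as np
from pandas import DataFrame

df = DataFrame(np.arange(12).reshape(4, 3), columns=list('bde'))
# equivalent to df.apply(lambda x: x.max() - x.min(), axis=0)
print(df.max() - df.min())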
Example #10
def data_prep(input_file, bad_samples_file, freq_dict=None):
    '''Prepare the ibdhmm file by removing sites that are too close to each other and
    calculating the major and minor alleles. If specified, freq_dict should be a json
    file that contains the frequencies. This is created from freq_parse.py.'''
    min_snpD = 10
    tri_allele = 0

    output_file = '.'.join(input_file.split('.')[0:-2]) + '_cleaned.txt'

    #relaxing conditions because we only have 3000 SNPs to begin with
    bad_samples = [sample.strip() for sample in open(bad_samples_file)]
    df = read_csv(input_file, sep='\t')  # read_csv already returns a DataFrame
    #remove bad samples
    df.drop(bad_samples, inplace=True, axis=1)
    #remove non-biallelic alleles
    #df.drop(df[df.apply(allele_count, axis = 1) != 2].index, inplace = True)
    
    
    #relaxing conditions because we only have 3000 SNPs to begin with
    '''#remove SNPs that are too close to one another
    df['diff'] = df.groupby('chrom')['pos'].diff()
    df.fillna('first', inplace = True)
    #df.to_csv('test_df.txt', sep = '\t')
    # BUG NOTE MUST FIX THE DAISY CHAIN PROBLEM
    df = df.query('diff > 10 or diff == "first"')
    df.drop('diff', axis = 1, inplace = True)'''
    
    if not freq_dict:
        # calculate the major and minor allele
        major = df.apply(major_find, axis=1)
        minor = df.apply(minor_find, axis=1)
        major_prop = df.apply(major_prop_find, axis=1)
        minor_prop = df.apply(minor_prop_find, axis=1)
    else:
        snp_dict = json.load(open(freq_dict))
        df['keys'] = df['chrom'].map(str) + ':' + df['pos'].map(str)
        major = df['keys'].apply(lambda x: snp_dict[x]['major'])
        major_prop = df['keys'].apply(lambda x: snp_dict[x]['major_freq'])
        minor = df['keys'].apply(lambda x: snp_dict[x]['minor'])
        minor_prop = df['keys'].apply(lambda x: snp_dict[x]['minor_freq'])

        df.drop('keys', inplace=True, axis=1)

    #inserting this stuff into dataframe for future use
    df.insert(3, 'minor_prop', minor_prop)
    df.insert(3, 'minor', minor)
    df.insert(3, 'major_prop', major_prop)
    df.insert(3, 'major', major)
    
    df.to_csv(output_file, sep='\t', index=False)
    return df
Example #11
    def test_consistent_coerce_for_shapes(self):
        # we want column names to NOT be propagated
        # just because the shape matches the input shape
        df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])

        result = df.apply(lambda x: [1, 2, 3], axis=1)
        expected = Series([[1, 2, 3] for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: [1, 2], axis=1)
        expected = Series([[1, 2] for t in df.itertuples()])
        assert_series_equal(result, expected)
Example #12
def peakToTroughs(dailyret,dates):
    '''
    Example:
        sr = s['retdat']
        stkd = s['stockData']
        dt = stkd['Date']
        ptk = peakToTroughs(sr,dt)
    '''
    # get cumulative percent changes
    drs = Series(dailyret)
    soc1dr = drs + 1
    soc1cumdr = soc1dr.cumprod()
    localPeaksPairs = peakdetect(y_axis=soc1cumdr, lookahead=1)[0]
    indexOfLocalPeaks = np.empty(len(localPeaksPairs))
    for i in range(len(indexOfLocalPeaks)):
        indexOfLocalPeaks[i] = localPeaksPairs[i][0]
    # data frame with 2 columns, where column 1 is a peak and column 2 is the next peak that follows it
    dd = DataFrame({'a': indexOfLocalPeaks[0:(len(indexOfLocalPeaks) - 1)],
                    'b': indexOfLocalPeaks[1:len(indexOfLocalPeaks)]})
    # add one more row to dd to represent the last peak and the last row of soc1cumdr,
    #   so that you calculate the last possible trough, if there was one between the
    #   last peak and the last day of data
    lastDdValue = dd.iloc[len(dd)-1,1]
    lastValueInData = len(soc1cumdr)-1
    dd = rbind(dd, [lastDdValue, lastValueInData])  # rbind: row-append helper, assumed to be defined elsewhere in this module
    def minBetween2Peaks(x):
        lowindex = int(x[0])
        highindex = int(x[1])
        minval = min(soc1cumdr[lowindex:(highindex + 1)])
        return minval
    localMins = dd.apply(minBetween2Peaks, axis=1)
    localMins.index = range(len(localMins))
    localPeaks = soc1cumdr[indexOfLocalPeaks.astype(int)]
    localPeaks.index = range(len(localPeaks))
    diffs = (localMins - localPeaks)/localPeaks
    
    # get indices of localMins in soc1cumdr so that you can get their dates
    def ff(x):
        '''this function gets the index of soc1cumdr whose value == x'''
        r = soc1cumdr[soc1cumdr == x].index[0]
        return r
    indexOfLocalMins = list(map(ff, localMins))  # list() so it can be used as an indexer in Python 3
    datesOfLocalMins = Series(dates)[indexOfLocalMins]
    datesOfLocalMins.index = range(len(datesOfLocalMins))
    # calculate peak to end of data
    def minBetweenPeakAndEnd(x):
        arr = soc1cumdr.iloc[int(x[0]):len(soc1cumdr)]  # int() because iloc rejects float positions
        return min(arr)
    absMinsToEnd = dd.apply(minBetweenPeakAndEnd, axis=1)
    absMinsToEnd.index = range(len(absMinsToEnd))
    diffsToEnd = (absMinsToEnd - localPeaks) / localPeaks
    ret = DataFrame({'Date': datesOfLocalMins, 'Peak': localPeaks, 'Valley': localMins,
                     'Diff': diffs, 'DiffToEnd': diffsToEnd})

    return ret
Example #13
 def __init__(self, background: pd.DataFrame, permutations: int=100):
     """
     :param background: A data frame containing all the observations as binary data 1 and 0 or True and False where
             rows represent observations and columns represent samples.
     :param permutations: how many permutations by default
     :return:
     """
     self.permutations = permutations
     self.background = background
     col_sums = background.apply(sum)  # per-sample column totals, computed once
     self.sample_weights = col_sums / col_sums.sum()
     self.cummulative_sum = np.cumsum(self.sample_weights)
     self.sample_indices = [x for x in range(0, background.shape[1])]
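The weights and cumulative sum computed in __init__ are the standard setup for inverse-CDF (weighted) sampling of column indices; a hedged sketch of how such values might be consumed — the searchsorted draw is an assumption, not something the original class shows:

import numpy as np

weights = np.array([0.2, 0.5, 0.3])   # stands in for self.sample_weights
cumulative = np.cumsum(weights)       # stands in for the cummulative_sum attribute
draws = np.searchsorted(cumulative, np.random.rand(5))
print(draws)  # indices 0..2, drawn in proportion to the weights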
Example #14
def test():
    frame = DataFrame(numpy.random.randn(4, 3), columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    # renamed so the lambdas do not shadow the built-ins format() and range()
    fmt = lambda x: '%.2f' % x
    value_range = lambda x: x.max() - x.min()

    # http://stackoverflow.com/questions/19798153/difference-between-map-applymap-and-apply-methods-in-pandas/19798528#19798528
    print(frame.apply(value_range))
    print("")
    print(frame.applymap(fmt))
    print("")
    print(frame.apply(value_range).map(fmt))

    return frame
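A side-by-side sketch of the distinction the linked answer draws between apply, applymap, and map:

import numpy as np
from pandas import DataFrame

frame = DataFrame(np.random.randn(2, 2), columns=list('ab'))
print(frame.apply(lambda s: s.max()))        # apply: one whole column (Series) at a time
print(frame.applymap(lambda v: '%.2f' % v))  # applymap: element-wise on a DataFrame
print(frame['a'].map(lambda v: '%.2f' % v))  # map: element-wise on a single Series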
Example #15
    def test_with_dictlike_columns_with_infer(self):
        # GH 17602
        df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1, result_type='expand')
        expected = DataFrame({'s': [3, 3]})
        assert_frame_equal(result, expected)

        df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
                    pd.Timestamp('2017-05-02 00:00:00')]
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1, result_type='expand')
        assert_frame_equal(result, expected)
Example #16
    def test_consistency_for_boxed(self, box):
        # passing an array or list should not affect the output shape
        df = DataFrame(
            np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1,
            columns=['A', 'B', 'C'])

        result = df.apply(lambda x: box([1, 2]), axis=1)
        expected = Series([box([1, 2]) for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand')
        expected = DataFrame(
            np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1)
        assert_frame_equal(result, expected)
Example #17
    def test_apply_non_numpy_dtype(self):
        df = DataFrame({'dt': pd.date_range(
            "2015-01-01", periods=3, tz='Europe/Brussels')})
        result = df.apply(lambda x: x)
        assert_frame_equal(result, df)

        result = df.apply(lambda x: x + pd.Timedelta('1day'))
        expected = DataFrame({'dt': pd.date_range(
            "2015-01-02", periods=3, tz='Europe/Brussels')})
        assert_frame_equal(result, expected)

        df = DataFrame({'dt': ['a', 'b', 'c', 'a']}, dtype='category')
        result = df.apply(lambda x: x)
        assert_frame_equal(result, df)
Example #18
def compute_confusion_matrix(target, predicted, normalize=True, sort=True):
    """ returns a confusion matrix as a data frame with labels
    Parameters:
        target (array): the true values.
        predicted (array): the predicted values.
        normalize (bool): if True, normalize each row so it sums to 1.
        sort (bool): if True, sort rows/columns by their max value.
    Returns (DataFrame): df with the confusion matrix.
    """

    # Determine the unique values in the target list, sort them and assign as labels.
    labels = np.unique(list(target))
    labels.sort()

    # Compute the confusion matrix, place into a data frame and normalize if desired.
    confusion = metrics.confusion_matrix(target, predicted, labels=labels)
    confusion = DataFrame(confusion, index=labels, columns=labels)
    if normalize:
        confusion = confusion.apply(lambda x: x / np.sum(x), axis=1)

    # if sort is true: find the max value in each row, then sort the confusion matrix
    if sort:
        # get the max values, order them, and use that order on both axes
        max_values = confusion.max(axis=1)
        max_values = max_values.sort_values(ascending=False)  # Series.sort() was removed from pandas
        order = max_values.index
        confusion = confusion.loc[order, order]
    return confusion
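A hypothetical usage sketch (labels and arrays invented for illustration; assumes the module-level imports — numpy, sklearn.metrics, DataFrame — that the function relies on):

y_true = ['cat', 'dog', 'cat', 'dog', 'cat']
y_pred = ['cat', 'cat', 'cat', 'dog', 'dog']
cm = compute_confusion_matrix(y_true, y_pred, normalize=True, sort=False)
print(cm)  # rows sum to 1 because normalize=True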
Example #19
    def test_apply(self):
        with np.errstate(all="ignore"):
            # ufunc
            applied = self.frame.apply(np.sqrt)
            assert_series_equal(np.sqrt(self.frame["A"]), applied["A"])

            # aggregator
            applied = self.frame.apply(np.mean)
            self.assertEqual(applied["A"], np.mean(self.frame["A"]))

            d = self.frame.index[0]
            applied = self.frame.apply(np.mean, axis=1)
            self.assertEqual(applied[d], np.mean(self.frame.xs(d)))
            self.assertIs(applied.index, self.frame.index)  # want this

        # invalid axis
        df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"])
        self.assertRaises(ValueError, df.apply, lambda x: x, 2)

        # GH9573
        df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]})
        df = df.apply(lambda ts: ts.astype("category"))
        self.assertEqual(df.shape, (4, 2))
        self.assertTrue(isinstance(df["c0"].dtype, CategoricalDtype))
        self.assertTrue(isinstance(df["c1"].dtype, CategoricalDtype))
Example #20
    def test_apply_empty_infer_type(self):
        no_cols = DataFrame(index=['a', 'b', 'c'])
        no_index = DataFrame(columns=['a', 'b', 'c'])

        def _check(df, f):
            with warnings.catch_warnings(record=True):
                test_res = f(np.array([], dtype='f8'))
            is_reduction = not isinstance(test_res, np.ndarray)

            def _checkit(axis=0, raw=False):
                res = df.apply(f, axis=axis, raw=raw)
                if is_reduction:
                    agg_axis = df._get_agg_axis(axis)
                    tm.assertIsInstance(res, Series)
                    self.assertIs(res.index, agg_axis)
                else:
                    tm.assertIsInstance(res, DataFrame)

            _checkit()
            _checkit(axis=1)
            _checkit(raw=True)
            _checkit(axis=0, raw=True)

        with np.errstate(all='ignore'):
            _check(no_cols, lambda x: x)
            _check(no_cols, lambda x: x.mean())
            _check(no_index, lambda x: x)
            _check(no_index, lambda x: x.mean())

        result = no_cols.apply(lambda x: x.mean(), broadcast=True)
        tm.assertIsInstance(result, DataFrame)
Example #21
    def test_apply_mixed_dtype_corner(self):
        df = DataFrame({"A": ["foo"], "B": [1.0]})
        result = df[:0].apply(np.mean, axis=1)
        # the result here is actually kind of ambiguous, should it be a Series
        # or a DataFrame?
        expected = Series(np.nan, index=pd.Index([], dtype="int64"))
        assert_series_equal(result, expected)

        df = DataFrame({"A": ["foo"], "B": [1.0]})
        result = df.apply(lambda x: x["A"], axis=1)
        expected = Series(["foo"], index=[0])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: x["B"], axis=1)
        expected = Series([1.0], index=[0])
        assert_series_equal(result, expected)
Example #22
    def test_apply(self):
        with np.errstate(all='ignore'):
            # ufunc
            applied = self.frame.apply(np.sqrt)
            assert_series_equal(np.sqrt(self.frame['A']), applied['A'])

            # aggregator
            applied = self.frame.apply(np.mean)
            self.assertEqual(applied['A'], np.mean(self.frame['A']))

            d = self.frame.index[0]
            applied = self.frame.apply(np.mean, axis=1)
            self.assertEqual(applied[d], np.mean(self.frame.xs(d)))
            self.assertIs(applied.index, self.frame.index)  # want this

        # invalid axis
        df = DataFrame(
            [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c'])
        self.assertRaises(ValueError, df.apply, lambda x: x, 2)

        # GH9573
        df = DataFrame({'c0': ['A', 'A', 'B', 'B'],
                        'c1': ['C', 'C', 'D', 'D']})
        df = df.apply(lambda ts: ts.astype('category'))
        self.assertEqual(df.shape, (4, 2))
        self.assertTrue(isinstance(df['c0'].dtype, CategoricalDtype))
        self.assertTrue(isinstance(df['c1'].dtype, CategoricalDtype))
Example #23
def get_flights_from_route(cur, origin, destination):
    """
    Returns a dataframe for all flights matching origin, destination.
    """

    import time
    
    ### MySQL query
    time0 = time.time()
    cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, UniqueCarrier, ArrDelay FROM flights_100000 WHERE Origin = %s and Dest = %s;", (origin, destination))
    rows = cur.fetchall()
    td = time.time() - time0
    print('Database query took %.2f seconds.' % td)
    
    ### Convert to dataframe
    df = DataFrame(list(rows), columns=['Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'CRSDepTime', 'Carrier', 'ArrDelay'])

    ### Drop rows without delay values (cancellations)
    df = df.dropna()
    
    ### Create some auxiliary columns
    df['DayOfYear'] = df.apply(lambda x: datetime.datetime(
        x['Year'], x['Month'], x['DayOfMonth']).timetuple().tm_yday, axis=1)
    df['Week'] = df['DayOfYear'] // 7 + 1    # integer division, matching the original Python 2 semantics
    df['DepHour'] = df['CRSDepTime'] // 100

    ### Drop unused columns
    df = df.drop(['DayOfMonth','CRSDepTime'],axis=1).sort_index(axis=1)

    ## df.head()
    
    return df
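Aside: the row-wise datetime construction above can be vectorized; pd.to_datetime assembles datetimes from lowercase year/month/day columns, so no per-row apply is needed. A sketch on a toy frame:

import pandas as pd

parts = pd.DataFrame({'year': [2008, 2008], 'month': [1, 2], 'day': [15, 29]})
dates = pd.to_datetime(parts)  # one datetime per row, assembled from the parts
print(dates.dt.dayofyear)      # vectorized day-of-year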
Example #24
def avg_medal_count():
    '''
    Using the dataframe's apply method, create a new Series called 
    avg_medal_count that indicates the average number of gold, silver,
    and bronze medals earned amongst countries who earned at 
    least one medal of any kind at the 2014 Sochi olympics.  Note that
    the countries list already only includes countries that have earned
    at least one medal. No additional filtering is necessary.
    
    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
    
    olympic_medal_counts = {'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}    
    df = DataFrame(olympic_medal_counts)
    
    # YOUR CODE HERE
    avg_medal_count = df.apply(numpy.mean, axis=0)
    print(avg_medal_count)
Example #25
    def customer_lifetime_value(self, transaction_prediction_model, frequency, recency, T,
                                monetary_value, time=12, discount_rate=1):
        """
        This method computes the average lifetime value for a group of one or more customers.
            transaction_prediction_model: the model used to predict future transactions; the
                literature uses Pareto/NBD, but a different model such as BG/NBD also works
            frequency: the frequency vector of customers' purchases (denoted x in the literature).
            recency: the recency vector of customers' purchases (denoted t_x in the literature).
            T: the vector of customers' age (time since first purchase)
            monetary_value: the monetary value vector of customers' purchases (denoted m in the literature).
            time: the lifetime expected for the user in months. Default: 12
            discount_rate: the monthly adjusted discount rate. Default: 1
        Returns:
            the conditional expectation of the average profit per transaction.
            Also creates a discounted_monthly_cash_flows attribute
        """
        df = DataFrame()
        df['frequency'] = frequency
        df['recency'] = recency
        df['T'] = T

        d = discount_rate
        m = self.conditional_expected_average_profit()
        discounted_monthly_cash_flows = []

        for i in range(30, (time*30)+1, 30):
            df['expected_revenues_period_'+str(i)] = df.apply(
                lambda r: (m*transaction_prediction_model.predict(i, r['frequency'], r['recency'], r['T'])/(1+d)**(i/30)),
                axis=1
            )
            discounted_monthly_cash_flows.append(df['expected_revenues_period_'+str(i)].sum())

        return sum(discounted_monthly_cash_flows)
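For intuition, each 30-day cash flow above is divided by (1 + d)**(i/30); a toy illustration of that discount factor with an invented monthly rate:

d = 0.01  # hypothetical monthly discount rate
for i in range(30, 121, 30):
    print(i, round(1 / (1 + d) ** (i / 30), 4))  # 0.9901, 0.9803, 0.9706, 0.961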
Example #26
def pastas_hook(obj):
    for key, value in obj.items():
        if key in ["tmin", "tmax", "date_modified", "date_created"]:
            val = Timestamp(value)
            if val is NaT:
                val = None
            obj[key] = val
        elif key == "series":
            try:
                obj[key] = read_json(value, typ='series', orient="split")
            except:
                try:
                    obj[key] = TimeSeries(**value)
                except:
                    obj[key] = value
        elif key == "time_offset":
            obj[key] = Timedelta(value)
        elif key == "parameters":
            # Necessary to maintain order when using the JSON format!
            value = json.loads(value, object_pairs_hook=OrderedDict)
            param = DataFrame(data=value, columns=value.keys()).T
            obj[key] = param.apply(to_numeric, errors="ignore")
        else:
            try:
                obj[key] = json.loads(value, object_hook=pastas_hook)
            except Exception:
                obj[key] = value
    return obj
Example #27
    def test_apply_modify_traceback(self):
        data = DataFrame(
            {
                "A": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
                "B": ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
                "C": ["dull", "dull", "shiny", "dull", "dull", "shiny", "shiny", "dull", "shiny", "shiny", "shiny"],
                "D": np.random.randn(11),
                "E": np.random.randn(11),
                "F": np.random.randn(11),
            }
        )

        data.loc[4, "C"] = np.nan

        def transform(row):
            if row["C"].startswith("shin") and row["A"] == "foo":
                row["D"] = 7
            return row

        def transform2(row):
            if notnull(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo":
                row["D"] = 7
            return row

        try:
            transformed = data.apply(transform, axis=1)  # noqa
        except AttributeError as e:
            self.assertEqual(len(e.args), 2)
            self.assertEqual(e.args[1], "occurred at index 4")
            self.assertEqual(e.args[0], "'float' object has no attribute 'startswith'")
Example #28
    def test_apply_bug(self):

        # GH 6125
        positions = pd.DataFrame(
            [[1, "ABC0", 50], [1, "YUM0", 20], [1, "DEF0", 20], [2, "ABC1", 50], [2, "YUM1", 20], [2, "DEF1", 20]],
            columns=["a", "market", "position"],
        )

        def f(r):
            return r["market"]

        expected = positions.apply(f, axis=1)

        positions = DataFrame(
            [
                [datetime(2013, 1, 1), "ABC0", 50],
                [datetime(2013, 1, 2), "YUM0", 20],
                [datetime(2013, 1, 3), "DEF0", 20],
                [datetime(2013, 1, 4), "ABC1", 50],
                [datetime(2013, 1, 5), "YUM1", 20],
                [datetime(2013, 1, 6), "DEF1", 20],
            ],
            columns=["a", "market", "position"],
        )
        result = positions.apply(f, axis=1)
        assert_series_equal(result, expected)
Example #29
 def test_apply_mixed_datetimelike(self):
     # mixed datetimelike
     # GH 7778
     df = DataFrame({'A': date_range('20130101', periods=3),
                     'B': pd.to_timedelta(np.arange(3), unit='s')})
     result = df.apply(lambda x: x, axis=1)
     assert_frame_equal(result, df)
Example #30
    def _grading_policy(self):
        '''
        Gets the grading policy from the course policy.

        Returns
        -------
        grading_policy : DataFrame
            Information about how grades are determined in a course.
        '''
        course_policy = self._xd.get('grading_policy')
        grading_policy = DataFrame(course_policy.iloc[0,:]['GRADER'])

        # type == the gformat of sequences
        grading_policy = grading_policy.set_index('type')

        def max_seqs(seq_type):
            '''
            Determines the max number of sequences that should contribute to
            a user's final grade for each gformat
            '''
            return seq_type.get('min_count', 1) - seq_type.get('drop_count', 0)

        grading_policy['max_seqs'] = grading_policy.apply(max_seqs, axis=1)

        return grading_policy
Example #31
    def test_apply_modify_traceback(self):
        data = DataFrame({
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D":
            np.random.randn(11),
            "E":
            np.random.randn(11),
            "F":
            np.random.randn(11),
        })

        data.loc[4, "C"] = np.nan

        def transform(row):
            if row["C"].startswith("shin") and row["A"] == "foo":
                row["D"] = 7
            return row

        def transform2(row):
            if notna(row["C"]) and row["C"].startswith(
                    "shin") and row["A"] == "foo":
                row["D"] = 7
            return row

        try:
            data.apply(transform, axis=1)
        except AttributeError as e:
            assert len(e.args) == 2
            assert e.args[1] == "occurred at index 4"
            assert e.args[0] == "'float' object has no attribute 'startswith'"
Example #32
def test_str_accessor_in_apply_func():
    # https://github.com/pandas-dev/pandas/issues/38979
    df = DataFrame(zip("abc", "def"))
    expected = Series(["A/D", "B/E", "C/F"])
    result = df.apply(lambda f: "/".join(f.str.upper()), axis=1)
    tm.assert_series_equal(result, expected)
Example #33
def prepare_training_data(training_data: pd.DataFrame,
                          logger: logging.Logger,
                          filter_data_top_n=0) -> pd.DataFrame:
    """Adds columns: [bm25_scores_encoded, use_scores_encoded, scores_concatenated]

    Parameters
    ----------
    training_data: pd.DataFrame
        Needs to have columns
        ['bm25_class_labels', 'bm25_scores', 'use_class_labels', 'use_scores'].
        These columns hold lists encoded as strings -> they will be converted to lists.

    filter_data_top_n: integer
        if n > 0, keep only datapoints where the correct result appears within the
        first n responses of both USE and BM25

    Returns
    -------
    training_data: pd.DataFrame
        The df extended by [bm25_scores_encoded, use_scores_encoded, scores_concatenated]
    """

    # evaluate the list values in the dataframe
    logger.info("# evaluate the list values in the dataframe")

    for column in [
            'bm25_class_labels', 'bm25_scores', 'use_class_labels',
            'use_scores'
    ]:
        try:
            training_data[column] = training_data[column].apply(
                ast.literal_eval)
        except ValueError:
            print("Column already in the right format")

    # infer the number of classes
    logger.info("# infer the number of classes")
    num_of_classes = training_data.class_label.nunique()

    # encode scores and write new columns
    logger.info("# encode scores and write new columns")
    training_data["bm25_scores_encoded"] = training_data[[
        "bm25_class_labels", "bm25_scores"
    ]].apply(lambda x: encode_ids_and_scores(x, num_of_classes), axis=1)
    training_data["use_scores_encoded"] = training_data[[
        "use_class_labels", "use_scores"
    ]].apply(lambda x: encode_ids_and_scores(x, num_of_classes), axis=1)
    training_data["scores_concatenated"] = training_data[[
        "bm25_scores_encoded", "use_scores_encoded"
    ]].apply(lambda row: np.concatenate((row[0], row[1])), axis=1)

    # this function filters the training data down to those datapoints where the
    # right result was among the top_n responses of both USE and BM25;
    # don't train on datapoints where the needed information is missing entirely
    def filter_result_not_found_under_n_responses(row, n):
        class_ = row["class_label"]
        if class_ not in row["bm25_class_labels"][:n] or class_ not in row[
                "use_class_labels"][:n]:
            return False
        else:
            return True

    if filter_data_top_n:
        logger.info("# Filter datapoints")
        training_data = training_data[training_data.apply(
            lambda row: filter_result_not_found_under_n_responses(
                row, filter_data_top_n),
            axis=1)]

    return training_data
Example #34
def recreate_sampling_times(
    data: DataFrame,
    step_length: float,
    start_time: float,
    end_time: float,
    plot_col=None,
) -> DataFrame:
    """
    Functions that transforms measurement data with samples taken it any (possibly irregular)
    sample rate and outputs the same measurements evenly spanced according to a given step length.

    data:           dataframe with numeric values that includes a 'Time' column
    step length:    desired time between each sample timestep
    duration:       amount of time covered by measurements in data
    plot_col:       name of column that should be plotted before and after (for vertification purposes)
    """

    first_time_in_df = data[DFKeys.TIME.value].iloc[0]
    if start_time < first_time_in_df:
        raise ValueError("start time cannot precede first time in df")

    get_shifted_time = lambda row: row[DFKeys.TIME.value] - start_time
    # Series.rename takes the new name directly; axis=1 is not valid for a Series
    shifted_timestamps = data.apply(get_shifted_time,
                                    axis=1).rename(DFKeys.TIME.value)

    duration = end_time - start_time
    timesteps = np.arange(0, duration, step_length)
    new_columns = [pd.Series(timesteps, name=DFKeys.TIME.value)]
    columns_except_time = data.columns.difference([
        DFKeys.TIME.value,
        "child_frame_id",
        "header.frame_id",
        "header.seq",
        "header.stamp.nsecs",
        "header.stamp.secs",
        "pose.covariance",
        "twist.covariance",
        "pins_0",
        "pins_1",
        "pins_2",
        "pins_3",
        "pins_4",
        "pins_5",
        "pins_6",
        "pins_7",
    ])

    for col_name in columns_except_time:
        f = interp1d(shifted_timestamps.values, data[col_name].values)
        new_columns.append(pd.Series(f(timesteps), name=col_name))

    data_new = pd.concat(new_columns, axis=1)

    if plot_col in data.columns:
        SAVEDIR = Path("results/interpolation")
        sea.set_style("white")
        # plt.figure(figsize=(5, 2.5))
        sea.lineplot(x=shifted_timestamps.values,
                     y=data[plot_col],
                     label="original")
        sea.lineplot(x=DFKeys.TIME.value,
                     y=plot_col,
                     data=data_new,
                     label="interpolated")
        # plt.ylabel("Velocity")
        # plt.savefig(SAVEDIR.joinpath("%s.pdf" % plot_col))
        plt.show()

    return data_new
Example #35
 def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
     return df[df.apply(self.condition, axis=1)]
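A usage sketch of this boolean-mask pattern (frame and condition invented for illustration):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
condition = lambda row: row['a'] + row['b'] > 6
print(df[df.apply(condition, axis=1)])  # keeps only rows where the callable is True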
Example #36
def prepare_data(test, traces, options):

    std_out('Preparing data for plot')

    # Dataframe to return
    df = DataFrame()

    # Check if there are different subplots
    n_subplots = 1

    for trace in traces:
        if 'subplot' in traces[trace].keys():
            n_subplots = max(n_subplots, traces[trace]['subplot'])
        else:
            std_out(f'Trace {trace} not assigned to subplot. Skipping',
                    'WARNING')

    std_out(f'Making {n_subplots} subplots')

    # Generate list of subplots
    subplots = [[] for x in range(n_subplots)]

    # Put data in the df
    for trace in traces.keys():

        if 'subplot' not in traces[trace].keys():
            std_out(
                f'The trace {trace} was not placed in any subplot. Assuming subplot #1',
                'WARNING')
            traces[trace]['subplot'] = 1

        ndevs = traces[trace]['devices']
        nchans = traces[trace]['channel']

        # Make them lists always
        if ndevs == 'all': devices = list(test.devices.keys())
        elif type(ndevs) == str or type(ndevs) == int: devices = [ndevs]
        else: devices = ndevs

        for device in devices:

            ndev = str(device)

            # Make them lists always
            if nchans == 'all':
                channels = list(test.devices[ndev].readings.columns)
            elif type(nchans) == str:
                channels = [nchans]
            else:
                channels = nchans

            for channel in channels:
                # Check if channel is in the device's columns
                if channel not in test.devices[ndev].readings.columns:
                    std_out(
                        f'The device {ndev} does not contain {channel}. Ignoring',
                        'WARNING')
                    continue

                # Put channel in subplots
                subplots[traces[trace]['subplot'] - 1].append(channel + '_' +
                                                              ndev)

                column_orig = [channel]
                columns_add = [channel + '_' + ndev]

                # Add filtering name to dfdev
                if 'filter' in traces[trace]:
                    col_name = traces[trace]['filter']['col']

                    if col_name not in test.devices[ndev].readings.columns:
                        std_out(
                            f'Column {col_name} not in dataframe. Ignoring filtering',
                            'WARNING')
                    else:
                        column_orig.append(col_name)
                        columns_add.append(col_name)

                # Device dataframe
                dfdev = DataFrame(
                    test.devices[ndev].readings[column_orig].values,
                    columns=columns_add,
                    index=test.devices[ndev].readings.index)

                # Add filtering function
                if 'filter' in traces[trace]:
                    value = traces[trace]['filter']['value']
                    relationship = traces[trace]['filter']['relationship']

                    if col_name in dfdev.columns:
                        # assign the filtered frame back: .loc alone returns a
                        # new object, so a bare expression would do nothing
                        if relationship == '==':
                            dfdev = dfdev.loc[dfdev[col_name] == value]
                        elif relationship == '<=':
                            dfdev = dfdev.loc[dfdev[col_name] <= value]
                        elif relationship == '>=':
                            dfdev = dfdev.loc[dfdev[col_name] >= value]
                        elif relationship == '<':
                            dfdev = dfdev.loc[dfdev[col_name] < value]
                        elif relationship == '>':
                            dfdev = dfdev.loc[dfdev[col_name] > value]
                        else:
                            std_out(
                                "Invalid relationship. Valid options: '==', '<=', '>=', '<', '>'",
                                'ERROR')
                            continue
                        # Remove column for filtering from dfdev
                        dfdev.drop(columns=[col_name], inplace=True)

                # Combine it in the df
                df = df.combine_first(dfdev)

        # Add average or other extras
        # TODO Check this to simplify
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.resample.Resampler.aggregate.html
        if 'extras' in traces[trace]:
            for extra in traces[trace]['extras']:

                extra_name = channel + f'-{extra.upper()}'
                sbl = subplots[traces[trace]['subplot'] - 1]

                if extra == 'max':
                    df[extra_name] = df.loc[:, sbl].max(axis=1)

                if extra == 'mean':
                    df[extra_name] = df.loc[:, sbl].mean(axis=1)

                if extra == 'min':
                    df[extra_name] = df.loc[:, sbl].min(axis=1)

                subplots[traces[trace]['subplot'] - 1].append(extra_name)

    # Trim data
    if options['min_date'] is not None: df = df[df.index > options['min_date']]
    if options['max_date'] is not None: df = df[df.index < options['max_date']]

    # Make sure everything is numeric before resampling
    # https://stackoverflow.com/questions/34257069/resampling-pandas-dataframe-is-deleting-column#34270422
    df = df.apply(to_numeric, errors='coerce')

    # Resample it
    if options['frequency'] is not None:
        std_out(f"Resampling at {options['frequency']}", "INFO")

        if 'resample' in options:

            if options['resample'] == 'max':
                df = df.resample(options['frequency']).max()
            if options['resample'] == 'min':
                df = df.resample(options['frequency']).min()
            if options['resample'] == 'mean':
                df = df.resample(options['frequency']).mean()

        else:
            df = df.resample(options['frequency']).mean()

    # Clean na
    if options['clean_na'] is not None:
        if options['clean_na'] == 'fill':
            df = df.fillna(method='ffill')
        if options['clean_na'] == 'drop':
            df.dropna(axis=0, how='any', inplace=True)

    if df.empty: std_out('Dataframe for selected options is empty', 'WARNING')

    return df, subplots
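The df.apply(to_numeric, errors='coerce') step above converts non-numeric cells to NaN column by column; a minimal sketch:

import pandas as pd

df = pd.DataFrame({'x': ['1', '2', 'oops']})
print(df.apply(pd.to_numeric, errors='coerce'))  # 'oops' becomes NaN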
Example #37
print(df.cumsum()) # accumulation-type method
print(df.describe())

obj = Series(['a', 'a', 'b', 'c']*4)
print(obj)
print(obj.sort_index())
print(obj.describe())
"""

# 5.3.1 Correlation and covariance

# 5.3.2 Unique values, counts, and membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
print(uniques)
print(obj.value_counts())
print(pd.value_counts(obj.values, sort=False))
mask = obj.isin(['b', 'c'])
print(mask)
print(obj[mask])

to_match = Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = Series(['c', 'b', 'a'])
print(pd.Index(unique_vals).get_indexer(to_match))

data = DataFrame({'Qu1' : [1, 3, 4, 3, 4],
                  'Qu2' : [2, 3, 1, 2, 3],
                  'Qu3' : [1, 5, 2, 4, 4]})
print(data)
print(data.apply(pd.value_counts).fillna(0))
Example #38
votes_rcvd = {
    'Candidate': ['Khan', 'Correy', 'Li', 'OTooley'],
    'Votes': [2218231, 704200, 492940, 105630]
}
votes_df = DataFrame(votes_rcvd, columns=['Candidate', 'Votes'])
votes_df.set_index('Candidate')

# In[124]:

#Add column showing percentage of total vote
votes_df["Vote_Percentage"] = (votes_df['Votes'] / total_votes) * 100
votes_df.set_index('Candidate')

# In[125]:

total = votes_df.apply(np.sum)
total['Candidate'] = 'total'
votes_df.append(pd.DataFrame(total.values, index=total.keys()).T,
                ignore_index=True)

# In[126]:

totaled_votes_df = votes_df.append(
    {
        'Candidate': 'Total',
        'Votes': total_votes,
        'Vote_Percentage': '100'
    },
    ignore_index=True)
totaled_votes_df.set_index('Candidate')
Example #39
# Import modules:

from pandas import DataFrame
import pandas as pd
import numpy as np
# Generate DataFrame data

df = DataFrame(np.random.randn(4, 5), columns=['A', 'B', 'C', 'D', 'E'])
# Preview of the DataFrame:

#           A         B         C         D         E
# 0  0.673092  0.230338 -0.171681  0.312303 -0.184813
# 1 -0.504482 -0.344286 -0.050845 -0.811277 -0.298181
# 2  0.542788  0.207708  0.651379 -0.656214  0.507595
# 3 -0.249410  0.131549 -2.198480 -0.437407  1.628228
# Sum each row (across the columns) and append the result as a new column
df['Col_sum'] = df.apply(lambda x: x.sum(), axis=1)
#df['Col_sum'] = df.apply(lambda x: x[0], axis=1)
print(df)
# Sum each column (down the rows) and append the result as a new row
df.loc['Row_sum'] = df.apply(lambda x: x.sum())
#df.loc['Row_sum'] = df.apply(lambda x: x[0])
print(df)
# Final result:

#                 A         B         C         D         E   Col_sum
# 0        0.673092  0.230338 -0.171681  0.312303 -0.184813  0.859238
# 1       -0.504482 -0.344286 -0.050845 -0.811277 -0.298181 -2.009071
# 2        0.542788  0.207708  0.651379 -0.656214  0.507595  1.253256
# 3       -0.249410  0.131549 -2.198480 -0.437407  1.628228 -1.125520
# Row_sum  0.461987  0.225310 -1.769627 -1.592595  1.652828 -1.022097
Example #40
class TestMoments(unittest.TestCase):

    _multiprocess_can_split_ = True

    _nan_locs = np.arange(20, 40)
    _inf_locs = np.array([])

    def setUp(self):
        arr = randn(N)
        arr[self._nan_locs] = np.NaN

        self.arr = arr
        self.rng = bdate_range(datetime(2009, 1, 1), periods=N)

        self.series = Series(arr.copy(), index=self.rng)

        self.frame = DataFrame(randn(N, K), index=self.rng,
                               columns=np.arange(K))

    def test_centered_axis_validation(self):
        # ok
        mom.rolling_mean(Series(np.ones(10)), 3, center=True, axis=0)
        # bad axis
        self.assertRaises(ValueError, mom.rolling_mean, Series(np.ones(10)), 3,
                          center=True, axis=1)

        # ok ok
        mom.rolling_mean(DataFrame(np.ones((10, 10))), 3, center=True, axis=0)
        mom.rolling_mean(DataFrame(np.ones((10, 10))), 3, center=True, axis=1)
        # bad axis
        self.assertRaises(ValueError, mom.rolling_mean, DataFrame(np.ones((10, 10))), 3,
                          center=True, axis=2)

    def test_rolling_sum(self):
        self._check_moment_func(mom.rolling_sum, np.sum)

    def test_rolling_count(self):
        counter = lambda x: np.isfinite(x).astype(float).sum()
        self._check_moment_func(mom.rolling_count, counter,
                                has_min_periods=False,
                                preserve_nan=False,
                                fill_value=0)

    def test_rolling_mean(self):
        self._check_moment_func(mom.rolling_mean, np.mean)

    def test_cmov_mean(self):
        try:
            from scikits.timeseries.lib import cmov_mean
        except ImportError:
            raise nose.SkipTest

        vals = np.random.randn(10)
        xp = cmov_mean(vals, 5)

        rs = mom.rolling_mean(vals, 5, center=True)
        assert_almost_equal(xp.compressed(), rs[2:-2])
        assert_almost_equal(xp.mask, np.isnan(rs))

        xp = Series(rs)
        rs = mom.rolling_mean(Series(vals), 5, center=True)
        assert_series_equal(xp, rs)

    def test_cmov_window(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        vals = np.random.randn(10)
        xp = cmov_window(vals, 5, 'boxcar')

        rs = mom.rolling_window(vals, 5, 'boxcar', center=True)
        assert_almost_equal(xp.compressed(), rs[2:-2])
        assert_almost_equal(xp.mask, np.isnan(rs))

        xp = Series(rs)
        rs = mom.rolling_window(Series(vals), 5, 'boxcar', center=True)
        assert_series_equal(xp, rs)

    def test_cmov_window_corner(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        # all nan
        vals = np.empty(10, dtype=float)
        vals.fill(np.nan)
        rs = mom.rolling_window(vals, 5, 'boxcar', center=True)
        self.assert_(np.isnan(rs).all())

        # empty
        vals = np.array([])
        rs = mom.rolling_window(vals, 5, 'boxcar', center=True)
        self.assert_(len(rs) == 0)

        # shorter than window
        vals = np.random.randn(5)
        rs = mom.rolling_window(vals, 10, 'boxcar')
        self.assert_(np.isnan(rs).all())
        self.assert_(len(rs) == 5)

    def test_cmov_window_frame(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        # DataFrame
        vals = np.random.randn(10, 2)
        xp = cmov_window(vals, 5, 'boxcar')
        rs = mom.rolling_window(DataFrame(vals), 5, 'boxcar', center=True)
        assert_frame_equal(DataFrame(xp), rs)

    def test_cmov_window_na_min_periods(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        # min_periods
        vals = Series(np.random.randn(10))
        vals[4] = np.nan
        vals[8] = np.nan

        xp = mom.rolling_mean(vals, 5, min_periods=4, center=True)
        rs = mom.rolling_window(vals, 5, 'boxcar', min_periods=4, center=True)

        assert_series_equal(xp, rs)

    def test_cmov_window_regular(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman',
                     'blackmanharris', 'nuttall', 'barthann']
        for wt in win_types:
            vals = np.random.randn(10)
            xp = cmov_window(vals, 5, wt)

            rs = mom.rolling_window(Series(vals), 5, wt, center=True)
            assert_series_equal(Series(xp), rs)

    def test_cmov_window_special(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian']
        kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., 'width': 2.},
                {'width': 0.5}]

        for wt, k in zip(win_types, kwds):
            vals = np.random.randn(10)
            xp = cmov_window(vals, 5, (wt,) + tuple(k.values()))

            rs = mom.rolling_window(Series(vals), 5, wt, center=True,
                                    **k)
            assert_series_equal(Series(xp), rs)

    def test_rolling_median(self):
        self._check_moment_func(mom.rolling_median, np.median)

    def test_rolling_min(self):
        self._check_moment_func(mom.rolling_min, np.min)

        a = np.array([1, 2, 3, 4, 5])
        b = mom.rolling_min(a, window=100, min_periods=1)
        assert_almost_equal(b, np.ones(len(a)))

        self.assertRaises(ValueError, mom.rolling_min, np.array([1,
                          2, 3]), window=3, min_periods=5)

    def test_rolling_max(self):
        self._check_moment_func(mom.rolling_max, np.max)

        a = np.array([1, 2, 3, 4, 5])
        b = mom.rolling_max(a, window=100, min_periods=1)
        assert_almost_equal(a, b)

        self.assertRaises(ValueError, mom.rolling_max, np.array([1,
                          2, 3]), window=3, min_periods=5)

    def test_rolling_quantile(self):
        qs = [.1, .5, .9]

        def scoreatpercentile(a, per):
            values = np.sort(a, axis=0)

            idx = per / 1. * (values.shape[0] - 1)
            return values[int(idx)]

        for q in qs:
            def f(x, window, min_periods=None, freq=None, center=False):
                return mom.rolling_quantile(x, window, q,
                                            min_periods=min_periods,
                                            freq=freq,
                                            center=center)

            def alt(x):
                return scoreatpercentile(x, q)

            self._check_moment_func(f, alt)

    def test_rolling_apply(self):
        ser = Series([])
        assert_series_equal(
            ser, mom.rolling_apply(ser, 10, lambda x: x.mean()))

        def roll_mean(x, window, min_periods=None, freq=None, center=False):
            return mom.rolling_apply(x, window,
                                     lambda x: x[np.isfinite(x)].mean(),
                                     min_periods=min_periods,
                                     freq=freq,
                                     center=center)
        self._check_moment_func(roll_mean, np.mean)

    def test_rolling_apply_out_of_bounds(self):
        # #1850
        arr = np.arange(4)

        # it works!
        result = mom.rolling_apply(arr, 10, np.sum)
        self.assert_(isnull(result).all())

        result = mom.rolling_apply(arr, 10, np.sum, min_periods=1)
        assert_almost_equal(result, result)

    def test_rolling_std(self):
        self._check_moment_func(mom.rolling_std,
                                lambda x: np.std(x, ddof=1))
        self._check_moment_func(functools.partial(mom.rolling_std, ddof=0),
                                lambda x: np.std(x, ddof=0))

    def test_rolling_std_1obs(self):
        result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]),
                                 1, min_periods=1)
        expected = np.zeros(5)

        assert_almost_equal(result, expected)

        result = mom.rolling_std(np.array([np.nan, np.nan, 3., 4., 5.]),
                                 3, min_periods=2)
        self.assert_(np.isnan(result[2]))

    def test_rolling_std_neg_sqrt(self):
        # unit test from Bottleneck

        # Test move_nanstd for neg sqrt.

        a = np.array([0.0011448196318903589,
                      0.00028718669878572767,
                      0.00028718669878572767,
                      0.00028718669878572767,
                      0.00028718669878572767])
        b = mom.rolling_std(a, window=3)
        self.assert_(np.isfinite(b[2:]).all())

        b = mom.ewmstd(a, span=3)
        self.assert_(np.isfinite(b[2:]).all())

    def test_rolling_var(self):
        self._check_moment_func(mom.rolling_var,
                                lambda x: np.var(x, ddof=1))
        self._check_moment_func(functools.partial(mom.rolling_var, ddof=0),
                                lambda x: np.var(x, ddof=0))

    def test_rolling_skew(self):
        try:
            from scipy.stats import skew
        except ImportError:
            raise nose.SkipTest('no scipy')
        self._check_moment_func(mom.rolling_skew,
                                lambda x: skew(x, bias=False))

    def test_rolling_kurt(self):
        try:
            from scipy.stats import kurtosis
        except ImportError:
            raise nose.SkipTest('no scipy')
        self._check_moment_func(mom.rolling_kurt,
                                lambda x: kurtosis(x, bias=False))

    def test_fperr_robustness(self):
        # TODO: remove this once python 2.5 out of picture
        if PY3:
            raise nose.SkipTest

        # #2114
        data = '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1a@\xaa\xaa\xaa\xaa\xaa\xaa\x02@8\x8e\xe38\x8e\xe3\xe8?z\t\xed%\xb4\x97\xd0?\xa2\x0c<\xdd\x9a\x1f\xb6?\x82\xbb\xfa&y\x7f\x9d?\xac\'\xa7\xc4P\xaa\x83?\x90\xdf\xde\xb0k8j?`\xea\xe9u\xf2zQ?*\xe37\x9d\x98N7?\xe2.\xf5&v\x13\x1f?\xec\xc9\xf8\x19\xa4\xb7\x04?\x90b\xf6w\x85\x9f\xeb>\xb5A\xa4\xfaXj\xd2>F\x02\xdb\xf8\xcb\x8d\xb8>.\xac<\xfb\x87^\xa0>\xe8:\xa6\xf9_\xd3\x85>\xfb?\xe2cUU\xfd?\xfc\x7fA\xed8\x8e\xe3?\xa5\xaa\xac\x91\xf6\x12\xca?n\x1cs\xb6\xf9a\xb1?\xe8%D\xf3L-\x97?5\xddZD\x11\xe7~?#>\xe7\x82\x0b\x9ad?\xd9R4Y\x0fxK?;7x;\nP2?N\xf4JO\xb8j\x18?4\xf81\x8a%G\x00?\x9a\xf5\x97\r2\xb4\xe5>\xcd\x9c\xca\xbcB\xf0\xcc>3\x13\x87(\xd7J\xb3>\x99\x19\xb4\xe0\x1e\xb9\x99>ff\xcd\x95\x14&\x81>\x88\x88\xbc\xc7p\xddf>`\x0b\xa6_\x96|N>@\xb2n\xea\x0eS4>U\x98\x938i\x19\x1b>\x8eeb\xd0\xf0\x10\x02>\xbd\xdc-k\x96\x16\xe8=(\x93\x1e\xf2\x0e\x0f\xd0=\xe0n\xd3Bii\xb5=*\xe9\x19Y\x8c\x8c\x9c=\xc6\xf0\xbb\x90]\x08\x83=]\x96\xfa\xc0|`i=>d\xfc\xd5\xfd\xeaP=R0\xfb\xc7\xa7\x8e6=\xc2\x95\xf9_\x8a\x13\x1e=\xd6c\xa6\xea\x06\r\x04=r\xda\xdd8\t\xbc\xea<\xf6\xe6\x93\xd0\xb0\xd2\xd1<\x9d\xdeok\x96\xc3\xb7<&~\xea9s\xaf\x9f<UUUUUU\x13@q\x1c\xc7q\x1c\xc7\xf9?\xf6\x12\xdaKh/\xe1?\xf2\xc3"e\xe0\xe9\xc6?\xed\xaf\x831+\x8d\xae?\xf3\x1f\xad\xcb\x1c^\x94?\x15\x1e\xdd\xbd>\xb8\x02@\xc6\xd2&\xfd\xa8\xf5\xe8?\xd9\xe1\x19\xfe\xc5\xa3\xd0?v\x82"\xa8\xb2/\xb6?\x9dX\x835\xee\x94\x9d?h\x90W\xce\x9e\xb8\x83?\x8a\xc0th~Kj?\\\x80\xf8\x9a\xa9\x87Q?%\xab\xa0\xce\x8c_7?1\xe4\x80\x13\x11*\x1f? \x98\x00\r\xb6\xc6\x04?\x80u\xabf\x9d\xb3\xeb>UNrD\xbew\xd2>\x1c\x13C[\xa8\x9f\xb8>\x12b\xd7<pj\xa0>m-\x1fQ@\xe3\x85>\xe6\x91)l\x00/m>Da\xc6\xf2\xaatS>\x05\xd7]\xee\xe3\xf09>'

        arr = np.frombuffer(data, dtype='<f8')
        if sys.byteorder != "little":
            arr = arr.byteswap().newbyteorder()

        result = mom.rolling_sum(arr, 2)
        self.assertTrue((result[1:] >= 0).all())

        result = mom.rolling_mean(arr, 2)
        self.assertTrue((result[1:] >= 0).all())

        result = mom.rolling_var(arr, 2)
        self.assertTrue((result[1:] >= 0).all())

        # #2527, ugh
        arr = np.array([0.00012456, 0.0003, 0])
        result = mom.rolling_mean(arr, 1)
        self.assertTrue(result[-1] >= 0)

        result = mom.rolling_mean(-arr, 1)
        self.assertTrue(result[-1] <= 0)

    def _check_moment_func(self, func, static_comp, window=50,
                           has_min_periods=True,
                           has_center=True,
                           has_time_rule=True,
                           preserve_nan=True,
                           fill_value=None):

        self._check_ndarray(func, static_comp, window=window,
                            has_min_periods=has_min_periods,
                            preserve_nan=preserve_nan,
                            has_center=has_center,
                            fill_value=fill_value)

        self._check_structures(func, static_comp,
                               has_min_periods=has_min_periods,
                               has_time_rule=has_time_rule,
                               fill_value=fill_value,
                               has_center=has_center)

    def _check_ndarray(self, func, static_comp, window=50,
                       has_min_periods=True,
                       preserve_nan=True,
                       has_center=True,
                       fill_value=None):

        result = func(self.arr, window)
        assert_almost_equal(result[-1],
                            static_comp(self.arr[-50:]))

        if preserve_nan:
            assert(np.isnan(result[self._nan_locs]).all())

        # excluding NaNs correctly
        arr = randn(50)
        arr[:10] = np.NaN
        arr[-10:] = np.NaN

        if has_min_periods:
            result = func(arr, 50, min_periods=30)
            assert_almost_equal(result[-1], static_comp(arr[10:-10]))

            # min_periods is working correctly
            result = func(arr, 20, min_periods=15)
            self.assert_(np.isnan(result[23]))
            self.assert_(not np.isnan(result[24]))

            self.assert_(not np.isnan(result[-6]))
            self.assert_(np.isnan(result[-5]))

            arr2 = randn(20)
            result = func(arr2, 10, min_periods=5)
            self.assert_(isnull(result[3]))
            self.assert_(notnull(result[4]))

            # min_periods=0
            result0 = func(arr, 20, min_periods=0)
            result1 = func(arr, 20, min_periods=1)
            assert_almost_equal(result0, result1)
        else:
            result = func(arr, 50)
            assert_almost_equal(result[-1], static_comp(arr[10:-10]))

        if has_center:
            if has_min_periods:
                result = func(arr, 20, min_periods=15, center=True)
                expected = func(arr, 20, min_periods=15)
            else:
                result = func(arr, 20, center=True)
                expected = func(arr, 20)

            assert_almost_equal(result[1], expected[10])
            if fill_value is None:
                self.assert_(np.isnan(result[-9:]).all())
            else:
                self.assert_((result[-9:] == 0).all())
            if has_min_periods:
                self.assert_(np.isnan(expected[23]))
                self.assert_(np.isnan(result[14]))
                self.assert_(np.isnan(expected[-5]))
                self.assert_(np.isnan(result[-14]))

    def _check_structures(self, func, static_comp,
                          has_min_periods=True, has_time_rule=True,
                          has_center=True,
                          fill_value=None):

        series_result = func(self.series, 50)
        self.assert_(isinstance(series_result, Series))

        frame_result = func(self.frame, 50)
        self.assertEquals(type(frame_result), DataFrame)

        # check time_rule works
        if has_time_rule:
            win = 25
            minp = 10

            if has_min_periods:
                series_result = func(self.series[::2], win, min_periods=minp,
                                     freq='B')
                frame_result = func(self.frame[::2], win, min_periods=minp,
                                    freq='B')
            else:
                series_result = func(self.series[::2], win, freq='B')
                frame_result = func(self.frame[::2], win, freq='B')

            last_date = series_result.index[-1]
            prev_date = last_date - 24 * datetools.bday

            trunc_series = self.series[::2].truncate(prev_date, last_date)
            trunc_frame = self.frame[::2].truncate(prev_date, last_date)

            assert_almost_equal(series_result[-1], static_comp(trunc_series))

            assert_almost_equal(frame_result.xs(last_date),
                                trunc_frame.apply(static_comp))

        if has_center:
            if has_min_periods:
                minp = 10
                series_xp = func(self.series, 25, min_periods=minp).shift(-12)
                frame_xp = func(self.frame, 25, min_periods=minp).shift(-12)

                series_rs = func(self.series, 25, min_periods=minp,
                                 center=True)
                frame_rs = func(self.frame, 25, min_periods=minp,
                                center=True)

            else:
                series_xp = func(self.series, 25).shift(-12)
                frame_xp = func(self.frame, 25).shift(-12)

                series_rs = func(self.series, 25, center=True)
                frame_rs = func(self.frame, 25, center=True)

            if fill_value is not None:
                series_xp = series_xp.fillna(fill_value)
                frame_xp = frame_xp.fillna(fill_value)
            assert_series_equal(series_xp, series_rs)
            assert_frame_equal(frame_xp, frame_rs)

    def test_legacy_time_rule_arg(self):
        from io import StringIO
        # suppress deprecation warnings
        sys.stderr = StringIO()

        rng = bdate_range('1/1/2000', periods=20)
        ts = Series(np.random.randn(20), index=rng)
        ts = ts.take(np.random.permutation(len(ts))[:12]).sort_index()

        try:
            result = mom.rolling_mean(ts, 1, min_periods=1, freq='B')
            expected = mom.rolling_mean(ts, 1, min_periods=1,
                                        time_rule='WEEKDAY')
            tm.assert_series_equal(result, expected)

            result = mom.ewma(ts, span=5, freq='B')
            expected = mom.ewma(ts, span=5, time_rule='WEEKDAY')
            tm.assert_series_equal(result, expected)

        finally:
            sys.stderr = sys.__stderr__

    def test_ewma(self):
        self._check_ew(mom.ewma)

        arr = np.zeros(1000)
        arr[5] = 1
        result = mom.ewma(arr, span=100, adjust=False).sum()
        self.assert_(np.abs(result - 1) < 1e-2)

    def test_ewma_nan_handling(self):
        s = Series([1.] + [np.nan] * 5 + [1.])

        result = mom.ewma(s, com=5)
        assert_almost_equal(result, [1] * len(s))

    def test_ewmvar(self):
        self._check_ew(mom.ewmvar)

    def test_ewmvol(self):
        self._check_ew(mom.ewmvol)

    def test_ewma_span_com_args(self):
        A = mom.ewma(self.arr, com=9.5)
        B = mom.ewma(self.arr, span=20)
        assert_almost_equal(A, B)

        self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, span=20)
        self.assertRaises(Exception, mom.ewma, self.arr)

    def test_ew_empty_arrays(self):
        arr = np.array([], dtype=np.float64)

        funcs = [mom.ewma, mom.ewmvol, mom.ewmvar]
        for f in funcs:
            result = f(arr, 3)
            assert_almost_equal(result, arr)

    def _check_ew(self, func):
        self._check_ew_ndarray(func)
        self._check_ew_structures(func)

    def _check_ew_ndarray(self, func, preserve_nan=False):
        result = func(self.arr, com=10)
        if preserve_nan:
            assert(np.isnan(result[self._nan_locs]).all())

        # excluding NaNs correctly
        arr = randn(50)
        arr[:10] = np.NaN
        arr[-10:] = np.NaN

        # ??? check something

        # pass in ints
        result2 = func(np.arange(50), span=10)
        self.assert_(result2.dtype == np.float_)

    def _check_ew_structures(self, func):
        series_result = func(self.series, com=10)
        self.assert_(isinstance(series_result, Series))
        frame_result = func(self.frame, com=10)
        self.assertEquals(type(frame_result), DataFrame)

    # binary moments
    def test_rolling_cov(self):
        A = self.series
        B = A + randn(len(A))

        result = mom.rolling_cov(A, B, 50, min_periods=25)
        assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1])

    def test_rolling_corr(self):
        A = self.series
        B = A + randn(len(A))

        result = mom.rolling_corr(A, B, 50, min_periods=25)
        assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1])

        # test for correct bias correction
        a = tm.makeTimeSeries()
        b = tm.makeTimeSeries()
        a[:5] = np.nan
        b[:10] = np.nan

        result = mom.rolling_corr(a, b, len(a), min_periods=1)
        assert_almost_equal(result[-1], a.corr(b))

    def test_rolling_corr_pairwise(self):
        panel = mom.rolling_corr_pairwise(self.frame, 10, min_periods=5)

        correl = panel.ix[:, 1, 5]
        exp = mom.rolling_corr(self.frame[1], self.frame[5],
                               10, min_periods=5)
        tm.assert_series_equal(correl, exp)

    def test_flex_binary_moment(self):
        # GH3155
        # don't blow the stack
        self.assertRaises(ValueError, mom._flex_binary_moment, 5, 6, None)

    def test_corr_sanity(self):
        # GH 3155
        df = DataFrame(np.array([[0.87024726, 0.18505595],
                                 [0.64355431, 0.3091617],
                                 [0.92372966, 0.50552513],
                                 [0.00203756, 0.04520709],
                                 [0.84780328, 0.33394331],
                                 [0.78369152, 0.63919667]]))

        res = mom.rolling_corr(df[0], df[1], 5, center=True)
        self.assertTrue(all([np.abs(np.nan_to_num(x)) <= 1 for x in res]))

        # and some fuzzing
        for i in range(10):
            df = DataFrame(np.random.rand(30, 2))
            res = mom.rolling_corr(df[0], df[1], 5, center=True)
            self.assertTrue(all([np.abs(np.nan_to_num(x)) <= 1 for x in res]))

    def test_flex_binary_frame(self):
        def _check(method):
            series = self.frame[1]

            res = method(series, self.frame, 10)
            res2 = method(self.frame, series, 10)
            exp = self.frame.apply(lambda x: method(series, x, 10))

            tm.assert_frame_equal(res, exp)
            tm.assert_frame_equal(res2, exp)

            frame2 = self.frame.copy()
            frame2.values[:] = np.random.randn(*frame2.shape)

            res3 = method(self.frame, frame2, 10)
            exp = DataFrame(dict((k, method(self.frame[k], frame2[k], 10))
                                 for k in self.frame))
            tm.assert_frame_equal(res3, exp)

        methods = [mom.rolling_corr, mom.rolling_cov]
        for meth in methods:
            _check(meth)

    def test_ewmcov(self):
        self._check_binary_ew(mom.ewmcov)

    def test_ewmcorr(self):
        self._check_binary_ew(mom.ewmcorr)

    def _check_binary_ew(self, func):
        A = Series(randn(50), index=np.arange(50))
        B = A[2:] + randn(48)

        A[:10] = np.NaN
        B[-10:] = np.NaN

        result = func(A, B, 20, min_periods=5)

        self.assert_(np.isnan(result.values[:15]).all())
        self.assert_(not np.isnan(result.values[15:]).any())

        self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5)

    def test_expanding_apply(self):
        ser = Series([])
        assert_series_equal(ser, mom.expanding_apply(ser, lambda x: x.mean()))

        def expanding_mean(x, min_periods=1, freq=None):
            return mom.expanding_apply(x,
                                       lambda x: x.mean(),
                                       min_periods=min_periods,
                                       freq=freq)
        self._check_expanding(expanding_mean, np.mean)

    def test_expanding_corr(self):
        A = self.series.dropna()
        B = (A + randn(len(A)))[:-5]

        result = mom.expanding_corr(A, B)

        rolling_result = mom.rolling_corr(A, B, len(A), min_periods=1)

        assert_almost_equal(rolling_result, result)

    def test_expanding_count(self):
        result = mom.expanding_count(self.series)
        assert_almost_equal(result, mom.rolling_count(self.series,
                                                      len(self.series)))

    def test_expanding_quantile(self):
        result = mom.expanding_quantile(self.series, 0.5)

        rolling_result = mom.rolling_quantile(self.series,
                                              len(self.series),
                                              0.5, min_periods=1)

        assert_almost_equal(result, rolling_result)

    def test_expanding_cov(self):
        A = self.series
        B = (A + randn(len(A)))[:-5]

        result = mom.expanding_cov(A, B)

        rolling_result = mom.rolling_cov(A, B, len(A), min_periods=1)

        assert_almost_equal(rolling_result, result)

    def test_expanding_max(self):
        self._check_expanding(mom.expanding_max, np.max, preserve_nan=False)

    def test_expanding_corr_pairwise(self):
        result = mom.expanding_corr_pairwise(self.frame)

        rolling_result = mom.rolling_corr_pairwise(self.frame,
                                                   len(self.frame),
                                                   min_periods=1)

        for i in result.items:
            assert_almost_equal(result[i], rolling_result[i])

    def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True,
                                 has_time_rule=True, preserve_nan=True):
        result = func(self.arr)

        assert_almost_equal(result[10],
                            static_comp(self.arr[:11]))

        if preserve_nan:
            assert(np.isnan(result[self._nan_locs]).all())

        arr = randn(50)

        if has_min_periods:
            result = func(arr, min_periods=30)
            assert(np.isnan(result[:29]).all())
            assert_almost_equal(result[-1], static_comp(arr[:50]))

            # min_periods is working correctly
            result = func(arr, min_periods=15)
            self.assert_(np.isnan(result[13]))
            self.assert_(not np.isnan(result[14]))

            arr2 = randn(20)
            result = func(arr2, min_periods=5)
            self.assert_(isnull(result[3]))
            self.assert_(notnull(result[4]))

            # min_periods=0
            result0 = func(arr, min_periods=0)
            result1 = func(arr, min_periods=1)
            assert_almost_equal(result0, result1)
        else:
            result = func(arr)
            assert_almost_equal(result[-1], static_comp(arr[:50]))

    def _check_expanding_structures(self, func):
        series_result = func(self.series)
        self.assert_(isinstance(series_result, Series))
        frame_result = func(self.frame)
        self.assertEquals(type(frame_result), DataFrame)

    def _check_expanding(self, func, static_comp, has_min_periods=True,
                         has_time_rule=True,
                         preserve_nan=True):
        self._check_expanding_ndarray(func, static_comp,
                                      has_min_periods=has_min_periods,
                                      has_time_rule=has_time_rule,
                                      preserve_nan=preserve_nan)
        self._check_expanding_structures(func)
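The `mom.rolling_*` and `mom.ewm*` functions exercised above come from the long-removed `pandas.stats.moments` module. A minimal sketch of the equivalent calls in the current, method-based pandas API (names taken from modern pandas, not from this test file):

import numpy as np
import pandas as pd

s = pd.Series([np.nan, np.nan, 3.0, 4.0, 5.0])

# mom.rolling_std(arr, 3, min_periods=2)  ->  rolling(...).std()
print(s.rolling(window=3, min_periods=2).std())

# mom.ewma(arr, span=100, adjust=False)  ->  ewm(...).mean()
print(s.ewm(span=100, adjust=False).mean())

# mom.rolling_corr(A, B, 50, min_periods=25)  ->  A.rolling(...).corr(B)
a = pd.Series(np.random.randn(100))
print(a.rolling(window=50, min_periods=25).corr(a + np.random.randn(100)))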
Example #41
0
    def test_apply_deprecate_reduce(self):
        empty_frame = DataFrame()

        x = []
        with tm.assert_produces_warning(FutureWarning):
            empty_frame.apply(x.append, axis=1, reduce=True)
Example #42
0
def evaluate_prediction(
    prediction: Iterable[str],
    reference: Iterable[str],
) -> DataFrame:
    """Calculates F1 Score, Recall and Precision of a :func:`~pypairs.cyclone` prediction.

    Parameters
    ----------

    prediction
        List of predicted classes.
    reference
        List of actual classes.

    Returns
    -------

        A :class:`~pandas.DataFrame` with columns "f1", "recall", "precision" and "average"
        for all categories and an overall average containing the respective score.

    Example
    -------

        To get the prediction quality for the example use case of :func:`~pypairs.cyclone` run::

            from pypairs import pairs, datasets, utils, plotting
            import numpy as np

            adata = datasets.leng15('sorted')
            marker_pairs = datasets.default_cc_marker()
            scores = pairs.cyclone(adata, marker_pairs)

            ref_labels = list(np.repeat("G2M", 76)) + list(np.repeat("S", 80)) + list(np.repeat("G1", 91))

            prediction_quality = utils.evaluate_prediction(scores['max_class'], ref_labels)

            print(prediction_quality)

    """
    ref = np.array(reference)
    pred = np.array(prediction)

    labels_cats = np.unique(list(ref) + list(pred))

    f1 = np.append(f1_score(ref, pred, average=None, labels=labels_cats),
                   f1_score(ref, pred, average='macro', labels=labels_cats))
    recall = np.append(
        recall_score(ref, pred, average=None, labels=labels_cats),
        recall_score(ref, pred, average='macro', labels=labels_cats))
    precision = np.append(
        precision_score(ref, pred, average=None, labels=labels_cats),
        precision_score(ref, pred, average='macro', labels=labels_cats))

    labels = np.append(labels_cats, "average")

    df = DataFrame(columns=labels, index=["f1", "recall", "precision"])

    df.loc["f1"] = f1
    df.loc["recall"] = recall
    df.loc["precision"] = precision

    average = np.average(df.values, axis=0)

    df.loc["average"] = average

    df = df.apply(pd.to_numeric, errors='coerce')

    return df.T
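The per-class and macro scores above come directly from scikit-learn; a minimal, self-contained sketch of the underlying calls, with invented toy labels:

import numpy as np
from sklearn.metrics import f1_score

ref = np.array(["G1", "S", "G2M", "G1"])
pred = np.array(["G1", "S", "S", "G1"])
cats = np.unique(np.concatenate([ref, pred]))

# one F1 value per category, in the order given by `labels`
print(f1_score(ref, pred, average=None, labels=cats))
# a single macro-averaged F1 across all categories
print(f1_score(ref, pred, average="macro", labels=cats))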
Example #43
0
    def test_int64_overflow_issues(self):

        # #2690, combinatorial explosion
        df1 = DataFrame(np.random.randn(1000, 7),
                        columns=list('ABCDEF') + ['G1'])
        df2 = DataFrame(np.random.randn(1000, 7),
                        columns=list('ABCDEF') + ['G2'])

        # it works!
        result = merge(df1, df2, how='outer')
        assert len(result) == 2000

        low, high, n = -1 << 10, 1 << 10, 1 << 20
        left = DataFrame(np.random.randint(low, high, (n, 7)),
                         columns=list('ABCDEFG'))
        left['left'] = left.sum(axis=1)

        # one-2-one match
        i = np.random.permutation(len(left))
        right = left.iloc[i].copy()
        right.columns = right.columns[:-1].tolist() + ['right']
        right.index = np.arange(len(right))
        right['right'] *= -1

        out = merge(left, right, how='outer')
        assert len(out) == len(left)
        assert_series_equal(out['left'], -out['right'], check_names=False)
        result = out.iloc[:, :-2].sum(axis=1)
        assert_series_equal(out['left'], result, check_names=False)
        assert result.name is None

        out.sort_values(out.columns.tolist(), inplace=True)
        out.index = np.arange(len(out))
        for how in ['left', 'right', 'outer', 'inner']:
            assert_frame_equal(out, merge(left, right, how=how, sort=True))

        # check that left merge w/ sort=False maintains left frame order
        out = merge(left, right, how='left', sort=False)
        assert_frame_equal(left, out[left.columns.tolist()])

        out = merge(right, left, how='left', sort=False)
        assert_frame_equal(right, out[right.columns.tolist()])

        # one-2-many/none match
        n = 1 << 11
        left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'),
                         columns=list('ABCDEFG'))

        # confirm that this is checking what it is supposed to check
        shape = left.apply(Series.nunique).values
        assert is_int64_overflow_possible(shape)

        # add duplicates to left frame
        left = concat([left, left], ignore_index=True)

        right = DataFrame(np.random.randint(low, high,
                                            (n // 2, 7)).astype('int64'),
                          columns=list('ABCDEFG'))

        # add duplicates & overlap with left to the right frame
        i = np.random.choice(len(left), n)
        right = concat([right, right, left.iloc[i]], ignore_index=True)

        left['left'] = np.random.randn(len(left))
        right['right'] = np.random.randn(len(right))

        # shuffle left & right frames
        i = np.random.permutation(len(left))
        left = left.iloc[i].copy()
        left.index = np.arange(len(left))

        i = np.random.permutation(len(right))
        right = right.iloc[i].copy()
        right.index = np.arange(len(right))

        # manually compute outer merge
        ldict, rdict = defaultdict(list), defaultdict(list)

        for idx, row in left.set_index(list('ABCDEFG')).iterrows():
            ldict[idx].append(row['left'])

        for idx, row in right.set_index(list('ABCDEFG')).iterrows():
            rdict[idx].append(row['right'])

        vals = []
        for k, lval in ldict.items():
            rval = rdict.get(k, [np.nan])
            for lv, rv in product(lval, rval):
                vals.append(k + tuple([lv, rv]))

        for k, rval in rdict.items():
            if k not in ldict:
                for rv in rval:
                    vals.append(k + tuple([np.nan, rv]))

        def align(df):
            df = df.sort_values(df.columns.tolist())
            df.index = np.arange(len(df))
            return df

        def verify_order(df):
            kcols = list('ABCDEFG')
            assert_frame_equal(df[kcols].copy(),
                               df[kcols].sort_values(kcols, kind='mergesort'))

        out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right'])
        out = align(out)

        jmask = {
            'left': out['left'].notnull(),
            'right': out['right'].notnull(),
            'inner': out['left'].notnull() & out['right'].notnull(),
            'outer': np.ones(len(out), dtype='bool')
        }

        for how in 'left', 'right', 'outer', 'inner':
            mask = jmask[how]
            frame = align(out[mask].copy())
            assert mask.all() ^ mask.any() or how == 'outer'

            for sort in [False, True]:
                res = merge(left, right, how=how, sort=sort)
                if sort:
                    verify_order(res)

                # as in GH9092 dtypes break with outer/right join
                assert_frame_equal(frame,
                                   align(res),
                                   check_dtype=how not in ('right', 'outer'))
Example #44
0
    def from_df(cls, df: pd.DataFrame, **kwargs) -> 'ItemList':
        # Image, npimg2tensor and generate_image are helpers from the
        # surrounding project, not shown in this snippet
        df['img_data'] = df.apply(
            lambda row: Image(npimg2tensor(generate_image(row))), axis=1)
        return cls(items=range(len(df)), inner_df=df.copy(), **kwargs)
Example #45
0
def match_evaluations_to_courses(
    evaluation_narratives: pd.DataFrame,
    evaluation_ratings: pd.DataFrame,
    evaluation_statistics: pd.DataFrame,
    listings: pd.DataFrame,
) -> Tuple[pd.DataFrame, ...]:
    """
    Match evaluations to course IDs.

    Parameters
    ----------
    evaluation_narratives:
        DataFrame of narratives.
    evaluation_ratings:
        DataFrame of ratings.
    evaluation_statistics:
        DataFrame of statistics.
    listings:
        Listings DataFrame from import_courses.

    Returns
    -------
    evaluation_statistics,
    evaluation_narratives,
    evaluation_ratings
    """
    print("Matching evaluations to courses")

    # construct outer season grouping
    season_crn_to_course_id = listings[["season_code", "course_id",
                                        "crn"]].groupby("season_code")
    # construct inner course_code to course_id mapping
    season_crn_to_course_id = season_crn_to_course_id.apply(  # type: ignore
        lambda x: x[["crn", "course_id"]].set_index("crn")["course_id"].
        to_dict())
    # cast outer season mapping to dictionary
    season_crn_to_course_id = season_crn_to_course_id.to_dict()  # type: ignore

    def get_course_id(row):
        course_id = season_crn_to_course_id.get(row["season"],
                                                {}).get(row["crn"], None)
        return course_id

    # get course IDs
    evaluation_narratives["course_id"] = evaluation_narratives.apply(
        get_course_id, axis=1)
    evaluation_ratings["course_id"] = evaluation_ratings.apply(get_course_id,
                                                               axis=1)
    evaluation_statistics["course_id"] = evaluation_statistics.apply(
        get_course_id, axis=1)

    # each course must have exactly one statistic, so use this for reporting
    nan_total = evaluation_statistics["course_id"].isna().sum()
    print(
        f"Removing {nan_total}/{len(evaluation_statistics)} evaluated courses without matches"
    )

    # remove unmatched courses
    evaluation_narratives.dropna(subset=["course_id"], axis=0, inplace=True)
    evaluation_ratings.dropna(subset=["course_id"], axis=0, inplace=True)
    evaluation_statistics.dropna(subset=["course_id"], axis=0, inplace=True)

    # change from float to integer type for import
    evaluation_narratives["course_id"] = evaluation_narratives[
        "course_id"].astype(int)
    evaluation_ratings["course_id"] = evaluation_ratings["course_id"].astype(
        int)
    evaluation_statistics["course_id"] = evaluation_statistics[
        "course_id"].astype(int)

    # drop cross-listing duplicates
    evaluation_statistics.drop_duplicates(  # type: ignore
        subset=["course_id"], inplace=True, keep="first")
    evaluation_ratings.drop_duplicates(  # type: ignore
        subset=["course_id", "question_code"],
        inplace=True,
        keep="first")
    evaluation_narratives.drop_duplicates(  # type: ignore
        subset=["course_id", "question_code", "comment"],
        inplace=True,
        keep="first")

    return evaluation_statistics, evaluation_narratives, evaluation_ratings
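A toy sketch of the nested season -> {crn: course_id} lookup the function builds, with data invented purely for illustration:

import pandas as pd

listings = pd.DataFrame({
    "season_code": [202001, 202001, 202101],
    "crn": [11, 12, 11],
    "course_id": [1, 2, 3],
})
lookup = (listings.groupby("season_code")[["crn", "course_id"]]
          .apply(lambda g: g.set_index("crn")["course_id"].to_dict())
          .to_dict())
print(lookup)  # {202001: {11: 1, 12: 2}, 202101: {11: 3}}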
Example #46
0
class Apply:
    def setup(self):
        self.df = DataFrame(np.random.randn(1000, 100))

        self.s = Series(np.arange(1028.0))
        self.df2 = DataFrame({i: self.s for i in range(1028)})
        self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))

    def time_apply_user_func(self):
        self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)])

    def time_apply_axis_1(self):
        self.df.apply(lambda x: x + 1, axis=1)

    def time_apply_lambda_mean(self):
        self.df.apply(lambda x: x.mean())

    def time_apply_np_mean(self):
        self.df.apply(np.mean)

    def time_apply_pass_thru(self):
        self.df.apply(lambda x: x)

    def time_apply_ref_by_name(self):
        self.df3.apply(lambda x: x["A"] + x["B"], axis=1)
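These are asv-style benchmark methods (`setup` runs before each `time_*` case). A rough sketch of timing one of the cases directly, assuming only numpy and pandas:

import timeit

import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.randn(1000, 100))
print(timeit.timeit(lambda: df.apply(np.mean), number=10))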
Example #47
0
def resolve_cross_listings(merged_course_info: pd.DataFrame) -> pd.DataFrame:
    """
    Resolve course cross-listings by computing unique course_ids.

    Parameters
    ----------
    merged_course_info:
        Raw course information from JSON files.

    Returns
    -------
    merged_course_info with 'temp_course_id' field added.
    """

    # seasons must be sorted in ascending order
    # prioritize Yale College courses when deduplicating listings
    print("Sorting by season and if-undergrad")

    def classify_yc(row):
        if row["school"] == "YC":
            return True

        if row["school"] != row["school"]:
            # check number of numbers in course number
            # (some courses have letters in them)
            num_nums = len([x for x in row["number"] if x.isnumeric()])
            # if the course number is in the 000s to 400s range it's undergrad
            if row["number"][0] in ["0", "1", "2", "3", "4"] and num_nums < 4:
                return True
        return False

    merged_course_info["is_yc"] = merged_course_info.apply(classify_yc, axis=1)
    merged_course_info = merged_course_info.sort_values(
        by=["season_code", "is_yc"], ascending=[True, False])

    print("Aggregating cross-listings")
    merged_course_info["season_code"] = merged_course_info[
        "season_code"].astype(int)
    merged_course_info["crn"] = merged_course_info["crn"].astype(int)
    merged_course_info["crns"] = merged_course_info["crns"].apply(
        lambda crns: [int(crn) for crn in crns])

    # group CRNs by season for cross-listing deduplication

    crns_by_season = merged_course_info.groupby("season_code")[  # type: ignore
        "crns"].apply(list)
    # convert CRN groups to sets for resolution
    crns_by_season = crns_by_season.apply(lambda x: [frozenset(y) for y in x])
    # resolve overlapping CRN sets
    crns_by_season = crns_by_season.apply(merge_overlapping)

    print("Mapping out cross-listings")
    # map CRN groups to temporary IDs within each season
    temp_course_ids_by_season = crns_by_season.apply(
        lambda x: invert_dict_of_lists(dict(enumerate(x))))
    temp_course_ids_by_season = temp_course_ids_by_season.to_dict()

    # assign season-specific ID based on CRN group IDs
    merged_course_info["season_course_id"] = merged_course_info.apply(
        lambda row: temp_course_ids_by_season[row["season_code"]][row["crn"]],
        axis=1)
    # temporary string-based unique course identifier
    merged_course_info["temp_course_id"] = merged_course_info.apply(
        lambda x: f"{x['season_code']}_{x['season_course_id']}", axis=1)

    return merged_course_info
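`merge_overlapping` and `invert_dict_of_lists` are project helpers not shown in this snippet; a hypothetical sketch of what an overlap merge could look like (not the project's actual code):

def merge_overlapping(sets):
    # union any CRN sets that share an element until all groups are disjoint
    merged = []
    for s in sets:
        s = set(s)
        for m in [m for m in merged if m & s]:
            s |= m
            merged.remove(m)
        merged.append(s)
    return merged

print(merge_overlapping([frozenset({1, 2}), frozenset({2, 3}), frozenset({7})]))
# [{1, 2, 3}, {7}]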
Example #48
0
'''output
       Chinese  English  Math       name
guan     132.0       65    30   XIAOGUAN
zhang    190.0       85    98  XIAOZHANG
zhao     186.0       92    96   XIAOZHAO
ma       180.0       88    77     XIAOMA
huang      NaN       90    90  XIAOHUANG
'''

## Using a more complex function


def plus(df, n, m):
    df['new1'] = (df['Chinese'] + df['English']) * m
    df['new2'] = (df['Chinese'] + df['English']) * n
    return df


print('\n')
df1 = data_frame2.apply(plus, axis=1, args=(2, 3))
print(df1)
'''output
       Chinese  English  Math       name   new1   new2
guan     132.0       65    30   XIAOGUAN  591.0  394.0
zhang    190.0       85    98  XIAOZHANG  825.0  550.0
zhao     186.0       92    96   XIAOZHAO  834.0  556.0
ma       180.0       88    77     XIAOMA  804.0  536.0
huang      NaN       90    90  XIAOHUANG    NaN    NaN
'''
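Positional extras are passed through `args`; `DataFrame.apply` also forwards extra keyword arguments to the function, so the call above can be written equivalently as:

df1 = data_frame2.apply(plus, axis=1, n=2, m=3)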
Example #49
0
class Scores(object):
    """

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """
    @classmethod
    def from_df(cls, df, uri=None, modality=None, aggfunc=np.mean):
        """

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------
        scores : `Scores`
        """
        dataframe = pivot_table(df,
                                values=PYANNOTE_SCORE,
                                index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK],
                                columns=PYANNOTE_LABEL,
                                aggfunc=aggfunc)

        annotation = Annotation(uri=uri, modality=modality)
        for index, _ in dataframe.iterrows():
            segment = Segment(*index[0])
            track = index[1]
            annotation[segment, track] = ''

        labels = dataframe.columns

        return cls(uri=uri,
                   modality=modality,
                   annotation=annotation,
                   labels=labels,
                   values=dataframe.values)

    def __init__(self,
                 uri=None,
                 modality=None,
                 annotation=None,
                 labels=None,
                 values=None,
                 dtype=None):

        super(Scores, self).__init__()

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        if annotation:
            annotation = annotation.copy()
            index = Index([s + (t, ) for s, t in annotation.itertracks()],
                          name=names)

        else:
            annotation = Annotation(uri=uri, modality=modality)
            index = MultiIndex(levels=[list() for name in names],
                               labels=[list() for name in names],
                               names=names)

        self.annotation_ = annotation
        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)
        dtype = np.float if values is None else values.dtype

        self.dataframe_ = DataFrame(data=data,
                                    dtype=dtype,
                                    index=index,
                                    columns=columns)

        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    def copy(self):
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):

        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track, ),
                                 axis=0,
                                 inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):

        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track, ), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):

        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track, ), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return bool(self.annotation_)

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated, False otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline(copy=False))

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline(copy=False))

    def itersegments(self):
        return iter(self)

    def tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        return self.annotation_.get_track_by_name(track)

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name


        Returns
        -------
        track : str
            New track name
        """

        return self.annotation_.new_track(segment, candidate=candidate,
                                          prefix=prefix)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # yield one (segment, track, label) tuple per loop
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track, )))

    def labels(self):
        """List of labels

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        return sorted(self.dataframe_.columns, key=str)

    def _reindexIfNeeded(self):

        if not self.hasChanged_:
            return

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        new_index = Index(
            [s + (t, ) for s, t in self.annotation_.itertracks()], name=names)

        self.dataframe_ = self.dataframe_.reindex(new_index)

        self.hasChanged_ = False

        return

    def retrack(self):
        """
        """

        self._reindexIfNeeded()
        retracked = self.copy()

        annotation = self.annotation_.retrack()
        retracked.annotation_ = annotation

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]
        new_index = Index([s + (t, ) for s, t in annotation.itertracks()],
                          name=names)
        retracked.dataframe_.index = new_index

        return retracked

    def apply(self, func, axis=0):

        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True

        return applied

    def rank(self, ascending=False):
        """

        Parameters
        ----------
        ascending : boolean, default False
            If False, rank from highest score (rank 0) to lowest (rank N-1)

        Returns
        -------
        rank : `Scores`

        """

        ranked = self.copy()
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1,
                                                      ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n, ascending=False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            If False, rank from highest score (rank 0) to lowest (rank N-1)

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """

        filtered = self.copy()
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n,
                                                        other=np.NaN)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """

        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        subset = Scores(uri=self.uri, modality=self.modality)
        subset.annotation_ = self.annotation_
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with its highest-scoring label. If that
            score is smaller than `threshold`, the label is replaced with an
            `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If the top model posterior is higher than the
            unknown posterior, it is selected; otherwise, the label is
            replaced with an `Unknown` instance.
        """

        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        if posterior:
            unknown_posterior = 1. - self.dataframe_.sum(axis=1)

            large_enough.dataframe_ = (((best.dataframe_.T > unknown_posterior)
                                        & (best.dataframe_.T > threshold)).T)

        else:

            large_enough.dataframe_ = ((best.dataframe_.T > threshold).T)

        large_enough.dataframe_.where(best.dataframe_.notnull(),
                                      inplace=True,
                                      other=np.NaN)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""

        mapped = self.copy()
        mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus, mode='strict'):
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode.

        Remarks
        -------
        In 'intersection' mode, a best effort is made to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segment, conflicting track names are
        modified to make sure no track is lost.

        """

        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        self._reindexIfNeeded()
        cropped = self.copy()

        if mode in ['strict', 'loose']:

            new_annotation = self.annotation_.crop(focus, mode=mode)
            keep = [
                new_annotation.has_track(segment, track)
                for segment, track in self.itertracks()
            ]
            cropped.dataframe_ = self.dataframe_[keep]
            cropped.annotation_ = new_annotation
            cropped.hasChanged_ = True

            return cropped

        elif mode in ['intersection']:

            raise NotImplementedError('')

            # # two original segments might be cropped into the same resulting
            # # segment -- therefore, we keep track of the mapping
            # intersection, mapping = timeline.crop(coverage,
            #                                       mode=mode, mapping=True)
            #
            # # create new empty annotation
            # A = self.__class__(uri=self.uri, modality=self.modality)
            #
            # for cropped in intersection:
            #     for original in mapping[cropped]:
            #         for track in self.tracks(original):
            #             # try to use original track name (candidate)
            #             # if it already exists, create a brand new one
            #             new_track = A.new_track(cropped, candidate=track)
            #             # copy each value, column by column
            #             for label in self.dataframe_.columns:
            #                 value = self.dataframe_.get_value((original, track),
            #                                            label)
            #                 A.dataframe_ = A.dataframe_.set_value((cropped, new_track),
            #                                         label, value)
            #
            # return A

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        from .notebook import repr_scores
        return repr_scores(self)
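A short usage sketch based on the class docstring above, assuming `Segment` is imported from the same package:

s = Scores(uri='video', modality='speaker')
s[Segment(0, 1), 's1', 'A'] = 0.1
s[Segment(0, 1), 's1', 'B'] = 0.2
s[Segment(0, 1), 's1', 'C'] = 0.3

# keep only the best label per track, then turn it into an Annotation
print(s.nbest(1))
print(s.to_annotation(threshold=0.15))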
Example #50
0
def get_slope(X: pd.Series) -> float:
    # X must be a Series here: the lambda below receives scalar values
    # element-wise, which would fail for the column Series that
    # DataFrame.apply passes
    lm = LinearRegression()
    lm.fit(np.arange(X.shape[0]).reshape(-1, 1),
           X.apply(lambda x: math.log(x) if x != 0 else x))
    return lm.coef_[0]
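A usage sketch, assuming the snippet's dependencies (`math`, `numpy`, `pandas`, and scikit-learn's `LinearRegression`) are in scope:

import pandas as pd

s = pd.Series([1.0, 2.0, 4.0, 8.0])  # doubles each step
print(get_slope(s))  # ~0.693, i.e. log(2) per step on the log scale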
Example #51
0
    def test_unstack_nan_index(self):  # GH7466
        cast = lambda val: '{0:1}'.format('' if val != val else val)
        nan = np.nan

        def verify(df):
            mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
            rows, cols = df.notna().values.nonzero()
            for i, j in zip(rows, cols):
                left = sorted(df.iloc[i, j].split('.'))
                right = mk_list(df.index[i]) + mk_list(df.columns[j])
                right = sorted(list(map(cast, right)))
                assert left == right

        df = DataFrame({
            'jim': ['a', 'b', nan, 'd'],
            'joe': ['w', 'x', 'y', 'z'],
            'jolie': ['a.w', 'b.x', ' .y', 'd.z']
        })

        left = df.set_index(['jim', 'joe']).unstack()['jolie']
        right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
        assert_frame_equal(left, right)

        for idx in itertools.permutations(df.columns[:2]):
            mi = df.set_index(list(idx))
            for lev in range(2):
                udf = mi.unstack(level=lev)
                assert udf.notna().values.sum() == len(df)
                verify(udf['jolie'])

        df = DataFrame({
            '1st': ['d'] * 3 + [nan] * 5 + ['a'] * 2 + ['c'] * 3 + ['e'] * 2 +
            ['b'] * 5,
            '2nd': ['y'] * 2 + ['w'] * 3 + [nan] * 3 + ['z'] * 4 + [nan] * 3 +
            ['x'] * 3 + [nan] * 2,
            '3rd': [
                67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59, 50, 62, 59, 76, 52,
                14, 53, 60, 51
            ]
        })

        df['4th'], df['5th'] = \
            df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \
            df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1)

        for idx in itertools.permutations(['1st', '2nd', '3rd']):
            mi = df.set_index(list(idx))
            for lev in range(3):
                udf = mi.unstack(level=lev)
                assert udf.notna().values.sum() == 2 * len(df)
                for col in ['4th', '5th']:
                    verify(udf[col])

        # GH7403
        df = pd.DataFrame({
            'A': list('aaaabbbb'),
            'B': range(8),
            'C': range(8)
        })
        df.iloc[3, 1] = np.NaN
        left = df.set_index(['A', 'B']).unstack(0)

        vals = [[3, 0, 1, 2, nan, nan, nan, nan],
                [nan, nan, nan, nan, 4, 5, 6, 7]]
        vals = list(map(list, zip(*vals)))
        idx = Index([nan, 0, 1, 2, 4, 5, 6, 7], name='B')
        cols = MultiIndex(levels=[['C'], ['a', 'b']],
                          labels=[[0, 0], [0, 1]],
                          names=[None, 'A'])

        right = DataFrame(vals, columns=cols, index=idx)
        assert_frame_equal(left, right)

        df = DataFrame({
            'A': list('aaaabbbb'),
            'B': list(range(4)) * 2,
            'C': range(8)
        })
        df.iloc[2, 1] = np.NaN
        left = df.set_index(['A', 'B']).unstack(0)

        vals = [[2, nan], [0, 4], [1, 5], [nan, 6], [3, 7]]
        cols = MultiIndex(levels=[['C'], ['a', 'b']],
                          labels=[[0, 0], [0, 1]],
                          names=[None, 'A'])
        idx = Index([nan, 0, 1, 2, 3], name='B')
        right = DataFrame(vals, columns=cols, index=idx)
        assert_frame_equal(left, right)

        df = pd.DataFrame({
            'A': list('aaaabbbb'),
            'B': list(range(4)) * 2,
            'C': range(8)
        })
        df.iloc[3, 1] = np.NaN
        left = df.set_index(['A', 'B']).unstack(0)

        vals = [[3, nan], [0, 4], [1, 5], [2, 6], [nan, 7]]
        cols = MultiIndex(levels=[['C'], ['a', 'b']],
                          labels=[[0, 0], [0, 1]],
                          names=[None, 'A'])
        idx = Index([nan, 0, 1, 2, 3], name='B')
        right = DataFrame(vals, columns=cols, index=idx)
        assert_frame_equal(left, right)

        # GH7401
        df = pd.DataFrame({
            'A':
            list('aaaaabbbbb'),
            'B': (date_range('2012-01-01', periods=5).tolist() * 2),
            'C':
            np.arange(10)
        })

        df.iloc[3, 1] = np.NaN
        left = df.set_index(['A', 'B']).unstack()

        vals = np.array([[3, 0, 1, 2, nan, 4], [nan, 5, 6, 7, 8, 9]])
        idx = Index(['a', 'b'], name='A')
        cols = MultiIndex(levels=[['C'],
                                  date_range('2012-01-01', periods=5)],
                          labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
                          names=[None, 'B'])

        right = DataFrame(vals, columns=cols, index=idx)
        assert_frame_equal(left, right)

        # GH4862
        vals = [['Hg', nan, nan, 680585148], ['U', 0.0, nan, 680585148],
                ['Pb', 7.07e-06, nan, 680585148],
                ['Sn', 2.3614e-05, 0.0133, 680607017],
                ['Ag', 0.0, 0.0133, 680607017],
                ['Hg', -0.00015, 0.0133, 680607017]]
        df = DataFrame(vals,
                       columns=['agent', 'change', 'dosage', 's_id'],
                       index=[17263, 17264, 17265, 17266, 17267, 17268])

        left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack()

        vals = [[nan, nan, 7.07e-06, nan, 0.0],
                [0.0, -0.00015, nan, 2.3614e-05, nan]]

        idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]],
                         labels=[[0, 1], [-1, 0]],
                         names=['s_id', 'dosage'])

        cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']],
                          labels=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
                          names=[None, 'agent'])

        right = DataFrame(vals, columns=cols, index=idx)
        assert_frame_equal(left, right)

        left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent'])
        assert_frame_equal(left.unstack(), right)

        # GH9497 - multiple unstack with nulls
        df = DataFrame({
            '1st': [1, 2, 1, 2, 1, 2],
            '2nd': pd.date_range('2014-02-01', periods=6, freq='D'),
            'jim': 100 + np.arange(6),
            'joe': (np.random.randn(6) * 10).round(2)
        })

        df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
        df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan
        df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan

        left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
        assert left.notna().values.sum() == 2 * len(df)

        for col in ['jim', 'joe']:
            for _, r in df.iterrows():
                key = r['1st'], (col, r['2nd'], r['3rd'])
                assert r[col] == left.loc[key]
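A minimal, self-contained illustration of the NaN-in-index unstacking behaviour these tests pin down, mirroring one of the toy frames above:

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': list('aaaabbbb'),
                   'B': list(range(4)) * 2,
                   'C': range(8)})
df.iloc[3, 1] = np.nan
# the NaN key becomes its own entry in the unstacked index
print(df.set_index(['A', 'B']).unstack(0))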
Example #52
0
class TestMoments(unittest.TestCase):

    _nan_locs = np.arange(20, 40)
    _inf_locs = np.array([])

    def setUp(self):
        arr = randn(N)
        arr[self._nan_locs] = np.NaN

        self.arr = arr
        self.rng = bdate_range(datetime(2009, 1, 1), periods=N)

        self.series = Series(arr.copy(), index=self.rng)

        self.frame = DataFrame(randn(N, K),
                               index=self.rng,
                               columns=np.arange(K))

    def test_rolling_sum(self):
        self._check_moment_func(mom.rolling_sum, np.sum)

    def test_rolling_count(self):
        counter = lambda x: np.isfinite(x).astype(float).sum()
        self._check_moment_func(mom.rolling_count,
                                counter,
                                has_min_periods=False,
                                preserve_nan=False)

    def test_rolling_mean(self):
        self._check_moment_func(mom.rolling_mean, np.mean)

    def test_rolling_median(self):
        self._check_moment_func(mom.rolling_median, np.median)

    def test_rolling_min(self):
        self._check_moment_func(mom.rolling_min, np.min)

    def test_rolling_max(self):
        self._check_moment_func(mom.rolling_max, np.max)

    def test_rolling_quantile(self):
        qs = [.1, .5, .9]

        def scoreatpercentile(a, per):
            values = np.sort(a, axis=0)

            idx = per / 1. * (values.shape[0] - 1)
            return values[int(idx)]

        for q in qs:

            def f(x, window, min_periods=None, freq=None):
                return mom.rolling_quantile(x,
                                            window,
                                            q,
                                            min_periods=min_periods,
                                            freq=freq)

            def alt(x):
                return scoreatpercentile(x, q)

            self._check_moment_func(f, alt)

    def test_rolling_apply(self):
        ser = Series([])
        assert_series_equal(ser, mom.rolling_apply(ser, 10,
                                                   lambda x: x.mean()))

        def roll_mean(x, window, min_periods=None, freq=None):
            return mom.rolling_apply(x,
                                     window,
                                     lambda x: x[np.isfinite(x)].mean(),
                                     min_periods=min_periods,
                                     freq=freq)

        self._check_moment_func(roll_mean, np.mean)

    def test_rolling_std(self):
        self._check_moment_func(mom.rolling_std, lambda x: np.std(x, ddof=1))
        self._check_moment_func(functools.partial(mom.rolling_std, ddof=0),
                                lambda x: np.std(x, ddof=0))

    def test_rolling_var(self):
        self._check_moment_func(mom.rolling_var, lambda x: np.var(x, ddof=1))
        self._check_moment_func(functools.partial(mom.rolling_var, ddof=0),
                                lambda x: np.var(x, ddof=0))

    def test_rolling_skew(self):
        try:
            from scipy.stats import skew
        except ImportError:
            raise nose.SkipTest('no scipy')
        self._check_moment_func(mom.rolling_skew,
                                lambda x: skew(x, bias=False))

    def test_rolling_kurt(self):
        try:
            from scipy.stats import kurtosis
        except ImportError:
            raise nose.SkipTest('no scipy')
        self._check_moment_func(mom.rolling_kurt,
                                lambda x: kurtosis(x, bias=False))

    def _check_moment_func(self,
                           func,
                           static_comp,
                           window=50,
                           has_min_periods=True,
                           has_time_rule=True,
                           preserve_nan=True):

        self._check_ndarray(func,
                            static_comp,
                            window=window,
                            has_min_periods=has_min_periods,
                            preserve_nan=preserve_nan)

        self._check_structures(func,
                               static_comp,
                               has_min_periods=has_min_periods,
                               has_time_rule=has_time_rule)

    def _check_ndarray(self,
                       func,
                       static_comp,
                       window=50,
                       has_min_periods=True,
                       preserve_nan=True):

        result = func(self.arr, window)
        assert_almost_equal(result[-1], static_comp(self.arr[-50:]))

        if preserve_nan:
            assert (np.isnan(result[self._nan_locs]).all())

        # excluding NaNs correctly
        arr = randn(50)
        arr[:10] = np.NaN
        arr[-10:] = np.NaN

        if has_min_periods:
            result = func(arr, 50, min_periods=30)
            assert_almost_equal(result[-1], static_comp(arr[10:-10]))

            # min_periods is working correctly
            result = func(arr, 20, min_periods=15)
            self.assert_(np.isnan(result[23]))
            self.assert_(not np.isnan(result[24]))

            self.assert_(not np.isnan(result[-6]))
            self.assert_(np.isnan(result[-5]))

            # min_periods=0
            result0 = func(arr, 20, min_periods=0)
            result1 = func(arr, 20, min_periods=1)
            assert_almost_equal(result0, result1)
        else:
            result = func(arr, 50)
            assert_almost_equal(result[-1], static_comp(arr[10:-10]))

    def _check_structures(self,
                          func,
                          static_comp,
                          has_min_periods=True,
                          has_time_rule=True):

        series_result = func(self.series, 50)
        self.assert_(isinstance(series_result, Series))

        frame_result = func(self.frame, 50)
        self.assertEquals(type(frame_result), DataFrame)

        # check time_rule works
        if has_time_rule:
            win = 25
            minp = 10

            if has_min_periods:
                series_result = func(self.series[::2],
                                     win,
                                     min_periods=minp,
                                     freq='B')
                frame_result = func(self.frame[::2],
                                    win,
                                    min_periods=minp,
                                    freq='B')
            else:
                series_result = func(self.series[::2], win, freq='B')
                frame_result = func(self.frame[::2], win, freq='B')

            last_date = series_result.index[-1]
            prev_date = last_date - 24 * datetools.bday

            trunc_series = self.series[::2].truncate(prev_date, last_date)
            trunc_frame = self.frame[::2].truncate(prev_date, last_date)

            assert_almost_equal(series_result[-1], static_comp(trunc_series))

            assert_almost_equal(frame_result.xs(last_date),
                                trunc_frame.apply(static_comp))

    def test_legacy_time_rule_arg(self):
        from StringIO import StringIO
        # suppress deprecation warnings
        sys.stderr = StringIO()

        rng = bdate_range('1/1/2000', periods=20)
        ts = Series(np.random.randn(20), index=rng)
        ts = ts.take(np.random.permutation(len(ts))[:12]).sort_index()

        try:
            result = mom.rolling_mean(ts, 1, min_periods=1, freq='B')
            expected = mom.rolling_mean(ts,
                                        1,
                                        min_periods=1,
                                        time_rule='WEEKDAY')
            tm.assert_series_equal(result, expected)

            result = mom.ewma(ts, span=5, freq='B')
            expected = mom.ewma(ts, span=5, time_rule='WEEKDAY')
            tm.assert_series_equal(result, expected)

        finally:
            sys.stderr = sys.__stderr__

    def test_ewma(self):
        self._check_ew(mom.ewma)

    def test_ewmvar(self):
        self._check_ew(mom.ewmvar)

    def test_ewmvol(self):
        self._check_ew(mom.ewmvol)

    def test_ewma_span_com_args(self):
        A = mom.ewma(self.arr, com=9.5)
        B = mom.ewma(self.arr, span=20)
        assert_almost_equal(A, B)

        self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, span=20)
        self.assertRaises(Exception, mom.ewma, self.arr)

    def _check_ew(self, func):
        self._check_ew_ndarray(func)
        self._check_ew_structures(func)

    def _check_ew_ndarray(self, func, preserve_nan=False):
        result = func(self.arr, com=10)
        if preserve_nan:
            assert (np.isnan(result[self._nan_locs]).all())

        # excluding NaNs correctly
        arr = randn(50)
        arr[:10] = np.NaN
        arr[-10:] = np.NaN

        # ??? check something

        # pass in ints
        result2 = func(np.arange(50), span=10)
        self.assert_(result2.dtype == np.float_)

    def _check_ew_structures(self, func):
        series_result = func(self.series, com=10)
        self.assert_(isinstance(series_result, Series))
        frame_result = func(self.frame, com=10)
        self.assertEquals(type(frame_result), DataFrame)

    # binary moments
    def test_rolling_cov(self):
        A = self.series
        B = A + randn(len(A))

        result = mom.rolling_cov(A, B, 50, min_periods=25)
        assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1])

    def test_rolling_corr(self):
        A = self.series
        B = A + randn(len(A))

        result = mom.rolling_corr(A, B, 50, min_periods=25)
        assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1])

        # test for correct bias correction
        a = tm.makeTimeSeries()
        b = tm.makeTimeSeries()
        a[:5] = np.nan
        b[:10] = np.nan

        result = mom.rolling_corr(a, b, len(a), min_periods=1)
        assert_almost_equal(result[-1], a.corr(b))

    def test_rolling_corr_pairwise(self):
        panel = mom.rolling_corr_pairwise(self.frame, 10, min_periods=5)

        correl = panel.ix[:, 1, 5]
        exp = mom.rolling_corr(self.frame[1], self.frame[5], 10, min_periods=5)
        tm.assert_series_equal(correl, exp)

    def test_flex_binary_frame(self):
        def _check(method):
            series = self.frame[1]

            res = method(series, self.frame, 10)
            res2 = method(self.frame, series, 10)
            exp = self.frame.apply(lambda x: method(series, x, 10))

            tm.assert_frame_equal(res, exp)
            tm.assert_frame_equal(res2, exp)

            frame2 = self.frame.copy()
            frame2.values[:] = np.random.randn(*frame2.shape)

            res3 = method(self.frame, frame2, 10)
            exp = DataFrame(
                dict((k, method(self.frame[k], frame2[k], 10))
                     for k in self.frame))
            tm.assert_frame_equal(res3, exp)

        methods = [mom.rolling_corr, mom.rolling_cov]
        for meth in methods:
            _check(meth)

    def test_ewmcov(self):
        self._check_binary_ew(mom.ewmcov)

    def test_ewmcorr(self):
        self._check_binary_ew(mom.ewmcorr)

    def _check_binary_ew(self, func):
        A = Series(randn(50), index=np.arange(50))
        B = A[2:] + randn(48)

        A[:10] = np.NaN
        B[-10:] = np.NaN

        result = func(A, B, 20, min_periods=5)

        self.assert_(np.isnan(result.values[:15]).all())
        self.assert_(not np.isnan(result.values[15:]).any())

        self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5)
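
The mom.rolling_* / mom.ewm* helpers exercised above come from the long-removed pandas.stats.moments namespace; in current pandas the same computations hang off .rolling() and .ewm(). A minimal sketch of the equivalents, assuming pandas >= 1.0:

import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(100))

s.rolling(50).sum()                    # mom.rolling_sum(s, 50)
s.rolling(50, min_periods=30).mean()   # mom.rolling_mean(s, 50, min_periods=30)
s.rolling(50).quantile(0.5)            # mom.rolling_quantile(s, 50, 0.5)
s.rolling(10).apply(np.mean)           # mom.rolling_apply(s, 10, np.mean)

# com and span parameterize the same decay: span = 2 * com + 1, which is
# why ewma(com=9.5) and ewma(span=20) agree in test_ewma_span_com_args
pd.testing.assert_series_equal(s.ewm(com=9.5).mean(), s.ewm(span=20).mean())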
Example #53
series3 = frame['d']
series3
frame
frame.sub(series3, axis=0)  # align on the index and broadcast across the columns: each column has the matching values subtracted

# NumPy ufuncs (element-wise array methods) also work on pandas objects
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

np.abs(frame)

# DataFrame.apply applies a function to the 1-D array formed by each column or row (Series level)
f = lambda x: x.max() - x.min()  # anonymous function f, returning the range

frame.apply(f)  # default axis=0: applied column-wise, the range of each column
frame.apply(f, axis=1)   # applied row-wise, the range of each row
# many common array statistics (e.g. sum, mean) are already DataFrame methods, so apply is not needed for them

# besides a scalar, apply can also return a Series of several values, depending on the function passed in
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)    # returns each column's min and max as a Series

# element-wise Python functions can also be applied to a DataFrame, via applymap
format = lambda x: '%.2f' % x   # anonymous formatting function for single elements
frame.applymap(format)

# Series has the equivalent element-wise function, map
frame['e'].map(format)
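
A quick check of the remark above that reductions already available as DataFrame methods make apply unnecessary; a minimal sketch:

import numpy as np
import pandas as pd

frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'))

# the built-in reductions give exactly the same answer as apply, column by column
via_apply = frame.apply(lambda x: x.max() - x.min())
via_methods = frame.max() - frame.min()
assert via_apply.equals(via_methods)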
Example #54
def brand_fill(df: pd.DataFrame):
    # fill brand; name_dict and brand_check are defined elsewhere in the module
    global regex
    regex = re.compile("(%s)" % "|".join(map(re.escape, name_dict.keys())))
    # _save('Data/brand_dict_name_dict_regex', [brand_dict, name_dict, regex])
    return df.apply(lambda x: brand_check(x.values[0], x.values[1]), axis=1)
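
The snippet relies on name_dict and brand_check, which are not shown. A minimal sketch of plausible stand-ins (hypothetical, for illustration only) so the row-wise apply can run end to end:

import re
import pandas as pd

# hypothetical stand-ins for the module-level objects brand_fill relies on
name_dict = {'acme': 'ACME', 'globex': 'Globex'}

def brand_check(title, brand):
    # keep an existing brand; otherwise try to recover it from the title
    if pd.notna(brand) and brand:
        return brand
    m = regex.search(str(title).lower())
    return name_dict[m.group(1)] if m else None

df = pd.DataFrame({'title': ['acme rocket skates', 'unknown gadget'],
                   'brand': [None, None]})
print(brand_fill(df))  # 'ACME' for the first row, None for the second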
Example #55
    def test_apply_multi_index(self):
        s = DataFrame([[1, 2], [3, 4], [5, 6]])
        s.index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']])
        s.columns = ['col1', 'col2']
        res = s.apply(lambda x: Series({'min': min(x), 'max': max(x)}), 1)
        assert isinstance(res.index, MultiIndex)
Example #56
def flows(futures, start=None, end=None, var=None, roll=None):
    position = futures.position
    market = futures.market
    # market1 = futures.p
    market = DataFrame(
        list(market.find({
            'date': {
                '$gte': start
            },
            'variety': var
        })))
    position = DataFrame(
        list(position.find({
            'date': {
                '$gte': start
            },
            'variety': var
        }))).drop_duplicates(['date', 'variety', 'symbol', 'long_party_name'],
                             'last')
    # position = position[['date','varie']]
    # position = position[position['long_party_name'].notna()]

    # positions
    # all member firms
    party_name = position[position['date'] == end]
    long_party_name = party_name['long_party_name']
    short_party_name = party_name['short_party_name']
    party_name = long_party_name.append(
        short_party_name).dropna().drop_duplicates()
    # sum long/short open interest and their changes
    long = position.groupby(['date', 'variety', 'long_party_name'
                             ])[['long_openIntr', 'long_openIntr_chg']].sum()
    # print(long)
    short = position.groupby(['date', 'variety', 'short_party_name'
                              ])[['short_openIntr',
                                  'short_openIntr_chg']].sum()
    # merge
    frames = [long, short]
    position = pd.concat(frames, axis=1, sort=True).fillna(0).reset_index()
    # rename fields
    position = position.rename(columns={
        'level_0': 'date',
        'level_1': 'variety',
        'level_2': 'BrokerID'
    })
    # market data
    market = market.copy()
    # index close
    market['cv'] = market.apply(lambda x: x['close'] * x['open_interest'],
                                axis=1)
    closes = market.groupby(['date', 'variety'])[['cv', 'open_interest']].sum()
    closes['close_index'] = closes['cv'] / closes['open_interest']
    # index open
    market['ov'] = market.apply(lambda x: x['open'] * x['open_interest'],
                                axis=1)
    opens = market.groupby(['date', 'variety'])[['ov', 'open_interest']].sum()
    closes['open_index'] = opens['ov'] / opens['open_interest']
    # price change
    closes['change_index'] = closes.apply(
        lambda x: x['close_index'] - x['open_index'], axis=1)
    closes = closes.reset_index()

    chg = closes[['date', 'variety', 'close_index', 'change_index']]

    # print(chg['change_index'])

    # print(merge)
    df = pd.DataFrame()

    for i in party_name:
        try:
            chg = chg.copy()
            # print(chg)
            chg['BrokerID'] = i
            position1 = position[position['BrokerID'] == i]
            # merge the two tables
            mem = pd.merge(chg,
                           position1,
                           on=['date', 'variety', 'BrokerID'],
                           how='left').fillna(0)
            # mem = merge[merge['BrokerID'] == i]
            # print(mem)

            mem = mem.copy()
            mem['today_net'] = mem.apply(
                lambda x: x['long_openIntr'] - x['short_openIntr'], axis=1)
            mem['yesterday_net'] = mem.groupby(['variety', 'BrokerID'
                                                ])['today_net'].shift(1)
            mem['tomorrow_chg'] = mem.groupby(['variety', 'BrokerID'
                                               ])['change_index'].shift(-1)
            mem['net_chg'] = mem.apply(
                lambda x: x['today_net'] - x['yesterday_net'], axis=1)
            #
            mem['count'] = mem['net_chg'].count()
            # mem = mem.rename(columns={'long_open_interest': 'long_openIntr', 'long_open_interest_chg': 'long_openIntr_chg', 'short_open_interest': 'short_openIntr','short_open_interest_chg': 'short_openIntr_chg'})
            # mem['change'] = mem.groupby(['variety', 'BrokerID'])['close_index'].shift(1)
            mem['change'] = mem['close_index'] - mem['close_index'].shift(1)

            # rolling-window correlations
            # mem['corr'] = mem['net_chg'].rolling(window=240).corr(mem['change_index'])
            # mem['corr2'] = mem['net_chg'].rolling(window=240).corr(mem['tomorrow_chg']).shift(1)
            # mem['corr3'] = mem['today_net'].rolling(window=240).corr(mem['change'])
            #
            mem['lot'] = 0
            # mem = mem.copy()
            mem['lot'] = mem.apply(lambda x: 0 if x['today_net'] == 0 else 1
                                   if x['today_net'] > 0 else -1,
                                   axis=1)
            mem['lot'] = mem['lot'].shift(1).fillna(0)
            mem['pnl'] = mem['change'] * mem['lot']
            # mem['fee'] = 0
            # mem['fee'][mem['lot'] != mem['lot'].shift(1)] = mem['close_index'] * 2 * 1
            mem['netpnl'] = mem['pnl']
            mem['cumpnl'] = mem['netpnl'].rolling(roll).sum()

            # mem['date'] = pd.to_datetime(mem['date'])

            # plotting
            # mem = mem.set_index('date')
            # with pd.plotting.plot_params.use('x_compat', True):  # method 1
            #     mem[['cumpnl']].plot(color='r', title=mem[u'BrokerID'][0] + " " + var + ' ' + end)
            #     mem['today_net'].plot(secondary_y=['today_net'])
            #     plt.ylabel('net position')
            # plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
            # plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
            # plt.show()

            # plt.plot(mem['cumpnl'])
            # print(mem)

            # flows = mem[mem['cumpnl'] > 0]
            # flows.sort_values('cumpnl', inplace=False)
            # print(flows)
            # flows = flows[['date', 'variety', 'BrokerID', 'corr', 'corr2',
            #                'today_net', 'net_chg', 'corr3',
            #                'cumpnl']].sort_values('cumpnl', inplace=False)
            # flows = flows.rename(columns={'today_net': 'net position',
            #                               'cumpnl': 'cumulative pnl points',
            #                               'net_chg': 'net position change',
            #                               'corr3': 'correlation'})
            # print(flows[['variety', 'BrokerID', 'net position',
            #              'net position change', 'cumulative pnl points']])
            # print(flows)
            # print(flows.sort_values('cumulative pnl points'))
            # mem = mem.groupby()
            # print(mem)
            # print(flows['net position'].sum())

            # mem = mem[-1:]
            print(mem)

            df1 = pd.DataFrame(mem)
            df = df.append(df1)

            # print(df.tail(20))
        except Exception:
            # skip brokers whose data is missing or inconsistent
            continue
    return df
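
Stripped of the data plumbing, the per-broker signal in the loop above is: take the sign of yesterday's net position (long minus short open interest) as today's lot, multiply by the day's index change, and cumulate (the original uses a rolling-window sum). A minimal sketch of that core logic on toy data:

import numpy as np
import pandas as pd

mem = pd.DataFrame({
    'close_index': [100.0, 101.5, 100.8, 102.2],
    'long_openIntr': [10, 12, 8, 9],
    'short_openIntr': [11, 9, 10, 7],
})

mem['today_net'] = mem['long_openIntr'] - mem['short_openIntr']
mem['change'] = mem['close_index'] - mem['close_index'].shift(1)
# sign of the net position, shifted one day so the position is taken
# before the move it gets paid on
mem['lot'] = np.sign(mem['today_net']).shift(1).fillna(0)
mem['pnl'] = mem['change'] * mem['lot']
mem['cumpnl'] = mem['pnl'].cumsum()
print(mem[['today_net', 'change', 'lot', 'pnl', 'cumpnl']])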
Example #57
frame


# In[93]:

np.abs(frame)


# In[94]:

f = lambda x: x.max() - x.min()


# In[95]:

frame.apply(f)


# In[96]:

frame.apply(f, axis=1)


# In[97]:

def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])


# In[98]:
Example #58
def alpha_diversity(X: pd.DataFrame,
                    metric="richness",
                    taxonomy: pd.DataFrame = None,
                    component_type="OTU",
                    mode="infer",
                    idx_taxonomy=[
                        'Phylum', 'Class', 'Order', 'Family', 'Genus',
                        'Species'
                    ],
                    name=None,
                    base=2,
                    obsv_type=None,
                    **alpha_kws):
    """
    X => pd.DataFrame of Otu counts with columns as Otus and rows as samples
    metric => a callable or a string {entropy, richness, gini, singletons}
    taxonomy => pd.DataFrame of taxonomy assignments for Otus
    """

    # Alpha measures
    def _entropy(x, **alpha_kws):
        return stats.entropy(x, base=base, **alpha_kws)

    def _richness(x, **alpha_kws):
        return (x > 0).sum().astype(int)

    def _gini(x, **alpha_kws):
        return skbio.diversity.alpha.gini_index(x, **alpha_kws)

    def _singletons(x, **alpha_kws):
        return (x == 1).sum().astype(int)

    d_metric_fn = {
        "entropy": _entropy,
        "richness": _richness,
        "gini": _gini,
        "singletons": _singletons
    }

    # Supported diversity measures
    if hasattr(metric, "__call__"):
        func = metric
    else:
        supported_metrics = list(d_metric_fn.keys())
        assert metric in supported_metrics, f"`{metric}` is not supported. The only available alpha diversity measures are {supported_metrics}"
        func = d_metric_fn[metric]
        name = metric

    # Compute diversity
    if mode == "infer":
        mode = "batch" if taxonomy is not None else "singular"

    assert mode in {
        "singular", "batch"
    }, "Please specify either 'singular', 'batch', or 'infer' for the mode"
    if mode == "singular":
        Se_alpha = X.apply(lambda x: func(x, **alpha_kws), axis=1)
        Se_alpha.index.name = name
        return Se_alpha

    if mode == "batch":
        assert taxonomy is not None, "`taxonomy` cannot be `None` when `mode='batch'`"
        d_level_metric = OrderedDict()
        for level in idx_taxonomy:
            if level in taxonomy.columns:
                df_level = otu_to_level(X, taxonomy, level=level)
                d_level_metric[level] = alpha_diversity(df_level,
                                                        metric=metric,
                                                        taxonomy=None,
                                                        mode="singular",
                                                        base=base,
                                                        **alpha_kws)
            else:
                print(
                    f"Skipping taxonomy level `{level}` because it is not in the taxonomy dataframe",
                    file=sys.stderr)

        d_level_metric[component_type] = alpha_diversity(X,
                                                         metric=metric,
                                                         taxonomy=None,
                                                         mode="singular",
                                                         base=base,
                                                         **alpha_kws)
        df_level_metric = pd.DataFrame(d_level_metric)
        df_level_metric.index.name = f"id_{obsv_type}"
        return df_level_metric
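
A minimal usage sketch for the 'singular' path, assuming the function above is importable with its dependencies installed (batch mode additionally needs skbio and the otu_to_level helper):

import pandas as pd

# rows are samples, columns are OTUs, values are counts
X = pd.DataFrame([[4, 0, 1],
                  [0, 0, 9],
                  [2, 3, 3]],
                 index=['sample_1', 'sample_2', 'sample_3'],
                 columns=['otu_a', 'otu_b', 'otu_c'])

richness = alpha_diversity(X, metric="richness", mode="singular")
print(richness)  # number of nonzero OTUs per sample: 2, 1, 3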
Example #59
def DistanceCalculation(df: DataFrame):
    # EstimatedDistance (defined elsewhere) computes the distance for one row
    df['Miles'] = 0.0
    df['Miles'] = df.apply(lambda row: EstimatedDistance(row), axis=1)
    return df
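
EstimatedDistance is not shown; a plausible stand-in (hypothetical, assuming Lat1/Lon1/Lat2/Lon2 columns in degrees) using the haversine formula lets the row-wise apply run:

import math
import pandas as pd
from pandas import DataFrame

# hypothetical stand-in: great-circle distance in miles between two points
def EstimatedDistance(row):
    lat1, lon1, lat2, lon2 = map(math.radians,
                                 [row['Lat1'], row['Lon1'],
                                  row['Lat2'], row['Lon2']])
    a = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 3958.8 * 2 * math.asin(math.sqrt(a))  # Earth radius in miles

df = DataFrame({'Lat1': [40.7128], 'Lon1': [-74.0060],
                'Lat2': [34.0522], 'Lon2': [-118.2437]})
print(DistanceCalculation(df))  # roughly 2,450 miles, NYC -> LA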
Example #60
import numpy as np
from pandas import Series
from pandas import DataFrame

#   - ufunc -> element-wise array methods work fine with pandas objects
frame = DataFrame(np.random.randn(4, 3),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])

frame
np.abs(frame)

# Applying a function on 1D arrays to each column or row
f = lambda x: x.max() - x.min()

frame.apply(f)
frame.apply(f, axis=1)


# sum and mean can be applied directly (no need for apply);
# apply can also return a Series of several values per column
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])


frame.apply(f)

# Element-wise python functions can be used too
format = lambda x: '%.2f' % x

# applymap, not map: map is the Series-level name for element-wise application
frame.applymap(format)
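
One caveat for newer pandas: DataFrame.applymap was renamed to DataFrame.map in pandas 2.1, and the old name is deprecated, so the element-wise call becomes:

# pandas >= 2.1 spells the element-wise DataFrame method `map`
frame.map(format)          # preferred
frame.applymap(format)     # still works, but emits a FutureWarning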