Example #1
    def test_dataframe(self, orient, numpy):
        if orient == "records" and numpy:
            pytest.skip("Not idiomatic pandas")

        df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[
            "a", "b"], columns=["x", "y", "z"])
        encode_kwargs = {} if orient is None else dict(orient=orient)
        decode_kwargs = {} if numpy is None else dict(numpy=numpy)

        output = ujson.decode(ujson.encode(df, **encode_kwargs),
                              **decode_kwargs)

        # Ensure proper DataFrame initialization.
        if orient == "split":
            dec = _clean_dict(output)
            output = DataFrame(**dec)
        else:
            output = DataFrame(output)

        # Corrections to enable DataFrame comparison.
        if orient == "values":
            df.columns = [0, 1, 2]
            df.index = [0, 1]
        elif orient == "records":
            df.index = [0, 1]
        elif orient == "index":
            df = df.transpose()

        tm.assert_frame_equal(output, df, check_dtype=False)
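Both this test and Example #5 below call a `_clean_dict` helper that is not part of the snippet. In the pandas ujson test module it simply normalizes the keys of the decoded "split" payload to plain strings before they are splatted into `DataFrame(**dec)`. A minimal sketch of such a helper (its exact body is an assumption):

def _clean_dict(d):
    # Force all keys to str so the decoded dict can be passed as DataFrame(**d);
    # the numpy-backed ujson decoder may hand back non-str keys.
    return {str(k): v for k, v in d.items()}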
Example #2
    def testDataFrame(self):
        df = DataFrame([[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"])

        # column indexed
        outp = DataFrame(ujson.decode(ujson.encode(df)))
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        outp = DataFrame(**ujson.decode(ujson.encode(df, orient="split")))
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="records")))
        outp.index = df.index
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="values")))
        outp.index = df.index
        self.assertTrue((df.values == outp.values).all())

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="index")))
        self.assertTrue((df.transpose() == outp).values.all())
        assert_array_equal(df.transpose().columns, outp.columns)
        assert_array_equal(df.transpose().index, outp.index)
Example #3
def svd_agg(m_rna, mi_rna, targets_matrix, c=1):
    if settings.CELERY_DEBUG:
        import sys
        sys.path.append('/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg')
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    #
    mRNA_data = m_rna.apply(lambda x: 1.0*x/max(x), axis=0)
    miRNA_data = mi_rna.apply(lambda x: 1-1.0*x/max(x), axis=0)
    #
    aggregate_data = mRNA_data
    #
    common_mRNAs =  Index(set(mRNA_data.columns) & set(targets_matrix.columns))
    common_miRNAs = Index(set(miRNA_data.columns) & set(targets_matrix.index))
    #
    for mRNA in common_mRNAs:
        #
        mRNA = Index([mRNA])
        #
        targetting_miRNAs = targets_matrix.ix[targets_matrix[mRNA[0]]==1, mRNA].index
        #
        selected_miRNA = miRNA_data.ix[:, targetting_miRNAs].T
        #
        if len(selected_miRNA.index) > 1:
            first_comp = DataFrame(np.linalg.svd(selected_miRNA)[2]).ix[0, :]
            first_comp.index = selected_miRNA.columns
        else:
            continue
        new_rep = DataFrame(np.linalg.svd(DataFrame([aggregate_data.ix[:, mRNA[0]], first_comp ]))[2]).ix[0, :]
        new_rep.index = aggregate_data.index
        aggregate_data.ix[:, mRNA[0]] = new_rep
    return aggregate_data
Example #4
def svd_agg_train(m_rna, mi_rna, targets_matrix, hide_columns=Index([])):
    #
    sample_indexes = m_rna.index - hide_columns
    mRNA_data = m_rna.apply(lambda x: 1.0*x/max(x), axis=0).ix[sample_indexes, :]
    miRNA_data = mi_rna.apply(lambda x: 1-1.0*x/max(x), axis=0).ix[sample_indexes, :]
    #
    aggregate_data = mRNA_data
    #
    common_mRNAs =  Index(set(mRNA_data.columns) & set(targets_matrix.columns))
    common_miRNAs = Index(set(miRNA_data.columns) & set(targets_matrix.index))
    #
    for mRNA in common_mRNAs:
        #
        mRNA = Index([mRNA])
        #
        targetting_miRNAs = targets_matrix.ix[targets_matrix[mRNA[0]]==1, mRNA].index
        #
        selected_miRNA = miRNA_data.ix[:, targetting_miRNAs]
        #
        if len(selected_miRNA.columns)>1:
            first_comp = DataFrame(np.linalg.svd(selected_miRNA)[2]).ix[0, :]
            first_comp.index = selected_miRNA.index
        new_rep = DataFrame(np.linalg.svd(DataFrame([aggregate_data.ix[:,mRNA[0]], first_comp ]).transpose())[2]).ix[0, :]
        new_rep.index = aggregate_data.index
        aggregate_data.ix[:, mRNA[0]] = new_rep
    return aggregate_data
Example #5
    def testDataFrame(self):
        df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z'])

        # column indexed
        outp = DataFrame(ujson.decode(ujson.encode(df)))
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split")))
        outp = DataFrame(**dec)
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="records")))
        outp.index = df.index
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="values")))
        outp.index = df.index
        self.assertTrue((df.values == outp.values).all())

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="index")))
        self.assertTrue((df.transpose() == outp).values.all())
        assert_array_equal(df.transpose().columns, outp.columns)
        assert_array_equal(df.transpose().index, outp.index)
Example #6
def set2df(sets, column_names, index=None, sort=True):
    df = DataFrame(list(sets), columns=column_names, index=index)
    if sort:
        df = df.sort(column_names)
        if index:
            df.index = index
        else:
            df.index = range(len(df))
    return df
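Note that `df.sort(column_names)` relies on `DataFrame.sort`, which was removed in pandas 0.20. A sketch of the same helper against current pandas, assuming `sort_values` captures the intended behaviour (the `set2df_modern` name is only for illustration):

from pandas import DataFrame

def set2df_modern(sets, column_names, index=None, sort=True):
    # Same behaviour as set2df above, with sort_values replacing the removed DataFrame.sort.
    df = DataFrame(list(sets), columns=column_names, index=index)
    if sort:
        df = df.sort_values(column_names)
        if index:
            df.index = index
        else:
            df.index = range(len(df))
    return df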
Example #7
def test_sort_datetimelike():
    # GH10505

    # use same data as test_groupby_sort_categorical, which category is
    # corresponding to datetime.month
    df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
                           datetime(2011, 2, 1), datetime(2011, 5, 1),
                           datetime(2011, 2, 1), datetime(2011, 1, 1),
                           datetime(2011, 5, 1)],
                    'foo': [10, 8, 5, 6, 4, 1, 7],
                    'bar': [10, 20, 30, 40, 50, 60, 70]},
                   columns=['dt', 'foo', 'bar'])

    # ordered=True
    df['dt'] = Categorical(df['dt'], ordered=True)
    index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 7, 1)]
    result_sort = DataFrame(
        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(index, name='dt', ordered=True)

    index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 1, 1)]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=['foo', 'bar'])
    result_nosort.index = CategoricalIndex(index, categories=index,
                                           name='dt', ordered=True)

    col = 'dt'
    assert_frame_equal(
        result_sort, df.groupby(col, sort=True, observed=False).first())

    # when categories is ordered, group is ordered by category's order
    assert_frame_equal(
        result_sort, df.groupby(col, sort=False, observed=False).first())

    # ordered = False
    df['dt'] = Categorical(df['dt'], ordered=False)
    index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 7, 1)]
    result_sort = DataFrame(
        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(index, name='dt')

    index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 1, 1)]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=['foo', 'bar'])
    result_nosort.index = CategoricalIndex(index, categories=index,
                                           name='dt')

    col = 'dt'
    assert_frame_equal(
        result_sort, df.groupby(col, sort=True, observed=False).first())
    assert_frame_equal(
        result_nosort, df.groupby(col, sort=False, observed=False).first())
Example #8
    def test_grouper_index_types(self):
        # related GH5375
        # groupby misbehaving when using a Floatlike index
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
        for index in [tm.makeFloatIndex, tm.makeStringIndex,
                      tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex,
                      tm.makePeriodIndex]:

            df.index = index(len(df))
            df.groupby(list('abcde')).apply(lambda x: x)

            df.index = list(reversed(df.index.tolist()))
            df.groupby(list('abcde')).apply(lambda x: x)
Example #9
    def deserialize(self, item, force_bytes_to_unicode=False):
        index = self._index_from_records(item)
        column_fields = [x for x in item.dtype.names if x not in item.dtype.metadata['index']]
        multi_column = item.dtype.metadata.get('multi_column')
        if len(item) == 0:
            rdata = item[column_fields] if len(column_fields) > 0 else None
            if multi_column is not None:
                columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])
                return DataFrame(rdata, index=index, columns=columns)
            else:
                return DataFrame(rdata, index=index)

        columns = item.dtype.metadata['columns']
        df = DataFrame(data=item[column_fields], index=index, columns=columns)

        if multi_column is not None:
            df.columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])

        if force_bytes_to_unicode:
            # This is needed because data written as 'str' in py2 is read back as 'bytes' in py3, which breaks
            # the workflow of people migrating to py3. See https://github.com/manahl/arctic/issues/598
            # This should not be used for a normal flow; instead, write unicode strings
            # if you want to work with str in py3.

            for c in df.select_dtypes(object):
                # The conversion does not use astype (as the index does) because pandas has a bug where it tries to
                # convert the data columns with str(), and when the object is bytes, e.g. b'abc', the result is
                # u"b'abc'", i.e. it includes the b prefix as well. This generally happens when str() is called
                # without specifying an encoding, e.g. str(b'abc') -> "b'abc'"; the fix is to pass the encoding,
                # i.e. str(b'abc', 'utf-8') -> "abc".
                if type(df[c].iloc[0]) == bytes:
                    df[c] = df[c].str.decode('utf-8')

            if isinstance(df.index, MultiIndex):
                unicode_indexes = []
                # MultiIndex requires a conversion at each level.
                for level in range(len(df.index.levels)):
                    _index = df.index.get_level_values(level)
                    if isinstance(_index[0], bytes):
                        _index = _index.astype('unicode')
                    unicode_indexes.append(_index)
                df.index = unicode_indexes
            else:
                if type(df.index[0]) == bytes:
                    df.index = df.index.astype('unicode')

            if type(df.columns[0]) == bytes:
                df.columns = df.columns.astype('unicode')

        return df
Example #10
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
        df.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df['A'] = df['A'].astype(np.int16)
        df['B'] = df['B'].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected['A'] = expected['A'].astype(np.int16)
        expected['B'] = expected['B'].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)
Example #11
    def test_nunique(self):
        df = DataFrame({
            'A': list('abbacc'),
            'B': list('abxacc'),
            'C': list('abbacx'),
        })

        expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
        result = df.groupby('A', as_index=False).nunique()
        tm.assert_frame_equal(result, expected)

        # as_index
        expected.index = list('abc')
        expected.index.name = 'A'
        result = df.groupby('A').nunique()
        tm.assert_frame_equal(result, expected)

        # with na
        result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
        tm.assert_frame_equal(result, expected)

        # dropna
        expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                             index=list('abc'))
        expected.index.name = 'A'
        result = df.replace({'x': None}).groupby('A').nunique()
        tm.assert_frame_equal(result, expected)
Example #12
def twitter_count(keys,d,strdb):

	#Mongo
	connection = pymongo.MongoClient(keys['db']['host'])
	dbtm = connection[keys['db']['name']]
	db = dbtm[strdb]
	#MongoDB Query - Mentions
	#The Day Of
	upper_bound_start_ts = float(calendar.timegm(d[-1].utctimetuple())*1000); 
	upper_bound_end = d[-1] + timedelta(days=1); 
	upper_bound_end_ts = float(calendar.timegm(upper_bound_end.utctimetuple())*1000)
	#upper_bound_end_ts = float(calendar.timegm(d[-1].utctimetuple())*1000); upper_bound_start = d[-1] - timedelta(days=1); upper_bound_start_ts = float(calendar.timegm(upper_bound_start.utctimetuple())*1000)
	# Retrieve tweets that are not authored by the user itself.
	if strdb in 'mentions':
		tr = 	db.aggregate([
								{'$match': {'timestamp':{'$gt': upper_bound_start_ts, '$lt': upper_bound_end_ts}}},
								{'$unwind':'$cdpid'},
								{'$group':{'_id':'$cdpid',strdb:{'$sum':1}}}])
	# The tweets collection, unlike the mentions collection, does not need $unwind.
	else:
		tr = 	db.aggregate([
							{'$match': {'timestamp':{'$gt': upper_bound_start_ts, '$lt': upper_bound_end_ts}}},
							{'$group':{'_id':'$cdpid',strdb:{'$sum':1}}}])
	tr = DataFrame(tr['result'])
	tr.index = tr._id
	tr = tr.drop('_id', axis=1)
	tr = tr.sort_index()
	#mts['Date'] = Period(d[-2],'D')
	print '%s for ' %(strdb), d[-1], ' processed'
	return(tr)
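A caveat about `tr = DataFrame(tr['result'])`: indexing the aggregation result with 'result' matches the pymongo 2.x API, where aggregate() returned a dict. In pymongo 3+ aggregate() returns a cursor, so an equivalent construction would look like the sketch below (the aggregate_counts name and the pipeline argument are hypothetical stand-ins for the $match/$group stages built above):

from pandas import DataFrame

def aggregate_counts(db, pipeline):
    # pymongo 3+: aggregate() returns a CommandCursor, so materialize it before handing
    # it to pandas; set_index/sort_index replaces the manual index juggling above.
    return DataFrame(list(db.aggregate(pipeline))).set_index('_id').sort_index()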
Example #13
    def test_merge_datetime_index(self, box):
        # see gh-19038
        df = DataFrame([1, 2, 3],
                       ["2016-01-01", "2017-01-01", "2018-01-01"],
                       columns=["a"])
        df.index = pd.to_datetime(df.index)
        on_vector = df.index.year

        if box is not None:
            on_vector = box(on_vector)

        expected = DataFrame(
            OrderedDict([
                ("a", [1, 2, 3]),
                ("key_1", [2016, 2017, 2018]),
            ])
        )

        result = df.merge(df, on=["a", on_vector], how="inner")
        tm.assert_frame_equal(result, expected)

        expected = DataFrame(
            OrderedDict([
                ("key_0", [2016, 2017, 2018]),
                ("a_x", [1, 2, 3]),
                ("a_y", [1, 2, 3]),
            ])
        )

        result = df.merge(df, on=[df.index.year], how="inner")
        tm.assert_frame_equal(result, expected)
Example #14
    def test_wls_panel(self):
        y = tm.makeTimeDataFrame()
        x = Panel({"x1": tm.makeTimeDataFrame(), "x2": tm.makeTimeDataFrame()})

        y.ix[[1, 7], "A"] = np.nan
        y.ix[[6, 15], "B"] = np.nan
        y.ix[[3, 20], "C"] = np.nan
        y.ix[[5, 11], "D"] = np.nan

        stack_y = y.stack()
        stack_x = DataFrame(dict((k, v.stack()) for k, v in compat.iteritems(x)))

        weights = x.std("items")
        stack_weights = weights.stack()

        stack_y.index = stack_y.index._tuple_index
        stack_x.index = stack_x.index._tuple_index
        stack_weights.index = stack_weights.index._tuple_index

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ols(y=y, x=x, weights=1 / weights)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            expected = ols(y=stack_y, x=stack_x, weights=1 / stack_weights)

        assert_almost_equal(result.beta, expected.beta)

        for attr in ["resid", "y_fitted"]:
            rvals = getattr(result, attr).stack().values
            evals = getattr(expected, attr).values
            assert_almost_equal(rvals, evals)
Example #15
    def predict(self, tree):
        """
        TODO Should take an array and predict every item. A score can be stored.
        It would follow the guidelines set by scikit-learn.
        """
        tree_rules = self.extract_rules(tree)
        df = DataFrame(columns=['label', 'prob'])
        gb = self.posteriori.groupby('label')


        for key, indexes in gb.groups.items():
            apriori_prob = self.apriori[self.apriori.label == key]['freq'].values[0]
            prob = apriori_prob

            group_df, missing_prob = self.apply_smoothing(self.posteriori.ix[indexes], tree_rules)

            for rule in tree_rules:
                prob_evidence = group_df[group_df.rule == rule]['freq']
                if len(prob_evidence) == 0:
                    prob_evidence = missing_prob
                else:
                    prob_evidence = prob_evidence.values[0]
                prob *= prob_evidence
            
            post = DataFrame({'label':[key], 'prob':[prob]})
            df = df.append(post)

        df.index = np.arange(df.index.size)
        df = df.sort(columns='prob', ascending=False)
        return df.ix[df['prob'].idxmax()]
Example #16
    def test_basic(self, sparse, dtype):
        s_list = list('abc')
        s_series = Series(s_list)
        s_series_index = Series(s_list, list('ABC'))

        expected = DataFrame({'a': [1, 0, 0],
                              'b': [0, 1, 0],
                              'c': [0, 0, 1]},
                             dtype=self.effective_dtype(dtype))
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        if sparse:
            tm.assert_sp_frame_equal(result,
                                     expected.to_sparse(kind='integer',
                                                        fill_value=0))
        else:
            assert_frame_equal(result, expected)

        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        if sparse:
            expected = expected.to_sparse(kind='integer', fill_value=0)
        assert_frame_equal(result, expected)

        expected.index = list('ABC')
        result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
        if sparse:
            expected.to_sparse(kind='integer', fill_value=0)
        assert_frame_equal(result, expected)
Example #17
    def test_wls_panel(self):
        y = tm.makeTimeDataFrame()
        x = Panel({'x1': tm.makeTimeDataFrame(),
                   'x2': tm.makeTimeDataFrame()})

        y.iloc[[1, 7], y.columns.get_loc('A')] = np.nan
        y.iloc[[6, 15], y.columns.get_loc('B')] = np.nan
        y.iloc[[3, 20], y.columns.get_loc('C')] = np.nan
        y.iloc[[5, 11], y.columns.get_loc('D')] = np.nan

        stack_y = y.stack()
        stack_x = DataFrame(dict((k, v.stack())
                                 for k, v in x.iteritems()))

        weights = x.std('items')
        stack_weights = weights.stack()

        stack_y.index = stack_y.index._tuple_index
        stack_x.index = stack_x.index._tuple_index
        stack_weights.index = stack_weights.index._tuple_index

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ols(y=y, x=x, weights=1 / weights)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            expected = ols(y=stack_y, x=stack_x, weights=1 / stack_weights)

        assert_almost_equal(result.beta, expected.beta)

        for attr in ['resid', 'y_fitted']:
            rvals = getattr(result, attr).stack().values
            evals = getattr(expected, attr).values
            assert_almost_equal(rvals, evals)
Example #18
    def test_wls_panel(self):
        y = tm.makeTimeDataFrame()
        x = Panel({"x1": tm.makeTimeDataFrame(), "x2": tm.makeTimeDataFrame()})

        y.ix[[1, 7], "A"] = np.nan
        y.ix[[6, 15], "B"] = np.nan
        y.ix[[3, 20], "C"] = np.nan
        y.ix[[5, 11], "D"] = np.nan

        stack_y = y.stack()
        stack_x = DataFrame(dict((k, v.stack()) for k, v in x.iteritems()))

        weights = x.std("items")
        stack_weights = weights.stack()

        stack_y.index = stack_y.index.get_tuple_index()
        stack_x.index = stack_x.index.get_tuple_index()
        stack_weights.index = stack_weights.index.get_tuple_index()

        result = ols(y=y, x=x, weights=1 / weights)
        expected = ols(y=stack_y, x=stack_x, weights=1 / stack_weights)

        assert_almost_equal(result.beta, expected.beta)

        for attr in ["resid", "y_fitted"]:
            rvals = getattr(result, attr).stack().values
            evals = getattr(expected, attr).values
            assert_almost_equal(rvals, evals)
Example #19
 def bdib(self, ticker, fld_list, startDateTime, endDateTime, eventType='TRADE', interval = 1):
     """
     Get one ticker (Only one ticker available per call); eventType (TRADE, BID, ASK,..etc); interval (in minutes)
             ; fld_list (Only [open, high, low, close, volumne, numEvents] availalbe)
     return pandas dataframe with return Data
     """
     # Create and fill the request for the historical data
     request = self.refDataService.createRequest("IntradayBarRequest")
     request.set("security", ticker)
     request.set("eventType", eventType)
     request.set("interval", interval)  # bar interval in minutes        
     request.set("startDateTime", startDateTime)
     request.set("endDateTime", endDateTime)
     
     print "Sending Request:", request
     # Send the request
     self.session.sendRequest(request)
     # defaultdict - later convert to pandas
     data = defaultdict(dict)
     # Process received events
     while(True):
         # We provide timeout to give the chance for Ctrl+C handling:
         ev = self.session.nextEvent(500)
         for msg in ev:
             barTickData = msg.getElement('barData').getElement('barTickData')
             for i in range(barTickData.numValues()) :
                 for j in range(len(fld_list)) :
                     data[(fld_list[j])][barTickData.getValue(i).getElement(0).getValue()] = barTickData.getValue(i).getElement(fld_list[j]).getValue()
     
         if ev.eventType() == blpapi.Event.RESPONSE:
             # Response completely received, so we can exit
             break
     data = DataFrame(data)
     data.index = pd.to_datetime(data.index)
     return data
Example #20
def test_fenci():

    dfs = []

    for i in range(0, 9):
        f = file('Data/ftags_{}.pkl'.format(i), 'rb')
        fdist = pickle.load(f)
        #fdist.plot(50)
        df = DataFrame(fdist.items(), columns=['关键词', '计数'])
        df = df.sort_index(by='计数', ascending=False)
        df.index = range(len(df))

        df_plt = df[:30]
        df_plt = df_plt[::-1]
        #df_plt['关键词'].apply(lambda x : x.encode('utf8'))
        print df_plt.head()
        df_plt.plot(kind='barh', x=df_plt['关键词'], title=classifies[i])

        #plt.show()

        filePath = 'Data/{}.png'.format(classifies[i])
        str_name_f = filePath.decode("utf8")
        plt.savefig(str_name_f, dpi=100)

        dfs.append((classifies[i],df))

        #print df[df[1] > 1]
        f.close()
    print 'end'

    with pd.ExcelWriter('Data/keys.xlsx') as writer:
        for key, df in dfs:
            print key
            df.to_excel(writer, sheet_name=key, index=False)
Example #21
    def test_wls_panel(self):
        y = tm.makeTimeDataFrame()
        x = Panel({'x1' : tm.makeTimeDataFrame(),
                   'x2' : tm.makeTimeDataFrame()})

        y.ix[[1, 7], 'A'] = np.nan
        y.ix[[6, 15], 'B'] = np.nan
        y.ix[[3, 20], 'C'] = np.nan
        y.ix[[5, 11], 'D'] = np.nan

        stack_y = y.stack()
        stack_x = DataFrame(dict((k, v.stack())
                                  for k, v in x.iteritems()))

        weights = x.std('items')
        stack_weights = weights.stack()

        stack_y.index = stack_y.index.get_tuple_index()
        stack_x.index = stack_x.index.get_tuple_index()
        stack_weights.index = stack_weights.index.get_tuple_index()

        result = ols(y=y, x=x, weights=1/weights)
        expected = ols(y=stack_y, x=stack_x, weights=1/stack_weights)

        assert_almost_equal(result.beta, expected.beta)

        for attr in ['resid', 'y_fitted']:
            rvals = getattr(result, attr).stack().values
            evals = getattr(expected, attr).values
            assert_almost_equal(rvals, evals)
Example #22
 def data_frame(self):
     if self._processed_knockouts is None:
         self._process_knockouts()
     data_frame = DataFrame(self._processed_knockouts)
     data_frame.sort_values("size", inplace=True)
     data_frame.index = [i for i in range(len(data_frame))]
     return data_frame
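The last two lines above (sort in place, then re-number the index by hand) can also be expressed with reset_index. A sketch under the assumption that self._processed_knockouts is the same record structure as in the original and that pandas is imported as above:

 def data_frame(self):
     if self._processed_knockouts is None:
         self._process_knockouts()
     # sort_values followed by reset_index(drop=True) yields the same 0..n-1 index
     return DataFrame(self._processed_knockouts).sort_values("size").reset_index(drop=True)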
Example #23
def make_plot():
	# get list of the checked features
	features = request.form.getlist('feature')
	
	# capture the ticker input from the user
	ticker = request.form['ticker']

	# calculate one month time period from now
	now = datetime.now()
	end_date = now.strftime('%Y-%m-%d')
	start_date = (now - timedelta(days=30)).strftime('%Y-%m-%d')

	# fetch the appropriate dataset via API
	URL = 'https://www.quandl.com/api/v3/datasets/WIKI/'+ticker+'.json?start_date='+start_date+'&end_date='+end_date+'&order=asc&api_key=eFoXAcyvLhyuB3Rsvg6o'
	# URL = 'https://www.quandl.com/api/v3/datasets/WIKI/'+ticker+'.json?start_date=2015-08-01&end_date=2015-09-01&order=asc&api_key=eFoXAcyvLhyuB3Rsvg6o'
	r = requests.get(URL)

	# convert into a pandas dataframe
	request_df = DataFrame(r.json()) 
	df = DataFrame(request_df.ix['data','dataset'], columns = request_df.ix['column_names','dataset'])
	df.columns = [x.lower() for x in df.columns]
	df = df.set_index(['date'])
	df.index = to_datetime(df.index)

	# create a Bokeh plot from the dataframe
	# output_file("stock.html", title="Stock prices changes for last month")
	p = figure(x_axis_type = "datetime")
	if 'open' in features:
	    p.line(df.index, df['open'], color='blue', legend='opening price')
	if 'high' in features:
	    p.line(df.index, df['high'], color='red', legend='highest price')
	if 'close' in features:
	    p.line(df.index, df['close'], color='green', legend='closing price')
	return p
Example #24
def plotting():
    
    # get list of the checked features
    features = request.form.getlist('feature')
    #user's input
    ticker = request.form['ticker']
    #calculate the time one month before
    now = datetime.now()
    #calculate the time difference
    start_date = (now - timedelta(days=30)).strftime('%Y-%m-%d')
    end_date = now.strftime('%Y-%m-%d')
    #fetch the dataset
    URL = 'https://www.quandl.com/api/v3/datasets/WIKI/'+ticker+'.json?start_date='+start_date+'&end_date='+end_date+'&order=asc&api_key=WVEFZw8uyJzuvHE3VsQW'
    r = requests.get(URL)
    
    
    #pass to pandas dataframe
    raw_data = DataFrame(r.json())
    #clean up the data
    df = DataFrame(raw_data.ix['data','dataset'] , columns = raw_data.ix['column_names','dataset'])
    #set the column names with lower case
    df.columns = [x.lower() for x in df.columns]
    #set the index to the date column
    df = df.set_index(['date'])
    #convert the index to datetime 
    df.index = to_datetime(df.index)
    
    #create the plot
    p = figure(x_axis_type = "datetime")
    if 'open' in features:
        p.line(df.index, df['open'], color='blue', legend='opening price')
    if 'high' in features:
        p.line(df.index, df['high'], color='red', legend='highest price')
    if 'close' in features:
        p.line(df.index, df['close'], color='green', legend='closing price')
    return p
Example #25
def output():
    # getting user set options from the index2.html page
    options = request.form.getlist('feature')
    stock = request.form['stock']
    stock = stock.upper()

    # requesting data from Quandl
    nw = datetime.now()
    start_date = (nw - timedelta(days=30)).strftime('%Y-%m-%d')
    end_date = nw.strftime('%Y-%m-%d')
    req_url = 'https://www.quandl.com/api/v3/datasets/WIKI/' + stock + '.json?start_date=' + start_date + '&end_date=' + end_date + '&order=asc&api_key=3bkydVzcH_PPsy5zzAPn'
    r = requests.get(req_url)

    # pandas in action
    request_df = DataFrame(r.json())
    df = DataFrame(request_df.ix['data', 'dataset'], columns=request_df.ix['column_names', 'dataset'])
    df.columns = [x.lower() for x in df.columns]
    df = df.set_index(['date'])
    df.index = to_datetime(df.index)

    # create plot - PLAY AROUND WITH THIS TO MAKE IT GENUINE
    # output_file("output.html", title="Stock prices changes for last month")
    p = figure(x_axis_type="datetime")
    if 'open' in options:
        p.line(df.index, df['open'], color='black', legend='Opening price')
    if 'high' in options:
        p.line(df.index, df['high'], color='red', legend='Highest price')
    if 'close' in options:
        p.line(df.index, df['close'], color='blue', legend='Closing price')
    return p
Example #26
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
        df.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df["A"] = df["A"].astype(np.int16)
        df["B"] = df["B"].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected["A"] = expected["A"].astype(np.int16)
        expected["B"] = expected["B"].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.float)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)
Example #27
 def get_regression_table(self):
     regression_table = DataFrame({"beta": self.coef, "std_X": self.std_X})
     regression_table.index = self.features
     regression_table['beta_normalized'] = regression_table.beta * regression_table.std_X
     regression_table['effect'] = np.fabs(regression_table['beta_normalized'])
     regression_table = regression_table.sort_index(by='effect', ascending=False)
     return regression_table
Example #28
def make_plot():
    types = request.form.getlist("type")

    ticker = request.form["ticker"]
    now = datetime.now()
    end_date = now.strftime("%Y-%m-%d")
    start_date = (now - timedelta(days=180)).strftime("%Y-%m-%d")  # six - month timeframe

    URL = (
        "https://www.quandl.com/api/v3/datasets/WIKI/"
        + ticker
        + ".json?start_date="
        + start_date
        + "&end_date="
        + end_date
        + "&order=asc&api_key=eFoXAcyvLhyuB3Rsvg6o"
    )
    r = requests.get(URL)
    df_handle = DataFrame(r.json())

    df = DataFrame(df_handle.ix["data", "dataset"], columns=df_handle.ix["column_names", "dataset"])
    df.columns = [x.lower() for x in df.columns]
    df = df.set_index(["date"])
    df.index = to_datetime(df.index)

    p = figure(x_axis_type="datetime")

    if "open" in types:
        p.line(df.index, df["open"], color="blue", legend="opening price")
    if "high" in types:
        p.line(df.index, df["high"], color="red", legend="highest price")
    if "close" in types:
        p.line(df.index, df["close"], color="green", legend="closing price")
    return p
Example #29
def fetchQuotes(sym, start=FROM_DATE, end=CURRENT_DATE):
    his = None
    data = None
    try:
        # print start, end
        data = ystockquote.get_historical_prices(sym, start, end)
    except Exception:
        print "Please check the dates. Data might not be available. 404 returned"

        # 404 due to data yet not available
    if data:
        his = DataFrame(collections.OrderedDict(sorted(data.items()))).T
        his = his.convert_objects(convert_numeric=True)
        his.index = pd.to_datetime(his.index)
        his.insert(0, 'symbol', sym, allow_duplicates=True)
        # insert the date as dataframe too
        his.insert(1, 'date', his.index)
        # his.columns = getColumns('stock_quote_historical')   # Removing as db dependency is removed
        his.columns = getColumnsNoSql('stock_quote_historical')

    daily = ystockquote.get_all(sym)
    # print daily
    # persist(his, daily, sym, end)

    return his, daily
Example #30
    def __init__(self, outcomes, texts, parameters_display, verbose=False):

        options = {"lowercase": True, "lemmatize": True, "remove-stopwords": True}
        super(DisplayTextModel, self).__init__(outcomes, texts, 'bag-of-words', options)

        data = DataFrame({"y": outcomes, "text": texts})

        # Storing whether the outcome is a dummy:
        if set(data.y) == set([0, 1]):
            self.is_dummy_outcome = True

        N = data.shape[0]
        self.number_of_observations = N

        data.index = [str(x) for x in range(N)]
        data['y_hat'] = self.pipe.predict(texts)

        ridge = self.pipe.named_steps['ridge_model']

        self.std_X = ridge.std_X
        self.parameters_display = parameters_display
        self.mean_outcome_in_groups = mean_outcome_in_groups(data.y, data.y_hat)
        self.percent_correct = share_correct(data.y, data.y_hat, verbose=verbose)
        self.outcome_summary = get_summary(outcomes)

        self.coef = ridge.coef_
        self.number_of_features = len(self.coef)
        features = self.pipe.named_steps['featurizer'].get_feature_names()
        self.features = [f.split("__")[1] for f in features]
Example #31
                    ) + '" target="_blank">' + targetpage.geturl() + '</a>'
                    print(messages)
                else:
                    result.loc[result['misspelling'] == rowdata[0],
                               'wiki'] = True
                    #result.loc[result['misspelling'] == rowdata[0], 'wikiurl'] = browser.current_url
                    result.loc[result['misspelling'] == rowdata[0],
                               'wikiurl'] = targetpage.geturl()
                    messages = '[OK] ' + rowdata[
                        0] + ': Found\n + Link: ' + targetpage.geturl()
                    print(messages)
                    output = output + '\n' + messages
        # Sorting result values
        result.sort_values(by=['duplication', 'wiki', 'misspelling', 'url'],
                           ascending=[True, True, True, True],
                           inplace=True)
        result.index = range(len(result))
        # Exporting to csv file
        result.to_csv(outputname, header=True, index=True)
        #print (result.to_string())
        #output = output + '\n' + result.to_string()
        f.write(output)
        f.close()
    else:
        f = open(logname, 'w')
        messages = '[ERR] Initialization failure'
        print(messages)
        output = output + '\n' + messages
        f.write(output)
        f.close()
Example #32
def classify_otus_experimental(
        representative_sequences: DNASequencesDirectoryFormat,
        tree: NewickFormat,
        reference_taxonomy: pd.DataFrame = None) -> pd.DataFrame:
    if reference_taxonomy is None:
        filename_default_taxonomy = os.path.join(_sepp_refs_path(),
                                                 'taxonomy_gg99.qza')
        reference_taxonomy = Artifact.load(filename_default_taxonomy).view(
            pd.DataFrame)

    # convert type of feature IDs to str (depending on pandas type inference
    # they might come as integers), to make sure they are of the same type as
    # in the tree.
    reference_taxonomy.index = map(str, reference_taxonomy.index)

    # load the insertion tree
    tree = skbio.TreeNode.read(str(tree))

    # ensure that all reference tips in the tree (those without the inserted
    # fragments) have a mapping in the user provided taxonomy table
    names_tips = {node.name for node in tree.tips()}
    names_fragments = {
        fragment.metadata['id']
        for fragment in representative_sequences.file.view(DNAIterator)
    }
    missing_features = (names_tips - names_fragments) -\
        set(reference_taxonomy.index)
    if len(missing_features) > 0:
        # QIIME2 users can run with --verbose and see stderr and stdout.
        # Thus, we here report more details about the mismatch:
        sys.stderr.write(
            ("The taxonomy artifact you provided does not contain lineage "
             "information for the following %i features:\n%s") %
            (len(missing_features), "\n".join(missing_features)))
        raise ValueError("Not all OTUs in the provided insertion tree have "
                         "mappings in the provided reference taxonomy.")

    taxonomy = []
    for fragment in representative_sequences.file.view(DNAIterator):
        # for every inserted fragment we now try to find the closest OTU tip
        # in the tree and an available mapping from the OTU-ID to a lineage
        # string:
        lineage_str = np.nan
        # first, let us check if the fragment has been inserted at all ...
        try:
            curr_node = tree.find(fragment.metadata['id'])
        except skbio.tree.MissingNodeError:
            continue
        # if yes, we start from the inserted node and traverse the tree as
        # little as possible towards the root, checking at every level whether
        # one or several OTU-tips are within the sub-tree.
        if curr_node is not None:
            foundOTUs = []
            # Traversal is stopped at a certain level, if one or more OTU-tips
            # have been found in the sub-tree OR ... (see break below)
            while len(foundOTUs) == 0:
                # SEPP insertion - especially for multiple very similar
                # sequences - can result in a rather complex topology change
                # if all those sequences are inserted into the same branch
                # leading to one OTU-tip. Thus, we cannot simply visit only
                # all siblings or descendants and rather need to traverse the
                # whole sub-tree. Average case should be well behaved,
                # thus I think it is ok.
                for node in curr_node.postorder():
                    if (node.name is not None) and \
                       (node.name in reference_taxonomy.index):
                        # if a suitable OTU-tip node is found AND this OTU-ID
                        # has a mapping in the user provided reference_taxonomy
                        # we store the OTU-ID in the growing result list
                        foundOTUs.append(node.name)
                # ... if the whole tree has been traversed without success,
                # e.g. if user provided reference_taxonomy did not contain any
                # matching OTU-IDs.
                if curr_node.is_root():
                    break
                # prepare next while iteration, by changing to the parent node
                curr_node = curr_node.parent

            if len(foundOTUs) > 0:
                # If the above method has identified exactly one OTU-tip,
                # the resulting lineage string would simply be the one provided
                # by the user reference_taxonomy. However, if the inserted
                # fragment cannot be unambiguously placed into the reference tree,
                # the above method will find multiple OTU-IDs, which might have
                # lineage strings in the user provided reference_taxonomy that
                # are similar up to a certain rank and differ e.g. for genus
                # and species.
                # Thus, we here find the longest common prefix of all lineage
                # strings. We don't operate per character, but per taxonomic
                # rank. Therefore, we first "convert" every lineage string into
                # a list of taxa, one per rank.
                split_lineages = []
                for otu in foundOTUs:
                    # find lineage string for OTU
                    lineage = reference_taxonomy.loc[otu, 'Taxon']
                    # necessary to split lineage apart to ensure that
                    # the longest common prefix operates on atomic ranks
                    # instead of characters
                    split_lineages.append(
                        list(map(str.strip, lineage.split(';'))))
                # find the longest common prefix rank-wise and concatenate to
                # one lineage string, separated by ;
                lineage_str = "; ".join(os.path.commonprefix(split_lineages))
            taxonomy.append({
                'Feature ID': fragment.metadata['id'],
                'Taxon': lineage_str
            })
    pd_taxonomy = pd.DataFrame(taxonomy)
    # test if dataframe is completely empty, or if no lineages could be found
    if (len(taxonomy) == 0) or \
       (pd_taxonomy['Taxon'].dropna().shape[0] == 0):
        raise ValueError(
            ("None of the representative-sequences can be found in the "
             "insertion tree. Please double check that both inputs match up, "
             "i.e. are results from the same 'sepp' run."))

    return pd_taxonomy.set_index('Feature ID')
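The os.path.commonprefix call above works because commonprefix accepts any list of sequences and compares them element-wise, so applying it to lineages that were split per rank yields the shared ranks rather than shared characters. A small self-contained illustration with made-up lineage strings:

import os

split_lineages = [
    ['k__Bacteria', 'p__Firmicutes', 'c__Bacilli', 'o__Lactobacillales'],
    ['k__Bacteria', 'p__Firmicutes', 'c__Bacilli', 'o__Bacillales'],
]
# element-wise comparison: only the ranks shared by all lineages survive
print("; ".join(os.path.commonprefix(split_lineages)))
# -> k__Bacteria; p__Firmicutes; c__Bacilli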
Example #33
def test_non_cython_api():

    # GH5610
    # non-cython calls should not include the grouper

    df = DataFrame(
        [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], columns=["A", "B", "C"]
    )
    g = df.groupby("A")
    gni = df.groupby("A", as_index=False)

    # mad
    expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3])
    expected.index.name = "A"
    result = g.mad()
    tm.assert_frame_equal(result, expected)

    expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
    result = gni.mad()
    tm.assert_frame_equal(result, expected)

    # describe
    expected_index = Index([1, 3], name="A")
    expected_col = pd.MultiIndex(
        levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
        codes=[[0] * 8, list(range(8))],
    )
    expected = DataFrame(
        [
            [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
            [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        ],
        index=expected_index,
        columns=expected_col,
    )
    result = g.describe()
    tm.assert_frame_equal(result, expected)

    expected = pd.concat(
        [
            df[df.A == 1].describe().unstack().to_frame().T,
            df[df.A == 3].describe().unstack().to_frame().T,
        ]
    )
    expected.index = Index([0, 1])
    result = gni.describe()
    tm.assert_frame_equal(result, expected)

    # any
    expected = DataFrame(
        [[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
    )
    expected.index.name = "A"
    result = g.any()
    tm.assert_frame_equal(result, expected)

    # idxmax
    expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
    expected.index.name = "A"
    result = g.idxmax()
    tm.assert_frame_equal(result, expected)
Example #34
    def test_boolean_comparison(self):

        # GH 4576
        # boolean comparisons with a tuple/list give unexpected results
        df = DataFrame(np.arange(6).reshape((3, 2)))
        b = np.array([2, 2])
        b_r = np.atleast_2d([2, 2])
        b_c = b_r.T
        l = (2, 2, 2)
        tup = tuple(l)

        # gt
        expected = DataFrame([[False, False], [False, True], [True, True]])
        result = df > b
        assert_frame_equal(result, expected)

        result = df.values > b
        assert_numpy_array_equal(result, expected.values)

        result = df > l
        assert_frame_equal(result, expected)

        result = df > tup
        assert_frame_equal(result, expected)

        result = df > b_r
        assert_frame_equal(result, expected)

        result = df.values > b_r
        assert_numpy_array_equal(result, expected.values)

        pytest.raises(ValueError, df.__gt__, b_c)
        pytest.raises(ValueError, df.values.__gt__, b_c)

        # ==
        expected = DataFrame([[False, False], [True, False], [False, False]])
        result = df == b
        assert_frame_equal(result, expected)

        result = df == l
        assert_frame_equal(result, expected)

        result = df == tup
        assert_frame_equal(result, expected)

        result = df == b_r
        assert_frame_equal(result, expected)

        result = df.values == b_r
        assert_numpy_array_equal(result, expected.values)

        pytest.raises(ValueError, lambda: df == b_c)
        assert not np.array_equal(df.values, b_c)

        # with alignment
        df = DataFrame(np.arange(6).reshape((3, 2)),
                       columns=list('AB'),
                       index=list('abc'))
        expected.index = df.index
        expected.columns = df.columns

        result = df == l
        assert_frame_equal(result, expected)

        result = df == tup
        assert_frame_equal(result, expected)
Example #35
    def test_unstack_fill(self):

        # GH #9746: fill_value keyword argument for Series
        # and DataFrame unstack

        # From a series
        data = Series([1, 2, 4, 5], dtype=np.int16)
        data.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = data.unstack(fill_value=-1)
        expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
                             index=['x', 'y', 'z'], dtype=np.int16)
        assert_frame_equal(result, expected)

        # From a series with incorrect data type for fill_value
        result = data.unstack(fill_value=0.5)
        expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
                             index=['x', 'y', 'z'], dtype=np.float)
        assert_frame_equal(result, expected)

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
        df.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df['A'] = df['A'].astype(np.int16)
        df['B'] = df['B'].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected['A'] = expected['A'].astype(np.int16)
        expected['B'] = expected['B'].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)

        # Test unstacking with date times
        dv = pd.date_range('2012-01-01', periods=4).values
        data = Series(dv)
        data.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = data.unstack()
        expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
                              'b': [dv[1], dv[2], pd.NaT]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)

        result = data.unstack(fill_value=dv[0])
        expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
                              'b': [dv[1], dv[2], dv[0]]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)

        # Test unstacking with time deltas
        td = [Timedelta(days=i) for i in range(4)]
        data = Series(td)
        data.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = data.unstack()
        expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
                              'b': [td[1], td[2], pd.NaT]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)

        result = data.unstack(fill_value=td[1])
        expected = DataFrame({'a': [td[0], td[1], td[3]],
                              'b': [td[1], td[2], td[1]]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)

        # Test unstacking with period
        periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
                   Period('2012-04')]
        data = Series(periods)
        data.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = data.unstack()
        expected = DataFrame({'a': [periods[0], None, periods[3]],
                              'b': [periods[1], periods[2], None]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)

        result = data.unstack(fill_value=periods[1])
        expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
                              'b': [periods[1], periods[2], periods[1]]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)

        # Test unstacking with categorical
        data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
        data.index = pd.MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        # By default missing values will be NaN
        result = data.unstack()
        expected = DataFrame({'a': pd.Categorical(list('axa'),
                                                  categories=list('abc')),
                              'b': pd.Categorical(list('bcx'),
                                                  categories=list('abc'))},
                             index=list('xyz'))
        assert_frame_equal(result, expected)

        # Fill with non-category results in NaN entries similar to above
        result = data.unstack(fill_value='d')
        assert_frame_equal(result, expected)

        # Fill with category value replaces missing values as expected
        result = data.unstack(fill_value='c')
        expected = DataFrame({'a': pd.Categorical(list('aca'),
                                                  categories=list('abc')),
                              'b': pd.Categorical(list('bcc'),
                                                  categories=list('abc'))},
                             index=list('xyz'))
        assert_frame_equal(result, expected)
Example #36
def anova2_lm_single(model, design_info, n_rows, test, pr_test, robust):
    """
    Anova type II table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model

    **kwargs**

    scale : float
        Estimate of variance. If None, it will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.

    Type II
    Sum of Squares compares marginal contribution of terms. Thus, it is
    not particularly useful for models with significant interaction terms.
    """
    terms_info = design_info.terms[:]  # copy
    terms_info = _remove_intercept_patsy(terms_info)

    names = ['sum_sq', 'df', test, pr_test]

    table = DataFrame(np.zeros((n_rows, 4)), columns=names)
    cov = _get_covariance(model, None)
    robust_cov = _get_covariance(model, robust)
    col_order = []
    index = []
    for i, term in enumerate(terms_info):
        # grab all variables except interaction effects that contain term
        # need two hypotheses matrices L1 is most restrictive, ie., term==0
        # L2 is everything except term==0
        cols = design_info.slice(term)
        L1 = lrange(cols.start, cols.stop)
        L2 = []
        term_set = set(term.factors)
        for t in terms_info:  # for the term you have
            other_set = set(t.factors)
            if term_set.issubset(other_set) and not term_set == other_set:
                col = design_info.slice(t)
                # on a higher order term containing current `term`
                L1.extend(lrange(col.start, col.stop))
                L2.extend(lrange(col.start, col.stop))

        L1 = np.eye(model.model.exog.shape[1])[L1]
        L2 = np.eye(model.model.exog.shape[1])[L2]

        if L2.size:
            LVL = np.dot(np.dot(L1, robust_cov), L2.T)
            from scipy import linalg
            orth_compl, _ = linalg.qr(LVL)
            r = L1.shape[0] - L2.shape[0]
            # L1|2
            # use the non-unique orthogonal completion since L12 is rank r
            L12 = np.dot(orth_compl[:, -r:].T, L1)
        else:
            L12 = L1
            r = L1.shape[0]
        #from IPython.core.debugger import Pdb; Pdb().set_trace()
        if test == 'F':
            f = model.f_test(L12, cov_p=robust_cov)
            table.loc[table.index[i], test] = test_value = f.fvalue
            table.loc[table.index[i], pr_test] = f.pvalue

        # need to back out SSR from f_test
        table.loc[table.index[i], 'df'] = r
        col_order.append(cols.start)
        index.append(term.name())

    table.index = Index(index + ['Residual'])
    table = table.iloc[np.argsort(col_order + [model.model.exog.shape[1] + 1])]
    # back out sum of squares from f_test
    ssr = table[test] * table['df'] * model.ssr / model.df_resid
    table['sum_sq'] = ssr
    # fill in residual
    table.loc['Residual',
              ['sum_sq', 'df', test, pr_test]] = (model.ssr, model.df_resid,
                                                  np.nan, np.nan)

    return table
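As the docstring notes, this Type II helper is normally reached through anova_lm rather than called directly. A minimal usage sketch with fabricated data (the formula and column names are made up purely for illustration):

import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'y': rng.normal(size=40),
    'a': np.repeat(['u', 'v'], 20),
    'b': np.tile(['p', 'q'], 20),
})
fit = ols('y ~ C(a) + C(b)', data=df).fit()
# typ=2 requests the Type II table computed by the helper above
print(anova_lm(fit, typ=2))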
Example #37
    def test_margin_normalize(self):
        # GH 27500
        df = DataFrame(
            {
                "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
                "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
                "C": [
                    "small",
                    "large",
                    "large",
                    "small",
                    "small",
                    "large",
                    "small",
                    "small",
                    "large",
                ],
                "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
                "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
            }
        )
        # normalize on index
        result = crosstab(
            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0
        )
        expected = DataFrame(
            [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]]
        )
        expected.index = MultiIndex(
            levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
            codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
            names=["A", "B"],
        )
        expected.columns = Index(["large", "small"], dtype="object", name="C")
        tm.assert_frame_equal(result, expected)

        # normalize on columns
        result = crosstab(
            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1
        )
        expected = DataFrame(
            [
                [0.25, 0.2, 0.222222],
                [0.25, 0.2, 0.222222],
                [0.5, 0.2, 0.333333],
                [0, 0.4, 0.222222],
            ]
        )
        expected.columns = Index(
            ["large", "small", "Sub-Total"], dtype="object", name="C"
        )
        expected.index = MultiIndex(
            levels=[["bar", "foo"], ["one", "two"]],
            codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
            names=["A", "B"],
        )
        tm.assert_frame_equal(result, expected)

        # normalize on both index and column
        result = crosstab(
            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True
        )
        expected = DataFrame(
            [
                [0.111111, 0.111111, 0.222222],
                [0.111111, 0.111111, 0.222222],
                [0.222222, 0.111111, 0.333333],
                [0.000000, 0.222222, 0.222222],
                [0.444444, 0.555555, 1],
            ]
        )
        expected.columns = Index(
            ["large", "small", "Sub-Total"], dtype="object", name="C"
        )
        expected.index = MultiIndex(
            levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
            codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
            names=["A", "B"],
        )
        tm.assert_frame_equal(result, expected)
Пример #38
0
def textual_update_analysis(
    df: pd.DataFrame, extra_columns: List
) -> Tuple[Dict[str, str], List[Dict[str, Union[str, float]]]]:
    template_vars: Dict[str, Any] = {}
    summary = []
    df = df.rename(columns={'date': 'ds', 'search_downloads': 'y'})
    if 'asa' in df.columns:
        df['y'] = df['y'] - df['asa']
    df.index = df['ds']

    df = handle_outliers(df)

    if options.weekly:
        df = df.resample('W').apply(safe_mean)

    time_regressors = []
    for _, row in df.iterrows():
        if row['update'] == 'textual':
            additional_regressor = '{} (text)'.format(
                str(row['ds']).split(" ")[0])
            df[additional_regressor] = [
                other_row['y'] if other_row['ds'] >= row['ds'] else 0
                for _, other_row in df.iterrows()
            ]
            time_regressors.append(additional_regressor)

    model = create_model('sherlock_textual', df, True,
                         time_regressors + extra_columns)

    model.fit(10000 if options.sampler == 'metropolis' else 2000,
              method=Sampler.METROPOLIS
              if options.sampler == 'metropolis' else Sampler.NUTS,
              step_kwargs={'compute_convergence_checks': False}
              if options.sampler == 'metropolis' else {})

    fig = plot_nowcast(
        model,
        [row['ds'] for _, row in df.iterrows() if row['update'] == 'textual'])
    plt.title('Downloads & Textual Updates')
    template_vars['textual_model'] = figure_to_base64(fig)

    summary.extend(
        summary_from_model_regressors(model, time_regressors + extra_columns))

    extra_regressors_plots: List[Dict[str, str]] = []
    for i in range(len(time_regressors),
                   len(time_regressors) + len(extra_columns)):
        fig = plt.figure()
        plt.grid()
        plt.hist(model.trace['regressors_{}'.format(model.name)][:, i] * 100,
                 bins=30,
                 alpha=0.8,
                 histtype='stepfilled')
        plt.axvline(
            np.median(model.trace['regressors_{}'.format(model.name)][:, i]) *
            100,
            color="C3",
            lw=1,
            ls="dotted")
        plt.title("{} (in %)".format(extra_columns[i - len(time_regressors)]))
        extra_regressors_plots.append({
            'name':
            extra_columns[i - len(time_regressors)],
            'img_data':
            figure_to_base64(fig)
        })

    template_vars['extra_regressors_plots'] = extra_regressors_plots

    seasonality = {}
    for period, fig in plot_seasonality(model,
                                        alpha=options.alpha,
                                        plot_kwargs={}).items():
        seasonality[int(period)] = figure_to_base64(fig)
    template_vars['textual_seasonality'] = seasonality

    return template_vars, summary
Пример #39
0
def pool_duplicate_subsets(
    data: pd.DataFrame,
    col_dupl_thresh: float = 0.2,
    subset_thresh: float = 0.2,
    min_col_pool: int = 3,
    exclude: Optional[List[str]] = None,
    return_details=False,
) -> pd.DataFrame:
    """ Checks for duplicates in subsets of columns and pools them. This can reduce \
        the number of columns in the data without loosing much information. Suitable \
        columns are combined to subsets and tested for duplicates. In case sufficient \
        duplicates can be found, the respective columns are aggregated into a \
        "pooled_var" column. Identical numbers in the "pooled_var" column indicate \
        identical information in the respective rows.

        Note:  It is advised to exclude features that provide sufficient informational \
        content by themselves as well as the target column by using the "exclude" \
        setting.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    col_dupl_thresh : float, optional
        Columns with a ratio of duplicates higher than "col_dupl_thresh" are \
        considered in the further analysis. Columns with a lower ratio are not \
        considered for pooling, by default 0.2
    subset_thresh : float, optional
        The first subset with a duplicate threshold higher than "subset_thresh" is \
        chosen and aggregated. If no subset reaches the threshold, the algorithm \
        continues with continuously smaller subsets until "min_col_pool" is reached, \
        by default 0.2
    min_col_pool : int, optional
        Minimum number of columns to pool. The algorithm attempts to combine as many \
        columns as possible to suitable subsets and stops when "min_col_pool" is \
        reached, by default 3
    exclude : Optional[List[str]], optional
        List of column names to be excluded from the analysis. These columns are \
        passed through without modification, by default None
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with low cardinality columns pooled

    optional:
    subset_cols: List of columns used as subset
    """

    # Input validation
    _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
    _validate_input_range(subset_thresh, "subset_thresh", 0, 1)
    _validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    for i in range(data.shape[1] + 1 - min_col_pool):
        check_list = [
            col for col in data.columns
            if data.duplicated(subset=col).mean() > col_dupl_thresh
        ]

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list,
                                                  len(check_list) - i)
        else:
            continue

        ratios = [
            *map(lambda comb: data.duplicated(subset=list(comb)).mean(),
                 combinations)
        ]

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            best_subset = itertools.islice(
                itertools.combinations(check_list,
                                       len(check_list) - i),
                max_idx,
                max_idx + 1,
            )
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = (
                best_subset.drop_duplicates().reset_index().rename(
                    columns={"index": "pooled_vars"}))
            data = data.merge(unique_subset,
                              how="left",
                              on=best_subset.columns.tolist()).drop(
                                  columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data
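
A minimal usage sketch for the pooling helper above, assuming it is available together with its module dependencies; the toy columns and names below are made up:

import pandas as pd

toy = pd.DataFrame({
    "c1": ["a", "a", "b", "b", "a", "a"],
    "c2": [1, 1, 2, 2, 1, 1],
    "c3": ["x", "x", "y", "y", "x", "x"],
    "target": [0, 1, 0, 1, 0, 1],
})

# "c1"-"c3" carry largely redundant information; "target" is kept out of the pooling
pooled, used_cols = pool_duplicate_subsets(toy, exclude=["target"], return_details=True)
# "c1"-"c3" collapse into a single "pooled_vars" column; "target" passes through unchanged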
Пример #40
0
    colourlist = [
        '#0033CC', '#33CC33', '#FFAA00', '#CC3300', '#AAAAAA', '#0032FF', 'r',
        'c', 'm', 'y', '#000000', '#333333'
    ]
    #colourlist = ['#AAAAAA','#
    #filename = '/tier2/dickson/bathd/FlyMAD/JAABA_tracking/140927/wing_angles_nano.csv'
    #binsize = '5s'  # ex: '1s' or '4Min' etc
    #BAG_FILE = '/groups/dickson/home/bathd/Dropbox/140927_flymad_rosbag_copy/rosbagOut_2014-09-27-14-53-54.bag'

    if 1:  #COMPILE_FOLDERS == False:
        baglist = []
        for bag in glob.glob(BAGS + '/*.bag'):
            bagtimestamp = parse_bagtime(bag)
            baglist.append((bag, bagtimestamp))
        bagframe = DataFrame(baglist, columns=['Filepath', 'Timestamp'])
        bagframe.index = pd.to_datetime(bagframe['Timestamp'])
        bagframe = bagframe.sort_index()
        bagframe.to_csv(BAGS + '/list_of_bags.csv', sep=',')

        if not os.path.exists(JAABA + 'JAR'):
            print("MAKING A JAR")
            os.makedirs(JAABA + 'JAR')
        if not os.path.exists(JAABA + 'TRACES'):
            os.makedirs(JAABA + 'TRACES')

        updated = False

        for directory in glob.glob(JAABA + '*' + HANDLE + '*' + '*zoom*'):
            FLY_ID, FMF_TIME, GROUP = parse_fmftime(directory)
            if not os.path.exists(JAABA + 'JAR/' + FLY_ID + '_' + binsize +
                                  '_fly.pickle'):
Пример #41
0
def _flex_binary_moment(arg1, arg2, f, pairwise=False):

    if not (isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame))
            and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame))):
        raise TypeError("arguments to moment function must be of type "
                        "np.ndarray/Series/DataFrame")

    if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance(
            arg2, (np.ndarray, ABCSeries)):
        X, Y = _prep_binary(arg1, arg2)
        return f(X, Y)

    elif isinstance(arg1, ABCDataFrame):
        from pandas import DataFrame

        def dataframe_from_int_dict(data, frame_template):
            result = DataFrame(data, index=frame_template.index)
            if len(result.columns) > 0:
                result.columns = frame_template.columns[result.columns]
            return result

        results = {}
        if isinstance(arg2, ABCDataFrame):
            if pairwise is False:
                if arg1 is arg2:
                    # special case in order to handle duplicate column names
                    for i, col in enumerate(arg1.columns):
                        results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i])
                    return dataframe_from_int_dict(results, arg1)
                else:
                    if not arg1.columns.is_unique:
                        raise ValueError("'arg1' columns are not unique")
                    if not arg2.columns.is_unique:
                        raise ValueError("'arg2' columns are not unique")
                    with warnings.catch_warnings(record=True):
                        warnings.simplefilter("ignore", RuntimeWarning)
                        X, Y = arg1.align(arg2, join="outer")
                    X = X + 0 * Y
                    Y = Y + 0 * X

                    with warnings.catch_warnings(record=True):
                        warnings.simplefilter("ignore", RuntimeWarning)
                        res_columns = arg1.columns.union(arg2.columns)
                    for col in res_columns:
                        if col in X and col in Y:
                            results[col] = f(X[col], Y[col])
                    return DataFrame(results,
                                     index=X.index,
                                     columns=res_columns)
            elif pairwise is True:
                results = defaultdict(dict)
                for i, k1 in enumerate(arg1.columns):
                    for j, k2 in enumerate(arg2.columns):
                        if j < i and arg2 is arg1:
                            # Symmetric case
                            results[i][j] = results[j][i]
                        else:
                            results[i][j] = f(
                                *_prep_binary(arg1.iloc[:, i], arg2.iloc[:,
                                                                         j]))

                from pandas import concat

                result_index = arg1.index.union(arg2.index)
                if len(result_index):

                    # construct result frame
                    result = concat(
                        [
                            concat(
                                [
                                    results[i][j]
                                    for j, c in enumerate(arg2.columns)
                                ],
                                ignore_index=True,
                            ) for i, c in enumerate(arg1.columns)
                        ],
                        ignore_index=True,
                        axis=1,
                    )
                    result.columns = arg1.columns

                    # set the index and reorder
                    if arg2.columns.nlevels > 1:
                        result.index = MultiIndex.from_product(
                            arg2.columns.levels + [result_index])
                        result = result.reorder_levels([2, 0, 1]).sort_index()
                    else:
                        result.index = MultiIndex.from_product([
                            range(len(arg2.columns)),
                            range(len(result_index))
                        ])
                        result = result.swaplevel(1, 0).sort_index()
                        result.index = MultiIndex.from_product([result_index] +
                                                               [arg2.columns])
                else:

                    # empty result
                    result = DataFrame(
                        index=MultiIndex(levels=[arg1.index, arg2.columns],
                                         codes=[[], []]),
                        columns=arg2.columns,
                        dtype="float64",
                    )

                # reset our index names to arg1 names
                # reset our column names to arg2 names
                # careful not to mutate the original names
                result.columns = result.columns.set_names(arg1.columns.names)
                result.index = result.index.set_names(result_index.names +
                                                      arg2.columns.names)

                return result

            else:
                raise ValueError("'pairwise' is not True/False")
        else:
            results = {
                i: f(*_prep_binary(arg1.iloc[:, i], arg2))
                for i, col in enumerate(arg1.columns)
            }
            return dataframe_from_int_dict(results, arg1)

    else:
        return _flex_binary_moment(arg2, arg1, f)
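
In the pandas version this helper comes from, it appears to back the rolling covariance/correlation path; a minimal sketch of the public API exercising both the pairwise=False and pairwise=True branches:

import numpy as np
import pandas as pd

left = pd.DataFrame(np.random.randn(10, 2), columns=["a", "b"])
right = pd.DataFrame(np.random.randn(10, 2), columns=["a", "b"])

# column-by-column: the result keeps left's columns
col_wise = left.rolling(window=5).corr(right, pairwise=False)

# all pairs: the result is indexed by (original index, column) pairs
pair_wise = left.rolling(window=5).corr(right, pairwise=True)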
Пример #42
0
    def test_ix_loc_consistency(self):

        # GH 8613
        # some edge cases where ix/loc should return the same
        # this is not an exhaustive case

        def compare(result, expected):
            if is_scalar(expected):
                assert result == expected
            else:
                assert expected.equals(result)

        # failure cases for .loc, but these work for .ix
        df = DataFrame(np.random.randn(5, 4), columns=list('ABCD'))
        for key in [slice(1, 3), tuple([slice(0, 2), slice(0, 2)]),
                    tuple([slice(0, 2), df.columns[0:2]])]:

            for index in [tm.makeStringIndex, tm.makeUnicodeIndex,
                          tm.makeDateIndex, tm.makePeriodIndex,
                          tm.makeTimedeltaIndex]:
                df.index = index(len(df.index))
                with catch_warnings(record=True):
                    df.ix[key]

                msg = (r"cannot do slice indexing"
                       r" on {klass} with these indexers \[(0|1)\] of"
                       r" {kind}"
                       .format(klass=type(df.index), kind=str(int)))
                with pytest.raises(TypeError, match=msg):
                    df.loc[key]

        df = DataFrame(np.random.randn(5, 4), columns=list('ABCD'),
                       index=pd.date_range('2012-01-01', periods=5))

        for key in ['2012-01-03',
                    '2012-01-31',
                    slice('2012-01-03', '2012-01-03'),
                    slice('2012-01-03', '2012-01-04'),
                    slice('2012-01-03', '2012-01-06', 2),
                    slice('2012-01-03', '2012-01-31'),
                    tuple([[True, True, True, False, True]]), ]:

            # getitem

            # if the expected raises, then compare the exceptions
            try:
                with catch_warnings(record=True):
                    expected = df.ix[key]
            except KeyError:
                with pytest.raises(KeyError, match=r"^'2012-01-31'$"):
                    df.loc[key]
                continue

            result = df.loc[key]
            compare(result, expected)

            # setitem
            df1 = df.copy()
            df2 = df.copy()

            with catch_warnings(record=True):
                df1.ix[key] = 10
            df2.loc[key] = 10
            compare(df2, df1)

        # edge cases
        s = Series([1, 2, 3, 4], index=list('abde'))

        result1 = s['a':'c']
        with catch_warnings(record=True):
            result2 = s.ix['a':'c']
        result3 = s.loc['a':'c']
        tm.assert_series_equal(result1, result2)
        tm.assert_series_equal(result1, result3)

        # now work rather than raising KeyError
        s = Series(range(5), [-2, -1, 1, 2, 3])

        with catch_warnings(record=True):
            result1 = s.ix[-10:3]
        result2 = s.loc[-10:3]
        tm.assert_series_equal(result1, result2)

        with catch_warnings(record=True):
            result1 = s.ix[0:3]
        result2 = s.loc[0:3]
        tm.assert_series_equal(result1, result2)
Пример #43
0
    def test_timegrouper_with_reg_groups(self):

        # GH 3794
        # allow combination of timegrouper/reg groups

        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 1, 1, 13, 0),
                    datetime(2013, 1, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 12, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)

        for df in [df_original, df_sorted]:
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            result = df.groupby([Grouper(freq="A"), "Buyer"]).sum()
            tm.assert_frame_equal(result, expected)

            expected = DataFrame(
                {
                    "Buyer": "Carl Mark Carl Joe".split(),
                    "Quantity": [1, 3, 9, 18],
                    "Date": [
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum()
            tm.assert_frame_equal(result, expected)

        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 10, 1, 13, 0),
                    datetime(2013, 10, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 2, 12, 0),
                    datetime(2013, 10, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)
        for df in [df_original, df_sorted]:

            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark Carl Joe".split(),
                    "Quantity": [6, 8, 3, 4, 10],
                    "Date": [
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum()
            tm.assert_frame_equal(result, expected)

            result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum()
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            tm.assert_frame_equal(result, expected)

            # passing the name
            df = df.reset_index()
            result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum()
            tm.assert_frame_equal(result, expected)

            with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
                df.groupby([Grouper(freq="1M", key="foo"), "Buyer"]).sum()

            # passing the level
            df = df.set_index("Date")
            result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum()
            tm.assert_frame_equal(result, expected)
            result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum()
            tm.assert_frame_equal(result, expected)

            with pytest.raises(ValueError, match="The level foo is not valid"):
                df.groupby([Grouper(freq="1M", level="foo"), "Buyer"]).sum()

            # multi names
            df = df.copy()
            df["Date"] = df.index + offsets.MonthEnd(2)
            result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum()
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 11, 30, 0, 0),
                        datetime(2013, 11, 30, 0, 0),
                        datetime(2013, 11, 30, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            tm.assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            msg = "The Grouper cannot specify both a key and a level!"
            with pytest.raises(ValueError, match=msg):
                df.groupby(
                    [Grouper(freq="1M", key="Date", level="Date"), "Buyer"]
                ).sum()

            # single groupers
            expected = DataFrame(
                [[31]],
                columns=["Quantity"],
                index=DatetimeIndex(
                    [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
                ),
            )
            result = df.groupby(Grouper(freq="1M")).sum()
            tm.assert_frame_equal(result, expected)

            result = df.groupby([Grouper(freq="1M")]).sum()
            tm.assert_frame_equal(result, expected)

            expected.index = expected.index.shift(1)
            assert expected.index.freq == offsets.MonthEnd()
            result = df.groupby(Grouper(freq="1M", key="Date")).sum()
            tm.assert_frame_equal(result, expected)

            result = df.groupby([Grouper(freq="1M", key="Date")]).sum()
            tm.assert_frame_equal(result, expected)
data.replace(-999, np.nan)

data.replace([-999, -1000], np.nan)

data.replace([-999, -1000], [np.nan, 0])

data.replace({-999: np.nan, -1000: 0})

### Renaming axis indexes
data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])

data.index.map(str.upper)

data.index = data.index.map(str.upper)
data

data.rename(index=str.title, columns=str.upper)

data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})

# always returns a reference to the DataFrame
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

### Discretization and binning
#1
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

bins = [18, 25, 35, 60, 100]
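
The snippet breaks off before the binning call itself; a hedged continuation showing the pd.cut step this setup is leading to:

cats = pd.cut(ages, bins)
cats.codes        # integer bin code for each age
cats.categories   # IntervalIndex: (18, 25], (25, 35], (35, 60], (60, 100]
pd.value_counts(cats)  # number of ages falling into each bin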
Пример #45
0
def test_to_frame():
    tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]

    index = MultiIndex.from_tuples(tuples)
    result = index.to_frame(index=False)
    expected = DataFrame(tuples)
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
    index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
    result = index.to_frame(index=False)
    expected = DataFrame(tuples)
    expected.columns = ['first', 'second']
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    # See GH-22580
    index = MultiIndex.from_tuples(tuples)
    result = index.to_frame(index=False, name=['first', 'second'])
    expected = DataFrame(tuples)
    expected.columns = ['first', 'second']
    tm.assert_frame_equal(result, expected)

    result = index.to_frame(name=['first', 'second'])
    expected.index = index
    expected.columns = ['first', 'second']
    tm.assert_frame_equal(result, expected)

    msg = "'name' must be a list / sequence of column names."
    with tm.assert_raises_regex(TypeError, msg):
        index.to_frame(name='first')

    msg = "'name' should have same length as number of levels on index."
    with tm.assert_raises_regex(ValueError, msg):
        index.to_frame(name=['first'])

    # Tests for datetime index
    index = MultiIndex.from_product(
        [range(5), pd.date_range('20130101', periods=3)])
    result = index.to_frame(index=False)
    expected = DataFrame({
        0: np.repeat(np.arange(5, dtype='int64'), 3),
        1: np.tile(pd.date_range('20130101', periods=3), 5)
    })
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    # See GH-22580
    result = index.to_frame(index=False, name=['first', 'second'])
    expected = DataFrame({
        'first':
        np.repeat(np.arange(5, dtype='int64'), 3),
        'second':
        np.tile(pd.date_range('20130101', periods=3), 5)
    })
    tm.assert_frame_equal(result, expected)

    result = index.to_frame(name=['first', 'second'])
    expected.index = index
    tm.assert_frame_equal(result, expected)
Пример #46
0
    def table(self, from_date=None, to_date=None):
        '''
        Return the league table with the basic aggregate statistics for each
        team (points, played, won, drawn, lost, goals for/against, goal
        difference).
        '''
        played = self.played_matches(from_date, to_date)

        points = self.points(from_date, to_date)
        points = Series(points["Points"].tolist(),
                        index=points["Team"].tolist())

        matches = self.matches(from_date, to_date)

        home_grouped = matches.groupby(
            ["HomeTeam"]).apply(lambda df_group: len(df_group[df_group[
                "FTHG"] > df_group["FTAG"]]))
        away_grouped = matches.groupby(
            ["AwayTeam"]).apply(lambda df_group: len(df_group[df_group[
                "FTAG"] > df_group["FTHG"]]))
        won = home_grouped + away_grouped

        home_grouped = matches.groupby(
            ["HomeTeam"]).apply(lambda df_group: len(df_group[df_group[
                "FTHG"] == df_group["FTAG"]]))
        away_grouped = matches.groupby(
            ["AwayTeam"]).apply(lambda df_group: len(df_group[df_group[
                "FTAG"] == df_group["FTHG"]]))
        draw = home_grouped + away_grouped

        home_grouped = matches.groupby(
            ["HomeTeam"]).apply(lambda df_group: len(df_group[df_group[
                "FTHG"] < df_group["FTAG"]]))
        away_grouped = matches.groupby(
            ["AwayTeam"]).apply(lambda df_group: len(df_group[df_group[
                "FTAG"] < df_group["FTHG"]]))
        lost = home_grouped + away_grouped

        home_grouped = matches.groupby(
            ["HomeTeam"]).apply(lambda df_group: sum(df_group["FTHG"]))
        away_grouped = matches.groupby(
            ["AwayTeam"]).apply(lambda df_group: sum(df_group["FTAG"]))
        goals_for = home_grouped + away_grouped

        home_grouped = matches.groupby(
            ["HomeTeam"]).apply(lambda df_group: sum(df_group["FTAG"]))
        away_grouped = matches.groupby(
            ["AwayTeam"]).apply(lambda df_group: sum(df_group["FTHG"]))
        goals_aga = home_grouped + away_grouped

        table = DataFrame(
            dict(Points=points,
                 Played=played,
                 Won=won,
                 Draw=draw,
                 Lost=lost,
                 GF=goals_for,
                 GA=goals_aga,
                 GD=goals_for - goals_aga)).reset_index()
        table = table.rename(columns={"index": "Team"})
        table = table[[
            "Points", "Team", "Played", "Won", "Draw", "Lost", "GF", "GA", "GD"
        ]].sort_values(by=["Points", "GD"], ascending=False).reset_index()
        table.index = range(1, len(table) + 1)
        del table["index"]
        return table
Пример #47
0
def reformat_index(x: pd.DataFrame):
    x.index = [str(a) for a in x.index]
    x.index.name = "moneyness_cut"
    x.columns = [str(a) for a in x.columns]
    x.columns.name = "time_cut"
Пример #48
0
def prepare_data(wt_organ_vol: pd.DataFrame,
                 wt_staging: pd.DataFrame,
                 mut_organ_vol: pd.DataFrame,
                 mut_staging: pd.DataFrame,
                 label_meta: Path = None,
                 normalise_to_whole_embryo=False,
                 qc_file: Path = None) -> pd.DataFrame:
    """
    Merge the mutant and wildtype dataframes
    Optionally normalise to staging metric (Usually whole embryo volume)
    Optionally remove any qc-flagged organs (These will be set to 'nan')

    Returns
    -------
    Dataframe with following columns:
        - a column for each label (prefixed with 'x' as statsmodels does not like integer ids)
        - line
        - genotype (baseline or mutant)
        - staging (whole embryo volume)
    """

    wt_staging.rename(columns={'value': 'staging'}, inplace=True)
    mut_staging.rename(columns={'value': 'staging'}, inplace=True)
    wt_staging.index = wt_staging.index.astype(str)

    # Ensure all indexes are the same type
    for d in [wt_organ_vol, mut_organ_vol, wt_staging, mut_staging]:
        d.index = d.index.astype(str)

    if normalise_to_whole_embryo:
        wt_organ_vol = wt_organ_vol.divide(wt_staging['staging'], axis=0)
        mut_organ_vol = mut_organ_vol.divide(mut_staging['staging'], axis=0)
        logging.info('Normalising organ volume to whole embryo volume')

    # merge the organ vol
    organ_vols = pd.concat([wt_organ_vol, mut_organ_vol])

    # Drop any organ columns that has only zero values. These are the gaps in the label map caused by merging labels
    # in the atlas
    organ_vols = organ_vols.loc[:, (organ_vols != 0).any(axis=0)]

    # For the statsmodels linear model to work, column names cannot start with a digit. Prefix with 'x'
    organ_vols.columns = [
        f'x{x}' if x.isdigit() else x for x in organ_vols.columns
    ]

    staging = pd.concat([wt_staging, mut_staging])

    # Merge staging into the organ volume dataframe. First drop 'line' so we don't get duplicate entries
    # staging.drop(columns=['line'], inplace=True)

    data = pd.concat([organ_vols, staging], axis=1)

    # Filter any labels that have been flagged at the label-level (for all specimens)
    if label_meta:

        label_meta = pd.read_csv(label_meta, index_col=0)

        if 'no_analysis' in label_meta:  # If we have a no_analysis column, drop labels that are flagged
            flagged_labels = label_meta[label_meta.no_analysis == True].index
            data.drop(
                columns=[f'x{x}' for x in flagged_labels if f'x{x}' in data],
                inplace=True)

    # QC-flagged organs from specimens specified in QC file are set to None
    if qc_file:
        logging.info(f'Excluding organ volumes specified in: {qc_file}')
        qc = pd.read_csv(qc_file)

        for _, row in qc.iterrows():
            qc_id = str(row.id)

            if qc_id not in data.index:
                raise LamaDataException(
                    f'QC flagged specimen {row.id} does not exist in dataset')

            if f'x{row.label}' not in data:
                raise LamaDataException(
                    f'QC flagged label, {row.label}, does not exist in dataset'
                )

            data.loc[qc_id, f'x{row.label}'] = None

    return data
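
A minimal usage sketch for the merge helper above, assuming its module-level imports are available; the specimen ids, label ids, and values below are made up:

import pandas as pd

wt_organ_vol = pd.DataFrame({"1": [10.0, 11.0], "2": [5.0, 6.0]}, index=["wt_a", "wt_b"])
mut_organ_vol = pd.DataFrame({"1": [9.0, 12.0], "2": [4.0, 7.0]}, index=["mut_a", "mut_b"])
wt_staging = pd.DataFrame({"value": [100.0, 110.0]}, index=["wt_a", "wt_b"])
mut_staging = pd.DataFrame({"value": [95.0, 120.0]}, index=["mut_a", "mut_b"])

data = prepare_data(wt_organ_vol, wt_staging, mut_organ_vol, mut_staging)
# label columns are prefixed with 'x' (here 'x1', 'x2') and a 'staging' column is appended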
Пример #49
0
def test_basic():

    cats = Categorical(
        ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
        categories=["a", "b", "c", "d"],
        ordered=True,
    )
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True)
    expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"],
                       ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"],
                       ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
    expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame(
        [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
        columns=["person_id", "person_name"],
    )
    x["person_name"] = Categorical(x.person_name)

    g = x.groupby(["person_id"], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[["person_name"]])

    result = x.drop_duplicates("person_name")
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates("person_name").iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name="person_id")
    expected["person_name"] = expected["person_name"].astype("object")
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df["a"])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df["a"])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[["a"]])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[["a"]])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all), df["a"])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df["a"])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df["a"])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[["a"]])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[["a"]])

    # GH 9603
    df = DataFrame({"a": [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ["foo", "bar", "baz", "qux"]
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels,
                               categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    tm.assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels,
                           ordered=True,
                           categories=["foo", "bar", "baz", "qux"])
    expected = ord_data.groupby(exp_cats, sort=False,
                                observed=False).describe()
    tm.assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
    exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] *
                4)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
Пример #50
0
def test_sort_datetimelike():
    # GH10505

    # use same data as test_groupby_sort_categorical, which category is
    # corresponding to datetime.month
    df = DataFrame(
        {
            "dt": [
                datetime(2011, 7, 1),
                datetime(2011, 7, 1),
                datetime(2011, 2, 1),
                datetime(2011, 5, 1),
                datetime(2011, 2, 1),
                datetime(2011, 1, 1),
                datetime(2011, 5, 1),
            ],
            "foo": [10, 8, 5, 6, 4, 1, 7],
            "bar": [10, 20, 30, 40, 50, 60, 70],
        },
        columns=["dt", "foo", "bar"],
    )

    # ordered=True
    df["dt"] = Categorical(df["dt"], ordered=True)
    index = [
        datetime(2011, 1, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 7, 1),
    ]
    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                            columns=["foo", "bar"])
    result_sort.index = CategoricalIndex(index, name="dt", ordered=True)

    index = [
        datetime(2011, 7, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 1, 1),
    ]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=["foo", "bar"])
    result_nosort.index = CategoricalIndex(index,
                                           categories=index,
                                           name="dt",
                                           ordered=True)

    col = "dt"
    tm.assert_frame_equal(result_sort,
                          df.groupby(col, sort=True, observed=False).first())

    # when categories is ordered, group is ordered by category's order
    tm.assert_frame_equal(result_sort,
                          df.groupby(col, sort=False, observed=False).first())

    # ordered = False
    df["dt"] = Categorical(df["dt"], ordered=False)
    index = [
        datetime(2011, 1, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 7, 1),
    ]
    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                            columns=["foo", "bar"])
    result_sort.index = CategoricalIndex(index, name="dt")

    index = [
        datetime(2011, 7, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 1, 1),
    ]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=["foo", "bar"])
    result_nosort.index = CategoricalIndex(index, categories=index, name="dt")

    col = "dt"
    tm.assert_frame_equal(result_sort,
                          df.groupby(col, sort=True, observed=False).first())
    tm.assert_frame_equal(result_nosort,
                          df.groupby(col, sort=False, observed=False).first())
Пример #51
0
    def test_convert_dti_to_series(self):
        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        idx = DatetimeIndex(to_datetime(["2013-1-1 13:00", "2013-1-2 14:00"]),
                            name="B").tz_localize("US/Pacific")
        df = DataFrame(np.random.randn(2, 1), columns=["A"])

        expected = Series(
            np.array(
                [
                    Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
                    Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"),
                ],
                dtype="object",
            ),
            name="B",
        )

        # convert index to series
        result = Series(idx)
        tm.assert_series_equal(result, expected)

        # assign to frame
        df["B"] = idx
        result = df["B"]
        tm.assert_series_equal(result, expected)

        # convert to series while keeping the timezone
        msg = "stop passing 'keep_tz'"
        with tm.assert_produces_warning(FutureWarning) as m:
            result = idx.to_series(keep_tz=True, index=[0, 1])
        tm.assert_series_equal(result, expected)
        assert msg in str(m[0].message)

        # convert to utc
        with tm.assert_produces_warning(FutureWarning) as m:
            df["B"] = idx.to_series(keep_tz=False, index=[0, 1])
        result = df["B"]
        comp = Series(DatetimeIndex(expected.values).tz_localize(None),
                      name="B")
        tm.assert_series_equal(result, comp)
        msg = "do 'idx.tz_convert(None)' before calling"
        assert msg in str(m[0].message)

        result = idx.to_series(index=[0, 1])
        tm.assert_series_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning) as m:
            result = idx.to_series(keep_tz=False, index=[0, 1])
        tm.assert_series_equal(result, expected.dt.tz_convert(None))
        msg = "do 'idx.tz_convert(None)' before calling"
        assert msg in str(m[0].message)

        # list of datetimes with a tz
        df["B"] = idx.to_pydatetime()
        result = df["B"]
        tm.assert_series_equal(result, expected)

        # GH 6785
        # set the index manually
        import pytz

        df = DataFrame([{
            "ts": datetime(2014, 4, 1, tzinfo=pytz.utc),
            "foo": 1
        }])
        expected = df.set_index("ts")
        df.index = df["ts"]
        df.pop("ts")
        tm.assert_frame_equal(df, expected)
Пример #52
0
    def test_read_excel_multiindex(self, read_ext):
        # see gh-4679
        if pd.read_excel.keywords["engine"] == "pyxlsb":
            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")

        mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
        mi_file = "testmultiindex" + read_ext

        # "mi_column" sheet
        expected = DataFrame(
            [
                [1, 2.5, pd.Timestamp("2015-01-01"), True],
                [2, 3.5, pd.Timestamp("2015-01-02"), False],
                [3, 4.5, pd.Timestamp("2015-01-03"), False],
                [4, 5.5, pd.Timestamp("2015-01-04"), True],
            ],
            columns=mi,
        )

        actual = pd.read_excel(
            mi_file, sheet_name="mi_column", header=[0, 1], index_col=0
        )
        tm.assert_frame_equal(actual, expected)

        # "mi_index" sheet
        expected.index = mi
        expected.columns = ["a", "b", "c", "d"]

        actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected, check_names=False)

        # "both" sheet
        expected.columns = mi

        actual = pd.read_excel(
            mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1]
        )
        tm.assert_frame_equal(actual, expected, check_names=False)

        # "mi_index_name" sheet
        expected.columns = ["a", "b", "c", "d"]
        expected.index = mi.set_names(["ilvl1", "ilvl2"])

        actual = pd.read_excel(mi_file, sheet_name="mi_index_name", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected)

        # "mi_column_name" sheet
        expected.index = list(range(4))
        expected.columns = mi.set_names(["c1", "c2"])
        actual = pd.read_excel(
            mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0
        )
        tm.assert_frame_equal(actual, expected)

        # see gh-11317
        # "name_with_int" sheet
        expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"])

        actual = pd.read_excel(
            mi_file, sheet_name="name_with_int", index_col=0, header=[0, 1]
        )
        tm.assert_frame_equal(actual, expected)

        # "both_name" sheet
        expected.columns = mi.set_names(["c1", "c2"])
        expected.index = mi.set_names(["ilvl1", "ilvl2"])

        actual = pd.read_excel(
            mi_file, sheet_name="both_name", index_col=[0, 1], header=[0, 1]
        )
        tm.assert_frame_equal(actual, expected)

        # "both_skiprows" sheet
        actual = pd.read_excel(
            mi_file,
            sheet_name="both_name_skiprows",
            index_col=[0, 1],
            header=[0, 1],
            skiprows=2,
        )
        tm.assert_frame_equal(actual, expected)
Пример #53
0
 def test_apply_multi_index(self):
     s = DataFrame([[1, 2], [3, 4], [5, 6]])
     s.index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']])
     s.columns = ['col1', 'col2']
     res = s.apply(lambda x: Series({'min': min(x), 'max': max(x)}), 1)
     assert isinstance(res.index, MultiIndex)
Пример #54
0
    def bdh(self,
            ticker_list,
            fld_list,
            start_date,
            end_date=date.today().strftime('%Y%m%d'),
            periodselection='DAILY',
            overrides=None):
        """
        Get ticker_list and field_list
        return pandas multi level columns dataframe
        """
        # Create and fill the request for the historical data
        self.service_refData()

        if isstring(ticker_list):
            ticker_list = [ticker_list]
        if isstring(fld_list):
            fld_list = [fld_list]

        if hasattr(start_date, 'strftime'):
            start_date = start_date.strftime('%Y%m%d')
        if hasattr(end_date, 'strftime'):
            end_date = end_date.strftime('%Y%m%d')

        request = self.refDataService.createRequest("HistoricalDataRequest")
        for t in ticker_list:
            request.getElement("securities").appendValue(t)
        for f in fld_list:
            request.getElement("fields").appendValue(f)
        request.set("periodicityAdjustment", "ACTUAL")
        request.set("periodicitySelection", periodselection)
        request.set("startDate", start_date)
        request.set("endDate", end_date)

        if overrides is not None:
            overrideOuter = request.getElement('overrides')
            for k in overrides:
                override1 = overrideOuter.appendElement()
                override1.setElement('fieldId', k)
                override1.setElement('value', overrides[k])

        #print("Sending Request:", request)
        # Send the request
        self.session.sendRequest(request)
        # defaultdict - later convert to pandas
        data = defaultdict(dict)
        # Process received events
        while (True):
            # We provide timeout to give the chance for Ctrl+C handling:
            ev = self.session.nextEvent(500)
            for msg in ev:
                ticker = msg.getElement('securityData').getElement(
                    'security').getValue()
                fieldData = msg.getElement('securityData').getElement(
                    'fieldData')
                for i in range(fieldData.numValues()):
                    for j in range(1, fieldData.getValue(i).numElements()):
                        data[(ticker, fld_list[j - 1])][fieldData.getValue(
                            i).getElement(0).getValue()] = fieldData.getValue(
                                i).getElement(j).getValue()

            if ev.eventType() == blpapi.Event.RESPONSE:
                # Response completely received, so we can exit
                break

        if len(fld_list) == 1:
            data = {k[0]: v for k, v in data.items()}
            data = DataFrame(data)
            #data.index = pd.to_datetime(data.index)
            return data

        if len(data) == 0:
            # security error case
            return DataFrame()

        data = DataFrame(data)
        data.columns = pd.MultiIndex.from_tuples(data,
                                                 names=['ticker', 'field'])
        data.index = pd.to_datetime(data.index)
        return data
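
A hypothetical usage sketch; the wrapper class name below and an open blpapi session are assumptions, not part of the snippet above:

con = BloombergWrapper()  # hypothetical class exposing the bdh() method shown above
px = con.bdh(["IBM US Equity", "MSFT US Equity"],
             ["PX_LAST", "PX_VOLUME"],
             start_date="20230103",
             end_date="20230131")
# px has a DatetimeIndex and MultiIndex columns of (ticker, field)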
Пример #55
0
def _normalize(
    table: DataFrame, normalize, margins: bool, margins_name="All"
) -> DataFrame:

    if not isinstance(normalize, (bool, str)):
        axis_subs = {0: "index", 1: "columns"}
        try:
            normalize = axis_subs[normalize]
        except KeyError as err:
            raise ValueError("Not a valid normalize argument") from err

    if margins is False:

        # Actual Normalizations
        normalizers: dict[bool | str, Callable] = {
            "all": lambda x: x / x.sum(axis=1).sum(axis=0),
            "columns": lambda x: x / x.sum(),
            "index": lambda x: x.div(x.sum(axis=1), axis=0),
        }

        normalizers[True] = normalizers["all"]

        try:
            f = normalizers[normalize]
        except KeyError as err:
            raise ValueError("Not a valid normalize argument") from err

        table = f(table)
        table = table.fillna(0)

    elif margins is True:
        # keep index and column of pivoted table
        table_index = table.index
        table_columns = table.columns
        last_ind_or_col = table.iloc[-1, :].name

        # the margin name must be contained in (MultiIndex case) or equal to
        # (flat case) the last index/column entry; then save the column and index margins
        if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col):
            raise ValueError(f"{margins_name} not in pivoted DataFrame")
        column_margin = table.iloc[:-1, -1]
        index_margin = table.iloc[-1, :-1]

        # keep the core table
        table = table.iloc[:-1, :-1]

        # Normalize core
        table = _normalize(table, normalize=normalize, margins=False)

        # Fix Margins
        if normalize == "columns":
            column_margin = column_margin / column_margin.sum()
            table = concat([table, column_margin], axis=1)
            table = table.fillna(0)
            table.columns = table_columns

        elif normalize == "index":
            index_margin = index_margin / index_margin.sum()
            table = table._append(index_margin)
            table = table.fillna(0)
            table.index = table_index

        elif normalize == "all" or normalize is True:
            column_margin = column_margin / column_margin.sum()
            index_margin = index_margin / index_margin.sum()
            index_margin.loc[margins_name] = 1
            table = concat([table, column_margin], axis=1)
            table = table._append(index_margin)

            table = table.fillna(0)
            table.index = table_index
            table.columns = table_columns

        else:
            raise ValueError("Not a valid normalize argument")

    else:
        raise ValueError("Not a valid margins argument")

    return table
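
This private helper is what pd.crosstab routes through when normalize is combined with margins=True; a minimal sketch of that public call:

import pandas as pd

toy = pd.DataFrame({
    "A": ["foo", "foo", "bar", "bar", "foo"],
    "B": ["one", "two", "one", "two", "one"],
})

# normalize="index": each row, including the margin row, sums to 1
print(pd.crosstab(toy.A, toy.B, normalize="index", margins=True))

# normalize=True: the core cells sum to 1; margins are re-normalised and appended
print(pd.crosstab(toy.A, toy.B, normalize=True, margins=True))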
Пример #56
0
    def test_set_index_datetime(self):
        # GH#3950
        df = DataFrame({
            "label": ["a", "a", "a", "b", "b", "b"],
            "datetime": [
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
            ],
            "value":
            range(6),
        })
        df.index = to_datetime(df.pop("datetime"), utc=True)
        df.index = df.index.tz_convert("US/Pacific")

        expected = DatetimeIndex(
            [
                "2011-07-19 07:00:00", "2011-07-19 08:00:00",
                "2011-07-19 09:00:00"
            ],
            name="datetime",
        )
        expected = expected.tz_localize("UTC").tz_convert("US/Pacific")

        df = df.set_index("label", append=True)
        tm.assert_index_equal(df.index.levels[0], expected)
        tm.assert_index_equal(df.index.levels[1],
                              Index(["a", "b"], name="label"))
        assert df.index.names == ["datetime", "label"]

        df = df.swaplevel(0, 1)
        tm.assert_index_equal(df.index.levels[0],
                              Index(["a", "b"], name="label"))
        tm.assert_index_equal(df.index.levels[1], expected)
        assert df.index.names == ["label", "datetime"]

        df = DataFrame(np.random.random(6))
        idx1 = DatetimeIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
            ],
            tz="US/Eastern",
        )
        idx2 = DatetimeIndex(
            [
                "2012-04-01 09:00",
                "2012-04-01 09:00",
                "2012-04-01 09:00",
                "2012-04-02 09:00",
                "2012-04-02 09:00",
                "2012-04-02 09:00",
            ],
            tz="US/Eastern",
        )
        idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo")
        idx3 = idx3._with_freq(None)

        df = df.set_index(idx1)
        df = df.set_index(idx2, append=True)
        df = df.set_index(idx3, append=True)

        expected1 = DatetimeIndex(
            [
                "2011-07-19 07:00:00", "2011-07-19 08:00:00",
                "2011-07-19 09:00:00"
            ],
            tz="US/Eastern",
        )
        expected2 = DatetimeIndex(["2012-04-01 09:00", "2012-04-02 09:00"],
                                  tz="US/Eastern")

        tm.assert_index_equal(df.index.levels[0], expected1)
        tm.assert_index_equal(df.index.levels[1], expected2)
        tm.assert_index_equal(df.index.levels[2], idx3)

        # GH#7092
        tm.assert_index_equal(df.index.get_level_values(0), idx1)
        tm.assert_index_equal(df.index.get_level_values(1), idx2)
        tm.assert_index_equal(df.index.get_level_values(2), idx3)
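
# A minimal sketch of the behaviour exercised above (hypothetical data): appending
# a tz-aware DatetimeIndex via set_index(..., append=True) builds a MultiIndex, and
# both .levels and .get_level_values() keep the timezone.
import pandas as pd

frame = pd.DataFrame({"v": range(3)})
idx = pd.DatetimeIndex(
    ["2011-07-19 07:00", "2011-07-19 08:00", "2011-07-19 09:00"], tz="US/Eastern")
frame = frame.set_index(idx)
frame = frame.set_index(pd.Index(["a", "b", "a"], name="label"), append=True)
print(frame.index.get_level_values(0).tz)  # US/Eastern
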
Пример #57
def _generate_marginal_results(table,
                               data,
                               values,
                               rows,
                               cols,
                               aggfunc,
                               observed,
                               margins_name: str = "All"):
    if len(cols) > 0:
        # need to "interleave" the margins
        table_pieces = []
        margin_keys = []

        def _all_key(key):
            return (key, margins_name) + ("", ) * (len(cols) - 1)

        if len(rows) > 0:
            margin = data[rows + values].groupby(
                rows, observed=observed).agg(aggfunc)
            cat_axis = 1

            for key, piece in table.groupby(level=0,
                                            axis=cat_axis,
                                            observed=observed):
                all_key = _all_key(key)

                # we are going to mutate this, so need to copy!
                piece = piece.copy()
                piece[all_key] = margin[key]

                table_pieces.append(piece)
                margin_keys.append(all_key)
        else:
            from pandas import DataFrame

            cat_axis = 0
            for key, piece in table.groupby(level=0,
                                            axis=cat_axis,
                                            observed=observed):
                if len(cols) > 1:
                    all_key = _all_key(key)
                else:
                    all_key = margins_name
                table_pieces.append(piece)
                # GH31016 this is to calculate the margin for each group, and
                # assign the corresponding key as the index
                transformed_piece = DataFrame(piece.apply(aggfunc)).T
                transformed_piece.index = Index([all_key],
                                                name=piece.index.name)

                # append piece for margin into table_piece
                table_pieces.append(transformed_piece)
                margin_keys.append(all_key)

        result = concat(table_pieces, axis=cat_axis)

        if len(rows) == 0:
            return result
    else:
        result = table
        margin_keys = table.columns

    if len(cols) > 0:
        row_margin = data[cols + values].groupby(
            cols, observed=observed).agg(aggfunc)
        row_margin = row_margin.stack()

        # slight hack
        new_order = [len(cols)] + list(range(len(cols)))
        row_margin.index = row_margin.index.reorder_levels(new_order)
    else:
        row_margin = Series(np.nan, index=result.columns)

    return result, margin_keys, row_margin
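
# A minimal sketch of the public entry point that relies on the helper above
# (illustrative data): margins=True makes pivot_table append the "All" row and
# column that _generate_marginal_results assembles from the column keys.
import pandas as pd

df = pd.DataFrame({"r": ["a", "a", "b", "b"],
                   "c": ["x", "y", "x", "y"],
                   "v": [1, 2, 3, 4]})
print(pd.pivot_table(df, values="v", index="r", columns="c",
                     aggfunc="sum", margins=True, margins_name="All"))
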
Пример #58
def return_risk_analysis_old(nav_df: pd.DataFrame,
                             date_frm=None,
                             date_to=None,
                             freq='weekly',
                             rf=0.02):
    """
    Compute per-column return/risk performance statistics for rr_df.
    :param nav_df: NAV DataFrame; the index holds dates and each column is one product's NAV curve
    :param date_frm: start of the statistics window, may be None
    :param date_to: end of the statistics window, may be None
    :param freq: None to auto-detect, or 'daily', 'weekly', 'monthly'
    :param rf: risk-free rate, default 0.02
    :return:
    """
    nav_df.index = [try_2_date(idx) for idx in nav_df.index]
    nav_sorted_df = nav_df.sort_index()
    rr_df = (1 + nav_sorted_df.pct_change().fillna(0)).cumprod()
    rr_df.index = [try_2_date(d) for d in rr_df.index]
    # Detect whether the data is actually daily, weekly or monthly frequency
    rr_df_len = rr_df.shape[0]
    day_per_data = (rr_df.index[rr_df_len - 1] -
                    rr_df.index[0]).days / rr_df_len
    if day_per_data <= 0.005:
        freq_real = 'minute'
    elif day_per_data <= 0.2:
        freq_real = 'hour'
    elif day_per_data <= 2:
        freq_real = 'daily'
    elif day_per_data <= 10:
        freq_real = 'weekly'
    else:
        freq_real = 'monthly'
    if freq is None:
        freq = freq_real
    elif freq != freq_real:
        warnings_msg = "data freq wrong, expect %s, but %s was detected" % (
            freq, freq_real)
        # warnings.warn(warnings_msg)
        # logging.warning(warnings_msg)
        raise ValueError(warnings_msg)

    freq_str = ''
    if freq == 'weekly':
        data_count_per_year = 50
        freq_str = '周'
    elif freq == 'monthly':
        data_count_per_year = 12
        freq_str = '月'
    elif freq == 'daily':
        data_count_per_year = 250
        freq_str = '日'
    elif freq == 'hour':
        data_count_per_year = 1250
        freq_str = '时'
    elif freq == 'minute':
        data_count_per_year = 75000
        freq_str = '分'
    else:
        raise ValueError("freq=%s is not supported; only 'daily', 'weekly' or 'monthly' is accepted" % freq)
    stat_dic_dic = OrderedDict()
    # rr_df.index = [str_2_date(d) for d in rr_df.index]
    rr_uindex_df = rr_df.reset_index()
    col_name_list = list(rr_uindex_df.columns)
    date_col_name = col_name_list[0]
    col_name_list = col_name_list[1:]
    if type(date_frm) is str:
        date_frm = datetime.strptime(date_frm, '%Y-%m-%d').date()
    if type(date_to) is str:
        date_to = datetime.strptime(date_to, '%Y-%m-%d').date()
    for col_name in col_name_list:
        data_df = rr_uindex_df[[date_col_name, col_name]]
        # print(data_df)
        data_df.columns = ['Date', 'Value']
        data_df = get_df_between_date(data_df, date_frm, date_to)
        data_df.Value = data_df.Value / data_df.Value.iloc[0]  # positional access: label 0 may have been filtered out above
        data_df['ret'] = data_df.Value.pct_change().fillna(0)
        date_span = data_df.Date[data_df.index[-1]] - data_df.Date[
            data_df.index[0]]
        date_span_fraction = 365 / date_span.days if date_span.days > 0 else 1
        # basic indicators
        CAGR = data_df.Value[data_df.index[-1]]**date_span_fraction - 1
        period_rr = data_df.Value[data_df.index[-1]] - 1
        ann_vol = np.std(data_df.ret, ddof=1) * np.sqrt(data_count_per_year)
        down_side_vol = np.std(data_df.ret[data_df.ret < 0],
                               ddof=1) * np.sqrt(data_count_per_year)
        # WeeksNum = data.shape[0]
        profit_loss_ratio = -np.mean(data_df.ret[data_df.ret > 0]) / np.mean(
            data_df.ret[data_df.ret < 0])
        win_ratio = len(data_df.ret[data_df.ret >= 0]) / len(data_df.ret)
        min_value = min(data_df.Value)
        final_value = data_df.Value[data_df.index[-1]]
        max_ret = max(data_df.ret)
        min_ret = min(data_df.ret)
        # End of basic indicators
        # max drawdown related
        data_df['mdd'] = data_df.Value / data_df.Value.cummax() - 1
        mdd_size = min(data_df.mdd)
        droparray = pd.Series(data_df.index[data_df.mdd == 0])
        if len(droparray) == 1:
            mdd_max_period = len(data_df.mdd)
        else:
            if float(data_df.Value[droparray.tail(1)]) > float(
                    data_df.Value.tail(1)):
                droparray = droparray.append(pd.Series(data_df.index[-1]),
                                             ignore_index=True)
            mdd_max_period = max(droparray.diff().dropna()) - 1
        # End of max drawdown related
        # High level indicators
        sharpe_ratio = (CAGR - rf) / ann_vol
        sortino_ratio = (CAGR - rf) / down_side_vol
        calmar_ratio = CAGR / (-mdd_size)
        #  Natural month return
        j = 1
        for i in data_df.index:
            if i == 0:
                month_ret = pd.DataFrame([[data_df.Date[i], data_df.Value[i]]],
                                         columns=('Date', 'Value'))
            else:
                if data_df.Date[i].month != data_df.Date[i - 1].month:
                    month_ret.loc[j] = [
                        data_df.Date[i - 1], data_df.Value[i - 1]
                    ]
                    j += 1
        month_ret.loc[j] = [
            data_df.Date[data_df.index[-1]], data_df.Value[data_df.index[-1]]
        ]
        month_ret['ret'] = month_ret.Value.pct_change().fillna(0)
        max_rr_month = max(month_ret.ret)
        min_rr_month = min(month_ret.ret)
        # End of Natural month return
        data_len = data_df.shape[0]
        date_begin = data_df.Date[0]  # .date()
        date_end = data_df.Date[data_len - 1]
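        # Row labels below are Chinese: start date, end date, period return,
        # final NAV, lowest NAV, annualized return, annualized volatility,
        # annualized downside volatility, max drawdown, Sharpe ratio, Sortino
        # ratio, Calmar ratio, profit/loss ratio, win ratio, longest stretch
        # without a new high (in freq units), max/min single-period return,
        # max/min monthly return.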
        stat_dic = OrderedDict([('起始日期', date_begin), ('截止日期', date_end),
                                ('区间收益率', '%.2f%%' % (period_rr * 100)),
                                ('最终净值', '%.4f' % final_value),
                                ('最低净值', '%.4f' % min_value),
                                ('年化收益率', '%.2f%%' % (CAGR * 100)),
                                ('年化波动率', '%.2f%%' % (ann_vol * 100)),
                                ('年化下行波动率', '%.2f%%' % (down_side_vol * 100)),
                                ('最大回撤', '%.2f%%' % (mdd_size * 100)),
                                ('夏普率', '%.2f' % sharpe_ratio),
                                ('索提诺比率', '%.2f' % sortino_ratio),
                                ('卡马比率', '%.2f' % calmar_ratio),
                                ('盈亏比', '%.2f' % profit_loss_ratio),
                                ('胜率', '%.2f' % win_ratio),
                                ('最长不创新高(%s)' % freq_str, mdd_max_period),
                                ('统计周期最大收益', '%.2f%%' % (max_ret * 100)),
                                ('统计周期最大亏损', '%.2f%%' % (min_ret * 100)),
                                ('最大月收益', '%.2f%%' % (max_rr_month * 100)),
                                ('最大月亏损', '%.2f%%' % (min_rr_month * 100))])
        stat_dic_dic[col_name] = stat_dic
    stat_df = pd.DataFrame(stat_dic_dic)
    stat_df = stat_df.loc[list(stat_dic.keys())]  # .ix was removed from pandas; .loc keeps the row order
    return stat_df
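
# A minimal sketch of the core metrics the function above reports (illustrative
# numbers; the 50-periods-per-year constant matches its 'weekly' branch and
# rf=0.02 matches the default argument).
import numpy as np
import pandas as pd

nav = pd.Series([1.00, 1.01, 0.99, 1.03, 1.05])
ret = nav.pct_change().fillna(0)
periods_per_year = 50
years = (len(nav) - 1) / periods_per_year
cagr = (nav.iloc[-1] / nav.iloc[0]) ** (1 / years) - 1
ann_vol = ret.std(ddof=1) * np.sqrt(periods_per_year)
sharpe = (cagr - 0.02) / ann_vol
max_drawdown = (nav / nav.cummax() - 1).min()
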
Пример #59
    def validate(
            self,
            check_obj: pd.DataFrame,
            head: Optional[int] = None,
            tail: Optional[int] = None,
            sample: Optional[int] = None,
            random_state: Optional[int] = None,
            lazy: bool = False,
    ) -> pd.DataFrame:
        # pylint: disable=too-many-locals,too-many-branches
        """Check if all columns in a dataframe have a column in the Schema.

        :param pd.DataFrame check_obj: the dataframe to be validated.
        :param head: validate the first n rows. Rows overlapping with `tail` or
            `sample` are de-duplicated.
        :param tail: validate the last n rows. Rows overlapping with `head` or
            `sample` are de-duplicated.
        :param sample: validate a random sample of n rows. Rows overlapping
            with `head` or `tail` are de-duplicated.
        :param random_state: random seed for the ``sample`` argument.
        :param lazy: if True, lazily evaluates dataframe against all validation
            checks and raises a ``SchemaErrors`` exception. Otherwise, raise
            ``SchemaError`` as soon as one occurs.
        :returns: validated ``DataFrame``

        :raises SchemaError: when ``DataFrame`` violates built-in or custom
            checks.

        :example:

        Calling ``schema.validate`` returns the dataframe.

        >>> import pandas as pd
        >>> import pandera as pa
        >>>
        >>> df = pd.DataFrame({
        ...     "probability": [0.1, 0.4, 0.52, 0.23, 0.8, 0.76],
        ...     "category": ["dog", "dog", "cat", "duck", "dog", "dog"]
        ... })
        >>>
        >>> schema_withchecks = pa.DataFrameSchema({
        ...     "probability": pa.Column(
        ...         pa.Float, pa.Check(lambda s: (s >= 0) & (s <= 1))),
        ...
        ...     # check that the "category" column contains a few discrete
        ...     # values, and the majority of the entries are dogs.
        ...     "category": pa.Column(
        ...         pa.String, [
        ...             pa.Check(lambda s: s.isin(["dog", "cat", "duck"])),
        ...             pa.Check(lambda s: (s == "dog").mean() > 0.5),
        ...         ]),
        ... })
        >>>
        >>> schema_withchecks.validate(df)[["probability", "category"]]
           probability category
        0         0.10      dog
        1         0.40      dog
        2         0.52      cat
        3         0.23     duck
        4         0.80      dog
        5         0.76      dog
        """

        if self._is_inferred:
            warnings.warn(
                "This %s is an inferred schema that hasn't been "
                "modified. It's recommended that you refine the schema "
                "by calling `add_columns`, `remove_columns`, or "
                "`update_columns` before using it to validate data."
                % type(self),
                UserWarning
            )

        error_handler = SchemaErrorHandler(lazy)

        # dataframe strictness check makes sure all columns in the dataframe
        # are specified in the dataframe schema
        if self.strict:

            # expand regex columns
            col_regex_matches = []  # type: ignore
            for colname, col_schema in self.columns.items():
                if col_schema.regex:
                    try:
                        col_regex_matches.extend(
                            col_schema.get_regex_columns(check_obj.columns))
                    except errors.SchemaError:
                        pass

            expanded_column_names = frozenset(
                [n for n, c in self.columns.items() if not c.regex] +
                col_regex_matches
            )

            for column in check_obj:
                if column not in expanded_column_names:
                    msg = (
                        "column '%s' not in DataFrameSchema %s" %
                        (column, self.columns)
                    )
                    error_handler.collect_error(
                        "column_not_in_schema", errors.SchemaError(
                            self, check_obj, msg,
                            failure_cases=scalar_failure_case(column),
                            check="column_in_schema",
                        )
                    )

        # column data-type coercion logic
        lazy_exclude_columns = []
        for colname, col_schema in self.columns.items():
            if col_schema.regex:
                try:
                    matched_columns = col_schema.get_regex_columns(
                        check_obj.columns)
                except errors.SchemaError:
                    matched_columns = pd.Index([])

                for matched_colname in matched_columns:
                    if col_schema.coerce or self.coerce:
                        check_obj[matched_colname] = col_schema.coerce_dtype(
                            check_obj[matched_colname])

            elif colname not in check_obj and col_schema.required:
                if lazy:
                    # exclude columns that are not present in the dataframe
                    # for lazy validation, the error is collected by the
                    # error_handler and should raise a SchemaErrors exception
                    # at the end of the `validate` method.
                    lazy_exclude_columns.append(colname)
                msg = (
                    "column '%s' not in dataframe\n%s" %
                    (colname, check_obj.head())
                )
                error_handler.collect_error(
                    "column_not_in_dataframe", errors.SchemaError(
                        self, check_obj, msg,
                        failure_cases=scalar_failure_case(colname),
                        check="column_in_dataframe",
                    )
                )

            elif col_schema.coerce or self.coerce:
                check_obj.loc[:, colname] = col_schema.coerce_dtype(
                    check_obj[colname])

        schema_components = [
            col for col_name, col in self.columns.items()
            if (col.required or col_name in check_obj)
            and col_name not in lazy_exclude_columns
        ]
        if self.index is not None:
            if self.index.coerce or self.coerce:
                check_obj.index = self.index.coerce_dtype(check_obj.index)
            schema_components.append(self.index)

        dataframe_to_validate = self._dataframe_to_validate(
            check_obj, head, tail, sample, random_state)

        check_results = []
        # schema-component-level checks
        for schema_component in schema_components:
            try:
                check_results.append(isinstance(
                    schema_component(dataframe_to_validate), pd.DataFrame))
            except errors.SchemaError as err:
                error_handler.collect_error("schema_component_check", err)

        # dataframe-level checks
        for check_index, check in enumerate(self.checks):
            try:
                check_results.append(_handle_check_results(
                    self, check_index, check, dataframe_to_validate))
            except errors.SchemaError as err:
                error_handler.collect_error("dataframe_check", err)

        if lazy and error_handler.collected_errors:
            raise errors.SchemaErrors(
                error_handler.collected_errors, check_obj)

        assert all(check_results)
        return check_obj
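
# A minimal sketch of lazy validation with the schema shown in the docstring above
# (column name and check are illustrative): lazy=True collects every failure and
# raises a single SchemaErrors exception instead of stopping at the first error.
import pandas as pd
import pandera as pa
from pandera.errors import SchemaErrors

schema = pa.DataFrameSchema({
    "probability": pa.Column(pa.Float, pa.Check(lambda s: (s >= 0) & (s <= 1))),
})
bad_df = pd.DataFrame({"probability": [0.5, 1.7, -0.2]})
try:
    schema.validate(bad_df, lazy=True)
except SchemaErrors as exc:
    print(exc.failure_cases)  # one row per collected failure
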
Пример #60
    def test_margin_dropna(self):
        # GH 12577
        # pivot_table counts null into margin ('All')
        # when margins=true and dropna=true

        df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
        actual = crosstab(df.a, df.b, margins=True, dropna=True)
        expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3, 4, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

        df = DataFrame({
            "a": [1, np.nan, np.nan, np.nan, 2, np.nan],
            "b": [3, np.nan, 4, 4, 4, 4]
        })
        actual = crosstab(df.a, df.b, margins=True, dropna=True)
        expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3.0, 4.0, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

        df = DataFrame({
            "a": [1, np.nan, np.nan, np.nan, np.nan, 2],
            "b": [3, 3, 4, 4, 4, 4]
        })
        actual = crosstab(df.a, df.b, margins=True, dropna=True)
        expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3, 4, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

        # GH 12642
        # _add_margins raises KeyError: Level None not found
        # when margins=True and dropna=False
        df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
        actual = crosstab(df.a, df.b, margins=True, dropna=False)
        expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3, 4, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

        df = DataFrame({
            "a": [1, np.nan, np.nan, np.nan, 2, np.nan],
            "b": [3, np.nan, 4, 4, 4, 4]
        })
        actual = crosstab(df.a, df.b, margins=True, dropna=False)
        expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3.0, 4.0, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

        a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"],
                     dtype=object)
        b = np.array(["one", "one", "two", "one", "two", np.nan, "two"],
                     dtype=object)
        c = np.array(
            ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"],
            dtype=object)

        actual = crosstab(a, [b, c],
                          rownames=["a"],
                          colnames=["b", "c"],
                          margins=True,
                          dropna=False)
        m = MultiIndex.from_arrays(
            [
                ["one", "one", "two", "two", "All"],
                ["dull", "shiny", "dull", "shiny", ""],
            ],
            names=["b", "c"],
        )
        expected = DataFrame(
            [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m)
        expected.index = Index(["bar", "foo", "All"], name="a")
        tm.assert_frame_equal(actual, expected)

        actual = crosstab([a, b],
                          c,
                          rownames=["a", "b"],
                          colnames=["c"],
                          margins=True,
                          dropna=False)
        m = MultiIndex.from_arrays(
            [["bar", "bar", "foo", "foo", "All"],
             ["one", "two", "one", "two", ""]],
            names=["a", "b"],
        )
        expected = DataFrame(
            [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m)
        expected.columns = Index(["dull", "shiny", "All"], name="c")
        tm.assert_frame_equal(actual, expected)

        actual = crosstab([a, b],
                          c,
                          rownames=["a", "b"],
                          colnames=["c"],
                          margins=True,
                          dropna=True)
        m = MultiIndex.from_arrays(
            [["bar", "bar", "foo", "foo", "All"],
             ["one", "two", "one", "two", ""]],
            names=["a", "b"],
        )
        expected = DataFrame(
            [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m)
        expected.columns = Index(["dull", "shiny", "All"], name="c")
        tm.assert_frame_equal(actual, expected)
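
# A minimal sketch of the dropna behaviour tested above (illustrative data): the
# core cells are identical either way; only the "All" margins change, because
# dropna=False also counts observations whose row/column key is NaN.
import numpy as np
import pandas as pd

a = pd.Series([1, 2, 2, np.nan], name="a")
b = pd.Series([3, 3, 4, 4], name="b")
print(pd.crosstab(a, b, margins=True, dropna=True))   # grand total "All" is 3
print(pd.crosstab(a, b, margins=True, dropna=False))  # grand total "All" is 4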