Example #1
    def build_related_objects(self):
        tag_cols = ['indicator_id', 'indicator_tag_id']
        bound_cols = ['indicator_id', 'bound_name', 'mn_val', 'mx_val']

        tag_df = DataFrame(list(IndicatorToTag.objects.filter(
            indicator_id__in=self.indicator_id_list).values_list(*tag_cols)), columns=tag_cols)

        bound_df = DataFrame(list(
            IndicatorBound.objects.filter(
                indicator_id__in=self.indicator_id_list).values_list(*bound_cols)
        ), columns=bound_cols)

        qs = Indicator.objects.filter(id__in=self.indicator_id_list)

        bound_df = bound_df.where((notnull(bound_df)), None)
        tag_df = tag_df.where((notnull(tag_df)), None)
        # office_df = office_df.where((notnull(office_df)), None)

        return qs, bound_df, tag_df
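A note on the `bound_df.where((notnull(bound_df)), None)` lines above: `DataFrame.where(cond, other)` keeps each cell where `cond` is True and substitutes `other` everywhere else, so this idiom swaps the NaN values produced by the ORM query for `None` before the frames are handed on. A minimal standalone sketch of the idiom, using made-up column values:

from pandas import DataFrame, notnull
import numpy as np

# hypothetical frame with a missing bound value, as it might come back from a query
bound_df = DataFrame({
    'indicator_id': [1, 2],
    'mn_val': [0.0, np.nan],
    'mx_val': [10.0, 5.0],
})

# keep cells where notnull() is True, fill the rest with None
# (on older pandas this casts numeric columns to object so NaN becomes None;
#  on recent versions you may need an explicit .astype(object) first)
bound_df = bound_df.where(notnull(bound_df), None)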
Example #2
    def build_indicator_queryset(self, indicator_id_list):
        '''
        Based on the indicator_ids, we make requests for the indicators,
        the bounds, the tags, and the indicator-to-office mapping.
        '''

        tag_cols = ['indicator_id','indicator_tag_id']
        bound_cols = ['indicator_id','bound_name','mn_val','mx_val']
        ind_to_office_cols = ['indicator_id','office_id']

        if indicator_id_list:
            ## get tags ##
            tag_df = DataFrame(list(IndicatorToTag.objects.filter(indicator_id__in=\
                indicator_id_list).values_list(*tag_cols)),columns = tag_cols)

            bound_df = DataFrame(list(IndicatorBound.objects\
                .filter(indicator_id__in=indicator_id_list)\
                .values_list(*bound_cols)),columns = bound_cols)

            office_df = DataFrame(list(IndicatorToOffice.objects\
                .filter(indicator_id__in=indicator_id_list)\
                .values_list(*ind_to_office_cols)),columns = ind_to_office_cols)

            qs = Indicator.objects.filter(id__in=indicator_id_list)

        else:
            tag_df = DataFrame(list(IndicatorToTag.objects.all()\
                .values_list(*tag_cols)),columns = tag_cols)

            bound_df = DataFrame(list(IndicatorBound.objects.all()\
                .values_list(*bound_cols)),columns = bound_cols)

            office_df = DataFrame(list(IndicatorToOffice.objects\
                .all().values_list(*ind_to_office_cols))\
                ,columns = ind_to_office_cols)

            qs = Indicator.objects.all()

        bound_df = bound_df.where((notnull(bound_df)), None)
        tag_df = tag_df.where((notnull(tag_df)), None)
        # office_df = office_df.where((notnull(office_df)), None)

        return qs, bound_df, tag_df, office_df
Example #3
def test_where_with_numeric_data(data):
    # GH 17386
    lower_bound = 1.5

    sparse = SparseDataFrame(data)
    result = sparse.where(sparse > lower_bound)

    dense = DataFrame(data)
    dense_expected = dense.where(dense > lower_bound)
    sparse_expected = SparseDataFrame(dense_expected)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
Example #4
def test_where_with_bool_data():
    # GH 17386
    data = [[False, False], [True, True], [False, False]]
    cond = True

    sparse = SparseDataFrame(data)
    result = sparse.where(sparse == cond)

    dense = DataFrame(data)
    dense_expected = dense.where(dense == cond)
    sparse_expected = SparseDataFrame(dense_expected)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
Example #5
def test_where_with_numeric_data_and_other(data, other):
    # GH 17386
    lower_bound = 1.5

    sparse = SparseDataFrame(data)
    result = sparse.where(sparse > lower_bound, other)

    dense = DataFrame(data)
    dense_expected = dense.where(dense > lower_bound, other)
    sparse_expected = SparseDataFrame(dense_expected,
                                      default_fill_value=other)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
Example #6
def test_where_with_bool_data_and_other(other):
    # GH 17386
    data = [[False, False], [True, True], [False, False]]
    cond = True

    sparse = SparseDataFrame(data)
    result = sparse.where(sparse == cond, other)

    dense = DataFrame(data)
    dense_expected = dense.where(dense == cond, other)
    sparse_expected = SparseDataFrame(dense_expected,
                                      default_fill_value=other)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
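Examples #3 through #6 (GH 17386) all assert the same contract: `SparseDataFrame.where` should match what the dense `DataFrame.where` produces, both with the default NaN fill and with an explicit `other`. `SparseDataFrame` was removed from pandas in 1.0, so here is a dense-only sketch of the behaviour being tested, with invented values:

from pandas import DataFrame

dense = DataFrame([[1.0, 2.0], [3.0, 4.0]])
lower_bound = 1.5

# default: cells that fail the condition become NaN
masked = dense.where(dense > lower_bound)

# with an explicit `other`, failing cells are replaced by that value instead
replaced = dense.where(dense > lower_bound, other=0)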
Example #7
    def add_missing_data(self, objects):
        '''
        In Highcharts (our front-end visualization module), when we pass
        data to a stacked / grouped bar chart, everything has to be in order,
        meaning that if we have missing data, we have to pass the chart
        an object that contains an object for each piece of missing data.

        We are currently working on implementing the logic you see below in
        the front end, but in order to get something working for the TAG
        meeting next week, instead of pushing forward on the front-end
        implementation, which is in its early stages, I have put this piece of
        code into the campaign api so that the grouped bar charts
        will render properly.

        For more information on this, see: https://trello.com/c/euIwyOh4/9
        '''

        ## build a data frame from the object list
        df = DataFrame(list(objects))

        ## create a dataframe with all possible objects based on the possible combinations
        list_of_lists = [df['location_id'].unique(), \
            df['indicator_id'].unique(), df['campaign_id'].unique()]
        cart_product = list(itertools.product(*list_of_lists))
        columns_list = ['location_id','indicator_id', 'campaign_id']
        cart_prod_df = DataFrame(cart_product, columns = columns_list)

        ## merge the two data frames, which will effectively fill in the missing
        ## data, giving objects a null value if they did not exist in the query result
        df = df.merge(cart_prod_df, how='outer', on=columns_list)
        non_null_df = df.where((notnull(df)), None)

        ## create a list of dictionaries ( same structure as the input )
        object_list = [row.to_dict() for ix, row in non_null_df.iterrows()]

        return object_list
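A stripped-down sketch of the gap-filling trick used in `add_missing_data`: build the cartesian product of every location, indicator, and campaign seen in the data, outer-merge it onto the real rows, and let the merge introduce nulls for the combinations that were missing. The column names follow the example above; the rows are invented:

import itertools
from pandas import DataFrame, notnull

df = DataFrame([
    {'location_id': 1, 'indicator_id': 10, 'campaign_id': 100, 'value': 5},
    {'location_id': 2, 'indicator_id': 10, 'campaign_id': 100, 'value': 7},
    # (location 2, indicator 11, campaign 100) is deliberately missing
    {'location_id': 1, 'indicator_id': 11, 'campaign_id': 100, 'value': 3},
])

columns_list = ['location_id', 'indicator_id', 'campaign_id']
cart_product = list(itertools.product(*(df[col].unique() for col in columns_list)))
cart_prod_df = DataFrame(cart_product, columns=columns_list)

# the outer merge adds a row with value NaN for every missing combination,
# and where/notnull then turns those NaNs into None for serialization
df = df.merge(cart_prod_df, how='outer', on=columns_list)
non_null_df = df.where(notnull(df), None)
object_list = [row.to_dict() for ix, row in non_null_df.iterrows()]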
Example #8
    def test_align(self):
        af, bf = self.frame.align(self.frame)
        self.assertIsNot(af._data, self.frame._data)

        af, bf = self.frame.align(self.frame, copy=False)
        self.assertIs(af._data, self.frame._data)

        # axis = 0
        other = self.frame.ix[:-5, :3]
        af, bf = self.frame.align(other, axis=0, fill_value=-1)
        self.assert_index_equal(bf.columns, other.columns)
        # test fill value
        join_idx = self.frame.index.join(other.index)
        diff_a = self.frame.index.difference(join_idx)
        diff_b = other.index.difference(join_idx)
        diff_a_vals = af.reindex(diff_a).values
        diff_b_vals = bf.reindex(diff_b).values
        self.assertTrue((diff_a_vals == -1).all())

        af, bf = self.frame.align(other, join='right', axis=0)
        self.assert_index_equal(bf.columns, other.columns)
        self.assert_index_equal(bf.index, other.index)
        self.assert_index_equal(af.index, other.index)

        # axis = 1
        other = self.frame.ix[:-5, :3].copy()
        af, bf = self.frame.align(other, axis=1)
        self.assert_index_equal(bf.columns, self.frame.columns)
        self.assert_index_equal(bf.index, other.index)

        # test fill value
        join_idx = self.frame.index.join(other.index)
        diff_a = self.frame.index.difference(join_idx)
        diff_b = other.index.difference(join_idx)
        diff_a_vals = af.reindex(diff_a).values

        # TODO(wesm): unused?
        diff_b_vals = bf.reindex(diff_b).values  # noqa

        self.assertTrue((diff_a_vals == -1).all())

        af, bf = self.frame.align(other, join='inner', axis=1)
        self.assert_index_equal(bf.columns, other.columns)

        af, bf = self.frame.align(other, join='inner', axis=1, method='pad')
        self.assert_index_equal(bf.columns, other.columns)

        # test other non-float types
        af, bf = self.intframe.align(other, join='inner', axis=1, method='pad')
        self.assert_index_equal(bf.columns, other.columns)

        af, bf = self.mixed_frame.align(self.mixed_frame,
                                        join='inner', axis=1, method='pad')
        self.assert_index_equal(bf.columns, self.mixed_frame.columns)

        af, bf = self.frame.align(other.ix[:, 0], join='inner', axis=1,
                                  method=None, fill_value=None)
        self.assert_index_equal(bf.index, Index([]))

        af, bf = self.frame.align(other.ix[:, 0], join='inner', axis=1,
                                  method=None, fill_value=0)
        self.assert_index_equal(bf.index, Index([]))

        # mixed floats/ints
        af, bf = self.mixed_float.align(other.ix[:, 0], join='inner', axis=1,
                                        method=None, fill_value=0)
        self.assert_index_equal(bf.index, Index([]))

        af, bf = self.mixed_int.align(other.ix[:, 0], join='inner', axis=1,
                                      method=None, fill_value=0)
        self.assert_index_equal(bf.index, Index([]))

        # try to align dataframe to series along bad axis
        self.assertRaises(ValueError, self.frame.align, af.ix[0, :3],
                          join='inner', axis=2)

        # align dataframe to series with broadcast or not
        idx = self.frame.index
        s = Series(range(len(idx)), index=idx)

        left, right = self.frame.align(s, axis=0)
        tm.assert_index_equal(left.index, self.frame.index)
        tm.assert_index_equal(right.index, self.frame.index)
        self.assertTrue(isinstance(right, Series))

        left, right = self.frame.align(s, broadcast_axis=1)
        tm.assert_index_equal(left.index, self.frame.index)
        expected = {}
        for c in self.frame.columns:
            expected[c] = s
        expected = DataFrame(expected, index=self.frame.index,
                             columns=self.frame.columns)
        assert_frame_equal(right, expected)

        # GH 9558
        df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
        result = df[df['a'] == 2]
        expected = DataFrame([[2, 5]], index=[1], columns=['a', 'b'])
        assert_frame_equal(result, expected)

        result = df.where(df['a'] == 2, 0)
        expected = DataFrame({'a': [0, 2, 0], 'b': [0, 5, 0]})
        assert_frame_equal(result, expected)
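The GH 9558 checks at the end of this test exercise `where` with a boolean Series as the condition: the Series is aligned on the row index and broadcast across every column, so entire rows that fail the condition are replaced rather than dropped. A small sketch of that behaviour, reusing the same frame as the test:

from pandas import DataFrame

df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

# boolean indexing keeps only the matching rows
subset = df[df['a'] == 2]

# where() keeps the original shape: rows where the condition is False
# are filled with 0 in every column instead of being dropped
zeroed = df.where(df['a'] == 2, 0)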
Example #9
 def setup(self):
     data = np.random.randn(1000, 500)
     df = DataFrame(data)
     df = df.where(df > 0)
     self.bools = df > 0
     self.mask = isnull(df)
Example #10
class MagicDataFrame(object):

    """
    Each MagicDataFrame corresponds to one MagIC table.
    The MagicDataFrame object consists of a pandas DataFrame,
    and assorted methods for manipulating that DataFrame.
    """

    def __init__(self, magic_file=None, columns=None, dtype=None,
                 groups=None, dmodel=None, df=None):
        """
        Provide either a magic_file or a dtype.
        List of columns is optional,
        and will only be used if magic_file == None.
        Instead of a list of columns, you can also provide
        a list of group-names, and the specific col_names
        will be filled in by the data model.
        If provided, columns takes precedence.
        """
        if isinstance(df, pd.DataFrame):
            self.df = df
            if dtype:
                self.dtype = dtype
            else:
                print '-W- Please provide data type...'
        # make sure all required arguments are present
        if not magic_file and not dtype and not isinstance(df, pd.DataFrame):
            print "-W- To make a MagicDataFrame, you must provide either a filename or a datatype"
            return
        # fetch data model if not provided
        if isinstance(dmodel, type(None)):
            self.data_model = data_model.DataModel()
        else:
            self.data_model = dmodel

        if isinstance(df, pd.DataFrame):
            pass
        # if no file is provided, make an empty dataframe of the appropriate type
        elif not magic_file:
            self.dtype = dtype
            if not isinstance(columns, type(None)):
                self.df = DataFrame(columns=columns)
            else:
                self.df = DataFrame()
                self.df.index.name = dtype[:-1] if dtype.endswith("s") else dtype
        # if there is a file provided, read in the data and ascertain dtype
        else:
            ## old way of reading in data using pmag.magic_read
            #data, dtype, keys = pmag.magic_read(magic_file, return_keys=True)
            ## create dataframe, maintaining column order:
            #self.df = DataFrame(data, columns=keys)
            #if dtype == 'bad_file':
            #    print "-W- Bad file {}".format(magic_file)
            #    self.dtype = 'empty'
            #    return

            ## new way of reading in data using pd.read_table
            with open(magic_file) as f:
                delim, dtype = f.readline().split('\t')[:2]
            self.df = pd.read_table(magic_file, skiprows=[0])
            self.dtype = dtype.strip()
            if self.dtype == 'measurements':
                ###self.df['measurement_name'] = self.df['experiment_name'] + self.df['measurement_number']
                self.df['measurement'] = self.df['experiment'] + self.df['number'].astype(str)
                name = 'measurement'
            elif self.dtype.endswith('s'):
                #dtype = dtype[:-1]
                name = '{}'.format(self.dtype[:-1])
            elif self.dtype == 'contribution':
                name = 'doi'
                # **** this is broken at the moment, fix it!
                return
            else:
                name = self.dtype
            # fix these:
            if self.dtype == 'images':
                self.df = pd.DataFrame()
                return
            if self.dtype == 'criteria':
                #self.df = pd.DataFrame()
                self.df.index = self.df['table_column']
                return
            if len(self.df) and self.dtype != 'ages':
                self.df.index = self.df[name].astype(str)
            elif self.dtype == 'ages':
                self.df.index = self.df.index.astype(str)
            #del self.df[name]
            #self.dtype = dtype
            # replace '' with None, so you can use isnull(), notnull(), etc.
            # can always switch back with DataFrame.fillna('')
            self.df = self.df.where(self.df.notnull(), None)

            # drop any completely blank columns
            # this is not necessarily a good idea....
            #self.df.dropna(axis=1, how='all', inplace=True)
            #
            # add df columns that were passed in but weren't in the file
            if columns:
                for col in columns:
                    if col not in self.df.columns:
                        self.df[col] = None

        # add col_names by group
        if groups and not columns:
            columns = []
            for group_name in groups:
                columns.extend(list(self.data_model.get_group_headers(self.dtype, group_name)))
            for col in columns:
                if col not in self.df.columns:
                    self.df[col] = None
            self.df = self.df[columns]



    ## Methods to change self.df inplace

    def update_row(self, ind, row_data):
        """
        Update a row with data.
        Must provide the specific numeric index (not row label).
        If any new keys are present in row_data dictionary,
        that column will be added to the dataframe.
        This is done inplace.
        """
        if sorted(row_data.keys()) != sorted(self.df.columns):
            # add any new column names
            for key in row_data:
                if key not in self.df.columns:
                    self.df[key] = None
            # add missing column names into row_data
            for col_label in self.df.columns:
                if col_label not in row_data.keys():
                    row_data[col_label] = None
        try:
            self.df.iloc[ind] = pd.Series(row_data)
        except IndexError:
            return False
        return self.df


    def add_row(self, label, row_data, columns=""):
        """
        Add a row with data.
        If any new keys are present in row_data dictionary,
        that column will be added to the dataframe.
        This is done inplace
        """
        # use provided column order, making sure you don't lose any values
        # from self.df.columns
        if len(columns):
            if sorted(self.df.columns) == sorted(columns):
                self.df.columns = columns
            else:
                new_columns = []
                new_columns.extend(columns)
                for col in self.df.columns:
                    if col not in new_columns:
                        new_columns.append(col)
        # make sure all columns have data or None
        if sorted(row_data.keys()) != sorted(self.df.columns):
            # add any new column names
            for key in row_data:
                if key not in self.df.columns:
                    self.df[key] = None
            # add missing column names into row_data
            for col_label in self.df.columns:
                if col_label not in row_data.keys():
                    row_data[col_label] = None

        # (make sure you are working with strings)
        self.df.index = self.df.index.astype(str)
        label = str(label)

        # create a new row with suffix "new"
        # (this ensures that you get a unique, new row,
        #  instead of adding on to an existing row with the same label)
        self.df.loc[label + "new"] = pd.Series(row_data)
        # rename it to be correct
        self.df.rename(index={label + "new": label}, inplace=True)
        # use next line to sort index inplace
        #self.df.sort_index(inplace=True)
        return self.df


    def add_blank_row(self, label):
        """
        Add a blank row with only an index value to self.df.
        This is done inplace.
        """
        col_labels = self.df.columns
        blank_item = pd.Series({}, index=col_labels, name=label)
        # use .loc to add in place (append won't do that)
        self.df.loc[blank_item.name] = blank_item
        return self.df


    def delete_row(self, ind):
        """
        remove self.df row at ind
        inplace
        """
        self.df = pd.concat([self.df[:ind], self.df[ind+1:]])
        return self.df

    def delete_rows(self,condition):
        """
        delete all rows with  condition==True
        inplace
        """
        self.df['num'] = range(len(self.df))
        df_data = self.df
        # delete all records that meet condition
        if len(df_data[condition]) > 0:  #we have one or more records to delete
            inds = df_data[condition]['num'] # list of all rows where condition is true
            for ind in inds:
                df_data = self.delete_row(ind)
                print 'deleting row where: ',condition
        # sort so that all rows for an item are together
        df_data.sort_index(inplace=True)
        # redo temporary index
        df_data['num'] = range(len(df_data))
        self.df = df_data
        return df_data


    def update_record(self, name, new_data, condition, update_only=False,
                      debug=False):
        """
        Find the first row in self.df with index == name
        and condition == True.
        Update that record with new_data, then delete any
        additional records where index == name and condition == True.
        Change is inplace
        """
        # add numeric index column temporarily
        self.df['num'] = range(len(self.df))
        df_data = self.df
        # edit first of existing data that meets condition
        if len(df_data[condition]) > 0:  #we have one or more records to update or delete
            #print "updating:", name
            inds = df_data[condition]['num'] # list of all rows where condition is true
            existing_data = dict(df_data.iloc[inds[0]]) # get first record of existing_data from dataframe
            existing_data.update(new_data) # update existing data with new interpretations
            # update row
            self.update_row(inds[0], existing_data)
            # now remove all the remaining records of same condition
            if len(inds) > 1:
                for ind in inds[1:]:
                    print "deleting redundant records for:", name
                    df_data = self.delete_row(ind)
        else:
            if update_only:
                print "no record found for that condition, not updating ", name
            else:
                print 'no record found - creating new one for ', name
                # add new row
                df_data = self.add_row(name, new_data)
        # sort so that all rows for an item are together
        df_data.sort_index(inplace=True)
        # redo temporary index
        df_data['num'] = range(len(df_data))
        self.df = df_data
        return df_data


    ## Methods that take self.df and extract some information from it

    def convert_to_pmag_data_list(self, lst_or_dict="lst", df=None):

        """
        Take MagicDataFrame and turn it into a list of dictionaries.
        This will have the same format as reading in a 2.5 file
        with pmag.magic_read(), i.e.:
        if "lst":
          [{"sample": "samp_name", "azimuth": 12, ...}, {...}]
        if "dict":
          {"samp_name": {"azimuth": 12, ...}, "samp_name2": {...}, ...}
        """
        if isinstance(df, type(None)):
            df = self.df
        dictionary = dict(df.T)
        if lst_or_dict == "lst":
            return [dict(dictionary[key]) for key in dictionary]
        else:
            return {key: dict(dictionary[key]) for key in dictionary}


    def get_name(self, col_name, df_slice="", index_names=""):
        """
        Takes in a column name, and either a DataFrame slice or
        a list of index_names to slice self.df using fancy indexing.
        Then return the value for that column in the relevant slice.
        (Assumes that all values for column will be the same in the
         chosen slice, so return the first one.)
        """
        # if slice is provided, use it
        if any(df_slice):
            df_slice = df_slice
        # if given index_names, grab a slice using fancy indexing
        elif index_names:
            df_slice = self.df.ix[index_names]
        # otherwise, use the full DataFrame
        else:
            df_slice = self.df
        # if the slice is empty, return ""
        if len(df_slice) == 0:
            return ""
        # if the column name isn't present in the slice, return ""
        if col_name not in df_slice.columns:
            return ""
        # otherwise, return the first value from that column
        first_val = df_slice[col_name].dropna()
        if any(first_val):
            return first_val[0]
        else:
            return ""
        #return df_slice[col_name].dropna()[0]


    def get_di_block(self, df_slice=None, do_index=False,
                     item_names=None, tilt_corr='100',
                     excl=None):
        """
        Input either a DataFrame slice
        or
        do_index=True and a list of index_names.
        Output dec/inc from the slice in this format:
        [[dec1, inc1], [dec2, inc2], ...].
        Not inplace
        """
        tilt_corr = int(tilt_corr)
        if isinstance(df_slice, str):
            if df_slice.lower() == "all":
                # use entire DataFrame
                df_slice = self.df
        elif do_index:
            # use fancy indexing (but note this will give duplicates)
            df_slice = self.df.ix[item_names]
        elif not do_index:
            # otherwise use the provided slice
            df_slice = df_slice

        # once you have the slice, fix up the data
        # tilt correction must match
        if tilt_corr != 0:
            df_slice = df_slice[df_slice['dir_tilt_correction'] == tilt_corr]
        else:
            # if geographic ("0"),
            # use records with no tilt_corr and assume geographic
            cond1 = df_slice['dir_tilt_correction'] == None
            cond2 = df_slice['dir_tilt_correction'] == tilt_corr
            df_slice = df_slice[cond1 | cond2]
        # exclude data with unwanted codes
        if excl:
            for ex in excl:
                df_slice = self.get_records_for_code(ex, incl=False,
                                                     use_slice=True,
                                                     sli=df_slice)

        df_slice = df_slice[df_slice['dir_inc'].notnull() & df_slice['dir_dec'].notnull()]
        # possible add in:
        # split out di_block from this study from di_block from other studies (in citations column)
        # for now, just use "This study"
        if 'citations' in df_slice.columns:
            df_slice = df_slice[df_slice['citations'] == "This study"]

        # convert values into DIblock format
        di_block = [[float(row['dir_dec']), float(row['dir_inc'])] for ind, row in df_slice.iterrows()]
        return di_block


    def get_records_for_code(self, meth_code, incl=True, use_slice=False,
                             sli=None, strict_match=True):
        """
        Use regex to see if meth_code is in the method_codes ":" delimited list.
        If incl == True, return all records WITH meth_code.
        If incl == False, return all records WITHOUT meth_code.
        If strict_match == True, return only records with the exact meth_code.
        If strict_match == False, return records that contain the meth_code partial string,
        (i.e., "DE-").
        Not inplace
        """
        # (must use fillna to replace np.nan with False for indexing)
        if use_slice:
            df = sli.copy()
        else:
            df = self.df.copy()
        # if meth_code not provided, return unchanged dataframe
        if not meth_code:
            return df
        # get regex
        if not strict_match:
            # grab any record that contains any part of meth_code
            cond = df['method_codes'].str.contains(meth_code).fillna(False)
        else:
            # grab only an exact match
            pattern = re.compile('{}(?=:|\s|\Z)'.format(meth_code))
            cond = df['method_codes'].str.contains(pattern).fillna(False)
        if incl:
            # return a copy of records with that method code:
            return df[cond]
        else:
            # return a copy of records without that method code
            return df[~cond]


    ## Combining multiple DataFrames

    def merge_dfs(self, df1, replace_dir_or_int):
        """
        Description: takes newly calculated directional data, intensity data, or both, and replaces the corresponding data in self.df with the new input data, preserving any data that is not replaced.

        @param: df1 - first DataFrame whose data will preferentially be used.
        @param: replace_dir_or_int - must be the string 'dir', 'int', or 'full' and acts as a flag to tell the function whether to replace directional data, intensity data, or everything in the current table. If there is not enough data in the current table to split by dir or int, the two dfs will be fully merged. (Note: if you are dealing with tables other than specimens.txt, you should likely use 'full', as specimens.txt is the only table the other options have been tested on.)
        """

        if self.df.empty: return df1
        elif df1.empty: return self.df

        #copy to prevent mutation
        cdf2 = self.df.copy()

        #split data into types and decide which to replace
        if replace_dir_or_int == 'dir' and 'method_codes' in cdf2.columns:
            cdf2 = cdf2[cdf2['method_codes'].notnull()]
            acdf2 = cdf2[cdf2['method_codes'].str.contains('LP-PI')]
            mcdf2 = cdf2[cdf2['method_codes'].str.contains('LP-DIR')]
        elif replace_dir_or_int == 'int' and 'method_codes' in cdf2.columns:
            cdf2 = cdf2[cdf2['method_codes'].notnull()]
            mcdf2 = cdf2[cdf2['method_codes'].str.contains('LP-PI')]
            acdf2 = cdf2[cdf2['method_codes'].str.contains('LP-DIR')]
        else:
            mcdf2 = cdf2
            acdf2 = pd.DataFrame(columns=mcdf2.columns)

        #get rid of stupid duplicates
        for c in [cx for cx in mcdf2.columns if cx in df1.columns]:
            del mcdf2[c]

        #join the new calculated data with the old data of same type
        mdf = df1.join(mcdf2, how='inner', lsuffix='__remove')
        #duplicates rows for some freaking reason
        mdf.drop_duplicates(inplace=True,subset=[col for col in mdf.columns if col != 'description'])
        #merge the data of the other type with the new data
        mdf = mdf.merge(acdf2, how='outer')
        if self.dtype.endswith('s'): dtype = self.dtype[:-1]
        else: dtype = self.dtype
        if dtype in mdf.columns:
            #fix freaking indecies because pandas
            mdf = mdf.set_index(dtype)
            #really? I wanted the index changed not a column deleted?!?
            mdf[dtype] = mdf.index
            mdf.sort_index(inplace=True)

        return mdf


    ## Methods for writing self.df out to tab-delimited file

    def write_magic_file(self, custom_name=None, dir_path=".", append=False):
        """
        Write self.df out to tab-delimited file.
        By default will use standard MagIC filenames (specimens.txt, etc.),
        or you can provide a custom_name to write to instead.
        By default will write to current directory,
        or provide dir_path to write out to instead.
        """
        # *** maybe add some logical order to the column names, here?
        # *** i.e., alphabetical...  see grid_frame3.GridBuilder.make_grid
        df = self.df
        # if indexing column was put in, remove it
        if "num" in self.df.columns:
            self.df.drop("num", axis=1, inplace=True)
        dir_path = os.path.realpath(dir_path)
        if custom_name:
            fname = os.path.join(dir_path, custom_name)
        else:
            fname = os.path.join(dir_path, self.dtype + ".txt")
        # add to existing file
        if append:
            print '-I- appending {} data to {}'.format(self.dtype, fname)
            mode = "a"
        # overwrite existing file
        elif os.path.exists(fname):
            print '-I- overwriting {}'.format(fname)
            mode = "w"
        # or create new file
        else:
            print '-I- writing {} data to {}'.format(self.dtype, fname)
            mode = "w"
        f = open(fname, mode)
        f.write('tab\t{}\n'.format(self.dtype))
        df.to_csv(f, sep="\t", header=True, index=False)
        f.close()
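One detail worth noting in `add_row` above: assigning to `self.df.loc[label]` directly would overwrite an existing row with the same label, so the method writes to a temporary label (`label + "new"`) first and then renames it back, which yields a genuinely new row even when the label is already in use. A minimal sketch of that idiom outside the class, with an invented table:

import pandas as pd

df = pd.DataFrame({'azimuth': [12.0]}, index=['samp1'])
label, row_data = 'samp1', {'azimuth': 45.0}

# write under a temporary label so the existing 'samp1' row is not overwritten...
df.loc[label + "new"] = pd.Series(row_data)
# ...then rename it back, leaving two rows that share the label 'samp1'
df.rename(index={label + "new": label}, inplace=True)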
Example #11
    def agg_datapoints(self):
        '''
        Regional Aggregation based on the adjacency list built on top of the
        parent_location_id column.

        Data stored in the DataPoint table for a location with the same
        indicator, campaign will always override the aggregated values.

        Here, we create a tuple_dict in which the unique key of (location,
        indicator, campaign) represents the key, and the corresponding value
        is the value.  This way we add the aggregated values to the dict, then
        iterate through the raw values, adding or updating the data in the
        tuple dict before bulk inserting the data.

        The tuple dict looks like:  {(1, 201, 164): 12, (2, 101, 168): .24}
        '''

        agg_dp_batch, tuple_dict = [],{}
        location_tree_columns = ['location_id','parent_location_id','lvl']

        dp_df = DataFrame(list(DataPoint.objects\
            .filter(campaign_id = self.campaign_id)\
            .values_list(*self.dp_columns)),columns=self.dp_columns)

        ## NaN to None
        no_nan_dp_df = dp_df.where((notnull(dp_df)), None)

        ## represents the location hierarchy as a cache from the location table
        location_tree_df = DataFrame(list(LocationTree.objects\
            .filter(location_id__in=list(dp_df['location_id'].unique()))
            .values_list(*location_tree_columns)),columns=location_tree_columns)

        ## join the location tree to the datapoints
        joined_location_df = no_nan_dp_df.merge(location_tree_df)

        ## filter the joined dataframe so that we aggregate at the highest
        ## level for which there is stored data.  If we do not do this, and
        ## we ingest both district and province level data, the national
        ## value will be double what it should be.

        max_location_lvl_for_indicator_df = DataFrame(joined_location_df\
            .groupby(['indicator_id'])['lvl'].min()) # highest lvl per indicator
        max_location_lvl_for_indicator_df.reset_index(level=0, inplace=True)

        ## filter df to keep the data for the highest level per indicator ##
        prepped_for_sum_df = joined_location_df\
            .merge(max_location_lvl_for_indicator_df,on=['indicator_id','lvl'])

        ## group by parent_location_id and take the sum ##
        grouped_df = DataFrame(prepped_for_sum_df\
            .groupby(['parent_location_id', 'indicator_id','campaign_id'])\
            ['value'].sum())

        ## add aggregate values to the tuple dict ##
        for ix, dp in grouped_df.iterrows():
            tuple_dict[ix] = dp.value

        ## now add the raw data to the dict ( overriding the aggregate if it exists )
        for ix, dp in no_nan_dp_df.iterrows():
            ## don't override a null value from the parent if a sum exists for the children
            if dp.value and dp.value != 'NaN' :
                tuple_dict[(dp.location_id, dp.indicator_id, dp.campaign_id)] \
                    = dp.value

        ## now prep the batch for the bulk insert ##
        for dp_unique_key, value in tuple_dict.iteritems():
            dp_dict =  dict(zip(('location_id','indicator_id','campaign_id')\
                ,dp_unique_key))

            dp_dict['value'] = value
            dp_dict['cache_job_id'] = self.cache_job.id

            agg_dp_batch.append(AggDataPoint(**dp_dict))

        AggDataPoint.objects.filter(campaign_id = self.campaign_id).delete()
        AggDataPoint.objects.bulk_create(agg_dp_batch)
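The heart of `agg_datapoints` is the tuple-keyed dict: aggregated sums are written into it first, and the raw rows then overwrite any key they share with an aggregate, so stored data always wins over rolled-up data. A compact sketch of just that merge step, with hand-built frames standing in for the DataPoint and LocationTree queries:

from pandas import DataFrame

keys = ['location_id', 'indicator_id', 'campaign_id']
raw_df = DataFrame([
    {'location_id': 1, 'indicator_id': 201, 'campaign_id': 164, 'value': 12.0},
    {'location_id': 2, 'indicator_id': 201, 'campaign_id': 164, 'value': 3.0},
])
# stand-in for the grouped sums over parent locations
grouped_df = raw_df.groupby(keys)['value'].sum().to_frame()

tuple_dict = {}
# aggregated values go in first...
for ix, dp in grouped_df.iterrows():
    tuple_dict[ix] = dp.value
# ...then raw values override any aggregate sharing the same key
for ix, dp in raw_df.iterrows():
    if dp.value is not None:
        tuple_dict[(dp.location_id, dp.indicator_id, dp.campaign_id)] = dp.value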
Example #12
    def test_align(self):
        af, bf = self.frame.align(self.frame)
        self.assertIsNot(af._data, self.frame._data)

        af, bf = self.frame.align(self.frame, copy=False)
        self.assertIs(af._data, self.frame._data)

        # axis = 0
        other = self.frame.ix[:-5, :3]
        af, bf = self.frame.align(other, axis=0, fill_value=-1)
        self.assertTrue(bf.columns.equals(other.columns))
        # test fill value
        join_idx = self.frame.index.join(other.index)
        diff_a = self.frame.index.difference(join_idx)
        diff_b = other.index.difference(join_idx)
        diff_a_vals = af.reindex(diff_a).values
        diff_b_vals = bf.reindex(diff_b).values
        self.assertTrue((diff_a_vals == -1).all())

        af, bf = self.frame.align(other, join='right', axis=0)
        self.assertTrue(bf.columns.equals(other.columns))
        self.assertTrue(bf.index.equals(other.index))
        self.assertTrue(af.index.equals(other.index))

        # axis = 1
        other = self.frame.ix[:-5, :3].copy()
        af, bf = self.frame.align(other, axis=1)
        self.assertTrue(bf.columns.equals(self.frame.columns))
        self.assertTrue(bf.index.equals(other.index))

        # test fill value
        join_idx = self.frame.index.join(other.index)
        diff_a = self.frame.index.difference(join_idx)
        diff_b = other.index.difference(join_idx)
        diff_a_vals = af.reindex(diff_a).values

        # TODO(wesm): unused?
        diff_b_vals = bf.reindex(diff_b).values  # noqa

        self.assertTrue((diff_a_vals == -1).all())

        af, bf = self.frame.align(other, join='inner', axis=1)
        self.assertTrue(bf.columns.equals(other.columns))

        af, bf = self.frame.align(other, join='inner', axis=1, method='pad')
        self.assertTrue(bf.columns.equals(other.columns))

        # test other non-float types
        af, bf = self.intframe.align(other, join='inner', axis=1, method='pad')
        self.assertTrue(bf.columns.equals(other.columns))

        af, bf = self.mixed_frame.align(self.mixed_frame,
                                        join='inner',
                                        axis=1,
                                        method='pad')
        self.assertTrue(bf.columns.equals(self.mixed_frame.columns))

        af, bf = self.frame.align(other.ix[:, 0],
                                  join='inner',
                                  axis=1,
                                  method=None,
                                  fill_value=None)
        self.assertTrue(bf.index.equals(Index([])))

        af, bf = self.frame.align(other.ix[:, 0],
                                  join='inner',
                                  axis=1,
                                  method=None,
                                  fill_value=0)
        self.assertTrue(bf.index.equals(Index([])))

        # mixed floats/ints
        af, bf = self.mixed_float.align(other.ix[:, 0],
                                        join='inner',
                                        axis=1,
                                        method=None,
                                        fill_value=0)
        self.assertTrue(bf.index.equals(Index([])))

        af, bf = self.mixed_int.align(other.ix[:, 0],
                                      join='inner',
                                      axis=1,
                                      method=None,
                                      fill_value=0)
        self.assertTrue(bf.index.equals(Index([])))

        # try to align dataframe to series along bad axis
        self.assertRaises(ValueError,
                          self.frame.align,
                          af.ix[0, :3],
                          join='inner',
                          axis=2)

        # align dataframe to series with broadcast or not
        idx = self.frame.index
        s = Series(range(len(idx)), index=idx)

        left, right = self.frame.align(s, axis=0)
        tm.assert_index_equal(left.index, self.frame.index)
        tm.assert_index_equal(right.index, self.frame.index)
        self.assertTrue(isinstance(right, Series))

        left, right = self.frame.align(s, broadcast_axis=1)
        tm.assert_index_equal(left.index, self.frame.index)
        expected = {}
        for c in self.frame.columns:
            expected[c] = s
        expected = DataFrame(expected,
                             index=self.frame.index,
                             columns=self.frame.columns)
        assert_frame_equal(right, expected)

        # GH 9558
        df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
        result = df[df['a'] == 2]
        expected = DataFrame([[2, 5]], index=[1], columns=['a', 'b'])
        assert_frame_equal(result, expected)

        result = df.where(df['a'] == 2, 0)
        expected = DataFrame({'a': [0, 2, 0], 'b': [0, 5, 0]})
        assert_frame_equal(result, expected)
Example #13
def ccf(x,
        y,
        lags=365,
        bin_method='rectangle',
        bin_width=0.5,
        max_gap=inf,
        min_obs=100,
        full_output=False,
        alpha=0.05):
    """Method to compute the cross-correlation for irregular time series.

    Parameters
    ----------
    x,y: pandas.Series
        Pandas Series containing the values to calculate the
        cross-correlation for. The index has to be a Pandas.DatetimeIndex
    lags: array_like, optional
        numpy array containing the lags in days for which the
        cross-correlation is calculated. Default [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        12, 13, 14, 30, 61, 90, 120, 150, 180, 210, 240, 270, 300, 330, 365]
    bin_method: str, optional
        method to determine the type of bin. Options are "rectangle" (default),
        "gaussian" and "regular" (for regular timesteps).
    bin_width: float, optional
        number of days used as the width for the bin to calculate the
        correlation. By default these values are chosen based on the
        bin_method and the average time step (dt_mu). That is 0.5 * dt_mu when
        bin_method="rectangle" and 0.25 * dt_mu when bin_method="gaussian".
    max_gap: float, optional
        Maximum timestep gap in the data. All timesteps above this gap value
        are not used for calculating the average timestep. This can be
        helpful when there is a large gap in the data that influences the
        average timestep.
    min_obs: int, optional
        Minimum number of observations in a bin to determine the correlation.
    full_output: bool, optional
        If True, estimated uncertainties are also returned. Default is False.
    alpha: float
        alpha level to compute the confidence interval (e.g., 1-alpha).

    Returns
    -------
    c: pandas.Series or pandas.DataFrame
        The Cross-correlation function.

    References
    ----------
    Rehfeld, K., Marwan, N., Heitzig, J., Kurths, J. (2011). Comparison
    of correlation analysis techniques for irregularly sampled time series.
    Nonlinear Processes in Geophysics. 18. 389-404. 10.5194/npg-18-389-2011.

    Tip
    ---
    This method will be significantly faster when Numba is installed. Check
    out the [Numba project here](https://numba.pydata.org)

    Examples
    --------
    >>> ccf = ps.stats.ccf(x, y, bin_method="gaussian")

    """
    # prepare the time indices for x and y
    if x.index.inferred_freq and y.index.inferred_freq:
        bin_method = "regular"
    elif bin_method == "regular":
        raise Warning("time series does not have regular time steps, "
                      "choose different bin_method")

    x, t_x, dt_x_mu = _preprocess(x, max_gap=max_gap)
    y, t_y, dt_y_mu = _preprocess(y, max_gap=max_gap)
    dt_mu = max(dt_x_mu, dt_y_mu)  # Mean time step from both series

    if isinstance(lags, int) and bin_method == "regular":
        lags = arange(int(dt_mu), lags + 1, int(dt_mu), dtype=float)
    elif isinstance(lags, int):
        lags = arange(1.0, lags + 1, dtype=float)
    elif isinstance(lags, list):
        lags = array(lags, dtype=float)

    if bin_method == "rectangle":
        if bin_width is None:
            bin_width = 0.5 * dt_mu
        c, b = _compute_ccf_rectangle(lags, t_x, x, t_y, y, bin_width)
    elif bin_method == "gaussian":
        if bin_width is None:
            bin_width = 0.25 * dt_mu
        c, b = _compute_ccf_gaussian(lags, t_x, x, t_y, y, bin_width)
    elif bin_method == "regular":
        c, b = _compute_ccf_regular(arange(1.0, len(lags) + 1), x, y)
    else:
        raise NotImplementedError

    std = norm.ppf(1 - alpha / 2.) / sqrt(b)
    result = DataFrame(data={
        "ccf": c,
        "stderr": std,
        "n": b
    },
                       index=TimedeltaIndex(lags, unit="D", name="Lags"))

    result = result.where(result.n > min_obs).dropna()

    if full_output:
        return result
    else:
        return result.ccf
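The `result.where(result.n > min_obs).dropna()` line near the end is a small but important filter: rows whose bin count n does not exceed min_obs fail the condition, become NaN, and are then dropped, so lags with too few observation pairs never reach the caller. A tiny standalone sketch of that filter, with invented numbers and the same column names as the result frame above:

from pandas import DataFrame

result = DataFrame({
    "ccf": [0.8, 0.4, 0.1],
    "stderr": [0.05, 0.06, 0.20],
    "n": [250, 180, 40],
})
min_obs = 100

# the boolean Series blanks out under-populated rows, dropna() removes them
result = result.where(result.n > min_obs).dropna()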
Example #14
 def setup(self):
     data = np.random.randn(1000, 500)
     df = DataFrame(data)
     df = df.where(df > 0)
     self.bools = df > 0
     self.mask = isnull(df)
Example #15
def pull_data(lat, lng, state, timestamp, freq, variables, forecastio_key = '80697c3428e4e6523286833268b5530c'):


    url  = "https://api.forecast.io/forecast/%s/%s,%s,%s?%s"%(forecastio_key,lat,lng,timestamp,'solar')
    print url
    r = requests.get(url)

    # set timezone... if error thrown, load default file
    print r.json()
    try:
        tz= r.json()['timezone']
    except KeyError:
        tz=json.loads(open(timezones).read())[state]

    now = Timestamp(datetime.now(),tz='UTC').tz_convert(tz).to_datetime()
    temp = DataFrame(index=[],columns=[])
    # get forecast every hour
    print r.json()
    for key in r.json()['hourly']['data']:
        # print Timestamp(datetime.fromtimestamp(key['time']),tz='UTC').tz_convert(tz).to_datetime()
        try:
            key = dict(key['solar'].items() + key.items())
            del key['solar']
        except KeyError: pass
        temp= concat([temp,DataFrame(key, index=[0]).head()]) #is head needed?
        
    temp=temp.rename(columns={'time':'Timestamp'})
    temp['Timestamp'] = [Timestamp(datetime.fromtimestamp(i), tz='UTC').tz_convert(tz).to_datetime() for i in temp['Timestamp']]

    temp['State'] = state
    if timestamp =='': #change this to compare now to date
        #hour ahead forecasts
        temp['Offset'] = [floor(convert_hours(i-now))+1 for i in temp['Timestamp']]
    else:
        #actual data
        temp['Offset'] = 0
        
    temp['Downloaded'] = ts_to_dt(now)
    temp['Timestamp'] = [ts_to_dt(i) for i in temp['Timestamp']]
    temp = temp.sort('Timestamp')
    
    if timestamp =='': #change this to compare now to date
        temp = temp[temp['Offset']>0]

    index = date_range(start = min(temp['Timestamp']), end = max(temp['Timestamp']), freq=freq)
    temp = temp.set_index('Timestamp')

    #interpolation?
    keep = DataFrame(index=[],columns=[])
    for col in variables:
        series = interpolate(temp[col], index)
        keep = merge(keep, series, how='outer', left_index=True, right_index=True)
        #keep[col] = [float(i) for i in keep[col]]
        
    # forward fill the offset and non-numerical columns
    for col in ['summary','Offset','State','Downloaded','icon']:
        series = DataFrame(temp[col].reindex(set(temp.index).union(index)).sort_index().ffill().ix[index])
        keep = merge(keep, series, how='outer',left_index=True, right_index=True)
        
    keep = keep.reset_index().rename(columns={'index':'Timestamp'})

    keep = keep.where((notnull(keep)), None)

    return keep
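The forward-fill loop in `pull_data` aligns irregular forecast timestamps onto the regular date_range built a few lines earlier: each column is reindexed onto the union of the original and target timestamps, sorted, forward-filled, and then sliced at the target index. A cut-down sketch of that step for a single column, with invented timestamps (it uses .loc where the original uses the older .ix):

from pandas import DataFrame, date_range, to_datetime

temp = DataFrame(
    {'summary': ['Clear', 'Cloudy']},
    index=to_datetime(['2015-06-01 00:17', '2015-06-01 02:41']),
)
index = date_range(start='2015-06-01 00:00', periods=4, freq='h')

col = 'summary'
# reindex onto the union of both indexes, forward-fill, then keep only the regular timestamps
series = temp[col].reindex(temp.index.union(index)).sort_index().ffill().loc[index]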
Example #16
    def test_align_float(self, float_frame):
        af, bf = float_frame.align(float_frame)
        assert af._mgr is not float_frame._mgr

        af, bf = float_frame.align(float_frame, copy=False)
        assert af._mgr is float_frame._mgr

        # axis = 0
        other = float_frame.iloc[:-5, :3]
        af, bf = float_frame.align(other, axis=0, fill_value=-1)

        tm.assert_index_equal(bf.columns, other.columns)

        # test fill value
        join_idx = float_frame.index.join(other.index)
        diff_a = float_frame.index.difference(join_idx)
        diff_b = other.index.difference(join_idx)
        diff_a_vals = af.reindex(diff_a).values
        diff_b_vals = bf.reindex(diff_b).values
        assert (diff_a_vals == -1).all()

        af, bf = float_frame.align(other, join="right", axis=0)
        tm.assert_index_equal(bf.columns, other.columns)
        tm.assert_index_equal(bf.index, other.index)
        tm.assert_index_equal(af.index, other.index)

        # axis = 1
        other = float_frame.iloc[:-5, :3].copy()
        af, bf = float_frame.align(other, axis=1)
        tm.assert_index_equal(bf.columns, float_frame.columns)
        tm.assert_index_equal(bf.index, other.index)

        # test fill value
        join_idx = float_frame.index.join(other.index)
        diff_a = float_frame.index.difference(join_idx)
        diff_b = other.index.difference(join_idx)
        diff_a_vals = af.reindex(diff_a).values

        # TODO(wesm): unused?
        diff_b_vals = bf.reindex(diff_b).values  # noqa

        assert (diff_a_vals == -1).all()

        af, bf = float_frame.align(other, join="inner", axis=1)
        tm.assert_index_equal(bf.columns, other.columns)

        af, bf = float_frame.align(other, join="inner", axis=1, method="pad")
        tm.assert_index_equal(bf.columns, other.columns)

        af, bf = float_frame.align(other.iloc[:, 0],
                                   join="inner",
                                   axis=1,
                                   method=None,
                                   fill_value=None)
        tm.assert_index_equal(bf.index, Index([]))

        af, bf = float_frame.align(other.iloc[:, 0],
                                   join="inner",
                                   axis=1,
                                   method=None,
                                   fill_value=0)
        tm.assert_index_equal(bf.index, Index([]))

        # Try to align DataFrame to Series along bad axis
        msg = "No axis named 2 for object type DataFrame"
        with pytest.raises(ValueError, match=msg):
            float_frame.align(af.iloc[0, :3], join="inner", axis=2)

        # align dataframe to series with broadcast or not
        idx = float_frame.index
        s = Series(range(len(idx)), index=idx)

        left, right = float_frame.align(s, axis=0)
        tm.assert_index_equal(left.index, float_frame.index)
        tm.assert_index_equal(right.index, float_frame.index)
        assert isinstance(right, Series)

        left, right = float_frame.align(s, broadcast_axis=1)
        tm.assert_index_equal(left.index, float_frame.index)
        expected = {c: s for c in float_frame.columns}
        expected = DataFrame(expected,
                             index=float_frame.index,
                             columns=float_frame.columns)
        tm.assert_frame_equal(right, expected)

        # see gh-9558
        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        result = df[df["a"] == 2]
        expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"])
        tm.assert_frame_equal(result, expected)

        result = df.where(df["a"] == 2, 0)
        expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]})
        tm.assert_frame_equal(result, expected)