Example #1
    def setUp(self):
        super(TestNDFrame, self).setUp()

        data = {
            'A': [0., 1., 2., 3., np.nan],
            'B': [0, 1, 0, 1, 0],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': date_range('1/1/2009', periods=5),
            'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
        }

        self.frame = {
            'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)),
            'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)),
            'mixed':
            DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']]))
        }

        self.panel = {
            'float':
            Panel(
                dict(ItemA=self.frame['float'], ItemB=self.frame['float'] + 1))
        }
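For orientation, a minimal sketch (assuming pandas < 0.25, where Panel still exists) of how a Panel built from a dict of DataFrames, as in the setUp above, is laid out:

from pandas import DataFrame, Panel  # Panel was removed in pandas 0.25

frame = DataFrame({'A': [0., 1., 2.], 'B': [1., 2., 3.]})
panel = Panel({'ItemA': frame, 'ItemB': frame + 1})
# items come from the dict keys, major_axis from the frames' index,
# minor_axis from their columns
print(panel.items)       # Index(['ItemA', 'ItemB'], dtype='object')
print(panel.major_axis)  # Int64Index([0, 1, 2], ...)
print(panel.minor_axis)  # Index(['A', 'B'], dtype='object')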
Example #2
    def setUp(self):
        super(TestNDFrame, self).setUp()

        data = {
            'A': [0., 1., 2., 3., np.nan],
            'B': [0, 1, 0, 1, 0],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': date_range('1/1/2009', periods=5),
            'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
            'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
            'G': [Timestamp('20130603', tz='CET')] * 5,
            'H': Categorical(['a', 'b', 'c', 'd', 'e']),
            'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
        }

        self.frame = {
            'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)),
            'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)),
            'mixed': DataFrame(data)}

        self.panel = {
            'float': Panel(dict(ItemA=self.frame['float'],
                                ItemB=self.frame['float'] + 1))}
Example #3
File: __init__.py  Project: whigg/satkml
def loopsat(tlefn, dates, obs):
    cols = ['az', 'el', 'lat', 'lon', 'alt', 'srange']
    sats, satnum = loadTLE(tlefn)

    data = Panel(items=dates, major_axis=satnum, minor_axis=cols)
    for d in dates:
        obs.date = d

        df = DataFrame(index=satnum, columns=cols)
        for i, s in enumerate(sats):
            # don't compute lat/lon/alt with obs! will give a wrong answer
            s.compute()
            df.loc[satnum[i], ['lat', 'lon', 'alt']] = (degrees(s.sublat),
                                                        degrees(s.sublong),
                                                        s.elevation)
            s.compute(obs)
            df.loc[satnum[i], ['az', 'el', 'srange']] = (degrees(s.az),
                                                         degrees(s.alt),
                                                         s.range)

        # mask satellites below the horizon (.at/.ix replaced with .loc)
        df.loc[df['el'] < 0, ['az', 'el', 'srange']] = nan

        data[d] = df

    return data
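loopsat pre-allocates an all-NaN Panel keyed by date and fills one item per epoch; a reduced, runnable sketch of that pattern (old pandas with Panel assumed; the dates and satellite IDs here are illustrative):

import numpy as np
from pandas import Panel, DataFrame, date_range

dates = date_range('2015-06-01', periods=2, freq='H')
satnum = [101, 102, 103]
cols = ['az', 'el']
data = Panel(items=dates, major_axis=satnum, minor_axis=cols)  # all NaN
df = DataFrame(np.ones((3, 2)), index=satnum, columns=cols)
data[dates[0]] = df          # fill one epoch, as loopsat does per date
print(data[dates[0]])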
Example #4
def makeBlocks(rinex, ntypes, maxsv, svnames, obstypes, obstimes):
    """
    inputs:
    rinex: file stream
    ntypes: number of observation types
    obstimes: datetime() of each observation
    obstypes: type of measurement, e.g. P1, P2, ...
    maxsv: maximum number of SVs the receiver saw in this file (i.e. across the entire obs. time)

    outputs:
    blocks: dimensions timeINTERVALs x maxsv x ntypes (page x row x col)
    """
    blocks = Panel(items=obstimes, major_axis=svnames, minor_axis=obstypes)

    # loop obstimes.size times: maxtimes was specified; otherwise we'd read to end of file
    for i in range(obstimes.size):
        sathead = rinex.readline()
        if not sathead: break  #EOF
        svnum = int(sathead[29:32])

        obslinespersat = int(np.ceil(ntypes / 5))
        blockrows = svnum * obslinespersat

        satnames = sathead[32:68]
        for _ in range(int(np.ceil(svnum / 12)) - 1):
            line = rinex.readline()
            sathead += line
            satnames += line[32:68]  #FIXME is this right end?
        blocksvnames = satnumfixer(grouper(satnames, 3, svnum))
        #%% read this INTERVAL's text block
        block = ''.join(rinex.readline() for _ in range(blockrows))
        btime = _obstime(sathead[:26].split())
        bdf = _block2df(block, svnum, obstypes, blocksvnames)
        blocks.loc[btime, blocksvnames] = bdf

    return blocks
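The key step is blocks.loc[btime, blocksvnames] = bdf, which writes one epoch's satellite rows into the pre-allocated Panel; a toy version of that assignment (old pandas; the SV names and observation types here are made up):

import numpy as np
from pandas import Panel, DataFrame, date_range

obstimes = date_range('2014-01-01', periods=2, freq='30s')
svnames = ['G01', 'G02', 'G03']
obstypes = ['P1', 'P2']
blocks = Panel(items=obstimes, major_axis=svnames, minor_axis=obstypes)
bdf = DataFrame([[1., 2.], [3., 4.]], index=['G01', 'G03'], columns=obstypes)
blocks.loc[obstimes[0], ['G01', 'G03']] = bdf  # only the SVs seen this epoch
print(blocks[obstimes[0]])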
Example #5
def _dl_mult_symbols(symbols, start, end, interval, chunksize, retry_count, pause,
                     method):
    stocks = {}
    failed = []
    for sym_group in _in_chunks(symbols, chunksize):
        for sym in sym_group:
            try:
                stocks[sym] = method(sym, start, end, interval, retry_count, pause)
            except IOError:
                warnings.warn('Failed to read symbol: {0!r}, replacing with '
                              'NaN.'.format(sym), SymbolWarning)
                failed.append(sym)

    try:
        if len(stocks) > 0 and len(failed) > 0:
            df_na = list(stocks.values())[0].copy()  # list() for Python 3 compatibility
            df_na[:] = np.nan
            for sym in failed:
                stocks[sym] = df_na
        return Panel(stocks).swapaxes('items', 'minor')
    except AttributeError:
        # cannot construct a panel with just 1D nans indicating no data
        raise RemoteDataError("No data fetched using "
                              "{0!r}".format(method.__name__))
Example #6
 def downloadData(self,symbols='all'):
     ''' get data from yahoo  '''
     
     if symbols == 'all':
         symbols = self.symbols
     
     #store = HDFStore(self.dataFile)        
     p = ProgressBar(len(symbols))
     
     for idx,symbol in enumerate(symbols):
         
         try:            
             df = getSymbolData(symbol,sDate=self.startDate,verbose=False)
             if self.autoAdjust:
                 df =  _adjust(df,removeOrig=True)
             
             if len(self.symbols)==0:
                 self.wp = Panel({symbol:df})
             else:
                 self.wp[symbol] = df
         
         except Exception as e:
             print(e)
         p.animate(idx+1)
Example #7
    def test_take(self):
        indices = [1, 5, -2, 6, 3, -1]
        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
                  tm.makeObjectSeries()]:
            out = s.take(indices)
            expected = Series(data=s.values.take(indices),
                              index=s.index.take(indices), dtype=s.dtype)
            tm.assert_series_equal(out, expected)
        for df in [tm.makeTimeDataFrame()]:
            out = df.take(indices)
            expected = DataFrame(data=df.values.take(indices, axis=0),
                                 index=df.index.take(indices),
                                 columns=df.columns)
            tm.assert_frame_equal(out, expected)

        indices = [-3, 2, 0, 1]
        with catch_warnings(record=True):
            for p in [tm.makePanel()]:
                out = p.take(indices)
                expected = Panel(data=p.values.take(indices, axis=0),
                                 items=p.items.take(indices),
                                 major_axis=p.major_axis,
                                 minor_axis=p.minor_axis)
                tm.assert_panel_equal(out, expected)
Example #8
File: git.py  Project: ypuzikov/vbench
def get_code_churn(commits):
    shas = commits.index[::-1]

    prev = shas[0]

    insertions = {}
    deletions = {}

    for cur in shas[1:]:
        i, d = get_commit_churn(cur, prev)

        insertions[cur] = i
        deletions[cur] = d

        prev = cur

    return Panel({'insertions': DataFrame(insertions),
                  'deletions': DataFrame(deletions)}, minor_axis=shas)
Example #9
    def test_resample_panel(self):
        rng = date_range('1/1/2000', '6/30/2000')
        n = len(rng)

        panel = Panel(np.random.randn(3, n, 5),
                      items=['one', 'two', 'three'],
                      major_axis=rng,
                      minor_axis=['a', 'b', 'c', 'd', 'e'])

        result = panel.resample('M', axis=1)

        def p_apply(panel, f):
            result = {}
            for item in panel.items:
                result[item] = f(panel[item])
            return Panel(result, items=panel.items)

        expected = p_apply(panel, lambda x: x.resample('M'))
        tm.assert_panel_equal(result, expected)

        panel2 = panel.swapaxes(1, 2)
        result = panel2.resample('M', axis=2)
        expected = p_apply(panel2, lambda x: x.resample('M', axis=1))
        tm.assert_panel_equal(result, expected)
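For reference, p_apply above is just the per-item equivalent of the Panel-level resample; a standalone sketch of that (pre-0.18 pandas, where .resample('M') still returned the mean-resampled object directly):

import numpy as np
from pandas import Panel, date_range

rng = date_range('1/1/2000', '6/30/2000')
panel = Panel(np.random.randn(2, len(rng), 3),
              items=['one', 'two'],
              major_axis=rng,
              minor_axis=['a', 'b', 'c'])
# resampling along axis=1 (the major_axis) item by item:
monthly = {item: panel[item].resample('M') for item in panel.items}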
Example #10
def getHistoricData(symbols, **options):
    ''' 
    get data from Yahoo finance and return pandas dataframe
    Will get an OHLCV data frame if a single symbol is provided. 
    If many symbols are provided, it will return a wide panel.
    
    Parameters
    ------------
    symbols : str or list  
        Yahoo finance symbol or a list of symbols
    sDate : tuple  (optional)
        start date (y,m,d)
    eDate : tuple  (optional)
        end date (y,m,d) 
    adjust : bool
        T/[F] adjust data based on adj_close
    
    Returns
    ---------
    Panel
    
    '''
    
    assert isinstance(symbols,(list,str)), 'Input must be a string symbol or a list of symbols'
    
    if isinstance(symbols,str):
        return getSymbolData(symbols,**options)
    else:
        data = {}
        print('Downloading data:')
        p = ProgressBar(len(symbols))
        for idx,symbol in enumerate(symbols):
            p.animate(idx+1)
            data[symbol] = getSymbolData(symbol,verbose=False,**options)
        
        return Panel(data)
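A hedged usage sketch (keyword names follow the docstring; the tickers are illustrative, and the Yahoo endpoint this code targeted no longer exists):

df = getHistoricData('SPY')                                # single symbol -> DataFrame
wp = getHistoricData(['SPY', 'AAPL'], sDate=(2015, 1, 1))  # list -> Panel
aapl = wp['AAPL']                                          # one item back out as a DataFrame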
Example #11
    def test_partial_setting(self):

        # GH2578, allow ix and friends to partially set

        # series
        s_orig = Series([1, 2, 3])

        s = s_orig.copy()
        s[5] = 5
        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s.loc[5] = 5
        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s[5] = 5.
        expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s.loc[5] = 5.
        expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        # iloc/iat raise
        s = s_orig.copy()

        def f():
            s.iloc[3] = 5.

        pytest.raises(IndexError, f)

        def f():
            s.iat[3] = 5.

        pytest.raises(IndexError, f)

        # ## frame ##

        df_orig = DataFrame(np.arange(6).reshape(3, 2),
                            columns=['A', 'B'],
                            dtype='int64')

        # iloc/iat raise
        df = df_orig.copy()

        def f():
            df.iloc[4, 2] = 5.

        pytest.raises(IndexError, f)

        def f():
            df.iat[4, 2] = 5.

        pytest.raises(IndexError, f)

        # row setting where it exists
        expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
        df = df_orig.copy()
        df.iloc[1] = df.iloc[2]
        tm.assert_frame_equal(df, expected)

        expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
        df = df_orig.copy()
        df.loc[1] = df.loc[2]
        tm.assert_frame_equal(df, expected)

        # like 2578, partial setting with dtype preservation
        expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]}))
        df = df_orig.copy()
        df.loc[3] = df.loc[2]
        tm.assert_frame_equal(df, expected)

        # single dtype frame, overwrite
        expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]}))
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'B'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # mixed dtype frame, overwrite
        expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])}))
        df = df_orig.copy()
        df['B'] = df['B'].astype(np.float64)
        with catch_warnings(record=True):
            df.ix[:, 'B'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # single dtype frame, partial setting
        expected = df_orig.copy()
        expected['C'] = df['A']
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'C'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # mixed frame, partial setting
        expected = df_orig.copy()
        expected['C'] = df['A']
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'C'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        with catch_warnings(record=True):
            # ## panel ##
            p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                           items=['Item1', 'Item2'],
                           major_axis=pd.date_range('2001/1/12', periods=4),
                           minor_axis=['A', 'B'],
                           dtype='float64')

            # panel setting via item
            p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                           items=['Item1', 'Item2'],
                           major_axis=pd.date_range('2001/1/12', periods=4),
                           minor_axis=['A', 'B'],
                           dtype='float64')
            expected = p_orig.copy()
            expected['Item3'] = expected['Item1']
            p = p_orig.copy()
            p.loc['Item3'] = p['Item1']
            tm.assert_panel_equal(p, expected)

            # panel with aligned series
            expected = p_orig.copy()
            expected = expected.transpose(2, 1, 0)
            expected['C'] = DataFrame(
                {
                    'Item1': [30, 30, 30, 30],
                    'Item2': [32, 32, 32, 32]
                },
                index=p_orig.major_axis)
            expected = expected.transpose(2, 1, 0)
            p = p_orig.copy()
            p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items)
            tm.assert_panel_equal(p, expected)

        # GH 8473
        dates = date_range('1/1/2000', periods=8)
        df_orig = DataFrame(np.random.randn(8, 4),
                            index=dates,
                            columns=['A', 'B', 'C', 'D'])

        expected = pd.concat(
            [df_orig, DataFrame({'A': 7}, index=[dates[-1] + 1])])
        df = df_orig.copy()
        df.loc[dates[-1] + 1, 'A'] = 7
        tm.assert_frame_equal(df, expected)
        df = df_orig.copy()
        df.at[dates[-1] + 1, 'A'] = 7
        tm.assert_frame_equal(df, expected)

        exp_other = DataFrame({0: 7}, index=[dates[-1] + 1])
        expected = pd.concat([df_orig, exp_other], axis=1)

        df = df_orig.copy()
        df.loc[dates[-1] + 1, 0] = 7
        tm.assert_frame_equal(df, expected)
        df = df_orig.copy()
        df.at[dates[-1] + 1, 0] = 7
        tm.assert_frame_equal(df, expected)
Example #12
 def make_source(self):
     return Panel(self.raw_data).tz_localize('UTC', axis=1)
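tz_localize with axis=1 acts on the Panel's major_axis (the dates); a minimal sketch (old pandas; the raw data here is made up):

from pandas import Panel, DataFrame, date_range

idx = date_range('2014-01-01', periods=3)        # tz-naive timestamps
raw_data = {'AAA': DataFrame({'price': [1., 2., 3.]}, index=idx)}
source = Panel(raw_data).tz_localize('UTC', axis=1)
print(source.major_axis.tz)                      # UTC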
Example #13
def create_data():
    """ create the pickle data """

    from distutils.version import LooseVersion
    import numpy as np
    import pandas
    from pandas import (Series,TimeSeries,DataFrame,Panel,
                        SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel,
                        Index,MultiIndex,PeriodIndex,
                        date_range,period_range,bdate_range,Timestamp,Categorical)
    nan = np.nan

    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E' : [0., 1, Timestamp('20100101'),'foo',2.],
        }

    index = dict(int = Index(np.arange(10)),
                 date = date_range('20130101',periods=10),
                 period = period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2 = MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                                                      ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
                                                 names=['first', 'second']))
    series = dict(float = Series(data['A']),
                  int = Series(data['B']),
                  mixed = Series(data['E']),
                  ts = TimeSeries(np.arange(10).astype(np.int64),index=date_range('20130101',periods=10)),
                  mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2],
                                                                                                    [3,4,3,4,5]])),
                                                                                           names=['one','two'])),
                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])))

    frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
                 int = DataFrame(dict(A = series['int']  , B = series['int']   + 1)),
                 mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])),
                 mi = DataFrame(dict(A = np.arange(5).astype(np.float64), B = np.arange(5).astype(np.int64)),
                                index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'],
                                                                       ['one','two','one','two','three']])),
                                                             names=['first','second'])),
                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                               columns=['A', 'B', 'A']),
                 cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
                 cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']),
                                              B=np.arange(3).astype(np.int64))),
    )
    panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
                 dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                             items=['A', 'B', 'A']))

    if LooseVersion(pandas.__version__) >= '0.14.1':
        # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and
        # panels if their columns/items were non-unique.
        mixed_dup_df = DataFrame(data)
        mixed_dup_df.columns = list("ABCDA")

        mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int']))
        mixed_dup_panel.items = ['ItemA', 'ItemA']

        frame['mixed_dup'] = mixed_dup_df
        panel['mixed_dup'] = mixed_dup_panel

    return dict( series = series,
                 frame = frame,
                 panel = panel,
                 index = index,
                 mi = mi,
                 sp_series = dict(float = _create_sp_series(),
                                  ts = _create_sp_tsseries()),
                 sp_frame = dict(float = _create_sp_frame())
                 )
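One detail worth noting: dict keys are unique, so the duplicate item labels for mixed_dup_panel have to be assigned after construction. A minimal sketch of that trick (old pandas):

from pandas import Panel, DataFrame

f = DataFrame({'A': [1., 2.]})
p = Panel(dict(ItemA=f, ItemB=f + 1))
p.items = ['ItemA', 'ItemA']   # duplicates are impossible as dict keys
print(p.items)                 # Index(['ItemA', 'ItemA'], dtype='object')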
Example #14
def WQXtoPandas(
        xmlLocation,
        charDict,
        outputPath='.',
        fromFile=False,
        outputDirName='Processed-Sites',
        RUN_PHREEQC=False,
        PHREEQC_PATH='/home/mcoving/phreeqc-2.18.0/bin/',
        DATABASE_FILE='/home/mcoving/phreeqc-2.18.0/database/phreeqc.dat',
        LOG_FILE='Result.log',
        START_FILE=None,
        splittag='',
        bracket_charge_balance=False):
    """
    Processes a WQX xml data file and loads data for each site in the WQX file into Pandas data objects that are stored in directories for each site.
    
    Parameters
    ----------
    xmlLocation : string
       Content depends on mode in which WQXtoPandas is run. When fromFile is set to False (input methods 2 or 3 in excel file) this string contains the html for a query to the USGS NWIS database to obtain an xml file of the desired data.  Alternatively, if fromFile is True (input method 1 in excel file) then this string contains the name of the xml file from which to read the data.

    charDict : dict
       A dictionary containing information about the characteristics to be processed.  Keys are EPA SRS characteristic names. Each entry in the dictionary is a second dictionary that contains keys IsRequired, pcode, fraction, and quality. These entries tell WQXtoPandas whether a given characteristic is required in order to process a sample, and whether a specific pcode, fraction, or quality should be required.  See excel example file for more details.

    outputPath : string
       path to directory that will contain output directory

    fromFile : boolean
       True if data will be read from an xml file already present on computer.  False if xml file should be queried from NWIS. (Default=False)

    outputDirName : string
       Name of output directory where all site data will be written out. (Default='Processed-Sites')

    RUN_PHREEQC : boolean
       Set to true if samples should be processed through PHREEQC. (Default=False)
    PHREEQC_PATH : string
       Path to PHREEQC executable (folder only, not executable file name)

    DATABASE_FILE : string
       Path to database file that PHREEQC should use, including database file name.
    LOG_FILE : string
       Name of log file that WQXtoPandas will create. (Default='Result.log')

    START_FILE : string
       Name of xls start file that was used to run this instance of WQXtoPandas. Name will be written out in log file.

    bracket_charge_balance : bool
       If set to true, WQXtoPandas will alternately force charge balance on calcium and on alkalinity. While the latter is not physically meaningful, this provides a useful estimate of uncertainty for cases with high charge-balance errors.  This is most useful for water that is very dilute or has high organic content, such that titrated alkalinity values are artificially high.

    Returns
    -------

    Returns 0 if execution is successful, or -1 in case of error.

    Notes
    -----

    Designed to be run through convenience function runWQXtoPandas().
    """
    try:
        #Check to see if output directory exists
        absOutputDirPath = os.path.abspath(outputPath)
        sitesdir = os.path.join(absOutputDirPath, outputDirName)
        print "sitesdir", sitesdir
        if not (os.path.exists(sitesdir)):
            try:
                os.makedirs(sitesdir)
            except os.error:
                print(
                    "Problem creating output directory. Check output path name: "
                    + outputPath)
                return -1
        #create xml tree
        if fromFile:
            #read from file
            wqxtree = etree.ElementTree(file=xmlLocation)
        else:
            #check whether we already have a matching xml file
            xmlSaveFile = LOG_FILE + splittag + '.xml'
            if (os.path.isfile(xmlSaveFile)):
                goodAnswer = False
                while not (goodAnswer):
                    answer = raw_input(
                        "An xml file (" + xmlSaveFile +
                        ") already exists.  \n Use this instead of html query (y or n)?"
                    )
                    if (answer.startswith('y')):
                        #read from file
                        wqxtree = etree.ElementTree(file=xmlSaveFile)
                        goodAnswer = True
                        queryXML = False
                    elif (answer.startswith('n')):
                        goodAnswer = True
                        queryXML = True
            else:
                queryXML = True
            #If we don't have a matching xml file, or we want to obtain a new one, then get the new xml
            if (queryXML):
                print "Obtaining xml file from USGS NWIS using html query..."
                #parse from html query
                r = requests.get(xmlLocation)
                #write to xml file
                try:
                    #write xml to file
                    xmlFile = open(xmlSaveFile, 'w')
                    print >> xmlFile, r.text
                    xmlFile.close()
                    wqxtree = etree.ElementTree(file=xmlSaveFile)
                except IOError:
                    print("Problem writing to xml file to store html query: " +
                          xmlSaveFile)
                    return -1
        #begin parsing XML tree
        root = wqxtree.getroot()
        #get namespace map
        NSMAP = root.nsmap
        WQX = "{%s}" % NSMAP[None]
        #iterate over all <Activity> tags within file and process each sample
        samples_processed = []
        samples_not_processed = []
        sitesDict = {}
        sitesMetaDict = {}
        for activity in wqxtree.getiterator(tag=WQX + "Activity"):
            processThisSample = True
            reason = ''
            description = activity.find(WQX + "ActivityDescription")
            if (description != None):
                datetext = description.findtext(WQX + "ActivityStartDate")
                starttime = description.find(WQX + "ActivityStartTime")
                if (starttime != None):
                    timetext = starttime.findtext(WQX + "Time")
                    timezone = starttime.findtext(WQX + "TimeZoneCode")
                else:
                    timetext = ''
                    timezone = ''
                location = description.findtext(WQX +
                                                "MonitoringLocationIdentifier")
                descriptionDict = {
                    'location': location,
                    'date': datetext,
                    'time': timetext,
                    'timezone': timezone
                }
            else:
                descriptionDict = None
                processThisSample = False
                reason = 'No description'
            if processThisSample:
                # location and datetext are only defined when a description exists
                print('Processing sample from ' + location + ' on ' + datetext)
            #create null sample dict
            sampleDict = {}
            sampleMetaDict = {}
            #iterate though all results for this activity
            for result in activity.getiterator(tag=WQX + 'Result'):
                if (processThisSample):
                    try:
                        resultdesc = result.find(WQX + "ResultDescription")
                        characteristic = resultdesc.findtext(
                            WQX + "CharacteristicName")
                        if (characteristic in charDict):
                            samplefraction = resultdesc.findtext(
                                WQX + "ResultSampleFractionText")
                            pcode = resultdesc.findtext(WQX + "USGSPCode")
                            quality = resultdesc.findtext(
                                WQX + "ResultStatusIdentifier")
                            measure = resultdesc.find(WQX + "ResultMeasure")
                            count = 1.0
                            if not (measure == None):
                                value = measure.findtext(WQX +
                                                         "ResultMeasureValue")
                                units = measure.findtext(WQX +
                                                         "MeasureUnitCode")
                                #split pcode into list
                                tempPcodeList = charDict[characteristic][
                                    'pcode'].split(';')
                                #                            print("tempPcodeList="+str(tempPcodeList))
                                pcodeDict = {}
                                for codePriority, code in enumerate(
                                        tempPcodeList):
                                    code = code.strip()
                                    if code != '':
                                        pcodeDict[code] = codePriority
                                #Check whether characteristic meets criteria
                                #for inclusion, otherwise don't add to sampleDict
                                addCharacteristic = True
                                if (charDict[characteristic]['fraction'] !=
                                        '0'):
                                    #test for correct fraction
                                    if (charDict[characteristic]['fraction'] !=
                                            samplefraction):
                                        addCharacteristic = False
                                if (addCharacteristic):
                                    if (charDict[characteristic]['pcode'] !=
                                            '0'):
                                        #test for correct pcode
                                        #                                        print("pcode = "+pcode)
                                        #                                        print("pcodeList = "+str(pcodeList))
                                        #                                        print("pcode in list="+str(pcode in pcodeList))
                                        if not (pcode in pcodeDict):
                                            addCharacteristic = False
                                if (addCharacteristic):
                                    if (charDict[characteristic]['quality'] !=
                                            '0'):
                                        #test for correct data quality
                                        if (charDict[characteristic]['quality']
                                                != quality):
                                            addCharacteristic = False
                                #end of characteristic criteria check
                                #Process duplicate characteristics
                                if (addCharacteristic):
                                    if (characteristic in sampleDict):
                                        priorPcode = sampleMetaDict[
                                            characteristic]['pcode']
                                        #if there are already multiple pcodes get only first one
                                        priorPcode = priorPcode.split(';')[0]
                                        averageValue = False
                                        if (len(pcodeDict) > 1):
                                            thisPcodePriority = pcodeDict[
                                                pcode]
                                            priorPcodePriority = \
                                                pcodeDict[priorPcode]
                                            if (thisPcodePriority >\
                                                    priorPcodePriority):
                                                #previous characteristic remains
                                                addCharacteristic = False
                                            elif (thisPcodePriority ==\
                                                  priorPcodePriority):
                                                averageValue = True
                                        else:
                                            averageValue = True
                                        if averageValue:
                                            #average this value with existing values
                                            count = \
                                                sampleMetaDict[characteristic]['count']
                                            count += 1.
                                            oldvalue = float(\
                                                sampleDict[characteristic])
                                            newvalue = (oldvalue * (count - 1.)\
                                                            + float(value))/count
                                            value = str(newvalue)
                                            pcode = priorPcode + '; ' + pcode
                                            priorUnits = \
                                                sampleMetaDict[characteristic]['units']
                                            units = priorUnits + '; ' + units

                                if (addCharacteristic):
                                    sampleDict[characteristic] = value
                                    sampleMetaDict[characteristic] = {
                                        'samplefraction': samplefraction,
                                        'units': units,
                                        'pcode': pcode,
                                        'quality': quality,
                                        'count': count
                                    }
                    #end results loop
                    except etree.XMLSyntaxError as detail:
                        print "File contains invalid XML syntax: ", detail
                        processThisSample = False
                        reason = "Entry contains invalid XML syntax."
            #check whether sample has all the required constituents
            if (processThisSample):
                for characteristic in charDict.iterkeys():
                    if (charDict[characteristic]['IsRequired'] != '0'):
                        if not (characteristic in sampleDict):
                            processThisSample = False
                            reason += characteristic + ' not available. '
            if (processThisSample):
                #check to see whether site directory exists, if not, create it
                sampledir = os.path.join(sitesdir, location)
                if not (os.path.exists(sampledir)):
                    try:
                        os.makedirs(sampledir)
                    except os.error:
                        print("Problem creating location directory: " +
                              sampledir)
                        processThisSample = False
                        reason = "Problem creating location directory: " + sampledir

            if (processThisSample):
                #Pull daily discharge data from USGS website
                dischargeDict = GetDailyDischarge(
                    location, datetext
                )  #currently hard-wired to pcode 00060 (daily discharge, cfs)
                if (dischargeDict != None):
                    sampleDict['Stream flow, mean. daily'] = dischargeDict[
                        'discharge']
                    sampleMetaDict['Stream flow, mean. daily'] = {
                        'units': 'cfs',
                        'pcode': '00060',
                        'quality': dischargeDict['quality'],
                        'count': 1,
                        'samplefraction': None
                    }
                    descriptionDict['name'] = dischargeDict['name']
                else:
                    #Possibly allow this sample to be thrown out if no mean daily discharge, and/or similar for instantaneous discharge
                    sampleDict['Stream flow, mean. daily'] = None
                    sampleMetaDict['Stream flow, mean. daily'] = {
                        'units': 'cfs',
                        'pcode': '00060',
                        'quality': None,
                        'count': 1,
                        'samplefraction': None
                    }
                # Create data frame row for this sample date
                if descriptionDict['time'] != '':
                    rowdate = to_datetime(datetext + ' ' +
                                          descriptionDict['time'])
                else:
                    rowdate = to_datetime(datetext)
                #sampleRow = DataFrame(sampleDict, index=[rowdate], dtype='float')
                #Create Panel to contain sample meta data
                samplePanelRow = Panel({
                    'data':
                    DataFrame(sampleDict, index=[rowdate], dtype='float'),
                    'time':
                    DataFrame(descriptionDict['time'],
                              index=[rowdate],
                              columns=sampleMetaDict.keys()),
                    'timezone':
                    DataFrame(descriptionDict['timezone'],
                              index=[rowdate],
                              columns=sampleMetaDict.keys()),
                    'pcode':
                    DataFrame(
                        [extractValues(sampleMetaDict, ['pcode'])['values']],
                        index=[rowdate],
                        columns=sampleMetaDict.keys()),
                    'quality':
                    DataFrame(
                        [extractValues(sampleMetaDict, ['quality'])['values']],
                        index=[rowdate],
                        columns=sampleMetaDict.keys()),
                    'fraction':
                    DataFrame([
                        extractValues(sampleMetaDict,
                                      ['samplefraction'])['values']
                    ],
                              index=[rowdate],
                              columns=sampleMetaDict.keys()),
                    'units':
                    DataFrame(
                        [extractValues(sampleMetaDict, ['units'])['values']],
                        index=[rowdate],
                        columns=sampleMetaDict.keys()),
                    'count':
                    DataFrame(
                        [extractValues(sampleMetaDict, ['count'])['values']],
                        index=[rowdate],
                        columns=sampleMetaDict.keys()),
                })
                #sampleMetaRow = Series(sampleMetaDict, index=[to_datetime(datetext)], dtype='object')
                #Previous solution was reading/writing from pickle files
                #New solution will keep all data in memory until end.
                #This could cause memory problems with large data sets

                #Test whether a df for this location already exists
                if location in sitesDict:
                    #                    tempDF = sitesDict[location]
                    #                    sitesDict[location] = tempDF.append(sampleRow)
                    tempPanel = sitesDict[location]
                    sitesDict[location] = concat([tempPanel, samplePanelRow],
                                                 axis=1)
                else:
                    sitesDict[location] = samplePanelRow
            #add one to number of samples processed
            if (processThisSample):
                samples_processed.append(location + ' ' + datetext)
            else:
                samples_not_processed.append(location + ' ' + datetext +
                                             ' - ' + reason)
        print('Number of Samples Processed = ' + str(len(samples_processed)))
        print('Number of Samples Not Processed = ' +
              str(len(samples_not_processed)))

        #Write out individual site data pickle and csv files in each site directory
        print('Writing out site data files...')
        for location, pnl in sitesDict.iteritems():
            print(location)
            pickleFile = os.path.join(sitesdir, location,
                                      location + '-Panel.pkl')
            pickle.dump(pnl, open(pickleFile, 'wb'))
            pnl.to_excel(pickleFile[:-3] + 'xls')
            #Retrieve and store site description metadata
            siteDescriptionDataDF = GetSiteData(location)
            siteDescriptionDataFileName = os.path.join(
                sitesdir, location, location + '-Site-Description.pkl')
            pickle.dump(siteDescriptionDataDF,
                        open(siteDescriptionDataFileName, 'wb'))
            siteDescriptionDataDF.to_csv(siteDescriptionDataFileName[:-3] +
                                         'csv')
        #Process sites through PHREEQC
        if RUN_PHREEQC:
            print("Processing site water chemisty data in PHREEQC...")
            for location, pnl in sitesDict.iteritems():
                phreeqc_df = processPanel(pnl,
                                          os.path.join(sitesdir, location),
                                          PHREEQC_PATH, DATABASE_FILE)
                phreeqc_site_file = os.path.join(sitesdir, location,
                                                 location + '-PHREEQC.pkl')
                try:
                    pickle.dump(phreeqc_df, open(phreeqc_site_file, 'wb'))
                    phreeqc_df.to_csv(phreeqc_site_file[:-3] + 'csv')
                except IOError:
                    print('Problem writing out PHREEQC data file.')
            if bracket_charge_balance:
                for location, pnl in sitesDict.iteritems():
                    #Force balance on Calcium
                    phreeqc_df_ca = processPanel(pnl,
                                                 os.path.join(
                                                     sitesdir, location),
                                                 PHREEQC_PATH,
                                                 DATABASE_FILE,
                                                 force_balance='Ca')
                    phreeqc_site_file_ca = os.path.join(
                        sitesdir, location, location + '-PHREEQC-Ca.pkl')
                    try:
                        pickle.dump(phreeqc_df_ca,
                                    open(phreeqc_site_file_ca, 'wb'))
                        phreeqc_df_ca.to_csv(phreeqc_site_file_ca[:-3] + 'csv')
                    except IOError:
                        print('Problem writing out PHREEQC Ca data file.')
                    #Force balance on Alkalinity
                    phreeqc_df_alk = processPanel(pnl,
                                                  os.path.join(
                                                      sitesdir, location),
                                                  PHREEQC_PATH,
                                                  DATABASE_FILE,
                                                  force_balance='Alk')
                    phreeqc_site_file_alk = os.path.join(
                        sitesdir, location, location + '-PHREEQC-Alk.pkl')
                    try:
                        pickle.dump(phreeqc_df_alk,
                                    open(phreeqc_site_file_alk, 'wb'))
                        phreeqc_df_alk.to_csv(phreeqc_site_file_alk[:-3] +
                                              'csv')
                    except IOError:
                        print('Problem writing out PHREEQC Alk data file.')
        #Create log file
        print('Writing log file: ' + LOG_FILE + splittag)
        try:
            log_file = open(LOG_FILE + splittag, 'w')
            print >> log_file, 'Start file = ' + START_FILE
            print >> log_file, 'Number of Samples Processed = ' + str(
                len(samples_processed))
            print >> log_file, 'Number of Samples Not Processed = ' + str(
                len(samples_not_processed))
            print >> log_file, "###############"
            print >> log_file, "Characteristics"
            print >> log_file, "###############"
            printColumnNames = True
            for key, flags in charDict.iteritems():
                if (printColumnNames):
                    names = ['characteristic']  # + '\t'
                    for column in flags.iterkeys():
                        names.append(str(column))
                    print >> log_file, str("\t".join(names))
                    printColumnNames = False
                columns = [key]
                for column in flags.iterkeys():
                    if isinstance(flags[column], basestring):
                        columns.append(flags[column])
                print >> log_file, str("\t".join(columns))
            print >> log_file, "###############"
            print >> log_file, "Samples processed"
            print >> log_file, "###############"
            for line in samples_processed:
                print >> log_file, line
            print >> log_file, "###############"
            print >> log_file, "Samples not processed"
            print >> log_file, "###############"
            for line in samples_not_processed:
                print >> log_file, line
        except IOError:
            print("Problem opening log file: " + LOG_FILE)
            return -1
    #exceptions for parsing of xml file
    except IOError:
        print("Error opening xml file. Does it exist?")
        #Note: can throw this error when discharge values are not read correctly,
        #I should fix this, 6/16/2014
    except etree.XMLSyntaxError as detail:
        print "File contains invalid XML syntax: ", detail
    except requests.exceptions.RequestException as detail:
        print "Error retrieving data by xml query: ", detail
    return 0
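WQXtoPandas accumulates one single-date Panel per sample and stacks them with concat along axis=1, the major (time) axis; a reduced sketch of that pattern (old pandas; the analyte name and values are made up):

from pandas import Panel, DataFrame, concat, to_datetime

row1 = Panel({'data': DataFrame({'Ca': [10.]}, index=[to_datetime('2001-01-01')])})
row2 = Panel({'data': DataFrame({'Ca': [12.]}, index=[to_datetime('2001-02-01')])})
site = concat([row1, row2], axis=1)   # stack along major_axis (sample dates)
print(site['data'])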
Example #15
    def test_iloc_getitem_panel(self):

        with catch_warnings(record=True):
            # GH 7189
            p = Panel(np.arange(4 * 3 * 2).reshape(4, 3, 2),
                      items=['A', 'B', 'C', 'D'],
                      major_axis=['a', 'b', 'c'],
                      minor_axis=['one', 'two'])

            result = p.iloc[1]
            expected = p.loc['B']
            tm.assert_frame_equal(result, expected)

            result = p.iloc[1, 1]
            expected = p.loc['B', 'b']
            tm.assert_series_equal(result, expected)

            result = p.iloc[1, 1, 1]
            expected = p.loc['B', 'b', 'two']
            assert result == expected

            # slice
            result = p.iloc[1:3]
            expected = p.loc[['B', 'C']]
            tm.assert_panel_equal(result, expected)

            result = p.iloc[:, 0:2]
            expected = p.loc[:, ['a', 'b']]
            tm.assert_panel_equal(result, expected)

            # list of integers
            result = p.iloc[[0, 2]]
            expected = p.loc[['A', 'C']]
            tm.assert_panel_equal(result, expected)

            # neg indices
            result = p.iloc[[-1, 1], [-1, 1]]
            expected = p.loc[['D', 'B'], ['c', 'b']]
            tm.assert_panel_equal(result, expected)

            # dups indices
            result = p.iloc[[-1, -1, 1], [-1, 1]]
            expected = p.loc[['D', 'D', 'B'], ['c', 'b']]
            tm.assert_panel_equal(result, expected)

            # combined
            result = p.iloc[0, [True, True], [0, 1]]
            expected = p.loc['A', ['a', 'b'], ['one', 'two']]
            tm.assert_frame_equal(result, expected)

            # out-of-bounds exception
            with pytest.raises(IndexError):
                p.iloc[tuple([10, 5])]

            with pytest.raises(IndexError):
                p.iloc[0, [True, True], [0, 1, 2]]

            # trying to use a label
            with pytest.raises(ValueError):
                p.iloc[tuple(['j', 'D'])]

            # GH
            p = Panel(np.random.rand(4, 3, 2),
                      items=['A', 'B', 'C', 'D'],
                      major_axis=['U', 'V', 'W'],
                      minor_axis=['X', 'Y'])
            expected = p['A']

            result = p.iloc[0, :, :]
            tm.assert_frame_equal(result, expected)

            result = p.iloc[0, [True, True, True], :]
            tm.assert_frame_equal(result, expected)

            result = p.iloc[0, [True, True, True], [0, 1]]
            tm.assert_frame_equal(result, expected)

            with pytest.raises(IndexError):
                p.iloc[0, [True, True, True], [0, 1, 2]]

            with pytest.raises(IndexError):
                p.iloc[0, [True, True, True], [2]]
Example #16
def create_data():
    """ create the pickle/msgpack data """

    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'))
    if LooseVersion(pandas.__version__) >= '0.17.0':
        scalars['period'] = Period('2012', 'M')

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(tuple(
        zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
              ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
                                          names=['first', 'second']))
    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=TimeSeries(np.arange(10).astype(np.int64),
                                index=date_range('20130101', periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(tuple(
                                zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                                         names=['one',
                                                                'two'])),
                  dup=Series(np.arange(5).astype(np.float64),
                             index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])))
    if LooseVersion(pandas.__version__) >= '0.17.0':
        series['period'] = Series([Period('2000Q1')] * 5)

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")
    frame = dict(
        float=DataFrame(dict(A=series['float'], B=series['float'] + 1)),
        int=DataFrame(dict(A=series['int'], B=series['int'] + 1)),
        mixed=DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']])),
        mi=DataFrame(dict(A=np.arange(5).astype(np.float64),
                          B=np.arange(5).astype(np.int64)),
                     index=MultiIndex.from_tuples(tuple(
                         zip(*[['bar', 'bar', 'baz', 'baz', 'baz'],
                               ['one', 'two', 'one', 'two', 'three']])),
                                                  names=['first', 'second'])),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=['A', 'B', 'A']),
        cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
        cat_and_float=DataFrame(
            dict(A=Categorical(['foo', 'bar', 'baz']),
                 B=np.arange(3).astype(np.int64))),
        mixed_dup=mixed_dup_df)

    mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int']))
    mixed_dup_panel.items = ['ItemA', 'ItemA']
    panel = dict(float=Panel(
        dict(ItemA=frame['float'], ItemB=frame['float'] + 1)),
                 dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                           items=['A', 'B', 'A']),
                 mixed_dup=mixed_dup_panel)

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()))
Example #17
# -------------------- S1. Prepare Dataset --------------------
# Form a panel dataset comprising all historical data
# from the Shanghai and Shenzhen exchanges
data_panel = None
data_dict = {}  # collect frames from every sub-directory before building the Panel
for root, dirs, files in os.walk(folder_stock_data):
    if debug:
        # Limit the number of files to load
        files = files[:4]

    for file in files:
        if file.endswith('.xlsx') and file != "index.xlsx":
            logger.debug("Now loading " + root + '/' + file)
            stock_id, dump = file.split('.')
            data_temp = pd.read_excel(root + '/' + file)
            # Declare timeseries
            data_temp = data_temp.set_index('Date').tz_localize(
                'Asia/Shanghai')
            data_dict[stock_id] = data_temp

data_panel = Panel(data_dict)  # Daily series

##
# Calculate return and cumulative return

# Non-trading stocks - The days that a stock is non-traded are excluded from the
# sample.
for item in data_panel.items:
    df_temp = data_panel[item]  # item access instead of deprecated .ix
##
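The loop above is cut off before the calculation its comment announces; a hedged sketch of a per-item return computation (the 'Close' column name is an assumption about these spreadsheets):

# hypothetical continuation: daily and cumulative returns per stock
for item in data_panel.items:
    df_temp = data_panel[item]
    ret = df_temp['Close'].pct_change()   # simple daily return; 'Close' is assumed
    cum_ret = (1 + ret).cumprod() - 1     # cumulative return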
Example #18
 def _read_wide(self, group, where=None):
     return Panel(self._read_block_manager(group))
Example #19
    def _read_panel_table(self, group, where=None):
        table = getattr(group, 'table')
        fields = table._v_attrs.fields

        # create the selection
        sel = Selection(table, where, table._v_attrs.index_kind)
        sel.select()

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'], table._v_attrs.index_kind)
        values = sel.values['values']

        major = Factor.from_array(index)
        minor = Factor.from_array(columns)

        J, K = len(major.levels), len(minor.levels)
        key = major.labels * K + minor.labels

        if len(unique(key)) == len(key):
            sorter, _ = lib.groupsort_indexer(com._ensure_int64(key), J * K)
            sorter = com._ensure_platform_int(sorter)

            # the data need to be sorted
            sorted_values = values.take(sorter, axis=0)
            major_labels = major.labels.take(sorter)
            minor_labels = minor.labels.take(sorter)

            block = block2d_to_block3d(sorted_values, fields, (J, K),
                                       major_labels, minor_labels)

            mgr = BlockManager([block],
                               [block.ref_items, major.levels, minor.levels])
            wp = Panel(mgr)
        else:
            if not self._quiet:  # pragma: no cover
                print(
                    'Duplicate entries in table, taking most recently '
                    'appended')

            # reconstruct
            long_index = MultiIndex.from_arrays([index, columns])
            lp = DataFrame(values, index=long_index, columns=fields)

            # need a better algorithm
            tuple_index = long_index._tuple_index

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = match(unique_tuples, tuple_index)
            indexer = com._ensure_platform_int(indexer)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
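The duplicate-entry fallback rebuilds a long-format frame and pivots it back with to_panel(); a standalone sketch of that final step (old pandas):

import numpy as np
from pandas import DataFrame, MultiIndex

idx = MultiIndex.from_product([[0, 1], ['a', 'b']], names=['major', 'minor'])
lp = DataFrame({'values': np.arange(4.)}, index=idx)
wp = lp.to_panel()   # items=['values'], major_axis=[0, 1], minor_axis=['a', 'b']
print(wp)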
Example #20
    def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True):
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first
        if isinstance(x, PanelData):
            x = x.dataframe
        self._original = x

        if isinstance(x, DataArray):
            if x.ndim not in (2, 3):
                raise ValueError('Only 2-d or 3-d DataArrays are supported')
            x = x.to_pandas()

        if isinstance(x, Series) and isinstance(x.index, pd.MultiIndex):
            x = DataFrame(x)
        elif isinstance(x, Series):
            raise ValueError(
                'Series can only be used with a 2-level MultiIndex')

        if isinstance(x, (Panel, DataFrame)):
            if isinstance(x, DataFrame):
                if isinstance(x.index, pd.MultiIndex):
                    if len(x.index.levels) != 2:
                        raise ValueError('DataFrame input must have a '
                                         'MultiIndex with 2 levels')
                    self._frame = x.copy()
                else:
                    self._frame = DataFrame(
                        {var_name: x.T.stack(dropna=False)})
            else:
                self._frame = x.swapaxes(1, 2).to_frame(filter_observations=False)
        elif isinstance(x, ndarray):
            if not 2 <= x.ndim <= 3:
                raise ValueError('2 or 3-d array required for numpy input')
            if x.ndim == 2:
                x = x[None, :, :]

            k, t, n = x.shape
            variables = [var_name] if k == 1 else [
                var_name + '.{0}'.format(i) for i in range(k)
            ]
            entities = ['entity.{0}'.format(i) for i in range(n)]
            time = list(range(t))
            x = x.astype(np.float64)
            panel = Panel(x,
                          items=variables,
                          major_axis=time,
                          minor_axis=entities)
            self._frame = panel.swapaxes(1,
                                         2).to_frame(filter_observations=False)
        else:
            raise TypeError('Only ndarrays, DataFrames, Panels or DataArrays '
                            'supported.')
        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64)

        time_index = Series(self._frame.index.levels[1])
        if not (is_numeric_dtype(time_index.dtype)
                or is_datetime64_any_dtype(time_index.dtype)):
            raise ValueError('The index on the time dimension must be either '
                             'numeric or date-like')
        self._k, self._t, self._n = self.panel.shape
        self._frame.index.levels[0].name = 'entity'
        self._frame.index.levels[1].name = 'time'
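
The ndarray branch above reshapes a k x t x n cube into a long DataFrame indexed by (entity, time). Since Panel no longer exists in current pandas, here is a sketch of the equivalent construction (names chosen to mirror the code; the row ordering is assumed to match swapaxes(1, 2).to_frame()):

import numpy as np
import pandas as pd

k, t, n = 2, 3, 4  # variables x time x entities
x = np.arange(k * t * n, dtype=np.float64).reshape(k, t, n)

variables = ['x.{0}'.format(i) for i in range(k)]
entities = ['entity.{0}'.format(i) for i in range(n)]

# entity-major index, matching the (entity, time) MultiIndex built above
index = pd.MultiIndex.from_product([entities, range(t)],
                                   names=['entity', 'time'])
frame = pd.DataFrame({v: x[i].T.ravel() for i, v in enumerate(variables)},
                     index=index)
print(frame.shape)  # (n * t, k) == (12, 2)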
Example #21
    def test_sample(self):
        # Fixes issue: 2419
        # additional specific object based tests

        # A few dataframe test with degenerate weights.
        easy_weight_list = [0] * 10
        easy_weight_list[5] = 1

        df = pd.DataFrame({
            'col1': range(10, 20),
            'col2': range(20, 30),
            'colString': ['a'] * 10,
            'easyweights': easy_weight_list
        })
        sample1 = df.sample(n=1, weights='easyweights')
        assert_frame_equal(sample1, df.iloc[5:6])

        # Ensure a proper error is raised if a string is given as weights
        # for a Series, a Panel, or a DataFrame with axis=1.
        s = Series(range(10))
        with pytest.raises(ValueError):
            s.sample(n=3, weights='weight_column')

        with catch_warnings(record=True):
            panel = Panel(items=[0, 1, 2],
                          major_axis=[2, 3, 4],
                          minor_axis=[3, 4, 5])
            with pytest.raises(ValueError):
                panel.sample(n=1, weights='weight_column')

        with pytest.raises(ValueError):
            df.sample(n=1, weights='weight_column', axis=1)

        # Check weighting key error
        with pytest.raises(KeyError):
            df.sample(n=3, weights='not_a_real_column_name')

        # Check that sampling re-normalizes weights that don't sum to one.
        weights_less_than_1 = [0] * 10
        weights_less_than_1[0] = 0.5
        tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1),
                              df.iloc[:1])

        ###
        # Test axis argument
        ###

        df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10})
        second_column_weight = [0, 1]
        assert_frame_equal(
            df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']])

        # Different axis arg types
        assert_frame_equal(
            df.sample(n=1, axis='columns', weights=second_column_weight),
            df[['col2']])

        weight = [0] * 10
        weight[5] = 0.5
        assert_frame_equal(df.sample(n=1, axis='rows', weights=weight),
                           df.iloc[5:6])
        assert_frame_equal(df.sample(n=1, axis='index', weights=weight),
                           df.iloc[5:6])

        # Check out of range axis values
        with pytest.raises(ValueError):
            df.sample(n=1, axis=2)

        with pytest.raises(ValueError):
            df.sample(n=1, axis='not_a_name')

        with pytest.raises(ValueError):
            s = pd.Series(range(10))
            s.sample(n=1, axis=1)

        # Test weight length compared to correct axis
        with pytest.raises(ValueError):
            df.sample(n=1, axis=1, weights=[0.5] * 10)

        # Check weights with axis = 1
        easy_weight_list = [0] * 3
        easy_weight_list[2] = 1

        df = pd.DataFrame({
            'col1': range(10, 20),
            'col2': range(20, 30),
            'colString': ['a'] * 10
        })
        sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
        assert_frame_equal(sample1, df[['colString']])

        # Test default axes
        with catch_warnings(record=True):
            p = Panel(items=['a', 'b', 'c'],
                      major_axis=[2, 4, 6],
                      minor_axis=[1, 3, 5])
            assert_panel_equal(p.sample(n=3, random_state=42),
                               p.sample(n=3, axis=1, random_state=42))
            assert_frame_equal(df.sample(n=3, random_state=42),
                               df.sample(n=3, axis=0, random_state=42))

        # Test that function aligns weights with frame
        df = DataFrame({
            'col1': [5, 6, 7],
            'col2': ['a', 'b', 'c'],
        },
                       index=[9, 5, 3])
        s = Series([1, 0, 0], index=[3, 5, 9])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s))

        # Weight index values that are not in the sampled DataFrame are
        # dropped
        s2 = Series([0.001, 0, 10000], index=[3, 5, 10])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2))

        # Missing weight values are filled with zeros
        s3 = Series([0.01, 0], index=[3, 5])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3))

        # No overlap in weight and sampled DataFrame indices
        s4 = Series([1, 0], index=[1, 2])
        with pytest.raises(ValueError):
            df.sample(1, weights=s4)
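
The alignment cases at the end of the test still hold in current pandas; a small standalone sketch:

import pandas as pd

df = pd.DataFrame({'col1': [5, 6, 7], 'col2': ['a', 'b', 'c']},
                  index=[9, 5, 3])

# weights passed as a Series are aligned on the index, not by position,
# and are re-normalized if they do not sum to one
w = pd.Series([10.0, 0.0, 0.0], index=[3, 5, 9])
print(df.sample(n=1, weights=w))  # always the row labelled 3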
Example #22
def create_data():
    """ create the pickle/msgpack data """

    data = {
        u'A': [0., 1., 2., 3., np.nan],
        u'B': [0, 1, 0, 1, 0],
        u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'],
        u'D': date_range('1/1/2009', periods=5),
        u'E': [0., 1, Timestamp('20100101'), u'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'), period=Period('2012', 'M'))

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(tuple(
        zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo', u'foo', u'qux', u'qux'],
              [u'one', u'two', u'one', u'two', u'one', u'two', u'one', u'two']
              ])),
                                          names=[u'first', u'second']))

    series = dict(
        float=Series(data[u'A']),
        int=Series(data[u'B']),
        mixed=Series(data[u'E']),
        ts=Series(np.arange(10).astype(np.int64),
                  index=date_range('20130101', periods=10)),
        mi=Series(np.arange(5).astype(np.float64),
                  index=MultiIndex.from_tuples(tuple(
                      zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                               names=[u'one', u'two'])),
        dup=Series(np.arange(5).astype(np.float64),
                   index=[u'A', u'B', u'C', u'D', u'A']),
        cat=Series(Categorical([u'foo', u'bar', u'baz'])),
        dt=Series(date_range('20130101', periods=5)),
        dt_tz=Series(date_range('20130101', periods=5, tz='US/Eastern')),
        period=Series([Period('2000Q1')] * 5))

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list(u"ABCDA")
    frame = dict(
        float=DataFrame({
            u'A': series[u'float'],
            u'B': series[u'float'] + 1
        }),
        int=DataFrame({
            u'A': series[u'int'],
            u'B': series[u'int'] + 1
        }),
        mixed=DataFrame({k: data[k]
                         for k in [u'A', u'B', u'C', u'D']}),
        mi=DataFrame(
            {
                u'A': np.arange(5).astype(np.float64),
                u'B': np.arange(5).astype(np.int64)
            },
            index=MultiIndex.from_tuples(tuple(
                zip(*[[u'bar', u'bar', u'baz', u'baz', u'baz'],
                      [u'one', u'two', u'one', u'two', u'three']])),
                                         names=[u'first', u'second'])),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=[u'A', u'B', u'A']),
        cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}),
        cat_and_float=DataFrame({
            u'A': Categorical([u'foo', u'bar', u'baz']),
            u'B': np.arange(3).astype(np.int64)
        }),
        mixed_dup=mixed_dup_df,
        dt_mixed_tzs=DataFrame(
            {
                u'A': Timestamp('20130102', tz='US/Eastern'),
                u'B': Timestamp('20130603', tz='CET')
            },
            index=range(5)),
        dt_mixed2_tzs=DataFrame(
            {
                u'A': Timestamp('20130102', tz='US/Eastern'),
                u'B': Timestamp('20130603', tz='CET'),
                u'C': Timestamp('20130603', tz='UTC')
            },
            index=range(5)))

    with catch_warnings(record=True):
        mixed_dup_panel = Panel({
            u'ItemA': frame[u'float'],
            u'ItemB': frame[u'int']
        })
        mixed_dup_panel.items = [u'ItemA', u'ItemA']
        panel = dict(float=Panel({
            u'ItemA': frame[u'float'],
            u'ItemB': frame[u'float'] + 1
        }),
                     dup=Panel(np.arange(30).reshape(3, 5,
                                                     2).astype(np.float64),
                               items=[u'A', u'B', u'A']),
                     mixed_dup=mixed_dup_panel)

    cat = dict(int8=Categorical(list('abcdefg')),
               int16=Categorical(np.arange(1000)),
               int32=Categorical(np.arange(10000)))

    timestamp = dict(normal=Timestamp('2011-01-01'),
                     nat=NaT,
                     tz=Timestamp('2011-01-01', tz='US/Eastern'))

    if _loose_version < '0.19.2':
        timestamp['freq'] = Timestamp('2011-01-01', offset='D')
        timestamp['both'] = Timestamp('2011-01-01',
                                      tz='Asia/Tokyo',
                                      offset='M')
    else:
        timestamp['freq'] = Timestamp('2011-01-01', freq='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo', freq='M')

    off = {
        'DateOffset': DateOffset(years=1),
        'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
        'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
        'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
        'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
        'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
        'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
        'MonthBegin': MonthBegin(1),
        'MonthEnd': MonthEnd(1),
        'QuarterBegin': QuarterBegin(1),
        'QuarterEnd': QuarterEnd(1),
        'Day': Day(1),
        'YearBegin': YearBegin(1),
        'YearEnd': YearEnd(1),
        'Week': Week(1),
        'Week_Tues': Week(2, normalize=False, weekday=1),
        'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
        'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
        'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        'Easter': Easter(),
        'Hour': Hour(1),
        'Minute': Minute(1)
    }

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()),
                cat=cat,
                timestamp=timestamp,
                offsets=off)
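
These fixtures feed pickle/msgpack compatibility tests; a hedged sketch of the round trip (the file name is invented, and this assumes a pandas version old enough to run create_data() at all):

import pandas as pd
import pandas.util.testing as tm

data = create_data()
pd.to_pickle(data['frame']['float'], 'legacy_frame.pickle')
restored = pd.read_pickle('legacy_frame.pickle')
tm.assert_frame_equal(restored, data['frame']['float'])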
Example #23
def p_apply(panel, f):
    result = {}
    for item in panel.items:
        result[item] = f(panel[item])
    return Panel(result, items=panel.items)
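
p_apply just maps a frame-level function across a Panel's items; with Panel gone, the same pattern over a plain dict of DataFrames is a one-line comprehension (a sketch, not from the source):

import pandas as pd

def d_apply(frames, f):
    # apply f to each item's DataFrame, keeping the item keys
    return {item: f(df) for item, df in frames.items()}

frames = {'ItemA': pd.DataFrame({'x': [1., 2.]}),
          'ItemB': pd.DataFrame({'x': [3., 4.]})}
print(d_apply(frames, lambda df: df * 2)['ItemB'])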
Example #24
def create_data():
    """ create the pickle/msgpack data """

    data = {
        u'A': [0., 1., 2., 3., np.nan],
        u'B': [0, 1, 0, 1, 0],
        u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'],
        u'D': date_range('1/1/2009', periods=5),
        u'E': [0., 1, Timestamp('20100101'), u'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'), period=Period('2012', 'M'))

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(tuple(
        zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo', u'foo', u'qux', u'qux'],
              [u'one', u'two', u'one', u'two', u'one', u'two', u'one', u'two']
              ])),
                                          names=[u'first', u'second']))
    series = dict(
        float=Series(data[u'A']),
        int=Series(data[u'B']),
        mixed=Series(data[u'E']),
        ts=Series(np.arange(10).astype(np.int64),
                  index=date_range('20130101', periods=10)),
        mi=Series(np.arange(5).astype(np.float64),
                  index=MultiIndex.from_tuples(tuple(
                      zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                               names=[u'one', u'two'])),
        dup=Series(np.arange(5).astype(np.float64),
                   index=[u'A', u'B', u'C', u'D', u'A']),
        cat=Series(Categorical([u'foo', u'bar', u'baz'])),
        dt=Series(date_range('20130101', periods=5)),
        dt_tz=Series(date_range('20130101', periods=5, tz='US/Eastern')),
        period=Series([Period('2000Q1')] * 5))

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list(u"ABCDA")
    frame = dict(
        float=DataFrame({
            u'A': series[u'float'],
            u'B': series[u'float'] + 1
        }),
        int=DataFrame({
            u'A': series[u'int'],
            u'B': series[u'int'] + 1
        }),
        mixed=DataFrame({k: data[k]
                         for k in [u'A', u'B', u'C', u'D']}),
        mi=DataFrame(
            {
                u'A': np.arange(5).astype(np.float64),
                u'B': np.arange(5).astype(np.int64)
            },
            index=MultiIndex.from_tuples(tuple(
                zip(*[[u'bar', u'bar', u'baz', u'baz', u'baz'],
                      [u'one', u'two', u'one', u'two', u'three']])),
                                         names=[u'first', u'second'])),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=[u'A', u'B', u'A']),
        cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}),
        cat_and_float=DataFrame({
            u'A': Categorical([u'foo', u'bar', u'baz']),
            u'B': np.arange(3).astype(np.int64)
        }),
        mixed_dup=mixed_dup_df,
        dt_mixed_tzs=DataFrame(
            {
                u'A': Timestamp('20130102', tz='US/Eastern'),
                u'B': Timestamp('20130603', tz='CET')
            },
            index=range(5)))

    mixed_dup_panel = Panel({
        u'ItemA': frame[u'float'],
        u'ItemB': frame[u'int']
    })
    mixed_dup_panel.items = [u'ItemA', u'ItemA']
    panel = dict(float=Panel({
        u'ItemA': frame[u'float'],
        u'ItemB': frame[u'float'] + 1
    }),
                 dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                           items=[u'A', u'B', u'A']),
                 mixed_dup=mixed_dup_panel)

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()))
Example #25
    def response_as_panel(self, swap=False):
        panel = Panel(self.response)
        if swap:
            panel = panel.swapaxes('items', 'minor')
        return panel
Example #26
#coding:utf-8
# import pandas as pd
# import numpy as np
# p1 = pd.Panel(np.arange(27).reshape((3,3,3)))
# print(p1)
# print(p1.values)
import numpy as np
import pandas as pd
from pandas_datareader import *
from pandas import Series, DataFrame, Index, Panel

# da= get_data_yahoo('AAPL')
# print(da)
data = dict((stk, get_data_yahoo(stk, '1/1/2016', '1/15/2016'))
            for stk in ['AAPL', 'GOOG', 'BIDU', 'MSFT'])
print(data)
pdata = Panel(data)
print(pdata)
# pdata = pdata.swapaxes('items', 'minor')
print(pdata)
# access order: Item -> Major -> Minor

print(pdata['AAPL'])
print(pdata[:, '1/5/2016', :])
print(pdata['AAPL', '1/6/2016', :])


# conversion between Panel and DataFrame
stacked = pdata.ix[:, '1/7/2016':, :].to_frame()
print(stacked)
print(stacked.to_panel())
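
Panel was removed in pandas 0.25, and the Yahoo endpoint behind get_data_yahoo has long been unreliable, so a modern rewrite of this demo would use synthetic frames and a MultiIndex DataFrame built with pd.concat (a sketch under those assumptions):

import numpy as np
import pandas as pd

dates = pd.date_range('2016-01-01', periods=5)
data = {stk: pd.DataFrame(np.random.randn(5, 2),
                          index=dates, columns=['Open', 'Close'])
        for stk in ['AAPL', 'GOOG', 'BIDU', 'MSFT']}

# the ticker keys become the outer index level, replacing the Panel items axis
pdata = pd.concat(data, names=['ticker', 'date'])
print(pdata.loc['AAPL'])                                   # like pdata['AAPL']
print(pdata.xs(pd.Timestamp('2016-01-05'), level='date'))  # one date, all tickers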
Example #27
    def setup_method(self, method):

        self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2))
        self.frame_ints = DataFrame(np.random.randn(4, 4),
                                    index=lrange(0, 8, 2),
                                    columns=lrange(0, 12, 3))
        with catch_warnings(record=True):
            self.panel_ints = Panel(np.random.rand(4, 4, 4),
                                    items=lrange(0, 8, 2),
                                    major_axis=lrange(0, 12, 3),
                                    minor_axis=lrange(0, 16, 4))

        self.series_uints = Series(np.random.rand(4),
                                   index=UInt64Index(lrange(0, 8, 2)))
        self.frame_uints = DataFrame(np.random.randn(4, 4),
                                     index=UInt64Index(lrange(0, 8, 2)),
                                     columns=UInt64Index(lrange(0, 12, 3)))
        with catch_warnings(record=True):
            self.panel_uints = Panel(np.random.rand(4, 4, 4),
                                     items=UInt64Index(lrange(0, 8, 2)),
                                     major_axis=UInt64Index(lrange(0, 12, 3)),
                                     minor_axis=UInt64Index(lrange(0, 16, 4)))

        self.series_labels = Series(np.random.randn(4), index=list('abcd'))
        self.frame_labels = DataFrame(np.random.randn(4, 4),
                                      index=list('abcd'),
                                      columns=list('ABCD'))
        with catch_warnings(record=True):
            self.panel_labels = Panel(np.random.randn(4, 4, 4),
                                      items=list('abcd'),
                                      major_axis=list('ABCD'),
                                      minor_axis=list('ZYXW'))

        self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8])
        self.frame_mixed = DataFrame(np.random.randn(4, 4),
                                     index=[2, 4, 'null', 8])
        with catch_warnings(record=True):
            self.panel_mixed = Panel(np.random.randn(4, 4, 4),
                                     items=[2, 4, 'null', 8])

        self.series_ts = Series(np.random.randn(4),
                                index=date_range('20130101', periods=4))
        self.frame_ts = DataFrame(np.random.randn(4, 4),
                                  index=date_range('20130101', periods=4))
        with catch_warnings(record=True):
            self.panel_ts = Panel(np.random.randn(4, 4, 4),
                                  items=date_range('20130101', periods=4))

        dates_rev = (date_range('20130101',
                                periods=4).sort_values(ascending=False))
        self.series_ts_rev = Series(np.random.randn(4), index=dates_rev)
        self.frame_ts_rev = DataFrame(np.random.randn(4, 4), index=dates_rev)
        with catch_warnings(record=True):
            self.panel_ts_rev = Panel(np.random.randn(4, 4, 4),
                                      items=dates_rev)

        self.frame_empty = DataFrame({})
        self.series_empty = Series({})
        with catch_warnings(record=True):
            self.panel_empty = Panel({})

        # form agglomerates
        for o in self._objs:

            d = dict()
            for t in self._typs:
                d[t] = getattr(self, '%s_%s' % (o, t), None)

            setattr(self, o, d)
Example #28
    def setup(self):
        with warnings.catch_warnings(record=True):
            self.p = Panel(np.random.randn(100, 100, 100))
            self.inds = range(0, 100, 10)
Example #29
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
import pandas_datareader.data as web
from pandas import Series, DataFrame, Index, Panel

pdata = Panel(
    dict((stk, web.get_data_yahoo(stk, '1/1/2016', '1/15/2016'))
         for stk in ['AAPL', 'GOOG', 'BIDU', 'MSFT']))
print(pdata)
pdata = pdata.swapaxes('items', 'minor')
print(pdata)
print()

print("## access order: Item -> Major -> Minor:")
print(pdata['Adj Close'])
print(pdata[:, '1/5/2016', :])  # items as columns
print(pdata['Adj Close', '1/6/2016', :])
print()

print("## conversion between Panel and DataFrame:")
print("""### items as columns;
    major, and minor as hierarchical index:""")
stacked1 = pdata.ix[:, '1/7/2016':, :].to_frame()
print("stacked1 ==>")
print(stacked1)
print()

stacked2 = pdata.ix[:, '1/7/2016':, 0:1].to_frame()
#stacked2 = pdata.ix[:, '1/7/2016':, 0].to_frame()  # Error: 0 is a scalar, not a slice
Example #30
def Regroup(groupinfo, labels, *args):
    """
    Modify the 3D numpy arrays in *args so that data is grouped
    according to user specifications.

    For example, presume that the following scenarios are given:

    Fast_Down_01
    Slow_Down_01
    Fast_Down_02
    Slow_Down_02
    Fast_Down_04
    Slow_Down_04
    Fast_Down_08
    Slow_Down_08

    And only a single track run is specified: SCIT.
    In this example, the third item, skill scores, can be arbitrary.
    Now, suppose that we want to display the result data such that the x-axis
    is for the Down* and there are two plots: one for Fast and one for Slow.

    So, we group the scenarios data by some key (discussed later) _into_
    the trackruns dimension. For this reason, the data dimension being
    grouped into (in this case, trackruns) must originally be singleton.

    *groupinfo* - dict with keys "group", "into", and "by".
        The "group" element states which dimension the grouping will occur on.
        The "into" element states along which dimension the groups will be
        stacked. These two elements can have values of "scenarios", "skills",
        or "trackruns".

        The "by" element is rudimentary for now, but it controls the key value
        function used for grouping. The key function is applied to the list of
        default labels for the dimension stated for "group". The unique set of
        keys generated by the function on these labels become the new default
        labels for the "into" dimension.

        Currently, the key function is hard-coded to split the label on
        underscores and search the resulting list for the string given in
        "by"; it then returns the list's next value. So, in the above
        example, the new labels for the "trackruns" dimension would be
        "01", "02", "04", and "08". (A standalone sketch of this key
        function follows the code below.)
    """
    if groupinfo is None:
        return args

    if len(args) == 0:
        return args

    if len(labels[groupinfo['into']]) != 1:
        raise ValueError("Dim %s is not singleton!" % groupinfo['into'])

    if groupinfo['group'] == groupinfo['into']:
        raise ValueError("Can not group %s dimension into itself!" %
                         groupinfo['group'])

    from pandas import Panel

    grpAxis = dataAxes[groupinfo['group']]
    intoAxis = dataAxes[groupinfo['into']]
    otherAxis = dataAxes[list(
        set(['scenarios', 'trackruns', 'skills']) -
        set([groupinfo['group'], groupinfo['into']]))[0]]

    # !!Temporary!! restricted functionality for just trackrun variables
    keyfunc = lambda x: _varval(x, groupinfo['by'])

    g_args = []
    for a in args:
        wp = Panel(a,
                   items=labels['scenarios'],
                   major_axis=labels['skills'],
                   minor_axis=labels['trackruns'])

        grouped = wp.groupby(keyfunc, axis=grpAxis)

        if len(grouped) == 0:
            raise ValueError("Grouping didn't result in anything!")

        intolabs, g_a = zip(*grouped)
        # Get a list of numpy arrays from the list of Panels
        g_a = np.concatenate(map(lambda x: x.values, g_a), axis=intoAxis)

        g_args.append(g_a)

    labels[groupinfo['into']] = intolabs

    # Do the full set for error-checking purposes
    trunclabs = None
    for intolab in intolabs:
        # TODO: Generalize this!
        # Take some original labels and remove the variable and its value that
        # were used to make *intolabs*
        tmp = [
            '_'.join(_remove_varval(lab.split('_'), groupinfo['by']))
            for lab in grouped.groups[intolab]
        ]
        if trunclabs is not None:
            if tmp != trunclabs:
                raise ValueError("The labels do not match! %s\n%s" %
                                 (trunclabs, tmp))
        else:
            trunclabs = tmp

    labels[groupinfo['group']] = trunclabs

    return g_args
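
The "by" mechanism the docstring describes boils down to: split each label on underscores and take the token that follows the "by" string. A standalone sketch of that key function (the source's _varval helper presumably does the same, but is not shown here):

def varval(label, by):
    # return the token that follows *by* in an underscore-separated label
    parts = label.split('_')
    return parts[parts.index(by) + 1]

labels = ['Fast_Down_01', 'Slow_Down_01', 'Fast_Down_02', 'Slow_Down_02']
keys = [varval(lab, 'Down') for lab in labels]
print(sorted(set(keys)))  # ['01', '02'] -> new labels for the 'into' dimension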