Пример #1
0
def pd_dataframe():
    """Walk through basic DataFrame construction and column manipulation.

    Builds a small state/year/population table and prints the result of each
    step: column reordering, label-based row selection, whole-column
    assignment (scalar, array, and index-aligned Series), adding a derived
    boolean column, and deleting it again.

    Returns:
        None. All results are written to standard output.
    """
    data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
            'year': [2000, 2001, 2002, 2001, 2002],
            'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
    frame = DataFrame(data)
    print(frame)
    # `columns` fixes the column order.
    print(DataFrame(data, columns=['year', 'state', 'pop']))
    # 'debt' is not a key of `data`, so that column is created empty (NaN).
    frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                       index=['one', 'two', 'three', 'four', 'five'])
    print(frame2)
    # A column can be read with bracket or attribute syntax.
    print(frame2['state'])
    print(frame2.year)
    # Label-based row selection (`.ix` was removed in pandas 1.0).
    print(frame2.loc['three'])
    # Broadcast a scalar over the whole column.
    frame2['debt'] = 16.5
    print(frame2)
    # Replace the column with an array; its length must match the row count.
    frame2['debt'] = np.arange(5.)
    print(frame2)
    # A Series is aligned on the index; labels it does not cover become NaN.
    val = Series(data=[-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
    frame2['debt'] = val
    print(frame2)
    # New columns must be created with bracket syntax.
    frame2['eastern'] = frame2.state == 'Ohio'
    print(frame2)
    del frame2['eastern']
    print(frame2.columns)
Пример #2
0
# Name: first_name, dtype: object
print("\n")
print(df.loc[1])  # loc: select a row by its index label
print("\n")
print(df["age"].iloc[1:])  # iloc: select by integer position
print("\n")
s = pd.Series(np.nan, index=[49,48,47,46,45,1,2,3,4,5])  # every value is NaN
print(s)
print("\n")
print(s.iloc[3:])  # position-based: from position 3 onward
print("\n")
print(s.loc[:3])  # label-based: up to and including the label 3
print("\n")
print(df.age>40)
print("\n")
# Assign new data to the 'debt' column. Bracket assignment is used because
# attribute assignment (`df.debt = ...`) only updates a column that already
# exists; on a frame without 'debt' it would set a plain Python attribute and
# the column shown in the expected output below would never appear.
df["debt"] = df.age>40
print(df)
#   first_name last_name  age           city   debt
# 0      Jason    Miller   42  San Francisco   True
# 1      Molly  Jacobson   52      Baltimore   True
# 2       Tina       Ali   36          Miami  False
# 3       Jake    Milner   24        Douglas  False
# 4        Amy     Cooze   73         Boston   True
print("\n=======================================================\n")

values = Series(data=["M", "F", "F"], index=[0,1,3])
print(values)
print("\n")
# Create a 'sex' column; values are placed at the index labels carried by
# `values`, and rows whose label is not in `values` get NaN.
df["sex"] = values
print(df)
print("\n")
Пример #3
0
df["first_name"]

# Data access
df.loc[1]  # loc: select rows by index label
df.loc[1:2]
df.iloc[1:]  # iloc: select by integer position

import numpy as np
s = pd.Series(np.nan, index=[49, 48, 47, 46, 45, 1, 2, 3, 4, 5])
s.loc[:3]  # label-based: up to and including the label 3
s.iloc[:3]  # position-based: the first three elements

df

# Assign new data to a column. Bracket assignment is used because attribute
# assignment (`df.debt = ...`) does not create a column that is missing, and
# the `del df["debt"]` below would then raise KeyError.
df["debt"] = df.age > 40
df

df.T  # transpose

df.values
df.to_csv()

del df["debt"]
df

## Selection  & Drop
df.head(3)
# NOTE(review): "first_data" looks like a typo for "first_name" (the column
# read at the top of this snippet) — confirm against the definition of `df`.
df[["first_data", "age"]].head(3)

df[:3]  # without a column name, slicing selects rows
Пример #4
0
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
frame2
# 2 ways to retrieve a column:
frame2['state']
frame2.year

## rows retrieval:
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
frame2.loc['three']
frame2[2:]  # a bare slice selects rows by position

# Replace the whole column with an array (length must match the row count).
frame2['debt'] = np.arange(5)
frame2

# Assigning a Series aligns on the index; uncovered labels become NaN.
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2.debt = val
frame2

# A new column has to be created with bracket syntax.
frame2['eastern'] = frame2['state'] == 'Ohio'
frame2

## delete a column
del frame2['eastern']
frame2.columns

### passing nested dictionary to DataFrame:
pop = {
    'Nevada': {
        2001: 2.4,
        2002: 2.9
    },
Пример #5
0
# Build the frame with an explicit column order and explicit row labels.
# NOTE(review): `data` is defined outside this excerpt; presumably it has no
# 'debt' key, in which case that column starts out as NaN — confirm.
frame2 = DataFrame(data,
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
print(frame2)
print(frame2['state'])
print(frame2.year)
print("-------------")
print(frame2.iloc[1:3, :])  # rows at positions 1 and 2
print("-------------")
print(frame2.loc[['three', 'one']])  # rows labelled 'three' and 'one', in that order
## .ix is deprecated. Use .loc for label-based indexing or .iloc for positional indexing.
print('2----------------')
frame2['debt'] = 16.5  # overwrite the whole column with a scalar
print(frame2)

frame2.debt = np.arange(5)  # overwrite the column with a NumPy array
print(frame2)
print()

# Printed message: "Use a Series to give the index labels to modify and their
# values; labels that are not given default to NaN."
print('用Series指定要修改的索引及其对应的值,没有指定的默认数据用NaN。')
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
print()

# Printed message: "Assign to a new column."
print('赋值给新列')
frame2['eastern'] = (frame2.state == 'Ohio')  # True where state equals 'Ohio'
print(frame2)
print(frame2.columns)
print()
Пример #6
0
def main():
    """Tour the core pandas data structures, printing each result.

    Covers, in order: Series construction and ndarray/dict-like access, NaN
    handling and index alignment, DataFrame construction and column
    manipulation, nested-dict construction, the ``values`` array, and Index
    objects (immutability, identity, and membership tests).

    Returns:
        None. All results are written to standard output.
    """
    # Series
    # A Series is like a 1-d array with an attached index.
    lst = [4, 7, -5, 3]
    obj = Series(lst)
    print(obj)
    print(obj.values)
    print(obj.index)

    obj2 = Series(copy.deepcopy(lst), index=['d', 'b', 'a', 'c'])
    print(obj2)
    print(obj2.index)
    # numpy ndarray-like behaviour
    print(obj2['a'])
    obj2['d'] = 6
    print(obj2[['c', 'a', 'd']])
    print(obj2)
    print(obj2[obj2 > 0])
    print(obj2 * 2)
    print(np.exp(obj2))

    # dict-like behaviour: membership tests look at the index labels
    print('b' in obj2, 'e' in obj2)
    # dict to Series
    sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
    obj3 = Series(sdata)
    print(obj3)

    # A requested label that is missing from the dict gets the value NaN.
    states = ['Calfornia', 'Ohio', 'Oregon', 'Texas']
    obj4 = Series(sdata, index=states)
    print(obj4)
    # pandas top-level functions
    print(pd.isnull(obj4))
    print(pd.notnull(obj4))
    # equivalent Series method
    print(obj4.isnull())

    # Arithmetic aligns on the index; a label present on only one side
    # yields NaN.
    print(obj3)
    print(obj4)
    print(obj3 + obj4)

    # Name of the Series and of its index
    obj4.name = 'population'
    obj4.index.name = 'state'
    print(obj4)

    # Replace the index in place
    obj4.index = ['a', 'b', 'c', 'd']
    print(obj4)

    # DataFrame
    # A DataFrame is like a spreadsheet (table), similar to R's data.frame.
    print('', '')
    data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
            'year': [2000, 2001, 2002, 2001, 2002],
            'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
    frame = DataFrame(data)
    print(frame)

    # Fix the column order
    print(DataFrame(data, columns=['year', 'state', 'pop']))

    # Explicit columns and row labels.
    # A requested column that is missing from `data` is filled with NaN.
    frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                       index=['one', 'two', 'three', 'four', 'five'])
    print(frame2)
    print(frame2.columns)
    print(frame2['state'])
    print(frame2.year)
    # Row selection by label (`.ix` was removed in pandas 1.0).
    print(frame2.loc['three'])
    # Overwrite a column with a scalar
    frame2['debt'] = 16.5
    print(frame2.debt)
    # Overwrite with an array; its length must equal the number of rows (5).
    frame2.debt = np.arange(5.)
    print(frame2.debt)

    # Assigning a Series aligns on the index; uncovered labels become NaN.
    val = Series([-1.2, -1.5, -1.7], index=['one', 'three', 'five'])
    frame2.debt = val
    print(frame2)

    # A new column must be created with bracket syntax, not `frame2.eastern`.
    frame2['eastern'] = frame2.state == 'Ohio'
    print(frame2)
    # Likewise, deleting a column needs bracket syntax.
    del frame2['eastern']
    print(frame2)

    # Nested dict to DataFrame:
    # outer keys become the columns, inner keys become the row index.
    pop = {'Ohio': {2002: 3.6, 2000: 1.5, 2001: 1.7},
           'Nevada': {2001: 2.4, 2002: 2.9}}
    frame3 = DataFrame(pop)
    print(frame3)
    # transpose
    print(frame3.T)
    # An explicit index/columns argument dictates the order.
    print(DataFrame(pop, index=[2001, 2002, 2000],
                    columns=['Ohio', 'Nevada']))

    # `values` returns a single ndarray; with mixed column types the dtype
    # is widened to one that can hold every column.
    print(frame3.values)
    print(frame3.values.dtype)
    print(frame2.values)
    print(frame2.values.dtype)
    print(frame2['year'].values.dtype)

    # Index objects
    obj = Series(range(3), index='a b c'.split(' '))
    index = obj.index
    print(type(index))
    print(index.dtype)
    # An Index is immutable: item assignment raises TypeError.
    try:
        index[1] = 'd'
    except TypeError:
        print(sys.exc_info())

    # Identity (not equality) checks: a freshly built Index is a new object.
    print(obj.index is pd.Index(['a', 'b', 'c']))
    index = pd.Index(np.arange(3))
    obj2 = Series(range(3), index=index)
    print(obj2.index is index)

    # Membership tests on column and row indexes
    print(frame3)
    print(type(frame3.columns))
    print('Ohio' in frame3.columns)
    print(2003 in frame3.index)
    # `append` returns a new Index ...
    print(index.append(pd.Index(['d'])))
    # ... and leaves the original untouched.
    print(index)
Пример #7
0
# result
# year     2002
# state    good
# pop       2.4
# debt      NaN
# Name: three, dtype: object

# x['dept'] = 16.5  # assigning a scalar fills a whole column
# result
#        year state  pop debt  dept
# one    2000    ok  3.7  NaN  16.5
# two    2001    ok  3.6  NaN  16.5
# three  2002  good  2.4  NaN  16.5
# four   2003   bad  0.9  NaN  16.5

# NOTE(review): `n` is defined outside this excerpt; presumably it is a NumPy
# alias (e.g. `import numpy as n`) — confirm.
x.debt = n.arange(4)  # replace the column's values with a NumPy array
# result
#        year state  pop  debt
# one    2000    ok  3.7     0
# two    2001    ok  3.6     1
# three  2002  good  2.4     2
# four   2003   bad  0.9     3

val = Series([-1.2, -1.5, -1.7, 0], index=['one', 'two', 'four', 'six'])
# Assign from a Series: the DataFrame's row index is unchanged. As the result
# below shows, 'three' (absent from `val`) becomes NaN and the extra label
# 'six' does not appear.
x.debt = val
# result
#        year state  pop  debt
# one    2000    ok  3.7  -1.2
# two    2001    ok  3.6  -1.5
# three  2002  good  2.4   NaN
# four   2003   bad  0.9  -1.7
Пример #8
0
# Sample table: one row per (state, year) observation.
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}

frame = DataFrame(data)
# 'debt' is not a key of `data`, so that column is created empty (NaN).
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
frame2['state']
frame2.values
frame2.year
# `.ix` was removed in pandas 1.0: use `.loc` for labels, `.iloc` for positions.
frame2.loc['three']
frame2.iloc[3]

frame2['debt'] = 16.5
# Kept for reference: a list whose length differs from the row count is rejected.
#frame2['debt'] = [16.5, 17, 26]
frame2.debt = np.arange(5.)
# Assigning a Series aligns on the index; uncovered labels become NaN.
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2.debt = val
frame2['eastern'] = frame2.state == 'Ohio'
del frame2['eastern']

# dict of dicts into DataFrame: outer keys -> columns, inner keys -> row index
pop = {'Nevada': {2001: 2.5, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = DataFrame(pop)
frame3.index.name = 'year'
frame3.columns.name = 'state'

# Building a DataFrame from another one; an explicit index reindexes the rows.
frameTest = DataFrame(frame3)
frameTest = DataFrame(frame3, index=[1, 2, 3])

# pandas Index object
# In[29]:

# Column access via attribute syntax.
frame2.state

# In[30]:

frame2.year

# In[31]:

# Row access by index label.
frame2.loc["three"]

# In[32]:

# Replace the 'debt' column with the values 1..5.
frame2.debt = np.arange(5) + 1
frame2

# In[33]:

# Only three labels are supplied here.
val = Series([-1.2, -1.5, -1.7], index=['two', 'three', 'five'])

frame2.debt = val

# In[34]:

frame2

# In[35]:

# Derived boolean column: True where state equals 'Ohio'.
frame2['eastern'] = frame2.state == 'Ohio'
Пример #10
0
}

# NOTE(review): `data` is defined outside this excerpt (only its closing brace
# is visible above).
frame = DataFrame(data)
# print(frame)
# print(type(frame.year))

# Same data with an explicit column order.
frame2 = DataFrame(data, columns=['year', 'state', 'pop'])
# Explicit columns (including 'debt') and explicit row labels.
frame3 = DataFrame(data,
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])

# print(frame3['state'])
# print(frame3.state)
# print(frame3.loc['one'])

# Overwrite the 'debt' column with a scalar.
frame3.debt = 100

# Overwrite the 'debt' column with an array of five floats.
frame3.debt = np.arange(5.)
# print(np.arange(6.))
# print(type(np.arange(6.)))

# Overwrite the 'debt' column from a Series that names only three row labels.
temp_series = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame3.debt = temp_series

frame3['ddd'] = np.nan
# Adding or deleting a column must use bracket (index) syntax; `frame3.ddd`
# does not work for that.
# print(frame3)

# Derived boolean column: True where state equals 'Ohio'.
frame3['eastern'] = frame3.state == 'Ohio'
# print(frame3)
Пример #11
0
# Sample people table; each key is one column and dict order is column order.
raw_data = dict(
    first_name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    age=[42, 52, 36, 24, 73],
    city=['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston'],
)

# Full frame: the column list is taken from the dict's own key order.
df = pd.DataFrame(raw_data, columns=list(raw_data))
print(df)

# Restricting `columns` keeps only the named columns.
print(DataFrame(raw_data, columns=['first_name', 'age']))

# Row with index label 1.
print(df.loc[1])

# Asking for the extra 'debt' column creates it empty.
df_1 = DataFrame(raw_data, columns=[*raw_data, 'debt'])

print(df_1)

# Fill 'debt' with a boolean mask: True for everyone older than 40.
df_1['debt'] = df_1['age'] > 40

print(df_1)

# Underlying array and CSV text of the frame.
print(df_1.values)

print(df_1.to_csv())

# First two rows of one column, then of two columns.
print(df.head(2)['first_name'])

print(df.head(2)[['first_name', 'last_name']])
"""
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9]
}
frame = DataFrame(data)
frame
DataFrame(data, columns=['year', 'state', 'pop'])
frame2 = DataFrame(data,
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
frame2['state']
frame2.year
frame2.ix['three']
frame2.debt = 16.5
frame2['debt'] = np.arange(5.)
frame2.debt = 'NaN'
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2['eastern'] = frame2.state == 'Ohio'
del frame2['eastern']
frame2.columns
pop = {
    'Nevada': {
        2001: 2.4,
        2002: 2.9
    },
    'Ohio': {
        2000: 1.5,
        2001: 1.7,
Пример #13
0
# NOTE(review): `data` is defined outside this excerpt.
print(DataFrame(data))
print(DataFrame(data, columns=['year', 'state', 'pop']))  # designate order
print()

print("""## Designate index and column,
    if column not exist, set 'NaN' in this column:""")
frame2 = DataFrame(data,
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
print(frame2)
print(frame2['state'])
print(frame2.year)
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
print(frame2.loc['three'])
frame2['debt'] = 16.5  # modify the whole column using a scalar
print(frame2)
frame2.debt = np.arange(5)  # modify the whole column using an array
print(frame2)
print()

print("""## Use Series to designate the index and value for modification,
   the unspecified value set to 'NaN':""")
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
print()

print("## Assign value to new column:")
frame2['eastern'] = (frame2.state == 'Ohio')
print(frame2)
print(frame2.columns)
Пример #14
0
        'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}
# Python 2 `print` statements were replaced with the print() function so the
# snippet parses on Python 3.
print(DataFrame(data))
print(DataFrame(data, columns=['year', 'state', 'pop']))  # fix the column order
print()

# Printed message: "Specify the index; a column named in `columns` that does
# not exist is filled with NaN by default."
print('指定索引,在列中指定不存在的列,默认数据用NaN。')
frame2 = DataFrame(data,
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
print(frame2)
print(frame2['state'])
print(frame2.year)
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
print(frame2.loc['three'])
frame2['debt'] = 16.5  # overwrite the whole column with a scalar
print(frame2)
frame2.debt = np.arange(5)  # overwrite the column with a NumPy array
print(frame2)
print()

# Printed message: "Use a Series to give the index labels to modify and their
# values; labels that are not given default to NaN."
print('用Series指定要修改的索引及其对应的值,没有指定的默认数据用NaN。')
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
print()

# Printed message: "Assign to a new column."
print('赋值给新列')
frame2['eastern'] = (frame2.state == 'Ohio')  # True where state equals 'Ohio'
print(frame2)
print(frame2.columns)
print()
Пример #15
0
# five   2002  Nevada  2.9  NaN

print(frame2.year)
# one      2000
# two      2001
# three    2002
# four     2001
# five     2002

# Select one row by its index label. `.ix` was removed in pandas 1.0;
# `.loc` is the label-based replacement.
print(frame2.loc['three'])
# year     2002
# state    Ohio
# pop       3.6
# debt      NaN

# Assigning a scalar fills the whole 'debt' column.
frame2.debt = 16.5
print(frame2)
#        year   state  pop  debt
# one    2000    Ohio  1.5  16.5
# two    2001    Ohio  1.7  16.5
# three  2002    Ohio  3.6  16.5
# four   2001  Nevada  2.4  16.5
# five   2002  Nevada  2.9  16.5

# Assigning an array replaces the column element by element.
frame2.debt = np.arange(5)
print(frame2)
#        year   state  pop  debt
# one    2000    Ohio  1.5     0
# two    2001    Ohio  1.7     1
# three  2002    Ohio  3.6     2
# four   2001  Nevada  2.4     3
Пример #16
0
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9]
}
# Python 2 `print` statements were replaced with the print() function so the
# snippet parses on Python 3.
print(DataFrame(data))
print(DataFrame(data, columns=['year', 'state', 'pop']))  # fix the column order
print()

# Explicit columns (including 'debt') and explicit row labels.
frame2 = DataFrame(data,
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
print(frame2)
print(frame2['state'])
print(frame2.year)
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
print(frame2.loc['three'])
frame2['debt'] = 16.5  # overwrite the whole column with a scalar
print(frame2)
frame2.debt = np.arange(5)  # overwrite the column with a NumPy array
print(frame2)
print()

# Assigning a Series aligns on the index; uncovered labels become NaN.
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
print()

# A new column has to be created with bracket syntax.
frame2['eastern'] = (frame2.state == 'Ohio')
print(frame2)
print(frame2.columns)
print()

pop = {
    'Nevada': {
Пример #17
0
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9]
}

# NOTE(review): `data` is defined outside this excerpt (only its tail is
# visible above).
frame = DataFrame(data)
frame2 = DataFrame(data,
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
frame2['state']
frame2.values
frame2.year
# `.ix` was removed in pandas 1.0: use `.loc` for labels, `.iloc` for positions.
frame2.loc['three']
frame2.iloc[3]

frame2['debt'] = 16.5
# Kept for reference: a list whose length differs from the row count is rejected.
#frame2['debt'] = [16.5, 17, 26]
frame2.debt = np.arange(5.)
# Assigning a Series aligns on the index; uncovered labels become NaN.
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2.debt = val
# A new column has to be created (and deleted) with bracket syntax.
frame2['eastern'] = frame2.state == 'Ohio'
del frame2['eastern']

# dict of dicts into DataFrame
pop = {
    'Nevada': {
        2001: 2.5,
        2002: 2.9
    },
    'Ohio': {
        2000: 1.5,
        2001: 1.7,
        2002: 3.6