def stanton_optical_locations():
    raw_html = simple_get('https://www.stantonoptical.com/locations/')
    html = BeautifulSoup(raw_html, 'html.parser')
    state_list = []
    address_list = []
    city_state_zip_list = []
    for ptag in html.find_all('p'):
        for i, content in enumerate(ptag.contents):
            if content.string is not None:  # filter out the spurious results
                if i == 0:
                    state_list.append(content.string)
                elif i == 1:
                    address_list.append(content.string)
                    print(content)
                elif i == 3:
                    city_state_zip_list.append(content.string)
                    print(content)
                elif i > 3:
                    continue
    os.chdir(directory_where_you_want_to_save_the_new_file)
    address = pd.Series(address_list, name='Addresses')
    city = pd.Series(city_state_zip_list, name='City/State/Zip')
    df = pd.concat([address, city], axis=1)
    ## dictionary = {'Stanton Optical Address': address_list, 'Stanton Optical City/State/Zip': city_state_zip_list}
    ## df = pd.DataFrame.from_dict(dictionary)
    df.to_excel(stanton_file_name, index=False)
def cks(data, P=10, Q=9, X=1, high_col='High', low_col='Low',
        close_col='Close', vol_col='Volume', fillna=False):
    """Chande Kroll Stop: preliminary stops over P bars, smoothed over Q bars."""
    high = data[high_col]
    low = data[low_col]
    close = data[close_col]
    ATR = atr(data, P)
    phs_ = []
    pls_ = []
    for l in range(len(close)):
        sindex = max(l - P + 1, 0)
        xatr = X * ATR[l]
        maxv = max(high[sindex:l + 1])
        minv = min(low[sindex:l + 1])
        phs_.append(maxv - xatr)
        pls_.append(minv + xatr)
    ss_ = []
    ls_ = []
    for l in range(len(close)):
        sindex = max(l - Q + 1, 0)
        ss_.append(max(phs_[sindex:l + 1]))
        ls_.append(min(pls_[sindex:l + 1]))
    ss = pd.Series(ss_, name='ss')
    ls = pd.Series(ls_, name='ls')
    if fillna:
        ss = ss.replace([np.inf, -np.inf], np.nan).fillna(0)
        ls = ls.replace([np.inf, -np.inf], np.nan).fillna(0)
    return {'ss': ss, 'ls': ls}
def test_renameCols(self):
    nested_dict = {
        'dictA': {
            'key_1': 'value_1'
        },
        'dictB': {
            'key_2': 'value_2',
            'key_3': 'value_3'
        }
    }
    srs_1 = pd.Series({'dictA key_1': 'value_1'})
    srs_2 = pd.Series({'dictB key_2': 'value_2', 'dictB key_3': 'value_3'})
    # Series equality is element-wise, so compare with pandas' test helper
    # instead of assertEqual.
    # test with just one key/value pair
    pd.testing.assert_series_equal(rename_cols("dictA", nested_dict), srs_1)
    # test with multiple key/value pairs
    pd.testing.assert_series_equal(rename_cols("dictB", nested_dict), srs_2)
def find_capper(layerdict, dayperiod, typemode='Sub-Type'):
    maxalt = layerdict['mpl'].NRB[0].columns[-1]
    mplindex = layerdict['mpl'].NRB[0].index
    try:
        molalt = layerdict['molecular']['Layer0']['Base']
    except KeyError:
        molalt = pan.Series(data=maxalt, index=mplindex)
    try:
        layeralt = layerdict['layers']['Layer0']['Base']
        layertype = layerdict['layers']['Layer0'][typemode]
    except KeyError:
        layeralt = pan.Series(data=maxalt, index=mplindex)
        layertype = pan.Series(data=np.nan, index=mplindex)
    PBLalt = layerdict['pbl']
    molalt.fillna(maxalt, inplace=True)
    layeralt.fillna(maxalt, inplace=True)
    captype = pan.DataFrame(index=mplindex, columns=[dayperiod])
    clearcap = pan.DataFrame(index=mplindex, columns=[dayperiod])
    PBL = pan.DataFrame(index=mplindex, columns=[dayperiod])
    for i in captype.index:
        if layeralt.loc[i] < molalt.loc[i]:
            captype.loc[i] = layertype.loc[i]
            PBL.loc[i] = PBLalt.loc[i]
            clearcap.loc[i] = 'Other'
        else:
            captype.loc[i] = np.nan
            PBL.loc[i] = np.nan
            clearcap.loc[i] = 'Clear Air'
    return captype, clearcap, PBL
def zscoreVect(genes, expDat, tVals, ctt, cttVec):
    res = {}
    x = expDat.loc[cttVec == ctt, :]
    for gene in genes:
        xvals = x[gene]
        res[gene] = pd.Series(data=zscore(xvals, tVals[ctt]['mean'][gene],
                                          tVals[ctt]['sd'][gene]),
                              index=xvals.index.values)
    return res
def choppiness(data, tp=14, high_col='High', low_col='Low',
               close_col='Close', vol_col='Volume', fillna=False):
    high = data[high_col]
    low = data[low_col]
    ATR = atr(data, tp)
    cp_values = []
    for i in range(len(data)):
        if i < tp * 2:
            cp_values.append(0)
        else:
            nmrt = np.log10(np.sum(ATR[i - tp:i]) /
                            (max(high[i - tp:i]) - min(low[i - tp:i])))
            dnmnt = np.log10(tp)
            cp_values.append(round(100 * nmrt / dnmnt))
    CP = pd.Series(cp_values, name='cp')
    if fillna:
        CP = CP.replace([np.inf, -np.inf], np.nan).fillna(0)
    return CP
def zuliValue(x, y, v):
    highSum = 0
    totalSum = 0
    lag = len(y)
    base = np.log(lag + 1)
    price = pd.Series(y, index=x)
    price = price.sort_values()
    y = price.values
    x = list(price.index)
    for i in range(lag):
        if y[i] != y[-1]:
            tmp = v[i] * np.log(1.0 / np.abs(y[i] - y[-1]) * y[-1]) * (np.log(x[i] + 1) / base)
            totalSum += tmp
            if y[i] > y[-1]:
                highSum += tmp
    if totalSum != 0:
        result = highSum / totalSum
    else:
        result = 0
    return x[i], result
def ner_predict(model, x, word2id, label2id, max_len=None, do_word2id=True):
    # reverse mappings
    id2word = {id: word for word, id in word2id.items()}
    id2label = {id: label for label, id in label2id.items()}
    # maximum sequence length
    if max_len is None:
        max_len = max(map(lambda seq: len(seq), x))
    # normalise the input text
    if do_word2id:
        seqs = []
        word_list = []
        for seq in x:
            seq = list(seq)
            word_list.append(seq)
            seq = nn_lib.sentence2id(seq, word2id)
            seqs.append(seq)
        seqs = nn_lib.pad_sequences(seqs, max_len)
    else:
        seqs = x
        word_list = []
        for row in x:
            word_list.append(pd.Series(row).map(id2word).tolist())
    seqs = np.array(seqs)
    # predict labels
    label_id_list = model.infer([seqs])
    # pair each token with its predicted label
    corpus_labels = []
    for i in range(len(word_list)):
        corpus_label = []
        for j in range(len(word_list[i])):
            corpus_label.append((word_list[i][j], id2label[label_id_list[i][j]]))
        corpus_labels.append(corpus_label)
    return corpus_labels
def CEILING(p):
    # Compute the smallest integer greater than or equal to a / b.
    # TODO: make test suite
    a = p.arguments["a"].parent.data
    b = p.arguments["b"].parent.data
    c = pd.Series(a / b)
    result = np.ceil(c)
    p.arguments["result"] = result
def FLOOR(p):
    # Compute the largest integer less than or equal to a / b.
    # TODO: make test suite
    a = p.arguments["a"].parent.data
    b = p.arguments["b"].parent.data
    c = pd.Series(a / b)
    result = np.floor(c)
    p.arguments["result"] = result
def gradient_descent(data_points, b, m, learning_rate, number_of_iteration):
    n = len(data_points)
    array = []
    for i in range(number_of_iteration):
        predicted = np.dot(data_points, m)
        m = m - learning_rate / n * np.dot((predicted - value), data_points)
        output = compute_output(data_points, b, m)
        array.append(output)
    return pd.Series(array)
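# A minimal, self-contained sketch of the idea behind gradient_descent above:
# batch gradient descent for simple linear regression y ~ m*x + b. The names
# (learn_linear, x, y) and the hyperparameters are illustrative assumptions,
# not part of the original code.
import numpy as np
import pandas as pd

def learn_linear(x, y, b=0.0, m=0.0, learning_rate=0.01, n_iterations=1000):
    n = len(x)
    losses = []
    for _ in range(n_iterations):
        predicted = m * x + b
        error = predicted - y
        # gradients of the mean squared error with respect to m and b
        m -= learning_rate * (2.0 / n) * np.dot(error, x)
        b -= learning_rate * (2.0 / n) * np.sum(error)
        losses.append(np.mean(error ** 2))
    return b, m, pd.Series(losses, name='mse')

# example call on synthetic data:
# b, m, history = learn_linear(np.arange(10, dtype=float), 3 * np.arange(10) + 2)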
def read_csv(file):
    day_sum = []
    file_dir = os.path.split(file)[0]
    file_name = os.path.split(file)[1]
    new_file = os.path.join(file_dir, file_name[:-4] + '_new_file.csv')
    write_file = open(new_file, 'w', newline='')
    df = pd.read_csv(file, low_memory=False)
    header = list(df)
    area = np.array(df['F_AREA'])
    area_sum = np.sum(area)
    weight = area / area_sum
    ######################################################
    year = 2018
    if calendar.isleap(year):
        period = 366
    else:
        period = 365
    date = pd.date_range('01-01-' + str(year), periods=period)
    for i in range(3, 368):
        df[header[i]] = np.array(df[header[i]]) * weight
        day_sum.append(np.sum(df[header[i]]))
    df.insert(len(header), 'Weight', weight)
    header = list(df)
    if len(df) < len(day_sum):
        df1 = pd.DataFrame(index=list(range(len(df), len(day_sum))))
        df = pd.concat([df, df1])
        df = df.fillna('')
        df.insert(len(header), 'Date', date)
        header = list(df)
        df.insert(len(header), 'Sum', day_sum)
    else:
        df['Date'] = pd.Series(date)
        df['Sum'] = pd.Series(day_sum)
        df = df.fillna('')
    df.to_csv(write_file, index=False)
    write_file.close()
import pandas as pd
from scipy import stats
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

def get_outliers(s, eps=0.8, min_samples=5):
    '''DBSCAN to identify, visualise and remove outliers.'''
    try:
        dim = len(s.columns)
    except AttributeError:
        dim = 1
    s = s.dropna()
    x = s.values.reshape(len(s), dim)
    x = StandardScaler().fit_transform(x)
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    model = dbscan.fit(x)
    # True for every point that does not belong to the dominant cluster
    return pd.Series(model.labels_ != stats.mode(model.labels_).mode[0],
                     index=s.index)
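# Hedged usage sketch for get_outliers above: a one-dimensional Series with an
# obvious outlier. The data and the eps/min_samples values are illustrative,
# not tuned, and assume the imports added with the function are in scope.
import numpy as np
import pandas as pd

values = pd.Series(np.concatenate([np.random.normal(0, 1, 100), [15.0]]))
outlier_mask = get_outliers(values, eps=0.8, min_samples=5)
print(values[outlier_mask])  # points DBSCAN did not assign to the dominant cluster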
def assign_cluster_label(self, kdd_dataset):
    labels = kdd_dataset['label']
    label_names = list(
        map(
            lambda x: pandas.Series([
                labels[i] for i in range(len(self.km.labels_))
                if self.km.labels_[i] == x
            ]), range(self.n_cluster)))
    # val = ','.join(map(str, label_names))
    for i in range(self.n_cluster):
        print("cluster {} labels: ".format(i))
        print(label_names[i].value_counts())
        label_dict = label_names[i].value_counts().to_dict()
        val = max(label_dict, key=label_dict.get)
        print(type(val))
        self.label_cluster.append(val)
        print("cluster of : ", val)
def test_array_props_conform(self, array_props, meta_rel_df):
    for k, v in array_props.items():
        expected_type = schema_type_mapping[v["items"]["type"]]
        logger.info(f"{k}: {expected_type}")
        parent_is_not_list = (meta_rel_df[k].dropna().apply(
            lambda _: not isinstance(_, list)).pipe(lambda s: sum(s)))
        assert not bool(parent_is_not_list)
        series = pd.Series([
            _ for sub_list in meta_rel_df[k].dropna().tolist()
            for _ in sub_list
        ])
        logger.info(series)
        # special handling of nullable integer
        if v["type"] == "integer":
            series = series.astype(pd.Int64Dtype())
        type_not_conform = series.apply(
            lambda _: not isinstance(_, expected_type)).pipe(lambda s: sum(s))
        assert not bool(type_not_conform)
def load_wiki_corpus(path_data_in=None, path_data_out=None, word2vec=True):
    if path_data_in is None:
        corpus_path = path_nlp + r'zhwiki-latest-pages-articles.xml.bz2'
    else:
        corpus_path = path_data_in
    if path_data_out is None:
        if word2vec:
            corpus_processed_path = path_nlp + 'corpus_word2vec.txt'
        else:
            corpus_processed_path = path_nlp + 'corpus_doc2vec.txt'
    else:
        corpus_processed_path = path_data_out
    cc = OpenCC('t2s')
    count = 0
    with open(corpus_processed_path, 'w', encoding='utf-8') as corpus_processed:
        corpus = WikiCorpus(corpus_path, lemmatize=False, dictionary={})
        if word2vec:
            for doc in corpus.get_texts():
                doc_new = pd.Series(doc).apply(lambda x: ' '.join(
                    jieba.cut(cc.convert(x), cut_all=False)))
                corpus_processed.write(' '.join(doc_new) + "\n")
                count += 1
                if count % 100 == 0:
                    logging.warning('Saved ' + str(count) + ' articles')
                if flag_test and count == 1000:
                    return
        else:
            corpus.metadata = True
            for doc, (page_id, title) in corpus.get_texts():
                doc_new = TaggedDocument(
                    words=[word for sentence in doc
                           for word in jieba.cut(cc.convert(sentence))],
                    tags=[cc.convert(title)])
                corpus_processed.write(' '.join(doc_new[0]) + '\t' +
                                       '\t'.join(doc_new[1]) + "\n")
                count += 1
                if count % 100 == 0:
                    logging.warning('Saved ' + str(count) + ' articles')
                if flag_test and count == 1000:
                    return
    return
def tmrca_half(ts, pop_nodes, pop_ids, outfile):
    """Calculates the tmrca half fx from Hejase et al. 2020.

    "...test on the time to the most recent common ancestor of half the
    haploid samples from a given species (TMRCAH). Requiring only half the
    samples allows us to consider partial sweeps and provides robustness to
    the inherent uncertainty in the inferred local trees."

    Parameters
    ----------
    ts : Object
        object of type tskit tree sequence.
    pop_nodes : List
        population leaves as integers loaded from file.
    pop_ids : List
        id of population nodes to be written in DataFrame.
    outfile : str
        base name of DataFrame file output.

    Returns
    -------
    None.

    """
    df_list = []
    for pop, nodes in zip(pop_ids, pop_nodes):
        int1, int2, tmrcah_rel, time_rel, time_rel2 = calc_tmrcah(ts, nodes)
        # set up DataFrame
        df_pop = pd.DataFrame({
            "population": pd.Series([pop] * len(int1)),
            "tree_start": pd.Series(int1),
            "tree_end": pd.Series(int2),
            "tmrcah": pd.Series(tmrcah_rel),
            "time_rel": pd.Series(time_rel),
            "time_rel2": pd.Series(time_rel2)
        })
        df_list.append(df_pop)
    df_pop_combine = pd.concat(df_list).reset_index(drop=True)
    df_pop_combine.to_csv(f"{outfile}.tmrca_half.csv", na_rep="NAN", index=False)
def matrix(df, date, pollutant):
    df = df.dropna(subset=[pollutant], axis=0).reset_index()
    total_rows = len(df)
    A = np.empty([total_rows, total_rows])
    B = np.empty([total_rows])
    # populate the matrices A and B
    for i in range(total_rows):
        # c1, c2 ..., the receptor (final) location
        final = [df.loc[i, 'lon'], df.loc[i, 'lat'], df.loc[i, 'alt']]
        for j in range(total_rows):
            # q1, q2 ..., the source (initial) location
            init = [df.loc[j, 'lon'], df.loc[j, 'lat'], df.loc[j, 'alt']]
            coefficient = a(df.loc[j, 'uwnd'], df.loc[j, 'vwnd'],
                            init[0], init[1], init[2],
                            final[0], final[1], final[2])
            A.itemset((i, j), coefficient)
        B.itemset(i, df.loc[i, pollutant])
    # solve the linear system A X = B
    X = np.linalg.solve(A, B)
    # convert each row of A to percentage contributions
    for m in range(total_rows):
        A[m] = np.multiply(A[m], X)
        sum_m = np.sum(A[m])
        for n in range(total_rows):
            new_value = (A[m, n] / sum_m) * 100
            A.itemset((m, n), new_value)
    my_list = []
    # build one series per row, indexed by state name, and sum by state
    for row in range(total_rows):
        new_series = pd.Series(data=A[row], index=df.state)
        grouppy = new_series.groupby(new_series.index).sum()
        grouppy_dict = grouppy.to_dict()
        listty = [grouppy_dict, df.loc[row, 'index']]
        my_list.append(listty)
    # X is the solution, i.e. the concentration of sources at different places.
    # A holds the coefficients multiplying the sources, i.e. the weight of each source.
    return my_list
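# The core step of matrix() above is np.linalg.solve(A, B). A tiny standalone
# example of that call with made-up coefficients, just to show the shape of the
# inputs and the returned solution:
import numpy as np

A_demo = np.array([[3.0, 1.0],
                   [1.0, 2.0]])
B_demo = np.array([9.0, 8.0])
X_demo = np.linalg.solve(A_demo, B_demo)  # solves A_demo @ X_demo = B_demo
print(X_demo)                             # [2. 3.]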
def _get_value_counts(df, col):
    text = ' '.join(df[col])
    text = text.split()
    freq = pd.Series(text).value_counts()
    return freq
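# Hedged usage sketch for _get_value_counts above; the DataFrame and column
# name are made up for illustration.
import pandas as pd

docs = pd.DataFrame({'text': ['to be or not to be', 'to see or not to see']})
print(_get_value_counts(docs, 'text'))
# each whitespace token ('to', 'be', 'or', ...) appears with its count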
from pandas import DataFrame as df
from pandas import Series as series

details = {
    'name': ['osa', 'ire', 'ifa'],
    'age': [22, 33, 55],
    'location': ['Africa', 'Cuba', 'Brazil']
}
frame = df(details)
print(frame)

frame = df(details, columns=['name', 'location', 'age', 'salary'])
print(frame)

print(frame['location'])
print(frame.location)
print(frame.loc[1])

frame.salary = 5000
print(frame)

s = series([300, 400], index=[0, 1])
print(s)
frame.salary = s
print(frame)
import pandas as pd

grade = pd.Series([87, 100, 94])
print(grade)

array = pd.Series(98.6, range(3))
print(array)
import numpy as nm
import pandas as pd

dt = nm.array([1, 2, 3, 4])
s = pd.Series(dt, index=[10, 20, 30, 40])
print(s)
gdeathsxl = pd.concat([gdeathsxl] + [
    dfgdeaths[dfgdeaths.Country == country]
    for country in ['Russia', 'Turkey', 'Brazil', 'Chile', 'Colombia', 'Mexico', 'Peru']
])
gdeathsxl = gdeathsxl.drop('Population', axis=1)
gdeathsxl = gdeathsxl.reset_index(drop=True)

Locations = pd.Series([
    'US', 'Texas', 'Bexar', 'Harris', 'Dallas', 'Tarrant', 'Travis', 'Collin',
    'Hidalgo', 'El Paso', 'Alabama', 'Arizona', 'California', 'Colorado',
    'Conneticut', 'Florida', 'Georgia', 'Louisiana', 'Massachusetts', 'Nevada',
    'New Mexico', 'New York', 'Oklahoma', 'South Carolina', 'Washington',
    'China', 'Belgium', 'Canada', 'France', 'Germany', 'Italy', 'Japan',
    'Korea, South', 'Netherlands', 'Norway', 'Portugal', 'Spain', 'Sweden',
    'Switzerland', 'United Kingdom', 'Egypt', 'South Africa', 'India',
    'Indonesia', 'Iran', 'Philippines', 'Saudi Arabia', 'Singapore',
    'Thailand', 'Poland', 'Russia', 'Turkey', 'Brazil', 'Chile', 'Colombia',
    'Mexico', 'Peru'
])
gcasesxl.loc[:, 'Admin2'] = Locations
gdeathsxl.loc[:, 'Admin2'] = Locations
gcasesxl = gcasesxl.drop(['State', 'Country'], axis=1)
gdeathsxl = gdeathsxl.drop(['State', 'Country'], axis=1)
a = gcasesxl.melt(id_vars='Admin2')

import xlsxwriter
writer = pd.ExcelWriter('covid_tableau_.xlsx', engine='xlsxwriter')
import pandas as pd

data = [1, 2, 3, 4, 5]
a = pd.Series(data)
# # creating a DataFrame using a dictionary
# import pandas as pd
# dictionary = {'fruits': ['apples', 'banana', 'mangoes'], 'count': [10, 20, 15]}
# df = pd.DataFrame(dictionary)
# print(df)

# creating a DataFrame using a Series
import pandas as pd

series = pd.Series([6, 12], index=['a', 'b'])
df = pd.DataFrame(series)
print(df)

# # MERGE OPERATION
# import pandas as pd
# player = ['player1', 'player2', 'player3']
# point = [8, 5, 6]
# title = ['game1', 'game2', 'game3']
# df1 = pd.DataFrame({'Player': player, 'Points': point, 'Title': title})
import pandas as pd

grades = pd.Series([87, 100, 94])
print(grades)

same_grade = pd.Series(98.6, range(3))
print(same_grade)
# 0    98.6
# 1    98.6
# 2    98.6
# dtype: float64

print(grades[0])
grades.count()
grades.mean()
grades.min()
grades.max()
grades.std()
print(grades.describe())

# you can specify custom indices with the index keyword argument:
grades = pd.Series([87, 100, 94], index=['Wally', 'Eva', 'Sam'])
print(grades)

# if you initialize a Series with a dictionary, its keys become
# the Series' indices, and its values become the Series' element values
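# For example, a minimal sketch of that dictionary behaviour (the names and
# values here are illustrative):
grades_dict = {'Wally': 87, 'Eva': 100, 'Sam': 94}
grades_from_dict = pd.Series(grades_dict)
print(grades_from_dict)
# Wally     87
# Eva      100
# Sam       94
# dtype: int64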
Type "help", "copyright", "credits" or "license()" for more information. >>> list=[13,54,75] >>> import usermodule >>> print ("list=",usermodule.list) list= [13, 54, 75] >>> list.append(98) >>> print(list) [13, 54, 75, 98] >>> >>> >>> import pandas as pa >>> import numpy as nu >>> import sys >>> sys._stdout_=sys.stdout >>> fruit=nu.array(['pears','mango','kiwi']) >>> series=pa.series(fruit) series=pa.series(fruit) print(series) 0 pears 1 mango 2 kiwi >>> >>> >>> import random >>> print("random integer is :",random.randint(1,100)) random integer is : 42 >>> >>> >>> import sys >>> sys.path ['', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39\\Lib\\idlelib', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39\\python39.zip', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39\\DLLs', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39\\lib', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages']
print('\n corrmat index:', corrmat.index)

# # Feature Importance
"""Feature importance is an inbuilt attribute of tree-based regressors; here an
Extra Trees Regressor is used to extract the most important features of the
dataset.
"""
model = ExtraTreesRegressor()
model.fit(X, y)
print('\n Head of X:')
print(X.head())
print('\n feature importance:', model.feature_importances_)

# plot a graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(5).plot(kind='barh')
plt.show()

# # K Nearest Neighbor Regression
sns.distplot(y)
plt.show()

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                     random_state=0)

model = Sequential()
# The Input Layer:
model.add(Dense(128, kernel_initializer='normal', input_dim=X_train.shape[1],
                activation='relu'))
# The Hidden Layers:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sb

rcParams['figure.figsize'] = 8, 4
sb.set_style('whitegrid')

# cars dataset
mpg.plot(kind='hist')
plt.hist(mpg)
sb.distplot(mpg)

cars.plot(kind='scatter', x='hp', y='mpg', c=['darkgray'], s=150)
sb.regplot(x='hp', y='mpg', data=cars, scatter=True)
sb.pairplot(cars)  # scatterplot matrix

cars_df = pd.DataFrame(cars.iloc[:, [1, 3, 4, 6]].values,
                       columns=['mpg', 'disp', 'hp', 'wt'])
cars_target = cars.iloc[:, 9].values
target_names = [0, 1]
cars_df['group'] = pd.Series(cars_target, dtype='category')
sb.pairplot(cars_df, hue='group', palette='hls')

cars.boxplot(column='mpg', by='am')
cars.boxplot(column='wt', by='am')
sb.boxplot(x='am', y='mpg', data=cars, palette='hls')
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 2 18:35:36 2020

@author: Santosh
"""
# list to pandas conversion
import pandas as pd
import numpy as np

np_array = np.array([1, 2, 3, 4, 5])
print(np_array)
new = pd.Series(np_array)
print(new)

# pandas to list
new = pd.Series([1, 2, 3, 4])
print(new)
print(new.tolist())

# dictionary to pandas and to list
ds = {'a': 1, 'b': 2, 'c': 6, 'd': 7}
print(ds)
print(pd.Series(ds))
print(pd.Series(ds).tolist())
import pandas as pd
import numpy as np

df = pd.DataFrame(columns=('Columna1', 'Columna2', 'Columna3'))

df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9],
                                 [40, 50, 60], [23, 35, 37]]),
                  index=[2.5, 12.6, 4.8, 4.8, 2.5],
                  columns=[48, 49, 50])
# index supplies the row labels; columns gives the column names

# Creating a DataFrame from a dictionary or from pandas Series
data = {'Lenguaje': ['Python', 'C#', 'Java'],
        'Dificultad': ['Media', 'Alta', 'Muy Alta'],
        'Ejecucion': ['No compilado', 'Compilado', 'Compilado']}
df = pd.DataFrame(data)

# Starting from pandas Series
listado_lenguajes = ['Python', 'C#', 'Java']
d = {'Lenguaje': pd.Series(['Sin compilar', 'Compilado', 'Compilado'], index=listado_lenguajes),
     'Dificultad': pd.Series(['Media', 'Alta', 'Muy Alta'], index=listado_lenguajes)}
# If the indices did not coincide, they are merged and every index appears in the
# DataFrame's overall index; where a Series lacks an index label, the value in that
# column shows up as NaN (see the sketch after this block).
df = pd.DataFrame(d)

# So far we have created DataFrames in different ways. Remember that values imported
# from a csv with read_csv, from Excel with read_excel, or via SQL with pyodbc are
# presented to Python as DataFrames; we can also build a DataFrame from a csv by
# selecting only the data we want, creating a reader and picking the rows.

# Creating a DataFrame with selected columns -- here only Lenguaje and Dificultad
df = pd.DataFrame(data, columns=['Lenguaje', 'Dificultad'])

# New columns can be created like this:
df['Experiencia'] = 'variable asignar'  # the whole column gets the same value
# A column can also be conditional on another column of the DataFrame:
df['Rentabilidad'] = df['Salario'] > 35000
# Every row meeting that condition on the Salario column gets True in the new
# Rentabilidad column.
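# A small sketch of the index-merge behaviour described above: when the Series
# share only part of their indices, the DataFrame index is their union and the
# missing positions are filled with NaN. The names d2/df2 are illustrative.
d2 = {'Lenguaje': pd.Series(['Sin compilar', 'Compilado'], index=['Python', 'C#']),
      'Dificultad': pd.Series(['Media', 'Alta', 'Muy Alta'], index=['Python', 'C#', 'Java'])}
df2 = pd.DataFrame(d2)
print(df2)
#             Lenguaje Dificultad
# Python  Sin compilar      Media
# C#         Compilado       Alta
# Java             NaN   Muy Alta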
import pandas
import numpy

arr = numpy.array([10, 20, 30, 40, 50, 60])
series = pandas.Series(arr)
print(series)