def stanton_optical_locations():
    raw_html = simple_get('https://www.stantonoptical.com/locations/')
    html = BeautifulSoup(raw_html, 'html.parser')
    state_list = []
    address_list = []
    city_state_zip_list = []
    for ptag in html.find_all('p'):
        for i, content in enumerate(ptag.contents):
            if content.string is not None:  # filter out the spurious results
                if i == 0:
                    state_list.append(content.string)
                elif i == 1:
                    address_list.append(content.string)
                    print(content)
                elif i == 3:
                    city_state_zip_list.append(content.string)
                    print(content)
                elif i > 3:
                    continue
    os.chdir(directory_where_you_want_to_save_the_new_file)
    address = pd.Series(address_list, name='Addresses')
    city = pd.Series(city_state_zip_list, name='City/State/Zip')
    df = pd.concat([address, city], axis=1)
    ## dictionary = {'Stanton Optical Address': address_list, 'Stanton Optical City/State/Zip': city_state_zip_list}
    ## df = pd.DataFrame.from_dict(dictionary)
    df.to_excel(stanton_file_name, index=False)
def cks(data, P=10, Q=9, X=1, high_col='High', low_col='Low',
        close_col='Close', vol_col='Volume', fillna=False):
    """Chande Kroll Stop: preliminary stops over P bars, smoothed over Q bars."""
    high = data[high_col]
    low = data[low_col]
    close = data[close_col]
    ATR = atr(data, P)
    phs_ = []
    pls_ = []
    for l in range(len(close)):
        sindex = max(l - P + 1, 0)
        xatr = X * ATR[l]
        maxv = max(high[sindex:l + 1])
        minv = min(low[sindex:l + 1])
        phs_.append(maxv - xatr)
        pls_.append(minv + xatr)
    ss_ = []
    ls_ = []
    for l in range(len(close)):
        sindex = max(l - Q + 1, 0)
        ss_.append(max(phs_[sindex:l + 1]))
        ls_.append(min(pls_[sindex:l + 1]))
    ss = pd.Series(ss_, name='ss')
    ls = pd.Series(ls_, name='ls')
    if fillna:
        ss = ss.replace([np.inf, -np.inf], np.nan).fillna(0)
        ls = ls.replace([np.inf, -np.inf], np.nan).fillna(0)
    return {'ss': ss, 'ls': ls}
def test_renameCols(self):
    nested_dict = {
        'dictA': {
            'key_1': 'value_1'
        },
        'dictB': {
            'key_2': 'value_2',
            'key_3': 'value_3'
        }
    }
    srs_1 = pd.Series({'dictA key_1': 'value_1'})
    srs_2 = pd.Series({'dictB key_2': 'value_2', 'dictB key_3': 'value_3'})
    # Series equality is element-wise, so compare with pandas' test helper
    # instead of assertEqual.
    # test with just one key/value pair
    pd.testing.assert_series_equal(rename_cols("dictA", nested_dict), srs_1)
    # test with multiple key/value pairs
    pd.testing.assert_series_equal(rename_cols("dictB", nested_dict), srs_2)
def find_capper(layerdict, dayperiod, typemode='Sub-Type'):
    maxalt = layerdict['mpl'].NRB[0].columns[-1]
    mplindex = layerdict['mpl'].NRB[0].index
    try:
        molalt = layerdict['molecular']['Layer0']['Base']
    except KeyError:
        molalt = pan.Series(data=maxalt, index=mplindex)
    try:
        layeralt = layerdict['layers']['Layer0']['Base']
        layertype = layerdict['layers']['Layer0'][typemode]
    except KeyError:
        layeralt = pan.Series(data=maxalt, index=mplindex)
        layertype = pan.Series(data=np.nan, index=mplindex)
    PBLalt = layerdict['pbl']
    molalt.fillna(maxalt, inplace=True)
    layeralt.fillna(maxalt, inplace=True)
    captype = pan.DataFrame(index=mplindex, columns=[dayperiod])
    clearcap = pan.DataFrame(index=mplindex, columns=[dayperiod])
    PBL = pan.DataFrame(index=mplindex, columns=[dayperiod])
    for i in captype.index:
        if layeralt.loc[i] < molalt.loc[i]:
            captype.loc[i] = layertype.loc[i]
            PBL.loc[i] = PBLalt.loc[i]
            clearcap.loc[i] = 'Other'
        else:
            captype.loc[i] = np.nan
            PBL.loc[i] = np.nan
            clearcap.loc[i] = 'Clear Air'
    return captype, clearcap, PBL
def zscoreVect(genes, expDat, tVals, ctt, cttVec):
    res = {}
    x = expDat.loc[cttVec == ctt, :]
    for gene in genes:
        xvals = x[gene]
        res[gene] = pd.Series(data=zscore(xvals, tVals[ctt]['mean'][gene],
                                          tVals[ctt]['sd'][gene]),
                              index=xvals.index.values)
    return res
def choppiness(data, tp=14, high_col='High', low_col='Low',
               close_col='Close', vol_col='Volume', fillna=False):
    high = data[high_col]
    low = data[low_col]
    ATR = atr(data, tp)
    cp_values = []
    for i in range(len(data)):
        if i < tp * 2:
            cp_values.append(0)
        else:
            nmrt = np.log10(np.sum(ATR[i - tp:i]) /
                            (max(high[i - tp:i]) - min(low[i - tp:i])))
            dnmnt = np.log10(tp)
            cp_values.append(round(100 * nmrt / dnmnt))
    CP = pd.Series(cp_values, name='cp')
    if fillna:
        CP = CP.replace([np.inf, -np.inf], np.nan).fillna(0)
    return CP
def zuliValue(x, y, v):
    highSum = 0
    totalSum = 0
    lag = len(y)
    base = np.log(lag + 1)
    price = pd.Series(y, index=x)
    price = price.sort_values()
    y = price.values
    x = list(price.index)
    for i in range(lag):
        if y[i] != y[-1]:
            tmp = v[i] * np.log(1.0 / np.abs(y[i] - y[-1]) * y[-1]) * (np.log(x[i] + 1) / base)
            totalSum += tmp
            if y[i] > y[-1]:
                highSum += tmp
    if totalSum != 0:
        result = highSum / totalSum
    else:
        result = 0
    return x[i], result
def ner_predict(model, x, word2id, label2id, max_len=None, do_word2id=True):
    # reverse mappings
    id2word = {id: word for word, id in word2id.items()}
    id2label = {id: label for label, id in label2id.items()}
    # maximum sequence length
    if max_len is None:
        max_len = max(map(lambda seq: len(seq), x))
    # normalise the input text
    if do_word2id:
        seqs = []
        word_list = []
        for seq in x:
            seq = list(seq)
            word_list.append(seq)
            seq = nn_lib.sentence2id(seq, word2id)
            seqs.append(seq)
        seqs = nn_lib.pad_sequences(seqs, max_len)
    else:
        seqs = x
        word_list = []
        for row in x:
            word_list.append(pd.Series(row).map(id2word).tolist())
    seqs = np.array(seqs)
    # predict labels
    label_id_list = model.infer([seqs])
    # pair each token with its predicted label
    corpus_labels = []
    for i in range(len(word_list)):
        corpus_label = []
        for j in range(len(word_list[i])):
            corpus_label.append((word_list[i][j], id2label[label_id_list[i][j]]))
        corpus_labels.append(corpus_label)
    return corpus_labels
def CEILING(p):
    # Compute the smallest integer greater than or equal to a / b.
    # TODO: make test suite
    a = p.arguments["a"].parent.data
    b = p.arguments["b"].parent.data
    c = pd.Series(a / b)
    result = np.ceil(c)
    p.arguments["result"] = result
def FLOOR(p):
    # Compute the largest integer less than or equal to a / b.
    # TODO: make test suite
    a = p.arguments["a"].parent.data
    b = p.arguments["b"].parent.data
    c = pd.Series(a / b)
    result = np.floor(c)
    p.arguments["result"] = result
def gradient_descent(data_points, b, m, learning_rate, number_of_iteration):
    n = len(data_points)
    array = []
    for i in range(number_of_iteration):
        predicted = np.dot(data_points, m)
        m = m - learning_rate / n * np.dot((predicted - value), data_points)
        output = compute_output(data_points, b, m)
        array.append(output)
    return pd.Series(array)
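# A minimal, self-contained sketch of the idea behind gradient_descent above:
# batch gradient descent for simple linear regression y ~ m*x + b. The names
# (learn_linear, x, y) and the hyperparameters are illustrative assumptions,
# not part of the original code.
import numpy as np
import pandas as pd

def learn_linear(x, y, b=0.0, m=0.0, learning_rate=0.01, n_iterations=1000):
    n = len(x)
    losses = []
    for _ in range(n_iterations):
        predicted = m * x + b
        error = predicted - y
        # gradients of the mean squared error with respect to m and b
        m -= learning_rate * (2.0 / n) * np.dot(error, x)
        b -= learning_rate * (2.0 / n) * np.sum(error)
        losses.append(np.mean(error ** 2))
    return b, m, pd.Series(losses, name='mse')

# example call on synthetic data:
# b, m, history = learn_linear(np.arange(10, dtype=float), 3 * np.arange(10) + 2)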
def read_csv(file):
    day_sum = []
    file_dir = os.path.split(file)[0]
    file_name = os.path.split(file)[1]
    new_file = os.path.join(file_dir, file_name[:-4] + '_new_file.csv')
    write_file = open(new_file, 'w', newline='')
    df = pd.read_csv(file, low_memory=False)
    header = list(df)
    area = np.array(df['F_AREA'])
    area_sum = np.sum(area)
    weight = area / area_sum
    ######################################################
    year = 2018
    if calendar.isleap(year):
        period = 366
    else:
        period = 365
    date = pd.date_range('01-01-' + str(year), periods=period)
    for i in range(3, 368):
        df[header[i]] = np.array(df[header[i]]) * weight
        day_sum.append(np.sum(df[header[i]]))
    df.insert(len(header), 'Weight', weight)
    header = list(df)
    if len(df) < len(day_sum):
        df1 = pd.DataFrame(index=list(range(len(df), len(day_sum))))
        df = pd.concat([df, df1])
        df = df.fillna('')
        df.insert(len(header), 'Date', date)
        header = list(df)
        df.insert(len(header), 'Sum', day_sum)
    else:
        df['Date'] = pd.Series(date)
        df['Sum'] = pd.Series(day_sum)
        df = df.fillna('')
    df.to_csv(write_file, index=False)
    write_file.close()
import pandas as pd
from scipy import stats
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

def get_outliers(s, eps=0.8, min_samples=5):
    '''DBSCAN to identify, visualise and remove outliers.'''
    try:
        dim = len(s.columns)
    except AttributeError:
        dim = 1
    s = s.dropna()
    x = s.values.reshape(len(s), dim)
    x = StandardScaler().fit_transform(x)
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    model = dbscan.fit(x)
    # True for every point that does not belong to the dominant cluster
    return pd.Series(model.labels_ != stats.mode(model.labels_).mode[0],
                     index=s.index)
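# Hedged usage sketch for get_outliers above: a one-dimensional Series with an
# obvious outlier. The data and the eps/min_samples values are illustrative,
# not tuned, and assume the imports added with the function are in scope.
import numpy as np
import pandas as pd

values = pd.Series(np.concatenate([np.random.normal(0, 1, 100), [15.0]]))
outlier_mask = get_outliers(values, eps=0.8, min_samples=5)
print(values[outlier_mask])  # points DBSCAN did not assign to the dominant cluster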
def assign_cluster_label(self, kdd_dataset):
    labels = kdd_dataset['label']
    label_names = list(
        map(
            lambda x: pandas.Series([
                labels[i] for i in range(len(self.km.labels_))
                if self.km.labels_[i] == x
            ]), range(self.n_cluster)))
    # val = ','.join(map(str, label_names))
    for i in range(self.n_cluster):
        print("cluster {} labels: ".format(i))
        print(label_names[i].value_counts())
        label_dict = label_names[i].value_counts().to_dict()
        val = max(label_dict, key=label_dict.get)
        print(type(val))
        self.label_cluster.append(val)
        print("cluster of : ", val)
def test_array_props_conform(self, array_props, meta_rel_df):
    for k, v in array_props.items():
        expected_type = schema_type_mapping[v["items"]["type"]]
        logger.info(f"{k}: {expected_type}")
        parent_is_not_list = (meta_rel_df[k].dropna().apply(
            lambda _: not isinstance(_, list)).pipe(lambda s: sum(s)))
        assert not bool(parent_is_not_list)
        series = pd.Series([
            _ for sub_list in meta_rel_df[k].dropna().tolist()
            for _ in sub_list
        ])
        logger.info(series)
        # special handling of nullable integer
        if v["type"] == "integer":
            series = series.astype(pd.Int64Dtype())
        type_not_conform = series.apply(
            lambda _: not isinstance(_, expected_type)).pipe(lambda s: sum(s))
        assert not bool(type_not_conform)
def load_wiki_corpus(path_data_in=None, path_data_out=None, word2vec=True):
    if path_data_in is None:
        corpus_path = path_nlp + r'zhwiki-latest-pages-articles.xml.bz2'
    else:
        corpus_path = path_data_in
    if path_data_out is None:
        if word2vec:
            corpus_processed_path = path_nlp + 'corpus_word2vec.txt'
        else:
            corpus_processed_path = path_nlp + 'corpus_doc2vec.txt'
    else:
        corpus_processed_path = path_data_out
    cc = OpenCC('t2s')
    count = 0
    with open(corpus_processed_path, 'w', encoding='utf-8') as corpus_processed:
        corpus = WikiCorpus(corpus_path, lemmatize=False, dictionary={})
        if word2vec:
            for doc in corpus.get_texts():
                doc_new = pd.Series(doc).apply(lambda x: ' '.join(
                    jieba.cut(cc.convert(x), cut_all=False)))
                corpus_processed.write(' '.join(doc_new) + "\n")
                count += 1
                if count % 100 == 0:
                    logging.warning('Saved ' + str(count) + ' articles')
                if flag_test and count == 1000:
                    return
        else:
            corpus.metadata = True
            for doc, (page_id, title) in corpus.get_texts():
                doc_new = TaggedDocument(
                    words=[word for sentence in doc
                           for word in jieba.cut(cc.convert(sentence))],
                    tags=[cc.convert(title)])
                corpus_processed.write(' '.join(doc_new[0]) + '\t' +
                                       '\t'.join(doc_new[1]) + "\n")
                count += 1
                if count % 100 == 0:
                    logging.warning('Saved ' + str(count) + ' articles')
                if flag_test and count == 1000:
                    return
    return
def tmrca_half(ts, pop_nodes, pop_ids, outfile):
    """Calculates the tmrca half fx from Hejase et al. 2020.

    "...test on the time to the most recent common ancestor of half the
    haploid samples from a given species (TMRCAH). Requiring only half the
    samples allows us to consider partial sweeps and provides robustness to
    the inherent uncertainty in the inferred local trees."

    Parameters
    ----------
    ts : Object
        object of type tskit tree sequence.
    pop_nodes : List
        population leaves as integers loaded from file.
    pop_ids : List
        id of population nodes to be written in DataFrame.
    outfile : str
        base name of DataFrame file output.

    Returns
    -------
    None.

    """
    df_list = []
    for pop, nodes in zip(pop_ids, pop_nodes):
        int1, int2, tmrcah_rel, time_rel, time_rel2 = calc_tmrcah(ts, nodes)
        # set up DataFrame
        df_pop = pd.DataFrame({
            "population": pd.Series([pop] * len(int1)),
            "tree_start": pd.Series(int1),
            "tree_end": pd.Series(int2),
            "tmrcah": pd.Series(tmrcah_rel),
            "time_rel": pd.Series(time_rel),
            "time_rel2": pd.Series(time_rel2)
        })
        df_list.append(df_pop)
    df_pop_combine = pd.concat(df_list).reset_index(drop=True)
    df_pop_combine.to_csv(f"{outfile}.tmrca_half.csv", na_rep="NAN", index=False)
def matrix(df, date, pollutant):
    df = df.dropna(subset=[pollutant], axis=0).reset_index()
    total_rows = len(df)
    A = np.empty([total_rows, total_rows])
    B = np.empty([total_rows])
    # populate the matrices A and B
    for i in range(total_rows):
        # c1, c2 ..., the receptor (final) location
        final = [df.loc[i, 'lon'], df.loc[i, 'lat'], df.loc[i, 'alt']]
        for j in range(total_rows):
            # q1, q2 ..., the source (initial) location
            init = [df.loc[j, 'lon'], df.loc[j, 'lat'], df.loc[j, 'alt']]
            coefficient = a(df.loc[j, 'uwnd'], df.loc[j, 'vwnd'],
                            init[0], init[1], init[2],
                            final[0], final[1], final[2])
            A.itemset((i, j), coefficient)
        B.itemset(i, df.loc[i, pollutant])
    # solve the linear system A X = B
    X = np.linalg.solve(A, B)
    # convert each row of A to percentage contributions
    for m in range(total_rows):
        A[m] = np.multiply(A[m], X)
        sum_m = np.sum(A[m])
        for n in range(total_rows):
            new_value = (A[m, n] / sum_m) * 100
            A.itemset((m, n), new_value)
    my_list = []
    # build one series per row, indexed by state name, and sum by state
    for row in range(total_rows):
        new_series = pd.Series(data=A[row], index=df.state)
        grouppy = new_series.groupby(new_series.index).sum()
        grouppy_dict = grouppy.to_dict()
        listty = [grouppy_dict, df.loc[row, 'index']]
        my_list.append(listty)
    # X is the solution, i.e. the concentration of sources at different places.
    # A holds the coefficients multiplying the sources, i.e. the weight of each source.
    return my_list
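# The core step of matrix() above is np.linalg.solve(A, B). A tiny standalone
# example of that call with made-up coefficients, just to show the shape of the
# inputs and the returned solution:
import numpy as np

A_demo = np.array([[3.0, 1.0],
                   [1.0, 2.0]])
B_demo = np.array([9.0, 8.0])
X_demo = np.linalg.solve(A_demo, B_demo)  # solves A_demo @ X_demo = B_demo
print(X_demo)                             # [2. 3.]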
def _get_value_counts(df, col):
    text = ' '.join(df[col])
    text = text.split()
    freq = pd.Series(text).value_counts()
    return freq
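# Hedged usage sketch for _get_value_counts above; the DataFrame and column
# name are made up for illustration.
import pandas as pd

docs = pd.DataFrame({'text': ['to be or not to be', 'to see or not to see']})
print(_get_value_counts(docs, 'text'))
# each whitespace token ('to', 'be', 'or', ...) appears with its count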
from pandas import DataFrame as df
from pandas import Series as series

details = {
    'name': ['osa', 'ire', 'ifa'],
    'age': [22, 33, 55],
    'location': ['Africa', 'Cuba', 'Brazil']
}
frame = df(details)
print(frame)

frame = df(details, columns=['name', 'location', 'age', 'salary'])
print(frame)

print(frame['location'])
print(frame.location)
print(frame.loc[1])

frame.salary = 5000
print(frame)

s = series([300, 400], index=[0, 1])
print(s)
frame.salary = s
print(frame)
import pandas as pd

grade = pd.Series([87, 100, 94])
print(grade)

array = pd.Series(98.6, range(3))
print(array)
import numpy as nm
import pandas as pd

dt = nm.array([1, 2, 3, 4])
s = pd.Series(dt, index=[10, 20, 30, 40])
print(s)
gdeathsxl = pd.concat([gdeathsxl] + [
    dfgdeaths[dfgdeaths.Country == country]
    for country in ['Russia', 'Turkey', 'Brazil', 'Chile', 'Colombia', 'Mexico', 'Peru']
])
gdeathsxl = gdeathsxl.drop('Population', axis=1)
gdeathsxl = gdeathsxl.reset_index(drop=True)

Locations = pd.Series([
    'US', 'Texas', 'Bexar', 'Harris', 'Dallas', 'Tarrant', 'Travis', 'Collin',
    'Hidalgo', 'El Paso', 'Alabama', 'Arizona', 'California', 'Colorado',
    'Conneticut', 'Florida', 'Georgia', 'Louisiana', 'Massachusetts', 'Nevada',
    'New Mexico', 'New York', 'Oklahoma', 'South Carolina', 'Washington',
    'China', 'Belgium', 'Canada', 'France', 'Germany', 'Italy', 'Japan',
    'Korea, South', 'Netherlands', 'Norway', 'Portugal', 'Spain', 'Sweden',
    'Switzerland', 'United Kingdom', 'Egypt', 'South Africa', 'India',
    'Indonesia', 'Iran', 'Philippines', 'Saudi Arabia', 'Singapore',
    'Thailand', 'Poland', 'Russia', 'Turkey', 'Brazil', 'Chile', 'Colombia',
    'Mexico', 'Peru'
])
gcasesxl.loc[:, 'Admin2'] = Locations
gdeathsxl.loc[:, 'Admin2'] = Locations
gcasesxl = gcasesxl.drop(['State', 'Country'], axis=1)
gdeathsxl = gdeathsxl.drop(['State', 'Country'], axis=1)
a = gcasesxl.melt(id_vars='Admin2')

import xlsxwriter
writer = pd.ExcelWriter('covid_tableau_.xlsx', engine='xlsxwriter')
import pandas as pd

data = [1, 2, 3, 4, 5]
a = pd.Series(data)
# # creating a DataFrame using a dictionary
# import pandas as pd
# dictionary = {'fruits': ['apples', 'banana', 'mangoes'], 'count': [10, 20, 15]}
# df = pd.DataFrame(dictionary)
# print(df)

# creating a DataFrame using a Series
import pandas as pd

series = pd.Series([6, 12], index=['a', 'b'])
df = pd.DataFrame(series)
print(df)

# # MERGE OPERATION
# import pandas as pd
# player = ['player1', 'player2', 'player3']
# point = [8, 5, 6]
# title = ['game1', 'game2', 'game3']
# df1 = pd.DataFrame({'Player': player, 'Points': point, 'Title': title})
import pandas as pd

grades = pd.Series([87, 100, 94])
print(grades)

same_grade = pd.Series(98.6, range(3))
print(same_grade)
# 0    98.6
# 1    98.6
# 2    98.6
# dtype: float64

print(grades[0])
grades.count()
grades.mean()
grades.min()
grades.max()
grades.std()
print(grades.describe())

# you can specify custom indices with the index keyword argument:
grades = pd.Series([87, 100, 94], index=['Wally', 'Eva', 'Sam'])
print(grades)

# if you initialize a Series with a dictionary, its keys become
# the Series' indices, and its values become the Series' element values
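# For example, a minimal sketch of that dictionary behaviour (the names and
# values here are illustrative):
grades_dict = {'Wally': 87, 'Eva': 100, 'Sam': 94}
grades_from_dict = pd.Series(grades_dict)
print(grades_from_dict)
# Wally     87
# Eva      100
# Sam       94
# dtype: int64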
Type "help", "copyright", "credits" or "license()" for more information. >>> list=[13,54,75] >>> import usermodule >>> print ("list=",usermodule.list) list= [13, 54, 75] >>> list.append(98) >>> print(list) [13, 54, 75, 98] >>> >>> >>> import pandas as pa >>> import numpy as nu >>> import sys >>> sys._stdout_=sys.stdout >>> fruit=nu.array(['pears','mango','kiwi']) >>> series=pa.series(fruit) series=pa.series(fruit) print(series) 0 pears 1 mango 2 kiwi >>> >>> >>> import random >>> print("random integer is :",random.randint(1,100)) random integer is : 42 >>> >>> >>> import sys >>> sys.path ['', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39\\Lib\\idlelib', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39\\python39.zip', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39\\DLLs', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39\\lib', 'C:\\Users\\sharo\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages']
print('\n corrmat index:', corrmat.index)

# # Feature Importance
"""Feature importance is an inbuilt attribute of tree-based regressors; here an
Extra Trees Regressor is used to extract the most important features of the
dataset.
"""
model = ExtraTreesRegressor()
model.fit(X, y)
print('\n Head of X:')
print(X.head())
print('\n feature importance:', model.feature_importances_)

# plot a graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(5).plot(kind='barh')
plt.show()

# # K Nearest Neighbor Regression
sns.distplot(y)
plt.show()

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                     random_state=0)

model = Sequential()
# The Input Layer:
model.add(Dense(128, kernel_initializer='normal', input_dim=X_train.shape[1],
                activation='relu'))
# The Hidden Layers:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sb

rcParams['figure.figsize'] = 8, 4
sb.set_style('whitegrid')

# cars dataset
mpg.plot(kind='hist')
plt.hist(mpg)
sb.distplot(mpg)

cars.plot(kind='scatter', x='hp', y='mpg', c=['darkgray'], s=150)
sb.regplot(x='hp', y='mpg', data=cars, scatter=True)
sb.pairplot(cars)  # scatterplot matrix

cars_df = pd.DataFrame(cars.iloc[:, [1, 3, 4, 6]].values,
                       columns=['mpg', 'disp', 'hp', 'wt'])
cars_target = cars.iloc[:, 9].values
target_names = [0, 1]
cars_df['group'] = pd.Series(cars_target, dtype='category')
sb.pairplot(cars_df, hue='group', palette='hls')

cars.boxplot(column='mpg', by='am')
cars.boxplot(column='wt', by='am')
sb.boxplot(x='am', y='mpg', data=cars, palette='hls')
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 2 18:35:36 2020

@author: Santosh
"""
# list to pandas conversion
import pandas as pd
import numpy as np

np_array = np.array([1, 2, 3, 4, 5])
print(np_array)
new = pd.Series(np_array)
print(new)

# pandas to list
new = pd.Series([1, 2, 3, 4])
print(new)
print(new.tolist())

# dictionary to pandas and to list
ds = {'a': 1, 'b': 2, 'c': 6, 'd': 7}
print(ds)
print(pd.Series(ds))
print(pd.Series(ds).tolist())
import pandas as pd
import numpy as np

df = pd.DataFrame(columns=('Columna1', 'Columna2', 'Columna3'))

df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9],
                                 [40, 50, 60], [23, 35, 37]]),
                  index=[2.5, 12.6, 4.8, 4.8, 2.5],
                  columns=[48, 49, 50])
# index supplies the row labels; columns gives the column names

# Creating a DataFrame from a dictionary or from pandas Series
data = {'Lenguaje': ['Python', 'C#', 'Java'],
        'Dificultad': ['Media', 'Alta', 'Muy Alta'],
        'Ejecucion': ['No compilado', 'Compilado', 'Compilado']}
df = pd.DataFrame(data)

# Starting from pandas Series
listado_lenguajes = ['Python', 'C#', 'Java']
d = {'Lenguaje': pd.Series(['Sin compilar', 'Compilado', 'Compilado'], index=listado_lenguajes),
     'Dificultad': pd.Series(['Media', 'Alta', 'Muy Alta'], index=listado_lenguajes)}
# If the indices did not coincide, they are merged and every index appears in the
# DataFrame's overall index; where a Series lacks an index label, the value in that
# column shows up as NaN (see the sketch after this block).
df = pd.DataFrame(d)

# So far we have created DataFrames in different ways. Remember that values imported
# from a csv with read_csv, from Excel with read_excel, or via SQL with pyodbc are
# presented to Python as DataFrames; we can also build a DataFrame from a csv by
# selecting only the data we want, creating a reader and picking the rows.

# Creating a DataFrame with selected columns -- here only Lenguaje and Dificultad
df = pd.DataFrame(data, columns=['Lenguaje', 'Dificultad'])

# New columns can be created like this:
df['Experiencia'] = 'variable asignar'  # the whole column gets the same value
# A column can also be conditional on another column of the DataFrame:
df['Rentabilidad'] = df['Salario'] > 35000
# Every row meeting that condition on the Salario column gets True in the new
# Rentabilidad column.
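# A small sketch of the index-merge behaviour described above: when the Series
# share only part of their indices, the DataFrame index is their union and the
# missing positions are filled with NaN. The names d2/df2 are illustrative.
d2 = {'Lenguaje': pd.Series(['Sin compilar', 'Compilado'], index=['Python', 'C#']),
      'Dificultad': pd.Series(['Media', 'Alta', 'Muy Alta'], index=['Python', 'C#', 'Java'])}
df2 = pd.DataFrame(d2)
print(df2)
#             Lenguaje Dificultad
# Python  Sin compilar      Media
# C#         Compilado       Alta
# Java             NaN   Muy Alta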
import pandas
import numpy

arr = numpy.array([10, 20, 30, 40, 50, 60])
series = pandas.Series(arr)
print(series)