def retweets_vs_likes(data):
    """Overlay the likes and retweets time series on one figure.

    Assumes ``data`` is a DataFrame with 'Likes', 'RTs' and 'Date'
    columns (presumably one row per tweet — confirm against caller).
    """
    likes_over_time = pd.Series(data=data['Likes'].values, index=data['Date'])
    retweets_over_time = pd.Series(data=data['RTs'].values, index=data['Date'])
    # Plot both on the same 16x4 axes so the two curves can be compared
    # directly; legend=True labels each line.
    likes_over_time.plot(figsize=(16, 4), label="Likes", legend=True)
    retweets_over_time.plot(figsize=(16, 4), label="Retweets", legend=True)
    plt.show()
def nombre_tweets_candidates(data, candidate1, candidate2):
    """Plot likes over time for tweets mentioning each of two candidates.

    Adds two boolean columns to ``data`` ('Has_candidate1',
    'Has_candidate2') flagging, per tweet, whether the candidate's name
    appears in the tweet text, then plots the likes of the matching
    tweets against their dates.

    Assumes ``data`` has 'tweet_textual_content', 'Likes' and 'Date'
    columns.
    """
    # Per-row substring test. The original used
    # `candidate1 in data['tweet_textual_content']`, which tests
    # membership against the Series as a whole (its index), not against
    # each tweet's text — every row ended up with the same value.
    data['Has_candidate1'] = data['tweet_textual_content'].str.contains(candidate1)
    data['Has_candidate2'] = data['tweet_textual_content'].str.contains(candidate2)

    # Build (date -> likes) series from the matching rows only. The
    # original passed index=data['Date'] (the FULL date column) while the
    # data was the filtered Likes Series, which reindexed and misaligned
    # the values; use the filtered rows' own dates instead.
    sub1 = data[data['Has_candidate1']]
    sub2 = data[data['Has_candidate2']]
    tcandidate1 = pd.Series(sub1['Likes'].values, index=sub1['Date'])
    tcandidate2 = pd.Series(sub2['Likes'].values, index=sub2['Date'])

    # Visualization: both candidates on one 16x4 figure.
    tcandidate1.plot(figsize=(16, 4), label="Candidate1", legend=True)
    tcandidate2.plot(figsize=(16, 4), label="Candidate2", legend=True)
    plt.show()
def generate_Z_msr_org(numOfBuses, numOfLines, bus_data_df, topo_mat, file_name):
    """Build the original measurement vector for state estimation and
    append it to an existing Excel workbook.

    Parameters
    ----------
    numOfBuses : int
        Number of buses (kept for interface compatibility; not used here).
    numOfLines : int
        Number of lines; used to locate the bus-power rows in ``topo_mat``.
    bus_data_df : pandas.DataFrame
        Must contain 'Remote controlled bus number', 'Load MW' and
        'Generation MW' columns.
    topo_mat : pandas.DataFrame
        Topology matrix; row 0 is presumably the reference bus and rows
        from ``numOfLines * 2 + 1`` onward are bus-power rows — TODO
        confirm against the routine that builds it.
    file_name : str
        Path to an EXISTING .xlsx workbook to append sheets to.

    Returns
    -------
    (Z_msr_org, bus_data) : (pandas.DataFrame, pandas.DataFrame)
    """
    import pandas as pd
    import numpy as np

    # Creating measurement data to run state estimation.
    # Explicit .copy(): the original mutated a column slice of the
    # caller's frame (chained assignment / SettingWithCopy hazard).
    bus_data = bus_data_df[[
        'Remote controlled bus number', 'Load MW', 'Generation MW'
    ]].copy()
    bus_data.columns = ['Bus number', 'Load', 'Generation']

    # Lossless DC system: total generation must balance total load.
    correction_load = sum(bus_data['Load']) - sum(bus_data['Generation'])
    print("correction_load: ", correction_load)

    # Fold the mismatch into the largest generator. Single .loc on the
    # frame (row, column) instead of the original
    # bus_data['Generation'].loc[...] chained assignment.
    bus_data.loc[bus_data['Generation'].idxmax(), 'Generation'] += correction_load

    # Bus injection = generation - load.
    bus_data['Bus Power'] = bus_data['Generation'] - bus_data['Load']
    print("bus_data:\n", bus_data.head())

    # Pad a 0 at the top for the reference bus.
    Z_data_bus_power = pd.DataFrame(
        pd.concat([pd.Series([0]), bus_data['Bus Power']]))

    # Topology-matrix rows for the reference bus plus the bus-power rows.
    B_mat_bus_power = pd.concat(
        [topo_mat.loc[0:0], topo_mat.loc[numOfLines * 2 + 1:]])

    # Least-squares state estimate from the bus-power data.
    state_original = np.linalg.pinv(B_mat_bus_power) @ Z_data_bus_power

    # Full measurement vector implied by those states.
    Z_msr_org = topo_mat @ state_original
    Z_msr_org.columns = ['Data']

    # Append sheets to the existing workbook. mode='a' with
    # if_sheet_exists='replace' supersedes the deprecated
    # `writer.book = load_workbook(...)` pattern, and the context
    # manager replaces writer.save(), which was removed from pandas.
    with pd.ExcelWriter(file_name, engine='openpyxl', mode='a',
                        if_sheet_exists='replace') as writer:
        Z_msr_org.to_excel(writer, sheet_name="Measurement Data", index=False)
        bus_data.to_excel(writer, sheet_name="Bus Data", index=False)
    # saving complete!

    print("Z_msr_org:\n", Z_msr_org.head())
    return Z_msr_org, bus_data
df.head() # shows only the head list df.tail() # df.values df["temperature"] df["day"].head() df["temperature"] > 20 # shows all the temperatures below 20 degrees from the temperatures.csv file df["temperature"] < 0 # shows all the temperatures below 0 degrees from the file df_cool = df[df["temperature"] < 0] df_cool.head() df_cool.to_csv("cool.csv") # saves a new csv file named cool.csv #save stuff into a new file df["temperature"].mean() # this shows the average df["temperature"].max() # this shows the maximum temperature df["temperature"].min() # this shows the minimum temperature df["temperature"].value_counts( ) # it counts how many times a certain value occurs in a row/list df["temperature"].value_counts().head() snacks = pd.Series(["Mars", "Twix", "Oreo"]) snacks.value_counts() df["temperature"].plot() # shows a table of content thing (grafiek)
# Demo: element-wise addition of two pandas Series.
# Fixed: the package name is 'pandas', not 'panda' (the original import
# raised ModuleNotFoundError).
import pandas as pd

ps = pd.Series([1, 2, 3, 4, 5])
ps1 = pd.Series([5, 6, 7, 8, 9])
# Addition aligns on the index, so with identical default indexes this
# is element-wise: [6, 8, 10, 12, 14]. Bound to a name so the result is
# inspectable (the original evaluated the bare expression and discarded it).
result = ps + ps1
# pandas cheat-sheet: MultiIndex, groupby and Series basics
# (`df` and `my_list` are assumed defined elsewhere).
# Fixed: the 'labels' keyword was renamed to 'codes' in pandas 0.24 and
# removed in 1.0.
MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
# set index names
df.index.names  # check how many names you need
df.index.names = ['Group', 'Num']
# select by the first index level
df.xs('G1')
df.xs(['G1', 1])
df.xs(1, level='Num')
# GROUP BY function
df.groupby('Company')  # then aggregate with .mean(), .std(), ...
# Create a Series in pandas - a Series is very similar to a NumPy array,
# but it can be indexed by labels instead of just integer positions, and
# it can hold arbitrary Python objects, not only numeric data.
pd.Series(data=my_list)  # set labels with index=labels
# Series support normal element-wise operations just like matrices.
# Operations to find unique values.
# Fixed: .unique / .nunique were missing the call parentheses — the bare
# attribute returns the bound method, not the values the comments claimed.
df['ColName'].unique()  # returns all unique values
df['ColName'].nunique()  # returns the number of unique values
df['col2'].value_counts()  # returns the count of each unique value
# applying a function
def times2(x):
    return x * 2
df['col1'].apply(times2)
df['col3'].apply(len)  # length of each element
# Get column and index names:
df.columns  # returns the columns
df.head()  # returns the top 5 rows
>>> df.apply(np.cumsum) A B C D 2013-01-01 0.070158 0.629035 0.199517 -0.157134 2013-01-02 -0.587144 -0.100147 1.749539 -0.137846 2013-01-03 0.380278 0.125308 2.642619 -0.686718 2013-01-04 1.600488 1.067800 2.316169 -0.986318 2013-01-05 1.730403 -0.289345 0.987189 -0.602161 2013-01-06 2.386028 -0.005460 2.269278 -1.798823 >>> df.apply(lambda x: x.max() - x.min()) A 1.877512 B 2.299638 C 2.879002 D 1.580820 dtype: float64 >>> #HISTOGRAMMING >>> s=pd.Series(np.random.randint(0,7,size=10)) >>> s 0 4 1 4 2 2 3 4 4 1 5 6 6 2 7 2 8 1 9 1 dtype: int32 >>> s.value_counts() 4 3 2 3