def save_num_cat(project_relative_root_path, local_project):
    """Merge the numeric and categorical Qualtrics exports into one CSV.

    Reads <project>_0_num.xlsx and <project>_0_cat.xlsx, drops the Qualtrics
    bookkeeping columns, removes the question-text row, suffixes every
    categorical column with '_cat' and saves the side-by-side concatenation
    as <project>_0_all.csv.
    """
    url = project_relative_root_path + local_project + '_0_num.xlsx'
    df_num = pd.read_excel(url)
    url = project_relative_root_path + local_project + '_0_cat.xlsx'
    df_cat = pd.read_excel(url)
    # Qualtrics bookkeeping columns that carry no survey answers.
    delete_columns = [
        'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
        'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
        'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
        'UserLanguage', 'RecordedDate', 'ResponseId', 'Progress',
        'Duration (in seconds)', 'Finished'
    ]
    # 'columns=' already selects the axis; the redundant 'axis=1' was dropped.
    df_num = df_num.drop(columns=delete_columns)
    df_cat = df_cat.drop(columns=delete_columns)
    # First data row holds the question text, not an answer — discard it.
    df_num = df_num.iloc[1:, ]
    df_cat = df_cat.iloc[1:, ]
    # Suffix all categorical columns in one call instead of renaming
    # column-by-column with inplace=True inside a loop.
    df_cat = df_cat.add_suffix('_cat')
    df_all = pd.concat([df_num, df_cat], sort=False, axis=1)
    url = project_relative_root_path + local_project + '_0_all.csv'
    print(f'url (save 0_all.csv): {url}')
    df_all.to_csv(url, index=False, encoding='utf-8')
    print(df_all.shape)
def getData(fileName, sheetName):
    """Read an Excel workbook and return it as a DataFrame.

    Args:
        fileName: path of the workbook to read.
        sheetName: sheet to load; when None the first sheet is used.
    """
    # BUG FIX: None comparison must use identity, not '!=' (PEP 8).
    if sheetName is not None:
        df: pd.DataFrame = pd.read_excel(fileName, sheet_name=sheetName)
    else:
        df: pd.DataFrame = pd.read_excel(fileName)
    return df
def save_num_cat(project_relative_root_path, local_project):
    """Load the numeric and categorical exports and drop bookkeeping columns.

    BUG FIXES: the raw exports name the duration column
    'Duration (in seconds)', so that is the label that must be dropped
    (dropping 'Duration' raised KeyError before any rename happened), and the
    trailing rename on an undefined ``df`` was removed.
    Note: like the original, this revision keeps the frames local and returns
    nothing — it appears to be work in progress.
    """
    # Qualtrics bookkeeping columns that carry no survey answers.
    delete_columns = [
        'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
        'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
        'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
        'UserLanguage', 'RecordedDate', 'ResponseId', 'Progress',
        'Duration (in seconds)', 'Finished'
    ]
    url = project_relative_root_path + local_project + '_0_num.xlsx'
    df_num = pd.read_excel(url)
    df_num = df_num.drop(columns=delete_columns)
    url = project_relative_root_path + local_project + '_0_cat.xlsx'
    df_cat = pd.read_excel(url)
    df_cat = df_cat.drop(columns=delete_columns)
def save_relevant_features(project_relative_root_path, local_project):
    """Strip Qualtrics bookkeeping columns and persist the remaining features.

    Returns a tuple: the cleaned frame (question row still present) and the
    frame without the question row, which is also written to
    <project><DATA_ORIGINAL_SUFFIX> as CSV.
    """
    source_url = project_relative_root_path + local_project + DATA_SOURCE_SUFFIX
    print(f'url (original) : {source_url}')
    df = pd.read_excel(source_url)
    bookkeeping_columns = [
        'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
        'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
        'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
        'UserLanguage', 'RecordedDate', 'ResponseId',
    ]
    df = df.drop(columns=bookkeeping_columns, axis=1)
    # rename - Duration (in seconds)
    df.rename(columns={'Duration (in seconds)': 'Duration'}, inplace=True)
    # Delete the first observation, without second column name (first line data)
    df_selected_features = df.iloc[1:, ]
    target_url = project_relative_root_path + local_project + DATA_ORIGINAL_SUFFIX
    print(f'url (save {DATA_ORIGINAL_SUFFIX}): {target_url}')
    df_selected_features.to_csv(target_url, index=False, encoding='utf-8')
    print(df_selected_features.shape)
    return df, df_selected_features
def open_0_num():
    """Load the source export and tidy it.

    Relies on module-level globals ``project_relative_root_path``,
    ``local_project`` and ``DATA_SOURCE_SUFFIX`` for the file location.
    """
    source_url = project_relative_root_path + local_project + DATA_SOURCE_SUFFIX
    print(f'url (original) : {source_url}')
    frame = pd.read_excel(source_url)
    frame = frame.drop(
        columns=[
            'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
            'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
            'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
            'UserLanguage', 'RecordedDate', 'ResponseId',
        ],
        axis=1)
    # rename - Duration (in seconds)
    frame.rename(columns={'Duration (in seconds)': 'Duration'}, inplace=True)
    return frame
def generate_metadata(local_path, local_filename):
    """Read the workbook at local_path + local_filename, print it, return it."""
    frame = pd.read_excel(local_path + local_filename)
    print(frame)
    return frame
def generate_metadata(local_group, local_project):
    """Build and print a name/description/group table from the _0_cat export.

    BUG FIX: the metadata URL referenced an undefined lowercase ``local_path``
    (NameError) — the local constant is ``LOCAL_PATH``.
    """
    LOCAL_PATH = f'./data/{local_group}/{local_project}/'
    LOCAL_ORIGINAL_DATA = '_0_cat.xlsx'
    url = LOCAL_PATH + local_project + LOCAL_ORIGINAL_DATA
    df = pd.read_excel(url)
    feature = []
    feature_description = []
    feature_group = []
    # Row 0 holds the question text; the group is a placeholder for now.
    for column_indexer in range(0, len(df.columns)):
        feature.append(df.columns[column_indexer])
        feature_description.append(df.iloc[0, column_indexer])
        feature_group.append('-')
    df_metadata = pd.DataFrame(
        list(zip(feature, feature_description, feature_group)),
        columns=['Feature Name', 'Feature Description', 'Feature Group'])
    url = LOCAL_PATH + local_project + '_metadata.xlsx'
    print(f'url (save excel): {url}')
    #df_metadata.to_excel(LOCAL_PATH + local_project + '_metadata.xlsx')
    print(df_metadata)
def save_metadata_alternative(project_relative_root_path, local_project):
    """Collect per-column name, description and dtype lists from the source export.

    Reads the workbook at <project><DATA_SOURCE_SUFFIX>, drops the Qualtrics
    bookkeeping columns and fills the accumulator lists; the label/group/
    drop-down lists are declared but left empty in this revision.
    """
    url = project_relative_root_path + local_project + DATA_SOURCE_SUFFIX
    print(f'url (original) : {url}')
    df = pd.read_excel(url)
    df = df.drop(
        columns=[
            'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
            'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
            'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
            'UserLanguage', 'RecordedDate', 'ResponseId',
        ],
        axis=1)
    # rename - Duration (in seconds)
    df.rename(columns={'Duration (in seconds)': 'Duration'}, inplace=True)
    feature = []
    feature_label = []
    feature_description = []
    feature_group = []
    drop_down = []
    data_types = []
    for position, name in enumerate(df.columns):
        description = df.iloc[0, position]
        data_types.append(df[name].dtypes.name)
        feature.append(name)
        feature_description.append(description)
def generate_metadata(local_path, local_project, local_filename):
    """Build and print a name/description/group metadata table for a workbook."""
    frame = pd.read_excel(local_path + local_project + local_filename)
    names, descriptions, groups = [], [], []
    # Row 0 holds the question text for every column.
    for position, column_name in enumerate(frame.columns):
        names.append(column_name)
        descriptions.append(frame.iloc[0, position])
        groups.append('-')
    df_metadata = pd.DataFrame(
        list(zip(names, descriptions, groups)),
        columns=['Feature Name', 'Feature Description', 'Feature Group'])
    url = local_path + local_project + '_metadata.xlsx'
    print(f'url (save excel): {url}')
    #df_metadata.to_excel(local_path + local_project + '_metadata.xlsx')
    print(df_metadata)
def generate_metadata(local_group, local_project):
    """Build and save the metadata workbook from the _0_cat export.

    BUG FIX: the drop was applied to an undefined ``df_initial`` (NameError);
    the frame just read is ``df``.
    """
    LOCAL_PATH = f'./data/{local_group}/{local_project}/'
    LOCAL_ORIGINAL_DATA = '_0_cat.xlsx'
    url = LOCAL_PATH + local_project + LOCAL_ORIGINAL_DATA
    print(f'url (original) : {url}')
    df = pd.read_excel(url)
    # Remove personally identifying columns before building the metadata.
    df = df.drop(columns=['IPAddress', 'RecipientLastName', 'RecipientFirstName',
                          'RecipientEmail', 'ExternalReference', 'UserLanguage'])
    feature = []
    feature_description = []
    feature_group = []
    # Row 0 holds the question text; the group is a placeholder for now.
    for column_indexer in range(0, len(df.columns)):
        feature.append(df.columns[column_indexer])
        feature_description.append(df.iloc[0, column_indexer])
        feature_group.append('-')
    df_metadata = pd.DataFrame(
        list(zip(feature, feature_description, feature_group)),
        columns=['Feature Name', 'Feature Description', 'Feature Group'])
    url = LOCAL_PATH + local_project + '_metadata.xlsx'
    print(f'url (save excel): {url}')
    df_metadata.to_excel(url)
    print(df_metadata)
def save_metadata_alternative(project_relative_root_path, local_project):
    """Build FeatureName/Label/Group/Description/DropDown metadata and save it.

    Labels, groups and drop-down aliases are delegated to ``get_label_group``
    (defined elsewhere in this module).

    BUG FIX: ``chart_type`` was never initialised and was called through a
    misspelled ``appen`` — both raised on the first column; the dead
    ``label = '' / group = ''`` pre-assignments were removed.
    """
    url = project_relative_root_path + local_project + DATA_SOURCE_SUFFIX
    print(f'url (original) : {url}')
    df = pd.read_excel(url)
    df = df.drop(
        columns=[
            'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
            'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
            'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
            'UserLanguage', 'RecordedDate', 'ResponseId',
        ],
        axis=1)
    # rename - Duration (in seconds)
    df.rename(columns={'Duration (in seconds)': 'Duration'}, inplace=True)
    feature = []
    feature_label = []
    feature_description = []
    feature_group = []
    drop_down = []
    data_types = []
    chart_type = []
    for column_indexer in range(0, len(df.columns)):
        name = df.columns[column_indexer]
        description = df.iloc[0, column_indexer]
        data_types.append(df[name].dtypes.name)
        feature.append(name)
        feature_description.append(description)
        label, group, alias = get_label_group(name, description)
        feature_label.append(label)
        feature_group.append(group)
        drop_down.append(alias)
        chart_type.append('Single')
    # SAVE
    df_metadata = pd.DataFrame(
        list(zip(feature, feature_label, feature_group, feature_description, drop_down)),
        columns=['FeatureName', 'FeatureLabel', 'FeatureGroup',
                 'FeatureDescription', 'DropDown'])
    url = project_relative_root_path + local_project + METADATA_SUFFIX
    print(f'url (save {METADATA_SUFFIX}): {url}')
    print(df_metadata.shape)
    df_metadata.to_excel(url, index=False, encoding='utf-8')
    return df
def save_metadata(project_relative_root_path, local_project):
    """Derive feature/label/group metadata from the source export and save it.

    BUG FIXES:
    * four lists were zipped into a DataFrame with only three column names,
      which raises ValueError — the column list now matches the zip order;
    * the "<question>? - <option>" text lives in the *description*, not the
      column name, and ``str.split`` needs the literal separator ('? - '),
      not the regex-escaped pattern used for ``search``.
    """
    url = project_relative_root_path + local_project + DATA_SOURCE_SUFFIX
    print(f'url (original) : {url}')
    df = pd.read_excel(url)
    df = df.drop(
        columns=[
            'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
            'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
            'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
            'UserLanguage', 'RecordedDate', 'ResponseId',
        ],
        axis=1)
    # rename - Duration (in seconds)
    df.rename(columns={'Duration (in seconds)': 'Duration'}, inplace=True)
    feature = []
    feature_label = []
    feature_description = []
    feature_group = []
    for column_indexer in range(0, len(df.columns)):
        name = df.columns[column_indexer]
        description = df.iloc[0, column_indexer]
        feature.append(name)
        feature_description.append(description)
        if name == 'Progress' or name == 'Duration' or name == 'Finished':
            feature_group.append('Response')
            feature_label.append(name)
        elif match(r'Q\d+_', name):
            if search(r'\? - ', description):
                elements = description.split('? - ')
                feature_group.append(elements[0])
                feature_label.append(elements[0])
            else:
                feature_group.append('NA')
                feature_label.append(name)
        else:
            feature_group.append('NA')
            feature_label.append(name)
    df_metadata = pd.DataFrame(
        list(zip(feature, feature_label, feature_group, feature_description)),
        columns=['Feature Name', 'Feature Label', 'Feature Group',
                 'Feature Description'])
    url = project_relative_root_path + local_project + METADATA_SUFFIX
    print(f'url (save {METADATA_SUFFIX}): {url}')
    df_metadata.to_excel(url, index=False, encoding='utf-8')
    return df
def generate_metadata(local_group, local_project):
    """Load the _0_cat export with personally identifying columns removed.

    BUG FIX: the drop was applied to an undefined ``df_initial`` (NameError);
    the frame just read is ``df``. Like the original, this revision keeps the
    frame local and returns nothing — it appears to be work in progress.
    """
    LOCAL_PATH = f'./data/{local_group}/{local_project}/'
    LOCAL_ORIGINAL_DATA = '_0_cat.xlsx'
    url = LOCAL_PATH + local_project + LOCAL_ORIGINAL_DATA
    print(f'url (original) : {url}')
    df = pd.read_excel(url)
    df = df.drop(columns=['IPAddress', 'RecipientLastName', 'RecipientFirstName',
                          'RecipientEmail', 'ExternalReference', 'UserLanguage'])
def generate_metadata(local_group, local_project):
    """Create the metadata workbook and the staged CSV snapshots for a project.

    Reads ./data/<group>/<project>/<project>_0_num.xlsx and writes:
      * <project>_metadata.xlsx  — feature name / description / group table,
      * <project>_1_original.csv — data without the question-text row,
      * <project>_2_wmv.csv      — only rows with Finished != 0.
    """
    LOCAL_PATH = f'./data/{local_group}/{local_project}/'
    LOCAL_ORIGINAL_DATA = '_0_num.xlsx'
    url = LOCAL_PATH + local_project + LOCAL_ORIGINAL_DATA
    print(f'url (original) : {url}')
    df = pd.read_excel(url)
    # Qualtrics bookkeeping columns — no survey content.
    df = df.drop(columns=[
        'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
        'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
        'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
        'UserLanguage'
    ], axis=1)
    feature = []
    feature_description = []
    feature_group = []
    # Row 0 holds the question text for every column; group is a placeholder.
    for column_indexer in range(0, len(df.columns)):
        feature.append(df.columns[column_indexer])
        feature_description.append(df.iloc[0, column_indexer])
        feature_group.append('-')
    #print( feature )
    #print( feature_description )
    df_metadata = pd.DataFrame(
        list(zip(feature, feature_description, feature_group)),
        columns=['Feature Name', 'Feature Description', 'Feature Group'])
    url = LOCAL_PATH + local_project + '_metadata.xlsx'
    print(f'url (save excel): {url}')
    df_metadata.to_excel(url, index=False, encoding='utf-8')
    #print( df_metadata )
    #df.set_index("StartDate")
    #df_1_original = df.drop("Start Date", axis = 0)
    #print( f'df: {df.shape()[0]}')
    print(df)
    # Drop the question-text row before saving the data itself.
    df = df.iloc[1:, ]
    url = LOCAL_PATH + local_project + '_1_original.csv'
    print(f'url (save 1_original): {url}')
    df.to_csv(url, index=False, encoding='utf-8')
    print(df)
    # without missing values
    df_2_wmv = df[df.Finished != 0]
    url = LOCAL_PATH + local_project + '_2_wmv.csv'
    print(f'url (save 2_without_missing_values): {url}')
    df_2_wmv.to_csv(url, index=False, encoding='utf-8')
    print(df_2_wmv)
def readFile(filepath):
    """Read an Excel file into a DataFrame.

    Returns an error *string* (not an exception) when ``filepath`` is not a
    string or does not point to an existing file — callers compare against
    these sentinel messages.
    """
    # BUG FIX: ``basestring`` is Python 2 only (NameError on Python 3).
    if not isinstance(filepath, str):
        return "ERROR: First Parameter should be a string"
    # check filepath is valid
    if os.path.isfile(filepath):
        # use Pandas to read in the file and return a data frame
        data = pd.read_excel(filepath)
        return data
    else:
        return "ERROR: File not found"
def check_for_blank_country_codes(self):
    """Abort the run when any street address row lacks a country code."""
    addresses = pd.read_excel(
        self.return_global_street_addr_file(),
        sheet_name=self.return_global_street_addr_sheet())
    missing = addresses.loc[addresses[self.return_country_code_col()].isnull(), :]
    # Guard clause: nothing to do when every row has a code.
    if not missing.empty:
        print(f'Could not find a country code for: '
              f'{missing["Country"]}')
        exit()
def save_metadata(project_relative_root_path, local_project):
    """Build a name/description/group metadata table and save it as Excel.

    BUG FIX: a stray ``if`` in front of ``feature_group.append('-')`` made the
    whole function a syntax error; the append is a plain statement.
    """
    url = project_relative_root_path + local_project + DATA_SOURCE_SUFFIX
    print(f'url (original) : {url}')
    df = pd.read_excel(url)
    df = df.drop(
        columns=[
            'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
            'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
            'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
            'UserLanguage', 'RecordedDate', 'ResponseId',
        ],
        axis=1)
    # rename - Duration (in seconds)
    df.rename(columns={'Duration (in seconds)': 'Duration'}, inplace=True)
    feature = []
    feature_description = []
    feature_group = []
    # Row 0 holds the question text; the group is a placeholder for now.
    for column_indexer in range(0, len(df.columns)):
        name = df.columns[column_indexer]
        feature.append(name)
        feature_description.append(df.iloc[0, column_indexer])
        feature_group.append('-')
    df_metadata = pd.DataFrame(
        list(zip(feature, feature_description, feature_group)),
        columns=['Feature Name', 'Feature Description', 'Feature Group'])
    url = project_relative_root_path + local_project + METADATA_SUFFIX
    print(f'url (save {METADATA_SUFFIX}): {url}')
    df_metadata.to_excel(url, index=False, encoding='utf-8')
    return df
def save_num_cat(project_relative_root_path, local_project):
    """Load the numeric and categorical exports and drop bookkeeping columns.

    BUG FIXES: the categorical frame was assigned with invalid syntax
    (``df_cat = '' = pd.read_excel(url)``) and re-read the *numeric* workbook
    because the '_0_cat.xlsx' URL was never built; the drop was applied to an
    undefined ``df``. Like the original, nothing is returned yet.
    """
    # Qualtrics bookkeeping columns that carry no survey answers.
    delete_columns = [
        'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
        'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
        'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
        'UserLanguage', 'RecordedDate', 'ResponseId',
    ]
    url = project_relative_root_path + local_project + '_0_num.xlsx'
    df_num = pd.read_excel(url)
    df_num = df_num.drop(columns=delete_columns)
    url = project_relative_root_path + local_project + '_0_cat.xlsx'
    df_cat = pd.read_excel(url)
    df_cat = df_cat.drop(columns=delete_columns)
def consolidate(log_name):
    """Consolidate per-run fitness logs under ./log/<log_name> into all.xlsx.

    Each run_*.xlsx file contributes one fitness column; the result also
    carries the generation index plus mean/SD and mean±SD bands.

    BUG FIXES: ``str.strip(".xslx")`` strips a *character set* from both ends
    (and misspelled the extension) — ``removesuffix`` drops the literal
    extension only; the loop variable shadowed the ``log_name`` parameter;
    ``from pandas import pandas as pd`` fails on current pandas.
    """
    from os import listdir, path, mkdir
    from os.path import isfile, join
    import pandas as pd
    import numpy as np

    log_dir = f"./log/{log_name}"
    log_files = [f for f in listdir(log_dir) if isfile(join(log_dir, f))]
    print(log_files)
    fitness_runs = []
    columns_name = []
    counter = 0
    generations = []
    for run_file in log_files:
        if run_file.startswith("run_"):
            df = pd.read_excel(log_dir + "/" + run_file)
            fitness_runs.append(list(df.Fitness))
            columns_name.append(run_file.removesuffix(".xlsx"))
            counter += 1
            # The generation axis is identical across runs; capture it once.
            if not generations:
                generations = list(df["Generation"])
    df = pd.DataFrame(list(zip(*fitness_runs)), columns=columns_name)
    fitness_sd = list(df.std(axis=1))
    fitness_mean = list(df.mean(axis=1))
    df["Generation"] = generations
    df["Fitness_SD"] = fitness_sd
    df["Fitness_Mean"] = fitness_mean
    df["Fitness_Lower"] = df["Fitness_Mean"] + df["Fitness_SD"]
    df["Fitness_Upper"] = df["Fitness_Mean"] - df["Fitness_SD"]
    # NOTE(review): listdir above already required log_dir to exist, so this
    # guard never fires — kept for parity with the original.
    if not path.exists(log_dir):
        mkdir(log_dir)
    df.to_excel(log_dir + "/all.xlsx", index=False, encoding="utf-8")
    return df
def generate_metadata(local_group, local_project):
    """Build and save the metadata workbook for the _0_num export.

    BUG FIXES: ``df.drop("Start Date", axis=0)`` tried to drop a *row label*
    that the data does not carry (the question row is positional) — it is now
    removed positionally with ``iloc``; ``df.shape()`` called the ``shape``
    attribute as a method (TypeError).
    """
    LOCAL_PATH = f'./data/{local_group}/{local_project}/'
    LOCAL_ORIGINAL_DATA = '_0_num.xlsx'
    url = LOCAL_PATH + local_project + LOCAL_ORIGINAL_DATA
    print(f'url (original) : {url}')
    df_x = pd.read_excel(url)
    df = df_x.drop(
        columns=[
            'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
            'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
            'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
            'UserLanguage',
        ],
        axis=1)
    feature = []
    feature_description = []
    feature_group = []
    # Row 0 holds the question text; the group is a placeholder for now.
    for column_indexer in range(0, len(df.columns)):
        feature.append(df.columns[column_indexer])
        feature_description.append(df.iloc[0, column_indexer])
        feature_group.append('-')
    df_metadata = pd.DataFrame(
        list(zip(feature, feature_description, feature_group)),
        columns=['Feature Name', 'Feature Description', 'Feature Group'])
    url = LOCAL_PATH + local_project + '_metadata.xlsx'
    print(f'url (save excel): {url}')
    df_metadata.to_excel(url)
    print(df_metadata)
    # Drop the question-text row positionally, then report the shapes.
    df_1_original = df.iloc[1:, ]
    print(df.shape)
    print(df_1_original.shape)
def read_student_surveys(self, filename):
    """Load the student survey workbook and group school destinations by origin.

    Builds ``{origin: {'ESC1': [...], 'ESC2': [...], 'ESC3': [...],
    'ESCSEC': [...]}}`` from the ORG-LUG / SCHOOL-TYPE / DEST-LUG columns,
    stores it on the instance and triggers ``process_students_data``.
    """
    # ``basepath`` is expected to be defined at module level — TODO confirm.
    filepath = basepath + filename
    df = pandas.read_excel(filepath)
    zone_schools = {'ESC1': [], 'ESC2': [], 'ESC3': [], 'ESCSEC': []}
    data = {}
    for _, row in df.iterrows():
        origin = row['ORG-LUG']
        school_type = row['SCHOOL-TYPE']
        school_zone = row['DEST-LUG']
        if origin not in data:
            # Deep copy so origins do not share the per-type destination lists.
            data[origin] = copy.deepcopy(zone_schools)
        data[origin][school_type].append(school_zone)
    # NOTE(review): attribute name looks like a typo for ``students_data`` —
    # verify readers elsewhere before renaming.
    self.sudents_data = data
    self.process_students_data()
def generate_metadata(local_path, local_filename):
    """Print the column names and first-row descriptions of a workbook.

    Returns the frame that was read.
    """
    frame = pd.read_excel(local_path + local_filename)
    names, descriptions, groups = [], [], []
    # Row 0 holds the question text for every column.
    for position, column_name in enumerate(frame.columns):
        names.append(column_name)
        descriptions.append(frame.iloc[0, position])
        groups.append('-')
    print(names)
    print(descriptions)
    return frame
def save_metadata(local_group, local_project):
    """Build the metadata workbook for a project addressed by group and name.

    BUG FIX: the formatted project root (``root``) was computed and then
    ignored — both URLs were built from the *unformatted* ``LOCAL_PATH``
    template, so the workbook was read from and written to the wrong place.
    """
    root = LOCAL_PATH.format(local_group, local_project)
    url = root + local_project + LOCAL_ORIGINAL_DATA
    print(f'url (original) : {url}')
    df = pd.read_excel(url)
    df = df.drop(
        columns=[
            'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
            'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
            'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
            'UserLanguage',
        ],
        axis=1)
    feature = []
    feature_description = []
    feature_group = []
    # Row 0 holds the question text; the group is a placeholder for now.
    for column_indexer in range(0, len(df.columns)):
        feature.append(df.columns[column_indexer])
        feature_description.append(df.iloc[0, column_indexer])
        feature_group.append('-')
    df_metadata = pd.DataFrame(
        list(zip(feature, feature_description, feature_group)),
        columns=['Feature Name', 'Feature Description', 'Feature Group'])
    url = root + local_project + '_metadata.xlsx'
    print(f'url (save excel): {url}')
    df_metadata.to_excel(url, index=False, encoding='utf-8')
    return df
def generate_metadata(local_path, local_filename):
    """Print per-column metadata for a workbook and return the frame.

    The metadata DataFrame is assembled but (as in the original revision)
    only the raw frame is returned.
    """
    frame = pd.read_excel(local_path + local_filename)
    names, descriptions, groups = [], [], []
    for position, column_name in enumerate(frame.columns):
        names.append(column_name)
        descriptions.append(frame.iloc[0, position])
        groups.append('-')
    print(names)
    print(descriptions)
    df_metadata = pd.DataFrame(
        list(zip(names, descriptions, groups)),
        columns=['Feature Name', 'Feature Description', 'Feature Group'])
    return frame
def read_workers_surveys(self, filename):
    """Load the workers survey workbook, grouping destination and type by origin.

    Builds ``{origin: {'dest': [...], 'type': [...]}}`` from ORG-LUG /
    DEST-LUG / WORK-TYPE, stores it and triggers ``process_workers_data``.
    """
    df = pandas.read_excel(basepath + filename)
    data = {}
    for _, row in df.iterrows():
        origin = row['ORG-LUG']
        destination = row['DEST-LUG']
        # Repair the mangled encoding of this particular zone name.
        if 'Casal Fern' in destination:
            destination = 'Casal Fernão João'
        if origin not in data:
            data[origin] = {'dest': [], 'type': []}
        data[origin]['dest'].append(destination)
        data[origin]['type'].append(row['WORK-TYPE'])
    self.workers_data = data
    self.process_workers_data()
def generate_metadata(local_path, local_filename):
    """Print each column's question-text cell of a workbook (exploratory helper).

    BUG FIX: ``df.[[0, column_indexer]]`` was a syntax error — positional cell
    access is ``df.iloc[0, column_indexer]``.
    """
    url = local_path + local_filename
    df = pd.read_excel(url)
    feature = []
    feature_description = []
    feature_group = []
    for column_indexer in range(0, len(df.columns)):
        print(df.iloc[0, column_indexer])
    print(df)
    return df
def get_view_type(column, df, df_md):
    """Return 'Group' when *column*'s FeatureGroup spans several features, else 'Single'.

    Also prints the unique values of every feature in that group (debug aid).
    BUG FIX: the debug print was missing its closing parenthesis.
    """
    group = list(df_md[df_md['FeatureName'] == column]['FeatureGroup'])[0]
    count = len(df_md[df_md['FeatureGroup'] == group]['FeatureName'].unique())
    view_type = ''
    if count > 1:
        view_type = 'Group'
    else:
        view_type = 'Single'
    selected_features = list(df_md[df_md['FeatureGroup'] == group]['FeatureName'])
    for feature in selected_features:
        data_feature = df[feature]
        print(f"<><><><> {data_feature.unique()} #values_counts()")
    return view_type


def save_metadata(project_relative_root_path, local_project):
    """Derive FeatureName/Label/Group/Description/DropDown metadata and save it.

    BUG FIXES versus the previous revision:
    * a second ``if match(r'Q\dd_', ...)`` started a *new* statement, so a
      ``Q<n>_`` column could append label/group/drop-down twice and misalign
      the zipped metadata rows — the branches are now one if/elif chain (its
      extra '?' fallback is folded into the ``Q\d+_`` branch);
    * ``match(r'Q\d+ - ', name)`` was tested after ``match(r'Q\d+', name)``
      and was therefore unreachable — the more specific pattern comes first;
    * ``description.replace(':', '')`` discarded its result (str.replace is
      not in-place) — the cleaned text is now used as the label.
    """
    url = project_relative_root_path + local_project + DATA_SOURCE_SUFFIX
    print(f'url (original) : {url}')
    df = pd.read_excel(url)
    df = df.drop(
        columns=[
            'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
            'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
            'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
            'UserLanguage', 'RecordedDate', 'ResponseId',
        ],
        axis=1)
    # rename - Duration (in seconds)
    df.rename(columns={'Duration (in seconds)': 'Duration'}, inplace=True)
    feature = []
    feature_label = []
    feature_description = []
    feature_group = []
    drop_down = []
    data_types = []
    for column_indexer in range(0, len(df.columns)):
        name = df.columns[column_indexer]
        description = df.iloc[0, column_indexer]
        data_types.append(df[name].dtypes.name)
        feature.append(name)
        feature_description.append(description)
        if match(r'Q\d+_', name):
            # Grouped question: description reads "<group>? - <option>".
            # search needs the regex escape; str.split takes the literal text.
            if search(r'\? - ', description):
                elements = description.split('? - ')
                feature_group.append(elements[0])
                feature_label.append(elements[1])
                drop_down.append(name + '--' + elements[1])
            elif search(': - ', description):
                elements = description.split(': - ')
                feature_group.append(elements[0])
                feature_label.append(elements[1])
                drop_down.append(name + '--' + elements[1])
            elif search(r'\?', description):
                elements = description.split('?')
                feature_group.append(elements[0])
                feature_label.append(elements[1])
                drop_down.append(name + '--' + elements[1])
            else:
                feature_group.append('Misc')
                feature_label.append(name)
                drop_down.append(name)
        elif match(r'Q\d+ - ', name):
            if search(r'\? - ', description):
                elements = description.split('? - ')
                feature_group.append(elements[0])
                feature_label.append(elements[1])
                drop_down.append(name + '--' + elements[1])
            elif search(': - ', description):
                elements = description.split(': - ')
                feature_group.append(elements[0])
                feature_label.append(elements[1])
                drop_down.append(name + '--' + elements[1])
            else:
                feature_group.append('Misc')
                feature_label.append(name)
                drop_down.append(name)
        elif match(r'Q\d+', name):
            # Plain question with no "_<part>" suffix (e.g. Q10).
            if search(r'\?', description):
                elements = description.split('?')
                feature_group.append(elements[0])
                feature_label.append(elements[0])
                if len(elements[0]) < 44:
                    drop_down.append(name + '--' + elements[0])
                else:
                    drop_down.append(name + '-- ...' + elements[0][-41:])
            else:
                feature_group.append(description)
                feature_label.append(description.replace(':', ''))
                if len(description) < 44:
                    drop_down.append(name + '--' + description)
                else:
                    drop_down.append(name + '-- ...' + description[-41:])
        else:
            feature_group.append(name)
            feature_label.append(name)
            drop_down.append(name)
    print(data_types)
    df_metadata = pd.DataFrame(
        list(zip(feature, feature_label, feature_group, feature_description, drop_down)),
        columns=['FeatureName', 'FeatureLabel', 'FeatureGroup',
                 'FeatureDescription', 'DropDown'])
    url = project_relative_root_path + local_project + METADATA_SUFFIX
    print(f'url (save {METADATA_SUFFIX}): {url}')
    print(df_metadata.shape)
    df_metadata.to_excel(url, index=False, encoding='utf-8')
    return df


# ──────────────────────────────────────────────────────────────────────────────
# method: save_data_selected_columns
# ──────────────────────────────────────────────────────────────────────────────
def save_data_selected_columns(df, project_relative_root_path, local_project):
    """Drop the question-text row and save the frame as the 'original' CSV."""
    df = df.iloc[1:, ]
    url = project_relative_root_path + local_project + DATA_ORIGINAL_SUFFIX
    print(f'url (save {DATA_ORIGINAL_SUFFIX}): {url}')
    df.to_csv(url, index=False, encoding='utf-8')
    print(df.shape)
    return df


# ──────────────────────────────────────────────────────────────────────────────
# method: save_only_completed_responses
# ──────────────────────────────────────────────────────────────────────────────
def save_only_completed_responses(df, project_relative_root_path, local_project):
    """Keep only finished responses; save them as CSV and the analysis workbook."""
    # without missing values
    df_2_wmv = df[df.Finished != 0]
    url = project_relative_root_path + local_project + DATA_WMV_SUFFIX
    print(f'url (save {DATA_WMV_SUFFIX}): {url}')
    df_2_wmv.to_csv(url, index=False, encoding='utf-8')
    url = project_relative_root_path + local_project + DATA_ANALYSIS_SUFFIX
    # NOTE(review): the label says WMV but this saves the analysis workbook —
    # confirm the message is intentional.
    print(f'url (save {DATA_WMV_SUFFIX}): {url}')
    df_2_wmv.to_excel(url, index=False, encoding='utf-8')
    return df_2_wmv


# ──────────────────────────────────────────────────────────────────────────────
# __main__
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == '__main__':
    from terminal import Terminal, FontColor

    Terminal.print_box(['PREPROCESSING STARTED'], font_color=FontColor.Green)
    option = 'SPECIFIC'  # 'THU_20_FER-1'
    run_all = False
    if option == 'SPECIFIC':
        local_group = 'THU_20_FER'
        local_project = 'netflix'
        Terminal.print_box([f'{local_group} - {local_project}'], font_color=FontColor.Blue)
        project_relative_root_path = f'./data/{local_group}/{local_project}/'
        df = save_metadata_alternative(project_relative_root_path, local_project)
        df = save_data_selected_columns(df, project_relative_root_path, local_project)
        df = save_only_completed_responses(df, project_relative_root_path, local_project)
        local_group = 'FRI_18_FER'
        local_project = 'live_smart'
Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata_alternative ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) ############################################################################################### ############################################################################################### ############################################################################################### if option == 'MON_14_FER-1' or run_all: local_group = 'MON_14_FER' local_project = 'ca2a' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'nova_pets' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'social_snap' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) 
############################################################################################### ############################################################################################### ############################################################################################### if option == 'THU_20_FER-1' or run_all: local_group = 'THU_20_FER' local_project = 'netflix' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'organibag' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'catch_volunteering' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'ajuda_portugal' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, 
project_relative_root_path, local_project ) ############################################################################################### ############################################################################################### ############################################################################################### if option == 'THU_20_VIC-1' or run_all: local_group = 'THU_20_VIC' local_project = 'refood' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'sustainable_u' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'share_me' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'micolet' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = 
save_only_completed_responses( df, project_relative_root_path, local_project ) ############################################################################################### ############################################################################################### ############################################################################################### if option == 'FRI_18_FER-1' or run_all: local_group = 'FRI_18_FER' local_project = 'live_smart' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'lost_abroad' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'ma_beauty' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'naturally' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, 
local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) ############################################################################################### ############################################################################################### ############################################################################################### if option == 'FRI_18_VIC-1' or run_all: local_group = 'FRI_18_VIC' local_project = 'food_donations' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'green_care' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'kid_kid_store' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'meduse' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, 
project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'ocean_alive' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) if option == 'FRI_20_FER-1' or run_all: local_group = 'FRI_20_FER' local_project = 'chef4u' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'fake_news' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'startup_helpe' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) if option == 'FRI_20_VIC-1' or run_all: local_group = 'FRI_20_VIC' local_project = 'gap_year' Terminal.print_box( [f'{local_group} - 
{local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'social_app' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project ) local_project = 'tie_trip' Terminal.print_box( [f'{local_group} - {local_project}'], font_color = FontColor.Blue) project_relative_root_path = f'./data/{local_group}/{local_project}/' df = save_metadata ( project_relative_root_path, local_project ) df = save_data_selected_columns( df, project_relative_root_path, local_project ) df = save_only_completed_responses( df, project_relative_root_path, local_project )
def save_metadata( project_relative_root_path, local_project ): url = project_relative_root_path + local_project + DATA_SOURCE_SUFFIX print(f'url (original) : {url}') df = pd.read_excel( url ) df = df.drop( columns = [ 'StartDate' , 'EndDate' , 'Status' , 'IPAddress' , 'RecipientLastName' , 'RecipientFirstName' , 'RecipientEmail' , 'ExternalReference' , 'LocationLatitude' , 'LocationLongitude' , 'DistributionChannel' , 'UserLanguage' , 'RecordedDate' , 'ResponseId' ], axis = 1 ) # rename - Duration (in seconds) df.rename(columns = {'Duration (in seconds)':'Duration'}, inplace = True) feature = [] feature_label = [] feature_description = [] feature_group = [] drop_down = [] data_types = [] for column_indexer in range(0 , len(df.columns)): name = df.columns[column_indexer] description = df.iloc[0,column_indexer] data_types.append( df[name].dtypes.name ) feature.append( name ) feature_description.append( description ) if match(r'Q\d+_', name): if search('\? - ', description): elements = description.split( '? - ') # split does not need the slash (escape), because it will compare texts feature_group.append( elements[0] ) feature_label.append( elements[1] ) drop_down.append( name + '--' + elements[1]) elif search(': - ', description): elements = description.split( ': - ') # split does not need the slash (escape), because it will compare texts feature_group.append( elements[0] ) feature_label.append( elements[1] ) drop_down.append( name + '--' + elements[1]) else: feature_group.append('Misc') feature_label.append( name ) drop_down.append( name ) if match(r'Q\dd_', name): if search('\? - ', description): elements = description.split( '? 
- ') # split does not need the slash (escape), because it will compare texts feature_group.append( elements[0] ) feature_label.append( elements[1] ) drop_down.append( name + '--' + elements[1]) elif search(': - ', description): elements = description.split( ': - ') # split does not need the slash (escape), because it will compare texts feature_group.append( elements[0] ) feature_label.append( elements[1] ) drop_down.append( name + '--' + elements[1]) elif search('\?', description): elements = description.split( '?') # split does not need the slash (escape), because it will compare texts feature_group.append( elements[0] ) feature_label.append( elements[1] ) drop_down.append( name + '--' + elements[1]) else: feature_group.append('Misc') feature_label.append( name ) drop_down.append( name ) elif match(r'Q\d+', name): # the question doesn't have Q d+ _ <Q number _>, it can be a question with no group, but it is Q10 (No Q10_*) if search('\?', description): elements = description.split( '?') # split does not need the slash (escape), because it will compare texts feature_group.append( elements[0] ) feature_label.append( elements[0] ) if len(elements[0]) < 44: drop_down.append( name + '--' + elements[0]) else: drop_down.append( name + '-- ...' + elements[0][-41:]) else: feature_group.append(description) description.replace(':', '') feature_label.append( description ) #drop_down.append( name + ' - ' ) if len(description) < 44: drop_down.append( name + '--' + description) else: drop_down.append( name + '-- ...' + description[-41:]) elif match(r'Q\d+ - ', name): if search('\? - ', description): elements = description.split( '? 
- ') # split does not need the slash (escape), because it will compare texts feature_group.append( elements[0] ) feature_label.append( elements[1] ) drop_down.append( name + '--' + elements[1]) elif search(': - ', description): elements = description.split( ': - ') # split does not need the slash (escape), because it will compare texts feature_group.append( elements[0] ) feature_label.append( elements[1] ) drop_down.append( name + '--' + elements[1]) else: feature_group.append('Misc') feature_label.append( name ) drop_down.append( name ) else: feature_group.append( name ) feature_label.append( name ) drop_down.append( name ) #print( feature ) #print( feature_description ) print(data_types) df_metadata = pd.DataFrame(list(zip(feature, feature_label, feature_group, feature_description, drop_down)), columns =['FeatureName', 'FeatureLabel', 'FeatureGroup', 'FeatureDescription', 'DropDown']) url = project_relative_root_path + local_project + METADATA_SUFFIX print(f'url (save {METADATA_SUFFIX}): {url}') print(df_metadata.shape) df_metadata.to_excel( url , index = False, encoding = 'utf-8') return df
def open_reference(url):
    """Load the reference workbook at *url* and return it as a DataFrame.

    BUGFIX: the original read the file into a local variable and implicitly
    returned None, making the call useless to every caller; the frame is now
    returned.
    """
    df = pd.read_excel(url)
    return df
from data import data
# BUGFIX: the original `from pandas import pandas as pd` raises ImportError —
# the pandas package has no `pandas` submodule.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

# This is a program used to query incoming stock data.

# Get the data from the excel file.
df = pd.read_excel('companyData.xlsx')
# Re-index the frame by date so time-series plotting/EMAs line up.
df = df.set_index(pd.DatetimeIndex(df['date'].values))

# Plot the daily closing price ('4. close' is the provider's column name).
plt.figure(figsize=(14.0, 8.0))
plt.plot(df['4. close'], label='close')
plt.title('Daily close price')
plt.xticks(rotation=45)
plt.xlabel('Date')
plt.ylabel('Price $')
# plt.show()

# Calculate MACD and signal line indicators.
# Short-term exponential moving average (12-period EMA).
short_EMA = df['4. close'].ewm(span=12, adjust=False).mean()
# Long-term exponential moving average (26-period EMA).
long_EMA = df['4. close'].ewm(span=26, adjust=False).mean()
# MACD line: short EMA minus long EMA.
MACD = short_EMA - long_EMA
# Signal line: 9-period EMA of the MACD.
signal = MACD.ewm(span=9, adjust=False).mean()

plt.figure(figsize=(12.5, 5.0))