def fix_guess(g, df, col):
    '''
    Asks which value in a group of similar values is the correct one, then
    hands the group to switch_data so the other values can be converted.

    g:   sequence of similar values found in the column
    df:  dataframe passed through to switch_data
    col: name of the column being cleaned

    A number picks the matching entry of g (1-based). Anything else (blank,
    non-numeric, out of range) leads to a prompt for a custom replacement
    value; a blank entry at that prompt cancels. No return value.
    '''
    #BUILD SOMETHING TO SAVE THESE CHANGES
    #NEEDS RETURNS HERE AND IN SWITCH DATA
    print()
    answer = input(
        "Choose the number of the correct value (1, 2...) or press Enter to use a custom value: "
    )
    try:
        c = int(answer.strip()) - 1
        #0 and negative numbers would otherwise wrap around to the end of g
        if c < 0:
            raise IndexError(c)
        correct = g[c]
    except (ValueError, IndexError):
        c = None

    if c is not None:
        switch_data(df, col, g, c, correct)
        return

    chosen = False
    while not chosen:
        custom = input(
            "Enter a new correct value or press Enter to cancel: ").strip()
        if not custom:
            #blank input cancels without touching the data
            return
        tprint("Use %s ?" % custom)
        chosen = cli.ask_yes(True)
    #-1 matches no position in g, so switch_data offers to convert every value
    switch_data(df, col, g, -1, custom)
def convert_data(df, col):
    '''
    Placeholder for column conversion.

    df and col are accepted but not used; the function only reports that the
    feature is unavailable and then calls cli.cls().
    '''
    #replace when finished
    notices = (
        "Unfortunately this feature is not currently supported",
        "Stay tuned for future releases and updates",
    )
    for notice in notices:
        tprint(notice)
    cli.cls()
def drop_na(df, col):
    '''
    Offers to remove the rows that have no value in col.

    df:  dataframe, modified in place when the user accepts
    col: column label checked for missing values

    No return value.
    '''
    tprint(
        "Would you like to remove the rows that do not contain a value for: %s?"
        % col)
    if cli.ask_yes(True):
        #dropna returns a new frame unless inplace=True; without it the
        #caller's dataframe is left unchanged
        df.dropna(subset=[col], inplace=True)
def end():
    '''
    Announces the exit, waits for the user to press Enter,
    then stops the program through sys.exit()
    '''
    tprint("Exiting")
    print()
    #the typed text is irrelevant; this only pauses until Enter is pressed
    _ = input("Press Enter\t")
    sys.exit()
def clean_numeric(df, col):
    '''
    Offers to show summary statistics (df[col].describe()) for a column,
    then calls cli.cls().

    df:  dataframe holding the column
    col: label of the column to summarize
    '''
    tprint(
        "Would you like to see summary statistics for the data in this column?"
    )
    wants_summary = cli.ask_yes(True)
    if wants_summary:
        summary = df[col].describe()
        tprint(summary)
    cli.cls()
def load_cli_intro():
    '''
    prints the introduction stored in text/intro.txt, then a blank line
    takes no arguments and returns nothing
    '''
    with open("text/intro.txt") as intro_file:
        intro = intro_file.read()
    tprint(intro)
    print()
def ask_default(names, default_name, default=True):
    '''
    Asks whether the default column name should be used.

    names:        list that default_name is appended to on a yes
    default_name: the name being offered
    default:      passed to cli.ask_yes as its argument

    Returns True when the default name was accepted (and appended),
    False otherwise.
    '''
    tprint("Would you like to use the default name?")
    if not cli.ask_yes(default):
        return False
    names.append(default_name)
    return True
def ask_convert(df, col):
    '''
    Tells the user the column looks like text and asks whether it is meant
    to hold dates or numeric data.

    df and col are accepted but not used here.
    Returns True on a yes, False otherwise.
    '''
    print("This column seems to contain text data")
    tprint("Is this column supposed to contain dates or numeric data?")
    wants_conversion = bool(cli.ask_yes(False))
    #cli.cls()
    return wants_conversion
def rename_cols(df, named):
    '''
    df:    dataframe whose columns are being named
    named: truthy when the columns already carry names

    Already-named columns go through modify_column_names; otherwise
    give_new_names asks for a name per column.
    modifies df in place, no return
    '''
    if named:
        tprint("Column names have been updated for compatibility")
        modify_column_names(df)
        return
    give_new_names(df)
def show_first_values(df, col, count=5):
    '''
    Prints the leading unique values of one column, one per line.

    df:    dataframe to read from
    col:   integer position of the column (used with df.iloc)
    count: printing stops once this many values have been shown
    '''
    tprint("First %d unique values from this column:" % count)
    uniques = df.iloc[:, col].unique()
    for shown, value in enumerate(uniques, start=1):
        print("\t" + str(value))
        if shown >= count:
            break
    print()
def write_csv(df, name):
    '''
    Writes df into the output directory as <name>_cleaned.csv.

    df:   dataframe to write; its index is written under the label "index"
    name: name of the source file; a trailing ".csv" is dropped before the
          "_cleaned" suffix is added

    When that file name is already taken a counter is appended
    (<name>_cleaned_1.csv, <name>_cleaned_2.csv, ...) so earlier runs are
    not overwritten. The output directory is created if it is missing.
    No return value.
    '''
    #GIVE THEM AN OPTION TO CHANGE THE NAME
    #raw string + end anchor: only a trailing extension is removed
    stem = re.sub(r"\.csv$", "", name)
    os.makedirs("output", exist_ok=True)
    #one directory listing is enough for all the collision checks
    existing = set(os.listdir("output"))
    fname = stem + "_cleaned.csv"
    c = 1
    while fname in existing:
        fname = stem + "_cleaned_" + str(c) + ".csv"
        c += 1
    tprint("Writing CSV: %s" % fname)
    df.to_csv(os.path.join("output", fname), index_label="index")
    tprint("Finished writing cleaned data to Output")
def choose_col(df, working):
    '''
    Lists the columns of df and asks which one to clean.

    df:      dataframe whose columns are offered
    working: caller's current flag; handed back unchanged when a column
             is chosen

    Returns (col_name, working). When the input is not a valid column number
    col_name is "" and working becomes the answer to whether the user wants
    to continue cleaning columns.
    '''
    columns.fix_current_columns(
        df, list_only=True)  #really ugly fix in columns atm
    print()
    col = input(
        "Choose the number of a column to clean (1, 2, 3...) or press Enter: ")
    try:
        i = int(col.strip()) - 1  #maybe should do something w/ the name...
        #0 and negative numbers would otherwise pick columns from the end
        if i < 0:
            raise IndexError(i)
        col_name = df.columns[i]
    except (ValueError, IndexError):
        col_name = ""
        tprint("No column selected")
        tprint("Would you like to continue cleaning columns?")
        working = cli.ask_yes(True)
    return col_name, working
def fix_current_columns(df, list_only=False):
    '''
    Prints a numbered list of df's column names and returns the converted
    names.

    Unless list_only is set, every name is passed through make_compatible and
    the converted name is both printed and collected. With list_only the
    names are printed untouched and the returned list stays empty.
    '''
    tprint("Columns:")
    compat = []
    for position, current in enumerate(df.columns, start=1):
        if list_only:  #ugly fix atm
            shown = current
        else:
            shown = make_compatible(current)
            compat.append(shown)
        print("\t" + str(position) + ") " + shown)
    return compat
def clean_strings(df, col):
    '''
    Cleans a column of text values: normalizes white space, optionally runs
    the automatic search for likely errors, then offers manual fixes.

    df:  dataframe, modified in place
    col: name of the text column to clean

    No return value.
    '''
    tprint("Removing excess white space from values in %s" % col)
    cleaned = df[col].str.strip()
    #tabs become spaces first so they take part in the collapsing below
    cleaned = cleaned.str.replace("\t", " ", regex=False)
    #collapse every run of spaces to one (swapping a single space for a
    #single space would change nothing)
    cleaned = cleaned.str.replace(r" {2,}", " ", regex=True)
    cleaned = cleaned.str.replace(" ,", ",", regex=False)
    cleaned = cleaned.str.replace(",,", ",", regex=False)
    df[col] = cleaned
    cli.cls()

    u = df[col].unique()
    num_unique = len(u)
    print("Column " + col + " contains " + str(num_unique) + " unique values")
    if num_unique > WARN_LEVEL:
        tprint(
            "Large numbers of unique values may take significant time to analyze, depending on their length, your system and settings"
        )
    tprint("Would you like to search for possible errors in the data?")
    if cli.ask_yes(True):
        cli.cls(False)
        guesses = consolidate_guesses(
            guess_linkages(u))  #doing a lot w/ generator
        #the outcome is only reported when the search actually ran
        if guesses:
            review_guesses(df, col, guesses)
            tprint("All automatic revisions reviewed")
        else:
            tprint("Our review did not detect any likely errors")
        cli.cls()

    ######################
    #DO THE MANUAL VERSION
    print("Current unique values in %s:" % col)
    show_values(df, col)
    cli.cls()
    print("Would you like to further clean or modify the values in %s?" % col)
    if cli.ask_yes(True):
        previous = []  #make previous update w/ confirmed automatic fixes...
        # ^ this will take some work
        fix_manual(df, col, previous)
def review_guesses(df, col, guesses):
    '''
    Shows each group of similar values with how often every value occurs in
    the column, and offers to fix groups where more than one value is
    actually present.

    df:      dataframe being cleaned
    col:     name of the column the values come from
    guesses: iterable of groups; each group is a sequence of similar values
    '''
    #need to think about consolidating if there are a bunch of similar
    #build so g can contain 2+ values
    for group in guesses:
        print("Similar Value (Number in data):")
        #counts the values that still occur; a p-jank stand-in for not
        #having consolidate_guesses...
        present = 0
        for number, value in enumerate(group, start=1):
            occurrences = len(df[df[col] == value])
            if occurrences:
                present += 1
            print("\t" + str(number) + ") " + value + "\t(%d)" % occurrences)
        if present > 1:
            tprint("Would you like to update one or more of these values?")
            if cli.ask_yes(False):
                fix_guess(group, df, col)
            cli.cls(True)
        else:
            #nothing to merge when at most one of the values is in the data
            cli.cls(False)
def switch_data(df, col, g, c, correct):
    '''
    Offers, value by value, to replace the entries of g with correct.

    df:      dataframe, modified in place for every accepted conversion
    col:     name of the column being cleaned
    g:       sequence of similar values
    c:       position in g that is skipped (the value already chosen)
    correct: replacement value

    Any exception raised along the way is reported with a message rather
    than propagated. No return value.
    '''
    try:
        for position, candidate in enumerate(g):
            if position == c:
                continue
            tprint("Would you like to convert " + candidate + " to " +
                   correct + "?")
            if not cli.ask_yes(True):
                continue
            matches = df[col] == candidate
            df.loc[matches, [col]] = correct
            tprint("Converting " + candidate + " -> " + correct, 0.5)
    except Exception:
        tprint("Error converting data. Data not converted")
def give_new_names(df):
    '''
    Walks through the columns of df and asks for a name for each one.

    For every column the default name (column_<n>) and its first values are
    shown. A typed name is lowercased, spaces become underscores, and the
    converted name is the one that is stored. A name that is already taken
    is refused and the prompt repeats, so exactly one name is collected per
    column before df.columns is assigned.

    modifies df in place, no return
    '''
    names = []
    num_cols = len(df.columns)
    for i in range(num_cols):
        default_name = "column_" + str(i + 1)
        print("Column " + str(i + 1) + " of " + str(num_cols) +
              " is currently named: " + default_name)
        show_first_values(df, i)
        chosen = False
        while not chosen:
            name = input(
                "Enter a new name for this column or press Enter to keep the default: "
            ).strip()
            if not name:
                chosen = ask_default(names, default_name, True)
                continue
            com_name = name.lower().replace(" ", "_")
            tprint("Modifying input for compatibility: " + name + " -> " +
                   com_name)
            tprint("Would you like to name this column " + com_name + " ?")
            if not cli.ask_yes(True):
                chosen = ask_default(names, default_name, False)
                continue
            if com_name in names:
                #stay in the loop: leaving without a name would make the
                #final assignment to df.columns fail on a length mismatch
                print("Name already used for a column. Please choose another")
                continue
            tprint("Setting column name")
            #store the converted name, which is the one the user confirmed
            names.append(com_name)
            chosen = True
        cli.cls()
    #setting the new names
    tprint("Adding column names")
    df.columns = names
    cli.cls()
def show_values(df, col, output=False, values=None):
    '''
    Prints a numbered list of values ten at a time.

    df, col: dataframe and column; the sorted unique values of df[col] are
             listed when values is empty or not given
    output:  when False the list is only displayed, pausing after each page
             to ask whether to show more; when True the user is asked to
             type the number of a value after each page
    values:  optional list of values to show instead of the column's

    if output returns chosen value (as a string); otherwise, or when there
    are no values to choose from, returns None
    '''
    if not values:  #THIS IS A BIT SILLY
        values = sorted(df[col].unique())
    total = len(values)
    pages = math.ceil(total / 10)
    print("Current unique values: (Page 1 of %d)" % pages)
    for i, v in enumerate(values, start=1):
        #str() keeps non-text values (numbers, NaN) from breaking the listing
        print("\t" + str(i) + ") " + str(v))
        page_full = i % 10 == 0
        more = i < total

        if not output:
            if page_full and more:
                tprint("Show more values?")
                if not cli.ask_yes(True):
                    break
                cli.cls(False)
                print("Current unique values: (Page %d of %d)" %
                      (i // 10 + 1, pages))
            continue

        if not page_full and more:
            continue
        print()
        while True:  #left through return (value chosen) or break (next page)
            if more:
                c = input(
                    "Type the number of a value (1, 2...) or press Enter to view more values: "
                )
            else:
                c = input(
                    "Type the number of a value to select it (1, 2, 3...): ")
            try:
                index = int(c) - 1
                #0 and negative numbers would otherwise count from the end
                if index < 0:
                    raise IndexError(index)
                return str(values[index])
            except (ValueError, IndexError):
                tprint("No value selected")
            if more:
                tprint("View more values?")
                if cli.ask_yes(True):
                    cli.cls(False)
                    print("Current unique values: (Page %d of %d)" %
                          (i // 10 + 1, pages))
                    break
    return None
def fix_manual(df, col, previous=None):
    '''
    Loop for manually replacing values in a column.

    Each pass either loads a saved record of replacements (choose_record) or
    collects new ones (add_new_fixes), applies them to df[col], shows the
    resulting values and asks whether to go again.

    df:       dataframe, modified in place
    col:      name of the column being cleaned
    previous: earlier changes handed on to choose_record / add_new_fixes;
              an empty list is used when omitted

    No return value.
    '''
    #a fresh list per call instead of one default list shared by all calls
    if previous is None:
        previous = []
    working = True
    while working:
        tprint("Would you like to load a record of previously used changes?")
        if cli.ask_yes(True):
            fixes = choose_record(df, col, previous)  #REDO/RENAME
            if not fixes:
                #nothing loaded, so fall back to entering fixes by hand
                fixes = add_new_fixes(df, col, previous)
        else:
            cli.cls()
            fixes = add_new_fixes(df, col, previous)
        print("Applying fixes")
        for old, new in fixes.items():
            df.loc[df[col] == old, [col]] = new
        tprint("Fixes applied")
        cli.cls()
        show_values(df, col)
        tprint("Would you like to further modify the values in %s?" % col)
        working = cli.ask_yes(True)
        cli.cls()
def ask_meant(guess, default=False):
    '''
    Asks whether the user meant to choose guess.

    default is passed straight to ask_yes; the answer from ask_yes is
    returned unchanged.
    '''
    tprint("Did you mean to choose %s?" % guess)
    return ask_yes(default)
def ask_continue(default=True):
    '''
    Asks whether to keep cleaning data and calls end() when the answer is no.

    default is passed straight to ask_yes.
    '''
    tprint("Would you like to continue cleaning data?")
    keep_going = ask_yes(default)
    if keep_going:
        return
    end()
def modify_column_names(df):
    '''
    Loop for renaming individual columns.

    Every pass reassigns df.columns from fix_current_columns (which also
    lists them), then asks whether a column should be renamed. On a yes the
    user picks a column by number and types a new name, which is run through
    make_compatible and confirmed before change_one_column is called.

    modifies df in place, no return
    '''
    modding = True
    while modding:
        #THIS NEEDS TO BECOME A FXN CALL... SO CAN USE W/ named = False
        df.columns = fix_current_columns(df)
        tprint("Would you like to modify a column's name?")
        modding = cli.ask_yes(False)
        if modding:
            print()
            col = input(
                "Choose the number of the column to modify (1, 2, 3...): ")
            try:
                i = int(col.strip()) - 1
                #0 and negative numbers would otherwise rename a column
                #counted from the end
                if i < 0:
                    raise IndexError(i)
                current = df.columns[i]
                cli.cls(verbose=False)
                tprint("Renaming column: " + current)
                print()
                typed = input("Type new name: ").strip()
                #empty input is checked before conversion as well as after
                new = make_compatible(typed) if typed else ""
                if new:
                    tprint("Converting name for compatibility")
                    tprint("Would you like to rename " + current + " to " +
                           new + " ?")
                    if cli.ask_yes(True):
                        change_one_column(new, df, i)
                else:
                    tprint("No new column name detected")
            except Exception:
                #best effort: any failure is reported and the loop carries on
                tprint("Error renaming column")
        cli.cls()
def choose_record(df, col, previous):
    '''
    Lets the user pick a saved record from the data directory and loads it.

    df, col:  passed on to show_values
    previous: accepted but not used here

    A record is a .txt file with one replacement per line written as
    old///new. Lines that do not split into exactly two parts are skipped
    with a message.

    returns dict of old -> new; the dict is empty when no records exist or
    when the user decides to start a new record instead
    '''
    rv = {}
    #NOTE: an earlier draft of this function prompted for an organization
    #from a staff directory and suggested close matches; that deprecated
    #code was kept here as string literals and has been removed
    try:
        #splitext drops only the extension; str.strip(".txt") would also eat
        #leading/trailing '.', 't' and 'x' characters of the name itself
        records = sorted(
            os.path.splitext(n)[0] for n in os.listdir("data")
            if n.endswith(".txt"))
    except FileNotFoundError:
        records = []
    if not records:
        #show_values falls back to the column's own values when given an
        #empty list, so stop before offering a meaningless choice
        tprint("No saved records found")
        return rv

    chosen = False
    while not chosen:  #REDO STRINGS/SELECTION
        org = show_values(df, col, output=True, values=records)
        try:
            if org is None:
                #no selection was made; treat it like a missing record
                raise FileNotFoundError
            with open("data/" + org + ".txt") as f:
                data = f.readlines()
        except FileNotFoundError:
            tprint("Error loading record")
            tprint("Would you like to start a new record?")
            if cli.ask_yes(True):
                chosen = True
            continue
        for row in data:
            try:
                old, fix = row.split("///")
            except ValueError:
                #This may hurt abstraction too much as is
                tprint("Bypassing incorrectly formatted data")
                continue
            rv[old] = fix.strip()
        chosen = True
        tprint(org + " data loaded")
    return rv
def add_new_fixes(df, col, previous):
    '''
    Collects replacement rules (old value -> new value) from the user.

    df, col:  passed to select_value for picking values
    previous: earlier changes as pairs; when the user agrees, each pair p is
              added to the record as p[1] -> p[0]

    For each rule the user picks the value to replace, then either picks
    another existing value or types a custom one, and confirms the pair.
    Afterwards the rules can be saved to data/<name>.txt, one old///new pair
    per line.

    returns dict of old -> new
    '''
    finished = False
    fixes = {}
    while not finished:
        #MAKE A FUNCTION FROM A BUNCH OF THIS SO CAN USE WITH EXISTING...
        tprint("Choose a value to replace")
        old = select_value(df, col)
        tprint(
            "Would you like to choose another existing value to replace: %s ?"
            % old)
        print(
            "(Otherwise you will be prompted to enter a custom replacement value)"
        )
        if cli.ask_yes(True):
            cli.cls(False)
            tprint("Choose a value to replace '%s'" % old)
            new = select_value(df, col)
        else:
            chosen = False
            while not chosen:
                new = input("Enter custom value to replace %s:\t" % old)
                if new:
                    tprint("Use %s ?" % new)
                    chosen = cli.ask_yes(True)
        cli.cls(verbose=False)

        if old and new:
            tprint("You chose: " + old + " -> " + new)
            tprint("Confirm this replacement?")
            if cli.ask_yes(True):
                tprint("Confirmed")
                fixes[old] = new
        cli.cls()

        if fixes:
            print("Your chosen replacements:")
            tprint("\tCurrent\tReplaced")
            for current, replaced in sorted(fixes.items()):
                print("\t" + current + "\t" + replaced)
        tprint("Would you like to add another replacement?")
        if cli.ask_yes(True):
            cli.cls()
            continue  #*Slightly* jank

        tprint(
            "Would you like to save a record of these replacements for future use?"
        )
        if cli.ask_yes(True):
            if previous:
                tprint(
                    "Would you like to include the changes you selected from our suggestions in this record?"
                )
                if cli.ask_yes():
                    for p in previous:
                        fixes[p[1]] = p[0]
            cli.cls()
            named = False
            while not named:
                name = input("Choose a name for this record:\t")
                name = name.lower().strip()
                if not name:
                    #an empty name would produce the file "data/.txt"
                    continue
                tprint("Do you want to name this record: %s ?" % name)
                named = cli.ask_yes(True)
            cli.cls(verbose=False)
            #sorted here so the list exists even when no fix was confirmed
            sort_fixes = sorted(fixes.items())
            with open("data/" + name + ".txt", 'w') as f:
                #join puts separators between lines only, no trailing newline
                f.write("\n".join(
                    current + '///' + replaced
                    for current, replaced in sort_fixes))
        finished = True
    return fixes