Пример #1
0
def fix_guess(g, df, col):
    '''
    Ask which value in a group of similar values is the correct one, then
    hand the group to switch_data so the other values can be converted.

    g: indexable group of similar values; the user's number picks g[number - 1]
    df, col: passed through unchanged to switch_data

    Any input that does not select an entry of g (blank, not a number, out of
    range) falls back to prompting for a custom replacement value. An empty
    custom value cancels without changing anything.
    No return value.
    '''
    #BUILD SOMETHING TO SAVE THESE CHANGES
    #NEEDS RETURNS HERE AND IN SWITCH DATA
    print()
    c = input(
        "Choose the number of the correct value (1, 2...) or press Enter to use a custom value:  "
    )
    try:
        c = int(c.strip()) - 1
        if c < 0:
            # 0 or a negative number would silently index from the end of g
            raise IndexError(c)
        correct = g[c]
    except (ValueError, TypeError, LookupError):
        chosen = False
        while not chosen:
            custom = input(
                "Enter a new correct value or press Enter to cancel:  ")
            custom = custom.strip()
            if not custom:
                # empty input cancels
                break
            # was '"Use %s ?" & custom', which raised TypeError
            tprint("Use %s ?" % custom)
            chosen = cli.ask_yes(True)
            if chosen:
                switch_data(df, col, g, -1, custom)
                #-1 is jank fix for how switch_data currently works
    else:
        switch_data(df, col, g, c, correct)
Пример #2
0
def convert_data(df, col):
    '''
    Placeholder for converting the data in a column.

    df and col are accepted but not used yet: the function only reports that
    the feature is unavailable and then calls cli.cls().
    '''
    #replace when finished
    notices = (
        "Unfortunately this feature is not currently supported",
        "Stay tuned for future releases and updates",
    )
    for notice in notices:
        tprint(notice)
    cli.cls()
Пример #3
0
def drop_na(df, col):
    '''
    Offer to remove the rows that have no value in one column.

    df: dataframe, modified in place when the user agrees
    col: label of the column checked for missing values
    No return value.
    '''
    tprint(
        "Would you like to remove the rows that do not contain a value for: %s?"
        % col)
    if cli.ask_yes(True):
        # dropna returns a new frame by default; without inplace=True the
        # result was discarded and df was left unchanged
        df.dropna(subset=[col], inplace=True)
Пример #4
0
def end():
    '''
    Say goodbye and stop the program.

    Prints "Exiting" through tprint, waits for the user to press Enter,
    then calls sys.exit().
    '''
    tprint("Exiting")
    print()
    # pause so the message stays visible until the user acknowledges it
    input("Press Enter\t")
    sys.exit()
Пример #5
0
def clean_numeric(df, col):
    '''
    Offer to show summary statistics for one column.

    df: dataframe to read from
    col: label of the column to describe
    If the user agrees, the result of df[col].describe() is shown via tprint.
    cli.cls() is called in either case. No return value.
    '''
    tprint(
        "Would you like to see summary statistics for the data in this column?"
    )
    wants_summary = cli.ask_yes(True)
    if wants_summary:
        summary = df[col].describe()
        tprint(summary)
    cli.cls()
Пример #6
0
def load_cli_intro():
    '''
    Show the introduction text stored in text/intro.txt.

    Takes no arguments and returns nothing; the file contents are passed to
    tprint and followed by a blank line.
    '''
    with open("text/intro.txt") as intro_file:
        intro_text = intro_file.read()
    tprint(intro_text)
    print()
Пример #7
0
def ask_default(names, default_name, default=True):
    '''
    Ask whether the default column name should be used.

    names: list that default_name is appended to when the user agrees
    default_name: the name to append
    default: passed to cli.ask_yes
    returns True if the default name was appended, otherwise False
    '''
    tprint("Would you like to use the default name?")
    if not cli.ask_yes(default):
        return False
    names.append(default_name)
    return True
Пример #8
0
def ask_convert(df, col):
    '''
    Ask whether a text column was meant to hold dates or numeric data.

    df and col are accepted but not used here.
    returns True if the user answers yes, otherwise False
    '''
    print("This column seems to contain text data")
    tprint("Is this column supposed to contain dates or numeric data?")
    #cli.cls()
    return True if cli.ask_yes(False) else False
Пример #9
0
def rename_cols(df, named):
    '''
    Entry point for column renaming.

    df: dataframe whose columns are renamed in place
    named: bool, whether the columns already have names
    Unnamed columns first go through give_new_names; named ones only get a
    notice. modify_column_names runs in both cases. No return value.
    '''
    if named:
        tprint("Column names have been updated for compatibility")
    else:
        give_new_names(df)
    modify_column_names(df)
Пример #10
0
def show_first_values(df, col, count=5):
    '''
    Print the first few unique values of a column.

    df: dataframe to read from
    col: position of the column (used with df.iloc[:, col])
    count: number of values to show before stopping
    No return value.
    '''
    tprint("First %d unique values from this column:" % count)
    uniques = df.iloc[:, col].unique()
    for shown, value in enumerate(uniques, start=1):
        print("\t" + str(value))
        # checked after printing, so at least one value is always shown
        if shown >= count:
            break
    print()
Пример #11
0
def write_csv(df, name):
    '''
    Write df to the "output" directory without overwriting earlier runs.

    df: dataframe to write (written with index_label="index")
    name: original file name; a trailing ".csv" is removed before the
          "_cleaned" suffix is added
    If <name>_cleaned.csv already exists, <name>_cleaned_1.csv,
    <name>_cleaned_2.csv, ... are tried until an unused name is found.
    No return value.
    '''
    #GIVE THEM AN OPTION TO CHANGE THE NAME
    # raw string + "$": only strip the extension, not ".csv" inside the name
    f = re.sub(r"\.csv$", "", name)
    os.makedirs("output", exist_ok=True)
    # list the directory once instead of on every loop pass
    existing = set(os.listdir("output"))
    fname = f + "_cleaned.csv"
    c = 1
    while fname in existing:  #should prevent it from overwriting previous runs
        fname = f + "_cleaned_" + str(c) + ".csv"
        c += 1
    tprint("Writing CSV: %s" % fname)
    df.to_csv("output/" + fname, index_label="index")
    tprint("Finished writing cleaned data to Output")
Пример #12
0
def choose_col(df, working):
    '''
    Ask the user which column to clean.

    df: dataframe whose columns are listed and chosen from
    working: current "keep cleaning" flag; returned unchanged when a column
             is chosen, otherwise replaced by the user's answer
    returns (col_name, working); col_name is "" when nothing valid was chosen
    '''
    columns.fix_current_columns(
        df, list_only=True)  #really ugly fix in columns atm
    print()
    col = input(
        "Choose the number of a column to clean (1, 2, 3...) or press Enter: ")
    try:
        i = int(col.strip()) - 1  #maybe should do something w/ the name...
        if i < 0:
            # 0 or a negative number would silently pick a column from the end
            raise IndexError(i)
        col_name = df.columns[i]
    except (ValueError, IndexError):
        col_name = ""
        tprint("No column selected")
        tprint("Would you like to continue cleaning columns?")
        working = cli.ask_yes(True)
    return col_name, working
Пример #13
0
def fix_current_columns(df, list_only=False):
    '''
    Print a numbered list of the columns of df.

    df: dataframe whose column names are read
    list_only: when False each name is passed through make_compatible and
               the converted name is printed and collected; when True the
               names are printed as they are and nothing is collected
    returns list of converted names (empty when list_only is True)
    '''
    tprint("Columns:")
    compat = []
    for position, old in enumerate(df.columns, start=1):
        if list_only:  #ugly fix atm
            shown = old
        else:
            shown = make_compatible(old)
            compat.append(shown)
        print("\t" + str(position) + ") " + shown)
    return compat
Пример #14
0
def clean_strings(df, col):
    '''
    Interactive cleanup of a text column.

    df: dataframe, df[col] is modified in place
    col: label of the column (concatenated into messages as a string)
    Steps:
      1. strip the values and apply a fixed set of text replacements
      2. report the number of unique values (with a warning above WARN_LEVEL)
      3. optionally build guesses with guess_linkages / consolidate_guesses
         and pass them to review_guesses
      4. show the current values and optionally hand over to fix_manual
    No return value.
    '''
    tprint("Removing excess white space from values in %s" % col)
    cleaned = df[col].str.strip()
    # applied in this order, one pass each
    for before, after in ((" ,", ","), (",,", ","), ("  ", " "), ("\t", " ")):
        cleaned = cleaned.str.replace(before, after)
    df[col] = cleaned
    cli.cls()

    u = df[col].unique()
    num_unique = len(u)
    print("Column " + col + " contains " + str(num_unique) + " unique values")
    if num_unique > WARN_LEVEL:
        tprint(
            "Large numbers of unique values may take significant time to analyze, depending on their length, your system and settings"
        )

    tprint("Would you like to search for possible errors in the data?")
    if cli.ask_yes(True):
        cli.cls(False)
        linkages = guess_linkages(u)  #doing a lot w/ generator
        guesses = consolidate_guesses(linkages)
        if not guesses:
            tprint("Our review did not detect any likely errors")
        else:
            review_guesses(df, col, guesses)
            tprint("All automatic revisions reviewed")
    cli.cls()

    ######################
    #DO THE MANUAL VERSION
    print("Current unique values in %s:" % col)
    show_values(df, col)
    cli.cls()
    print("Would you like to further clean or modify the values in %s?" % col)
    if cli.ask_yes(True):
        #make previous update w/ confirmed automatic fixes...
        # ^ this will take some work
        fix_manual(df, col, [])
Пример #15
0
def review_guesses(df, col, guesses):
    '''
    Walk through groups of similar values and offer to fix each group.

    df, col: dataframe and column the values are counted in
    guesses: iterable of groups; each group is an iterable of string values
    For every group the values are listed with the number of rows equal to
    them. Groups with fewer than two values still present in the data are
    skipped; for the others the user may call fix_guess.
    No return value.
    '''
    #need to think about consolidating if there are a bunch of similar
    #build so g can contain 2+ values
    for g in guesses:
        print("Similar Value (Number in data):")
        #exists is a p-jank solution for not having consolidate_guesses...
        exists = 0
        for position, v in enumerate(g, start=1):
            matching_rows = df[df[col] == v]
            num_v = len(matching_rows)
            if num_v:
                exists += 1
            print("\t" + str(position) + ") " + v + "\t(%d)" % num_v)
        if exists > 1:
            tprint("Would you like to update one or more of these values?")
            if cli.ask_yes(False):
                fix_guess(g, df, col)
            cli.cls(True)
        else:
            cli.cls(False)
Пример #16
0
def switch_data(df, col, g, c, correct):
    '''
    Offer to replace every value of a group except one with a correct value.

    df, col: dataframe and column that are updated in place
    g: indexable group of string values
    c: index in g that is skipped (callers pass -1 so nothing is skipped)
    correct: replacement string
    Any exception raised while converting is reported with a message
    instead of propagating. No return value.
    '''
    try:
        for idx in range(len(g)):
            if idx == c:
                continue
            tprint("Would you like to convert " + g[idx] + " to " + correct +
                   "?")
            if not cli.ask_yes(True):
                continue
            rows = df[col] == g[idx]
            df.loc[rows, [col]] = correct
            tprint("Converting " + g[idx] + " -> " + correct, 0.5)
    except Exception:
        tprint("Error converting data. Data not converted")
Пример #17
0
def give_new_names(df):
    '''
    Interactively name every column of a dataframe.

    df: dataframe; df.columns is replaced at the end with the chosen names
    For each column the first values are shown and the user either types a
    name (lower-cased, spaces replaced by underscores) or accepts the default
    "column_<n>". A typed name that is already in use is rejected and the
    user is asked again, so exactly one name is collected per column.
    No return value.
    '''
    names = []
    num_cols = len(df.columns)
    for i in range(num_cols):
        default_name = "column_" + str(i + 1)
        print("Column " + str(i + 1) + " of " + str(num_cols) +
              " is currently named: " + default_name)
        show_first_values(df, i)
        chosen = False
        while not chosen:
            name = input(
                "Enter a new name for this column or press Enter to keep the default: "
            )
            name = name.strip()
            if not name:
                chosen = ask_default(names, default_name, True)
                continue
            com_name = name.lower().replace(" ", "_")
            tprint("Modifying input for compatibility: " + name + " -> " +
                   com_name)
            tprint("Would you like to name this column " + com_name + " ?")
            if not cli.ask_yes(True):
                chosen = ask_default(names, default_name, False)
            elif com_name in names:
                # stay in the loop: leaving without appending would make
                # names shorter than df.columns and break the assignment below
                print("Name already used for a column. Please choose another")
            else:
                tprint("Setting column name")
                # store the name the user actually confirmed
                names.append(com_name)
                chosen = True
        cli.cls()
    #setting the new names
    tprint("Adding column names")
    df.columns = names
    cli.cls()
Пример #18
0
def show_values(df, col, output=False, values=None):
    '''
    Print a numbered list of values ten at a time.

    df, col: only used when values is empty or None; the list then becomes
             the sorted unique values of df[col]
    output: when False the list is only displayed and the user is asked
            after every ten values whether to continue
            when True the user is prompted after every ten values (and at
            the end) to pick a value by number
    values: optional list of values to show
    returns the chosen value as a string when output is True and a valid
    number is entered; otherwise None
    '''
    if not values:  #THIS IS A BIT SILLY
        values = sorted(df[col].unique())
    total = len(values)
    pages = math.ceil(total / 10)
    print("Current unique values: (Page 1 of %d)" % pages)
    for i, v in enumerate(values, start=1):
        # str() so that non-string values can be listed too
        print("\t" + str(i) + ") " + str(v))
        if not output:
            if i % 10 == 0 and i < total:
                tprint("Show more values?")
                if cli.ask_yes(True):
                    cli.cls(False)
                    print("Current unique values: (Page %d of %d)" %
                          (i // 10 + 1, pages))
                else:
                    break
        elif i % 10 == 0 or i == total:
            print()
            more = i < total
            while True:  #left by returning a value or breaking to next page
                if more:
                    c = input(
                        "Type the number of a value (1, 2...) or press Enter to view more values: "
                    )
                else:
                    c = input(
                        "Type the number of a value to select it (1, 2, 3...):  "
                    )
                try:
                    choice = int(c)
                except ValueError:
                    choice = 0
                # only 1..total is valid; 0 and negatives used to index
                # from the end of the list
                if 1 <= choice <= total:
                    return str(values[choice - 1])
                tprint("No value selected")
                if more:
                    tprint("View more values?")
                    if cli.ask_yes(True):
                        cli.cls(False)
                        print("Current unique values: (Page %d of %d)" %
                              (i // 10 + 1, pages))
                        break
Пример #19
0
def fix_manual(df, col, previous=None):
    '''
    Loop that lets the user apply manual replacements to a column.

    df, col: dataframe and column that are updated in place
    previous: optional list passed on to choose_record / add_new_fixes;
              defaults to a new empty list
    Each pass gets a dict of old -> new, either from choose_record or from
    add_new_fixes (also used when no record was loaded), applies it, shows
    the resulting values and asks whether to go again.
    No return value.
    '''
    if previous is None:
        # avoid sharing one default list between calls
        previous = []
    working = True
    while working:
        tprint("Would you like to load a record of previously used changes?")
        if cli.ask_yes(True):
            fixes = choose_record(df, col, previous)  #REDO/RENAME
            if not fixes:
                fixes = add_new_fixes(df, col, previous)
        else:
            cli.cls()
            fixes = add_new_fixes(df, col, previous)
        print("Applying fixes")
        for old, new in fixes.items():
            df.loc[df[col] == old, [col]] = new
        tprint("Fixes applied")
        cli.cls()
        show_values(df, col)
        tprint("Would you like to further modify the values in %s?" % col)
        working = cli.ask_yes(True)
        cli.cls()
Пример #20
0
def ask_meant(guess, default=False):
    '''
    Ask the user to confirm a suggested value.

    guess: value inserted into the question
    default: passed to ask_yes
    returns whatever ask_yes returns
    '''
    tprint("Did you mean to choose %s?" % guess)
    return ask_yes(default)
Пример #21
0
def ask_continue(default=True):
    '''
    Ask whether to keep cleaning data; call end() if the answer is no.

    default: passed to ask_yes
    No return value.
    '''
    tprint("Would you like to continue cleaning data?")
    if ask_yes(default):
        return
    end()
Пример #22
0
def modify_column_names(df):
    '''
    Loop that lets the user rename columns one at a time.

    df: dataframe; on every pass df.columns is replaced by the result of
        fix_current_columns(df), and a confirmed rename is applied through
        change_one_column
    The loop ends when the user declines to modify a column name. Any error
    while renaming (including an invalid column number) is reported and the
    loop continues. No return value.
    '''
    modding = True
    while modding:
        #THIS NEEDS TO BECOME A FXN CALL... SO CAN USE W/ named = False
        df.columns = fix_current_columns(df)
        tprint("Would you like to modify a column's name?")
        modding = cli.ask_yes(False)
        if modding:
            print()
            col = input(
                "Choose the number of the column to modify (1, 2, 3...): ")
            try:
                i = int(col.strip()) - 1
                if not 0 <= i < len(df.columns):
                    # 0 or a negative number would otherwise rename a
                    # column counted from the end
                    raise IndexError(i)
                cli.cls(verbose=False)
                tprint("Renaming column: " + df.columns[i])
                print()
                new = input("Type new name: ").strip()
                new = make_compatible(new)
                if new:
                    tprint("Converting name for compatibility")
                    tprint("Would you like to rename " + df.columns[i] +
                           " to " + new + " ?")
                    if cli.ask_yes(True):
                        change_one_column(new, df, i)
                else:
                    tprint("No new column name detected")
            except Exception:
                # best effort: report and go back to the column list
                tprint("Error renaming column")
            cli.cls()
Пример #23
0
def choose_record(df, col, previous):
    '''
    Let the user pick a saved record of replacements and load it.

    df, col: passed to show_values
    previous: currently unused
    Record names are the ".txt" files in the "data" directory; the choice is
    made through show_values(..., output=True). Every line of a record is
    expected to look like old///new; lines that do not split into exactly
    two parts are skipped with a message. If the record cannot be opened the
    user is asked whether to start a new record instead; answering no shows
    the list again.
    returns dict of old -> new (empty when no record was loaded)
    large parts of this fxn are currently deprecated, unclear if stable
    '''
    rv = {}
    chosen = False  #this is basically a C paradigm tbh
    while not chosen:
        #REDO STRINGS/SELECTION
        # NOTE: an older organization-based selection flow (with suggested
        # corrections for misspelled names) used to live here as dead code.
        # Remove only the ".txt" suffix. str.strip(".txt") removes any
        # leading/trailing '.', 't' and 'x' characters and mangled names
        # such as "test.txt".
        val = [
            os.path.splitext(n)[0] for n in os.listdir("data")
            if n.endswith(".txt")
        ]
        org = show_values(df, col, output=True, values=val)
        data = None
        if org is not None:
            try:
                with open("data/" + org + ".txt") as f:
                    data = f.readlines()
            except FileNotFoundError:
                pass  # reported below together with "nothing selected"
        if data is None:
            tprint("Error loading record")
            tprint("Would you like to start a new record?")
            if cli.ask_yes(True):
                chosen = True
            continue
        for row in data:
            try:
                old, fix = row.split("///")
                rv[old] = fix.strip()
            except ValueError:
                #This may hurt abstraction too much as is
                tprint("Bypassing incorrectly formatted data")
        chosen = True
        tprint(org + " data loaded")
    return rv
Пример #24
0
def add_new_fixes(df, col, previous):
    '''
    Interactively build a dict of replacements for the values of a column.

    df, col: passed to select_value when the user picks existing values
    previous: sequence of earlier changes; when the user asks to include
              them, each item p is stored as fixes[p[1]] = p[0]
              (presumably (new, old) pairs - confirm against the caller)
    The user picks a value to replace and either another existing value or
    a custom one as replacement. Once at least one replacement has been
    confirmed the user may add more, and may save the replacements to
    data/<name>.txt, one old///new pair per line.
    returns dict of old -> new
    '''
    finished = False
    fixes = {}
    while not finished:
        #MAKE A FUNCTION FROM A BUNCH OF THIS SO CAN USE WITH EXISTING...
        tprint("Choose a value to replace")
        old = select_value(df, col)
        tprint(
            "Would you like to choose another existing value to replace: %s ?"
            % old)
        print(
            "(Otherwise you will be prompted to enter a custom replacement value)"
        )
        if cli.ask_yes(True):
            cli.cls(False)
            tprint("Choose a value to replace '%s'" % old)
            new = select_value(df, col)
        else:
            chosen = False
            while not chosen:
                new = input("Enter custom value to replace %s:\t" % old)
                if new:
                    tprint("Use %s ?" % new)
                    chosen = cli.ask_yes(True)
        cli.cls(verbose=False)
        if old and new:
            tprint("You chose: " + old + " -> " + new)
            tprint("Confirm this replacement?")
            if cli.ask_yes(True):
                tprint("Confirmed")
                fixes[old] = new
            cli.cls()
        if fixes:
            print("Your chosen replacements:")
            tprint("\tCurrent\tReplaced")
            sort_fixes = sorted(fixes.items())
            for shown_old, shown_new in sort_fixes:
                print("\t" + shown_old + "\t" + shown_new)
            tprint("Would you like to add another replacement?")
            if cli.ask_yes(True):
                cli.cls()
                continue  #*Slightly* jank
            tprint(
                "Would you like to save a record of these replacements for future use?"
            )
            if cli.ask_yes(True):
                if previous:
                    tprint(
                        "Would you like to include the changes you selected from our suggestions in this record?"
                    )
                    if cli.ask_yes():
                        for p in previous:
                            fixes[p[1]] = p[0]
                        sort_fixes = sorted(fixes.items())
                cli.cls()
                named = False
                while not named:
                    name = input("Choose a name for this record:\t")
                    name = name.lower().strip()
                    if not name:
                        # an empty name would create "data/.txt"
                        continue
                    tprint("Do you want to name this record:  %s  ?" % name)
                    named = cli.ask_yes(True)
                    cli.cls(verbose=False)
                with open("data/" + name + ".txt", 'w') as f:
                    # newline between pairs only; the old check compared a
                    # string with a tuple and so always appended a newline
                    f.write("\n".join(
                        saved_old + '///' + saved_new
                        for saved_old, saved_new in sort_fixes))
            finished = True
    return fixes