示例#1
0
def convert_data(df, col):
    '''
    Placeholder for data conversion.

    Neither df nor col is used yet: the function only shows two notices
    through tprint and then calls cli.cls().
    '''
    # TODO: replace with the real implementation when the feature is finished
    notices = (
        "Unfortunately this feature is not currently supported",
        "Stay tuned for future releases and updates",
    )
    for notice in notices:
        tprint(notice)
    cli.cls()
示例#2
0
def modify_column_names(df):
    '''
    Interactively rename columns of df, one at a time.

    On every pass the column names are first refreshed with
    fix_current_columns(df). The user is then asked whether to rename a
    column; the loop ends as soon as the answer is no. A rename is done by
    typing the 1-based column number and a new name, which is passed through
    make_compatible() and applied with change_one_column() after
    confirmation.

    df: the DataFrame whose columns are renamed in place.
    Returns None.
    '''
    modding = True
    while modding:
        # TODO: make this a function call so it can be used with named = False
        df.columns = fix_current_columns(df)
        tprint("Would you like to modify a column's name?")
        modding = cli.ask_yes(False)
        if modding:
            print()
            col = input(
                "Choose the number of the column to modify (1, 2, 3...): ")
            try:
                i = int(col.strip()) - 1
                # Reject 0 and negative numbers explicitly: a negative index
                # would otherwise silently pick a column from the end.
                if not 0 <= i < len(df.columns):
                    raise IndexError("column number out of range: " + col)
                cli.cls(verbose=False)
                tprint("Renaming column: " + df.columns[i])
                print()
                new = input("Type new name: ").strip()
                new = make_compatible(new)
                if new:
                    tprint("Converting name for compatibility")
                    tprint("Would you like to rename " + df.columns[i] +
                           " to " + new + " ?")
                    if cli.ask_yes(True):
                        change_one_column(new, df, i)
                else:
                    tprint("No new column name detected")
            except Exception:
                # Deliberately broad: any bad input or failure in the helpers
                # is reported to the user and the loop simply carries on.
                tprint("Error renaming column")
            cli.cls()
示例#3
0
def clean_numeric(df, col):
    '''
    Offer to display summary statistics for column col of df.

    If the user accepts (default answer is yes), the result of
    df[col].describe() is handed to tprint. The screen is cleared through
    cli.cls() in either case. Returns None.
    '''
    question = "Would you like to see summary statistics for the data in this column?"
    tprint(question)
    wants_summary = cli.ask_yes(True)
    if wants_summary:
        summary = df[col].describe()
        tprint(summary)
    cli.cls()
示例#4
0
def select_value(df, col):
    '''
    Let the user pick one value from column col and confirm the choice.

    show_values() is called in output mode until cli.ask_meant() confirms
    the returned value; the screen is cleared between attempts.

    Returns the confirmed value as returned by show_values().
    '''
    print()
    while True:
        picked = show_values(df, col, output=True)
        if cli.ask_meant(picked, default=True):
            return picked
        # Not what the user meant: clear quietly and show the list again.
        cli.cls(verbose=False)
示例#5
0
def show_values(df, col, output=False, values=None):
    '''
    Print the values of a column in numbered pages of ten.

    df, col: DataFrame and column name; only read when values is not given.
    output: when False the values are just listed, pausing after every full
        page to ask whether to continue. When True the user is prompted
        after each page to type the number of a value, and that value is
        returned.
    values: optional sequence of values to list. When None or empty, the
        sorted unique values of df[col] are used.

    Returns the chosen value converted to str when output is True and the
    user selects one; otherwise None.
    '''
    if values is None or len(values) == 0:
        values = sorted(df[col].unique())
    total = len(values)
    pages = math.ceil(total / 10)
    print("Current unique values: (Page 1 of %d)" % pages)
    for i, v in enumerate(values, start=1):
        # str(v) so that numeric or other non-string values can be listed.
        print("\t" + str(i) + ") " + str(v))
        if not output:
            if i % 10 == 0 and i < total:
                tprint("Show more values?")
                if cli.ask_yes(True):
                    cli.cls(False)
                    print("Current unique values: (Page %d of %d)" %
                          (i // 10 + 1, pages))
                else:
                    break
        elif i % 10 == 0 or i == total:
            # End of a page (or of the list): ask for a selection.
            print()
            more = i < total
            while True:  # left only by return, or by break to the next page
                if more:
                    c = input(
                        "Type the number of a value (1, 2...) or press Enter to view more values: "
                    )
                else:
                    c = input(
                        "Type the number of a value to select it (1, 2, 3...):  "
                    )
                try:
                    c = int(c)
                    # Guard against 0 / negative numbers, which would
                    # otherwise index from the end of the list.
                    if not 1 <= c <= total:
                        raise IndexError(c)
                    return str(values[c - 1])
                except (ValueError, IndexError):
                    tprint("No value selected")
                    if more:
                        tprint("View more values?")
                        if cli.ask_yes(True):
                            cli.cls(False)
                            print(
                                "Current unique values: (Page %d of %d)" %
                                (i // 10 + 1, pages))
                            break
示例#6
0
def clean_columns(df):
    '''
    Drive the column-by-column cleaning loop for df.

    Each pass asks choose_col() for a column and an updated "keep working"
    flag. When both are truthy, missing values are handled by drop_na() and
    the column is routed to clean_numeric() or check_mixed_data() depending
    on the result of assess_data_type(). The loop stops once choose_col()
    returns a falsy flag. Returns None.
    '''
    # NOTE: the end condition depends entirely on what choose_col returns.
    working = True
    while working:
        print("DATA CLEANING")
        col, working = choose_col(df, working)
        cli.cls()
        if not working or not col:
            # Nothing selected this round; the while test decides what next.
            continue
        drop_na(df, col)
        cli.cls()
        if assess_data_type(df[col]):
            clean_numeric(df, col)
        else:
            check_mixed_data(df, col)
示例#7
0
def start_record(df, col, default=True):
    '''
    Ask whether to create a new organization record and, if so, name it.

    default: default answer passed to cli.ask_yes() for the first question.

    When the user declines, the screen is cleared and the function returns.
    Otherwise a name is requested (upper-cased and stripped) until the user
    confirms it, a file name is built from it (spaces become underscores,
    ".txt" appended) and add_new_fixes() is called. Returns None.
    '''
    print("Would you like to create a new organization?")
    wants_new = cli.ask_yes(default)
    print()
    if not wants_new:
        cli.cls(verbose=False)
        return
    while True:
        name = input("Type organization name or abbreviation:\t").upper().strip()
        print()
        print("Do you want to name this record: %s ?" % name)
        confirmed = cli.ask_yes(True)
        cli.cls(verbose=False)
        if confirmed:
            break
    fname = "_".join(name.split(" ")) + ".txt"
    # TODO: maybe add a failsafe guess here, or (more likely) offer an option
    # to merge records later.
    # NOTE(review): fname is passed as the third argument of add_new_fixes;
    # confirm that this is what that function expects in that position.
    add_new_fixes(df, col, fname)
示例#8
0
def give_new_names(df):
    '''
    Interactively assign a name to every column of df.

    For each column the default name "column_<n>" and the first values are
    shown (show_first_values). The user may type a name, which is lower-cased
    with spaces replaced by underscores, or press Enter to fall back to
    ask_default(). The loop for a column only ends once a name has been
    accepted, so the collected list lines up with df.columns.

    NOTE(review): ask_default(names, default_name, flag) is presumably
    responsible for appending the default name to names and returning whether
    it did - confirm against its definition.

    df: DataFrame whose columns attribute is replaced. Returns None.
    '''
    names = []
    num_cols = len(df.columns)
    for i in range(num_cols):
        default_name = "column_" + str(i + 1)
        print("Column " + str(i + 1) + " of " + str(num_cols) +
              " is currently named: " + default_name)
        show_first_values(df, i)
        chosen = False
        while not chosen:
            name = input(
                "Enter a new name for this column or press Enter to keep the default: "
            )
            name = name.strip()
            if not name:
                chosen = ask_default(names, default_name, True)
                continue
            com_name = name.lower().replace(" ", "_")
            tprint("Modifying input for compatibility: " + name + " -> " +
                   com_name)
            tprint("Would you like to name this column " + com_name + " ?")
            if not cli.ask_yes(True):
                chosen = ask_default(names, default_name, False)
                continue
            if com_name in names:
                # Stay in the loop: leaving without appending would make the
                # list shorter than the number of columns.
                print("Name already used for a column. Please choose another")
                continue
            tprint("Setting column name")
            # Store the compatible name the user actually confirmed.
            names.append(com_name)
            chosen = True
        cli.cls()
    # Apply the collected names in one go.
    tprint("Adding column names")
    df.columns = names
    cli.cls()
示例#9
0
def review_guesses(df, col, guesses):
    '''
    Walk the user through groups of similar values found in column col.

    guesses: iterable of groups, each an iterable of values. For every group
    the values are listed with the number of rows in df where df[col] equals
    them. Groups in which at most one value is actually present are skipped;
    for the others the user is asked whether to update, and fix_guess() is
    called on a yes. Returns None.
    '''
    # TODO: consolidate groups that share many similar values.
    for group in guesses:
        print("Similar Value (Number in data):")
        # Counting how many values still exist is a stand-in for proper
        # consolidation of the guesses.
        present = 0
        for position, value in enumerate(group, start=1):
            matches = len(df[df[col] == value])
            if matches:
                present += 1
            print("\t" + str(position) + ") " + value + "\t(%d)" % matches)
        if present <= 1:
            cli.cls(False)
            continue
        tprint("Would you like to update one or more of these values?")
        if cli.ask_yes(False):
            fix_guess(group, df, col)
        cli.cls(True)
示例#10
0
def clean_strings(df, col):
    '''
    Normalise whitespace in a string column, then offer automatic and
    manual clean-up of its values.

    Steps:
      1. Strip the values, collapse runs of spaces/tabs to one space, remove
         a space before a comma and collapse repeated commas.
      2. Report the number of unique values (with a warning above
         WARN_LEVEL) and optionally run guess_linkages / consolidate_guesses
         followed by review_guesses.
      3. List the current values and optionally start fix_manual.

    df: DataFrame modified in place; col: name of a string column
    (the .str accessor is used). Returns None.
    '''
    tprint("Removing excess white space from values in %s" % col)
    cleaned = df[col].str.strip()
    # Tabs and spaces are handled together and as whole runs, so the result
    # does not depend on replacement order or on how long a run is.
    cleaned = cleaned.str.replace(r"[ \t]+", " ", regex=True)
    cleaned = cleaned.str.replace(" ,", ",", regex=False)
    cleaned = cleaned.str.replace(r",{2,}", ",", regex=True)
    df[col] = cleaned
    cli.cls()
    u = df[col].unique()
    num_unique = len(u)
    print("Column " + col + " contains " + str(num_unique) + " unique values")
    if num_unique > WARN_LEVEL:
        tprint(
            "Large numbers of unique values may take significant time to analyze, depending on their length, your system and settings"
        )
    tprint("Would you like to search for possible errors in the data?")
    if cli.ask_yes(True):
        cli.cls(False)
        guesses = consolidate_guesses(guess_linkages(u))
        if guesses:
            review_guesses(df, col, guesses)
            tprint("All automatic revisions reviewed")
        else:
            tprint("Our review did not detect any likely errors")
    cli.cls()
    # Manual pass.
    print("Current unique values in %s:" % col)
    show_values(df, col)
    cli.cls()
    print("Would you like to further clean or modify the values in %s?" % col)
    if cli.ask_yes(True):
        # TODO: fill this with the confirmed automatic fixes (needs work).
        previous = []
        fix_manual(df, col, previous)
示例#11
0
def fix_manual(df, col, previous=None):
    '''
    Interactively replace values in column col until the user is done.

    On each pass the user may load a stored record of replacements
    (choose_record) or enter new ones (add_new_fixes); add_new_fixes is also
    the fallback when choose_record returns nothing. Every old -> new pair
    in the resulting mapping is applied to df in place, in iteration order.

    previous: earlier fixes, forwarded to choose_record / add_new_fixes.
        Defaults to a fresh empty list for every call.
    Returns None.
    '''
    if previous is None:
        # A new list per call; a literal [] default would be shared.
        previous = []
    working = True
    while working:
        tprint("Would you like to load a record of previously used changes?")
        if cli.ask_yes(True):
            fixes = choose_record(df, col, previous)  # TODO: redo/rename
            if not fixes:
                fixes = add_new_fixes(df, col, previous)
        else:
            cli.cls()
            fixes = add_new_fixes(df, col, previous)
        print("Applying fixes")
        for old, new in fixes.items():
            df.loc[df[col] == old, [col]] = new
        tprint("Fixes applied")
        cli.cls()
        show_values(df, col)
        tprint("Would you like to further modify the values in %s?" % col)
        working = cli.ask_yes(True)
        cli.cls()
示例#12
0
def main():
    '''
    Run the interactive session: intro, load a file, (re)name columns,
    clean columns, write the result, outro - repeated in an endless loop.

    The loop has no exit of its own in this function.
    NOTE(review): presumably cli.outro() ends the process or lets the user
    restart - confirm in the cli module.
    '''
    # NOTE: the whole body could be wrapped in try/except to shield the user
    # from tracebacks.
    while True:
        # Intro: clear quietly, show the intro text, clear again.
        cli.cls(verbose=False)
        cli.load_cli_intro()
        cli.cls()

        # Open the input file; yields the data, a "named" flag and a file name.
        df, named, fname = initial.load_in()
        cli.cls()

        # Name or rename the columns.
        columns.rename_cols(df, named)
        cli.cls()

        # TODO: adding ages (calcdates.add_age(df) followed by cli.cls()) is
        # disabled; it is meant to be folded into the cleaning step.

        # Clean the data column by column.
        # For debugging, this call can be wrapped in try/except Exception,
        # printing the error and waiting on input("").
        clean.clean_columns(df)
        cli.cls()

        # Write the result to CSV (implemented, still needs testing).
        final.write_csv(df, fname)
        cli.cls()

        # Finish or restart.
        cli.outro()
示例#13
0
def add_new_fixes(df, col, previous):
    '''
    Interactively build a mapping of value replacements for column col.

    The user picks a value to replace (select_value) and then either picks
    another existing value or types a custom one as the replacement. Each
    confirmed pair is stored as fixes[old] = new. Once at least one pair
    exists the user may add more, or finish and optionally save the pairs
    to "data/<name>.txt" as "old///new" lines.

    previous: earlier fixes as indexable pairs; if the user agrees, each p
        is added to the mapping as fixes[p[1]] = p[0] before saving.
    Returns the dict of replacements (old value -> new value).
    '''
    finished = False
    fixes = {}
    while not finished:
        # TODO: factor parts of this out so they can be reused with existing
        # records.
        tprint("Choose a value to replace")
        old = select_value(df, col)
        tprint(
            "Would you like to choose another existing value to replace: %s ?"
            % old)
        print(
            "(Otherwise you will be prompted to enter a custom replacement value)"
        )
        if cli.ask_yes(True):
            cli.cls(False)
            tprint("Choose a value to replace '%s'" % old)
            new = select_value(df, col)
        else:
            chosen = False
            while not chosen:
                new = input("Enter custom value to replace %s:\t" % old)
                if new:
                    tprint("Use %s ?" % new)
                    chosen = cli.ask_yes(True)
        cli.cls(verbose=False)
        if old and new:
            tprint("You chose: " + old + " -> " + new)
            tprint("Confirm this replacement?")
            if cli.ask_yes(True):
                tprint("Confirmed")
                fixes[old] = new
            cli.cls()
        if fixes:
            print("Your chosen replacements:")
            tprint("\tCurrent\tReplaced")
            sort_fixes = sorted(fixes.items())
            for current, replacement in sort_fixes:
                print("\t" + current + "\t" + replacement)
            tprint("Would you like to add another replacement?")
            if cli.ask_yes(True):
                cli.cls()
                continue  # back to the top for the next pair
            tprint(
                "Would you like to save a record of these replacements for future use?"
            )
            if cli.ask_yes(True):
                if previous:
                    tprint(
                        "Would you like to include the changes you selected from our suggestions in this record?"
                    )
                    if cli.ask_yes():
                        for p in previous:
                            fixes[p[1]] = p[0]
                        sort_fixes = sorted(fixes.items())
                cli.cls()
                named = False
                while not named:
                    name = input("Choose a name for this record:\t")
                    name = name.lower().strip()
                    tprint("Do you want to name this record:  %s  ?" % name)
                    named = cli.ask_yes(True)
                    cli.cls(verbose=False)
                with open("data/" + name + ".txt", 'w') as f:
                    # One "old///new" pair per line, joined so that the file
                    # does not end with a stray newline.
                    f.write("\n".join(
                        current + '///' + replacement
                        for current, replacement in sort_fixes))
            finished = True
    return fixes