def check_for_header(file, remove_if_present, interactive=True):
    """Detect (and optionally strip) a header line in a delimited text file.

    Heuristic: a header line is assumed to have a different character count
    than the first data line.  In interactive mode the user confirms via
    ``ui``; otherwise the header is assumed and a notice is printed.
    Returns True when a header was detected.
    """
    rows = hfh.get_lines(file)
    header_found = False
    if len(rows) > 1 and len(rows[0]) != len(rows[1]):
        if interactive:
            header_found = ui.get_yes_no_response(
                "Remove Header?\n", str("Is this a header?" + rows[0] + "?"))
        else:
            header_found = True
            print(
                file, len(rows[0]), '\n',
                "header automatically removed. Confirm that it had a header")
    # todo: when this fails fix it
    if remove_if_present and header_found:
        hfh.write_lines_to_text(rows[1:], file)
    return bool(header_found)
def convert_xCalibre_matrix_for_PCI(matrix_file, corresponding_control_file=False, id_length=8, include_id=False):
    """Convert an xCalibre score-matrix file into comma-separated and transposed csvs.

    Writes ``<stem>__c.csv`` (one comma between score characters, with an
    optional first row of item ids pulled from column 0 of the control file),
    a debug copy ``pickme.csv`` in the working directory, and a transposed
    ``<stem>_T_c.csv``.  Returns the re-transposed DataFrame.
    """
    #similar file in h_stats
    ret = []
    first_row = ""
    if corresponding_control_file:
        # optional header row: item ids from the control file's first column
        df = pd.read_csv(corresponding_control_file, header=None)
        item_ids = df.loc[:, 0]
        for id in item_ids:
            first_row += id + ","
        first_row = first_row[:-1]  # drop trailing comma
        ret = [first_row]
    lines = hfh.get_lines(matrix_file)
    for line in lines:
        ret_line = ""
        if include_id:
            ret_line = line[:id_length]
        answer_string = line[id_length:]
        for c in answer_string:
            ret_line += c + ','
        # NOTE(review): [:-3] appears to strip the ",\n," produced when the
        # line's trailing newline is treated as an answer character — assumes
        # lines keep their newline; on a newline-less line this drops the
        # last real answer.  TODO confirm against a sample matrix file.
        ret_line = ret_line[:-3]
        ret_line += '\n'
        ret.append(ret_line)
    name = hfh.get_stem(matrix_file) + "__c.csv"
    hfh.write_lines_to_text(ret, name)
    translated_name = hfh.get_stem(matrix_file) + "_T_c.csv"
    df = pd.read_csv(name, header=None)
    df.to_csv("pickme.csv")  # debug artifact written to CWD
    df = df.T
    df.to_csv(translated_name, index=False)
    return df.T
def create_c_from_LXR_Test(file_path, destination_path=None):
    """Build a default control csv (``<stem>_c.csv``) from an LXR test printout.

    Lines starting with a digit are item headers; the key is read from the
    following line's "Key: " marker.  Each record is
    ``[bank_id, key, '4', '1', 'Y', 'M']``.  Writes the csv into
    ``destination_path`` (defaults to the file's parent folder) and returns
    the DataFrame.
    """
    if destination_path is None:
        destination_path = hfh.get_parent_folder(file_path)
    lines = hfh.get_lines(file_path)
    ret = []
    counter = 0
    for line in lines:
        counter += 1
        if line[0].isnumeric():
            entry = line.split()
            test_name = hfh.get_stem(file_path)  # NOTE(review): unused local
            test_id = line[:line.index('.')]     # NOTE(review): unused local
            entry[1]  # NOTE(review): statement has no effect — dead code
            bank_id = entry[1] + '_' + entry[2]
            if len(entry) == 4:
                subject = entry[1] + "_" + entry[2]
                bank_id = subject + entry[3]
            # the key lives on the line AFTER the item header; counter is
            # already 1-based here, so lines[counter] is the next line
            key_line = lines[counter]
            key_i = key_line.find('Key: ')
            if key_i > -1:
                key = key_line[key_i + len("Key: ")]  # single key character
            else:
                # NOTE(review): when "Key: " is missing, ``key`` keeps its
                # value from the previous item (or is unbound for the first
                # item) — likely a latent bug; the print looks like leftover
                # debugging output.
                print("hello")
            record = [bank_id, key, '4', '1', 'Y', 'M']
            ret.append(record)
    df = pd.DataFrame(ret)
    name = hfh.get_stem(file_path) + "_c.csv"
    # df.sort_values(df[1])
    df.to_csv(destination_path + "/" + name, index=False, header=False)
    return df
def convert_response_string_to_csv_and_get_df(file, id_length, number_of_spaces, create_csv=False):
    """Split fixed-width response strings into one-character DataFrame columns.

    Each line starts with an id of ``id_length`` characters followed by
    ``number_of_spaces`` spaces; everything after that — up to a trailing
    'F'/'R' repeat marker when one is present — becomes one column per
    character.  Optionally writes the frame to a "_d_" csv and returns it.
    """
    data_start = id_length + number_of_spaces
    rows = []
    for raw in hfh.get_lines(file):
        marker = raw.rfind('F')
        if marker == -1:
            marker = raw.rfind('R')
        payload = raw[data_start:marker] if marker > -1 else raw[data_start:]
        rows.append(list(payload))
    frame = pd.DataFrame(rows)
    if create_csv:
        frame.to_csv(hfh.create_name(file, modificaiton="_d_"))
    return frame
def get_f_df_repeat_status(f_path):
    """Collect ``<id>_<F|R>`` first-time/repeat labels for every examinee.

    Returns the list (or pandas Series for type-K files) of labels.
    NOTE(review): ``f_df`` is built and re-indexed at the end but never
    returned or stored — looks like dead code or an unfinished refactor;
    confirm intent before removing.
    """
    # places that repeat information lives...
    # end of string
    # pearson file type thrid column
    # ... other things I have not come across
    ids_with_repeat_status = []
    lines = hfh.get_lines(f_path)
    if is_type_K(lines):
        # type-K files carry the attempt number in an 'Attempt' column:
        # 1 -> F (first attempt), 2 -> R (repeat)
        df = hfh.get_df(f_path, header=0)
        df = df.drop(0)
        df['Attempt'] = df['Attempt'].replace(['1'], 'F')
        df['Attempt'] = df['Attempt'].replace(['2'], 'R')
        ids_with_repeat_status = df['ClientID'] + '_' + df['Attempt']
    else:
        # otherwise the status is the final character of each line (F or R)
        for line in lines:
            ending_character = line.strip()[-1]
            if ending_character in ['F', 'R']:
                repeat_status = ending_character
                line = line.strip()
                # id is the first whitespace-separated token, falling back
                # to the first comma-separated token
                split_line = line.split()
                _id = None
                if len(split_line) > 1:
                    _id = split_line[0]
                else:
                    split_line = line.split(',')
                    if len(split_line) > 1:
                        _id = split_line[0]
                if _id is None:
                    assert False, "can not assign repeat status to file " + f_path
                ids_with_repeat_status.append(_id + '_' + repeat_status)
    ret = process_response_string_file(f_path, create_c=False)
    f_df = ret
    # NOTE(review): assumes len(ids_with_repeat_status) == len(f_df);
    # lines without a trailing F/R are skipped above, which would break this.
    f_df = f_df.set_index(ids_with_repeat_status)
    return ids_with_repeat_status
def convert_delimited_to_iteman(file, destination_path, delimiter=','):
    """Convert a delimiter-separated response file into an iteman ``_f.txt``.

    Each input line is expected to look like ``id<delim>ans<delim>ans...``.
    The id field gets a trailing space; 'Y'/'M' marker fields are dropped
    from the response string.  Writes ``<stem>_f.txt`` under
    ``destination_path``.

    Returns True when the file was converted, False when the file is empty
    (a non-empty file whose first line lacks the delimiter falls through and
    returns None, as before).

    Bug fix: ``non_answer_characters`` was initialised to False and never
    updated, so the "removed" warning printed once per input line even when
    nothing was stripped.  The flag is now set when a 'Y'/'M' field is
    actually dropped, and the warning prints once per file.
    """
    #verify is CSV
    ret = []
    lines = hfh.get_lines(file)
    if len(lines) > 0:
        if len(lines[0].split(delimiter)) > 1:  # looks delimited
            non_answer_characters = False
            for line in lines:
                new_line = ""
                fields = line.split(delimiter)
                id_handled = False
                for field in fields:
                    if not id_handled:
                        # first field is the id; pad it from the responses
                        field += ' '
                        id_handled = True
                    if field == 'Y' or field == 'M':
                        # marker field, not an answer — drop it
                        non_answer_characters = True
                    else:
                        new_line += field
                ret.append(new_line)
            if non_answer_characters:
                print(
                    file,
                    "non answer character in data response string. It was removed."
                )
            name = hfh.create_name(hfh.get_stem(file), destination_path, 'txt', '_f')
            hfh.write_lines_to_text(ret, name)
            return True
    else:
        print(file, "is empty")
        return False
def convert_2016_format(file_name, destination_path="", pretest_cutoff=False):
    """Convert a 2016-era response file into ``_f.txt`` data + default control.

    The first line ends with the answer key (the text after the last space);
    data lines start at line 3.  Trailing repeat markers are stripped before
    the data is written; when the result passes ``is_valid_data`` a default
    control file is generated from the key.  Returns True on success.
    """
    # of form:
    # PT1 PT116MAR BB... correct
    # answers
    # todo: rename this
    lines = hfh.get_lines(file_name)
    start_of_answers = lines[0].rfind(' ')
    answers = lines[0][start_of_answers + 1:]
    ret = []
    for line in lines[2:]:
        # remove R or F at end
        # NOTE(review): checks the second-to-last character (before the
        # newline) but cuts FIVE characters — presumably the marker field is
        # padded; confirm against a sample file.
        last_entry = line[len(line) - 2]
        if last_entry == 'R' or last_entry == 'F':
            line = line[:-5] + '\n'
        ret.append(line)
    name = hfh.get_stem(file_name)
    # NOTE(review): get_stem likely strips the directory component, so the
    # output may land in the CWD rather than destination_path — verify.
    new = hfh.get_stem(destination_path + "/" + name) + "_f.txt"
    hfh.write_lines_to_text(ret, new)
    if is_valid_data(new):
        convert_answers_to_default_control(name, answers, destination_path, pretest_cutoff)
        return True
    return False
def get_first_line_of_stats(path_to_stats):
    """Return the 1-based line number of the 'Sequence' header row.

    Scans the stats file for the first line whose leading comma-separated
    field is exactly "Sequence"; returns None when no such line exists.
    """
    for line_number, row in enumerate(get_lines(path_to_stats), start=1):
        if row.split(',')[0] == "Sequence":
            return line_number
def process_stats_files(stats_files, reports_path, report_name):
    """Aggregate xCalibre per-form stats csvs into one per-item report.

    Concatenates the 'Sequence'-headed section of each stats file into
    ``<reports_path>/<report_name>_aggregate_.csv``, then derives per-item
    descriptives (grouped on 'Item ID'), K/La/Lb flag indicators, and
    Caution/Good classifications, writing the grouped result to
    ``<reports_path>/<report_name>_complete_.csv``.
    """
    r_entries = []  # list of tuples constant, message
    # NOTE(review): ``r_entries`` is never used in this function.
    header = False
    ret = []
    header_index = 0
    for file in stats_files:
        lines = hfh.get_lines(file)
        header_index = hfh.get_index_of_line_that_starts_with_word_based_on_delimiter(lines,'Sequence')
        # NOTE(review): blank_index is used below as an offset relative to
        # header_index — confirm that is what the helper returns.
        blank_index = hfh.get_next_blank_line_after_index_from_lines(lines, header_index)
        if not header:
            # first file keeps its header row
            ret_lines = lines[header_index:blank_index+header_index]
            header = True
        else:
            # subsequent files: skip the header row
            ret_lines = lines[header_index+1:blank_index+header_index]
        for line in ret_lines:
            ret.append(line)
    # use ret to make master df
    aggregate_report_path = reports_path+"/"+report_name+"_aggregate_.csv"
    hfh.write_lines_to_text(ret, aggregate_report_path)
    df = pd.read_csv(aggregate_report_path)
    df = df[['Item ID', 'S-Rpbis', 'T-Rpbis','P','a','b','Flags']]
    # indicator columns derived from the Flags text
    df['K'] = 0
    df['La'] = 0
    df['Lb'] = 0
    mask_K = df['Flags'].str.contains(r'K', na=False)
    mask_Lb = df['Flags'].str.contains(r'Lb', na=False)
    mask_La = df['Flags'].str.contains(r'La', na=False)
    df.loc[mask_K, 'K'] = 1
    df.loc[mask_La, 'La'] = 1
    df.loc[mask_Lb, 'Lb'] = 1
    # how many administrations each item appears in
    df['count'] = df['Item ID'].map(df['Item ID'].value_counts())
    df = add_descriptives_to_df_by_group(df, 'Item ID', 'a')
    df = add_descriptives_to_df_by_group(df, 'Item ID', 'b')
    df = add_descriptives_to_df_by_group(df, 'Item ID', 'T-Rpbis')
    df = add_descriptives_to_df_by_group(df, 'Item ID', 'S-Rpbis')
    df = add_descriptives_to_df_by_group(df, 'Item ID', 'P')
    df = df.sort_values('Item ID')
    df.to_csv(aggregate_report_path)
    df_new = df['Item ID'].apply(get_id_from_item_id)
    df = pd.concat([df, df_new], axis=1)
    # collapse to one row per item
    df = df.sort_values('Item ID').groupby(['Item ID']).first()
    df['Caution'] = 0
    df['Good'] = 0
    # NOTE(review): 'K' is only ever set to 0/1 per row above, and groupby
    # .first() does not sum — confirm ``K == 2`` is actually reachable.
    mask1 = (df['K'] == 2) | (df['Lb'] == 1)
    df.loc[mask1, 'Caution'] = 1
    mask2 = (df['Caution'] == 0) & (df['a_mean'] > 1) & (df['T-Rpbis_mean'] > .3)
    df.loc[mask2, 'Good'] = 1
    df = df.drop(['S-Rpbis','T-Rpbis','P','a','b','Flags'], axis = 1)
    df.to_csv(reports_path + "/" + report_name + "_complete_.csv")
def convert_control_to_include_pretest(control_file):
    """Rewrite a control csv forcing column 4 (include flag) to 'Y'.

    Keeps columns 0-3 and 5 unchanged and writes the result next to the
    input as ``<name>_cf.csv``.
    """
    rewritten = []
    for row in hfh.get_lines(control_file):
        fields = row.split(',')
        rewritten.append(",".join(fields[:4]) + ",Y," + fields[5])
    base = control_file[:control_file.rfind(".")]
    hfh.write_lines_to_text(rewritten, base + "_cf.csv")
def create_aggregate_report(path, destination_path, target_string, header_txt, sort_by):
    """Concatenate every matching report under ``path`` into one sorted csv.

    Writes ``header_txt`` first, appends each matching file minus its first
    line, then re-reads the aggregate and rewrites it sorted descending on
    ``sort_by``.
    """
    aggregate = destination_path + "/_aggregate_" + target_string + ".csv"
    write_lines_to_text([header_txt + "\n"], aggregate)
    for report in get_all_file_names_in_folder(path, target_string=target_string):
        write_lines_to_text(get_lines(report)[1:], aggregate, 'a')
    df = pd.read_csv(aggregate, header=0)
    df.sort_values(by=sort_by, ascending=False).to_csv(aggregate, index=False)
def is_valid_data(file):
    # id responses
    """Check that every data line has its id/response boundary in the same column.

    Uses the position of the first space in the second line as the reference
    id width and verifies all subsequent lines agree.  Returns a falsy value
    for unreadable/too-short files, True when the spacing is consistent.

    Bug fixes: the original indexed ``lines[1]`` after only checking the file
    was non-empty, so a one-line file raised IndexError; the unused
    ``length`` local was removed; and the mixed 0/False/1 returns were
    normalised to booleans (False == 0 and True == 1, so existing callers
    are unaffected).
    """
    lines = hfh.get_lines(file)
    if not lines or len(lines) < 2:
        return False
    end_of_id = lines[1].find(" ")
    for line in lines[1:]:
        if line.find(" ") != end_of_id:
            print(file + " is invalid because of spacing issues")
            return False
    return True
def convert_csv_response_string_to_raw_with_bogus_header(file, id_spaces=3):
    """Turn a comma-delimited response file into raw fixed-width format.

    Prepends a throw-away "BOGUS" header line, separates each id from its
    responses with ``id_spaces`` spaces, strips the commas, and writes the
    result to ``<file>_m``.
    """
    out = ["BOGUS \n\n"]
    for row in hfh.get_lines(file):
        comma_at = row.find(',')
        examinee_id = row[:comma_at]
        out.append(examinee_id + ' ' * id_spaces + row[comma_at:].replace(',', ''))
    hfh.write_lines_to_text(out, file + "_m")
def convert_first_line_answers_to_default_control_and_data(
        file_name, comma_delimited=False, id_length=8, id_spaces=3):
    """Split a file whose first line is the answer key into control + data files.

    Writes ``<stem>_c.csv`` (sequence, key char, "4,1,y,M" per answer) and
    ``<stem>_f.txt`` (data lines with ids left-padded with '_' to
    ``id_length``).  Comma-delimited inputs start data at line 2, fixed-width
    inputs at line 3.  NOTE(review): output paths are relative to the CWD,
    not to the input file's folder — confirm that is intended.
    """
    #todo: handle cutoff for not included
    lines = hfh.get_lines(file_name)
    correct = lines[0]
    new = []
    counter = 0
    if comma_delimited:
        correct = correct.replace(',', '')
    for a in correct:
        counter += 1
        if not a == '\n':
            include = 'y'
            new.append(str(counter) + "," + a + ",4,1," + include + ",M\n")
    name = hfh.get_stem(file_name)
    hfh.write_lines_to_text(new, name + "_c.csv")
    # contains a random F at the end will test to see if it matters
    formatted = []
    if comma_delimited:
        for line in lines[1:]:
            split_line = line.split(',')
            id = split_line[0]
            new_id = ""
            if len(id) < id_length:
                # left-pad short ids with underscores
                short = id_length - len(id)
                for i in range(short):
                    new_id += "_"
            new_id += id
            # strip delimiters and the trailing two chars (marker + newline)
            response_string = line[len(id):-2].replace(',', '') + '\n'
            ret_line = new_id + " " + response_string
            formatted.append(ret_line)
    else:
        for line in lines[2:]:
            # todo: could be problematic
            id_end = line.find(',')
            id = line[:id_end]
            new_id = ""
            characters_short = id_length - len(id)
            for c in range(characters_short):
                new_id += "_"
            new_id += id
            for i in range(id_spaces):
                new_id += " "
            new_line = new_id + line
            formatted.append(new_line)  #no clue why : is here perhaps I will remove it.
    hfh.write_lines_to_text(formatted, name + "_f.txt")
def set_standard_id_length_in_data_files(path_to_files, target_length, spaces_between_id_and_data=3):
    """Normalise examinee-id width in every ``_f.txt`` data file in a folder.

    Each matching file is rewritten in place; assumes the data portion is
    space-separated from the id.
    """
    data_files = hfh.get_all_file_names_in_folder(path_to_files, target_string='_f.txt')
    for data_file in data_files:
        adjusted = [set_standard_id_length_for_line(row, target_length)
                    for row in hfh.get_lines(data_file)]
        hfh.write_lines_to_text(adjusted, data_file)
def get_data_df(data_file, id_length=8, spaces=3):
    """Load a fixed-width response file into a DataFrame.

    Column 0 holds the examinee id (first ``id_length`` characters of each
    line); the remaining columns hold one response character each, taken
    from after the id and the ``spaces`` separator up to (but excluding) the
    last two characters (trailing marker + newline), with surrounding
    whitespace stripped.

    Bug fix: the id slice was hard-coded to ``line[:8]`` and ignored the
    ``id_length`` parameter; it now honours the parameter (behaviour is
    unchanged for the default ``id_length=8``).
    """
    ret = []
    for line in hfh.get_lines(data_file):
        row = [line[:id_length]]
        response_string = line[id_length + spaces:-2].strip()
        row.extend(response_string)  # one column per response character
        ret.append(row)
    return pd.DataFrame(ret)
def remove_header_and_blank_lines(control_path):
    """Strip blank lines and non-data rows from a control csv, in place.

    A row is dropped when it is a bare newline or when its third
    comma-separated field is not numeric (i.e. a header row).
    """
    kept = []
    for row in hfh.get_lines(control_path):
        drop = row == '\n'
        fields = row.split(',')
        if len(fields) > 2 and not fields[2].isnumeric():
            drop = True
        if not drop:
            kept.append(row)
    hfh.write_lines_to_text(kept, control_path)
def validate_c_file_header(c_file, debug=True):
    #first line should be AccNum...
    """Ensure a control csv starts with the standard header row.

    Returns True once the file is verified to carry ``E.C_HEADER_S`` as its
    first line, rewriting the file with the standard column names
    (``E.C_HEADER_L``) when the header is missing.  Asserts on empty files
    and on a duplicated header.

    Bug fix: the duplicate-header probe read ``lines[1]`` unconditionally,
    raising IndexError on a one-line header-less file; it is now guarded.
    """
    lines = hfh.get_lines(c_file)
    if debug:
        print("validating: " + c_file)
    assert len(lines) > 0, "validate_c_file was fed an empty file"
    if lines[0] == E.C_HEADER_S + '\n':
        return True
    if len(lines) > 1 and lines[1] == E.C_HEADER_S + '\n':
        assert False, "validate_c_file detected 2 headers"
    # header not present — rewrite with the standard column names
    df = hfh.get_df(c_file)
    df.columns = E.C_HEADER_L
    df.to_csv(c_file, index=False)
    return True
def convert_default_data_to_iteman(file_name, processed_data_path, new_name=False):
    """Strip the two header lines from a default data file and save as iteman.

    Ids are normalised to 8 characters and the result is written as
    ``<stem>_f.txt`` under ``processed_data_path`` (stem taken from
    ``new_name`` when one is given).
    """
    if processed_data_path[-1] == '/':
        processed_data_path = processed_data_path[:-1]
    normalized = [set_standard_id_length_for_line(row, 8)
                  for row in hfh.get_lines(file_name)[2:]]
    stem_source = new_name if new_name else file_name
    target = processed_data_path + "/" + hfh.get_stem(stem_source) + "_f.txt"
    hfh.write_lines_to_text(normalized, target)
def process_LXR_key(key_file, get_c_df_AS_0=False, get_L_df_AS_1=False, destination_path_c=None, destination_path_L=None):
    """Parse an LXR key printout into control (c) and lookup (L) DataFrames.

    Lines containing '.' yield item ids (text after the first dot, spaces
    replaced by underscores); lines containing ':' yield keys (text after
    the colon).  The c frame is ``[id, key, 4, 1, 'Y', 'M']`` per item; the
    L frame maps the 1-based sequence number to the id.  When either
    ``get_*`` flag is set the pair ``(c_df, L_df)`` is returned; otherwise
    the frames are written to the destination folders as ``<stem>_c.csv`` /
    ``<stem>_L.csv``.

    NOTE(review): ids and keys are collected independently — a malformed
    line can leave the two lists different lengths, padding the frame with
    NaN.  Confirm inputs are strictly alternating id/key lines.
    """
    c_df = None
    L_df = None
    name = hfh.get_stem(key_file)
    lines = hfh.get_lines(key_file)
    ids = []
    keys = []
    for line in lines:
        split = line.split('.')
        if len(split) > 1:
            # id line: "<n>. <id text>"
            id = split[1]
            id = id.strip()
            id = id.replace(' ', '_')
            ids.append(id)
        else:
            split = line.split(':')
            if len(split) > 1:
                # key line: "Key: <letter>"
                key = split[1]
                key = key.strip()
                keys.append(key)
    # columns: 0=id, 1=key, then the default control constants
    df = hfh.pd.DataFrame([ids, keys]).T
    df[2] = 4
    df[3] = 1
    df[4] = 'Y'
    df[5] = 'M'
    df[6] = df.index.values + 1  # 1-based sequence number
    c_df = df[[0, 1, 2, 3, 4, 5]]
    L_df = df[[6, 0]]
    if get_c_df_AS_0 or get_L_df_AS_1:
        return c_df, L_df
    if destination_path_c:
        c_df.to_csv(destination_path_c + '/' + name + '_c.csv', index=False, header=False)
    if destination_path_L:
        L_df.to_csv(destination_path_L + '/' + name + '_L.csv', index=False, header=False)
def create_mapping_from_Karen_test_data(file_path, destination_path="", create_csv=False, add_underscore=False): lines = hfh.get_lines(file_path) # assumes files are of format # number. Name NN # .... where no lines will start with a number and then a dot other than my target lines # number. Name NN # name of file is stem(file_path)+_L.csv ret = [] if lines: for line in lines: if line[0].isnumeric(): entry = line.split() test_name = hfh.get_stem(file_path) test_id = line[:line.index('.')] subject = entry[1] bank_id = entry[2] underscore = "" if add_underscore: underscore = "_" record = [ test_name, test_id, subject, bank_id, subject + underscore + str(bank_id) ] ret.append(record) df = pd.DataFrame(ret) df.columns = [ 'form', 'test_id', 'subject', 'bank_id_number', 'bank_id' ] else: print(file_path + "does not contain lines.") if create_csv: name = hfh.get_stem(file_path)[:8] #df.sort_values(df[1]) file_name = destination_path + "/" + name + "_L.csv" df.to_csv(file_name, index=False, header=0) return df
def clean_stats_csv(path, create_csv=True, get_df=False): lines = hfh.get_lines(path) #assumes report starts with Sequence i = -1 cont = True beginning = -1 ret_lines = [] while cont: for line in lines: i += 1 split_line = line.split(',') if split_line[0] == 'Sequence': cont = False beginning = i if beginning > -1: if line == '\n': cont = False else: ret_lines.append(line.split(',')) df = pd.DataFrame(ret_lines[1:]) df.columns = ret_lines[0][:-1] df = df.drop(columns='4 SD') if create_csv: new_path = hfh.get_parent_folder(path) name = new_path + "/" + hfh.get_stem(path)[:-6] + ".cleaned_stats" df.to_csv(name) if get_df: return df #set_standard_id_length_in_data_files("PT_IRT/PT_processed_data", 8) #convert_xCalibre_matrix_for_PCI("PT_data/score_matrices/PT1_18_m.txt") #process_karen_data("LCLE_IRT","LCLE_IRT","LCLE_IRT/processed_data/") #merge_control_and_bank_info(a,b) #path_to_files = "data_files" #convertOldFormatToNew("LCLE_IRT/LCLEApr2019FullAdmin.txt") #processNewFileFormat("LCLE_IRT/lcea1_18c.csv","LCLE_IRT/lcea1_18.txt") #convert_first_line_answers_to_default_control_and_data(path_to_files+"/pt1_16_n.txt") #create_control_files(path_to_files) #update_control_files_with_item_bank_key("data_files/item_map.csv", "data_files") #convert_2016_format("data_files/pt3_16.txt")
def has_acceptable_correct_percentage(xCalibre_report_path, id_length=8, debug=True):
    """Check every xCalibre "Matrix" file for a plausible correct rate.

    For each matrix file the 0/1 score characters after the id (and before
    the trailing newline) are summed; a file fails when under 50% correct.
    Returns False at the first failing file, True when all files pass.

    Bug fixes: a file with no scorable characters (empty lines or id-only
    lines) raised ZeroDivisionError — such files are now reported and
    treated as failing; the bare ``except`` was narrowed to ValueError
    (the only exception ``int`` raises on a non-numeric character).
    """
    files = hfh.get_all_files(xCalibre_report_path, "Matrix")
    for file in files:
        total = 0
        correct = 0
        lines = hfh.get_lines(file)
        for line in lines:
            scores = line[id_length:-1]
            for x in scores:
                total += 1
                try:
                    correct += int(x)
                except ValueError:
                    # non-numeric mark: counted toward total, not scored
                    pass
        if total == 0:
            print(file, "has no scorable responses.")
            return False
        percent_correct = round(correct / total * 100, 4)
        if debug:
            print(file, percent_correct)
        if percent_correct < 50:
            print(file, "has low correct rate.")
            return False
    return True
def process_response_strings_for_IRT(path_to_raw_data, processed=None, bank=None, verbose=False, get_f_df=False):
    """Locate project folders for a raw response file and process it.

    When the path contains 'raw_data', the project's ``processed_data/`` and
    ``bank_files/`` siblings are derived from it (overriding the
    ``processed`` and ``bank`` arguments).  Prompts interactively until the
    file name matches the CCCYYMON convention, then delegates to
    ``process_response_string_file``.

    NOTE(review): ``verbose`` and ``get_f_df`` are accepted but unused, and
    the re-entered ``name`` from the prompt is validated but never used —
    ``path`` is passed on unchanged.  Confirm before relying on the prompt.
    """
    #todo edited while tired confirm it works later
    path = path_to_raw_data
    if path is not False:
        lines = hfh.get_lines(path)
        r = path.find('raw_data')
        #assumes that raw_data exists in IRT model
        name = path
        if r > -1:
            # derive the sibling project folders from the raw_data location
            project_directory = path[:r]
            name = project_directory + "/processed_data/" + hfh.get_stem(path)
            processed = project_directory + '/processed_data/'
            bank = project_directory + '/bank_files/'
        valid = is_valid_name(path)
        while valid is False:
            print(
                path + " is a raw data name which does not conform to convention of CCCYYMON."
            )
            name = input("enter an appropriate name here")
            valid = is_valid_name(name)
        if lines is False:
            print(
                "Error in determine response string.\n Path request error in path " + path)
        else:
            process_response_string_file(path, bank, write_csv=True, destination_path=processed)
def is_valid_control(file):
    """Sanity-check a control csv, attempting an in-place repair when invalid.

    Returns the validity verdict.

    NOTE(review): ``valid`` is re-initialised to False at the top of every
    iteration, so only the LAST line actually determines the verdict, and
    the only branch that ever sets it True is ``len(split_line[0]) < 4`` —
    almost certainly not the intended logic.  Treat the return value with
    suspicion and confirm against callers before refactoring.
    """
    lines = hfh.get_lines(file)
    valid = True
    for line in lines:
        valid = False
        # check for header line and remove
        split_line = line.split(',')
        if not len(split_line) == 4:
            valid = False
        else:
            if len(split_line) > 2:
                if not split_line[2].isnumeric():
                    valid = False
        # check for empty line and remove
        # NOTE(review): str.split always returns >= 1 element, so this
        # condition can never be True.
        if len(split_line) == 0:
            valid = False
        # check for bank id
        if len(split_line[0]) < 4:
            valid = True
    # check for entries in four points
    if not valid:
        print("invalid control file attempting repair", file)
        remove_header_and_blank_lines(file)
    return valid
def convert_delimited_control(control_file, destination_path, delimiter='\t', remove_version=False, remove_header=True):
    """Re-write a delimited control file as comma-separated.

    Optionally strips a header (via ``check_for_header``) and a trailing
    'V…' version suffix from the bank id.  Lines already comma-delimited are
    only rewritten when ``remove_version`` is set; anything else is copied
    through unchanged.  Returns True when a file was written under
    ``destination_path``, False for files with fewer than two lines.
    """
    lines = hfh.get_lines(control_file)
    ret = []
    changed = False
    csv_detected = False
    if len(lines) > 1:
        if remove_header:
            check_for_header(control_file, True)
        for line in lines:
            # check for delimiter
            split_line = line.split(delimiter)
            if len(split_line) > 1:
                changed = True
                ret_line = split_line[0] + ","
                if remove_version:
                    # bank id of form XXXXVn — cut at the version marker
                    version_location = ret_line.rfind('V')
                    if version_location > 0:
                        ret_line = ret_line[:version_location] + ','
                for i in split_line[1:]:
                    ret_line += i + ','
                ret_line = ret_line[:-1]  # drop trailing comma
                ret.append(ret_line)
            else:
                # check to see if file is comma delimited
                split_line = line.split(',')
                # if comma delimited and removing version get bank id and remove version
                if len(split_line) > 1 and remove_version:
                    csv_detected = True
                    ret_line = split_line[0]
                    version_location = ret_line.rfind('V')
                    if version_location > 0:
                        ret_line = ret_line[:version_location] + ','
                    for i in split_line[1:]:
                        ret_line += i + ','
                    ret_line = ret_line[:-1]
                    ret.append(ret_line)
                else:
                    # neither delimiter matched (or not removing version):
                    # pass the line through untouched
                    ret.append(line)
        if csv_detected:
            print(
                "Comma delimited identified. ID_BANK version identifier removed"
            )
        if not changed:
            print(
                control_file + " asked to be delimiter converted but did not contain target delimiter"
            )
        hfh.write_lines_to_text(
            ret, destination_path + '/' + hfh.get_stem(control_file))
        return True
    else:
        print(control_file, " is an empty file and asked to be converted")
        return False
def parse_LXR_control(file_path):
    """Extract an AccNum (bank id) from each line of an LXR control printout.

    Tries several observed layouts in order — "n. TOPIC nnn",
    "... Key: X TOPIC nnn", two space-separated tokens, two comma-separated
    tokens, a >20-field csv row — with later matches overwriting earlier
    ones; a non-empty line matching nothing is kept verbatim.  Returns a
    pandas Series of AccNums, or False when the file could not be read.
    """
    lines = hfh.get_lines(file_path)
    ret = []
    if lines is not False:
        for line in lines:
            line = line.strip()
            AccNum = ""
            period_i = line.find('.')
            if period_i > -1:
                # look for a period, if present is of form:
                # 5. HERBAL 851
                s = line.split()
                content = s[1].strip()
                number = s[2].strip()
                # zero-pad the item number to 3 digits
                zeroes_needed = 3 - len(number)
                number_string = ""
                for z in range(zeroes_needed):
                    number_string += '0'
                number_string += str(number)
                AccNum = content[:5] + number_string
            target_string = 'Key: '
            key_i = line.find(target_string)
            if key_i > -1:
                # look for Key: , if present is of form:
                # 183 Form #: Feb 10 Key: A ASSES367
                # (the +2 skips the key letter and its trailing space)
                new_line = line[key_i + len(target_string) + 2:].strip()
                s = new_line.split()
                # should only have 2 options
                if len(s) > 2:
                    print("issue with parse_LXR_KEY")
                else:
                    if len(s) == 0:
                        print("problem")
                    if len(s) == 1:
                        AccNum = s[0].strip()
                    if len(s) == 2:
                        number = s[1].strip()
                        zeroes_needed = 3 - len(number)
                        number_string = ""
                        for z in range(zeroes_needed):
                            number_string += '0'
                        number_string += str(number)
                        AccNum = s[0].strip() + number_string
            # fallback layouts: two space- or comma-separated tokens
            s = line.split()
            if len(s) == 2:
                topic = s[0].strip()
                number = s[1].strip()
                AccNum = topic[:5] + number
            s = line.split(',')
            if len(s) == 2:
                topic = s[0].strip()
                number = s[1].strip()
                AccNum = topic[:5] + number
            s = line.split(',')
            if len(s) > 20:
                # wide csv row: first two fields form the id
                topic = s[0]
                number = s[1]
                result = topic + str(number)
                AccNum = result
            if len(line) > 1 and AccNum == "":
                # nothing matched — keep the raw line as the id
                AccNum = line
            if len(AccNum) > 0:
                ret.append(AccNum)
        ret = hfh.pd.Series(ret)
        return ret
    else:
        return False
def remove_header_from_files(files):
    """Drop the first line of each file and write the remainder back as csv.

    Note the rewrite goes through a one-column DataFrame, so pandas adds an
    index column to each output file.
    """
    for path in files:
        body = hfh.get_lines(path)[1:]
        pd.DataFrame(body).to_csv(path)
def process_response_string_file(f_path, bank_path=None, destination_path=None, write_csv=False, get_df=True, create_c=True, paired_bank_xlsx=None):
    """Dispatch a raw response-string file to its type-specific processor.

    Probes the file against the known formats (types A-K) to obtain the
    data frame ``f_df`` (and, for some types, a control frame ``c_df``);
    files matching no type are assumed already formatted and loaded
    directly.  When a bank path is available the control frame is (re)built
    from the matching bank workbook.  Optionally writes ``<stem>_f.csv`` /
    ``<stem>_c.csv`` to ``destination_path`` and returns ``f_df`` when
    ``get_df`` is set.

    NOTE(review): ``paired_bank_xlsx`` is accepted but unused.
    """
    if create_c:
        assert destination_path is not None, "process response string needs to know where to put the processed data"
    name = hfh.get_stem(f_path)
    lines = hfh.get_lines(f_path)
    assert len(lines) > 0, "asked to process empty file:" + f_path
    c_df = None
    f_df = None
    # format-detection chain: first matching type wins
    if is_type_K(lines):
        processed_lines = processK(lines)
        f_df = processed_lines
    elif is_type_A(lines):
        processed_lines = processA(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_B(lines):
        processed_lines = processB(lines)
        f_df = processed_lines
    elif is_type_C(lines):
        processed_lines = processC(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_D(lines):
        processed_lines = processD(lines)
        f_df = processed_lines
    elif is_type_E(lines):
        processed_lines = processE(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_F(lines):
        processed_lines = processF(lines)
        f_df = processed_lines
    elif is_type_G(lines):
        processed_lines = processG(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_H(lines):
        processed_lines = processH(lines)
        f_df = processed_lines
    elif is_type_I(lines):
        processed_lines = processI(lines)
        f_df = processed_lines
    elif is_type_J(lines):
        processed_lines = processJ(lines)
        f_df = processed_lines
    else:
        print(f_path + " is already formatted")
        is_formatteed(lines)
        f_df = hfh.get_df(f_path)
    if c_df is not None and bank_path:
        # add AccNum instead of sequence
        b_df = create_c_df_from_bank(bank_path)
        b_df['Key'] = c_df['Key']
        c_df = b_df
    if c_df is None and bank_path is not None and create_c:
        #todo: consider respecting the correct answer at the time vs the bank or just destroy it
        bank_files = hfh.get_all_files(bank_path, extension='xlsx')
        pair = hfh.pair_files([f_path], bank_files)
        if len(pair) == 0:
            print(
                "could not find matching bank file and no default control information present."
            )
        if len(pair) == 1:
            # todo: may evaluate differences between bank and response string if desired
            c_df = create_c_df_from_bank(pair[0][1])
        if len(pair) > 1:
            print("more than one file matched for bank", f_path)
    #confirm_id_as_index
    if 0 in f_df.columns or '0' in f_df.columns:
        # promote the id column to the index
        f_df = f_df.set_index(f_df[0], drop=True)
        f_df = f_df.drop(columns=0)
    if write_csv:
        #todo changed index... need to make sure all processed items spit out the same... in this case they are pre-strict.
        f_df.to_csv(destination_path + '/' + name + '_f.csv', index=True, header=False)
        if c_df is not None:
            c_df.to_csv(destination_path + '/' + name + '_c.csv', index=None, header=False)
    if get_df:
        return f_df
def get_header_argument(c_path):
    """Return the pandas ``header`` argument for reading a control csv.

    Returns 0 when the file's first line starts with the 'AccNum' header
    (row 0 is a header), otherwise None.

    Bug fix: an empty file (or a falsy failed read from ``hfh.get_lines``)
    made ``lines[0]`` raise; it now returns None in that case.
    """
    lines = hfh.get_lines(c_path)
    if not lines:
        return None
    if lines[0].split(',')[0] == 'AccNum':
        return 0
    return None