def ExtractTestData(read_file, write_file):
    """Write the single-substitution, non-class-3 variants of a TSV as CSV.

    Reads ``read_file`` as a tab-separated table with a header row, asks the
    user (via ``mf.get_yes_no``) whether a pathogenicity column should be
    written, and writes one line per retained row to ``write_file``.

    A row is retained when its second column contains ``'>'`` and its last
    column, converted with ``int``, is not 3. Each retained variant is
    written with a transcript prefix (``NM_007294.3:`` or ``NM_000059.3:``).
    When pathogenicity is requested, a last-column value above 3 is written
    as ``Pathogenic`` and a value below 3 as ``Benign``.

    Args:
        read_file: Path of the tab-separated input file.
        write_file: Path of the output file; it is created or truncated.

    Raises:
        ValueError: If the last column of a row whose variant contains
            ``'>'`` cannot be converted to ``int``.
    """
    df = pd.read_csv(read_file, sep='\t', header=0)

    # Ask before touching the output so an aborted prompt does not leave a
    # truncated file behind.
    extract_pathogenicity = mf.get_yes_no("Extract the pathogenicity (y/n)? ")
    answer_yes = extract_pathogenicity in ('y', 'Y')

    # The context manager closes the file even if a row raises.
    with open(write_file, 'w+') as out_file:
        # Header line.
        out_file.write('#Variant')
        if answer_yes:
            out_file.write(',Pathogenicity')
        out_file.write('\n')

        for _, row in df.iterrows():
            # str() keeps a missing (NaN) variant from raising on the
            # membership test; such rows are simply skipped.
            variant = str(row.values[1])
            if '>' not in variant:
                continue

            # Only convert the class once the row is known to be a
            # substitution, so unrelated rows cannot abort the export.
            classification = int(row.values[-1])
            if classification == 3:
                continue

            # NOTE(review): the previous code compared the variant string
            # itself (column 1, which always contains '>') with 'BRCA1', so
            # the NM_007294.3 branch could never run. This assumes the gene
            # symbol is stored in the first column -- confirm against the
            # input schema.
            if row.values[0] == 'BRCA1':
                out_file.write('NM_007294.3:' + variant)
            else:
                out_file.write('NM_000059.3:' + variant)

            if answer_yes:
                # Class 3 was filtered out above, so one branch always fits.
                if classification > 3:
                    out_file.write(',' + 'Pathogenic')
                else:
                    out_file.write(',' + 'Benign')
            out_file.write('\n')
def DataExtraction(read_file, write_file):
    """Export one column (optionally two) of a delimited file, sorted.

    Reads ``read_file`` with the separator returned by
    ``mf.determine_separator``, shows the columns to the user, and asks for
    the 1-based number of the column to sort by. That same column is the one
    written out. Optionally a second, user-chosen pathogenicity column is
    written after it.

    Each output line is ``<value>,`` followed by the pathogenicity value when
    requested, so the comma is present even when no second column is written.

    Args:
        read_file: Path of the delimited input file (first row is a header).
        write_file: Path of the output file; it is created or truncated.
    """
    sep = mf.determine_separator(read_file)
    df = pd.read_csv(read_file, sep=sep, header=0)

    mf.print_columns_with_index(df)
    col1 = mf.get_int_answer('What is the number of the column to sort by? ')
    coordinate_column = df.columns[col1 - 1]

    # sort_values returns a new frame; the result was previously discarded,
    # which left the output in file order instead of sorted order.
    df = df.sort_values(by=coordinate_column)

    first_question = mf.get_yes_no('Write pathogenicity (y/n)? ')
    write_pathogenicity = first_question in ('y', 'Y')
    pathogenicity_column = None
    if write_pathogenicity:
        second_question = mf.get_int_answer('Pathogenicity\'s column number? ')
        pathogenicity_column = df.columns[second_question - 1]

    # Iterate the columns directly rather than via iterrows() + iloc[label]:
    # after sorting, index labels no longer match positions, and column
    # iteration also keeps each column's own dtype. str() allows non-string
    # columns to be written.
    identifiers = df[coordinate_column]
    if write_pathogenicity:
        lines = (
            str(identifier) + ',' + str(pathogenicity) + '\n'
            for identifier, pathogenicity
            in zip(identifiers, df[pathogenicity_column])
        )
    else:
        lines = (str(identifier) + ',' + '\n' for identifier in identifiers)

    # The context manager closes the file even if writing fails part-way.
    with open(write_file, 'w+') as out_file:
        out_file.writelines(lines)
def CountUniqueElementsColumn(read_file):
    """Print how many distinct values a user-chosen column contains.

    Loads ``read_file`` (header in the first row) using the separator
    returned by ``mf.determine_separator``, lists the columns, and asks the
    user for a 1-based column number. The number of distinct values in that
    column is printed; if the user agrees, every distinct value is printed
    together with the number of rows equal to it.

    Args:
        read_file: Path of the delimited input file.
    """
    delimiter = mf.determine_separator(read_file)
    table = pd.read_csv(read_file, sep=delimiter, header=0)

    # Let the user pick the column by its displayed (1-based) number.
    mf.print_columns_with_index(table)
    chosen_number = mf.get_int_answer('What column should be accumulated? ')
    chosen_name = list(table.columns)[chosen_number - 1]

    chosen_column = table[chosen_name]
    distinct_values = pd.unique(chosen_column)
    print(f'Number of unique items: {len(distinct_values)}')

    reply = mf.get_yes_no('Show item/counts (y/n)? ')
    if reply not in ('y', 'Y'):
        return

    # One equality scan per distinct value; rows are counted by filtering.
    for value in distinct_values:
        matching_rows = table[chosen_column == value]
        print(f'{value!s}: {len(matching_rows)}')