def test_from_excel():
    """Compare ``pd.read_excel`` against ``pandas.read_excel`` on the test workbook.

    ``setup_excel_file`` and ``teardown_excel_file`` are helpers defined
    outside this block; presumably they create and remove
    ``TEST_EXCEL_FILENAME``.
    """
    setup_excel_file(SMALL_ROW_SIZE)
    try:
        pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME)
        modin_df = pd.read_excel(TEST_EXCEL_FILENAME)
        df_equals(modin_df, pandas_df)
    finally:
        # Always clean up, so a failed read or comparison does not leave the
        # fixture behind for later tests.
        teardown_excel_file()
def test_from_excel():
    """Assert that ``pd.read_excel`` and ``pandas.read_excel`` agree on the test workbook.

    ``setup_excel_file`` and ``teardown_excel_file`` are helpers defined
    outside this block; presumably they create and remove
    ``TEST_EXCEL_FILENAME``.
    """
    setup_excel_file(SMALL_ROW_SIZE)
    try:
        pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME)
        ray_df = pd.read_excel(TEST_EXCEL_FILENAME)
        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        # Always clean up, so a failed read or assertion does not leave the
        # fixture behind for later tests.
        teardown_excel_file()
def test_from_excel_index_col():
    """Compare ``read_excel(..., index_col=0)`` between ``pd`` and pandas.

    The ``pd`` read is expected to emit a ``UserWarning``; the test fails if
    it does not. ``setup_excel_file`` and ``teardown_excel_file`` are helpers
    defined outside this block.
    """
    setup_excel_file(SMALL_ROW_SIZE)
    try:
        pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME, index_col=0)
        # Only the read under test is inside the warning check.
        with pytest.warns(UserWarning):
            modin_df = pd.read_excel(TEST_EXCEL_FILENAME, index_col=0)
        df_equals(modin_df, pandas_df)
    finally:
        # Always clean up, including when the warning check or the
        # comparison fails.
        teardown_excel_file()
def test_from_excel_engine():
    """Compare ``read_excel(..., engine="xlrd")`` between ``pd`` and pandas.

    The ``pd`` read is expected to emit a ``UserWarning``; the test fails if
    it does not. ``setup_excel_file`` and ``teardown_excel_file`` are helpers
    defined outside this block.
    """
    setup_excel_file(SMALL_ROW_SIZE)
    try:
        pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME, engine="xlrd")
        # Only the read under test is inside the warning check.
        with pytest.warns(UserWarning):
            modin_df = pd.read_excel(TEST_EXCEL_FILENAME, engine="xlrd")
        df_equals(modin_df, pandas_df)
    finally:
        # Always clean up, including when the warning check or the
        # comparison fails.
        teardown_excel_file()
def mergePanelsFeature(tableWithAuthorsPanels):
    """Attach project-level columns to the authors/panels table.

    Reads the project workbook, joins it to ``tableWithAuthorsPanels`` where
    the caller's ``id_prog`` equals the workbook's ``project`` (default
    ``pd.merge`` join type), and returns the result without the ``project``,
    ``Remixed``, ``Time`` and ``Project depth`` columns.
    """
    import modin.pandas as pd

    projects = pd.read_excel("..\\data\\TabellaProgettiPanelJam.xlsx")
    merged = pd.merge(
        tableWithAuthorsPanels,
        projects,
        left_on='id_prog',
        right_on='project',
    )
    return merged.drop(columns=['project', 'Remixed', 'Time', 'Project depth'])
def test_from_excel_all_sheets():
    """Compare ``read_excel(..., sheet_name=None)`` between ``pd`` and pandas.

    Both calls must return an ``OrderedDict`` with the same keys, and each
    pair of per-sheet frames must satisfy ``modin_df_equals_pandas``.
    ``setup_excel_file`` and ``teardown_excel_file`` are helpers defined
    outside this block.
    """
    setup_excel_file(SMALL_ROW_SIZE)
    try:
        pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME, sheet_name=None)
        modin_df = pd.read_excel(TEST_EXCEL_FILENAME, sheet_name=None)

        assert isinstance(pandas_df, OrderedDict)
        assert isinstance(modin_df, OrderedDict)
        assert pandas_df.keys() == modin_df.keys()

        for key in pandas_df:
            assert modin_df_equals_pandas(modin_df.get(key), pandas_df.get(key))
    finally:
        # Always clean up, so a failed assertion does not leave the fixture
        # behind for later tests.
        teardown_excel_file()
def createDataset():
    """Build the complete panels table and save it as an Excel workbook.

    Steps, in order:
      1. Load the ``project`` and ``Time`` columns from the project workbook.
      2. Call ``panelsAuthors`` to get author names, depths, the surviving
         project ids and the positions of removed projects, then delete those
         positions from the time list.
      3. Call ``searchPanelsId`` and ``createMergedTable`` to assemble the table.
      4. Strip self-overdubs and write the table to disk.

    Start/end timestamps, the removed-project report and the elapsed time are
    printed to stdout. Nothing is returned.
    """
    import modin.pandas as pd
    import time
    from datetime import date, datetime
    import selfOverdub as Self
    import cleanText as clean
    import panelStarsExtractor as extr
    import gc

    source = pd.read_excel('..\\data\\TabellaProgettiPanelJam.xlsx')
    idProjects = source['project'].tolist()
    timeProg = source['Time'].tolist()

    today = date.today()
    start_time = time.time()
    start_label = time.strftime("%H:%M:%S", time.gmtime(start_time))
    print('Start date: ' + str(today) + ' ' + start_label)

    finalAuthorsNames, projects_depth, idProjects, removes = panelsAuthors(idProjects)

    # Each deletion shifts the later entries left by one, so the position
    # taken from `removes` is corrected by the number of deletions so far.
    for already_deleted, position in enumerate(removes):
        del timeProg[position - already_deleted]

    panelsId, finalProjectsId, final_projects_depth, mergedRemixed = searchPanelsId(
        idProjects, projects_depth
    )
    table = createMergedTable(
        finalAuthorsNames,
        final_projects_depth,
        panelsId,
        finalProjectsId,
        mergedRemixed,
        idProjects,
        timeProg,
    )
    gc.collect()

    table = Self.removeSelfOverdub(table)
    # The panel-star step (extr.panelsStar) is currently disabled.
    table.to_excel('..\\data\\TabellaCompletaProva.xlsx', index=False)

    print("number of projects removed = " + str(len(removes)))
    print(removes)

    today = date.today()
    end_label = time.strftime("%H:%M:%S", time.gmtime(time.time()))
    print('End date: ' + str(today) + ' ' + end_label)

    elapsed_time = time.time() - start_time
    print('Elapsed time: ' + time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
# Reshape the KMT2B methylation matrix from wide form (one column per sample)
# into long form (one row per NGS_ID / CpGisland / sample) and save it as TSV.
import modin.pandas as pd

filename = '/Users/ethan/Downloads/KMT2B_matrix.xlsx'
df = pd.read_excel(filename, dtype={'NGS_ID': 'int32', 'CpGisland': 'int32'})
print('read excel file')

# Every column except the two identifier columns is treated as a sample.
col = list(df.columns)
col.remove('NGS_ID')
col.remove('CpGisland')

res_list = []
island_list = list(df['CpGisland'].drop_duplicates())
# Progress is reported against the real island count, not a fixed number.
island_total = len(island_list)

for idx, island in enumerate(island_list):
    # The row filter depends only on the island, so compute it once per
    # island instead of once per (island, sample) pair.
    island_df = df[df['CpGisland'] == island]
    for sample in col:
        # Work on an explicit copy so the column assignment below does not
        # target a chained selection of `df`.
        sub_df = island_df[['NGS_ID', 'CpGisland', sample]].copy()
        sub_df['sample'] = sample
        sub_df.columns = ['NGS_ID', 'CpGisland', 'Methylation_value', 'Sample']
        res_list.append(sub_df)
    print(f'>>> {idx} {island} {(idx+1)/island_total*100}%')

res_df = pd.concat(res_list, ignore_index=True)
print('create dataframe done')

res_df.to_csv('result.tsv', sep='\t', index=False)
print('save it to tsv file')
du.search_explore.dataframe_missing_values(mut_df) # mut_df.describe().transpose() # # + [markdown] {"toc-hr-collapsed": true} # ### Clinical outcome (TCGA-CDR) data # # Description # - cdr_df = pd.read_excel( f'{data_path}{cdr_folder}TCGA-CDR-SupplementalTableS1.xlsx') cdr_df.head() # **Comments:** # * Features such as `age_at_initial_pathologic_diagnosis`, `gender`, `race`, `ajcc_pathologic_tumor_stage`, `vital_status` and `tumor_status` might be very insteresting to include. Others such as overall survival (OS), progression-free interval (PFI), disease-free interval (DFI), and disease-specific survival (DSS) might not be relevant for this use case. # * Missing values appear to be mostly represented as "[Not Applicable]", "[Not Available]", "[Not Evaluated]", "[Unknown]" or "[Discrepancy]". # * Features related to outcomes, such as `treatment_outcome_first_course` and `death_days_to`, should be ignored, as we're classifying tumor type, regardless of the outcome. # #### Basic stats cdr_df.dtypes cdr_df.nunique() du.search_explore.dataframe_missing_values(cdr_df)
import matplotlib.colors as mcolors
%matplotlib inline
# Plot styling shared by the figures in this notebook.
sns.set_context('poster')
sns.set_color_codes()
plot_kwds = {'alpha' : 0.25, 's' : 80, 'linewidths':0}

# Change my working directory to more easily locate my data files.
os.chdir(r'C:\Users\acdav\OneDrive\Documentos\OneDrive\Alexjandro\research\Python')
# View my working directory.
os.getcwd()

# Import the data from Memorial Hermann Hospital that we have already cleaned and explored in the 'revisit hermann' supervised analysis. This allows us to get into the unsupervised learning analysis.
df = pd.read_excel(r'hermann_df7.xlsx')

# Complete some initial visualization of the revised data.
df.info()

columns = df.columns
columns

df.head()

# Create a copy of the data to work with so we don't overwrite the original. We also drop the datetime column, since we already have the month and the data is already ordered by date, and the ID column that Python created when the dataset was exported.
df1 = df.drop(['HOSP_ARRIV_DATE','Unnamed: 0'],axis=1)

# Let's also shrink the dataframe's dtypes to use less memory.
# NOTE(review): uint8 only represents 0-255 — confirm every remaining column fits that range.
df1 = df1.astype('uint8', copy=True, errors='raise')
df1.info()

# Next we will scale our continuous values, which are only age, height, weight, and ISS score.
# Necessary if you wish to run each step sequentially from parfit.fit import * from parfit.score import * from parfit.plot import * from parfit.crossval import * %matplotlib inline #Change my working directory to more easily locate my data files. os.chdir(r'C:\Users\acdav\OneDrive\Documentos\OneDrive\Alexjandro\research\Spyder projects\research projects') #View my working directory. os.getcwd() #Import the data from Memorial Hermann Hospital. df = pd.read_excel(r'PEDI1116ALL.xlsx') #complete some initial EDA of the revised data. df.info() columns = df.columns columns df.head() #create a copy of the data to be able to work with it so we dont overwrite it. df1 = df.copy() df1.info() #We can see that we have some missing values so lets better visualize them. An since we have them all over lets look at them by percentage to better judge how to handle them. percent_missing = df1.isnull().sum()* 100 / len(df1) percent_missing