def test_rbind2():
    """rbind of two identical time64 frames doubles the rows and keeps the type."""
    frame = dt.Frame([5, 7, 9])
    frame[0] = dt.Type.time64
    stacked = dt.rbind(frame, frame)
    expected = dt.Frame([5, 7, 9] * 2)
    expected[0] = dt.Type.time64
    assert_equals(stacked, expected)
def reshape_dot(column_names, data, measure_vars, output, id_vars=None):
    """
    Reshape if '.value' is present in the column names.

    Each entry of `output` is a tuple of fragments parsed from a measure
    column's name; the fragments aligned with '.value' in `column_names`
    become new column names, while the remaining fragments become values of
    the other key columns.

    :param column_names: labels for the parsed fragments; may contain '.value'.
    :param data: the source Frame.
    :param measure_vars: the measure columns to reshape.
    :param output: list of tuples of name fragments, one per measure column.
    :param id_vars: optional identifier columns repeated alongside the result.
    :return: the reshaped Frame.
    :raises ValueError: if a '.value'-derived name clashes with `id_vars`.
    """
    # Mask of positions in column_names occupied by '.value'.
    boolean = [ent == '.value' for ent in column_names]
    dot_value = [[*compress(extract, boolean)] for extract in output]
    if len(dot_value[0]) > 1:
        # Multiple '.value' fragments per column: fuse them into one name.
        dot_value = ["".join(extract) for extract in dot_value]
    else:
        dot_value = [*chain.from_iterable(dot_value)]
    checks = set(dot_value)
    if id_vars and checks.intersection(id_vars):
        raise ValueError(
            f"The new column names associated with .value -> {checks} "
            "are duplicated in id_vars."
        )
    # Invert the mask to pick the non-'.value' fragments (the grouping keys).
    boolean = [not true for true in boolean]
    others = [tuple(compress(extract, boolean)) for extract in output]
    headers_for_others = [extract for extract in column_names if extract != '.value']
    measure_vars = [frame for frame in data[:, measure_vars]]
    # Group the single-column frames by their non-'.value' key tuple.
    out = defaultdict(list)
    for key, value_column, frame in zip(others, dot_value, measure_vars):
        frame.names = [value_column]
        out[key].append(frame)
    headers_for_others = [dt.Frame([key], names = headers_for_others) for key, _ in out.items()]
    out = [dt.cbind(frame, force = True) for _, frame in out.items()]
    # Repeat each key row to match its group's height, then stack the groups.
    out = [dt.cbind(dt.repeat(left, right.nrows), right, force = True)
           for left, right in zip(headers_for_others, out)]
    out = dt.rbind(out, force = True)
    if id_vars:
        id_vars = dt.repeat(data[:, id_vars], out.nrows//data.nrows)
        return dt.cbind([id_vars, out], force = True)
    return out
def reshape_no_dot(measure_vars, output, data, id_vars=None):
    """Stack the measure columns into a single value column next to `output`."""
    # NOTE(review): `value_name` is a free variable here — it must be supplied
    # by the enclosing scope (e.g. a `melt` wrapper); confirm before reusing
    # this function standalone.
    stacked = []
    for column in data[:, measure_vars]:
        column.names = [value_name]
        stacked.append(column)
    stacked = dt.rbind(stacked, force=True)
    if not id_vars:
        return dt.cbind([output, stacked], force = True)
    ids = dt.repeat(data[:, id_vars], len(measure_vars))
    return dt.cbind([ids, output, stacked], force = True)
def build_target_table(chembl_df, drugbank_df, output_dir):
    """
    Using data from the Drugbank and ChEMBL drug target files and the
    UniProt API, build the target table.

    @param chembl_df: [`dt.Frame`] The ChEMBL drug target table
    @param drugbank_df: [`dt.Frame`] The DrugBank drug target table
    @param output_dir: [`string`] The file path to write the final target table
    @return: [`dt.Frame`] The target table
    """
    # Stack the 'name' columns of both sources, then deduplicate via pandas.
    combined = dt.rbind([chembl_df['name'], drugbank_df['name']])
    deduplicated = combined.to_pandas().drop_duplicates()
    target_df = dt.Frame(deduplicated)
    target_df = write_table(target_df, 'target', output_dir)
    return rename_and_key(target_df, 'target_id')
def load_table(name, data_dir):
    """
    Load all PSet tables with name into a datatable, dropping any duplicate rows.

    @param name: [`string`] The name of the table
    @param data_dir: [`string`] File path to the directory with all PSet tables
    @return: [`datatable.Frame`] A datatable containing all rows from all PSets
    """
    logger.info(f"Loading PSet-specific {name} tables from {data_dir}...")
    # Get all files
    files = glob.glob(os.path.join(data_dir, "**", f"*{name}.jay"))
    # Filter so that file paths are "{data_dir}/{pset}/{pset}_{name}.jay".
    # Escape data_dir and name so regex metacharacters in either (e.g. '.')
    # cannot change the match; also escape the '.' before the extension.
    path_regex = re.escape(data_dir) + r"/(\w+)/\1_" + re.escape(name) + r"\.jay$"
    files = [
        file_name for file_name in files
        if re.search(path_regex, file_name)
    ]
    # Read and concatenate tables
    df = dt.rbind(*dt.iread(files))
    # Drop duplicates (groups by all columns and
    # selects only the first row from each group)
    df = df[0, :, by(df.names)]
    return df
def fread_table_for_all_psets(table_name: str, data_dir: str, column_dict: dict,
        rename_dict: dict = None, key_columns: list = None) -> dt.Frame:
    """
    Reads all tables named `table_name` from `data_dir`, using `column_dict`
    to specify the column names, order and types to read in. The resulting
    table iterator is then concatenated using `datatable.rbind` and the
    columns are renamed according to rename dict.

    :param table_name: name of the table files to load (without PSet prefix).
    :param data_dir: directory containing one subdirectory per PSet.
    :param column_dict: column selection/typing mapping passed to `dt.iread`.
    :param rename_dict: optional new column names assigned after concatenation.
    :param key_columns: optional columns to deduplicate on (first row kept).
    :return: the concatenated, deduplicated frame.
    """
    logger.info(
        f"Loading PSet-specific {table_name} tables from {data_dir}...")
    # Get all files
    files = glob.glob(os.path.join(data_dir, "**", f"*{table_name}.jay"))
    # Filter so that file paths are "{data_dir}/{pset}/{pset}_{name}.jay".
    # Escape data_dir and table_name so regex metacharacters in either
    # cannot change the match; also escape the '.' before the extension.
    path_regex = (re.escape(data_dir) + r"/(\w+)/\1_"
                  + re.escape(table_name) + r"\.jay$")
    files = [
        file_name for file_name in files
        if re.search(path_regex, file_name)
    ]
    # Read and concatenate tables
    df = dt.rbind(*dt.iread(files, columns=column_dict), force=True)
    # Drop duplicates (groups by all columns and
    # selects only the first row from each group)
    df = df[0, :, by(df.names)]
    if rename_dict is not None:
        df.names = rename_dict
    if key_columns is not None:
        # Keep the first row per key group (was `df[0, :, :, by(...)]`,
        # which passes a stray `:` where datatable expects the by modifier).
        df = df[0, :, by(key_columns)]
    return df
def test_rbind():
    """rbind preserves the time64 type and concatenates the rows."""
    values = [d(2030, 12, 1, 13, 43, 17)]
    frame = dt.Frame(values)
    assert frame.type == dt.Type.time64
    doubled = dt.rbind(frame, frame)
    assert_equals(doubled, dt.Frame(values * 2))
def test_rbind2():
    """rbind of two time64-typed frames yields the concatenated time64 frame."""
    frame = dt.Frame([5, 7, 9], type=dt.Type.time64)
    combined = dt.rbind(frame, frame)
    expected = dt.Frame([5, 7, 9] * 2, type=dt.Type.time64)
    assert_equals(combined, expected)
['VS' + d + '_original' for d in release['samples_id'].to_list()[0]]) # ~ 1a ~ # Validate ERNS # Make sure all ERNs values are correct. If there are any unknown ERN values, # add them to the mappings (defined in step 0) and rerun the script up to this # section. Repeat the process until all ERN name variations have been corrected. # There shouldn't be any new ERNs only name variations. # Find ERNs name variations that do not exist in RD3. If the following code # throws any error, add the name variation to the object `ernMappings` defined # in step 0b. Repeat until the no more mapping errors are thrown. If everything # is mapped, then proceed to the next step. dt.Frame([ recodeValue(mappings=ernMappings, value=d, label="ERN") for d in dt.unique( dt.rbind(release['samples_ERN'], release['subject_ERN'], force=True)).to_list()[0] ]) # recode ERNs variables with known variations release['samples_ERN'] = dt.Frame([ recodeValue(mappings=ernMappings, value=d, label='ERN') for d in release['samples_ERN'].to_list()[0] ]) release['subject_ERN'] = dt.Frame([ recodeValue(mappings=ernMappings, value=d, label='ERN') for d in release['subject_ERN'].to_list()[0] ]) # combine both ERNs # rawErnData = dt.unique(
statusMsg('Pulling subjects from', novelOmicsReleases[release]) tmpSubjectData = rd3.get(entity=f'{novelOmicsReleases[release]}_subject', attributes='id,subjectID,patch', batch_size=10000) for row in tmpSubjectData: if 'patch' in row: row['patch'] = ','.join([patch['id'] for patch in row['patch']]) tmpSubjectData = dt.Frame(tmpSubjectData)[:, { 'id': f.id, 'subjectID': f.subjectID, 'patch': f.patch, 'release': release }] existingSubjects = dt.rbind(existingSubjects, tmpSubjectData) # get existing sample metadata existingSamples = dt.Frame() for release in novelOmicsReleases: statusMsg('Pulling samples from', novelOmicsReleases[release]) tmpSampleData = rd3.get(entity=f'{novelOmicsReleases[release]}_sample', attributes='id,sampleID,patch', batch_size=10000) for row in tmpSampleData: if 'patch' in row: row['patch'] = ','.join([patch['id'] for patch in row['patch']]) tmpSampleData = dt.Frame(tmpSampleData)[:, { 'id': f.id,
# spread data by subjectID and release the previous step collapses # multiple samples for a release so we can drop duplicate values here subjectSamplesSummarized=dt.Frame( tmpSamplesBySubjects .to_pandas() .drop_duplicates(subset=['subjectID','release'],keep='first') .pivot(index='subjectID', columns='release', values='idsCollapsed') .reset_index() ) # bind to parent object subjectSamplesSummarized['numberOfSamples']=tmpSamplesBySubjects.nrows samplesSummarized=dt.rbind( samplesSummarized, subjectSamplesSummarized, force=True ) # store processed ids processedSubjectIDs.append(id) del subjectSamplesSummarized del tmpSamplesBySubjects del processedSubjectIDs del sampleSubjectIDs # rename columns samplesSummarized.names={ 'freeze1': 'df1Samples', 'freeze2': 'df2Samples',
def melt(data, id_vars=None, measure_vars=None, variable_name = 'variable', value_name = 'value'):
    """
    Turns Frame from wide to long form.

    :param data: the Frame to reshape.
    :param id_vars: optional identifier column label(s) (str, list or tuple).
    :param measure_vars: optional measure column label(s); defaults to every
        column not in `id_vars`.
    :param variable_name: controls how measure-column names are unpacked:
        a string (single variable column), a compiled regex with groups
        (one output column per named group), a dict of {name: regex}
        (columns stacked per matching regex), or a tuple forwarded to
        `measure` yielding (column_names, sep, pattern).
    :param value_name: name of the value column (string-`variable_name` case).
    :return: the reshaped Frame.
    :raises TypeError, ValueError: on invalid or conflicting arguments.
    """
    if id_vars:
        if not isinstance(id_vars, (str, list, tuple)):
            raise TypeError('id_vars should be one of str, list, tuple.')
        if isinstance(id_vars, str):
            id_vars = [id_vars]
        checks = set(id_vars).difference(data.names)
        if checks:
            raise ValueError(f'Labels {checks} in id_vars do not exist in the column names.')
        # Every column is an id column: nothing to melt.
        if not set(data.names).difference(id_vars):
            return data
        checks = [key for key, value in Counter(id_vars).items() if value > 1]
        if checks:
            raise ValueError(f"Labels {checks} are duplicated in id_vars.")
        if not measure_vars:
            measure_vars = [name for name in data.names if name not in id_vars]
    if measure_vars:
        if not isinstance(measure_vars, (str, list, tuple)):
            raise TypeError('measure_vars should be one of str, list, tuple.')
        if isinstance(measure_vars, str):
            measure_vars = [measure_vars]
        checks = set(measure_vars).difference(data.names)
        if checks:
            raise ValueError(f'Labels {checks} in measure_vars do not exist in the column names.')
        checks = [key for key, value in Counter(measure_vars).items() if value > 1]
        if checks:
            raise ValueError(f"Labels {checks} are duplicated in measure_vars.")
        # Infer id_vars as the complement of measure_vars when not given.
        if (not id_vars) and (len(measure_vars) < data.ncols):
            id_vars = [name for name in data.names if name not in measure_vars]
    else:
        measure_vars = data.names

    def reshape_no_dot(measure_vars, output, data, id_vars=None):
        # Stack the measure columns into one `value_name` column next to `output`.
        values = []
        for frame in data[:, measure_vars]:
            frame.names = [value_name]
            values.append(frame)
        values = dt.rbind(values, force=True)
        if id_vars:
            id_vars = dt.repeat(data[:, id_vars], len(measure_vars))
            return dt.cbind([id_vars, values, output][0:0] or dt.cbind([id_vars, output, values], force = True), force = True) if False else dt.cbind([id_vars, output, values], force = True)
        return dt.cbind([output, values], force = True)

    def reshape_dot(column_names, data, measure_vars, output, id_vars=None):
        "reshape if '.value' is present in the column names."
        # Mask of '.value' positions in column_names.
        boolean = [ent == '.value' for ent in column_names]
        dot_value = [[*compress(extract, boolean)] for extract in output]
        if len(dot_value[0]) > 1:
            dot_value = ["".join(extract) for extract in dot_value]
        else:
            dot_value = [*chain.from_iterable(dot_value)]
        checks = set(dot_value)
        if id_vars and checks.intersection(id_vars):
            raise ValueError(
                f"The new column names associated with .value -> {checks} "
                "are duplicated in id_vars."
            )
        boolean = [not true for true in boolean]
        others = [tuple(compress(extract, boolean)) for extract in output]
        headers_for_others = [extract for extract in column_names if extract != '.value']
        measure_vars = [frame for frame in data[:, measure_vars]]
        # Group single-column frames by their non-'.value' key tuple.
        out = defaultdict(list)
        for key, value_column, frame in zip(others, dot_value, measure_vars):
            frame.names = [value_column]
            out[key].append(frame)
        headers_for_others = [dt.Frame([key], names = headers_for_others) for key, _ in out.items()]
        out = [dt.cbind(frame, force = True) for _, frame in out.items()]
        out = [dt.cbind(dt.repeat(left, right.nrows), right, force = True)
               for left, right in zip(headers_for_others, out)]
        out = dt.rbind(out, force = True)
        if id_vars:
            id_vars = dt.repeat(data[:, id_vars], out.nrows//data.nrows)
            return dt.cbind([id_vars, out], force = True)
        return out

    if not isinstance(variable_name, (str, tuple, dict, Pattern)):
        raise TypeError('variable_name should be one of string, tuple, dictionary, regular expression.')
    if isinstance(variable_name, str):
        if not isinstance(value_name, str):
            raise TypeError('value_name should be a string.')
        if value_name == variable_name:
            raise ValueError(
                f"{value_name} is duplicated as variable_name. "
                f"Kindly provide a unique argument for {value_name}.")
        if id_vars:
            if variable_name in id_vars:
                raise ValueError(
                    f"{variable_name} already exists as a label "
                    "in id_vars. Kindly provide a unique argument.")
            if value_name in id_vars:
                raise ValueError(
                    f"{value_name} already exists as a label "
                    "in id_vars. Kindly provide a unique argument.")
        output = dt.Frame({variable_name:measure_vars})
        # Repeat each variable label once per source row.
        output = output[np.repeat(range(output.nrows), data.nrows),:]
        return reshape_no_dot(measure_vars=measure_vars, output = output, data = data, id_vars = id_vars)
    if isinstance(variable_name, Pattern):
        if not re.compile(variable_name).groups:
            raise ValueError("The regex should have at least one group.")
        output = [re.search(variable_name, word) for word in measure_vars]
        no_matches = [word for word, match in zip(measure_vars, output) if not match]
        if no_matches:
            raise ValueError(
                f"There was no match for labels {no_matches} "
                "for the provided regular expression.")
        output = [entry.groupdict() for entry in output]
        checks = output[0].keys()
        if id_vars and set(checks).intersection(id_vars):
            raise ValueError(
                f"Labels {checks} already exist in id_vars. "
                "Kindly provide unique names for the named groups "
                "in the regular expression."
            )
        output = dt.Frame(output)
        output = output[np.repeat(range(output.nrows), data.nrows),:]
        return reshape_no_dot(measure_vars=measure_vars, output = output, data = data, id_vars = id_vars)
    if isinstance(variable_name, dict) :
        # Only compute the intersection when id_vars exists; calling
        # set.intersection(None) would raise TypeError before the guard.
        if id_vars:
            checks = set(variable_name).intersection(id_vars)
            if checks:
                raise ValueError(
                    f"Labels {checks} already exist in id_vars. "
                    "Kindly provide keys for the dictionary "
                    "that do not exist in id_vars."
                )
        for key, regex in variable_name.items():
            if not isinstance(key, str):
                raise TypeError(f"{key} should be a string.")
            if not isinstance(regex, (str, Pattern)):
                raise TypeError(
                    f"The value for {key} should be a regular expression, "
                    "or can be compiled into one."
                )
            if re.compile(regex).groups:
                raise ValueError("The regex should not have any groups.")
        output = []
        for key, regex in variable_name.items():
            out = [word for word in measure_vars if re.search(regex, word)]
            if not out:
                raise ValueError(
                    f"There was no match for {key} for regex => {regex}"
                )
            # Each measure column is consumed by at most one regex.
            measure_vars = [word for word in measure_vars if word not in out]
            if len(out) == 1:
                # Fix: `frame` was referenced before assignment here —
                # materialize the single matching column before renaming it.
                frame = data[:, out]
                frame.names = [key]
                output.append(frame)
            else:
                values = []
                for frame in data[:, out]:
                    frame.names = [key]
                    values.append(frame)
                output.append(dt.rbind(values, force = True))
        output = dt.cbind(output, force=True)
        if id_vars:
            id_vars = dt.repeat(data[:, id_vars], output.nrows//data.nrows)
            return dt.cbind([id_vars, output])
        return output
    if isinstance(variable_name, tuple):
        variable_name = measure(*variable_name)
        column_names, sep, pattern = variable_name
        if not column_names:
            raise ValueError("Kindly provide argument for column_names, in the variable_name tuple.")
        if not isinstance(column_names, (str, list)):
            raise TypeError('column_names should be one of string, list.')
        if isinstance(column_names, str):
            column_names = [column_names]
        if id_vars:
            checks = set(column_names)
            checks.discard(".value")
            checks = checks.intersection(id_vars)
            if checks:
                raise ValueError(
                    f"Labels {checks} already exist in id_vars. "
                    "Kindly provide unique column_names "
                    "that do not exist in id_vars."
                )
        if not any((sep, pattern)):
            raise ValueError("Kindly provide one of sep or pattern.")
        if sep and pattern:
            raise ValueError("only one of sep or pattern should be provided.")
        if sep:
            if not isinstance(sep, (str, Pattern)):
                raise TypeError(
                    "sep should be a regular expression, "
                    "or can be compiled into one.")
            output = [re.split(sep, word) for word in measure_vars]
            checks = max(map(len, output))
            if len(column_names) != checks:
                raise ValueError(
                    f"The maximum number of splits for sep -> {sep} is {checks} "
                    f"while the number of labels in {column_names} "
                    f"is {len(column_names)}"
                )
            if '.value' not in column_names:
                output = [*map(tuple, output)]
                output = dt.Frame(output, names=column_names)
                output = output[np.repeat(range(output.nrows), data.nrows),:]
                return reshape_no_dot(measure_vars=measure_vars, output = output, data = data, id_vars = id_vars)
            return reshape_dot(column_names, data, measure_vars, output, id_vars=id_vars)
        if pattern:
            if not isinstance(pattern, (str, Pattern)):
                raise TypeError(
                    "pattern should be a regular expression, "
                    "or can be compiled into one.")
            checks = re.compile(pattern).groups
            if not checks:
                raise ValueError("The regex should have at least one group.")
            if checks != len(column_names):
                # Fix: `checks` is an int (group count); `len(checks)` raised
                # TypeError while formatting this message.
                raise ValueError(
                    "The number of groups in the regex "
                    "should match the number of labels in column_names. "
                    f"The number of groups in the regex is {checks}, "
                    f"while the length of column_names is {len(column_names)}")
            output = [re.findall(pattern, word) for word in measure_vars]
            no_matches = [word for word, match in zip(measure_vars, output) if not match]
            if no_matches:
                raise ValueError(
                    f"There was no match for labels {no_matches} "
                    "for the provided regular expression.")
            output = [*chain.from_iterable(output)]
            if '.value' not in column_names:
                output = [*map(tuple, output)]
                output = dt.Frame(output, names=column_names)
                output = output[np.repeat(range(output.nrows), data.nrows),:]
                return reshape_no_dot(measure_vars=measure_vars, output = output, data = data, id_vars = id_vars)
            return reshape_dot(column_names, data, measure_vars, output, id_vars=id_vars)