def sm_dict_from_csv(self, directory, name):
    """
    Load a saved, cleaned summary statistics file for a given chromosome found at the directory/name provided,
    and use it to construct the sm_dict we pass between methods
    """
    load_file = CsvObject(Path(directory, name), self.cleaned_types, set_columns=True)

    chromo = load_file.column_data[self.summary_dict[self.chromosome]]
    bp_pos = load_file.column_data[self.summary_dict[self.bp_position]]
    snp_id = load_file.column_data[self.summary_dict[self.snp_id]]
    effect = load_file.column_data[self.summary_dict[self.effect_allele]]
    alt = load_file.column_data[self.summary_dict[self.alt_allele]]
    log = load_file.column_data[self.summary_dict[self.log_odds]]
    beta = load_file.column_data[self.summary_dict[self.beta]]
    freq = load_file.column_data[self.summary_dict[self.freq]]

    # Recreate the Variant objects for each snp
    sm_variants = [Variant(ch, bp, sn, ef, al) for ch, bp, sn, ef, al in zip(chromo, bp_pos, snp_id, effect, alt)]

    return {self.sm_variants: np.array(sm_variants), self.log_odds: np.array(log), self.beta: np.array(beta),
            self.freq: np.array(freq)}
def places_into_dates(self, cleaned_data, write_directory, file_gid=0):
    """
    Sometimes data is not missing for a date, it simply was not recorded. This places every place into every
    date.
    """
    # Format the reference into lower case
    formatted = [[r.lower() for r in row] for row in self._reference.row_data]

    for file in directory_iterator(cleaned_data):
        # Load the file as a csv object
        loaded_file = CsvObject(Path(cleaned_data, file))

        # Isolate the GID: Row relation from the file
        gid = {row[file_gid]: row for row in loaded_file.row_data}

        # If the place exists in our file, use the file row, else set the values to zero
        all_places = []
        for row in formatted:
            if row[file_gid] in gid:
                all_places.append(gid[row[file_gid]])
            else:
                all_places.append([row[i] for i in self.isolates] + [0, 0, 0, 0])

        write_csv(write_directory, Path(cleaned_data, file).stem, loaded_file.headers, all_places)
def _isolate(root_directory, file, unique_id, ids):
    """
    Isolate the row that matches the ID if it exists, else return an empty list

    Note
    ----
    This assumes no duplicates. If you have duplicates whose values differ you will need to clean them
    yourself; otherwise, run the remove_duplicates command before this method.

    :param root_directory: The root directory of the csv files
    :type root_directory: Path | str

    :param file: The name of the file to search
    :type file: str

    :param unique_id: The unique id index
    :type unique_id: int

    :param ids: The current match id
    :type ids: str

    :return: The row in the data that was matched if it was found, else an empty list of length equal to the
        file's row length
    :rtype: list
    """
    csv_obj = CsvObject(Path(root_directory, file))

    for row in csv_obj.row_data:
        if row[unique_id] == ids:
            return row
    return ["" for _ in range(csv_obj.row_length)]
def _position_values(self):
    csv_data = CsvObject(self.csv_path, [str, float, float, float])

    if csv_data.row_length != 4:
        msg = f"Csv file should contain phenotype, coefficient, lower bound, upper bound yet found" \
              f" {csv_data.row_length} columns"
        raise IndexError(msg)

    # Normalise the values for the table plot, with 0 added so we know where to draw the axis
    numerical_values = flatten([row[1:] for row in csv_data.row_data])
    normalised_value_list = normalisation_min_max(numerical_values + [self.axis_target])

    # Isolate the axis and the normal array, then chunk the normal array back into the coefficient, lower bound
    # and upper bound
    x_axis_point = normalised_value_list[-1]
    normal_array = chunk_list(normalised_value_list[:-1], 3)

    # Format the rows so we have actual - positional values for each numeric
    formatted_rows = []
    for row, normalised in zip(csv_data.row_data, normal_array):
        formatted_rows.append(flatten([[row[0]]] + [[row[i + 1], normalised[i]] for i in range(3)]))
    return formatted_rows, x_axis_point
def standardise_names(self, data_directory, write_directory):
    """
    Standardise each place name to a single name if it has multiple

    If working with time series data, places may change their names over time, which can lead to a lot of merge
    errors or difficulty in linking data. This will standardise all names to a single entry, ensuring that,
    regardless of the actual name of the place in that year, all data from that place is grouped into a single
    entry.

    :param data_directory: Directory containing csv files named in a yyyymmdd format
    :type data_directory: Path | str

    :param write_directory: Output directory
    :type write_directory: Path | str

    :return: Nothing, write out the data for each file found in the data_directory then stop
    :rtype: None
    """
    for file in directory_iterator(data_directory):
        print(file)

        # Load the data into memory
        data = CsvObject(Path(data_directory, file), set_columns=True)

        # Standardise each name via the matcher
        rows = []
        for i, name in enumerate(data.column_data[self._name_index], 0):
            reformatted = self._convert_names(name, i, data)
            if reformatted:
                rows.append(reformatted)

        # Set the headers of the output file, then write the file of the same name to the write_directory
        headers = self._reference_types + data.headers[1:]
        write_csv(write_directory, data.file_path.stem, headers, rows)
def distribute_heritability_genome_wide(self):
    """If we can't calculate heritability, distribute a provided float across the genome"""
    total_snps = 0
    config_dict = {}
    for file in directory_iterator(self.summary_directory):
        print(file)
        load_file = CsvObject(Path(self.summary_directory, file), self.cleaned_types, set_columns=True)

        # Isolate the generic information
        n_snps, n_iid = self._chromosome_from_load(load_file)
        chromosome_values = {self.count_snp: n_snps, self.count_iid: n_iid,
                             "Description": f"Chromosome {self.target_chromosome}"}
        config_dict[self.target_chromosome] = chromosome_values
        total_snps += n_snps

    print(f"Suggested LD_Radius based on {total_snps} / 3000 is {total_snps / 3000}")

    # Distribute the provided heritability across chromosomes in proportion to their snp counts
    for key in config_dict:
        config_dict[key][self.herit] = self.herit_calculated * (config_dict[key][self.count_snp] / total_snps)
    config_dict["Genome"] = {f"{self.genome}_{self.herit}": self.herit_calculated}

    ArgMaker().write_yaml_group_dict(config_dict, self.working_dir, "genome_wide_config")
def write_linked_unique(self, ambiguity=True, ambiguity_file_name="SetAmbiguous.csv"):
    """
    Construct a base lookup file to append alternative names to, as well as lists of unique name files

    :param ambiguity: If there is ambiguity in the file system
    :type ambiguity: bool

    :param ambiguity_file_name: The name of the fix file, defaults to SetAmbiguous.csv
    :type ambiguity_file_name: str

    :return: Nothing, construct files then stop
    :rtype: None
    """
    # Load the files for each shapefile that were written by link_districts_counties, as well as the user's
    # ambiguity file named ambiguity_file_name
    ambiguity_file = self._ambiguity_setter(ambiguity, ambiguity_file_name)
    relation_files = [CsvObject(f"{self._working_dir}/{file}") for file in directory_iterator(self._working_dir)
                      if "_relation" in file]

    # Construct a list of all the names without any ambiguity
    name_list = [self._fix_row_ambiguity(row, ambiguity_file, re.sub(r"\D", "", file.file_name))
                 for file in relation_files for row in file.row_data]

    # Write out the reference base
    unique_relations = [list(relation) for relation in {tuple(i) for i in name_list}]
    if not Path(self._working_dir, "LookupBase.csv").exists():
        write_csv(self._working_dir, "LookupBase", ["GID"] + self._headers, unique_relations)
    else:
        print("Lookup already written, passing")

    # For each level, write out a list of unique names
    for index, level in enumerate(self._headers, 1):
        # Isolate the unique places for a given level
        unique_places = list({level_relation[index] for level_relation in name_list})

        # Write it out if it doesn't already exist
        if not Path(self._working_dir, f"Unique_{level}.csv").exists():
            write_csv(self._working_dir, f"Unique_{level}", [level], unique_places)
        else:
            print(f"Unique_{level} already exists, skipping")
def construct_reference(self, base_weights_name="LookupBase.csv", alternative_key="Unique"):
    """
    Construct a reference of every name, for every place, for every level within the lookup base

    :param base_weights_name: Name of the base weights file
    :type base_weights_name: str

    :param alternative_key: Key within files that contains alternative names
    :type alternative_key: str

    :return: Nothing, write out the place reference csv then stop
    :rtype: None
    """
    # Load the lookup base
    base_relation = CsvObject(Path(self._working_dir, base_weights_name))

    # Load the alternative files
    alt_files = [CsvObject(Path(self._working_dir, file), set_columns=True)
                 for file in directory_iterator(self._working_dir) if alternative_key in file]

    # Order them in the same manner as the headers
    order = [index for header in self._headers for index, file in enumerate(alt_files)
             if header in file.file_name]
    alt_files = np.array(alt_files)[order].tolist()

    # Link each row to a unique list to create the place reference lookup file
    rows = [flatten([[row[0]]] + [self._match_row(match, match_file)
                                  for match, match_file in zip(row[1:], alt_files)])
            for row in base_relation.row_data]

    write_csv(self._working_dir, "PlaceReference", ["GID"] + flatten([file.headers for file in alt_files]), rows)
def solve_ambiguity(self, standardised_directory, write_directory):
    """
    Remove perfect duplicates and combine imperfect duplicates so that all GIDs are unique.

    Some places may end up duplicated, either in the raw data or after standardisation. This method removes
    perfect duplicates and combines imperfect duplicates into a single entry. If this is not desirable, keep in
    mind that the system prints out each imperfect duplication merge it has done; you may wish to alter your
    original data set, or change your place reference, to avoid this from happening.

    :param standardised_directory: The data directory of the output from standardise_names
    :type standardised_directory: str | Path

    :param write_directory: The output directory
    :type write_directory: str | Path
    """
    for file in directory_iterator(standardised_directory):
        print(file)

        # Load the original file and look for duplicate GIDs, which should be unique
        data = CsvObject(Path(standardised_directory, file), set_columns=True)
        duplicate_list = find_duplicates(data.column_data[0])

        # Isolate any row that does not suffer from duplication as the base of the write return
        reset_row = [row for row in data.row_data if row[0] not in duplicate_list]

        for dup in duplicate_list:
            # Isolate the row names
            row_names = data.row_data[data.column_data[0].index(dup)][:len(self._reference_types)]

            # Isolate the values for each duplicate name
            sub_list = [[parse_as_numeric(rr, float) for rr in r[len(self._reference_types):]]
                        for r in data.row_data if dup == r[0]]

            # Isolate unique lists, to remove perfect duplicates
            unique_sub_lists = [list(x) for x in set(tuple(x) for x in sub_list)]

            # Warn the user that some values have been combined
            if len(unique_sub_lists) > 1:
                print(f"Found and combined multiple entries that were not perfect duplicates for {row_names}")

            # Add the combined values, or the singular entry of duplicate values, to the reset list
            reset_row.append(row_names + [sum(i) for i in zip(*unique_sub_lists)])

        write_csv(write_directory, data.file_path.stem, data.headers, reset_row)
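# A hedged sketch of how these steps might be chained (the instance name and directory names below are
# hypothetical, not taken from the package):
#
#   standardiser.standardise_names("reformatted", "standardised")
#   standardiser.solve_ambiguity("standardised", "ambiguity_solved")
#
# Note that solve_ambiguity sums the numeric columns of imperfect duplicates, so only run it when summing is
# the merge behaviour you want.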
def suggest_ld_radius(self):
    """Suggest the LD radius that the user should be using"""
    total_snps = sum(CsvObject(Path(self.filter_directory, file)).column_length
                     for file in directory_iterator(self.filter_directory))
    print(f"Suggested LD Radius based on total snps found after filtering / 3000 is {total_snps / 3000}")
    return total_snps / 3000
def relational_subprocess(self, index_list, index_of_process, data_directory, write_directory):
    """
    This sub process is run via a call from relational_database via Process

    Each process is given a sub-selection of indexes from the PlaceReference loaded into _matcher. Each process
    will then isolate this name and create an output json database for it by extracting any matching entries'
    attributes from the data directory.

    :param index_list: A list of indexes to load from the PlaceReference for this process
    :type index_list: list[int]

    :param index_of_process: Which process thread this is
    :type index_of_process: int

    :param data_directory: Load directory of the standardised, cleaned, and corrected data
    :type data_directory: str | Path

    :param write_directory: Write directory for the json database
    :type write_directory: str | Path

    :return: Nothing, write a json database for each location that has been indexed from the PlaceReference.
    :rtype: None
    """
    # Currently processed files in the output directory
    current_files = [f for f in directory_iterator(write_directory)]

    for call_index, place_index in enumerate(index_list, 1):
        print(f"{call_index} / {len(index_list)} for process {index_of_process}")

        # Create the unique name from the groups, and isolate the gid for parsing the csv
        unique_name = "__".join(self._set_standardised_place(self._matcher[place_index]))
        gid = self._matcher[place_index][0]

        # Set the output stub for this place's json database
        place_data = {"Place_Name": unique_name, "GID": gid}

        # If the data has not already been processed
        if self._not_processed(unique_name, current_files):
            for file in directory_iterator(data_directory):
                # Load the data into memory
                data = CsvObject(Path(data_directory, file), set_columns=True)

                # Isolate any data pertaining to this place from this file and add it to the place_data dict
                self._process_relation_data(data, gid, place_data)

            write_json(place_data, write_directory, f"{unique_name}_{self._data_name}")
def reformat_raw_names(self, raw_csv, raw_name_i, date_i, data_start, out_directory, date_type="yyyymmdd",
                       date_delimiter="/"):
    """
    This will attempt to reformat names that are in a different style to the required weightGIS format

    :param raw_csv: The path of the csv of data you want to standardise
    :type raw_csv: Path | str

    :param raw_name_i: The place name index in the raw file
    :type raw_name_i: int

    :param date_i: The date index in the raw file
    :type date_i: int

    :param data_start: The column index from which the data starts
    :type data_start: int

    :param out_directory: Where you want this file to be written to
    :type out_directory: str | Path

    :param date_type: The type of date, takes the values of yyyy, yyyymmdd, or ddmmyyyy
    :type date_type: str

    :param date_delimiter: Delimiter used if dates are in a standard dd/mm/yyyy format
    :type date_delimiter: str

    :return: Nothing, write a csv per unique date then stop
    :rtype: None
    """
    raw_csv = CsvObject(raw_csv, set_columns=True)
    headers = ["Place"] + raw_csv.headers[data_start:]

    place_dict = self._create_place_dict(raw_csv, raw_name_i)
    unique_dates = self._set_name_dates(date_delimiter, date_i, date_type, raw_csv)

    for row_date, date in unique_dates.items():
        place_rows = []
        for row in raw_csv.row_data:
            if row[date_i] == row_date:
                place_rows.append([place_dict[self._simplify_string(row[raw_name_i])]] + row[data_start:])

        write_csv(out_directory, date, headers, place_rows)
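# A hedged usage sketch for reformat_raw_names, assuming a hypothetical raw csv with the place name in
# column 0, a dd/mm/yyyy date in column 1, and data from column 2 onwards:
#
#   standardiser.reformat_raw_names("raw_data.csv", raw_name_i=0, date_i=1, data_start=2,
#                                   out_directory="reformatted", date_type="ddmmyyyy")
#
# This writes one csv per unique date, named after that date, with a standardised "Place" column followed by
# the data columns.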
def _select_snps(self):
    """
    We may only want to run a subset of snps. If so, this loads the snp indexes from a csv; otherwise it
    returns the index of every snp in the genetic file.

    :return: A list of snp indexes
    :rtype: list[int]
    """
    if self.args["snps"]:
        return CsvObject(validate_path(self.args["snps"]), set_columns=True, column_types=int)[0]
    else:
        return [i for i in range(self.gen.sid_count)]
def set_snp_ids(memory_location, snps_to_id, gen_path, write_dir, file_name):
    """
    Isolate a subset of snps based on pre-defined named snps in a csv, passed as a str to snps_to_id, or a
    random set of snps of a pre-defined total, where the int is set to snps_to_id.

    :param memory_location: Location of the bgen memory file
    :type memory_location: Path | str

    :param snps_to_id: Location of the snps csv to id
    :type snps_to_id: Path | str

    :param gen_path: The path to the genetic file
    :type gen_path: Path | str

    :param write_dir: The directory to write the snp index csv file to
    :type write_dir: Path | str

    :param file_name: The name of the snp index file
    :type file_name: str

    :return: Nothing, write the id's to a csv then stop
    :rtype: None

    :raise TypeError: If a str / int is not passed
    """
    # Set the custom write location for the bgen memory files and load the genetic reference
    custom_meta_path(validate_path(memory_location))
    gen = Bgen(str(validate_path(gen_path).absolute()))

    # Construct a rsid: variant_id lookup dict
    v_dict = {snp[1]: snp[0] for snp in [snp.split(",") for snp in gen.sid]}

    # Load the list of snps to validate
    snps_list = CsvObject(validate_path(snps_to_id), set_columns=True)[0]

    # Get the index of each snp that is present
    snp_indexes = []
    for snp in snps_list:
        try:
            snp_indexes.append(gen.sid_to_index([f"{v_dict[snp]},{snp}"]).tolist())
        except KeyError:
            pass

    # Write the snp indexes out
    write_csv(write_dir, f"{file_name}", ["Snp"], snp_indexes)
    print(f"Constructed snp id list of length {len(snp_indexes)} for {gen_path} at {terminal_time()}")
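# A minimal usage sketch for set_snp_ids; every path below is hypothetical:
#
#   set_snp_ids(memory_location="bgen_meta", snps_to_id="snps_to_keep.csv",
#               gen_path="chr1.bgen", write_dir="filtered", file_name="chr1_snp_index")
#
# snps_to_keep.csv is expected to hold rsids in its first column; any rsid not found in the .bgen file is
# silently skipped by the KeyError handler above.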
def _ambiguity_setter(self, ambiguity, ambiguity_file_name):
    """If there is ambiguity, load the fix file"""
    if ambiguity:
        try:
            return CsvObject(f"{self._working_dir}/{ambiguity_file_name}", file_headers=False)
        except FileNotFoundError:
            raise FileNotFoundError(f"Ambiguity specified but no fix file named {ambiguity_file_name} found")
    else:
        return None
def __init__(self, args):
    read_path, write_path, start_index, name_index = args
    self._setup_camera()

    self.data_path = Path(read_path)
    self.data = CsvObject(self.data_path, set_columns=True)

    self.write_directory = write_path
    self.start_index = int(start_index)
    self.name_i = int(name_index)

    self.create_frames()
def __init__(self, args):
    write_directory, file_path, name_index, isolate, y_scale, border_width, colour, border_colour, write_name = args

    self.write_directory = write_directory
    self.csv_obj = CsvObject(file_path)
    self.name_i = int(name_index)
    self.isolate = int(isolate)
    self.y_scale = float(y_scale)
    self.border_width = float(border_width)
    self.colour = tuple_convert(colour)
    self.border_colour = tuple_convert(border_colour)
    self.write_name = write_name

    bpy.context.tool_settings.mesh_select_mode = (False, False, True)
    self.make_histogram()
def _set_reference_panel(self):
    """
    Many operations will need a reference panel of individuals that are genetically dissimilar / not related
    to each other. If set, this will load a csv or similar text file with two columns of type FID - IID; else
    it will return None.

    Note
    -----
    This operation does NOT allow for headers, so do not set them!
    """
    if self.args["Reference_Panel"]:
        path_to_file = Path(self.args["Reference_Panel"])
        validate_path(path_to_file, False)
        return CsvObject(path_to_file, set_columns=True, file_headers=False).row_data
    else:
        return None
def _set_corrections(self, correction_path):
    """
    Set the correction list for changing names after reformatting

    This is designed to change names that occur via spelling mistakes in the original source material. You
    could clean these in the reformatting stage, but if you have multiple datasets where the spelling errors
    occur yet the formatting differs, this allows for a standardised approach to fixing the error.

    :param correction_path: A path to the correction file, which contains as many columns as twice the number
        of match types, + 1 for the operator column.
    :type correction_path: str | Path

    :return: A list of length match_type + 1, where the additional entry is from the operator column so we know
        whether to delete or replace a value.
    """
    # Load the data into memory
    correction_data = CsvObject(validate_path(correction_path))

    # Assert there are as many columns as twice the number of match types, + 1 for the operator column
    assert (self._match_types * 2) + 1 == correction_data.row_length

    # Create the original and correction column index ranges
    original_i, new_i = [[i * self._match_types, (i * self._match_types) + self._match_types] for i in range(2)]

    correction_list = []
    for row in correction_data.row_data:
        # Isolate the original names and the replacements
        originals = [self._simplify_string(name) for name in row[original_i[0]: original_i[1]]]
        corrections = [self._simplify_string(name) for name in row[new_i[0]: new_i[1]]]

        # Append this to a list with the operator column; the last one, hence -1
        correction_list.append([originals, corrections, row[-1]])

    return correction_list
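# A hedged illustration of the expected correction-file layout when _match_types == 2 (the header names and
# values below are hypothetical). The first two columns hold the original names, the next two the corrected
# names, and the final column the operator used to decide whether to delete or replace:
#
#   original_district, original_county, new_district, new_county, operator
#   Sallford,          Lancashire,      Salford,      Lancashire, replace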
def __init__(self, place_reference, data_name, correction_path=None, cpu_cores=1, splitter="__", name_index=0,
             place_map=None):

    # Set the standardised name reference from a path to its csv
    self._reference = CsvObject(validate_path(place_reference), set_columns=True)

    # The name for this particular subset of data
    self._data_name = data_name

    # Number of cores to use for multi-core enabled methods
    self._cpu_cores = cpu_cores

    # Match lists to standardise names to; set the number of match types, where the -1 comes from removing GID
    self._matcher, self._reference_types, self.isolates = self._construct_match_list()
    self._gid, self._did, self._cid = self.isolates
    self._match_types = len(self._matcher[0]) - 1

    # If there is a correction file, validate it exists, then load it; else None
    if correction_path:
        self._corrections = self._set_corrections(correction_path)
    else:
        self._corrections = None

    # How to break names into chunks, and the column index of names in the reformatted data
    self._splitter = splitter
    self._name_index = name_index

    if place_map:
        # If names need to be remapped, assert there are as many maps as places in the matcher
        assert self._match_types == len(place_map)
        self.order = place_map
    else:
        # Otherwise set the place_map to be an ordered list of ints of range equal to the number of place types
        self.order = [i for i in range(len(self._matcher[0]) - 1)]
def remove_duplicates(raw_csv, write_directory):
    """
    Sometimes we may have known duplicates in a file which are not linked to ambiguity; in this case we can
    just purge the duplicates and re-write the file

    :param raw_csv: The csv with potential duplicates within it
    :type raw_csv: str | Path

    :param write_directory: The output directory for the file
    :type write_directory: Path | str

    :return: Nothing, write file then stop
    :rtype: None
    """
    csv_obj = CsvObject(validate_path(raw_csv))
    unique_values = [list(v) for v in Counter(tuple(r) for r in csv_obj.row_data)]
    write_csv(write_directory, csv_obj.file_path.stem, csv_obj.headers, unique_values)
def pie_chart(self, start_angle=90, display_values=None):
    warnings.warn("Deprecated: Will be moved into Seaborn.py soon.tm", DeprecationWarning)

    # Easier to use a CsvObject rather than pandas for this, so recast the data to CsvObject
    labels, amount, explode = CsvObject(self._read_directory, column_types=[str, int, float]).column_data

    # Construct the pie chart from the raw data
    ax = self.seaborn_figure()
    ax.pie(amount, explode=explode, labels=labels, startangle=start_angle, colors=self.palette(),
           autopct=display_values)
    ax.axis("equal")

    self.write_plot(ax)
    return ax
def _load_file(file_path, column_indexes):
    """
    Load the file containing the ICD codes, and return this along with the column indexes of interest based on
    the type of column_indexes.

    :param file_path: Path to the icd file
    :type file_path: str | Path

    :param column_indexes: The indexes to use, compared to the column headers if set. If a str, headers will be
        checked to see if this str is in the header and kept if true. If a list[int], then these indexes will
        be used. If None, then all indexes will be used.
    :type column_indexes: None | str | list[int]

    :return: The loaded csv file as well as the min and max indexes to use
    :rtype: (CsvObject, int, int)
    """
    print("Loading file...")
    icd_file = CsvObject(file_path)

    if column_indexes is None:
        indexes = [i for i in range(len(icd_file.headers))]
    elif isinstance(column_indexes, str):
        indexes = [i for i, header in enumerate(icd_file.headers) if column_indexes in header]
    elif isinstance(column_indexes, list):
        indexes = column_indexes
    else:
        sys.exit(f"Unexpected argument for column indexes: Found type {type(column_indexes)} but expected a "
                 f"NoneType, string or list\n"
                 f"If you want to use all columns, leave column indexes as None\n"
                 f"If you want to use columns containing a string, for example for ICD 10 Primary 41202, then "
                 f"assign 41202 to column indexes\n"
                 f"If you want to use only specific columns, pass a list of the indexes of those columns to "
                 f"column indexes")

    return icd_file, min(indexes), max(indexes)
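# Hedged usage sketches for the three accepted forms of column_indexes (the file name and values below are
# hypothetical):
#
#   _load_file("icd_codes.csv", None)        # use every column
#   _load_file("icd_codes.csv", "41202")     # use columns whose header contains "41202"
#   _load_file("icd_codes.csv", [3, 4, 5])   # use exactly these column indexes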
def aggregate_scores(self):
    """This will combine the scores found by chromosome into a single file"""
    combined_array = []
    for index, file in enumerate(directory_iterator(self.scores_directory)):
        score_file = CsvObject(Path(self.scores_directory, file), set_columns=True)

        # If it's the first file, we want to extract the iid and fid values as well
        if index == 0:
            fid, iid, score = score_file[self.fid], score_file[self.iid], score_file["Scores"]
            fid, iid, score = np.array(fid), np.array(iid), np.array(score).astype(float)
            fid.shape, iid.shape, score.shape = (len(fid), 1), (len(iid), 1), (len(score), 1)
            combined_array = [fid, iid, score]

        # Else extract the scores and append them to the array
        else:
            score = np.array(score_file["Scores"]).astype(float)
            score.shape = (len(score), 1)
            combined_array.append(score)

    # Combine the FID/IID (iid_count, 1) arrays, then sum the per-chromosome scores into a single (iid_count, 1)
    # score column
    iid_array = np.hstack(combined_array[:2])
    score_array = np.sum(np.hstack(combined_array[2:]), axis=1)
    score_array.shape = (len(score_array), 1)

    # Write the scores to the working directory
    write_rows = np.hstack([iid_array, score_array]).tolist()
    write_csv(Path(self.working_dir, "PGS"), "PolyGenicScores", ["FID", "IID", "Scores"], write_rows)
def pgs_scores(self):
    """
    This will construct the pgs for a given weighted beta type, such as infinitesimal, within this chromosome
    """
    # Load the reference to the full sample of IDs, and use it to extract genetic phenotype information
    core = self.gen_reference(self.select_file_on_chromosome())
    ph_dict = self.genetic_phenotypes(core)

    # Load the betas based on the weighted beta type specified by the user
    score_file = f"{self.score_type.split('_')[0]}_{self.target_chromosome}.csv"
    weights = CsvObject(Path(self.working_dir, "PGS", self.score_type, score_file), [str, float],
                        set_columns=True)

    # Chunk the data into memory chunks to be processed
    chunked_snps, chunks = self.chunked_snp_names(weights[self.snp_id], True)
    chunked_weights = np.array_split(weights[self.inf_beta], chunks)

    # Weight the dosage data to construct the scores
    scores = self._weight_dosage(chunked_snps, chunked_weights, core, ph_dict)

    # Combine the FID/IID, genetic phenotype information, and the score for this chromosome
    scores.shape = (len(scores), 1)
    iid_fid = np.array([[v[i] for v in ph_dict.values()] for i in range(core.iid_count)])
    write_out = np.hstack([iid_fid, scores]).tolist()

    # Write this information to a csv
    headers = list(ph_dict.keys()) + ["Scores"]
    write_csv(self.scores_directory, f"Scores_{self.target_chromosome}", headers, write_out)
    print(f"Finished constructing scores for chromosome {self.target_chromosome} {terminal_time()}")
def main_call(out_dir, write_dir, headers):
    output = []
    for file in directory_iterator(out_dir):
        if ".log" not in file:
            csv_file = CsvObject(Path(out_dir, file))

            # Isolate the model values from the aggregated [snp] + [model 1, ... model N]
            for row in csv_file.row_data:
                snp, models = row[0], chunk_list(row[1:], len(headers))
                output.append([snp, models])

    print(f"For {len(output)} Snps")

    model_count = len(output[0][1])
    model_comp = []
    for i in range(model_count):
        print(f"For model {i + 1}")

        # Write out the aggregated chromosome model data to a directory
        model_out = []
        for snp, model in output:
            model_out.append([snp] + model[i])
        write_csv(write_dir, f"Model{i + 1}", ["Snp"] + headers, model_out)

        # Append the comparison to a master list of models
        model_comp.append([f"Model {i + 1}"] + [str(np.mean([float(values[vi]) for values in model_out]))
                                                for vi in range(1, 4)])

    # Write the model comparison out
    write_csv(write_dir, "Model Comparison",
              ["Model", "Mean Coefficient", "Mean Standard Error", "Mean P Values"], model_comp)
def _construct_icd9_lookup(self, definition_path):
    """
    For ICD 9, we need to take the first and second columns to get the min and max range, to convert these into
    a process similar to our ICD 10 codes. Row[0] represents the phenotype name.
    """
    return {row[0]: self._set_icd9_def(row) for row in CsvObject(definition_path).row_data}
def combine(self, unique_id, data_start, root_directory, write_directory, write_name):
    """
    weightGIS expects each file to have a single date, so if you have lots of files of the same date that you
    want to process at the same time, you will need to combine them

    :param unique_id: The unique id index
    :type unique_id: int

    :param data_start: The index from which the data starts
    :type data_start: int

    :param root_directory: The root directory of the csv files
    :type root_directory: Path | str

    :param write_directory: The output directory for the file
    :type write_directory: Path | str

    :param write_name: Name of the combined file
    :type write_name: str

    :return: Nothing, write file then stop
    :rtype: None
    """
    # Create the unique IDs
    unique_id_list = sorted(set(flatten([CsvObject(Path(root_directory, file), set_columns=True)[unique_id]
                                         for file in directory_iterator(root_directory)])))

    # For each unique ID
    out_list = []
    for count_i, ids in enumerate(unique_id_list):
        if count_i % 10 == 0:
            print(f"{count_i} / {len(unique_id_list)}")

        # Check each file for a matching row
        ids_list = []
        for index, file in enumerate(directory_iterator(root_directory)):
            # If it's the first file, take the full row
            if index == 0:
                ids_list += self._isolate(root_directory, file, unique_id, ids)
            # Otherwise only take the values after the data start
            else:
                ids_list += self._isolate(root_directory, file, unique_id, ids)[data_start:]

        out_list.append(ids_list)

    # Build the combined headers in the same manner: the full header of the first file, then data columns only
    headers = []
    for index, file in enumerate(directory_iterator(root_directory)):
        if index == 0:
            headers += CsvObject(Path(root_directory, file)).headers
        else:
            headers += CsvObject(Path(root_directory, file)).headers[data_start:]

    write_csv(write_directory, write_name, headers, out_list)
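# A small worked sketch of what combine produces; the file contents below are hypothetical. Given two files
# in root_directory,
#
#   a.csv -> ID, Name, X          b.csv -> ID, Name, Y
#            1,  Leeds, 10                 1,  Leeds, 7
#
# combine(unique_id=0, data_start=2, ...) writes a single file with headers ID, Name, X, Y and the row
# 1, Leeds, 10, 7. IDs missing from a file are padded with empty strings by _isolate.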
# Make the District the active object
bpy.context.view_layer.objects.active = ob
ob.select_set(True)

# Set the emission colour on each of the District's material slots
for mat in ob.material_slots:
    mat.material.node_tree.nodes["Emission"].inputs[0].default_value = colour


root = r"I:\Work\DataBases\Adjacent\Months"

# Check each monthly csv for the expected number of rendered images, recording any that appear incomplete
not_processed = []
for file in directory_iterator(root):
    print(file)

    csv_obj = CsvObject(Path(root, file))
    target = len(csv_obj.headers) - 2

    # Count the images whose year stem matches this file's stem
    found = 0
    for img in directory_iterator(r"I:\Work\Figures_and_tables\DiseasesOverTime"):
        year = img.split("_")[-1].split(".")[0]
        if year == Path(root, file).stem:
            found += 1

    if found != target:
        not_processed.append(file)

print(not_processed)
print(len(not_processed))
def locate_individuals(ids_path, lowest_level_shapefile_path, geo_lookup, east_i, north_i, shape_match_index,
                       write_directory, write_name):
    """
    This will assist you in locating individuals with a geo lookup file

    :param ids_path: The path to a csv file filled with ids with an eastings and northings
    :type ids_path: Path | str

    :param lowest_level_shapefile_path: Path to the lowest level shapefile you used in your geo reference
    :type lowest_level_shapefile_path: Path | str

    :param geo_lookup: The path to the geo lookup
    :type geo_lookup: Path | str

    :param east_i: Index of the eastings in the id data
    :type east_i: int

    :param north_i: Index of the northings in the id data
    :type north_i: int

    :param shape_match_index: Index of the matching parameter, which should be common to both the geo reference
        and the shapefile
    :type shape_match_index: int

    :param write_directory: The saved file will be written here
    :type write_directory: Path | str

    :param write_name: The name of the file to write
    :type write_name: str

    :return: Nothing, write the file then stop
    :rtype: None
    """
    id_file = CsvObject(ids_path)
    geo_file = CsvObject(geo_lookup)
    shape_obj = ShapeObject(lowest_level_shapefile_path)

    # Create a list of unique easting__northing coordinates to avoid unnecessary iteration
    unique_places = sorted({f"{respondent[east_i]}__{respondent[north_i]}" for respondent in id_file.row_data})

    # Create an id: all-other-rows lookup so we can identify each location from the lowest level
    geo_lookup = {row[shape_match_index]: row for row in geo_file.row_data}

    # Link all the geometry
    geo_link = create_geo_link(unique_places, geo_file, geo_lookup, shape_obj, shape_match_index)

    output_rows = []
    for respondent in id_file.row_data:
        # Isolate the columns that are not east/north
        non_location = [r for i, r in enumerate(respondent) if i not in (east_i, north_i)]

        # Append the geo_link birth location to this
        birth_location = f"{respondent[east_i]}__{respondent[north_i]}"
        output_rows.append(non_location + geo_link[birth_location])

    headers = [h for i, h in enumerate(id_file.headers) if i not in (east_i, north_i)] + geo_file.headers
    write_csv(write_directory, write_name, headers, output_rows)
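# A hedged usage sketch for locate_individuals; all paths and indexes below are hypothetical:
#
#   locate_individuals(ids_path="respondents.csv", lowest_level_shapefile_path="parishes.shp",
#                      geo_lookup="GeoLookup.csv", east_i=2, north_i=3, shape_match_index=0,
#                      write_directory="located", write_name="RespondentsLocated")
#
# The eastings/northings columns are dropped from the output and replaced by the matched geo-lookup row.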