def make_location_pickle(base_path, path, Description):
    """Build the iteration-location mapping for *path* and pickle it.

    Each value returned by ``get_iteration_location`` is trimmed down to its
    final component: the text after *path*, then the text after the last
    backslash.  The trimmed mapping is written to ``Data_Locations.pkl``
    inside *base_path*.
    """
    locations = get_iteration_location(path, Description=Description)
    for name in list(locations.keys()):
        tail = locations[name].split(path)[-1]
        locations[name] = tail.split('\\')[-1]
    save_obj(os.path.join(base_path, 'Data_Locations.pkl'), locations)
    return None
def main_run(images_description='Bastien_Cervix_Uterus_Data_No_Applicator',
             base_path=r'K:\Morfeus\BMAnderson\CNN\Data\Data_Bastien'):
    """Collect per-iteration spacing info from every series folder and pickle it.

    For each immediate sub-directory of *base_path*, every ``Iteration_*.txt``
    file is read; the first three comma-separated fields of its first line are
    stored under ``info[series][iteration]``.  The result is saved as
    ``patient_info_<images_description>.pkl`` in *base_path*.

    :param images_description: suffix used in the output pickle file name.
    :param base_path: directory whose sub-directories are the series folders.
    """
    # next(os.walk(...), default) avoids an unbound name when the path
    # does not exist / yields nothing (the original loop-and-break did not).
    _, series_descriptions, _ = next(os.walk(base_path), ('', [], []))
    patient_spacing_info = {}
    for series_description in series_descriptions:
        patient_spacing_info[series_description] = {}
        _, _, files = next(os.walk(os.path.join(base_path, series_description)),
                           ('', [], []))
        # Only iteration text files, e.g. "Iteration_3.txt".
        iteration_files = [f for f in files if '.txt' in f and 'Iteration' in f]
        for file_name in iteration_files:
            iteration = file_name.split('Iteration_')[-1].split('.txt')[0]
            # `with` guarantees the handle is closed even if the read raises.
            with open(os.path.join(base_path, series_description, file_name)) as fid:
                data = fid.readline()
            fields = data.strip('\n').split(',')
            # Keep only the first three fields (spacing triplet); an
            # IndexError here means the file is malformed, as before.
            patient_spacing_info[series_description][iteration] = (
                fields[0] + ',' + fields[1] + ',' + fields[2])
    save_obj(
        os.path.join(base_path, 'patient_info_' + images_description + '.pkl'),
        patient_spacing_info)
def calculate_deviations(self, data):
    """Update the running per-"Value" mean prediction deviations.

    Predicts over a copy of *data*, computes the mean absolute deviation
    between prediction and the integer "Value" column per value, then folds
    the fresh means into ``self.tree_searcher.std_vals`` (new keys are
    inserted; existing keys are averaged with the old entry).  The merged
    dict is pickled under ``self.experiment_name`` and returned.
    """
    frame = data.copy()
    frame["pred"] = list(self.value_net.predict_df(frame))
    frame["pred"] = frame["pred"].apply(lambda row: row[0])
    frame["Value"] = frame["Value"].astype('int64')
    frame["deviation"] = abs(frame["pred"] - frame["Value"])
    fresh = frame.groupby("Value")["deviation"].mean().to_dict()
    running = self.tree_searcher.std_vals
    for value, deviation in fresh.items():
        if value in running:
            # Smooth with the previous estimate rather than overwrite.
            running[value] = (running[value] + deviation) / 2
        else:
            running[value] = deviation
    self.tree_searcher.std_vals = running
    save_obj(running, self.experiment_name)
    return running
def save(self, k):
    """Pickle this object's parameters under ``models/<k>``."""
    save_obj(self.get_params(), 'models/' + str(k))
def pre_data():
    """Load, clean and tokenize the text corpus for the spell-checker model.

    Notebook-converted pipeline: reads every file in ``./texts/``, cleans the
    text, builds char<->int vocab dicts (pickled as "vocab_to_int" /
    "int_to_vocab"), splits the corpus into integer-encoded sentences,
    filters them by length, splits train/test, and prints diagnostics.
    Sets the globals ``training_sorted`` / ``testing_sorted`` as a side
    effect and returns ``vocab_to_int``.
    """
    # NOTE(review): `clean_text` and `sentence` are declared global but
    # `clean_text` is also called as a function below — presumably a
    # module-level helper; verify this declaration is intentional.
    global clean_text, sentence, training_sorted, testing_sorted
    path = './texts/'
    # All regular files in ./texts/ are treated as corpus texts.
    text_files = [f for f in listdir(path) if isfile(join(path, f))]
    texts = []
    for text in text_files:
        print(path)
        print(text)
        texts.append(load_text(path + text))
    # Compare the number of words in each text.
    for i in range(len(texts)):
        print("There are {} words in {}.".format(len(texts[i].split()),
                                                 text_files[i]))
    # NOTE(review): no-op expression left over from the notebook (was a cell
    # output used to eyeball the raw text).
    texts[0][:500]
    # Clean every text with the project-level clean_text helper.
    clean_texts = []
    for text in texts:
        clean_texts.append(clean_text(text))
    # NOTE(review): another leftover no-op notebook inspection.
    clean_texts[0][:500]
    # Create a dictionary to convert the vocabulary (characters) to integers.
    vocab_to_int = {}
    count = 0
    for text in clean_texts:
        for character in text:
            if character not in vocab_to_int:
                vocab_to_int[character] = count
                count += 1
    # Add special tokens to vocab_to_int.
    codes = ['<PAD>', '<EOS>', '<GO>']
    for code in codes:
        vocab_to_int[code] = count
        count += 1
    # Check the size of vocabulary and all of the values.
    vocab_size = len(vocab_to_int)
    print("The vocabulary contains {} characters.".format(vocab_size))
    print(sorted(vocab_to_int))
    save_obj(vocab_to_int, "vocab_to_int")
    # Create the inverse dictionary: integer -> character.
    int_to_vocab = {}
    for character, value in vocab_to_int.items():
        int_to_vocab[value] = character
    save_obj(int_to_vocab, "int_to_vocab")
    # Split the cleaned texts into sentences on '.', re-appending the period.
    sentences = []
    for text in clean_texts:
        for sentence in text.split('.'):
            sentences.append(sentence.strip() + '.')
    print("There are {} sentences.".format(len(sentences)))
    # NOTE(review): leftover no-op notebook inspection of the first sentences.
    sentences[:5]
    # Convert sentences to integer sequences via vocab_to_int.
    int_sentences = []
    for sentence in sentences:
        int_sentence = []
        for character in sentence:
            int_sentence.append(vocab_to_int[character])
        int_sentences.append(int_sentence)
    # Find the length of each sentence.
    lengths = []
    for sentence in int_sentences:
        lengths.append(len(sentence))
    lengths = pd.DataFrame(lengths, columns=["counts"])
    # NOTE(review): .describe() result is discarded (was a notebook cell output).
    lengths.describe()
    # Limit the data we will use to train our model: keep sentences whose
    # length falls in [min_length, max_length].
    max_length = 50
    min_length = 1
    good_sentences = []
    for sentence in int_sentences:
        if len(sentence) <= max_length and len(sentence) >= min_length:
            good_sentences.append(sentence)
    print("We will use {} to train and test our model.".format(
        len(good_sentences)))
    # Split the data into training and testing sentences (85/15, fixed seed).
    training, testing = train_test_split(good_sentences, test_size=0.15,
                                         random_state=2)
    print("Number of training sentences:", len(training))
    print("Number of testing sentences:", len(testing))
    # NOTE(review): despite the names, nothing is sorted here — the original
    # sort-by-length step was removed and the lists are plain aliases.
    training_sorted = []
    testing_sorted = []
    training_sorted = training
    testing_sorted = testing
    # Histogram of sentence lengths (diagnostic print only).
    sentences_len = map(len, sentences)
    from collections import Counter
    labels, values = zip(*Counter(sentences_len).items())
    print("labels: ", labels)
    print("values: ", values)
    for i in range(len(labels)):
        print(str(labels[i]) + ":" + str(values[i]))
    # Check to ensure the sentences have been selected correctly.
    for i in range(2):
        print(training_sorted[i], len(training_sorted[i]))
    # Spot-check the noise_maker augmentation on a few training sentences.
    threshold = 0.95
    for sentence in training_sorted[:5]:
        print(sentence)
        print(noise_maker(sentence, threshold, vocab_to_int))
        print()
    return vocab_to_int