Example #1
import os  # needed for os.walk / os.path.join below

# get_iteration_location and save_obj are project helpers assumed to be in scope.
def make_location_pickle(base_path, path, Description):
    # Keep only the file name of each returned location, then pickle the mapping.
    output = get_iteration_location(path, Description=Description)
    key_list = list(output.keys())
    for key in key_list:
        output[key] = output[key].split(path)[-1].split('\\')[-1]
    save_obj(os.path.join(base_path, 'Data_Locations.pkl'), output)
    return None

def main_run(images_description='Bastien_Cervix_Uterus_Data_No_Applicator',
             base_path=r'K:\Morfeus\BMAnderson\CNN\Data\Data_Bastien'):

    # The first yield of os.walk gives the immediate subdirectories, one per series.
    Series_Descriptions = []
    for _, Series_Descriptions, _ in os.walk(base_path):
        break

    patient_spacing_info = {}
    for Series_Description in Series_Descriptions:
        patient_spacing_info[Series_Description] = {}
        for _, _, files in os.walk(os.path.join(base_path,
                                                Series_Description)):
            break
        file_list = [
            i for i in files
            if i.find('.txt') != -1 and i.find('Iteration') != -1
        ]
        for file in file_list:
            iteration = (file.split('Iteration_')[-1]).split('.txt')[0]
            # The first line of each Iteration_*.txt file holds comma-separated
            # spacing values; keep the first three.
            with open(os.path.join(base_path, Series_Description, file)) as fid:
                data = fid.readline()
            data = data.strip('\n').split(',')
            patient_spacing_info[Series_Description][iteration] = ','.join(data[:3])
    save_obj(
        os.path.join(base_path, 'patient_info_' + images_description + '.pkl'),
        patient_spacing_info)
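
Every example on this page relies on a project-local save_obj helper rather than anything from the standard library, and its signature is not consistent: Example #1 passes the destination path first, while Examples #3, #4 and #5 pass the object first and the name second. A minimal pickle-based sketch of the object-first variant (this body is purely hypothetical, and load_obj is an assumed counterpart that does not appear in any of the examples):

import pickle

def save_obj(obj, name):
    # Hypothetical helper: pickle obj to '<name>.pkl'.
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    # Assumed counterpart that restores the pickled object.
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
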
Example #3
    def calculate_deviations(self, data):
        # Predict a value for every row and measure the absolute error
        # against the ground-truth "Value" column.
        data = data.copy()
        data["pred"] = list(self.value_net.predict_df(data))
        data["pred"] = data["pred"].apply(lambda x: x[0])
        data["Value"] = data["Value"].astype('int64')
        data["deviation"] = abs(data["pred"] - data["Value"])

        deviations = data.groupby("Value")["deviation"].mean()

        new_vals = deviations.to_dict()
        old_vals = self.tree_searcher.std_vals

        # Merge into the stored deviations: new keys are taken as-is,
        # previously seen keys are blended 50/50 with the old estimate.
        for key in new_vals.keys():
            if key not in old_vals:
                old_vals[key] = new_vals[key]
            else:
                old_vals[key] = (old_vals[key] + new_vals[key]) / 2

        self.tree_searcher.std_vals = old_vals
        save_obj(old_vals, self.experiment_name)
        return old_vals
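
The merge loop above keeps a running estimate of the mean absolute deviation per ground-truth value: a value seen for the first time adopts the new mean directly, while a previously seen value is averaged with its stored estimate. A small standalone check of that rule with made-up numbers:

old_vals = {1: 0.5}
new_vals = {1: 0.25, 2: 0.75}
for key, val in new_vals.items():
    old_vals[key] = val if key not in old_vals else (old_vals[key] + val) / 2
print(old_vals)  # {1: 0.375, 2: 0.75}
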
Example #4
    def save(self, k):
        # Pickle the current parameters under models/<k>.
        name = 'models/' + str(k)
        save_obj(self.get_params(), name)
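
Reading the parameters back would presumably go through a matching load helper and the estimator's set_params, e.g. something like restored.set_params(**load_obj('models/3')); this assumes a load_obj counterpart such as the sketch after Example #1 and a scikit-learn-style get_params()/set_params() pair, neither of which is shown in the source.
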
Example #5
from os import listdir
from os.path import isfile, join
from collections import Counter

import pandas as pd
from sklearn.model_selection import train_test_split


def pre_data():
    # load_text, clean_text, noise_maker and save_obj are project helpers
    # assumed to be defined elsewhere in this module.
    global clean_text, sentence, training_sorted, testing_sorted
    path = './texts/'
    text_files = [f for f in listdir(path) if isfile(join(path, f))]

    texts = []
    for text in text_files:
        print(path)
        print(text)
        texts.append(load_text(path + text))
    # In[5]:
    # Compare the number of words in each text
    for i in range(len(texts)):
        print("There are {} words in {}.".format(len(texts[i].split()), text_files[i]))
    # In[9]:
    # Check to ensure the text looks alright
    print(texts[0][:500])

    # ## Preparing the Data
    # In[10]:

    # In[11]:
    # Clean the text of the texts
    clean_texts = []
    for text in texts:
        clean_texts.append(clean_text(text))
    # In[12]:
    # Check to ensure the text has been cleaned properly
    print(clean_texts[0][:500])
    # In[13]:
    # Create a dictionary to convert the vocabulary (characters) to integers
    vocab_to_int = {}
    count = 0
    for text in clean_texts:
        for character in text:
            if character not in vocab_to_int:
                vocab_to_int[character] = count
                count += 1
    # Add special tokens to vocab_to_int
    codes = ['<PAD>', '<EOS>', '<GO>']
    for code in codes:
        vocab_to_int[code] = count
        count += 1
    # In[14]:
    # Check the size of vocabulary and all of the values
    vocab_size = len(vocab_to_int)
    print("The vocabulary contains {} characters.".format(vocab_size))
    print(sorted(vocab_to_int))
    save_obj(vocab_to_int, "vocab_to_int")
    # *Note: We could have made this project a little easier by using only lower case words and fewer special characters ($,&,-...), but I want to make this spell checker as useful as possible.*
    # In[15]:
    # Create another dictionary to convert integers to their respective characters
    int_to_vocab = {}
    for character, value in vocab_to_int.items():
        int_to_vocab[value] = character
    save_obj(int_to_vocab, "int_to_vocab")
    # In[16]:
    # Split the text from the texts into sentences.
    sentences = []
    for text in clean_texts:
        for sentence in text.split('.'):
            sentences.append(sentence.strip() + '.')
    print("There are {} sentences.".format(len(sentences)))
    # In[17]:
    # Check to ensure the text has been split correctly.
    print(sentences[:5])
    # *Note: I expect that you have noticed the very ugly text in the first sentence. We do not need to worry about removing it from any of the texts because we will be limiting our data to sentences that are shorter than it.*
    # In[18]:
    # Convert sentences to integers
    int_sentences = []
    for sentence in sentences:
        int_sentence = []
        for character in sentence:
            int_sentence.append(vocab_to_int[character])
        int_sentences.append(int_sentence)
    # In[19]:
    # Find the length of each sentence
    lengths = []
    for sentence in int_sentences:
        lengths.append(len(sentence))
    lengths = pd.DataFrame(lengths, columns=["counts"])
    # In[20]:
    print(lengths.describe())
    # In[21]:
    # Limit the data we will use to train our model
    max_length = 50
    min_length = 1
    good_sentences = []
    for sentence in int_sentences:
        if len(sentence) <= max_length and len(sentence) >= min_length:
            good_sentences.append(sentence)

    print("We will use {} to train and test our model.".format(len(good_sentences)))
    # *Note: I decided not to use very long or short sentences because they are not as useful for training our model. Shorter sentences are less likely to include an error and the text is more likely to be repetitive. Longer sentences are more difficult to learn due to their length and increase the training time quite a bit. If you are interested in using this model for more than just a personal project, it would be worth using these longer sentences, and much more training data, to create a more accurate model.*
    # In[22]:
    # Split the data into training and testing sentences
    training, testing = train_test_split(good_sentences, test_size=0.15, random_state=2)
    print("Number of training sentences:", len(training))
    print("Number of testing sentences:", len(testing))
    # In[23]:
    # Sort the sentences by length to reduce padding, which will allow the model to train faster
    training_sorted = sorted(training, key=len)
    testing_sorted = sorted(testing, key=len)

    # Distribution of sentence lengths (Counter is imported at the top of this example)
    sentences_len = map(len, sentences)
    labels, values = zip(*Counter(sentences_len).items())
    print("labels: ", labels)
    print("values: ", values)

    for i in range(len(labels)):
        print(str(labels[i]) + ":" + str(values[i]))


    # In[24]:
    # Check to ensure the sentences have been selected and sorted correctly
    for i in range(2):
        print(training_sorted[i], len(training_sorted[i]))
    threshold = 0.95
    for sentence in training_sorted[:5]:
        print(sentence)
        print(noise_maker(sentence, threshold, vocab_to_int))
        print()
    return vocab_to_int
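
pre_data() returns vocab_to_int, and int_to_vocab is simply its inverse, so an encode/decode round trip can be checked directly. A minimal sketch, assuming the sample string's characters all occur in the training texts:

vocab_to_int = pre_data()
int_to_vocab = {i: ch for ch, i in vocab_to_int.items()}  # rebuild the inverse mapping

sample = "hello there."
encoded = [vocab_to_int[ch] for ch in sample]        # characters -> integer IDs
decoded = ''.join(int_to_vocab[i] for i in encoded)  # integer IDs -> characters
assert decoded == sample
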