Exemplo n.º 1
0
    def create_data_from_all_species_together(self):
        """
        creating data for all species together -  12000 samples:
        takes random 1/5 of the samples from the training data of each species,
        random 1/17 of the samples from the validation data of each species,
        and random 1/5 of the samples from the test data of each species.
        (same random indices from all species).
        :return:
        """
        print "start creating data for : ", self.project.species[-1], \
              " and : ", self.project.species[-2]
        train_samples_60000 = []
        validation_samples_60000 = []
        test_samples_60000 = []
        train_samples_12000 = []
        validation_samples_12000 = []
        test_samples_12000 = []
        # collect all training, validation and test data from all species:
        section_index = 0
        for section_name in self.sections:
            for j in range(len(self.project.species) - 2):
                species_name = self.project.species[j]
                section_samples_per_species = []
                section_file_x_path = os.path.join(
                    self.project.text_samples_base_dir, species_name,
                    self.project.k_let_dirs[self.project.k - 1],
                    section_name + "_X")
                section_file_y_path = os.path.join(
                    self.project.text_samples_base_dir, species_name,
                    self.project.k_let_dirs[self.project.k - 1],
                    section_name + "_Y")

                sample_sequences = []
                with open(section_file_x_path) as one_species_section_file_x:
                    for line in one_species_section_file_x:
                        if "\n" in line:
                            sequence = line[:
                                            -1]  # without \n at the end of the line
                        else:
                            sequence = line
                        sample_sequences.append(sequence)
                sample_labels = []
                with open(section_file_y_path) as one_species_section_file_y:
                    for line in one_species_section_file_y:
                        if "\n" in line:
                            label = int(
                                line[:-1])  # without \n at the end of the line
                        else:
                            label = int(line)
                        sample_labels.append(label)
                current_number_of_samples_in_section = 0
                for sample_index in range(len(sample_sequences)):
                    sample_object = SampleObject(
                        sample_sequences[sample_index],
                        sample_labels[sample_index])
                    section_samples_per_species.append(sample_object)
                    current_number_of_samples_in_section += 1
                    number_of_samples_in_section = \
                        2*(self.project.get_number_of_samples())*(self.section_ratios[section_index])
                    if current_number_of_samples_in_section >= (
                            number_of_samples_in_section *
                            self.ratio_of_samples_from_all_species):
                        break

                number_section_samples = current_number_of_samples_in_section
                # takes 1/17 from each species samples
                random_indices_of_section_samples = random.sample(
                    xrange(number_section_samples),
                    number_section_samples / (len(self.project.species) - 2))
                random_section_samples = [
                    section_samples_per_species[j]
                    for j in range(len(section_samples_per_species))
                    if j in random_indices_of_section_samples
                ]
                if section_name == 'train':
                    train_samples_12000.extend(random_section_samples)
                    train_samples_60000.extend(section_samples_per_species)
                elif section_name == 'validation':
                    validation_samples_12000.extend(random_section_samples)
                    validation_samples_60000.extend(
                        section_samples_per_species)
                elif section_name == 'test':
                    test_samples_12000.extend(random_section_samples)
                    test_samples_60000.extend(section_samples_per_species)
            section_index += 1
        for i in range(
                len(self.project.species) - 2, len(self.project.species)):
            species_name = self.project.species[i]
            species_dir_text = os.path.join(self.project.text_samples_base_dir,
                                            species_name)
            pos_out_path = os.path.join(species_dir_text, "positive_samples")
            neg_out_path = os.path.join(
                species_dir_text, self.project.k_let_dirs[self.project.k - 1],
                "negative_samples")
            species_dir_npy = os.path.join(self.project.samples_base_dir,
                                           species_name)
            if i == len(
                    self.project.species
            ) - 2:  # iteration #17 - creating data for all species - 60000 samples
                train_samples = train_samples_60000
                validation_samples = validation_samples_60000
                test_samples = test_samples_60000
            elif i == len(
                    self.project.species
            ) - 1:  # iteration #18 - creating data for all species - 12000 samples
                train_samples = train_samples_12000
                validation_samples = validation_samples_12000
                test_samples = test_samples_12000
            with open(pos_out_path, 'w') as out_pos:
                with open(neg_out_path, 'w') as out_neg:
                    for section_name in self.sections:
                        print "start section: ", section_name
                        if section_name == 'train':
                            section_samples = train_samples
                        elif section_name == 'validation':
                            section_samples = validation_samples
                        elif section_name == 'test':
                            section_samples = test_samples
                        # write the section samples and labels to text file and numpy file
                        path_out_text_X, path_out_text_Y = data_handle.get_path(
                            species_dir_text, section_name, self.project.k)
                        path_out_npy_X, path_out_npy_Y = data_handle.get_path(
                            species_dir_npy, section_name, self.project.k)
                        shuffle(section_samples)
                        section_sample_matrices = []
                        section_label_matrices = []
                        section_samples_str = []
                        section_labels_str = []
                        for sample in section_samples:
                            section_sample_matrices.append(
                                sample.get_sample_matrix())
                            section_label_matrices.append(
                                sample.get_label_matrix())
                            section_samples_str.append(sample.get_sample_str())
                            section_labels_str.append(str(sample.get_label()))
                        np.save(path_out_npy_X, section_sample_matrices)
                        np.save(path_out_npy_Y, section_label_matrices)
                        with open(path_out_text_X, 'w') as out_text_samples:
                            string = '\n'.join(section_samples_str)
                            out_text_samples.write(string)
                        with open(path_out_text_Y, 'w') as out_text_labels:
                            string = '\n'.join(section_labels_str)
                            out_text_labels.write(string)
                        self.write_positive_and_negative_text_samples_files(
                            section_labels_str, section_samples_str, out_pos,
                            out_neg)
    def create_data_from_all_species_together(self):
        """
        creating data for all species together -  14000 samples:
        takes random 1/17 of the samples from the training data of each species,
        random 1/17 of the samples from the validation data of each species,
        and random 1/17 of the samples from the test data of each species.
        (same random indices from all species).
        :return:
        """
        print "start creating data for : ", self.project.species[-1], \
              " and : ", self.project.species[-2]
        train_samples_238000 = []
        validation_samples_238000 = []
        test_samples_238000 = []
        train_samples_14000 = []
        validation_samples_14000 = []
        test_samples_14000 = []
        # collect all training, validation and test data from all species:
        section_index = 0
        for section_name in self.sections:
            for j in range(len(self.project.species) - 2):
                species_name = self.project.species[j]
                section_samples_per_species = []
                section_file_path = os.path.join(
                    self.project.text_samples_base_dir, species_name,
                    section_name + "_data")
                with open(section_file_path) as one_species_section_file:
                    line_counter = 0
                    for line in one_species_section_file:
                        split_line = line.split("\t")
                        label = split_line[0]
                        label_int = 1 if label == "True" else 0
                        sequence = (split_line[1]
                                    )[:-1]  # without \n at the end of the line
                        sample_object = SampleObject(sequence, label_int)
                        section_samples_per_species.append(sample_object)
                        line_counter += 1
                        number_of_samples_in_section = \
                            2*(self.project.get_number_of_samples())*(self.section_ratios[section_index])
                        # TODO for now - takes only <self.ratio_of_samples_from_all_species> samples from each section
                        if line_counter >= (
                                number_of_samples_in_section *
                                self.ratio_of_samples_from_all_species):
                            break

                number_section_samples = line_counter
                # takes 1/17 from each species samples
                random_indices_of_section_samples = random.sample(
                    xrange(number_section_samples),
                    number_section_samples / (len(self.project.species) - 2))
                random_section_samples = [
                    section_samples_per_species[j]
                    for j in range(len(section_samples_per_species))
                    if j in random_indices_of_section_samples
                ]
                if section_name == 'train':
                    train_samples_14000.extend(random_section_samples)
                    train_samples_238000.extend(section_samples_per_species)
                elif section_name == 'validation':
                    validation_samples_14000.extend(random_section_samples)
                    validation_samples_238000.extend(
                        section_samples_per_species)
                elif section_name == 'test':
                    test_samples_14000.extend(random_section_samples)
                    test_samples_238000.extend(section_samples_per_species)
            section_index += 1
        # print "len(train_samples_14000) = ", len(train_samples_14000)
        # print "len(validation_samples_14000) = ", len(validation_samples_14000)
        # print "len(test_samples_14000) = ", len(test_samples_14000)
        # print "len(train_samples_238000) = ", len(train_samples_238000)
        # print "len(validation_samples_238000) = ", len(validation_samples_238000)
        # print "len(test_samples_238000) = ", len(test_samples_238000)
        # print "total 238000 train+validation+test: ", len(train_samples_238000) \
        #         + len(validation_samples_238000) + len(test_samples_238000)
        # print "total 14000 train+validation+test: ", len(train_samples_14000) \
        #         + len(validation_samples_14000) + len(test_samples_14000)
        for i in range(
                len(self.project.species) - 2, len(self.project.species)):
            species_name = self.project.species[i]
            species_dir_text = os.path.join(self.project.text_samples_base_dir,
                                            species_name)
            pos_out_path = os.path.join(species_dir_text, "positive_samples")
            neg_out_path = os.path.join(species_dir_text, "negative_samples")
            species_dir_npy = os.path.join(self.project.samples_base_dir,
                                           species_name)
            if i == len(
                    self.project.species
            ) - 2:  # iteration #17 - creating data for all species - 238000 samples
                train_samples = train_samples_238000
                validation_samples = validation_samples_238000
                test_samples = test_samples_238000
            elif i == len(
                    self.project.species
            ) - 1:  # iteration #18 - creating data for all species - 14000 samples
                train_samples = train_samples_14000
                validation_samples = validation_samples_14000
                test_samples = test_samples_14000
            with open(pos_out_path, 'w') as out_pos:
                with open(neg_out_path, 'w') as out_neg:
                    for section_name in self.sections:
                        print "start section: ", section_name
                        if section_name == 'train':
                            section_samples = train_samples
                        elif section_name == 'validation':
                            section_samples = validation_samples
                        elif section_name == 'test':
                            section_samples = test_samples
                        # write the section samples and labels to text file and numpy file
                        path_out_text_X, path_out_text_Y = data_handle.get_path(
                            species_dir_text, section_name)
                        path_out_npy_X, path_out_npy_Y = data_handle.get_path(
                            species_dir_npy, section_name)
                        shuffle(section_samples)
                        section_sample_matrices = []
                        section_label_matrices = []
                        section_samples_str = []
                        section_labels_str = []
                        for sample in section_samples:
                            section_sample_matrices.append(
                                sample.get_sample_matrix())
                            section_label_matrices.append(
                                sample.get_label_matrix())
                            section_samples_str.append(sample.get_sample_str())
                            section_labels_str.append(str(sample.get_label()))
                        np.save(path_out_npy_X, section_sample_matrices)
                        np.save(path_out_npy_Y, section_label_matrices)
                        with open(path_out_text_X, 'w') as out_text_samples:
                            string = '\n'.join(section_samples_str)
                            out_text_samples.write(string)
                        with open(path_out_text_Y, 'w') as out_text_labels:
                            string = '\n'.join(section_labels_str)
                            out_text_labels.write(string)
                        self.write_positive_and_negative_text_samples_files(
                            section_labels_str, section_samples_str, out_pos,
                            out_neg)
Exemplo n.º 3
0
def main():
    create_directories()

    # the k-lets counts are preserved during the negative samples generation
    for k in range(1, MAXIMAL_K + 1):
        print "start creating data, k = ", k
        for species_name in H3K27ac_species_names_ordered:
            if "All" in species_name:
                continue
            print "species name: ", species_name
            positive_samples, negative_samples = create_data(k, species_name)
            all_Xs = np.array(positive_samples + negative_samples)
            all_ys = np.array([1] * len(positive_samples) +
                              [0] * len(negative_samples))
            perm = np.random.permutation(len(all_Xs))

            all_Xs_shuffled = all_Xs[perm]
            all_ys_shuffled = all_ys[perm]

            samples = np.array(all_Xs_shuffled)
            labels = np.array(convert_labels_to_one_hot(all_ys_shuffled))

            all_samples = []
            for i in range(len(all_Xs_shuffled)):
                sample_matrix = all_Xs_shuffled[i]
                label = all_ys_shuffled[i]
                label_matrix = labels_map[label]
                sample_object = SampleObject(sample_matrix,
                                             label_matrix,
                                             is_matrix=True)
                all_samples.append(sample_object)

            indices = dict()
            train_start_idx = 0
            train_end_idx = int(math.ceil(len(all_Xs) * train_ratio))
            indices["train"] = (train_start_idx, train_end_idx)
            validation_start_idx = train_end_idx
            validation_end_idx = train_end_idx + int(
                math.ceil(len(all_Xs) * validation_ratio))
            indices["validation"] = (validation_start_idx, validation_end_idx)
            test_start_idx = validation_end_idx
            test_end_idx = validation_end_idx + int(
                math.ceil(len(all_Xs) * test_ratio))
            indices["test"] = (test_start_idx, test_end_idx)

            # save npy files for each species

            path_out_train_X = os.path.join(samples_out_base_dir_for_npy_files,
                                            species_name,
                                            output_k_lets_dirs[k - 1],
                                            'train_X')
            path_out_train_y = os.path.join(samples_out_base_dir_for_npy_files,
                                            species_name,
                                            output_k_lets_dirs[k - 1],
                                            'train_Y')
            path_out_validation_X = os.path.join(
                samples_out_base_dir_for_npy_files, species_name,
                output_k_lets_dirs[k - 1], 'validation_X')
            path_out_validation_y = os.path.join(
                samples_out_base_dir_for_npy_files, species_name,
                output_k_lets_dirs[k - 1], 'validation_Y')
            path_out_test_X = os.path.join(samples_out_base_dir_for_npy_files,
                                           species_name,
                                           output_k_lets_dirs[k - 1], 'test_X')
            path_out_test_y = os.path.join(samples_out_base_dir_for_npy_files,
                                           species_name,
                                           output_k_lets_dirs[k - 1], 'test_Y')

            np.save(path_out_train_X, samples[train_start_idx:train_end_idx])
            np.save(path_out_train_y, labels[train_start_idx:train_end_idx])

            np.save(path_out_validation_X,
                    samples[validation_start_idx:validation_end_idx])
            np.save(path_out_validation_y,
                    labels[validation_start_idx:validation_end_idx])

            np.save(path_out_test_X, samples[test_start_idx:test_end_idx])
            np.save(path_out_test_y, labels[test_start_idx:test_end_idx])

            # write positive and negative text files:
            dir_path = os.path.join(samples_out_base_dir_for_text_files,
                                    species_name)

            all_Xs_text_shuffled = []
            all_Ys_text_shuffled = []
            for sample in all_samples:
                all_Xs_text_shuffled.append(sample.get_sample_str())
                all_Ys_text_shuffled.append(str(sample.get_label()))

            text_samples = np.array(all_Xs_text_shuffled)
            text_labels = np.array(all_Ys_text_shuffled)

            for section in sections:
                print "section: ", section
                path_out_text_X, path_out_text_Y = data_handle.get_path(
                    dir_path, section, k)
                start, end = indices[section]
                with open(path_out_text_X, 'w') as out_text_samples:
                    string = '\n'.join(text_samples[start:end])
                    out_text_samples.write(string)
                with open(path_out_text_Y, 'w') as out_text_labels:
                    string = '\n'.join(text_labels[start:end])
                    out_text_labels.write(string)

    print "End! :)"
Exemplo n.º 4
0
    def shuffle_and_write_samples(self, all_samples, species_name=None):
        if species_name:
            path_out_npy_files_dir = os.path.join(
                self.project.samples_base_dir, species_name)
            path_out_text_samples_dir = os.path.join(
                self.project.text_samples_base_dir, species_name)
        else:
            path_out_npy_files_dir = self.project.samples_base_dir
            path_out_text_samples_dir = self.project.text_samples_base_dir
        if self.project.sigma:
            if not os.path.exists(
                    path_out_npy_files_dir) and not os.path.isdir(
                        path_out_npy_files_dir):
                print "make directory: ", path_out_npy_files_dir
                os.makedirs(path_out_npy_files_dir)
            if not os.path.exists(
                    path_out_text_samples_dir) and not os.path.isdir(
                        path_out_text_samples_dir):
                print "make directory: ", path_out_text_samples_dir
                os.makedirs(path_out_text_samples_dir)

        positive_samples_file_path = os.path.join(path_out_text_samples_dir,
                                                  "positive_samples")
        negative_samples_file_path = os.path.join(path_out_text_samples_dir,
                                                  "negative_samples")

        shuffle(all_samples)  # shuffle all sample objects in place
        all_Xs_matrices_shuffled = []
        all_Ys_matrices_shuffled = []
        all_Xs_text_shuffled = []
        all_Ys_text_shuffled = []
        positive_samples = []
        negative_samples = []
        for sample in all_samples:
            all_Xs_matrices_shuffled.append(sample.get_sample_matrix())
            all_Ys_matrices_shuffled.append(sample.get_label_matrix())
            all_Xs_text_shuffled.append(sample.get_sample_str())
            all_Ys_text_shuffled.append(str(sample.get_label()))
            if sample.get_label() == 1:
                positive_samples.append(sample)
            elif sample.get_label() == 0:
                negative_samples.append(sample)
        # write positive and negative text files:
        print "write positive and negative text files ... "
        with open(positive_samples_file_path, 'w') as pos_out:
            for sample in positive_samples:
                pos_out.write(sample.get_sample_str() + "\n")

        with open(negative_samples_file_path, 'w') as neg_out:
            for sample in negative_samples:
                neg_out.write(sample.get_sample_str() + "\n")

        samples = np.array(all_Xs_matrices_shuffled)
        labels = np.array(all_Ys_matrices_shuffled)
        text_samples = np.array(all_Xs_text_shuffled)
        text_labels = np.array(all_Ys_text_shuffled)
        indices = dict()
        train_start_idx = 0
        train_end_idx = math.ceil(len(samples) * self.section_ratios[0])
        indices["train"] = (train_start_idx, train_end_idx)
        validation_start_idx = train_end_idx
        validation_end_idx = train_end_idx + math.ceil(
            len(samples) * self.section_ratios[1])
        indices["validation"] = (validation_start_idx, validation_end_idx)
        test_start_idx = validation_end_idx
        test_end_idx = validation_end_idx + math.ceil(
            len(samples) * self.section_ratios[2])
        indices["test"] = (test_start_idx, test_end_idx)
        for section in self.sections:
            print "section: ", section
            path_out_text_X, path_out_text_Y = data_handle.get_path(
                path_out_text_samples_dir, section)
            path_out_npy_X, path_out_npy_Y = data_handle.get_path(
                path_out_npy_files_dir, section)
            start = int(indices[section][0])
            end = int(indices[section][1])
            np.save(path_out_npy_X, samples[start:end])
            np.save(path_out_npy_Y, labels[start:end])
            with open(path_out_text_X, 'w') as out_text_samples:
                string = '\n'.join(text_samples[start:end])
                out_text_samples.write(string)
            with open(path_out_text_Y, 'w') as out_text_labels:
                string = '\n'.join(text_labels[start:end])
                out_text_labels.write(string)