Пример #1
0
            def __init__(self):
                """Build the mapper: two 2-dimensional integer embeddings.

                ``embedding_size=2`` passed to the superclass matches the
                width of the two sub-mappers created below.
                """
                super().__init__(embedding_size=2)

                # Two independent embedding tables over 100 distinct integers.
                self.map_a = IntegerModel(distinct_numbers=100, embedding_size=2)
                self.map_b = IntegerModel(distinct_numbers=100, embedding_size=2)
Пример #2
0
    def __init__(self, distinct_numbers=-1, mapped_number_dim=4,use_cuda=None):
        """Embed a number together with its observation frequency.

        The number contributes ``mapped_number_dim`` dimensions, the
        frequency a fixed 3; the concatenated sequence is summarized either
        by a mean or by an RNN, per the module-level ``use_mean_to_map_nwf``
        flag.
        """
        mapped_frequency_dim = 3
        total_dim = mapped_number_dim + mapped_frequency_dim
        super().__init__(embedding_size=total_dim, use_cuda=use_cuda)

        self.map_number = IntegerModel(distinct_numbers=distinct_numbers,
                                       embedding_size=mapped_number_dim,
                                       use_cuda=use_cuda)
        self.map_frequency = FrequencyMapper(use_cuda=use_cuda)
        if use_mean_to_map_nwf:
            # Cheap summary: average the per-element embeddings.
            self.mean_sequence = MeanOfList()
        else:
            # Learned summary: run the sequence through an RNN.
            self.map_sequence = RNNOfList(embedding_size=total_dim,
                                          hidden_size=total_dim,
                                          num_layers=1,
                                          use_cuda=use_cuda)
Пример #3
0
    def test_reduce_padding(self):
        """Smoke-test Reduce with a full input list and with one input missing."""
        embed = IntegerModel(distinct_numbers=100, embedding_size=2)
        reducer = Reduce(input_dims=[2, 2, 2], encoding_output_dim=4)

        # All three declared inputs supplied:
        reducer([embed([1]), embed([2]), embed([3])])
        # One input missing; pad_missing presumably fills the gap — confirm
        # against the Reduce implementation.
        reducer([embed([1]), embed([2])], pad_missing=True)
Пример #4
0
    def test_message_with_reduce(self):
        """Map a batch of dict-like Message instances through BatchOfInstances.

        Each message embeds field 'a' directly and field 'list' through an
        integer embedding followed by a mean, then reduces both parts into a
        3-dimensional encoding per message.
        """
        torch.manual_seed(1212)  # keep the embeddings deterministic

        class Message:
            # Minimal dict-backed message with a 'type' discriminator used
            # by BatchOfInstances to pick the mapper.
            def __init__(self, a, list_of_elements):
                self.dict = {'type': "Message", 'a': a, "list": list_of_elements}

            def __getitem__(self, key):
                return self.dict[key]

        reduce = Reduce([2, 2], 3)

        def map_Message(m):
            # map_a/map_b are bound below; the closure resolves them at call time.
            mapped_a = map_a([m['a']])
            mapped_b = map_b(map_a(m['list']))
            return reduce([mapped_a, mapped_b])

        mappers = {"Message": map_Message}
        map_a = IntegerModel(distinct_numbers=100, embedding_size=2)
        map_b = MeanOfList()

        messages = [Message(12, [1, 5, 4]), Message(1, [2, 4, 1])]
        mapper = BatchOfInstances(mappers=mappers, all_modules=[map_a, map_b])
        out = mapper(messages)
        # assertEquals is a deprecated alias removed in Python 3.12;
        # assertEqual is the supported spelling.
        self.assertEqual(out.size(), (2, 3))
        print(out)
Пример #5
0
    def __init__(self, mapped_count_dim=5, count_dim=64, mapped_base_dim=2, mapped_genotype_index_dim=4,use_cuda=None):
        """Assemble the sub-mappers used to embed one genotype-count record.

        :param mapped_count_dim: embedding width for the strand counts.
        :param count_dim: output width of this mapper (also RNN hidden size).
        :param mapped_base_dim: embedding width for sequence bases.
        :param mapped_genotype_index_dim: embedding width for the goby genotype index.
        :param use_cuda: forwarded to every sub-module.
        """
        super().__init__(count_dim, use_cuda)
        self.map_sequence = MapSequence(hidden_size=count_dim,
                                        mapped_base_dim=mapped_base_dim, use_cuda=use_cuda)
        self.map_gobyGenotypeIndex = IntegerModel(distinct_numbers=100,
                                                  embedding_size=mapped_genotype_index_dim,
                                                  use_cuda=use_cuda)

        self.map_count = IntegerModel(distinct_numbers=100000,
                                      embedding_size=mapped_count_dim, use_cuda=use_cuda)
        self.map_boolean = map_Boolean(use_cuda=use_cuda)

        # One list mapper per optional field family; distinct_numbers bounds
        # the integer range each can embed.
        self.frequency_list_mapper_base_qual = MapNumberWithFrequencyList(distinct_numbers=1000, use_cuda=use_cuda)
        self.frequency_list_mapper_num_var = MapNumberWithFrequencyList(distinct_numbers=1000, use_cuda=use_cuda)
        self.frequency_list_mapper_mapping_qual = MapNumberWithFrequencyList(distinct_numbers=100, use_cuda=use_cuda)
        self.frequency_list_mapper_distance_to = MapNumberWithFrequencyList(distinct_numbers=1000, use_cuda=use_cuda)
        self.frequency_list_mapper_aligned_lengths = MapNumberWithFrequencyList(distinct_numbers=1000, use_cuda=use_cuda)
        self.frequency_list_mapper_read_indices = MapNumberWithFrequencyList(distinct_numbers=1000, use_cuda=use_cuda)

        count_mappers = [self.map_gobyGenotypeIndex,
                         self.map_boolean,  # isIndel
                         self.map_boolean,  # matchesReference
                         self.map_sequence,
                         self.map_count,
                         self.map_count]

        # (field name, mapper) pairs; this order fixes the input layout of
        # reduce_count below.
        self.nf_names_mappers = [('qualityScoresForwardStrand', self.frequency_list_mapper_base_qual),
                                 ('qualityScoresReverseStrand', self.frequency_list_mapper_base_qual),
                                 ('distanceToStartOfRead', self.frequency_list_mapper_distance_to),
                                 ('distanceToEndOfRead', self.frequency_list_mapper_distance_to),  # OK
                                 ('readIndicesReverseStrand', self.frequency_list_mapper_read_indices),  # OK
                                 ('readIndicesForwardStrand', self.frequency_list_mapper_read_indices),  # OK
                                 # 'distancesToReadVariationsForwardStrand', #Wrong
                                 # 'distancesToReadVariationsReverseStrand', #Wrong
                                 ('targetAlignedLengths', self.frequency_list_mapper_aligned_lengths),
                                 ('queryAlignedLengths', self.frequency_list_mapper_aligned_lengths),  # OK
                                 ('numVariationsInReads', self.frequency_list_mapper_num_var),  # OK
                                 ('readMappingQualityForwardStrand', self.frequency_list_mapper_mapping_qual),  # OK
                                 ('readMappingQualityReverseStrand', self.frequency_list_mapper_mapping_qual)  # OK
                                 ]
        # Optional list fields contribute one mapper each, in declaration order.
        count_mappers.extend(mapper for _, mapper in self.nf_names_mappers)

        self.reduce_count = Reduce([mapper.embedding_size for mapper in count_mappers],
                                   encoding_output_dim=count_dim, use_cuda=use_cuda)
        # Batched path: counts and booleans each appear twice (forward/reverse
        # strand, isIndel/matchesReference); sequences twice as well
        # (presumably from/to sequence — confirm against collect_inputs).
        self.reduce_batched = Reduce([self.map_gobyGenotypeIndex.embedding_size,
                                      self.map_count.embedding_size * 2,
                                      self.map_boolean.embedding_size * 2,
                                      self.map_sequence.embedding_size * 2],
                                     encoding_output_dim=count_dim, use_cuda=use_cuda)
Пример #6
0
    def __init__(self, mapped_base_dim=2, hidden_size=64, num_layers=1, bases=('A', 'C', 'T', 'G', '-', 'N'),use_cuda=None):
        """Embed a base sequence: per-base integer embedding, then an RNN summary."""
        super().__init__(embedding_size=hidden_size, use_cuda=use_cuda)
        self.map_sequence = RNNOfList(embedding_size=mapped_base_dim,
                                      hidden_size=hidden_size,
                                      num_layers=num_layers,
                                      use_cuda=use_cuda)
        # Index each base by its first character (later duplicates of the
        # same first character overwrite earlier ones, as before).
        self.base_to_index = {symbol[0]: index
                              for index, symbol in enumerate(bases)}

        self.map_bases = IntegerModel(distinct_numbers=len(self.base_to_index),
                                      embedding_size=mapped_base_dim,
                                      use_cuda=use_cuda)
Пример #7
0
class MapNumberWithFrequencyList(StructuredEmbedding):
    """Embed a list of {'number', 'frequency'} records into one vector.

    Numbers and frequencies are embedded separately and concatenated; the
    resulting sequence is summarized by a mean or an RNN depending on the
    module-level ``use_mean_to_map_nwf`` flag.
    """

    def __init__(self, distinct_numbers=-1, mapped_number_dim=4,use_cuda=None):
        """:param distinct_numbers: vocabulary size for the number embedding.
        :param mapped_number_dim: embedding width for the number part.
        :param use_cuda: forwarded to sub-modules.
        """
        mapped_frequency_dim = 3
        super().__init__(embedding_size=mapped_number_dim + mapped_frequency_dim, use_cuda=use_cuda)

        output_dim = mapped_number_dim + mapped_frequency_dim
        self.map_number = IntegerModel(distinct_numbers=distinct_numbers,
                                       embedding_size=mapped_number_dim, use_cuda=use_cuda)
        self.map_frequency = FrequencyMapper(use_cuda=use_cuda)
        if use_mean_to_map_nwf:
            self.mean_sequence = MeanOfList()
        else:
            self.map_sequence = RNNOfList(embedding_size=output_dim,
                                          hidden_size=output_dim,
                                          num_layers=1, use_cuda=use_cuda)

    def forward(self, nwf_list, tensor_cache=NoCache(), cuda=None, nf_name="unknown"):
        """Map ``nwf_list`` to a tensor; returns zeros(1, embedding_size)
        when the list is empty.
        """
        if len(nwf_list) > 0:
            mapped_frequencies = torch.cat([
                self.map_number([nwf['number'] for nwf in nwf_list], tensor_cache, cuda),
                self.map_frequency([nwf['frequency'] for nwf in nwf_list], cuda)], dim=1)

            if use_mean_to_map_nwf:
                return self.mean_sequence(mapped_frequencies, cuda=cuda)
            else:
                return self.map_sequence(mapped_frequencies, cuda=cuda)
        else:
            variable = Variable(torch.zeros(1, self.embedding_size), requires_grad=True)
            if cuda:
                # `async` became a reserved keyword in Python 3.7 (so
                # cuda(async=True) is a SyntaxError); the parameter is now
                # called non_blocking.
                variable = variable.cuda(non_blocking=True)
            return variable

    def collect_inputs(self, nwf_list, phase=0, tensor_cache=NoCache(), cuda=None, batcher=None):
        """Phase 0: hand numbers and frequencies to the batcher and record
        their indices on the message.

        NOTE(review): here ``nwf_list`` is used as a dict-like message
        ('indices', 'number', 'frequency' keys), unlike forward() where it
        is a list of records — confirm with callers.
        """
        if phase == 0:
            # the following tensors are batched:
            nwf_list['indices'] = {}
            store_indices_in_message(mapper=self.map_number, message=nwf_list,
                                     indices=self.map_number.collect_inputs(
                                         values=nwf_list['number'], phase=phase, cuda=cuda, batcher=batcher))

            store_indices_in_message(mapper=self.map_frequency, message=nwf_list,
                                     indices=self.map_frequency.collect_inputs(
                                         values=nwf_list['frequency'], phase=phase, cuda=cuda, batcher=batcher))
Пример #8
0
    def test_integer_batching(self):
        """Batched forwards through a Batcher match per-example forwards."""
        mapper = IntegerModel(100, 2)
        batcher = Batcher()
        no_cache = NoCache()
        values = [12, 3, 2]

        example_indices = []
        for value in values:
            example_indices.extend(batcher.collect_inputs(mapper, [value]))

        print(batcher.get_batched_input(mapper))
        print(batcher.forward_batch(mapper))
        for example_index in example_indices:
            print(batcher.get_forward_for_example(mapper, example_index))

        # Each example's slice of the batched forward must equal a direct
        # (uncached) forward of the same value.
        for position, value in enumerate(values):
            self.assertEqual(str(mapper([value], no_cache).data),
                             str(batcher.get_forward_for_example(mapper, position).data))
Пример #9
0
class MapCountInfo(StructuredEmbedding):
    """Embed one genotype-count record into a ``count_dim``-sized vector.

    Fixed fields (goby genotype index, isIndel/matchesReference booleans,
    the 'to' sequence, and the two strand counts) plus a set of optional
    number-with-frequency list fields are each embedded by a dedicated
    sub-mapper and reduced together.
    """

    def __init__(self, mapped_count_dim=5, count_dim=64, mapped_base_dim=2, mapped_genotype_index_dim=4,use_cuda=None):
        """Build all sub-mappers and the two Reduce heads.

        :param mapped_count_dim: embedding width for the strand counts.
        :param count_dim: output width of this mapper.
        :param mapped_base_dim: embedding width for sequence bases.
        :param mapped_genotype_index_dim: embedding width for the genotype index.
        :param use_cuda: forwarded to every sub-module.
        """
        super().__init__(count_dim, use_cuda)
        self.map_sequence = MapSequence(hidden_size=count_dim,
                                        mapped_base_dim=mapped_base_dim, use_cuda=use_cuda)
        self.map_gobyGenotypeIndex = IntegerModel(distinct_numbers=100,
                                                  embedding_size=mapped_genotype_index_dim,
                                                  use_cuda=use_cuda)

        self.map_count = IntegerModel(distinct_numbers=100000,
                                      embedding_size=mapped_count_dim, use_cuda=use_cuda)
        self.map_boolean = map_Boolean(use_cuda=use_cuda)

        # One list mapper per optional field family; distinct_numbers bounds
        # the integer range each can embed.
        self.frequency_list_mapper_base_qual = MapNumberWithFrequencyList(distinct_numbers=1000, use_cuda=use_cuda)
        self.frequency_list_mapper_num_var = MapNumberWithFrequencyList(distinct_numbers=1000, use_cuda=use_cuda)
        self.frequency_list_mapper_mapping_qual = MapNumberWithFrequencyList(distinct_numbers=100, use_cuda=use_cuda)
        self.frequency_list_mapper_distance_to = MapNumberWithFrequencyList(distinct_numbers=1000, use_cuda=use_cuda)
        self.frequency_list_mapper_aligned_lengths = MapNumberWithFrequencyList(distinct_numbers=1000, use_cuda=use_cuda)
        self.frequency_list_mapper_read_indices = MapNumberWithFrequencyList(distinct_numbers=1000, use_cuda=use_cuda)

        count_mappers = [self.map_gobyGenotypeIndex,
                         self.map_boolean,  # isIndel
                         self.map_boolean,  # matchesReference
                         self.map_sequence,
                         self.map_count,
                         self.map_count]

        # (field name, mapper) pairs; this order fixes the input layout of
        # reduce_count and must match the loop in forward().
        self.nf_names_mappers = [('qualityScoresForwardStrand', self.frequency_list_mapper_base_qual),
                                 ('qualityScoresReverseStrand', self.frequency_list_mapper_base_qual),
                                 ('distanceToStartOfRead', self.frequency_list_mapper_distance_to),
                                 ('distanceToEndOfRead', self.frequency_list_mapper_distance_to),  # OK
                                 ('readIndicesReverseStrand', self.frequency_list_mapper_read_indices),  # OK
                                 ('readIndicesForwardStrand', self.frequency_list_mapper_read_indices),  # OK
                                 # 'distancesToReadVariationsForwardStrand', #Wrong
                                 # 'distancesToReadVariationsReverseStrand', #Wrong
                                 ('targetAlignedLengths', self.frequency_list_mapper_aligned_lengths),
                                 ('queryAlignedLengths', self.frequency_list_mapper_aligned_lengths),  # OK
                                 ('numVariationsInReads', self.frequency_list_mapper_num_var),  # OK
                                 ('readMappingQualityForwardStrand', self.frequency_list_mapper_mapping_qual),  # OK
                                 ('readMappingQualityReverseStrand', self.frequency_list_mapper_mapping_qual)  # OK
                                 ]
        # Optional list fields contribute one mapper each, in declaration order.
        count_mappers.extend(mapper for _, mapper in self.nf_names_mappers)

        self.reduce_count = Reduce([mapper.embedding_size for mapper in count_mappers],
                                   encoding_output_dim=count_dim, use_cuda=use_cuda)
        # Batched path: counts (forward/reverse strand), booleans
        # (isIndel/matchesReference) and sequences (from/to, per
        # collect_inputs) each appear twice.
        self.reduce_batched = Reduce([self.map_gobyGenotypeIndex.embedding_size,
                                      self.map_count.embedding_size * 2,
                                      self.map_boolean.embedding_size * 2,
                                      self.map_sequence.embedding_size * 2],
                                     encoding_output_dim=count_dim, use_cuda=use_cuda)

    def forward(self, c, tensor_cache, cuda=None):
        """Embed count record ``c`` (dict-like) and reduce to ``count_dim``."""
        mapped_gobyGenotypeIndex = self.map_gobyGenotypeIndex([c['gobyGenotypeIndex']], tensor_cache=tensor_cache,
                                                              cuda=cuda)
        # Do not map isCalled, it is a field that contains the truth and is used to calculate the label.

        mapped_isIndel = self.map_boolean(c['isIndel'], tensor_cache=tensor_cache, cuda=cuda)
        mapped_matchesReference = self.map_boolean(c['matchesReference'], tensor_cache=tensor_cache, cuda=cuda)

        # NB: fromSequence was mapped at the level of BaseInformation.
        mapped_to = self.map_sequence(c['toSequence'], tensor_cache=tensor_cache, cuda=cuda)
        mapped_genotypeCountForwardStrand = self.map_count([c['genotypeCountForwardStrand']], tensor_cache=tensor_cache,
                                                           cuda=cuda)
        mapped_genotypeCountReverseStrand = self.map_count([c['genotypeCountReverseStrand']], tensor_cache=tensor_cache,
                                                           cuda=cuda)

        mapped = [mapped_gobyGenotypeIndex,
                  mapped_isIndel,
                  mapped_matchesReference,
                  mapped_to,
                  mapped_genotypeCountForwardStrand,
                  mapped_genotypeCountReverseStrand]

        for nf_name, mapper in self.nf_names_mappers:
            if nf_name in c.keys():
                mapped += [mapper(c[nf_name],
                                  tensor_cache=tensor_cache,
                                  cuda=cuda, nf_name=nf_name)]
            else:
                # Field absent: substitute a zero vector of the mapper's width.
                variable = Variable(torch.zeros(1, mapper.embedding_size), requires_grad=True)
                if cuda:
                    # `async` became a reserved keyword in Python 3.7 (so
                    # cuda(async=True) is a SyntaxError); the parameter is
                    # now called non_blocking.
                    variable = variable.cuda(non_blocking=True)
                mapped += [variable]
        return self.reduce_count(mapped, cuda)

    def cat_inputs(self, mapper, list_of_values, tensor_cache=NoCache(), phase=0, cuda=False, direct_forward=False):
        """Map each value with ``mapper`` (direct forward or collect_inputs)
        and concatenate the results along dim 1.
        """
        mapper_id = id(mapper)
        results = {mapper_id: []}
        for value in list_of_values:
            if direct_forward:
                mapped = mapper(value, tensor_cache=tensor_cache, cuda=cuda)
            else:
                mapped = mapper.collect_inputs(value, tensor_cache=tensor_cache, phase=phase, cuda=cuda)

            results[mapper_id] += [mapped]

        return torch.cat(results[mapper_id], dim=1)

    def collect_inputs(self, c, phase=0, tensor_cache=NoCache(), cuda=None, batcher=None):
        """Phase 0: feed batchable fields to the batcher (recording indices
        on the message) and compute the non-batchable sequence forwards
        eagerly. Phase 1: assemble per-example forwards and store the reduced
        result as this mapper's input.
        """
        if phase == 0:
            c['indices'] = {}
            # the following tensors are batched:
            store_indices_in_message(mapper=self.map_gobyGenotypeIndex, message=c,
                                     indices=self.map_gobyGenotypeIndex.collect_inputs(
                                         values=[c['gobyGenotypeIndex']], phase=phase, cuda=cuda, batcher=batcher))

            store_indices_in_message(mapper=self.map_count, message=c, indices=self.map_count.collect_inputs(
                values=[c['genotypeCountForwardStrand'], c['genotypeCountReverseStrand']], phase=phase, cuda=cuda,
                batcher=batcher))

            store_indices_in_message(mapper=self.map_boolean, message=c, indices=self.map_boolean.collect_inputs(
                values=[c['isIndel'], c['matchesReference']],
                tensor_cache=tensor_cache, phase=phase,
                cuda=cuda, batcher=batcher))

            c['mapped-not-batched'] = {}
            # the following tensors are not batched, but computed once per instance, right here with direct_forward=True:
            c['mapped-not-batched'][id(self.map_sequence)] = self.cat_inputs(self.map_sequence,
                                                                             [c['fromSequence'], c['toSequence']],
                                                                             tensor_cache=tensor_cache, phase=phase,
                                                                             cuda=cuda,
                                                                             direct_forward=True)
            return []

        if phase == 1:
            mapped_goby_genotype_indices = batcher.get_forward_for_example(mapper=self.map_gobyGenotypeIndex,
                                                                           message=c).view(1, -1)

            mapped_counts = batcher.get_forward_for_example(mapper=self.map_count, message=c).view(1, -1)

            mapped_booleans = batcher.get_forward_for_example(mapper=self.map_boolean, message=c).view(1, -1)

            # mapped_sequences are not currently batchable, so we get the input from the prior phase:
            mapped_sequences = c['mapped-not-batched'][id(self.map_sequence)].view(1, -1)

            all_mapped = [mapped_goby_genotype_indices, mapped_counts, mapped_booleans, mapped_sequences]

            return batcher.store_inputs(mapper=self, inputs=self.reduce_batched(all_mapped))

    def forward_batch(self, batcher, phase=0):
        """Phase 0: run the batched forwards of the batchable sub-mappers
        (results are kept in the batcher). Phase 1: return the batched input
        previously stored for this mapper.
        """
        if phase == 0:
            # calculate the forward on the batches:
            self.map_gobyGenotypeIndex.forward_batch(batcher)
            self.map_count.forward_batch(batcher)
            self.map_boolean.forward_batch(batcher)
            return None
        if phase == 1:
            batched_input = batcher.get_batched_input(mapper=self)
            batcher.store_batched_result(self, batched_result=batched_input)
            return batched_input
Пример #10
0
 def test_int_single(self):
     """Print the embedding of a small batch of integers."""
     model = IntegerModel(100, 2)
     embedded = model([12, 3, 2])
     print(embedded)
Пример #11
0
 def test_int_rnn(self):
     """Feed integer embeddings through an RNN-over-list and print the result."""
     torch.manual_seed(1212)  # keep the printed values reproducible
     embed = IntegerModel(100, 2)
     rnn = RNNOfList(embedding_size=2, hidden_size=4, num_layers=1)
     print(rnn(embed([12, 3, 2])))
Пример #12
0
 def test_int_average(self):
     """Average the embeddings of a list of integers and print the result."""
     embed = IntegerModel(100, 2)
     averager = MeanOfList()
     print(averager(embed([12, 3, 2])))
Пример #13
0
        class MapMessage(StructuredEmbedding):
            def __init__(self):
                """Create the two 2-dimensional integer embeddings for
                fields 'a' and 'b'; embedding_size=2 matches their width."""
                super().__init__(embedding_size=2)

                self.map_a = IntegerModel(distinct_numbers=100, embedding_size=2)
                self.map_b = IntegerModel(distinct_numbers=100, embedding_size=2)

            def forward(self, message, tensor_cache, cuda=None):
                """Return the sum of the embeddings of fields 'a' and 'b'."""
                mapped_a = self.map_a([message['a']], tensor_cache, cuda)
                mapped_b = self.map_b([message['b']], tensor_cache, cuda)
                return mapped_a + mapped_b

            def collect_inputs(self,
                               message,
                               phase=0,
                               tensor_cache=NoCache(),
                               cuda=None,
                               batcher=None):
                """Phase 0: hand each field's value to the batcher and record
                its indices on the message. Phase 1: fetch both batched
                forwards and store their sum as this mapper's input."""
                assert isinstance(message, Message)

                if phase == 0:
                    message['indices'] = {}

                    def record(sub_mapper, field):
                        # Collect this field's value into the batch and
                        # remember where it landed on the message.
                        store_indices_in_message(
                            mapper=sub_mapper,
                            message=message,
                            indices=sub_mapper.collect_inputs(
                                values=[message[field]],
                                phase=phase,
                                tensor_cache=tensor_cache,
                                cuda=cuda,
                                batcher=batcher))

                    record(self.map_a, 'a')
                    record(self.map_b, 'b')
                    # no indices yet for this mapper.
                    return []

                if phase == 1:
                    my_a = batcher.get_forward_for_example(
                        mapper=self.map_a,
                        example_indices=get_indices_in_message(
                            mapper=self.map_a, message=message))
                    my_b = batcher.get_forward_for_example(
                        mapper=self.map_b,
                        example_indices=get_indices_in_message(
                            mapper=self.map_b, message=message))
                    return batcher.store_inputs(mapper=self,
                                                inputs=(my_a + my_b))

            def forward_batch(self, batcher, phase=0):
                """Phase 0: run the batched forward of both sub-mappers
                (results are kept inside the batcher). Phase 1: return the
                batched input previously stored for this mapper.

                Note: the original also created an unused local dict
                (``my_input = {}``); it has been removed.
                """
                if phase == 0:
                    # do forward for batches, results are kept in batcher.
                    batched_a = batcher.forward_batch(mapper=self.map_a)
                    batched_b = batcher.forward_batch(mapper=self.map_b)
                    return (batched_a, batched_b)

                if phase == 1:
                    return batcher.get_batched_input(mapper=self)