Example #1
File: gintervals.py  Project: mmosmond/cvtk
import numpy as np
from ncls import NCLS

# merge_overlaps and Interval are helpers defined elsewhere in the cvtk project


class NestedContainmentList(object):
    def __init__(self, starts=None, ends=None, indices=None, reduce=False):
        self.ncls = None
        if starts is not None and indices is not None:
            if ends is None:
                ends = [s + 1 for s in starts]
            if reduce:
                starts, ends, indices = list(
                    zip(*merge_overlaps(zip(starts, ends, indices))))
            starts = np.array(starts, dtype='i8')
            ends = np.array(ends, dtype='i8')
            indices = np.array(indices, dtype='i8')
            self.ncls = NCLS(starts, ends, indices)

    def find_overlaps(self, start, end):
        if self.ncls is None:
            # we allow for empty objects, in which case nothing overlaps
            # use case: non-matching seqids
            return []
        overlaps = []
        for overlap in self.ncls.find_overlap(start, end):
            overlaps.append(Interval(*overlap))
        return overlaps

    @staticmethod
    def from_intervals(intervals, reduce=False):
        starts, ends, indices = zip(*intervals)
        starts = np.array(starts, dtype='i8')
        ends = np.array(ends, dtype='i8')
        indices = np.array(indices, dtype='i8')
        obj = NestedContainmentList(starts, ends, indices, reduce=reduce)
        return obj
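
A quick usage sketch of the wrapper above (a minimal sketch, assuming everything lives in one module; `Interval` is stood in by a namedtuple here, since the project's own helper is not shown):

from collections import namedtuple

# hypothetical stand-in for the project's Interval helper
Interval = namedtuple('Interval', ['start', 'end', 'index'])

ncl = NestedContainmentList.from_intervals([(1, 5, 0), (8, 12, 1)])
for iv in ncl.find_overlaps(4, 10):
    print(iv)  # Interval(start=1, end=5, index=0), then Interval(start=8, end=12, index=1)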
Example #2
def test_ncls():
    # starts, ends and ids are arrays defined at module level in the test file
    # ids = starts

    print(starts, ends, ids)

    ncls = NCLS(starts, ends, ids)
    print(ncls)
    print(ncls.intervals())

    assert list(ncls.find_overlap(0, 2)) == []
    assert list(ncls.find_overlap(0, 2_147_483_647)) == [(5, 6, 0), (2_147_483_645, 2_147_483_646, 3)]

    r, l = ncls.all_overlaps_both(starts, ends, ids)
    assert list(r) == [0, 3]
    assert list(l) == [0, 3]
Example #3
def test_ncls():
    # starts, ends and ids are arrays defined at module level in the test file
    # ids = starts

    print(starts, ends, ids)

    ncls = NCLS(starts, ends, ids)
    print(ncls)
    print(ncls.intervals())

    assert list(ncls.find_overlap(0, 2)) == []
    print("aaa", list(ncls.find_overlap(9_223_372_036_854_775_805, 9_223_372_036_854_775_806)))
    assert list(ncls.find_overlap(0, 9_223_372_036_854_775_806)) == [(5, 6, 2147483647), (9223372036854775805, 9223372036854775807, 3)]

    r, l = ncls.all_overlaps_both(starts, ends, ids)
    assert list(r) == [2147483647, 3]
    assert list(l) == [2147483647, 3]
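
Both tests above rely on module-level arrays this excerpt does not show. A self-contained sketch of the same calls, with illustrative values:

import numpy as np
from ncls import NCLS

starts = np.array([5, 2_000], dtype=np.int64)
ends = np.array([6, 3_000], dtype=np.int64)
ids = np.array([0, 1], dtype=np.int64)

ncls = NCLS(starts, ends, ids)

# find_overlap yields one (start, end, id) tuple per stored interval
# that overlaps the query range
assert list(ncls.find_overlap(0, 10)) == [(5, 6, 0)]

# all_overlaps_both returns two aligned id arrays describing the
# overlapping (query, stored) pairs
r, l = ncls.all_overlaps_both(starts, ends, ids)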
Example #4
    def filter_by_human_annotations(self, article, annotations):
        # drop model-produced annotations that overlap an existing human
        # annotation on the article
        ncls = NCLS(*get_intervals(article['annotations']))
        new_annotations = []
        num_filtered = 0
        for annotation in annotations:
            entity_start, entity_end = get_start_end(annotation)
            matched_human_annotation = list(
                ncls.find_overlap(entity_start, entity_end))
            if len(matched_human_annotation) == 0:
                new_annotations.append(annotation)
            else:
                human_annotation = article['annotations'][
                    matched_human_annotation[0][2]]
                human_annotation_start, human_annotation_end = get_start_end(
                    human_annotation)
                assert intersect(human_annotation_start, human_annotation_end,
                                 entity_start, entity_end)
                num_filtered += 1
        assert len(new_annotations) + num_filtered == len(annotations)
        return new_annotations, num_filtered
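
The get_intervals, get_start_end, and intersect helpers are project-specific and not shown. A plausible reading of the first two, purely for illustration (the project's actual field names may differ):

import numpy as np

def get_start_end(annotation):
    # hypothetical: character offsets stored on the annotation dict
    return annotation['start'], annotation['end']

def get_intervals(annotations):
    # hypothetical: build the three int64 arrays NCLS expects, using each
    # annotation's position in the list as its id
    starts, ends = zip(*(get_start_end(a) for a in annotations))
    return (np.array(starts, dtype=np.int64),
            np.array(ends, dtype=np.int64),
            np.arange(len(annotations), dtype=np.int64))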
Example #5
import random
from typing import Tuple

import numpy as np
from ncls import NCLS

# Node is the base class defined elsewhere in the project


class MultiNode(Node):
    def __init__(self, node_type, node_id, nodes_list, is_robot=False):
        super(MultiNode, self).__init__(node_type,
                                        node_id,
                                        data=None,
                                        is_robot=is_robot)
        self.nodes_list = nodes_list
        for node in self.nodes_list:
            node.is_robot = is_robot

        self.first_timestep = min(node.first_timestep
                                  for node in self.nodes_list)
        self._last_timestep = max(node.last_timestep
                                  for node in self.nodes_list)

        starts = np.array([node.first_timestep for node in self.nodes_list],
                          dtype=np.int64)
        ends = np.array([node.last_timestep for node in self.nodes_list],
                        dtype=np.int64)
        ids = np.arange(len(self.nodes_list), dtype=np.int64)
        self.interval_tree = NCLS(starts, ends, ids)

    @staticmethod
    def find_non_overlapping_nodes(nodes_list, min_timesteps=1) -> list:
        """
        Greedily finds a set of non-overlapping nodes in the provided scene.

        :return: A list of non-overlapping nodes.
        """
        non_overlapping_nodes = list()
        nodes = sorted(nodes_list, key=lambda n: n.last_timestep)
        current_time = 0
        for node in nodes:
            if node.first_timestep >= current_time and node.timesteps >= min_timesteps:
                # Include the node
                non_overlapping_nodes.append(node)
                current_time = node.last_timestep

        return non_overlapping_nodes

    def get_node_at_timesteps(self, scene_ts) -> Node:
        possible_node_ranges = list(
            self.interval_tree.find_overlap(scene_ts[0], scene_ts[1] + 1))
        if not possible_node_ranges:
            return Node(node_type=self.type,
                        node_id='EMPTY',
                        data=self.nodes_list[0].data * np.nan,
                        is_robot=self.is_robot)

        node_idx = random.choice(possible_node_ranges)[2]
        return self.nodes_list[node_idx]

    def scene_ts_to_node_ts(self, scene_ts) -> Tuple[Node, np.ndarray, int, int]:
        """
        Transforms timestamps from the scene into the timeframe of the node data.

        :param scene_ts: Scene timesteps.
        :return: node: Randomly chosen node covering the queried range. ts: Transformed timesteps.
                paddingl: Number of timesteps at the start of the scene range for which the node
                has no data. paddingu: Number of timesteps at the end of the scene range for which
                the node has no data.
        """
        possible_node_ranges = list(
            self.interval_tree.find_overlap(scene_ts[0], scene_ts[1] + 1))
        if not possible_node_ranges:
            return None, None, None, None

        node_idx = random.choice(possible_node_ranges)[2]
        node = self.nodes_list[node_idx]

        paddingl = (node.first_timestep - scene_ts[0]).clip(0)
        paddingu = (scene_ts[1] - node.last_timestep).clip(0)
        ts = np.array(scene_ts).clip(
            min=node.first_timestep,
            max=node.last_timestep) - node.first_timestep
        return node, ts, paddingl, paddingu

    def get(self, tr_scene, state, padding=np.nan) -> np.ndarray:
        if tr_scene.size == 1:
            tr_scene = np.array([tr_scene, tr_scene])
        length = tr_scene[1] - tr_scene[0] + 1  # tr is inclusive

        node, tr, paddingl, paddingu = self.scene_ts_to_node_ts(tr_scene)
        if node is None:
            state_length = sum(
                [len(entity_dims) for entity_dims in state.values()])
            return np.full((length, state_length), fill_value=padding)

        data_array = node.data[tr[0]:tr[1] + 1, state]
        padded_data_array = np.full((length, data_array.shape[1]),
                                    fill_value=padding)
        padded_data_array[paddingl:length - paddingu] = data_array
        return padded_data_array

    def get_all(self, tr_scene, state, padding=np.nan) -> np.ndarray:
        # Assumption here is that the user is asking for all of the data in this MultiNode and to return it within a
        # full scene-sized output array.
        assert tr_scene.size == 2 and tr_scene[
            0] == 0 and self.last_timestep <= tr_scene[1]
        length = tr_scene[1] - tr_scene[0] + 1  # tr is inclusive
        state_length = sum(
            [len(entity_dims) for entity_dims in state.values()])
        padded_data_array = np.full((length, state_length), fill_value=padding)
        for node in self.nodes_list:
            padded_data_array[node.first_timestep:node.last_timestep +
                              1] = node.data[:, state]

        return padded_data_array

    def history_points_at(self, ts) -> int:
        """
        Number of history points in the trajectory before `ts` (the timestep itself is excluded).

        :param ts: Scene timestep at which the number of history points is queried.
        :return: Number of history timesteps.
        """
        node_idx = next(self.interval_tree.find_overlap(ts, ts + 1))[2]
        node = self.nodes_list[node_idx]
        return ts - node.first_timestep

    @property
    def timesteps(self) -> int:
        """
        Number of available timesteps for node.

        :return: Number of available timesteps.
        """
        return self._last_timestep - self.first_timestep + 1
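
The selection in find_non_overlapping_nodes above is the classic greedy activity-selection strategy: sort by end time, then keep an interval whenever it starts at or after the end of the last kept one. A standalone sketch with (first, last) tuples standing in for nodes:

def select_non_overlapping(intervals, min_len=1):
    # intervals: (first_timestep, last_timestep) pairs with inclusive bounds
    kept, current = [], 0
    for first, last in sorted(intervals, key=lambda iv: iv[1]):
        if first >= current and (last - first + 1) >= min_len:
            kept.append((first, last))
            current = last
    return kept

print(select_non_overlapping([(0, 4), (2, 6), (5, 9)]))  # [(0, 4), (5, 9)]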
Example #6
# Benchmark comparing interval-overlap libraries on the same queries.
# starts1/ends1/ids1/values1 and starts2/ends2/ids2 are int64 NumPy arrays
# prepared outside this excerpt.
from ailist import AIList
from ncls import NCLS
import pandas as pd
import quicksect

# Test AIList
i = AIList()

i.from_array(starts1, ends1, ids1, values1)
i.construct()

ai_res = i.intersect_from_array(starts2, ends2, ids2)

i.intersect(starts2[50], ends2[50])

# Test NCLS
n = NCLS(starts1, ends1, ids1)

n_res = n.all_overlaps_both(starts2, ends2, ids2)

list(n.find_overlap(starts2[50], ends2[50]))

# Test pandas
p = pd.IntervalIndex.from_tuples(list(zip(starts1, ends1)))

p.overlaps(pd.Interval(starts2[50], ends2[50]))

# Test quicksect
b = quicksect.IntervalTree()
for idx in range(len(starts1)):  # avoid clobbering the AIList bound to `i`
    b.add(starts1[idx], ends1[idx])

b.search(starts2[50], ends2[50])
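
The input arrays used above are not defined in this excerpt; a plausible setup, with sizes and value ranges chosen arbitrarily:

import numpy as np

size = int(1e6)
starts1 = np.sort(np.random.randint(0, int(1e8), size)).astype(np.int64)
ends1 = starts1 + 100
ids1 = np.arange(size, dtype=np.int64)
values1 = np.ones(size)

starts2 = np.sort(np.random.randint(0, int(1e8), size)).astype(np.int64)
ends2 = starts2 + 100
ids2 = np.arange(size, dtype=np.int64)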

Example #7
File: test_ncls.py  Project: flywind2/krait
from ncls import NCLS
import numpy as np

starts = []
ends = []
ids = []
with open('data.txt') as fh:
    for line in fh:
        cols = line.strip().split()
        starts.append(int(cols[0]))
        ends.append(int(cols[1]))
        ids.append(int(cols[2]))

        if int(cols[2]) > 70000:
            break

# np.long has been removed from NumPy; NCLS expects int64 arrays
starts = np.array(starts, dtype=np.int64)
ends = np.array(ends, dtype=np.int64)
ids = np.array(ids, dtype=np.int64)

ncls = NCLS(starts, ends, ids)

for i in ncls.find_overlap(76623690, 76624000):
    print(i)
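
The parsing loop above implies that data.txt holds three whitespace-separated integer columns per line: start, end, id. Illustrative contents (values invented for the example):

76623600 76624100 1
76624200 76624500 2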
Example #8

from ncls import NCLS

import pickle
import pandas as pd
import numpy as np

starts = np.random.randint(0, int(1e8), int(1e7), dtype=np.int64)  # NCLS expects int64
ends = starts + 100
ids = starts

ncls = NCLS(starts, ends, ids)

for i in ncls.find_overlap(0, 2):
    print(i)

# round-trip the NCLS through pickle
with open("test.pckl", "wb") as f:
    pickle.dump(ncls, f)

with open("test.pckl", "rb") as f:
    ncls2 = pickle.load(f)

for i in ncls2.find_overlap(0, 2):
    print(i)
Example #9
    def __call__(self, path):
        global vocab
        global entities
        num_annotations, num_sentences, num_documents = 0, 0, 0
        total_length = 0
        num_filtered_xao = 0
        num_filtered_by_candidate_set, num_filtered_by_human_annotations, num_filtered_by_self_overlaps = 0, 0, 0
        num_filtered_by_crossing_sentence_boundaries, num_filtered_solo_annotation_in_sentence = 0, 0
        num_filtered_by_entity_vocab = 0

        empty_line_tensor = vocab.encode_line(line='',
                                              append_eos=self.append_eos)
        assert len(empty_line_tensor) == int(self.append_eos)

        if self.entity_vocab is None:
            annotation_entities = Counter()
        else:
            output_prefix = self.generate_tmp_filename()
            dataset_builder = indexed_dataset.make_builder(
                output_prefix + '.text.bin',
                impl=self.dataset_impl,
                vocab_size=len(vocab),
            )
            annotations_list = list()

        with codecs.open(path, 'r', 'utf8') as f:
            for line in f:
                article = json.loads(line[:-1])
                annotations = article['el']
                article['annotations'], _num_filtered_xao = self.fix_annotations(
                    article['annotations'])
                num_filtered_xao += _num_filtered_xao
                annotations, _num_filtered_xao = self.fix_annotations(
                    annotations)
                num_filtered_xao += _num_filtered_xao
                annotations, _num_filtered_by_candidate_set = self.filter_by_candidate_set(
                    article, annotations)
                annotations, _num_filtered_by_human_annotations = self.filter_by_human_annotations(
                    article, annotations)
                annotations, _num_filtered_by_self_overlaps = self.filter_by_self_overlaps(
                    annotations)
                annotations = article['annotations'] + annotations
                if self.entity_vocab is not None:
                    annotations, _num_filtered_by_entity_vocab = self.filter_by_entity_vocab(
                        annotations)
                    num_filtered_by_entity_vocab += _num_filtered_by_entity_vocab
                num_filtered_by_candidate_set += _num_filtered_by_candidate_set
                num_filtered_by_human_annotations += _num_filtered_by_human_annotations
                num_filtered_by_self_overlaps += _num_filtered_by_self_overlaps

                ncls = NCLS(*get_intervals(annotations))
                text = article['text'].replace(u'\xa0', u' ')
                offset = 0

                for sentence, offset in self.split_into_sentences(text):
                    sentence_begin = offset
                    sentence_end = offset + len(sentence)
                    assert sentence == text[sentence_begin:sentence_end]

                    annotations_per_sentence = []
                    for annotation_id in ncls.find_overlap(
                            sentence_begin, sentence_end):
                        annotation = annotations[annotation_id[2]]
                        start, end = get_start_end(annotation)
                        if sentence_begin <= start and end <= sentence_end:
                            annotations_per_sentence.append(annotation)
                        else:
                            num_filtered_by_crossing_sentence_boundaries += 1
                    num_unique_entities = len(
                        set([
                            annotation['uri']
                            for annotation in annotations_per_sentence
                        ]))
                    if num_unique_entities < self.min_entities_per_sentence:
                        num_filtered_solo_annotation_in_sentence += 1
                        continue
                    num_annotations += len(annotations_per_sentence)

                    if self.entity_vocab is None:
                        annotation_entities.update([
                            annotation['uri']
                            for annotation in annotations_per_sentence
                        ])
                    else:
                        annotations_per_sentence = self.set_local_offsets(
                            offset, annotations_per_sentence)
                        fixed_sentence, annotations_per_sentence = self.strip_whitespaces(
                            sentence, annotations_per_sentence)
                        fixed_sentence, annotations_per_sentence = self.strip_double_whitespaces(
                            fixed_sentence, annotations_per_sentence)
                        fixed_sentence, annotations_per_sentence = self.add_margin_to_annotations(
                            fixed_sentence, annotations_per_sentence)
                        annotations_per_sentence = self.get_word_based_offsets(
                            fixed_sentence, annotations_per_sentence)
                        ids, annotations_per_sentence = self.apply_gt2_bpe(
                            fixed_sentence, annotations_per_sentence)

                        ids_tensor = vocab.encode_line(
                            line=' '.join(ids), append_eos=self.append_eos)
                        assert len(ids_tensor) == len(ids) + int(
                            self.append_eos)
                        dataset_builder.add_item(ids_tensor)
                        annotations_list.extend([[
                            x['start_word'] + total_length,
                            x['end_word'] + total_length, num_sentences,
                            num_documents,
                            int(entities[x['uri']])
                        ] for x in annotations_per_sentence])
                        total_length += len(ids_tensor)
                    num_sentences += 1

                if self.entity_vocab is not None:
                    dataset_builder.add_item(empty_line_tensor)
                    total_length += len(empty_line_tensor)
                    num_sentences += 1
                    num_documents += 1

        if self.entity_vocab is not None:
            dataset_builder.finalize(output_prefix + '.text.idx')
            annotations_list = np.array(annotations_list, dtype=np.int64)

        return (
            annotation_entities
            if self.entity_vocab is None else output_prefix,
            annotations_list if self.entity_vocab is not None else None,
            total_length if self.entity_vocab is not None else 0,
            num_documents,
            num_sentences,
            num_annotations,
            num_filtered_by_candidate_set,
            num_filtered_by_human_annotations,
            num_filtered_by_self_overlaps,
            num_filtered_by_crossing_sentence_boundaries,
            num_filtered_solo_annotation_in_sentence,
            num_filtered_xao,
            num_filtered_by_entity_vocab,
        )
Example #10
from ncls import NCLS

import pandas as pd
import numpy as np

starts = pd.Series(range(0, int(1e7)))
ends = starts + 100
ids = starts

ncls = NCLS(starts.values, ends.values, ids.values)

# write the interval database to disk under the given prefix, then load it
# back into a fresh NCLS
ncls.write_binaries(b"hello")

ncls2 = NCLS(np.array([0]), np.array([2]), np.array([3]))

ncls2.buildFromUnsortedFile(b"hello.idb", n=int(1e7))

for i in ncls2.find_overlap(0, 500):
    print(i)