예제 #1
0
    def test_overflow(self):
        # Walks a three-line entry with an iterator built with sizes (10, 10)
        # and checks every pass, including the passes that yield no items.
        # NOTE(review): the two trailing TestIterator arguments look like
        # batch/chunk sizes smaller than one line -- confirm against
        # TestIterator.
        database = TestDatabase()
        # The "log" table is created but never read in this test; the call is
        # kept in case the fixtures rely on it existing.
        database.create_table("log")
        table = database.create_table("table1")
        entry = table.add_entry(
            "test.new_line",
            "A B C D E F G H\na b c d e f g h\n1 2 3 4 5 6 7 8 9\n")

        # Requires multiple passes
        it = TestIterator(entry, None, 10, 10)

        # One tuple per call to next(): (items, offset bounds, more flag).
        expected_passes = [
            ([], None, True),
            ([b"A B C D E F G H"], OffsetBounds(0, 15), True),
            ([b"a b c d e f g h"], OffsetBounds(16, 31), True),
            ([], None, True),
            ([b"1 2 3 4 5 6 7 8 9"], OffsetBounds(32, 49), False),
        ]
        for want_items, want_bounds, want_more in expected_passes:
            items, offset_bounds, more = it.next()
            self.assertEqual(list(items), want_items)
            self.assertEqual(offset_bounds, want_bounds)
            if want_more:
                self.assertTrue(more)
            else:
                self.assertFalse(more)
예제 #2
0
    def test_adjust(self):
        # Checks how mzML.Iterator adjusts requested offset bounds: every case
        # below expects the start to move to 123 and the end to the value
        # listed in the table.
        database = TestDatabase()
        table = database.create_table("table1")
        entry = table.add_entry("0/123.4-13/1/1-1-1-test.mzML", INPUT)

        # (requested bounds, expected end index, expected spectra count or
        # None when the count is not checked)
        cases = [
            # Multiple spectra start in range
            (OffsetBounds(120, 540), 733, 3),
            # One spectra starts in range
            (OffsetBounds(120, 250), 320, None),
            # No spectra start in range
            (OffsetBounds(126, 240), 320, None),
        ]
        for requested, want_end, want_count in cases:
            it = mzML.Iterator(entry, requested)
            spectra, adjusted, more = it.next()
            self.assertFalse(more)
            self.assertEqual(adjusted.start_index, 123)
            self.assertEqual(adjusted.end_index, want_end)
            if want_count is not None:
                self.assertEqual(len(list(spectra)), want_count)
예제 #3
0
    def test_next(self):
        # Reads a FASTQ entry built from the module-level `expected_items`,
        # first in a single pass and then one item per pass.
        database = TestDatabase()
        table = database.create_table("table1")
        entry = table.add_entry("test.fastq", b"\n".join(expected_items))

        # Read everything in one pass
        single_pass = TestIterator(entry, None, 300, 300)
        items, offset_bounds, more = single_pass.next()
        items = list(items)

        self.assertEqual(len(items), 3)
        self.assertEqual(items, expected_items)
        self.assertEqual(offset_bounds, OffsetBounds(0, 265))
        self.assertFalse(more)

        # Requires multiple passes
        chunked = TestIterator(entry, None, 100, 100)
        expected_passes = [
            (expected_items[:1], OffsetBounds(0, 88), True),
            (expected_items[1:2], OffsetBounds(89, 177), True),
            (expected_items[2:], OffsetBounds(178, 265), False),
        ]
        for want_items, want_bounds, want_more in expected_passes:
            items, offset_bounds, more = chunked.next()
            self.assertEqual(list(items), want_items)
            self.assertEqual(offset_bounds, want_bounds)
            if want_more:
                self.assertTrue(more)
            else:
                self.assertFalse(more)
예제 #4
0
    def test_next(self):
        # Reads a three-record FASTA entry, first in a single pass and then
        # one record per pass.
        database = TestDatabase()
        table = database.create_table("table1")
        entry = table.add_entry("test.fasta",
                                ">A\tB\tC\n>a\tb\tc\n>1\t2\t3\n")

        # Read everything in one pass
        single_pass = TestIterator(entry, None, 30, 30)
        items, offset_bounds, more = single_pass.next()
        self.assertEqual(list(items),
                         [b">A\tB\tC\n", b">a\tb\tc\n", b">1\t2\t3\n"])
        self.assertEqual(offset_bounds, OffsetBounds(0, 20))
        self.assertFalse(more)

        # Requires multiple passes
        chunked = TestIterator(entry, None, 8, 8)
        expected_passes = [
            ([b">A\tB\tC\n"], OffsetBounds(0, 6), True),
            ([b">a\tb\tc\n"], OffsetBounds(7, 13), True),
            ([b">1\t2\t3\n"], OffsetBounds(14, 20), False),
        ]
        for want_items, want_bounds, want_more in expected_passes:
            items, offset_bounds, more = chunked.next()
            self.assertEqual(list(items), want_items)
            self.assertEqual(offset_bounds, want_bounds)
            if want_more:
                self.assertTrue(more)
            else:
                self.assertFalse(more)
예제 #5
0
 def transform(
     self, stream: bytes, offset_bounds: Optional[OffsetBounds]
 ) -> Tuple[bytes, Optional[OffsetBounds]]:
     """Re-read the entry over an adjusted range and return it with its bounds.

     Without ``offset_bounds`` the range is
     ``self.spectra_start_index`` .. ``self.spectra_end_index``.

     With ``offset_bounds`` the range is derived from the integers captured
     by group 1 of ``self.offset_regex``:

     * the start becomes the first value found in ``stream``;
     * the end becomes one less than the first value found in the
       ``self.read_chunk_size`` window read from the entry starting at the
       requested end index, or ``self.spectra_end_index`` when that window
       has no match.

     The passed-in ``offset_bounds`` object is updated in place with the
     new start and end.

     Returns:
         A tuple of the bytes returned by ``self.entry.get_range`` for the
         final range and a new ``OffsetBounds`` describing that range.

     Raises:
         AssertionError: if ``stream`` has no regex match when bounds are
             given, or if the resulting start is greater than the end.
     """
     if offset_bounds:
         text: str = stream.decode("utf-8")
         offsets: List[int] = [
             int(match.group(1))
             for match in self.offset_regex.finditer(text)
         ]
         assert len(offsets) > 0

         # Look past the requested end for the next recorded offset.
         lookahead: str = self.entry.get_range(
             offset_bounds.end_index,
             offset_bounds.end_index + self.read_chunk_size).decode("utf-8")
         next_offsets: List[int] = [
             int(match.group(1))
             for match in self.offset_regex.finditer(lookahead)
         ]

         # The caller's object is deliberately mutated, as in the original.
         offset_bounds.start_index = offsets[0]
         offset_bounds.end_index = (next_offsets[0] - 1 if
                                    len(next_offsets) > 0 else
                                    self.spectra_end_index)
         start_index: int = offset_bounds.start_index
         end_index: int = offset_bounds.end_index
     else:
         start_index = self.spectra_start_index
         end_index = self.spectra_end_index

     assert start_index <= end_index
     stream = self.entry.get_range(start_index, end_index)
     return (stream, OffsetBounds(start_index, end_index))
예제 #6
0
    def test_offsets(self):
        # Requests byte ranges inside a three-record FASTA entry and checks
        # which record and which bounds the iterator reports back.
        database = TestDatabase()
        table = database.create_table("table1")
        entry = table.add_entry("test.fasta",
                                ">A\tB\tC\n>a\tb\tc\n>1\t2\t3\n")

        # Requested range (10, 15) is reported back as (7, 13).
        mid_record = TestIterator(entry, OffsetBounds(10, 15), 30, 30)
        items, offset_bounds, more = mid_record.next()
        self.assertEqual(list(items), [b">a\tb\tc\n"])
        self.assertEqual(offset_bounds, OffsetBounds(7, 13))

        # Edge case. Offset bound with end of file.
        last_record = TestIterator(entry, OffsetBounds(14, 20), 30, 30)
        items, offset_bounds, more = last_record.next()
        self.assertEqual(list(items), [b">1\t2\t3\n"])
        self.assertEqual(offset_bounds, OffsetBounds(14, 20))
        self.assertFalse(more)
예제 #7
0
    def test_offsets(self):
        # Requests byte ranges inside a FASTQ entry built from the
        # module-level `expected_items` and checks the items and bounds the
        # iterator reports back.
        database = TestDatabase()
        table = database.create_table("table1")
        entry = table.add_entry("test.fastq", b"\n".join(expected_items))

        # (requested bounds, size passed twice to TestIterator,
        #  expected items, expected reported bounds)
        cases = [
            (OffsetBounds(100, 200), 100, expected_items[1:2],
             OffsetBounds(89, 177)),
            # Edge case. Offset bound with end of file.
            (OffsetBounds(200, 300), 200, expected_items[2:],
             OffsetBounds(178, 265)),
        ]
        for requested, size, want_items, want_bounds in cases:
            it = TestIterator(entry, requested, size, size)
            items, offset_bounds, more = it.next()
            items = list(items)
            self.assertEqual(items, want_items)
            self.assertEqual(offset_bounds, want_bounds)
            self.assertFalse(more)
예제 #8
0
  def test_next(self):
    # Reads a three-row TSV entry, first over two passes and then in a
    # single pass.
    entry = TestEntry("test.tsv", "A\tB\tC\na\tb\tc\n1\t2\t3\n")

    # Requires multiple passes
    chunked = TestIterator(entry, None, 11, 11)
    expected_passes = [
        ([b"A\tB\tC", b"a\tb\tc"], OffsetBounds(0, 11), True),
        ([b"1\t2\t3"], OffsetBounds(12, 17), False),
    ]
    for want_items, want_bounds, want_more in expected_passes:
      items, offset_bounds, more = chunked.next()
      self.assertEqual(list(items), want_items)
      self.assertEqual(offset_bounds, want_bounds)
      if want_more:
        self.assertTrue(more)
      else:
        self.assertFalse(more)

    # Read everything in one pass
    single_pass = TestIterator(entry, None, 20, 20)
    items, offset_bounds, more = single_pass.next()
    self.assertEqual(list(items), [b"A\tB\tC", b"a\tb\tc", b"1\t2\t3"])
    self.assertEqual(offset_bounds, OffsetBounds(0, 17))
    self.assertFalse(more)
예제 #9
0
    def test_adjust(self):
        # Requests byte ranges inside a five-line entry and checks which line
        # the iterator returns and, where asserted, the bounds it reports.
        database = TestDatabase()
        # The "log" table is created but never read in this test; the call is
        # kept in case the fixtures rely on it existing.
        database.create_table("log")
        table = database.create_table("table1")
        entry = table.add_entry(
            "test.new_line", "A B C\na b c\n1 2 3\nD E F\nd e f\n")

        # Requested range (8, 13) is reported back as (6, 11).
        it = TestIterator(entry, OffsetBounds(8, 13), 10, 10)
        items, offset_bounds, more = it.next()
        self.assertEqual(list(items), [b"a b c"])
        self.assertEqual(offset_bounds, OffsetBounds(6, 11))
        self.assertFalse(more)

        # No adjustment needed
        it = TestIterator(entry, OffsetBounds(6, 11), 10, 10)
        items, offset_bounds, more = it.next()
        self.assertEqual(list(items), [b"a b c"])
        self.assertEqual(offset_bounds, OffsetBounds(6, 11))
        self.assertFalse(more)

        # Beginning of content
        it = TestIterator(entry, OffsetBounds(0, 7), 10, 10)
        items, offset_bounds, more = it.next()
        self.assertFalse(more)
        self.assertEqual(list(items), [b"A B C"])

        # End of content: the range runs to the last byte of the entry.
        last_byte = entry.content_length() - 1
        it = TestIterator(entry, OffsetBounds(26, last_byte), 10, 10)
        items, offset_bounds, more = it.next()
        self.assertFalse(more)
        self.assertEqual(list(items), [b"d e f"])
예제 #10
0
def run(database: Database, test_key: str, params, input_format, output_format,
        offsets: List[int]):
    """Find the nearest training items for every record in a local test file.

    Training items are read from the ``"spacenet"`` table entry named by
    ``params["train_key"]``, restricted to ``params["train_offsets"]``.
    A brute-force ``NearestNeighbors`` model with ``params["k"]`` neighbours
    is fitted on their features.

    The file at ``test_key`` is split on blank lines (``b"\\n\\n"``). Each
    non-empty record is split on single spaces: the first two fields are
    parsed as integer x and y, and the remaining bytes are handed to
    ``np.frombuffer(..., dtype=int)`` as the query vector.

    ``input_format`` and ``offsets`` are accepted but not used here.

    Side effects:
        Sets ``output_format["ext"]`` to ``"knn"`` and writes the results
        under ``/tmp``.

    Returns:
        A one-element list holding the path of the written file.
    """
    train_entry = database.get_entry("spacenet", params["train_key"])
    train_bounds = OffsetBounds(params["train_offsets"][0],
                                params["train_offsets"][1])
    train_it = classification.Iterator(train_entry, train_bounds)

    # Drain the training iterator; next() is always called at least once.
    train_x = []
    train_y = []
    while True:
        batch, _, more = train_it.next()
        for features, label in batch:
            train_x.append(features)
            train_y.append(label)
        if not more:
            break

    neigh = NearestNeighbors(n_neighbors=params["k"], algorithm="brute")
    neigh.fit(train_x)

    pixels = []
    rgb = []
    with open(test_key, "rb") as f:
        for record in f.read().split(b"\n\n"):
            if len(record.strip()) == 0:
                continue
            parts = record.split(b' ')
            x = int(parts[0])
            y = int(parts[1])
            pixels.append([x, y])
            rgb.append(np.frombuffer(b' '.join(parts[2:]), dtype=int))

    distances, indices = neigh.kneighbors(rgb)

    # One output item per query: (b"x y", [(distance, training label), ...]).
    items = []
    for pixel, pixel_distances, pixel_indices in zip(pixels, distances,
                                                     indices):
        x, y = pixel
        neighbors = [(distance, train_y[train_index])
                     for distance, train_index in zip(pixel_distances,
                                                      pixel_indices)]
        items.append(("{x} {y}".format(x=x, y=y).encode(), neighbors))

    output_format["ext"] = "knn"
    output_file = "/tmp/{0:s}".format(util.file_name(output_format))
    with open(output_file, "wb+") as f:
        knn.Iterator.from_array(items, f, {})

    return [output_file]
예제 #11
0
    def test_next(self):
        # Reads a three-line entry over two passes: the first pass is expected
        # to return two lines, the second the remaining one.
        database = TestDatabase()
        # The "log" table is created but never read in this test; the call is
        # kept in case the fixtures rely on it existing.
        database.create_table("log")
        table = database.create_table("table1")
        entry = table.add_entry("test.new_line", "A B C\na b c\n1 2 3\n")

        # Requires multiple passes
        it = TestIterator(entry, None, 11, 11)

        first_items, first_bounds, first_more = it.next()
        self.assertTrue(first_more)
        self.assertEqual(OffsetBounds(0, 11), first_bounds)
        self.assertEqual(list(first_items), [b"A B C", b"a b c"])

        # Bounds of the final pass are not asserted.
        last_items, _, last_more = it.next()
        self.assertFalse(last_more)
        self.assertEqual(list(last_items), [b"1 2 3"])
예제 #12
0
def run_application(d: Database, bucket_name: str, key: str,
                    input_format: Dict[str, Any], output_format: Dict[str,
                                                                      Any],
                    offsets: List[int], params: Dict[str, Any]):
    """Stage an entry under /tmp, run an application on it, upload the output.

    Staging:
        * With no ``offsets`` the whole object is downloaded to
          ``/tmp/<key>``.
        * Otherwise the module named by ``params["input_format"]`` is
          imported, its ``Iterator`` is built over
          ``OffsetBounds(offsets[0], offsets[1])``, and the items it returns
          are written to ``/tmp/<key>`` with ``Iterator.from_array``.

    The module named by ``params["application"]`` is then imported and its
    ``run`` function is called with the staged file. Every path it returns
    is uploaded to ``params["bucket"]``.

    Args:
        d: Database used for download, entry lookup and upload.
        bucket_name: Bucket/table holding the input object.
        key: Key of the input object; also used for the temp file name.
        input_format: Passed through to the application.
        output_format: Passed through to the application. Its ``"ext"`` key
            is overwritten for any output whose name
            ``util.parse_file_name`` cannot parse.
        offsets: Empty, or a sequence whose first two values are the
            requested byte range.
        params: Must provide ``"application"`` and ``"bucket"``, plus
            ``"input_format"`` when ``offsets`` is non-empty.

    Returns:
        Always ``True``.
    """
    temp_file = "/tmp/{0:s}".format(key)
    util.make_folder(util.parse_file_name(key))

    if len(offsets) == 0:
        d.download(bucket_name, key, temp_file)
    else:
        obj = d.get_entry(bucket_name, key)
        format_lib = importlib.import_module(params["input_format"])
        iterator_class = getattr(format_lib, "Iterator")
        iterator = iterator_class(obj, OffsetBounds(offsets[0], offsets[1]))
        items = iterator.get(iterator.get_start_index(),
                             iterator.get_end_index())
        with open(temp_file, "wb+") as f:
            # Materialise once; the original copied the list a second time.
            iterator_class.from_array(list(items), f, iterator.get_extra())

    application_lib = importlib.import_module(params["application"])
    application_method = getattr(application_lib, "run")
    output_files = application_method(d, temp_file, params, input_format,
                                      output_format, offsets)

    for output_file in output_files:
        p = util.parse_file_name(output_file.replace("/tmp/", ""))
        if p is None:
            # Unparseable name: build the key from output_format, taking the
            # extension from the text after the last "." in the path.
            index = output_file.rfind(".")
            ext = output_file[index + 1:]
            output_format["ext"] = ext
            new_key = util.file_name(output_format)
        else:
            new_key = util.file_name(p)

        with open(output_file, "rb") as f:
            d.put(params["bucket"], new_key, f, {})
    return True
예제 #13
0
def handle_pivots(database: Database, bucket_name, key, input_format,
                  output_format, offsets, params):
    """Compute pivots for an entry and store them as a ``.pivot`` object.

    The module named by ``params["input_format"]`` supplies the ``Iterator``
    used to read the entry, limited to ``OffsetBounds(offsets[0],
    offsets[1])`` when ``offsets`` is non-empty and unbounded otherwise.
    The items are passed to ``create_pivots``.

    The stored content is three newline-separated lines: ``bucket_name``,
    ``key`` and the tab-separated pivots.

    Side effects:
        Sets ``output_format["ext"]`` to ``"pivot"`` and writes the content
        to ``params["bucket"]``. ``input_format`` is not used here.

    Returns:
        Always ``True``.
    """
    entry: Entry = database.get_entry(bucket_name, key)

    format_lib = importlib.import_module(params["input_format"])
    iterator_class = format_lib.Iterator
    bounds = (OffsetBounds(offsets[0], offsets[1])
              if len(offsets) > 0 else None)
    it = iterator_class(entry, bounds)

    entry_items = list(it.get(it.get_start_index(), it.get_end_index()))
    pivots: List[float] = create_pivots(database, format_lib, iterator_class,
                                        entry_items, params)

    output_format["ext"] = "pivot"
    pivot_key = util.file_name(output_format)

    pivot_line = "\t".join(str(pivot) for pivot in pivots)
    content = "{0:s}\n{1:s}\n{2:s}".format(bucket_name, key,
                                           pivot_line).encode()
    database.write(params["bucket"], pivot_key, content, {})
    return True
예제 #14
0
파일: sort.py 프로젝트: delimitrou/Ripple
def handle_sort(database: Database, table_name: str, key: str,
                input_format: Dict[str, Any], output_format: Dict[str, Any],
                offsets: List[int], params: Dict[str, Any]):
    """Sort an entry's items by identifier and write them out in bins.

    The module named by ``params["input_format"]`` supplies the ``Iterator``
    used to read the entry, limited to ``OffsetBounds(offsets[0],
    offsets[1])`` when ``offsets`` is non-empty and unbounded otherwise.
    Each item is paired with the value returned by
    ``get_identifier_value`` for the identifier
    ``Identifiers[params["identifier"]]``. The pairs are sorted on that
    value, binned against ``params["pivots"]`` by ``bin_input``, and handed
    to ``write_binned_input`` together with a copy of ``output_format``.

    ``input_format`` is not used here.

    Returns:
        Always ``True``.

    Raises:
        AssertionError: if ``output_format`` has no ``"ext"`` key.
    """
    entry = database.get_entry(table_name, key)
    assert "ext" in output_format

    format_lib = importlib.import_module(params["input_format"])
    iterator_class = format_lib.Iterator
    bounds = (OffsetBounds(offsets[0], offsets[1])
              if len(offsets) > 0 else None)
    it = iterator_class(entry, bounds)

    extra = it.get_extra()
    # The identifier is looked up per item, as in the original, so an empty
    # entry never touches params["identifier"].
    keyed_items = [
        (it.get_identifier_value(
            item, format_lib.Identifiers[params["identifier"]]), item)
        for item in it.get(it.get_start_index(), it.get_end_index())
    ]
    keyed_items.sort(key=lambda pair: pair[0])

    bin_ranges = params["pivots"]
    binned_input = bin_input(keyed_items, bin_ranges)
    write_binned_input(database, binned_input, bin_ranges, extra,
                       dict(output_format), iterator_class, params)
    return True