def test_overflow(self):
    """Check iteration when lines do not fit in a single pass.

    The iterator is built with two size arguments of 10 (presumably the
    read/batch limits -- confirm against TestIterator) while every line of
    the fixture is at least 15 bytes long.  Some calls to ``next()`` are
    therefore expected to return no items and no bounds while still
    reporting that more data is available.
    """
    database: TestDatabase = TestDatabase()
    # The "log" table is created for its side effect only; its handle is
    # not used by this test.
    database.create_table("log")
    table: TestTable = database.create_table("table1")
    entry: TestEntry = table.add_entry(
        "test.new_line",
        "A B C D E F G H\na b c d e f g h\n1 2 3 4 5 6 7 8 9\n")

    # (items, offset bounds, more) expected from each successive pass.
    expected_passes = [
        ([], None, True),
        ([b"A B C D E F G H"], OffsetBounds(0, 15), True),
        ([b"a b c d e f g h"], OffsetBounds(16, 31), True),
        ([], None, True),
        ([b"1 2 3 4 5 6 7 8 9"], OffsetBounds(32, 49), False),
    ]

    # Requires multiple passes
    iterator = TestIterator(entry, None, 10, 10)
    for want_items, want_bounds, want_more in expected_passes:
        items, bounds, more = iterator.next()
        self.assertEqual(list(items), want_items)
        self.assertEqual(bounds, want_bounds)
        if want_more:
            self.assertTrue(more)
        else:
            self.assertFalse(more)
def test_adjust(self):
    """Check that requested offset bounds are adjusted for an mzML entry.

    Each scenario asks for a byte range and verifies the bounds reported by
    the first ``next()`` call, which is also expected to be the last one
    (``more`` is false).
    """
    database: TestDatabase = TestDatabase()
    table: TestTable = database.create_table("table1")
    entry: TestEntry = table.add_entry("0/123.4-13/1/1-1-1-test.mzML", INPUT)

    # (requested range, expected start, expected end, expected spectra count)
    # A count of None means the number of spectra is not checked.
    scenarios = [
        # Multiple spectra start in range
        ((120, 540), 123, 733, 3),
        # One spectra starts in range
        ((120, 250), 123, 320, None),
        # No spectra start in range
        ((126, 240), 123, 320, None),
    ]

    for (range_start, range_end), want_start, want_end, want_count in scenarios:
        iterator = mzML.Iterator(entry, OffsetBounds(range_start, range_end))
        spectra, bounds, more = iterator.next()
        self.assertFalse(more)
        self.assertEqual(bounds.start_index, want_start)
        self.assertEqual(bounds.end_index, want_end)
        if want_count is not None:
            self.assertEqual(len(list(spectra)), want_count)
def test_next(self):
    """Read a three-record FASTQ fixture in one pass, then one record per pass.

    The fixture is the module-level ``expected_items`` joined with newlines.
    """
    database: TestDatabase = TestDatabase()
    table: TestTable = database.create_table("table1")
    entry: TestEntry = table.add_entry("test.fastq",
                                       b"\n".join(expected_items))

    # Read everything in one pass
    iterator = TestIterator(entry, None, 300, 300)
    items, bounds, more = iterator.next()
    records = list(items)
    self.assertEqual(len(records), 3)
    self.assertEqual(records, expected_items)
    self.assertEqual(bounds, OffsetBounds(0, 265))
    self.assertFalse(more)

    # Requires multiple passes
    # (items, offset bounds, more) expected from each successive pass.
    expected_passes = [
        (expected_items[:1], OffsetBounds(0, 88), True),
        (expected_items[1:2], OffsetBounds(89, 177), True),
        (expected_items[2:], OffsetBounds(178, 265), False),
    ]
    iterator = TestIterator(entry, None, 100, 100)
    for want_items, want_bounds, want_more in expected_passes:
        items, bounds, more = iterator.next()
        self.assertEqual(list(items), want_items)
        self.assertEqual(bounds, want_bounds)
        if want_more:
            self.assertTrue(more)
        else:
            self.assertFalse(more)
def test_next(self):
    """Read a three-record FASTA fixture in one pass, then one record per pass.

    Every record in the fixture is 7 bytes long including its trailing
    newline, so the expected bounds advance in steps of 7.
    """
    database: TestDatabase = TestDatabase()
    table: TestTable = database.create_table("table1")
    entry: TestEntry = table.add_entry("test.fasta",
                                       ">A\tB\tC\n>a\tb\tc\n>1\t2\t3\n")

    # Read everything in one pass
    iterator = TestIterator(entry, None, 30, 30)
    items, bounds, more = iterator.next()
    self.assertEqual(list(items),
                     [b">A\tB\tC\n", b">a\tb\tc\n", b">1\t2\t3\n"])
    self.assertEqual(bounds, OffsetBounds(0, 20))
    self.assertFalse(more)

    # Requires multiple passes
    # (items, offset bounds, more) expected from each successive pass.
    expected_passes = [
        ([b">A\tB\tC\n"], OffsetBounds(0, 6), True),
        ([b">a\tb\tc\n"], OffsetBounds(7, 13), True),
        ([b">1\t2\t3\n"], OffsetBounds(14, 20), False),
    ]
    iterator = TestIterator(entry, None, 8, 8)
    for want_items, want_bounds, want_more in expected_passes:
        items, bounds, more = iterator.next()
        self.assertEqual(list(items), want_items)
        self.assertEqual(bounds, want_bounds)
        if want_more:
            self.assertTrue(more)
        else:
            self.assertFalse(more)
def transform(
        self, stream: bytes, offset_bounds: Optional[OffsetBounds]
) -> Tuple[bytes, Optional[OffsetBounds]]:
    """Re-read the entry over an adjusted byte range.

    Args:
        stream: Bytes already read for the requested range.  They are only
            inspected when ``offset_bounds`` is given, in which case they
            are decoded as UTF-8 and scanned with ``self.offset_regex``.
        offset_bounds: Requested range, or a falsy value to use the range
            ``self.spectra_start_index``..``self.spectra_end_index``.
            NOTE: when given, this object is modified in place -- its
            ``start_index`` and ``end_index`` are overwritten with the
            adjusted values.

    Returns:
        A tuple of the bytes returned by ``self.entry.get_range`` for the
        adjusted range and a new ``OffsetBounds`` describing that range.

    Raises:
        AssertionError: If no offset is found in ``stream`` or the adjusted
            start lies after the adjusted end.
    """
    start_index: int
    end_index: int

    if offset_bounds:
        # Offsets captured (regex group 1) inside the requested range; the
        # first one becomes the adjusted start.
        text: str = stream.decode("utf-8")
        offsets: List[int] = [
            int(match.group(1)) for match in self.offset_regex.finditer(text)
        ]
        assert offsets

        # Look one read chunk past the requested end for the next offset;
        # the adjusted range stops just before it.
        text = self.entry.get_range(
            offset_bounds.end_index,
            offset_bounds.end_index + self.read_chunk_size).decode("utf-8")
        next_offsets: List[int] = [
            int(match.group(1)) for match in self.offset_regex.finditer(text)
        ]

        offset_bounds.start_index = offsets[0]
        if next_offsets:
            offset_bounds.end_index = next_offsets[0] - 1
        else:
            # Nothing follows within the look-ahead chunk: run to the end
            # of the spectra section.
            offset_bounds.end_index = self.spectra_end_index

        start_index = offset_bounds.start_index
        end_index = offset_bounds.end_index
    else:
        start_index = self.spectra_start_index
        end_index = self.spectra_end_index

    assert start_index <= end_index
    adjusted_stream = self.entry.get_range(start_index, end_index)
    return (adjusted_stream, OffsetBounds(start_index, end_index))
def test_offsets(self):
    """Check that offset bounds are snapped to whole FASTA records.

    The fixture holds three 7-byte records occupying bytes 0-6, 7-13 and
    14-20.
    """
    database: TestDatabase = TestDatabase()
    table: TestTable = database.create_table("table1")
    entry: TestEntry = table.add_entry("test.fasta",
                                       ">A\tB\tC\n>a\tb\tc\n>1\t2\t3\n")

    # A range starting inside the second record is expected to be reported
    # as that record's own bounds.
    middle_it = TestIterator(entry, OffsetBounds(10, 15), 30, 30)
    middle_items, middle_bounds, _ = middle_it.next()
    self.assertEqual(list(middle_items), [b">a\tb\tc\n"])
    self.assertEqual(middle_bounds, OffsetBounds(7, 13))

    # Edge case. Offset bound with end of file.
    tail_it = TestIterator(entry, OffsetBounds(14, 20), 30, 30)
    tail_items, tail_bounds, tail_more = tail_it.next()
    self.assertEqual(list(tail_items), [b">1\t2\t3\n"])
    self.assertEqual(tail_bounds, OffsetBounds(14, 20))
    self.assertFalse(tail_more)
def test_offsets(self):
    """Check that offset bounds are snapped to whole FASTQ records.

    The fixture is the module-level ``expected_items`` joined with newlines.
    Each scenario expects a single record back and no further data.
    """
    database: TestDatabase = TestDatabase()
    table: TestTable = database.create_table("table1")
    entry: TestEntry = table.add_entry("test.fastq",
                                       b"\n".join(expected_items))

    # (requested range, iterator size argument, expected items, expected bounds)
    scenarios = [
        ((100, 200), 100, expected_items[1:2], OffsetBounds(89, 177)),
        # Edge case. Offset bound with end of file.
        ((200, 300), 200, expected_items[2:], OffsetBounds(178, 265)),
    ]

    for (range_start, range_end), size, want_items, want_bounds in scenarios:
        iterator = TestIterator(entry, OffsetBounds(range_start, range_end),
                                size, size)
        items, bounds, more = iterator.next()
        records = list(items)
        self.assertEqual(records, want_items)
        self.assertEqual(bounds, want_bounds)
        self.assertFalse(more)
def test_next(self):
    """Read a three-line TSV entry in two passes and then in a single pass.

    The returned items are expected without their trailing newline.
    """
    tsv_entry = TestEntry("test.tsv", "A\tB\tC\na\tb\tc\n1\t2\t3\n")

    # Requires multiple passes
    # (items, offset bounds, more) expected from each successive pass.
    expected_passes = [
        ([b"A\tB\tC", b"a\tb\tc"], OffsetBounds(0, 11), True),
        ([b"1\t2\t3"], OffsetBounds(12, 17), False),
    ]
    iterator = TestIterator(tsv_entry, None, 11, 11)
    for want_items, want_bounds, want_more in expected_passes:
        items, bounds, more = iterator.next()
        self.assertEqual(list(items), want_items)
        self.assertEqual(bounds, want_bounds)
        if want_more:
            self.assertTrue(more)
        else:
            self.assertFalse(more)

    # Read everything in one pass
    iterator = TestIterator(tsv_entry, None, 20, 20)
    items, bounds, more = iterator.next()
    self.assertEqual(list(items), [b"A\tB\tC", b"a\tb\tc", b"1\t2\t3"])
    self.assertEqual(bounds, OffsetBounds(0, 17))
    self.assertFalse(more)
def test_adjust(self):
    """Check that offset bounds are snapped to whole newline-delimited lines.

    The fixture holds five 6-byte lines (newline included), so the second
    line occupies bytes 6-11 and the last one starts at byte 24.
    """
    database: TestDatabase = TestDatabase()
    # The "log" table is created for its side effect only; its handle is
    # not used by this test.
    database.create_table("log")
    table: TestTable = database.create_table("table1")
    entry: TestEntry = table.add_entry(
        "test.new_line", "A B C\na b c\n1 2 3\nD E F\nd e f\n")

    # Both requests are expected to resolve to the second line: the first
    # needs adjusting, the second already matches the line exactly.
    for range_start, range_end in [(8, 13), (6, 11)]:
        iterator = TestIterator(entry, OffsetBounds(range_start, range_end),
                                10, 10)
        items, bounds, more = iterator.next()
        self.assertEqual(list(items), [b"a b c"])
        self.assertEqual(bounds, OffsetBounds(6, 11))
        self.assertFalse(more)

    # Beginning of content
    head_it = TestIterator(entry, OffsetBounds(0, 7), 10, 10)
    head_items, _, head_more = head_it.next()
    self.assertFalse(head_more)
    self.assertEqual(list(head_items), [b"A B C"])

    # End of content
    tail_bounds = OffsetBounds(26, entry.content_length() - 1)
    tail_it = TestIterator(entry, tail_bounds, 10, 10)
    tail_items, _, tail_more = tail_it.next()
    self.assertFalse(tail_more)
    self.assertEqual(list(tail_items), [b"d e f"])
def run(database: Database, test_key: str, params, input_format,
        output_format, offsets: List[int]):
    """Find the nearest training samples for every record of a test file.

    Training samples are read from the ``"spacenet"`` table entry named by
    ``params["train_key"]``, restricted to ``params["train_offsets"]``.  A
    brute-force ``NearestNeighbors`` model with ``params["k"]`` neighbours
    is fitted on their features and queried with the records in the local
    file ``test_key``.

    Args:
        database: Source of the training entry.
        test_key: Path of the local test file.  Records are separated by a
            blank line; each record is split on single spaces into ``x``,
            ``y`` and a remainder that is parsed with ``np.frombuffer``.
        params: Must provide ``train_key``, ``train_offsets`` and ``k``.
        input_format: Not used by this function.
        output_format: Its ``"ext"`` key is set to ``"knn"`` (the caller's
            dict is modified) before it is used to name the output file.
        offsets: Not used by this function.

    Returns:
        A one-element list holding the path of the file written under
        ``/tmp``.
    """
    train_entry = database.get_entry("spacenet", params["train_key"])
    train_bounds = OffsetBounds(params["train_offsets"][0],
                                params["train_offsets"][1])
    train_it = classification.Iterator(train_entry, train_bounds)

    # Drain the training iterator, separating features from class labels.
    train_x = []
    train_y = []
    while True:
        batch, _, more = train_it.next()
        for features, label in batch:
            train_x.append(features)
            train_y.append(label)
        if not more:
            break

    neigh = NearestNeighbors(n_neighbors=params["k"], algorithm="brute")
    neigh.fit(train_x)

    # Parse the test file into pixel coordinates and their feature vectors.
    pixels = []
    rgb = []
    with open(test_key, "rb") as f:
        for record in f.read().split(b"\n\n"):
            if len(record.strip()) == 0:
                continue
            fields = record.split(b' ')
            x = int(fields[0])
            y = int(fields[1])
            pixels.append([x, y])
            rgb.append(np.frombuffer(b' '.join(fields[2:]), dtype=int))

    distances, indices = neigh.kneighbors(rgb)

    # One output item per queried pixel: its encoded "x y" key plus the
    # (distance, training class) pair of every neighbour.
    items = []
    for i, distance_row in enumerate(distances):
        x, y = pixels[i]
        neighbors = [(distance, train_y[indices[i][j]])
                     for j, distance in enumerate(distance_row)]
        items.append(("{x} {y}".format(x=x, y=y).encode(), neighbors))

    output_format["ext"] = "knn"
    output_file = "/tmp/{0:s}".format(util.file_name(output_format))
    with open(output_file, "wb+") as f:
        knn.Iterator.from_array(items, f, {})

    return [output_file]
def test_next(self):
    """Read a three-line newline-delimited entry in two passes.

    The first pass is expected to return the first two lines, the second
    pass the remaining one.
    """
    database: TestDatabase = TestDatabase()
    # The "log" table is created for its side effect only; its handle is
    # not used by this test.
    database.create_table("log")
    table: TestTable = database.create_table("table1")
    entry: TestEntry = table.add_entry("test.new_line",
                                       "A B C\na b c\n1 2 3\n")

    # Requires multiple passes
    iterator = TestIterator(entry, None, 11, 11)

    first_items, first_bounds, has_more = iterator.next()
    self.assertTrue(has_more)
    self.assertEqual(OffsetBounds(0, 11), first_bounds)
    self.assertEqual(list(first_items), [b"A B C", b"a b c"])

    last_items, _, has_more = iterator.next()
    self.assertFalse(has_more)
    self.assertEqual(list(last_items), [b"1 2 3"])
def run_application(d: Database, bucket_name: str, key: str,
                    input_format: Dict[str, Any],
                    output_format: Dict[str, Any], offsets: List[int],
                    params: Dict[str, Any]):
    """Stage an input locally, run the configured application, upload outputs.

    Args:
        d: Database used to fetch the input and store the outputs.
        bucket_name: Table/bucket holding the input entry.
        key: Key of the input entry; also used as the file name under
            ``/tmp`` for the staged copy.
        input_format: Passed through to the application unchanged.
        output_format: Passed through to the application.  Its ``"ext"``
            key is overwritten (the caller's dict is modified) for any
            output file whose name ``util.parse_file_name`` cannot parse.
        offsets: Empty to download the whole entry; otherwise the first two
            values bound the part of the entry that is staged.
        params: Must provide ``application`` (module exposing ``run``) and
            ``bucket`` (destination for outputs); ``input_format`` (module
            exposing ``Iterator``) is required when ``offsets`` is not
            empty.

    Returns:
        ``True`` once every output file has been uploaded.
    """
    temp_file = "/tmp/{0:s}".format(key)
    util.make_folder(util.parse_file_name(key))

    if len(offsets) == 0:
        d.download(bucket_name, key, temp_file)
    else:
        # Only a slice of the entry is wanted: read it through the format's
        # iterator and re-serialise just those items to the staging file.
        obj = d.get_entry(bucket_name, key)
        format_lib = importlib.import_module(params["input_format"])
        iterator_class = getattr(format_lib, "Iterator")
        iterator = iterator_class(obj, OffsetBounds(offsets[0], offsets[1]))
        items = iterator.get(iterator.get_start_index(),
                             iterator.get_end_index())
        with open(temp_file, "wb+") as f:
            # Materialise once; the original copied the list a second time.
            iterator_class.from_array(list(items), f, iterator.get_extra())

    application_lib = importlib.import_module(params["application"])
    application_method = getattr(application_lib, "run")
    output_files = application_method(d, temp_file, params, input_format,
                                      output_format, offsets)

    for output_file in output_files:
        p = util.parse_file_name(output_file.replace("/tmp/", ""))
        if p is None:
            # The name does not follow the project's scheme: keep only its
            # extension and build the key from output_format instead.
            index = output_file.rfind(".")
            ext = output_file[index + 1:]
            output_format["ext"] = ext
            new_key = util.file_name(output_format)
        else:
            new_key = util.file_name(p)

        with open(output_file, "rb") as f:
            d.put(params["bucket"], new_key, f, {})

    return True
def handle_pivots(database: Database, bucket_name, key, input_format,
                  output_format, offsets, params):
    """Compute pivots for an entry and store them as a ``.pivot`` file.

    The stored content has three newline-separated parts: ``bucket_name``,
    ``key`` and the tab-joined pivot values.

    Args:
        database: Database used to read the entry and write the pivot file.
        bucket_name: Table/bucket holding the input entry.
        key: Key of the input entry.
        input_format: Not used by this function.
        output_format: Its ``"ext"`` key is set to ``"pivot"`` (the
            caller's dict is modified) before it is used to build the
            pivot key.
        offsets: Empty to read the whole entry; otherwise the first two
            values bound the part that is read.
        params: Must provide ``input_format`` (module exposing
            ``Iterator``) and ``bucket`` (destination); it is also handed
            to ``create_pivots``.

    Returns:
        ``True`` after the pivot file has been written.
    """
    entry: Entry = database.get_entry(bucket_name, key)
    format_lib = importlib.import_module(params["input_format"])
    iterator_class = getattr(format_lib, "Iterator")

    bounds = OffsetBounds(offsets[0], offsets[1]) if len(offsets) > 0 else None
    it = iterator_class(entry, bounds)

    items = it.get(it.get_start_index(), it.get_end_index())
    pivots: List[float] = create_pivots(database, format_lib, iterator_class,
                                        list(items), params)

    output_format["ext"] = "pivot"
    pivot_key = util.file_name(output_format)

    spivots = "\t".join([str(pivot) for pivot in pivots])
    content = "{0:s}\n{1:s}\n{2:s}".format(bucket_name, key, spivots).encode()
    database.write(params["bucket"], pivot_key, content, {})
    return True
def handle_sort(database: Database, table_name: str, key: str,
                input_format: Dict[str, Any], output_format: Dict[str, Any],
                offsets: List[int], params: Dict[str, Any]):
    """Sort an entry's items by identifier and hand them to the binning step.

    Each item is paired with the value returned by the iterator's
    ``get_identifier_value`` for ``params["identifier"]``; the pairs are
    sorted by that value and passed to ``bin_input`` together with
    ``params["pivots"]``.  The result goes to ``write_binned_input``
    (presumably one output per bin -- confirm against that helper).

    Args:
        database: Database used to read the entry and write the bins.
        table_name: Table holding the input entry.
        key: Key of the input entry.
        input_format: Not used by this function.
        output_format: Must contain ``"ext"``.  A copy is passed on, so the
            caller's dict is not modified here.
        offsets: Empty to read the whole entry; otherwise the first two
            values bound the part that is read.
        params: Must provide ``input_format`` (module exposing ``Iterator``
            and ``Identifiers``), ``identifier`` and ``pivots``.

    Returns:
        ``True`` after the binned input has been written.

    Raises:
        AssertionError: If ``output_format`` has no ``"ext"`` key.
    """
    entry = database.get_entry(table_name, key)
    assert "ext" in output_format

    format_lib = importlib.import_module(params["input_format"])
    iterator_class = getattr(format_lib, "Iterator")

    bounds = OffsetBounds(offsets[0], offsets[1]) if len(offsets) > 0 else None
    it = iterator_class(entry, bounds)
    extra = it.get_extra()

    # Pair every item with its sort value.  The identifier lookup stays
    # inside the comprehension so it is only evaluated when there are items.
    keyed_items = [
        (it.get_identifier_value(
            item, format_lib.Identifiers[params["identifier"]]), item)
        for item in it.get(it.get_start_index(), it.get_end_index())
    ]
    keyed_items.sort(key=lambda pair: pair[0])

    bin_ranges = params["pivots"]
    binned_input = bin_input(keyed_items, bin_ranges)
    write_binned_input(database, binned_input, bin_ranges, extra,
                       dict(output_format), iterator_class, params)
    return True