def test_single_gz_file_no_add_references(self):
    # Read the first communication from a gzipped single-communication file
    # with add_references disabled, then check that no 'sentenceForUUID'
    # attribute is present and that the id and reported filename match.
    gz_path = u"tests/testdata/simple_1.concrete.gz"
    comm_reader = CommunicationReader(gz_path, add_references=False)
    first_comm, first_filename = comm_reader.next()
    self.assertFalse(hasattr(first_comm, "sentenceForUUID"))
    self.assertEqual(u"one", first_comm.id)
    # The reader reports the path it was opened with as the filename.
    self.assertEqual(gz_path, first_filename)
def test_explicit_single_gz_file(self):
    # Same gzipped input as the implicit case, but the file type is passed
    # explicitly as FileType.STREAM_GZ.  The first communication must carry
    # a 'sentenceForUUID' attribute, have id u"one", and be reported under
    # the path that was opened.
    gz_path = u"tests/testdata/simple_1.concrete.gz"
    comm_reader = CommunicationReader(gz_path, filetype=FileType.STREAM_GZ)
    first_comm, first_filename = comm_reader.next()
    self.assertTrue(hasattr(first_comm, "sentenceForUUID"))
    self.assertEqual(u"one", first_comm.id)
    self.assertEqual(gz_path, first_filename)
def test_single_bz2_file(self):
    # Read the first communication from a bz2-compressed file using the
    # reader's default arguments and check the 'sentenceForUUID' attribute,
    # the id, and the reported filename.
    bz2_path = u"tests/testdata/simple_1.concrete.bz2"
    comm_reader = CommunicationReader(bz2_path)
    first_comm, first_filename = comm_reader.next()
    self.assertTrue(hasattr(first_comm, "sentenceForUUID"))
    self.assertEqual(u"one", first_comm.id)
    self.assertEqual(bz2_path, first_filename)
def test_CommunicationReader_tar_gz_file_unicode():
    """Read a .tar.gz archive and check the ids of its two communications."""
    archive_reader = CommunicationReader(
        "tests/testdata/les-deux-chandeliers.concrete.tar.gz")
    # Materialise every (communication, filename) pair, then transpose the
    # pairs into two parallel tuples.
    pairs = [(comm, name) for (comm, name) in archive_reader]
    comms, filenames = zip(*pairs)
    assert len(comms) == 2
    assert 'les-deux-chandeliers/l0.txt' == comms[0].id
    assert 'les-deux-chandeliers/l1.txt' == comms[1].id
def test_create_comm_tarball(output_file, text_l0, text_l1):
    """Run create-comm-tarball.py and verify the communications it writes.

    The script must exit with status 0 and the output must contain exactly
    two communications, in order, whose ids and texts match the expected
    values, which validate, and which have no section list.
    """
    command = [
        'scripts/create-comm-tarball.py',
        'tests/testdata/les-deux-chandeliers.tar.gz',
        output_file,
    ]
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    # communicate() drains both pipes and waits for the script to finish.
    proc.communicate()
    assert proc.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    expected = (
        ('les-deux-chandeliers/l0.txt', text_l0),
        ('les-deux-chandeliers/l1.txt', text_l1),
    )
    for expected_id, expected_text in expected:
        (comm, _) = it.next()
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        assert comm.sectionList is None

    # The iterator must be exhausted after the expected communications.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def test_create_comm_annotated(output_file, text):
    """Run create-comm.py with section-level annotation and check the result.

    The script must exit with status 0 and write exactly one communication
    that validates, has the expected id and text, and has a section list of
    length 2.
    """
    command = [
        'scripts/create-comm.py',
        '--annotation-level', 'section',
        'tests/testdata/les-deux-chandeliers.txt',
        output_file,
    ]
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    # communicate() drains both pipes and waits for the script to finish.
    proc.communicate()
    assert proc.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    (comm, _) = it.next()
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert len(comm.sectionList) == 2

    # No further communications may follow.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def test_tweets2concrete_log_every(output_file):
    """Run tweets2concrete.py with INFO logging at interval 1.

    Checks that the script exits with status 0, that stderr contains at
    least two lines mentioning 'INFO', and that the output holds exactly
    the two expected communications.
    """
    command = [
        'scripts/tweets2concrete.py',
        '--log-level', 'INFO',
        '--log-interval', '1',
        'tests/testdata/tweets.json',
        output_file,
    ]
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    (out, err) = proc.communicate()
    assert proc.returncode == 0

    info_lines = [
        line for line in err.strip().split('\n') if 'INFO' in line
    ]
    assert len(info_lines) >= 2

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # Each checker is applied to the next communication, in order.
    for check_comm in (assert_first_comm, assert_second_comm):
        (comm, _) = it.next()
        check_comm(comm)

    # The iterator must be exhausted after the two communications.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def test_tweets2concrete_log_config(log_conf, output_file):
    """Run tweets2concrete.py with a logging configuration file.

    ``log_conf`` is unpacked into the configuration path passed to the
    script and the path of the log file read back afterwards.  The script
    must exit with status 0 and produce no stdout or stderr output, the log
    file must contain at least two lines mentioning 'INFO', and the output
    must hold exactly the two expected communications.
    """
    (log_conf_path, log_path) = log_conf
    command = [
        'scripts/tweets2concrete.py',
        '--log-conf-path', log_conf_path,
        '--log-interval', '1',
        'tests/testdata/tweets.json',
        output_file,
    ]
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    (out, err) = proc.communicate()
    assert proc.returncode == 0
    # Both captured streams are expected to be empty.
    assert len(out) == 0
    assert len(err) == 0

    with open(log_path) as log_file:
        log_text = log_file.read()
    info_lines = [
        line for line in log_text.strip().split('\n') if 'INFO' in line
    ]
    assert len(info_lines) >= 2

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # Each checker is applied to the next communication, in order.
    for check_comm in (assert_first_comm, assert_second_comm):
        (comm, _) = it.next()
        check_comm(comm)

    # The iterator must be exhausted after the two communications.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def test_tweets2concrete_invalid(output_file):
    """Run tweets2concrete.py with --skip-invalid-comms on tweets.invalid.json.

    The script must exit with status 0 and the output must hold exactly the
    two expected communications.
    """
    command = [
        'scripts/tweets2concrete.py',
        '--skip-invalid-comms',
        'tests/testdata/tweets.invalid.json',
        output_file,
    ]
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    # communicate() drains both pipes and waits for the script to finish.
    proc.communicate()
    assert proc.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # Each checker is applied to the next communication, in order.
    for check_comm in (assert_first_comm, assert_second_comm):
        (comm, _) = it.next()
        check_comm(comm)

    # The iterator must be exhausted after the two communications.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def test_create_comm_tarball_log_every(output_file, text_l0, text_l1):
    """Run create-comm-tarball.py with INFO logging at interval 1.

    Checks that the script exits with status 0, that stderr contains at
    least two lines mentioning 'INFO', and that the output holds exactly
    two validating communications with the expected ids and texts and no
    section list.
    """
    command = [
        'scripts/create-comm-tarball.py',
        '--log-level', 'INFO',
        '--log-interval', '1',
        'tests/testdata/les-deux-chandeliers.tar.gz',
        output_file,
    ]
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    (out, err) = proc.communicate()
    assert proc.returncode == 0

    info_lines = [
        line for line in err.strip().split('\n') if 'INFO' in line
    ]
    assert len(info_lines) >= 2

    reader = CommunicationReader(output_file)
    it = iter(reader)

    expected = (
        ('les-deux-chandeliers/l0.txt', text_l0),
        ('les-deux-chandeliers/l1.txt', text_l1),
    )
    for expected_id, expected_text in expected:
        (comm, _) = it.next()
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        assert comm.sectionList is None

    # The iterator must be exhausted after the expected communications.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def test_tweets2concrete_incomplete_gz_multiproc(output_file):
    """Run tweets2concrete.py on tweets.json.incomplete.gz with two processes.

    The script is invoked with '--num-proc 2' and '--catch-ioerror'; it
    must exit with status 0 and the output must hold exactly the two
    expected communications.
    """
    command = [
        'scripts/tweets2concrete.py',
        '--num-proc', '2',
        '--catch-ioerror',
        'tests/testdata/tweets.json.incomplete.gz',
        output_file,
    ]
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    # communicate() drains both pipes and waits for the script to finish.
    proc.communicate()
    assert proc.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # Each checker is applied to the next communication, in order.
    for check_comm in (assert_first_comm, assert_second_comm):
        (comm, _) = it.next()
        check_comm(comm)

    # The iterator must be exhausted after the two communications.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def test_tweets2concrete_stdout(output_file):
    """Run tweets2concrete.py with '-' as the output argument.

    The bytes captured from the script's stdout are saved to
    ``output_file`` and read back; they must contain exactly the two
    expected communications.
    """
    command = [
        'scripts/tweets2concrete.py',
        'tests/testdata/tweets.json',
        '-',
    ]
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    (captured_out, captured_err) = proc.communicate()
    assert proc.returncode == 0

    # Persist the captured stdout so it can be read back from a path.
    with open(output_file, 'wb') as sink:
        sink.write(captured_out)

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # Each checker is applied to the next communication, in order.
    for check_comm in (assert_first_comm, assert_second_comm):
        (comm, _) = it.next()
        check_comm(comm)

    # The iterator must be exhausted after the two communications.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def test_compress_uuids_api(reader_kwargs, compress_kwargs):
    """Call compress_uuids on each communication of simple.tar.gz.

    For the three communications in the archive, the compressed
    communication must keep the original id ('one', 'two', 'three' in
    order) and must validate.  ``reader_kwargs`` and ``compress_kwargs``
    are forwarded to the reader and to compress_uuids respectively.
    """
    input_file = 'tests/testdata/simple.tar.gz'
    reader = CommunicationReader(input_file, **reader_kwargs)
    it = iter(reader)

    for expected_id in ('one', 'two', 'three'):
        (comm, _) = it.next()
        (new_comm, uc) = compress_uuids(comm, **compress_kwargs)
        assert new_comm.id == expected_id
        assert comm.id == new_comm.id
        assert validate_communication(new_comm)

    # The archive must not contain a fourth communication.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def test_create_comm_stdout(output_file, text):
    """Run create-comm.py with '-' as the output argument.

    The bytes captured from the script's stdout are saved to
    ``output_file`` and read back; they must contain exactly one validating
    communication with the expected id and text and no section list.
    """
    command = [
        'scripts/create-comm.py',
        'tests/testdata/les-deux-chandeliers.txt',
        '-',
    ]
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    (captured_out, captured_err) = proc.communicate()
    assert proc.returncode == 0

    # Persist the captured stdout so it can be read back from a path.
    with open(output_file, 'wb') as sink:
        sink.write(captured_out)

    reader = CommunicationReader(output_file)
    it = iter(reader)

    (comm, _) = it.next()
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert comm.sectionList is None

    # No further communications may follow.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def compress_uuids(input_path, output_path, verify=False, uuid_map_path=None,
                   single_analytic=False):
    """Rewrite every communication in ``input_path`` with compressed UUIDs.

    Each communication read from ``input_path`` is passed through
    ``_compress_uuids`` and the result is written to ``output_path`` under
    the same filename it was read with.

    Args:
        input_path: path handed to CommunicationReader (opened with
            add_references=False).
        output_path: path handed to CommunicationWriterTGZ.
        verify: forwarded to ``_compress_uuids``.
        uuid_map_path: if not None, a text file opened for writing that
            receives one 'old_uuid new_uuid' line per mapping, sorted by
            the string form of the new UUID within each communication.
        single_analytic: forwarded to ``_compress_uuids``.
    """
    reader = CommunicationReader(input_path, add_references=False)
    # NOTE(review): the writer is not explicitly closed here because its
    # close/context-manager API is not visible from this function; confirm
    # whether CommunicationWriterTGZ needs an explicit close().
    writer = CommunicationWriterTGZ(output_path)

    if uuid_map_path is None:
        uuid_map_file = None
    else:
        uuid_map_file = open(uuid_map_path, 'w')

    try:
        for (i, (comm, comm_filename)) in enumerate(reader):
            (new_comm, uc) = _compress_uuids(comm, verify=verify,
                                             single_analytic=single_analytic)

            # The total number of communications is unknown while
            # streaming, hence the '?' in the progress message.
            logging.info('compressed %s (%d analytics, %d uuids) (%d/?)',
                         comm.id, len(uc.augs), len(uc.uuid_map), i + 1)

            if uuid_map_file is not None:
                mappings = sorted(uc.uuid_map.items(),
                                  key=lambda p: str(p[1]))
                for (old_uuid, new_uuid) in mappings:
                    uuid_map_file.write('%s %s\n' % (old_uuid, new_uuid))

            writer.write(new_comm, comm_filename=comm_filename)
    finally:
        # Always release the map file so buffered lines are flushed to
        # disk, even if reading, compressing, or writing raised.
        if uuid_map_file is not None:
            uuid_map_file.close()
def test_tar(fifo):
    """Read a tar archive through ``fifo`` while a child process feeds it.

    A separate process runs ``write_fifo(input_path, fifo)``; meanwhile the
    reader, told explicitly that the input is FileType.TAR, must yield
    exactly three communications with ids 'one', 'two' and 'three'.
    """
    input_path = 'tests/testdata/simple.tar'
    feeder = Process(target=write_fifo, args=(input_path, fifo))
    # Start the feeding process before the reader is created on the fifo.
    feeder.start()

    reader = CommunicationReader(fifo, filetype=FileType.TAR)
    it = iter(reader)

    for expected_id in ('one', 'two', 'three'):
        (comm, path) = it.next()
        assert comm.id == expected_id

    # The archive must not contain a fourth communication.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted

    feeder.join()
def test_concatenated_gz_file_no_add_references(self):
    # Read a gzipped file of concatenated communications with
    # add_references disabled.  The first three communications must lack a
    # 'sentenceForUUID' attribute, carry the expected ids in order, and be
    # reported under the path that was opened.
    gz_path = u'tests/testdata/simple_concatenated.gz'
    comm_reader = CommunicationReader(gz_path, add_references=False)
    pairs = [(comm, name) for (comm, name) in comm_reader]
    comms, filenames = zip(*pairs)
    expected_ids = [u'one', u'two', u'three']
    for (index, expected_id) in enumerate(expected_ids):
        comm = comms[index]
        self.assertFalse(hasattr(comm, 'sentenceForUUID'))
        self.assertEqual(expected_id, comm.id)
        self.assertEqual(gz_path, filenames[index])
def test_concatenated_bz2_file(self):
    # Read a bz2 file of concatenated communications with the reader's
    # default arguments.  The first three communications must have a
    # 'sentenceForUUID' attribute, carry the expected ids in order, and be
    # reported under the path that was opened.
    bz2_path = u'tests/testdata/simple_concatenated.bz2'
    comm_reader = CommunicationReader(bz2_path)
    pairs = [(comm, name) for (comm, name) in comm_reader]
    comms, filenames = zip(*pairs)
    expected_ids = [u'one', u'two', u'three']
    for (index, expected_id) in enumerate(expected_ids):
        comm = comms[index]
        self.assertTrue(hasattr(comm, 'sentenceForUUID'))
        self.assertEqual(expected_id, comm.id)
        self.assertEqual(bz2_path, filenames[index])
def get_count(data_file):
    """Count the records in ``data_file`` and log the total.

    A path ending in "tgz" is iterated with CommunicationReader; any other
    path is opened with gzip and iterated through ``reader``.

    Args:
        data_file: path of the file to count.

    Returns:
        The number of items yielded while iterating the file.
    """
    if data_file.endswith("tgz"):
        n = sum(1 for _ in CommunicationReader(data_file))
    else:
        raw = gzip.open(data_file)
        try:
            n = sum(1 for _ in reader(raw))
        finally:
            # Close the gzip handle instead of leaving it to the garbage
            # collector.
            raw.close()
    logging.info("File %s contains %d Communications", data_file, n)
    return n
def test_create_comm_tarball_per_line(output_file, text_l0, text_l1_s0,
                                      text_l1_s1, text_l1_s2, text_l1_s3,
                                      text_l1_s4):
    """Run create-comm-tarball.py with --per-line and check its output.

    The script must exit with status 0 and the output must hold exactly six
    validating communications, in order, with the expected ids and texts
    and no section list.
    """
    command = [
        'scripts/create-comm-tarball.py',
        '--per-line',
        'tests/testdata/les-deux-chandeliers-perline.tar.gz',
        output_file,
    ]
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    # communicate() drains both pipes and waits for the script to finish.
    proc.communicate()
    assert proc.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # Expected (id, text) of each communication, in output order.
    expected = (
        ('les-deux-chandeliers-perline/l0.txt/0', text_l0),
        ('les-deux-chandeliers-perline/l1.txt/0', text_l1_s0),
        ('les-deux-chandeliers-perline/l1.txt/1', text_l1_s1),
        ('les-deux-chandeliers-perline/l1.txt/2', text_l1_s2),
        ('les-deux-chandeliers-perline/l1.txt/3', text_l1_s3),
        ('les-deux-chandeliers-perline/l1.txt/4', text_l1_s4),
    )
    for expected_id, expected_text in expected:
        (comm, _) = it.next()
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        assert comm.sectionList is None

    # The iterator must be exhausted after the expected communications.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def test_tar_gz_file(self):
    # Read a .tar.gz archive with the reader's default arguments and check,
    # for its first three communications, the 'sentenceForUUID' attribute,
    # the ids, and the filenames reported by the reader.
    archive_reader = CommunicationReader("tests/testdata/simple.tar.gz")
    pairs = [(comm, name) for (comm, name) in archive_reader]
    comms, filenames = zip(*pairs)

    expected_ids = [u'one', u'two', u'three']
    expected_filenames = [
        u'simple_1.concrete',
        u'simple_2.concrete',
        u'simple_3.concrete',
    ]

    # Checks are grouped by kind: attributes first, then ids, then names.
    for index in range(3):
        self.assertTrue(hasattr(comms[index], 'sentenceForUUID'))
    for (index, expected_id) in enumerate(expected_ids):
        self.assertEqual(expected_id, comms[index].id)
    for (index, expected_name) in enumerate(expected_filenames):
        self.assertEqual(expected_name, filenames[index])
def test_explicit_nested_tar_file(self):
    # Read a tar archive whose members sit in nested directories, passing
    # the file type explicitly as FileType.TAR.  The first three
    # communications must have a 'sentenceForUUID' attribute and the
    # expected ids, and the reported filenames must include the directory
    # part of each member path.
    archive_reader = CommunicationReader("tests/testdata/simple_nested.tar",
                                         filetype=FileType.TAR)
    pairs = [(comm, name) for (comm, name) in archive_reader]
    comms, filenames = zip(*pairs)

    expected_ids = [u'one', u'two', u'three']
    expected_filenames = [
        u'a/b/simple_1.concrete',
        u'a/c/simple_2.concrete',
        u'a/c/simple_3.concrete',
    ]

    # Checks are grouped by kind: attributes first, then ids, then names.
    for index in range(3):
        self.assertTrue(hasattr(comms[index], 'sentenceForUUID'))
    for (index, expected_id) in enumerate(expected_ids):
        self.assertEqual(expected_id, comms[index].id)
    for (index, expected_name) in enumerate(expected_filenames):
        self.assertEqual(expected_name, filenames[index])
def test_zip_file_no_add_references(self):
    # Read a zip archive with add_references disabled.  The first three
    # communications must lack a 'sentenceForUUID' attribute and have the
    # expected ids and reported filenames.
    archive_reader = CommunicationReader("tests/testdata/simple.zip",
                                         add_references=False)
    pairs = [(comm, name) for (comm, name) in archive_reader]
    comms, filenames = zip(*pairs)

    expected_ids = [u'one', u'two', u'three']
    expected_filenames = [
        u'simple_1.concrete',
        u'simple_2.concrete',
        u'simple_3.concrete',
    ]

    # Checks are grouped by kind: attributes first, then ids, then names.
    for index in range(3):
        self.assertFalse(hasattr(comms[index], 'sentenceForUUID'))
    for (index, expected_id) in enumerate(expected_ids):
        self.assertEqual(expected_id, comms[index].id)
    for (index, expected_name) in enumerate(expected_filenames):
        self.assertEqual(expected_name, filenames[index])
def test_compress_uuids(output_file, args):
    """Run compress-uuids.py on simple.tar.gz and check its output.

    ``args`` holds extra command-line arguments appended after the input
    and output paths.  The script must exit with status 0; the output must
    hold exactly three validating communications with the expected
    filenames and ids; and the output file must be smaller than the input.
    """
    input_file = 'tests/testdata/simple.tar.gz'
    command = ['scripts/compress-uuids.py', input_file, output_file]
    command = command + list(args)
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    # communicate() drains both pipes and waits for the script to finish.
    proc.communicate()
    assert proc.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    expected = (
        ('simple_1.concrete', 'one'),
        ('simple_2.concrete', 'two'),
        ('simple_3.concrete', 'three'),
    )
    for expected_filename, expected_id in expected:
        (comm, comm_filename) = it.next()
        assert comm_filename == expected_filename
        assert comm.id == expected_id
        assert validate_communication(comm)

    output_size = os.stat(output_file).st_size
    input_size = os.stat(input_file).st_size
    assert output_size < input_size

    # The output must not contain a fourth communication.
    exhausted = False
    try:
        it.next()
    except StopIteration:
        exhausted = True
    assert exhausted
def _collect_items(records, nums, conc, tag_type):
    """Return (cid, label, text) for each record whose position is in nums."""
    items = []
    for n, item in enumerate(records):
        if n not in nums:
            continue
        if conc:
            # Each item is indexed at [0] to obtain the communication.
            comm = item[0]
            text = comm.text.lower()
            cid = comm.id
            # Use the tag list of the first tagging of the requested type.
            labels = [
                t for t in comm.communicationTaggingList
                if t.taggingType == tag_type
            ][0].tagList
            label = "_".join(sorted(labels))
        else:
            # Plain records are tab-separated: id, label, text.
            cid, label, text = item.rstrip("\n").split("\t")
        items.append((cid, label, text))
    return items


def read_data(data_file, num_file, tag_type, of_interest=None):
    """Read the records of ``data_file`` selected by ``num_file``.

    Args:
        data_file: path of the data; a path ending in "tgz" is read with
            CommunicationReader, anything else is opened with gzip and
            iterated through ``reader``.
        num_file: gzip file with one integer per line; only records whose
            zero-based position appears in it are returned.
        tag_type: tagging type used to pick the label of a communication
            (only used for "tgz" input).
        of_interest: unused; kept for interface compatibility.

    Returns:
        A list of (cid, label, text) tuples in file order.  For "tgz" input
        the text is lower-cased and the label is the sorted tag list joined
        with "_".
    """
    conc = data_file.endswith("tgz")
    with gzip.open(num_file) as ifd:
        nums = set(int(line.rstrip(b"\n")) for line in ifd)

    if conc:
        items = _collect_items(CommunicationReader(data_file), nums, True,
                               tag_type)
    else:
        raw = gzip.open(data_file)
        try:
            items = _collect_items(reader(raw), nums, False, tag_type)
        finally:
            # Close the gzip handle instead of leaving it to the garbage
            # collector.
            raw.close()

    logging.info("Read %d Communications", len(items))
    return items
def test_CommunicationReader_single_file_unicode():
    """Read a single .concrete file and check its one communication's id."""
    file_reader = CommunicationReader(
        "tests/testdata/les-deux-chandeliers.concrete")
    # Materialise every (communication, filename) pair, then transpose the
    # pairs into two parallel tuples.
    pairs = [(comm, name) for (comm, name) in file_reader]
    comms, filenames = zip(*pairs)
    assert len(comms) == 1
    assert 'tests/testdata/les-deux-chandeliers.txt' == comms[0].id
#!/usr/bin/env python
import gzip
import codecs
import random
from concrete.util.file_io import CommunicationReader
from .io import reader, writer

if __name__ == "__main__":
    import argparse

    # Command line: -i/--input is the path given to CommunicationReader,
    # -o/--output is the gzip file that receives the indices.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-i", "--input", dest="input")
    arg_parser.add_argument("-o", "--output", dest="output")
    args = arg_parser.parse_args()

    # Write the zero-based position of every item yielded by the reader,
    # one per line; the items themselves are not used.
    with writer(gzip.open(args.output, "w")) as sink:
        for index, _ in enumerate(CommunicationReader(args.input)):
            sink.write("%d\n" % (index))