예제 #1
0
 def test_single_gz_file_no_add_references(self):
     """Read one gzipped Communication with ``add_references=False``.

     Checks that the Communication has no ``sentenceForUUID`` attribute,
     that its id is ``u"one"``, and that the filename returned by the
     reader equals the path that was opened.
     """
     filename = u"tests/testdata/simple_1.concrete.gz"
     reader = CommunicationReader(filename, add_references=False)
     # Built-in next() works on both Python 2 and 3; the ``.next()`` method
     # belongs to the Python 2 iterator protocol only.
     (comm1, comm1_filename) = next(iter(reader))
     self.assertFalse(hasattr(comm1, "sentenceForUUID"))
     self.assertEqual(u"one", comm1.id)
     self.assertEqual(filename, comm1_filename)
예제 #2
0
 def test_explicit_single_gz_file(self):
     """Read one gzipped Communication with an explicit ``filetype``.

     The reader is constructed with ``filetype=FileType.STREAM_GZ``; the
     test checks that the Communication has a ``sentenceForUUID``
     attribute, that its id is ``u"one"``, and that the returned filename
     equals the path that was opened.
     """
     filename = u"tests/testdata/simple_1.concrete.gz"
     reader = CommunicationReader(filename, filetype=FileType.STREAM_GZ)
     # Built-in next() works on both Python 2 and 3; the ``.next()`` method
     # belongs to the Python 2 iterator protocol only.
     (comm1, comm1_filename) = next(iter(reader))
     self.assertTrue(hasattr(comm1, "sentenceForUUID"))
     self.assertEqual(u"one", comm1.id)
     self.assertEqual(filename, comm1_filename)
예제 #3
0
 def test_single_bz2_file(self):
     """Read one Communication from a ``.bz2`` file with default options.

     Checks that the Communication has a ``sentenceForUUID`` attribute,
     that its id is ``u"one"``, and that the returned filename equals the
     path that was opened.
     """
     filename = u"tests/testdata/simple_1.concrete.bz2"
     reader = CommunicationReader(filename)
     # Built-in next() works on both Python 2 and 3; the ``.next()`` method
     # belongs to the Python 2 iterator protocol only.
     (comm1, comm1_filename) = next(iter(reader))
     self.assertTrue(hasattr(comm1, "sentenceForUUID"))
     self.assertEqual(u"one", comm1.id)
     self.assertEqual(filename, comm1_filename)
예제 #4
0
def test_CommunicationReader_tar_gz_file_unicode():
    """Read every Communication from the les-deux-chandeliers tar.gz.

    Expects exactly two Communications whose ids are the ``l0.txt`` and
    ``l1.txt`` member paths, in that order.
    """
    archive_path = "tests/testdata/les-deux-chandeliers.concrete.tar.gz"
    reader = CommunicationReader(archive_path)
    # Transpose the (communication, filename) pairs into two tuples.
    comms, filenames = zip(*((comm, name) for (comm, name) in reader))
    assert len(comms) == 2
    first, second = comms[0], comms[1]
    assert 'les-deux-chandeliers/l0.txt' == first.id
    assert 'les-deux-chandeliers/l1.txt' == second.id
def test_create_comm_tarball(output_file, text_l0, text_l1):
    """Run create-comm-tarball.py and check the Communications it writes.

    ``output_file`` is the path the script writes to; ``text_l0`` and
    ``text_l1`` are the expected texts of the two Communications
    (presumably pytest fixtures -- TODO confirm against the test module).
    """
    p = Popen([
        'scripts/create-comm-tarball.py',
        'tests/testdata/les-deux-chandeliers.tar.gz', output_file
    ],
              stdout=PIPE,
              stderr=PIPE)
    (stdout, stderr) = p.communicate()
    # Show the script's stderr on failure instead of discarding it.
    assert p.returncode == 0, stderr

    it = iter(CommunicationReader(output_file))

    expected = [
        ('les-deux-chandeliers/l0.txt', text_l0),
        ('les-deux-chandeliers/l1.txt', text_l1),
    ]
    for (expected_id, expected_text) in expected:
        # Built-in next() is portable; ``it.next()`` is Python 2 only.
        (comm, _) = next(it)
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        assert comm.sectionList is None

    # The reader must be exhausted after the expected Communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
def test_create_comm_annotated(output_file, text):
    """Run create-comm.py with ``--annotation-level section``.

    Checks that the single Communication written to ``output_file`` has
    the input path as its id, validates, carries ``text`` and has two
    entries in ``sectionList``. ``output_file`` and ``text`` are supplied
    by the caller (presumably pytest fixtures -- TODO confirm).
    """
    p = Popen([
        'scripts/create-comm.py', '--annotation-level', 'section',
        'tests/testdata/les-deux-chandeliers.txt', output_file
    ],
              stdout=PIPE,
              stderr=PIPE)
    (stdout, stderr) = p.communicate()
    # Show the script's stderr on failure instead of discarding it.
    assert p.returncode == 0, stderr

    it = iter(CommunicationReader(output_file))

    # Built-in next() is portable; ``it.next()`` is Python 2 only.
    (comm, _) = next(it)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert len(comm.sectionList) == 2

    # The reader must be exhausted after the single Communication.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
def test_tweets2concrete_log_every(output_file):
    """Run tweets2concrete.py with INFO logging at interval 1.

    Checks that at least two stderr lines contain ``INFO`` and that
    ``output_file`` holds exactly the two expected Communications.
    """
    p = Popen([
        'scripts/tweets2concrete.py',
        '--log-level', 'INFO',
        '--log-interval', '1',
        'tests/testdata/tweets.json',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    # Show the script's stderr on failure instead of discarding it.
    assert p.returncode == 0, stderr
    # communicate() returns bytes on Python 3, so split and match with
    # bytes literals (these are plain ``str`` on Python 2).
    info_lines = [line
                  for line in stderr.strip().split(b'\n')
                  if b'INFO' in line]
    assert len(info_lines) >= 2, stderr

    it = iter(CommunicationReader(output_file))

    for check_comm in (assert_first_comm, assert_second_comm):
        # Built-in next() is portable; ``it.next()`` is Python 2 only.
        (comm, _) = next(it)
        check_comm(comm)

    # The reader must be exhausted after the two Communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
def test_tweets2concrete_log_config(log_conf, output_file):
    """Run tweets2concrete.py with a logging configuration file.

    ``log_conf`` is a ``(log_conf_path, log_path)`` pair. The test checks
    that the script writes nothing to stdout or stderr, that the file at
    ``log_path`` has at least two lines containing ``INFO``, and that
    ``output_file`` holds exactly the two expected Communications.
    """
    (log_conf_path, log_path) = log_conf
    p = Popen([
        'scripts/tweets2concrete.py',
        '--log-conf-path', log_conf_path,
        '--log-interval', '1',
        'tests/testdata/tweets.json',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    # Attach the captured streams so a failure shows what the script said.
    assert p.returncode == 0, stderr
    assert len(stdout) == 0, stdout
    assert len(stderr) == 0, stderr

    with open(log_path) as f:
        data = f.read()
    info_lines = [line
                  for line in data.strip().split('\n')
                  if 'INFO' in line]
    assert len(info_lines) >= 2

    it = iter(CommunicationReader(output_file))

    for check_comm in (assert_first_comm, assert_second_comm):
        # Built-in next() is portable; ``it.next()`` is Python 2 only.
        (comm, _) = next(it)
        check_comm(comm)

    # The reader must be exhausted after the two Communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
def test_tweets2concrete_invalid(output_file):
    """Run tweets2concrete.py with ``--skip-invalid-comms``.

    The input is ``tweets.invalid.json``; the test checks that the script
    exits with status 0 and that ``output_file`` holds exactly the two
    expected Communications.
    """
    p = Popen([
        'scripts/tweets2concrete.py',
        '--skip-invalid-comms',
        'tests/testdata/tweets.invalid.json',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    # Show the script's stderr on failure instead of discarding it.
    assert p.returncode == 0, stderr

    it = iter(CommunicationReader(output_file))

    for check_comm in (assert_first_comm, assert_second_comm):
        # Built-in next() is portable; ``it.next()`` is Python 2 only.
        (comm, _) = next(it)
        check_comm(comm)

    # The reader must be exhausted after the two Communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
def test_create_comm_tarball_log_every(output_file, text_l0, text_l1):
    """Run create-comm-tarball.py with INFO logging at interval 1.

    Checks that at least two stderr lines contain ``INFO`` and that
    ``output_file`` holds exactly the two expected Communications with
    texts ``text_l0`` and ``text_l1`` and no section list.
    """
    p = Popen([
        'scripts/create-comm-tarball.py', '--log-level', 'INFO',
        '--log-interval', '1', 'tests/testdata/les-deux-chandeliers.tar.gz',
        output_file
    ],
              stdout=PIPE,
              stderr=PIPE)
    (stdout, stderr) = p.communicate()
    # Show the script's stderr on failure instead of discarding it.
    assert p.returncode == 0, stderr
    # communicate() returns bytes on Python 3, so split and match with
    # bytes literals (these are plain ``str`` on Python 2).
    info_lines = [line
                  for line in stderr.strip().split(b'\n')
                  if b'INFO' in line]
    assert len(info_lines) >= 2, stderr

    it = iter(CommunicationReader(output_file))

    expected = [
        ('les-deux-chandeliers/l0.txt', text_l0),
        ('les-deux-chandeliers/l1.txt', text_l1),
    ]
    for (expected_id, expected_text) in expected:
        # Built-in next() is portable; ``it.next()`` is Python 2 only.
        (comm, _) = next(it)
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        assert comm.sectionList is None

    # The reader must be exhausted after the expected Communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
def test_tweets2concrete_incomplete_gz_multiproc(output_file):
    """Run tweets2concrete.py on an incomplete gzip with two processes.

    The script is invoked with ``--num-proc 2`` and ``--catch-ioerror``;
    the test checks that it exits with status 0 and that ``output_file``
    holds exactly the two expected Communications.
    """
    p = Popen([
        'scripts/tweets2concrete.py',
        '--num-proc', '2',
        '--catch-ioerror',
        'tests/testdata/tweets.json.incomplete.gz',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    # Show the script's stderr on failure instead of discarding it.
    assert p.returncode == 0, stderr

    it = iter(CommunicationReader(output_file))

    for check_comm in (assert_first_comm, assert_second_comm):
        # Built-in next() is portable; ``it.next()`` is Python 2 only.
        (comm, _) = next(it)
        check_comm(comm)

    # The reader must be exhausted after the two Communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
def test_tweets2concrete_stdout(output_file):
    """Run tweets2concrete.py with ``-`` as the output argument.

    The bytes captured from the script's stdout are written to
    ``output_file``, which is then read back and must hold exactly the
    two expected Communications.
    """
    p = Popen([
        'scripts/tweets2concrete.py',
        'tests/testdata/tweets.json',
        '-'
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    # Show the script's stderr on failure instead of discarding it.
    assert p.returncode == 0, stderr

    with open(output_file, 'wb') as f:
        f.write(stdout)

    it = iter(CommunicationReader(output_file))

    for check_comm in (assert_first_comm, assert_second_comm):
        # Built-in next() is portable; ``it.next()`` is Python 2 only.
        (comm, _) = next(it)
        check_comm(comm)

    # The reader must be exhausted after the two Communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
예제 #13
0
def test_compress_uuids_api(reader_kwargs, compress_kwargs):
    """Call ``compress_uuids`` on each Communication of simple.tar.gz.

    ``reader_kwargs`` is forwarded to ``CommunicationReader`` and
    ``compress_kwargs`` to ``compress_uuids``. For each of the three
    Communications the test checks that the returned Communication keeps
    the original id (``one``, ``two``, ``three``) and validates.
    """
    input_file = 'tests/testdata/simple.tar.gz'
    it = iter(CommunicationReader(input_file, **reader_kwargs))

    for expected_id in ('one', 'two', 'three'):
        # Built-in next() is portable; ``it.next()`` is Python 2 only.
        (comm, _) = next(it)
        # The second element of the result is not inspected here.
        (new_comm, _uc) = compress_uuids(comm, **compress_kwargs)
        assert new_comm.id == expected_id
        assert comm.id == new_comm.id
        assert validate_communication(new_comm)

    # The reader must be exhausted after the three Communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
def test_create_comm_stdout(output_file, text):
    """Run create-comm.py with ``-`` as the output argument.

    The bytes captured from the script's stdout are written to
    ``output_file``, which is then read back and must hold exactly one
    Communication with the input path as id, text ``text`` and no
    section list.
    """
    p = Popen([
        'scripts/create-comm.py', 'tests/testdata/les-deux-chandeliers.txt',
        '-'
    ],
              stdout=PIPE,
              stderr=PIPE)
    (stdout, stderr) = p.communicate()
    # Show the script's stderr on failure instead of discarding it.
    assert p.returncode == 0, stderr

    with open(output_file, 'wb') as f:
        f.write(stdout)

    it = iter(CommunicationReader(output_file))

    # Built-in next() is portable; ``it.next()`` is Python 2 only.
    (comm, _) = next(it)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert comm.sectionList is None

    # The reader must be exhausted after the single Communication.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
예제 #15
0
def compress_uuids(input_path,
                   output_path,
                   verify=False,
                   uuid_map_path=None,
                   single_analytic=False):
    """Rewrite every Communication in ``input_path`` with compressed UUIDs.

    Each Communication is read without added references, passed through
    ``_compress_uuids`` and written to a .tar.gz at ``output_path`` under
    its original filename.

    Args:
        input_path: Path handed to ``CommunicationReader``.
        output_path: Path handed to ``CommunicationWriterTGZ``.
        verify: Forwarded to ``_compress_uuids``.
        uuid_map_path: If not None, a text file written with one
            ``old_uuid new_uuid`` line per mapping, sorted by the string
            form of the new UUID within each Communication.
        single_analytic: Forwarded to ``_compress_uuids``.
    """
    reader = CommunicationReader(input_path, add_references=False)
    writer = CommunicationWriterTGZ(output_path)
    uuid_map_file = None

    try:
        if uuid_map_path is not None:
            uuid_map_file = open(uuid_map_path, 'w')

        for (i, (comm, comm_filename)) in enumerate(reader):
            (new_comm, uc) = _compress_uuids(comm,
                                             verify=verify,
                                             single_analytic=single_analytic)

            logging.info('compressed %s (%d analytics, %d uuids) (%d/?)',
                         comm.id, len(uc.augs), len(uc.uuid_map), i + 1)

            if uuid_map_file is not None:
                for (old_uuid, new_uuid) in sorted(uc.uuid_map.items(),
                                                   key=lambda p: str(p[1])):
                    uuid_map_file.write('%s %s\n' % (old_uuid, new_uuid))

            writer.write(new_comm, comm_filename=comm_filename)
    finally:
        # Close both outputs even when a Communication fails part-way, so
        # the map file is flushed and the archive is finalized.
        if uuid_map_file is not None:
            uuid_map_file.close()
        writer.close()
예제 #16
0
def test_tar(fifo):
    """Read a tar archive through a fifo fed by a separate process.

    A ``Process`` running ``write_fifo(input_path, fifo)`` is started,
    then ``CommunicationReader`` reads ``fifo`` as ``FileType.TAR`` and
    must yield exactly the Communications ``one``, ``two`` and ``three``.
    """
    input_path = 'tests/testdata/simple.tar'
    p = Process(target=write_fifo, args=(input_path, fifo))
    p.start()

    it = iter(CommunicationReader(fifo, filetype=FileType.TAR))

    for expected_id in ('one', 'two', 'three'):
        # Built-in next() is portable; ``it.next()`` is Python 2 only.
        (comm, _path) = next(it)
        assert comm.id == expected_id

    # The reader must be exhausted after the three Communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'

    # Joined only on success: after a failed assertion the writer could
    # still be blocked on the fifo, and joining it would hang the test.
    p.join()
예제 #17
0
 def test_concatenated_gz_file_no_add_references(self):
     """Read a concatenated .gz stream with ``add_references=False``.

     The first three Communications must lack ``sentenceForUUID``, have
     ids ``one``/``two``/``three`` in order, and all report the opened
     path as their filename.
     """
     filename = u'tests/testdata/simple_concatenated.gz'
     reader = CommunicationReader(filename, add_references=False)
     # Transpose the (communication, filename) pairs into two tuples.
     comms, filenames = zip(*((comm, name) for (comm, name) in reader))
     expected_ids = [u'one', u'two', u'three']
     for (idx, expected_id) in enumerate(expected_ids):
         comm = comms[idx]
         self.assertFalse(hasattr(comm, 'sentenceForUUID'))
         self.assertEqual(expected_id, comm.id)
         self.assertEqual(filename, filenames[idx])
예제 #18
0
 def test_concatenated_bz2_file(self):
     """Read a concatenated .bz2 stream with default options.

     The first three Communications must have ``sentenceForUUID``, have
     ids ``one``/``two``/``three`` in order, and all report the opened
     path as their filename.
     """
     filename = u'tests/testdata/simple_concatenated.bz2'
     reader = CommunicationReader(filename)
     # Transpose the (communication, filename) pairs into two tuples.
     comms, filenames = zip(*((comm, name) for (comm, name) in reader))
     expected_ids = [u'one', u'two', u'three']
     for (idx, expected_id) in enumerate(expected_ids):
         comm = comms[idx]
         self.assertTrue(hasattr(comm, 'sentenceForUUID'))
         self.assertEqual(expected_id, comm.id)
         self.assertEqual(filename, filenames[idx])
예제 #19
0
def get_count(data_file):
    """Count the items in ``data_file``, log the total and return it.

    A path ending in ``"tgz"`` is read with ``CommunicationReader``; any
    other path is opened with ``gzip.open`` and wrapped with ``reader``.

    Args:
        data_file: Path of the file to count.

    Returns:
        The number of items produced by iterating the file.
    """
    if data_file.endswith("tgz"):
        n = sum(1 for _ in CommunicationReader(data_file))
    else:
        raw = gzip.open(data_file)
        try:
            n = sum(1 for _ in reader(raw))
        finally:
            # Release the gzip handle instead of leaving it to the GC.
            raw.close()
    logging.info("File %s contains %d Communications", data_file, n)
    return n
def test_create_comm_tarball_per_line(output_file, text_l0, text_l1_s0,
                                      text_l1_s1, text_l1_s2, text_l1_s3,
                                      text_l1_s4):
    """Run create-comm-tarball.py with ``--per-line``.

    Checks that ``output_file`` holds exactly six Communications: one for
    ``l0.txt`` and five for ``l1.txt``, each with an id of the form
    ``<member path>/<index>``, the matching expected text and no section
    list. The arguments are the output path and the expected texts
    (presumably pytest fixtures -- TODO confirm).
    """
    p = Popen([
        'scripts/create-comm-tarball.py', '--per-line',
        'tests/testdata/les-deux-chandeliers-perline.tar.gz', output_file
    ],
              stdout=PIPE,
              stderr=PIPE)
    (stdout, stderr) = p.communicate()
    # Show the script's stderr on failure instead of discarding it.
    assert p.returncode == 0, stderr

    it = iter(CommunicationReader(output_file))

    expected = [
        ('les-deux-chandeliers-perline/l0.txt/0', text_l0),
        ('les-deux-chandeliers-perline/l1.txt/0', text_l1_s0),
        ('les-deux-chandeliers-perline/l1.txt/1', text_l1_s1),
        ('les-deux-chandeliers-perline/l1.txt/2', text_l1_s2),
        ('les-deux-chandeliers-perline/l1.txt/3', text_l1_s3),
        ('les-deux-chandeliers-perline/l1.txt/4', text_l1_s4),
    ]
    for (expected_id, expected_text) in expected:
        # Built-in next() is portable; ``it.next()`` is Python 2 only.
        (comm, _) = next(it)
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        assert comm.sectionList is None

    # The reader must be exhausted after the expected Communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
예제 #21
0
 def test_tar_gz_file(self):
     """Read simple.tar.gz and check references, ids and member names.

     The first three Communications must have ``sentenceForUUID``, ids
     ``one``/``two``/``three`` and filenames ``simple_N.concrete``.
     """
     reader = CommunicationReader("tests/testdata/simple.tar.gz")
     # Transpose the (communication, filename) pairs into two tuples.
     comms, filenames = zip(*((comm, name) for (comm, name) in reader))
     for idx in range(3):
         self.assertTrue(hasattr(comms[idx], 'sentenceForUUID'))
     expected_ids = (u'one', u'two', u'three')
     for (idx, expected_id) in enumerate(expected_ids):
         self.assertEqual(expected_id, comms[idx].id)
     expected_names = (u'simple_1.concrete',
                       u'simple_2.concrete',
                       u'simple_3.concrete')
     for (idx, expected_name) in enumerate(expected_names):
         self.assertEqual(expected_name, filenames[idx])
예제 #22
0
 def test_explicit_nested_tar_file(self):
     """Read a nested tar with an explicit ``FileType.TAR`` filetype.

     The first three Communications must have ``sentenceForUUID``, ids
     ``one``/``two``/``three`` and filenames that keep their directory
     prefix inside the archive.
     """
     reader = CommunicationReader("tests/testdata/simple_nested.tar",
                                  filetype=FileType.TAR)
     # Transpose the (communication, filename) pairs into two tuples.
     comms, filenames = zip(*((comm, name) for (comm, name) in reader))
     for idx in range(3):
         self.assertTrue(hasattr(comms[idx], 'sentenceForUUID'))
     expected_ids = (u'one', u'two', u'three')
     for (idx, expected_id) in enumerate(expected_ids):
         self.assertEqual(expected_id, comms[idx].id)
     expected_names = (u'a/b/simple_1.concrete',
                       u'a/c/simple_2.concrete',
                       u'a/c/simple_3.concrete')
     for (idx, expected_name) in enumerate(expected_names):
         self.assertEqual(expected_name, filenames[idx])
예제 #23
0
 def test_zip_file_no_add_references(self):
     """Read simple.zip with ``add_references=False``.

     The first three Communications must lack ``sentenceForUUID`` and
     have ids ``one``/``two``/``three`` and filenames
     ``simple_N.concrete``.
     """
     reader = CommunicationReader("tests/testdata/simple.zip",
                                  add_references=False)
     # Transpose the (communication, filename) pairs into two tuples.
     comms, filenames = zip(*((comm, name) for (comm, name) in reader))
     for idx in range(3):
         self.assertFalse(hasattr(comms[idx], 'sentenceForUUID'))
     expected_ids = (u'one', u'two', u'three')
     for (idx, expected_id) in enumerate(expected_ids):
         self.assertEqual(expected_id, comms[idx].id)
     expected_names = (u'simple_1.concrete',
                       u'simple_2.concrete',
                       u'simple_3.concrete')
     for (idx, expected_name) in enumerate(expected_names):
         self.assertEqual(expected_name, filenames[idx])
예제 #24
0
def test_compress_uuids(output_file, args):
    """Run compress-uuids.py on simple.tar.gz with extra ``args``.

    Checks that ``output_file`` holds exactly the three Communications
    under their original filenames, that each validates, and that the
    output file is smaller than the input file.
    """
    input_file = 'tests/testdata/simple.tar.gz'

    p = Popen(['scripts/compress-uuids.py', input_file, output_file] +
              list(args),
              stdout=PIPE,
              stderr=PIPE)
    (stdout, stderr) = p.communicate()

    # Show the script's stderr on failure instead of discarding it.
    assert p.returncode == 0, stderr

    it = iter(CommunicationReader(output_file))

    expected = [
        ('simple_1.concrete', 'one'),
        ('simple_2.concrete', 'two'),
        ('simple_3.concrete', 'three'),
    ]
    for (expected_filename, expected_id) in expected:
        # Built-in next() is portable; ``it.next()`` is Python 2 only.
        (comm, comm_filename) = next(it)
        assert comm_filename == expected_filename
        assert comm.id == expected_id
        assert validate_communication(comm)

    assert os.stat(output_file).st_size < os.stat(input_file).st_size

    # The reader must be exhausted after the three Communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False, 'reader yielded more Communications than expected'
예제 #25
0
def read_data(data_file, num_file, tag_type, of_interest=None):
    """Load the records of ``data_file`` whose index is in ``num_file``.

    Args:
        data_file: Path of the data. A path ending in ``"tgz"`` is read
            with ``CommunicationReader``; any other path is opened with
            ``gzip.open`` and wrapped with ``reader``, and each item is
            split as a tab-separated ``cid, label, text`` line.
        num_file: Gzipped file with one integer index per line; only
            items at these zero-based positions are kept.
        tag_type: ``taggingType`` of the communication tagging whose
            ``tagList`` supplies the label (used on the "tgz" path only).
        of_interest: Accepted but not used by this function.

    Returns:
        A list of ``(cid, label, text)`` tuples in file order.

    Raises:
        IndexError: On the "tgz" path, if a selected Communication has
            no tagging whose ``taggingType`` equals ``tag_type``.
    """
    conc = data_file.endswith("tgz")
    with gzip.open(num_file) as ifd:
        nums = set(int(n.rstrip(b"\n")) for n in ifd)
    items = []
    # Keep the raw gzip handle so it can be closed once iteration ends.
    raw = None if conc else gzip.open(data_file)
    try:
        ifd = CommunicationReader(data_file) if conc else reader(raw)
        for n, item in enumerate(ifd):
            if n not in nums:
                continue
            if conc:
                # Each item is a tuple whose first element is the
                # Communication.
                comm = item[0]
                text = comm.text.lower()
                cid = comm.id
                labels = [
                    t for t in comm.communicationTaggingList
                    if t.taggingType == tag_type
                ][0].tagList
                label = "_".join(sorted(labels))
            else:
                cid, label, text = item.rstrip("\n").split("\t")
            items.append((cid, label, text))
    finally:
        if raw is not None:
            raw.close()
    logging.info("Read %d Communications", len(items))
    return items
예제 #26
0
def test_CommunicationReader_single_file_unicode():
    """Read the single-Communication les-deux-chandeliers file.

    Expects exactly one Communication whose id is the original text
    file path.
    """
    concrete_path = "tests/testdata/les-deux-chandeliers.concrete"
    reader = CommunicationReader(concrete_path)
    # Transpose the (communication, filename) pairs into two tuples.
    comms, filenames = zip(*((comm, name) for (comm, name) in reader))
    assert len(comms) == 1
    only_comm = comms[0]
    assert 'tests/testdata/les-deux-chandeliers.txt' == only_comm.id
예제 #27
0
#!/usr/bin/env python

import gzip
import codecs
import random
from concrete.util.file_io import CommunicationReader
from .io import reader, writer

if __name__ == "__main__":

    import argparse

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-i", "--input", dest="input")
    arg_parser.add_argument("-o", "--output", dest="output")
    cli_args = arg_parser.parse_args()

    # Write one zero-based index per line, one for each item read from
    # the input, to the gzip-compressed output.
    with writer(gzip.open(cli_args.output, "w")) as out_stream:
        for index, _item in enumerate(CommunicationReader(cli_args.input)):
            out_stream.write("%d\n" % (index))