def test_create_comm_tarball_log_every(output_file, text_l0, text_l1):
    """With --log-interval 1, create-comm-tarball.py emits at least one
    INFO log line per input file, and the output tarball holds the two
    expected communications."""
    p = Popen([
        'scripts/create-comm-tarball.py', '--log-level', 'INFO',
        '--log-interval', '1', 'tests/testdata/les-deux-chandeliers.tar.gz',
        output_file
    ],
              stdout=PIPE,
              stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0
    # communicate() returns bytes on Python 3; decode before searching
    # for the 'INFO' substring so the check works on both 2 and 3.
    log_lines = stderr.decode('utf-8').strip().split('\n')
    assert len([line for line in log_lines if 'INFO' in line]) >= 2

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # next(it) works on both Python 2 and 3; it.next() is Python-2 only.
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l0.txt'
    assert validate_communication(comm)
    assert comm.text == text_l0
    assert comm.sectionList is None

    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l1.txt'
    assert validate_communication(comm)
    assert comm.text == text_l1
    assert comm.sectionList is None

    # the archive must contain exactly two communications
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_compress_uuids(output_file, args):
    """scripts/compress-uuids.py rewrites the archive with compressed
    UUIDs while preserving filenames, ids, and validity; the result
    must be smaller than the input."""
    input_file = "tests/testdata/simple.tar.gz"

    p = Popen(["scripts/compress-uuids.py", input_file, output_file] + list(args), stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()

    assert p.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # next(it) works on both Python 2 and 3; it.next() is Python-2 only.
    (comm, comm_filename) = next(it)
    assert comm_filename == "simple_1.concrete"
    assert comm.id == "one"
    assert validate_communication(comm)

    (comm, comm_filename) = next(it)
    assert comm_filename == "simple_2.concrete"
    assert comm.id == "two"
    assert validate_communication(comm)

    (comm, comm_filename) = next(it)
    assert comm_filename == "simple_3.concrete"
    assert comm.id == "three"
    assert validate_communication(comm)

    # compression should strictly shrink the archive
    assert os.stat(output_file).st_size < os.stat(input_file).st_size

    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
示例#3
0
def test_create_comm_tarball_log_every(output_file, text_l0, text_l1):
    """With --log-interval 1, create-comm-tarball.py logs at least one
    INFO line per input file and writes both expected communications."""
    p = Popen([
        'scripts/create-comm-tarball.py',
        '--log-level', 'INFO',
        '--log-interval', '1',
        'tests/testdata/les-deux-chandeliers.tar.gz',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0
    # communicate() returns bytes on Python 3; decode before the 'INFO'
    # substring test so this works on both Python 2 and 3.
    assert len([line
                for line in stderr.decode('utf-8').strip().split('\n')
                if 'INFO' in line]) >= 2

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # next(it) works on both Python 2 and 3; it.next() is Python-2 only.
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l0.txt'
    assert validate_communication(comm)
    assert comm.text == text_l0
    assert comm.sectionList is None

    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l1.txt'
    assert validate_communication(comm)
    assert comm.text == text_l1
    assert comm.sectionList is None

    # exactly two communications expected
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
示例#4
0
def test_compress_uuids_api(reader_kwargs, compress_kwargs):
    """compress_uuids keeps each communication's id and produces a
    valid communication, for every entry in the simple test archive."""
    reader = CommunicationReader('tests/testdata/simple.tar.gz',
                                 **reader_kwargs)
    stream = iter(reader)

    for expected_id in ('one', 'two', 'three'):
        (comm, _) = next(stream)
        (new_comm, uc) = compress_uuids(comm, **compress_kwargs)
        assert new_comm.id == expected_id
        assert new_comm.id == comm.id
        assert validate_communication(new_comm)

    # the reader yields exactly three communications
    assert list(stream) == []
def test_compress_uuids_api(reader_kwargs, compress_kwargs):
    """compress_uuids preserves each communication's id and validity
    for every entry in the simple test archive."""
    input_file = "tests/testdata/simple.tar.gz"
    reader = CommunicationReader(input_file, **reader_kwargs)
    it = iter(reader)

    # next(it) works on both Python 2 and 3; it.next() is Python-2 only.
    (comm, _) = next(it)
    (new_comm, uc) = compress_uuids(comm, **compress_kwargs)
    assert new_comm.id == "one"
    assert comm.id == new_comm.id
    assert validate_communication(new_comm)

    (comm, _) = next(it)
    (new_comm, uc) = compress_uuids(comm, **compress_kwargs)
    assert new_comm.id == "two"
    assert comm.id == new_comm.id
    assert validate_communication(new_comm)

    (comm, _) = next(it)
    (new_comm, uc) = compress_uuids(comm, **compress_kwargs)
    assert new_comm.id == "three"
    assert comm.id == new_comm.id
    assert validate_communication(new_comm)

    # exactly three communications expected
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
示例#6
0
def test_create_comm_tarball(output_file, text_l0, text_l1):
    """End-to-end check of scripts/create-comm-tarball.py on a .tar.gz
    input: two unsectioned communications with the expected ids/texts."""
    p = Popen([
        'scripts/create-comm-tarball.py',
        'tests/testdata/les-deux-chandeliers.tar.gz',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # next(it) works on both Python 2 and 3; it.next() is Python-2 only.
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l0.txt'
    assert validate_communication(comm)
    assert comm.text == text_l0
    assert comm.sectionList is None

    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l1.txt'
    assert validate_communication(comm)
    assert comm.text == text_l1
    assert comm.sectionList is None

    # exactly two communications expected
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
示例#7
0
def test_create_comm_tarball(output_file, text_l0, text_l1):
    """End-to-end check of scripts/create-comm-tarball.py: the output
    archive holds exactly the two expected unsectioned communications."""
    proc = Popen([
        sys.executable, 'scripts/create-comm-tarball.py',
        'tests/testdata/les-deux-chandeliers.tar.gz', output_file
    ],
                 stdout=PIPE,
                 stderr=PIPE)
    (stdout, stderr) = proc.communicate()
    assert proc.returncode == 0

    stream = iter(CommunicationReader(output_file))

    expected = [
        ('les-deux-chandeliers/l0.txt', text_l0),
        ('les-deux-chandeliers/l1.txt', text_l1),
    ]
    for (expected_id, expected_text) in expected:
        (comm, _) = next(stream)
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        assert comm.sectionList is None

    # no further communications in the archive
    assert list(stream) == []
def test_create_comm_tarball_stdin(output_file, text_l0, text_l1):
    """create-comm-tarball.py reads the input tarball from stdin when
    the input path is '-'."""
    p = Popen(['scripts/create-comm-tarball.py', '-', output_file],
              stdin=PIPE,
              stdout=PIPE,
              stderr=PIPE)
    with open('tests/testdata/les-deux-chandeliers.tar.gz', 'rb') as f:
        (stdout, stderr) = p.communicate(f.read())
    assert p.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # next(it) works on both Python 2 and 3; it.next() is Python-2 only.
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l0.txt'
    assert validate_communication(comm)
    assert comm.text == text_l0
    assert comm.sectionList is None

    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l1.txt'
    assert validate_communication(comm)
    assert comm.text == text_l1
    assert comm.sectionList is None

    # exactly two communications expected
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_create_comm_tarball_annotated(output_file, text_l0, text_l1):
    """With --annotation-level section, each output communication
    carries a one-element sectionList."""
    p = Popen([
        'scripts/create-comm-tarball.py', '--annotation-level', 'section',
        'tests/testdata/les-deux-chandeliers.tar.gz', output_file
    ],
              stdout=PIPE,
              stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # next(it) works on both Python 2 and 3; it.next() is Python-2 only.
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l0.txt'
    assert validate_communication(comm)
    assert comm.text == text_l0
    assert len(comm.sectionList) == 1

    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l1.txt'
    assert validate_communication(comm)
    assert comm.text == text_l1
    assert len(comm.sectionList) == 1

    # exactly two communications expected
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
示例#10
0
def test_create_comm_complex_al_none():
    """AL_NONE stores the multi-paragraph text verbatim and builds no
    sections at all."""
    text = '\n\nsimple comm\t\t.\nor ...\n\nisit?\n'
    comm = create_comm('one', text, annotation_level=AL_NONE)
    assert comm.id == 'one'
    assert comm.text == text
    assert comm.sectionList is None
    assert validate_communication(comm)
示例#11
0
def test_create_comm_stdout(output_file, text):
    """create-comm.py writes a serialized communication to stdout when
    the output path is '-'."""
    p = Popen([
        'scripts/create-comm.py',
        'tests/testdata/les-deux-chandeliers.txt',
        '-'
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0

    # round-trip stdout through a file so CommunicationReader can open it
    with open(output_file, 'wb') as f:
        f.write(stdout)

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # next(it) works on both Python 2 and 3; it.next() is Python-2 only.
    (comm, _) = next(it)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert comm.sectionList is None

    # exactly one communication expected
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
示例#12
0
def test_create_comm_annotated(output_file, text):
    """create-comm.py --annotation-level section yields a single
    communication with two sections."""
    p = Popen([
        'scripts/create-comm.py',
        '--annotation-level', 'section',
        'tests/testdata/les-deux-chandeliers.txt',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # next(it) works on both Python 2 and 3; it.next() is Python-2 only.
    (comm, _) = next(it)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert len(comm.sectionList) == 2

    # exactly one communication expected
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
示例#13
0
def test_create_comm_complex_al_sentence():
    """AL_SENTENCE builds sections and sentences with the expected
    spans but leaves every sentence untokenized."""
    text = '\n\nsimple comm\t\t.\nor ...\n\nisit?\n'
    comm = create_comm('one', text, annotation_level=AL_SENTENCE)
    assert comm.id == 'one'
    assert comm.text == text

    # (section span, [sentence spans]) per expected section
    expected = [
        ((2, 23), [(2, 16), (17, 23)]),
        ((25, 30), [(25, 30)]),
    ]
    assert len(comm.sectionList) == len(expected)
    for sect, (sect_span, sent_spans) in zip(comm.sectionList, expected):
        assert (sect.textSpan.start, sect.textSpan.ending) == sect_span
        assert len(sect.sentenceList) == len(sent_spans)
        for sent, sent_span in zip(sect.sentenceList, sent_spans):
            assert (sent.textSpan.start, sent.textSpan.ending) == sent_span
            assert sent.tokenization is None

    assert validate_communication(comm)
def test_create_comm_stdout(output_file, text):
    """create-comm.py streams one serialized communication to stdout
    when '-' is given as the output path."""
    proc = Popen([
        sys.executable,
        'scripts/create-comm.py',
        'tests/testdata/les-deux-chandeliers.txt',
        '-'
    ], stdout=PIPE, stderr=PIPE)
    (captured, _err) = proc.communicate()
    assert proc.returncode == 0

    # persist the captured bytes so CommunicationReader can open them
    with open(output_file, 'wb') as sink:
        sink.write(captured)

    stream = iter(CommunicationReader(output_file))

    (comm, _) = next(stream)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert comm.sectionList is None

    # exactly one communication expected
    assert list(stream) == []
示例#15
0
def json_str_to_validated_concrete_bytes(tweet_str):
    """Convert a UTF-8 JSON tweet byte string into serialized
    Communication bytes, or None when conversion or validation fails."""
    comm = json_tweet_string_to_Communication(
        tweet_str.decode('utf-8'), True, True)
    if comm is not None and validate_communication(comm):
        return write_communication_to_buffer(comm)
    return None
示例#16
0
def test_create_comm_complex_al_sentence():
    """AL_SENTENCE: compare the full observed section/sentence span
    structure against the expected layout; no tokenizations are built."""
    text = '\n\nsimple comm\t\t.\nor ...\n\nisit?\n'
    comm = create_comm('one', text, annotation_level=AL_SENTENCE)
    assert 'one' == comm.id
    assert text == comm.text

    observed = [
        (sect.textSpan.start, sect.textSpan.ending,
         [(sent.textSpan.start, sent.textSpan.ending, sent.tokenization)
          for sent in sect.sentenceList])
        for sect in comm.sectionList
    ]
    assert observed == [
        (2, 23, [(2, 16, None), (17, 23, None)]),
        (25, 30, [(25, 30, None)]),
    ]

    assert validate_communication(comm)
def test_tweets2concrete_unicode(output_file):
    """tweets2concrete.py converts a unicode tweet JSON file into
    exactly two communications."""
    p = Popen([
        'scripts/tweets2concrete.py',
        'tests/testdata/tweets.unicode.json',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # next(it) works on both Python 2 and 3; it.next() is Python-2 only.
    (comm, _) = next(it)
    assert_first_comm(comm)
    assert validate_communication(comm)

    (comm, _) = next(it)
    assert_second_comm(comm)

    # exactly two communications expected
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
示例#18
0
def test_validate_minimal_communication_with_uuid():
    """A Communication carrying only id, metadata, type, and uuid
    passes validation."""
    comm = Communication()
    comm.uuid = generate_UUID()
    comm.id = "myID"
    comm.type = "Test Communication"
    comm.metadata = AnnotationMetadata(tool="TEST",
                                       timestamp=int(time.time()))
    assert validate_communication(comm)
示例#19
0
def json_str_to_validated_concrete_bytes(tweet_str):
    """Decode a UTF-8 JSON tweet, build a Communication, and return its
    serialized bytes; None signals a failed conversion or validation."""
    decoded = tweet_str.decode('utf-8')
    comm = json_tweet_string_to_Communication(decoded, True, True)
    if comm is None or not validate_communication(comm):
        return None
    return write_communication_to_buffer(comm)
def test_create_comm_annotated(output_file, text):
    """create-comm.py --annotation-level section writes one
    two-section communication."""
    proc = Popen([
        sys.executable,
        'scripts/create-comm.py',
        '--annotation-level', 'section',
        'tests/testdata/les-deux-chandeliers.txt',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (_out, _err) = proc.communicate()
    assert proc.returncode == 0

    stream = iter(CommunicationReader(output_file))

    (comm, _) = next(stream)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert len(comm.sectionList) == 2

    # exactly one communication expected
    assert list(stream) == []
def test_create_comm(output_file, text):
    """create-comm.py converts a single text file into one unsectioned
    communication."""
    p = Popen([
        'scripts/create-comm.py', 'tests/testdata/les-deux-chandeliers.txt',
        output_file
    ],
              stdout=PIPE,
              stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # next(it) works on both Python 2 and 3; it.next() is Python-2 only.
    (comm, _) = next(it)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert comm.sectionList is None

    # exactly one communication expected
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_zip_file_backed_comm_container_retrieve():
    """A zip-backed container exposes all three communications by id,
    and each retrieved communication validates."""
    cc = ZipFileBackedCommunicationContainer(u'tests/testdata/simple.zip')
    assert len(cc) == 3
    assert u'simple_1' in cc
    assert all(validate_communication(cc[comm_id]) for comm_id in cc)
def test_memory_backed_comm_container_retrieve():
    """A memory-backed container loaded from the simple archive exposes
    three valid communications."""
    cc = MemoryBackedCommunicationContainer(u'tests/testdata/simple.tar.gz')
    assert len(cc) == 3
    assert u'one' in cc
    assert all(validate_communication(cc[comm_id]) for comm_id in cc)
def test_directory_backed_comm_container_retrieve():
    """A directory-backed container exposes three valid communications
    from the test directory."""
    cc = DirectoryBackedCommunicationContainer(u'tests/testdata/a')
    assert len(cc) == 3
    assert u'simple_1' in cc
    assert all(validate_communication(cc[comm_id]) for comm_id in cc)
示例#25
0
def test_directory_backed_comm_container_retrieve():
    """Directory-backed container: three entries, known key present,
    every retrieved communication validates."""
    container = DirectoryBackedCommunicationContainer(u'tests/testdata/a')
    assert 3 == len(container)
    assert u'simple_1' in container
    for key in container:
        assert validate_communication(container[key])
示例#26
0
def test_memory_backed_comm_container_retrieve():
    """Memory-backed container: three entries, known key present, every
    retrieved communication validates."""
    container = MemoryBackedCommunicationContainer(
        u'tests/testdata/simple.tar.gz')
    assert 3 == len(container)
    assert u'one' in container
    for key in container:
        assert validate_communication(container[key])
示例#27
0
def test_zip_file_backed_comm_container_retrieve():
    """Zip-backed container: three entries, known key present, every
    retrieved communication validates."""
    container = ZipFileBackedCommunicationContainer(
        u'tests/testdata/simple.zip')
    assert 3 == len(container)
    assert u'simple_1' in container
    for key in container:
        assert validate_communication(container[key])
示例#28
0
def test_validate_minimal_communication_with_uuid():
    """The minimal valid Communication: id, metadata, type, and uuid
    are sufficient to pass validation."""
    comm = Communication()
    comm.type = "Test Communication"
    comm.id = "myID"
    comm.uuid = generate_UUID()
    comm.metadata = AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    assert validate_communication(comm)
示例#29
0
def test_create_comm_complex_al_none():
    """With annotation_level=AL_NONE the text is stored verbatim and
    sectionList stays unset."""
    comm = create_comm('one',
                       '\n\nsimple comm\t\t.\nor ...\n\nisit?\n',
                       annotation_level=AL_NONE)
    assert comm.sectionList is None
    assert comm.id == 'one'
    assert comm.text == '\n\nsimple comm\t\t.\nor ...\n\nisit?\n'
    assert validate_communication(comm)
def create_comm_from_tweet(json_tweet_string):
    """Create a Concrete Communication from a JSON Tweet string

    Args:
        json_tweet_string: A JSON string for a Tweet, using the JSON
            format specified by the Twitter API:
              https://dev.twitter.com/docs/platform-objects/tweets

    Returns:
        A Concrete Communication object
    """
    tweet_data = json.loads(json_tweet_string)

    # One UUID generator per analytic; the draw order of next(aug)
    # below determines which UUID each object receives, so the
    # statement order matters.
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = concrete.Communication()
    comm.id = "Annotation_Test_1"
    comm.metadata = concrete.AnnotationMetadata(
        tool="Annotation Example script",
        timestamp=int(time.time())
    )
    # Only the tweet's 'text' field is carried over into the Communication.
    comm.text = tweet_data['text']
    comm.type = "Tweet"
    comm.uuid = next(aug)

    # Single section containing a single sentence with one tokenization.
    comm.sectionList = [concrete.Section()]
    comm.sectionList[0].kind = "mySectionKind"
    comm.sectionList[0].uuid = next(aug)
    comm.sectionList[0].sentenceList = [concrete.Sentence()]
    comm.sectionList[0].sentenceList[0].uuid = next(aug)
    comm.sectionList[0].sentenceList[0].tokenization = concrete.Tokenization()

    tokenization = comm.sectionList[0].sentenceList[0].tokenization
    tokenization.kind = concrete.TokenizationKind.TOKEN_LIST
    tokenization.metadata = concrete.AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    tokenization.tokenList = concrete.TokenList()
    tokenization.tokenList.tokenList = []
    tokenization.uuid = next(aug)

    # Whitespace tokenization
    tokens = comm.text.split()

    for i, token_text in enumerate(tokens):
        t = concrete.Token()
        t.tokenIndex = i
        t.text = token_text
        tokenization.tokenList.tokenList.append(t)

    # Best-effort report: the Communication is returned either way.
    if validate_communication(comm):
        print("Created valid Communication")
    else:
        print("ERROR: Invalid Communication")

    return comm
示例#31
0
def create_comm_from_tweet(json_tweet_string):
    """Create a Concrete Communication from a JSON Tweet string

    Args:
        json_tweet_string: A JSON string for a Tweet, using the JSON
            format specified by the Twitter API:
              https://dev.twitter.com/docs/platform-objects/tweets

    Returns:
        A Concrete Communication object
    """
    tweet_data = json.loads(json_tweet_string)

    # One UUID generator per analytic; the draw order of next(aug)
    # below determines which UUID each object receives, so the
    # statement order matters.
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = concrete.Communication()
    comm.id = "Annotation_Test_1"
    comm.metadata = concrete.AnnotationMetadata(
        tool="Annotation Example script",
        timestamp=int(time.time())
    )
    # Only the tweet's 'text' field is carried over into the Communication.
    comm.text = tweet_data['text']
    comm.type = "Tweet"
    # next(aug) works on Python 2 and 3; aug.next() is Python-2 only.
    comm.uuid = next(aug)

    # Single section containing a single sentence with one tokenization.
    comm.sectionList = [concrete.Section()]
    comm.sectionList[0].kind = "mySectionKind"
    comm.sectionList[0].uuid = next(aug)
    comm.sectionList[0].sentenceList = [concrete.Sentence()]
    comm.sectionList[0].sentenceList[0].uuid = next(aug)
    comm.sectionList[0].sentenceList[0].tokenization = concrete.Tokenization()

    tokenization = comm.sectionList[0].sentenceList[0].tokenization
    tokenization.kind = concrete.TokenizationKind.TOKEN_LIST
    tokenization.metadata = concrete.AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    tokenization.tokenList = concrete.TokenList()
    tokenization.tokenList.tokenList = []
    tokenization.uuid = next(aug)

    # Whitespace tokenization
    tokens = comm.text.split()

    for i, token_text in enumerate(tokens):
        t = concrete.Token()
        t.tokenIndex = i
        t.text = token_text
        tokenization.tokenList.tokenList.append(t)

    # Best-effort report: the Communication is returned either way.
    # print() function form replaces the Python-2-only print statement.
    if validate_communication(comm):
        print("Created valid Communication")
    else:
        print("ERROR: Invalid Communication")

    return comm
示例#32
0
def test_create_comm_one_sentence_al_section():
    """AL_SECTION builds one section spanning the full text and omits
    the sentence list."""
    text = 'simple comm\t\t.'
    comm = create_comm('one', text, annotation_level=AL_SECTION)
    assert comm.id == 'one'
    assert comm.text == text
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 14)
    assert section.sentenceList is None
    assert validate_communication(comm)
示例#33
0
def test_create_comm_one_sentence_al_section():
    """Section-level annotation: a single 0-14 section with sentences
    left unset."""
    comm = create_comm('one', 'simple comm\t\t.', annotation_level=AL_SECTION)
    assert 'one' == comm.id
    assert 'simple comm\t\t.' == comm.text
    spans = [(s.textSpan.start, s.textSpan.ending) for s in comm.sectionList]
    assert spans == [(0, 14)]
    assert comm.sectionList[0].sentenceList is None
    assert validate_communication(comm)
def test_create_comm_complex():
    """Default annotation level tokenizes: check section spans,
    sentence spans, and whitespace tokens of each sentence."""
    text = '\n\nsimple comm\t\t.\nor ...\n\nisit?\n'
    comm = create_comm('one', text)
    assert comm.id == 'one'
    assert comm.text == text

    # (section span, [(sentence span, [token texts])]) per section
    expected = [
        ((0, 0), []),
        ((2, 23), [((2, 16), ['simple', 'comm', '.']),
                   ((17, 23), ['or', '...'])]),
        ((25, 31), [((25, 30), ['isit?']),
                    ((31, 31), [])]),
    ]
    assert len(comm.sectionList) == len(expected)
    for sect, (sect_span, sents) in zip(comm.sectionList, expected):
        assert (sect.textSpan.start, sect.textSpan.ending) == sect_span
        assert len(sect.sentenceList) == len(sents)
        for sent, (sent_span, token_texts) in zip(sect.sentenceList, sents):
            assert (sent.textSpan.start, sent.textSpan.ending) == sent_span
            tl = sent.tokenization.tokenList.tokenList
            assert [t.text for t in tl] == token_texts
            # token indices run 0..len-1 in order
            assert [t.tokenIndex for t in tl] == list(range(len(tl)))

    assert validate_communication(comm)
示例#35
0
def test_create_comm_complex():
    """Default annotation level: collect the observed section, sentence,
    and token structure and compare it wholesale against the expected
    layout."""
    text = '\n\nsimple comm\t\t.\nor ...\n\nisit?\n'
    comm = create_comm('one', text)
    assert 'one' == comm.id
    assert text == comm.text

    observed = [
        ((sect.textSpan.start, sect.textSpan.ending),
         [((sent.textSpan.start, sent.textSpan.ending),
           [(tok.tokenIndex, tok.text)
            for tok in sent.tokenization.tokenList.tokenList])
          for sent in sect.sentenceList])
        for sect in comm.sectionList
    ]
    assert observed == [
        ((0, 0), []),
        ((2, 23), [((2, 16), [(0, 'simple'), (1, 'comm'), (2, '.')]),
                   ((17, 23), [(0, 'or'), (1, '...')])]),
        ((25, 31), [((25, 30), [(0, 'isit?')]),
                    ((31, 31), [])]),
    ]

    assert validate_communication(comm)
示例#36
0
    def test_entity_mention_ids(self):
        """Corrupting an entity's mentionId makes mention-id validation
        fail and log an ERROR naming the bad UUID."""
        comm = read_test_comm()
        self.assertTrue(validate_communication(comm))
        self.assertTrue(validate_entity_mention_ids(comm))

        bad_uuid = concrete.UUID(uuidString='BAD_ENTITY_MENTION_UUID')
        comm.entitySetList[0].entityList[0].mentionIdList[0] = bad_uuid

        with LogCapture() as log_capture:
            self.assertFalse(validate_entity_mention_ids(comm))
        log_capture.check(('root', 'ERROR', StringComparison(
            r'.*invalid entityMentionId.*BAD_ENTITY_MENTION_UUID')))
示例#37
0
    def test_entity_mention_tokenization(self):
        """A bogus tokenizationId must fail tokenization-id validation
        and produce an ERROR log naming the bad UUID."""
        comm = read_test_comm()
        self.assertTrue(validate_communication(comm))
        self.assertTrue(validate_entity_mention_ids(comm))

        bad_id = concrete.UUID(uuidString='BAD_TOKENIZATION_UUID')
        comm.entityMentionSetList[0].mentionList[0].tokens.tokenizationId = \
            bad_id

        with LogCapture() as log_capture:
            self.assertFalse(validate_entity_mention_tokenization_ids(comm))
        log_capture.check(('root', 'ERROR', StringComparison(
            r'.*invalid tokenizationId.*BAD_TOKENIZATION_UUID')))
def test_entity_mention_tokenization():
    """validate_entity_mention_tokenization_ids rejects a communication
    whose mention points at an unknown tokenization UUID, logging an
    ERROR that names the bad UUID."""
    comm = read_test_comm()
    assert validate_communication(comm)
    assert validate_entity_mention_ids(comm)

    bogus = concrete.UUID(uuidString='BAD_TOKENIZATION_UUID')
    comm.entityMentionSetList[0].mentionList[0].tokens.tokenizationId = bogus

    with LogCapture() as log_capture:
        assert not validate_entity_mention_tokenization_ids(comm)
    log_capture.check(
        ('root', 'ERROR',
         StringComparison(r'.*invalid tokenizationId.*BAD_TOKENIZATION_UUID')))
def test_compress_uuids(output_file, args):
    """compress-uuids.py output keeps filenames and ids, validates, and
    is strictly smaller than the input archive."""
    input_file = 'tests/testdata/simple.tar.gz'

    cmd = [
        sys.executable,
        'scripts/compress-uuids.py',
        input_file,
        output_file
    ] + list(args)
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    proc.communicate()

    assert proc.returncode == 0

    stream = iter(CommunicationReader(output_file))

    expected = [('simple_1.concrete', 'one'),
                ('simple_2.concrete', 'two'),
                ('simple_3.concrete', 'three')]
    for (name, comm_id) in expected:
        (comm, comm_filename) = next(stream)
        assert comm_filename == name
        assert comm.id == comm_id
        assert validate_communication(comm)

    # compression should shrink the archive
    assert os.stat(output_file).st_size < os.stat(input_file).st_size

    # no further communications expected
    assert list(stream) == []
示例#40
0
def test_compress_uuids(output_file, args):
    """Round-trip check of scripts/compress-uuids.py: same filenames
    and ids, valid communications, smaller output file."""
    input_file = 'tests/testdata/simple.tar.gz'

    argv = [sys.executable, 'scripts/compress-uuids.py', input_file,
            output_file]
    argv.extend(args)
    proc = Popen(argv, stdout=PIPE, stderr=PIPE)
    proc.communicate()

    assert proc.returncode == 0

    # materialize the reader: exactly three (comm, filename) pairs
    pairs = list(CommunicationReader(output_file))
    assert [fn for (_, fn) in pairs] == [
        'simple_1.concrete', 'simple_2.concrete', 'simple_3.concrete']
    assert [c.id for (c, _) in pairs] == ['one', 'two', 'three']
    for (comm, _) in pairs:
        assert validate_communication(comm)

    # compression should shrink the archive
    assert os.stat(output_file).st_size < os.stat(input_file).st_size
示例#41
0
def test_create_comm_unicode_al_sentence():
    """AL_SENTENCE handles non-ASCII text: one section and one
    untokenized sentence, both spanning the whole text."""
    text = u'狐狸\t\t.'
    comm = create_comm('one', text, annotation_level=AL_SENTENCE)
    assert comm.id == 'one'
    assert comm.text == text
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 5)
    assert len(section.sentenceList) == 1
    sentence = section.sentenceList[0]
    assert (sentence.textSpan.start, sentence.textSpan.ending) == (0, 5)
    assert sentence.tokenization is None
    assert validate_communication(comm)
示例#42
0
def test_create_comm_unicode_al_sentence():
    """Sentence-level annotation of a unicode communication: compare
    the whole observed span structure at once."""
    comm = create_comm('one', u'狐狸\t\t.', annotation_level=AL_SENTENCE)
    assert 'one' == comm.id
    assert u'狐狸\t\t.' == comm.text
    observed = [
        ((sect.textSpan.start, sect.textSpan.ending),
         [((sent.textSpan.start, sent.textSpan.ending), sent.tokenization)
          for sent in sect.sentenceList])
        for sect in comm.sectionList
    ]
    assert observed == [((0, 5), [((0, 5), None)])]
    assert validate_communication(comm)
def test_entity_mention_ids():
    """validate_entity_mention_ids rejects a communication whose entity
    references an unknown mention UUID, logging an ERROR that names the
    bad UUID."""
    comm = read_test_comm()
    assert validate_communication(comm)
    assert validate_entity_mention_ids(comm)

    bogus = concrete.UUID(uuidString='BAD_ENTITY_MENTION_UUID')
    comm.entitySetList[0].entityList[0].mentionIdList[0] = bogus

    with LogCapture() as log_capture:
        assert not validate_entity_mention_ids(comm)
    log_capture.check(
        ('root', 'ERROR',
         StringComparison(
             r'.*invalid entityMentionId.*BAD_ENTITY_MENTION_UUID')))
def test_create_comm_tarball_per_line(output_file, text_l0, text_l1_s0,
                                      text_l1_s1, text_l1_s2, text_l1_s3,
                                      text_l1_s4):
    """create-comm-tarball.py --per-line emits one Communication per input line.

    Runs the script on a two-document tarball and verifies that each
    non-empty line becomes its own sectionless Communication, in order.

    Args:
        output_file: path fixture for the script's output tarball
        text_l0, text_l1_s0..text_l1_s4: expected per-line text fixtures
    """
    p = Popen([
        'scripts/create-comm-tarball.py', '--per-line',
        'tests/testdata/les-deux-chandeliers-perline.tar.gz', output_file
    ],
              stdout=PIPE,
              stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    # (expected id, expected text) pairs, in the order the reader yields them.
    expected = [
        ('les-deux-chandeliers-perline/l0.txt/0', text_l0),
        ('les-deux-chandeliers-perline/l1.txt/0', text_l1_s0),
        ('les-deux-chandeliers-perline/l1.txt/1', text_l1_s1),
        ('les-deux-chandeliers-perline/l1.txt/2', text_l1_s2),
        ('les-deux-chandeliers-perline/l1.txt/3', text_l1_s3),
        ('les-deux-chandeliers-perline/l1.txt/4', text_l1_s4),
    ]
    for (expected_id, expected_text) in expected:
        # next(it) (not the Python-2-only it.next()) for py2/py3 compatibility,
        # matching the other tests in this file.
        (comm, _) = next(it)
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        # --per-line communications carry raw text only, no sections.
        assert comm.sectionList is None

    # The iterator must be exhausted after exactly six communications.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
示例#45
0
def test_create_comm_tarball_per_line(output_file, text_l0,
                                      text_l1_s0, text_l1_s1, text_l1_s2,
                                      text_l1_s3, text_l1_s4):
    """create-comm-tarball.py --per-line makes a Communication per line.

    Each line of the per-line tarball becomes a sectionless Communication
    whose id is '<member-path>/<line-index>'.

    Args:
        output_file: path fixture for the script's output tarball
        text_l0, text_l1_s0..text_l1_s4: expected per-line text fixtures
    """
    p = Popen([
        'scripts/create-comm-tarball.py',
        '--per-line',
        'tests/testdata/les-deux-chandeliers-perline.tar.gz',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0

    reader = CommunicationReader(output_file)
    it = iter(reader)

    expected_comms = (
        ('les-deux-chandeliers-perline/l0.txt/0', text_l0),
        ('les-deux-chandeliers-perline/l1.txt/0', text_l1_s0),
        ('les-deux-chandeliers-perline/l1.txt/1', text_l1_s1),
        ('les-deux-chandeliers-perline/l1.txt/2', text_l1_s2),
        ('les-deux-chandeliers-perline/l1.txt/3', text_l1_s3),
        ('les-deux-chandeliers-perline/l1.txt/4', text_l1_s4),
    )
    for (comm_id, comm_text) in expected_comms:
        # Use the builtin next() rather than the Python-2-only it.next(),
        # consistent with the rest of this file.
        (comm, _) = next(it)
        assert comm.id == comm_id
        assert validate_communication(comm)
        assert comm.text == comm_text
        assert comm.sectionList is None

    # No seventh communication may exist.
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
示例#46
0
def add_dictionary_tagging(comm):
    """Adds In/Out of dictionary 'POS' tags to a Communication

    Takes a Concrete Communication, adds a Part-Of-Speech tag to each
    token, where the tags record whether the token is 'In' or 'Out' of
    the system dictionary.

    Args:
        comm: A Concrete Communication with tokens

    Returns:
        A copy of the original Communication, with POS tags added
    """
    # Load the system word list once up front; the 'with' block guarantees
    # the file handle is closed (the previous version leaked it).
    with open('/usr/share/dict/words') as words_file:
        dictionary = set(w.strip().lower() for w in words_file)

    augf = AnalyticUUIDGeneratorFactory(comm)
    aug = augf.create()

    if comm.sectionList:
        for section in comm.sectionList:
            if section.sentenceList:
                for sentence in section.sentenceList:
                    # One fresh TokenTagging per sentence.
                    posTagList = concrete.TokenTagging()
                    posTagList.metadata = concrete.AnnotationMetadata(
                        tool="POS Tagger", timestamp=int(time.time()))
                    posTagList.taggingType = "POS"
                    posTagList.taggedTokenList = []
                    # next(aug) (not aug.next()) for py2/py3 compatibility.
                    posTagList.uuid = next(aug)
                    tkzn = sentence.tokenization
                    if tkzn.tokenList:
                        for i, token in enumerate(tkzn.tokenList.tokenList):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            if token.text.lower() in dictionary:
                                tt.tag = "In"
                            else:
                                tt.tag = "Out"
                            posTagList.taggedTokenList.append(tt)
                            print("%d [%s] %s" % (i, token.text, tt.tag))
                    # Replace (not extend) any existing tagging list.
                    tkzn.tokenTaggingList = [posTagList]
            print()

    if validate_communication(comm):
        print("Created valid POS tagging for Communication")
    else:
        print("ERROR: Invalid POS tagging Communication")
    return comm
def add_dictionary_tagging(comm):
    """Adds In/Out of dictionary 'POS' tags to a Communication

    Takes a Concrete Communication, adds a Part-Of-Speech tag to each
    token, where the tags record whether the token is 'In' or 'Out' of
    the system dictionary.

    Args:
        comm: A Concrete Communication with tokens

    Returns:
        A copy of the original Communication, with POS tags added
    """
    # Build the lowercased word set inside a 'with' block so the file
    # handle is always closed (the bare open() here previously leaked it).
    dictionary = set()
    with open('/usr/share/dict/words') as words_file:
        for w in words_file:
            dictionary.add(w.strip().lower())

    augf = AnalyticUUIDGeneratorFactory(comm)
    aug = augf.create()

    if comm.sectionList:
        for section in comm.sectionList:
            if section.sentenceList:
                for sentence in section.sentenceList:
                    # A fresh TokenTagging is created for every sentence.
                    posTagList = concrete.TokenTagging()
                    posTagList.metadata = concrete.AnnotationMetadata(
                        tool="POS Tagger", timestamp=int(time.time()))
                    posTagList.taggingType = "POS"
                    posTagList.taggedTokenList = []
                    posTagList.uuid = next(aug)
                    tkzn = sentence.tokenization
                    if tkzn.tokenList:
                        for i, token in enumerate(tkzn.tokenList.tokenList):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            if token.text.lower() in dictionary:
                                tt.tag = "In"
                            else:
                                tt.tag = "Out"
                            posTagList.taggedTokenList.append(tt)
                            print("%d [%s] %s" % (i, token.text, tt.tag))
                    # Any pre-existing tagging list is replaced outright.
                    tkzn.tokenTaggingList = [posTagList]
            print()

    if validate_communication(comm):
        print("Created valid POS tagging for Communication")
    else:
        print("ERROR: Invalid POS tagging Communication")
    return comm
示例#48
0
def test_fetch_backed_container():
    """A fetch-backed container mirrors the dict behind the fetch service."""
    comm_container = {name: create_comm(name) for name in ('one', 'two')}

    handler = CommunicationContainerFetchHandler(comm_container)
    host = 'localhost'
    port = find_port()

    # Serve the dict over FetchCommunicationService, then read it back
    # through the container facade.
    with SubprocessFetchCommunicationServiceWrapper(handler, host, port):
        container = FetchBackedCommunicationContainer(host, port)
        assert len(container) == 2
        assert 'one' in container
        assert 'two' in container
        for comm_id in container:
            assert validate_communication(container[comm_id])
示例#49
0
def test_fetch_backed_container():
    """FetchBackedCommunicationContainer exposes served comms dict-style."""
    backing = {
        'one': create_comm('one'),
        'two': create_comm('two')
    }

    impl = CommunicationContainerFetchHandler(backing)
    host = 'localhost'
    port = find_port()

    with SubprocessFetchCommunicationServiceWrapper(impl, host, port):
        cc = FetchBackedCommunicationContainer(host, port)
        # Size and membership match the backing dict.
        assert len(cc) == 2
        assert 'one' in cc
        assert 'two' in cc
        # Every id served back yields a valid communication.
        for comm_id in cc:
            fetched = cc[comm_id]
            assert validate_communication(fetched)
def test_create_comm_unicode():
    """Full annotation of unicode text yields one section, sentence, and tokens."""
    text = u'狐狸\t\t.'
    comm = create_comm('one', text)
    assert comm.id == 'one'
    assert comm.text == text

    # Single section and sentence, each spanning the 5-character text.
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 5)
    assert len(section.sentenceList) == 1
    sentence = section.sentenceList[0]
    assert (sentence.textSpan.start, sentence.textSpan.ending) == (0, 5)

    # Whitespace splitting produces exactly two tokens.
    tokens = sentence.tokenization.tokenList.tokenList
    assert [(t.tokenIndex, t.text) for t in tokens] == [(0, u'狐狸'), (1, '.')]
    assert validate_communication(comm)
示例#51
0
def test_create_comm_unicode():
    """create_comm tokenizes a unicode string on whitespace."""
    comm = create_comm('one', u'狐狸\t\t.')

    assert comm.id == 'one'
    assert comm.text == u'狐狸\t\t.'

    # One section covering chars [0, 5).
    assert len(comm.sectionList) == 1
    sect = comm.sectionList[0]
    assert sect.textSpan.start == 0
    assert sect.textSpan.ending == 5

    # One sentence with the same span.
    assert len(sect.sentenceList) == 1
    sent = sect.sentenceList[0]
    assert sent.textSpan.start == 0
    assert sent.textSpan.ending == 5

    # Two tokens: the CJK word and the period.
    tl = sent.tokenization.tokenList.tokenList
    assert len(tl) == 2
    assert tl[0].tokenIndex == 0
    assert tl[0].text == u'狐狸'
    assert tl[1].tokenIndex == 1
    assert tl[1].text == '.'

    assert validate_communication(comm)
示例#52
0
def test_create_comm_one_sentence():
    """A one-line string becomes one section/sentence with three tokens."""
    comm = create_comm('one', 'simple comm\t\t.')
    assert comm.id == 'one'
    assert comm.text == 'simple comm\t\t.'

    # Exactly one section and one sentence, both spanning chars [0, 14).
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 14)
    assert len(section.sentenceList) == 1
    sentence = section.sentenceList[0]
    assert (sentence.textSpan.start, sentence.textSpan.ending) == (0, 14)

    # Whitespace tokenization gives 'simple', 'comm', '.' in order.
    tokens = sentence.tokenization.tokenList.tokenList
    expected = [(0, 'simple'), (1, 'comm'), (2, '.')]
    assert [(t.tokenIndex, t.text) for t in tokens] == expected
    assert validate_communication(comm)
def test_create_comm_one_sentence():
    """create_comm splits a simple line into three whitespace tokens."""
    comm = create_comm('one', 'simple comm\t\t.')

    assert comm.id == 'one'
    assert comm.text == 'simple comm\t\t.'

    # One section covering the full 14-character text.
    assert len(comm.sectionList) == 1
    sect = comm.sectionList[0]
    assert sect.textSpan.start == 0
    assert sect.textSpan.ending == 14

    # One sentence with an identical span.
    assert len(sect.sentenceList) == 1
    sent = sect.sentenceList[0]
    assert sent.textSpan.start == 0
    assert sent.textSpan.ending == 14

    # Tokens appear in order with sequential indices.
    tl = sent.tokenization.tokenList.tokenList
    assert len(tl) == 3
    assert tl[0].tokenIndex == 0
    assert tl[0].text == 'simple'
    assert tl[1].tokenIndex == 1
    assert tl[1].text == 'comm'
    assert tl[2].tokenIndex == 2
    assert tl[2].text == '.'

    assert validate_communication(comm)
示例#54
0
def assert_first_comm(comm):
    """Checks id and timestamps of the expected first tweet Communication."""
    assert comm.id == '238426131689242624'
    # Start and end time are the same instant for a tweet.
    assert comm.startTime == comm.endTime == 1345680194
    assert validate_communication(comm)
示例#55
0
def test_create_simple_comm():
    """create_simple_comm uses the given id and a canned sentence."""
    comm = create_simple_comm('one')
    assert comm.id == 'one'
    assert comm.text == 'Super simple sentence .'
    assert validate_communication(comm)
示例#56
0
def test_create_comm_ws_al_sentence():
    """Whitespace-only text yields an empty sectionList at AL_SENTENCE."""
    whitespace = '\t \t\r\n\n'
    comm = create_comm('one', whitespace, annotation_level=AL_SENTENCE)
    assert comm.id == 'one'
    assert comm.text == whitespace
    # No content lines, so no sections are produced (but the list exists).
    assert comm.sectionList == []
    assert validate_communication(comm)
示例#57
0
def test_create_comm_ws_al_sentence():
    """Sentence-level annotation of pure whitespace produces no sections."""
    comm = create_comm('one', '\t \t\r\n\n', annotation_level=AL_SENTENCE)

    assert comm.id == 'one'
    # Original text is preserved verbatim even though nothing is segmented.
    assert comm.text == '\t \t\r\n\n'
    assert comm.sectionList == []
    assert validate_communication(comm)
示例#58
0
def assert_second_comm(comm):
    """Checks id and timestamps of the expected second tweet Communication."""
    assert comm.id == '238426131689242625'
    # A tweet's start and end time coincide.
    assert comm.startTime == comm.endTime == 1345680195
    assert validate_communication(comm)
示例#59
0
def test_create_comm_ws():
    """Fully annotating whitespace-only text yields an empty sectionList."""
    whitespace = '\t \t\r\n\n'
    comm = create_comm('one', whitespace)
    assert comm.id == 'one'
    assert comm.text == whitespace
    # Nothing to segment, so the section list is present but empty.
    assert comm.sectionList == []
    assert validate_communication(comm)