def test_CommunicationWriterTGZ_single_file_default_name(output_file, login_info):
    """Write one Communication to a .tar.gz with the default (UUID-based) entry name.

    Verifies the archive holds exactly one regular-file member whose name,
    size, mtime, mode, and ownership match the expected values from the
    ``login_info`` fixture.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterTGZ()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        # Close even if write fails so the temp archive is not leaked.
        writer.close()

    assert tarfile.is_tarfile(output_file)

    f = tarfile.open(output_file)
    tarinfo = f.next()
    assert tarinfo is not None
    # Default entry name is "<communication uuid>.concrete".
    assert comm.uuid.uuidString + ".concrete" == tarinfo.name
    assert tarinfo.isreg()
    assert tarinfo.mtime > time.time() - TIME_MARGIN
    assert os.stat("tests/testdata/simple_1.concrete").st_size == tarinfo.size
    # BUG FIX: "0644" is Python 2 octal syntax and a SyntaxError in Python 3;
    # the modern octal literal is 0o644.
    assert 0o644 == tarinfo.mode
    assert login_info["uid"] == tarinfo.uid
    assert login_info["username"] == tarinfo.uname
    assert login_info["gid"] == tarinfo.gid
    assert login_info["groupname"] == tarinfo.gname

    # The archive must contain no further members.
    tarinfo = f.next()
    assert tarinfo is None
    f.close()
def validate_communication_file(communication_filename):
    """Load the Concrete Communication at *communication_filename* and validate it."""
    logging.info(_ilm(
        0,
        "Opening Concrete Communication with filename '%s'" %
        communication_filename))
    validate_communication(
        read_communication_from_file(communication_filename))
def test_CommunicationWriterTGZ_single_file_ctx_mgr(output_file, login_info):
    """Write one Communication to a .tar.gz via the context-manager API.

    Checks that the archive contains a single regular-file member named
    "simple_1.concrete" with the expected size, mtime, mode, and ownership.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterTGZ(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert tarfile.is_tarfile(output_file)

    f = tarfile.open(output_file)
    tarinfo = f.next()
    assert tarinfo is not None
    assert "simple_1.concrete" == tarinfo.name
    assert tarinfo.isreg()
    assert tarinfo.mtime > time.time() - TIME_MARGIN
    assert os.stat("tests/testdata/simple_1.concrete").st_size == tarinfo.size
    # BUG FIX: "0644" is Python 2 octal syntax and a SyntaxError in Python 3;
    # the modern octal literal is 0o644.
    assert 0o644 == tarinfo.mode
    assert login_info["uid"] == tarinfo.uid
    assert login_info["username"] == tarinfo.uname
    assert login_info["gid"] == tarinfo.gid
    assert login_info["groupname"] == tarinfo.gname

    # The archive must contain no further members.
    tarinfo = f.next()
    assert tarinfo is None
    f.close()
def test_CommunicationWriterTGZ_single_file_default_name(output_file):
    """Write one Communication to a .tar.gz with the default (UUID-based) entry name.

    Older variant that derives expected ownership from the current process
    (os/pwd/grp) rather than a login_info fixture.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterTGZ()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        # Close even if write fails so the temp archive is not leaked
        # (consistent with the other open/write/close tests in this file).
        writer.close()

    assert tarfile.is_tarfile(output_file)

    f = tarfile.open(output_file)
    tarinfo = f.next()
    assert tarinfo is not None
    # Default entry name is "<communication uuid>.concrete".
    assert comm.uuid.uuidString + '.concrete' == tarinfo.name
    assert tarinfo.isreg()
    assert tarinfo.mtime > time.time() - TIME_MARGIN
    assert os.stat('tests/testdata/simple_1.concrete').st_size == tarinfo.size
    # BUG FIX: "0644" is Python 2 octal syntax and a SyntaxError in Python 3;
    # the modern octal literal is 0o644.
    assert 0o644 == tarinfo.mode
    assert os.getuid() == tarinfo.uid
    assert pwd.getpwuid(os.getuid()).pw_name == tarinfo.uname
    assert os.getgid() == tarinfo.gid
    assert grp.getgrgid(os.getgid()).gr_name == tarinfo.gname

    # The archive must contain no further members.
    tarinfo = f.next()
    assert tarinfo is None
    f.close()
def test_CommunicationWriterTar_single_file_ctx_mgr(output_file):
    """Write one Communication to a .tar via the context-manager API.

    Checks that the archive contains a single regular-file member named
    "simple_1.concrete" with the expected size, mtime, mode, and ownership
    (derived from the current process via os/pwd/grp).
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert tarfile.is_tarfile(output_file)

    f = tarfile.open(output_file)
    tarinfo = f.next()
    assert tarinfo is not None
    assert "simple_1.concrete" == tarinfo.name
    assert tarinfo.isreg()
    assert tarinfo.mtime > time.time() - TIME_MARGIN
    assert os.stat('tests/testdata/simple_1.concrete').st_size == tarinfo.size
    # BUG FIX: "0644" is Python 2 octal syntax and a SyntaxError in Python 3;
    # the modern octal literal is 0o644.
    assert 0o644 == tarinfo.mode
    assert os.getuid() == tarinfo.uid
    assert pwd.getpwuid(os.getuid()).pw_name == tarinfo.uname
    assert os.getgid() == tarinfo.gid
    assert grp.getgrgid(os.getgid()).gr_name == tarinfo.gname

    # The archive must contain no further members.
    tarinfo = f.next()
    assert tarinfo is None
    f.close()
def load_comm(filename, tool='ud converted ptb trees using pyStanfordDependencies'):
    "Load a concrete communication file with required pyStanfordDependencies output."
    # import here to avoid requiring concrete
    from concrete.util.file_io import read_communication_from_file
    comm = read_communication_from_file(filename)
    # Sections and sentences may be None; treat missing lists as empty.
    for section in (comm.sectionList or []):
        for sentence in (section.sentenceList or []):
            yield section.label, get_udparse(sentence, tool)
def test_CommunicationWriter_fixed_point_unicode(output_file):
    """Round-trip a unicode-bearing Communication; output bytes must equal input bytes."""
    input_file = "tests/testdata/les-deux-chandeliers.concrete"
    comm = read_communication_from_file(input_file)
    with CommunicationWriter(output_file) as writer:
        writer.write(comm)
    with open(input_file, "rb") as expected_f, open(output_file, "rb") as actual_f:
        assert expected_f.read() == actual_f.read()
def test_CommunicationWriter_fixed_point_ctx_mgr(output_file):
    """Context-manager round trip: serialized bytes must equal the original file."""
    input_file = "tests/testdata/simple_1.concrete"
    comm = read_communication_from_file(input_file)
    with CommunicationWriter(output_file) as writer:
        writer.write(comm)
    with open(input_file, "rb") as expected_f:
        expected_data = expected_f.read()
    with open(output_file, "rb") as actual_f:
        actual_data = actual_f.read()
    assert actual_data == expected_data
def test_CommunicationWriter_fixed_point_unicode(output_file):
    """Round-trip a unicode-bearing Communication; output bytes must equal input bytes."""
    input_file = 'tests/testdata/les-deux-chandeliers.concrete'
    comm = read_communication_from_file(input_file)
    with CommunicationWriter(output_file) as writer:
        writer.write(comm)
    with open(input_file, 'rb') as inp, open(output_file, 'rb') as outp:
        assert inp.read() == outp.read()
def test_CommunicationWriter_fixed_point_ctx_mgr(output_file):
    """Context-manager round trip: serialized bytes must equal the original file."""
    input_file = 'tests/testdata/simple_1.concrete'
    comm = read_communication_from_file(input_file)
    with CommunicationWriter(output_file) as writer:
        writer.write(comm)
    with open(output_file, 'rb') as actual_f:
        actual_data = actual_f.read()
    with open(input_file, 'rb') as expected_f:
        expected_data = expected_f.read()
    assert expected_data == actual_data
def test_CommunicationWriter_fixed_point(output_file):
    """Explicit open/write/close round trip: output bytes must equal the input file."""
    input_file = "tests/testdata/simple_1.concrete"
    comm = read_communication_from_file(input_file)
    writer = CommunicationWriter()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()
    with open(input_file, "rb") as inp, open(output_file, "rb") as outp:
        assert inp.read() == outp.read()
def test_CommunicationWriterTar_single_file_fixed_point_unicode(output_file, login_info):
    """Tar round trip for a unicode-bearing Communication.

    The single archive member's bytes must equal the original file, and the
    archive must contain exactly one member.
    """
    src = "tests/testdata/les-deux-chandeliers.concrete"
    comm = read_communication_from_file(src)
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "les-deux-chandeliers.concrete")

    assert tarfile.is_tarfile(output_file)

    f = tarfile.open(output_file)
    member = f.next()
    assert member is not None
    assert member.name == "les-deux-chandeliers.concrete"
    with open("tests/testdata/les-deux-chandeliers.concrete", "rb") as expected_f:
        expected_data = expected_f.read()
    assert f.extractfile(member).read() == expected_data
    # No further members allowed.
    assert f.next() is None
    f.close()
def test_CommunicationWriterTar_single_file_fixed_point(output_file):
    """Tar round trip: the single member's bytes must equal the original file."""
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert tarfile.is_tarfile(output_file)

    archive = tarfile.open(output_file)
    member = archive.next()
    assert member is not None
    assert member.name == "simple_1.concrete"
    with open('tests/testdata/simple_1.concrete', 'rb') as expected_f:
        expected_data = expected_f.read()
    assert archive.extractfile(member).read() == expected_data
    # No further members allowed.
    assert archive.next() is None
    archive.close()
def validate_communication_file(communication_filename):
    """Read the Concrete Communication at *communication_filename* and validate it."""
    logging.info(_ilm(
        0, "Opening Concrete Communication with filename '%s'" %
        communication_filename))
    comm = read_communication_from_file(communication_filename)
    validate_communication(comm)
import argparse import csv import itertools parser = argparse.ArgumentParser() parser.add_argument("--tsv", type=str, default="", help="") parser.add_argument("--concrete_dir", type=str, default="", help="") parser.add_argument("--lang", type=str, default="eng") ARGS = parser.parse_args() tsv = csv.reader(open(ARGS.tsv), delimiter='\t') for comm_id, rows in itertools.groupby(tsv, key=lambda r: r[0].split(':')[0]): try: comm = cio.read_communication_from_file( f"{ARGS.concrete_dir}/{comm_id}.comm") # remove non-English documents lang_dist = comm.lidList[0].languageToProbabilityMap lang = max(lang_dist.items(), key=lambda t: t[1])[0] if ARGS.lang != "all" and lang != ARGS.lang: # if ARGS.lang == "all", retain all language samples continue sentences = [ sentence for section in comm.sectionList for sentence in section.sentenceList ] sentence_indices: np.ndarray = np.array( [sentence.textSpan.start for sentence in sentences]) token_indices: List[np.ndarray] = [