def test_CommunicationWriterTar_single_file_fixed_point_unicode(
        output_file, login_info):
    comm = read_communication_from_file(
        "tests/testdata/les-deux-chandeliers.concrete")
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "les-deux-chandeliers.concrete")

    assert tarfile.is_tarfile(output_file)

    f = tarfile.open(output_file)

    tarinfo = f.next()
    assert tarinfo is not None
    assert "les-deux-chandeliers.concrete" == tarinfo.name
    actual_data = f.extractfile(tarinfo).read()
    with open('tests/testdata/les-deux-chandeliers.concrete',
              'rb') as expected_f:
        expected_data = expected_f.read()
    assert expected_data == actual_data

    tarinfo = f.next()
    assert tarinfo is None

    f.close()
def test_CommunicationWriterTar_single_file_ctx_mgr(output_file, login_info):
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert tarfile.is_tarfile(output_file)

    f = tarfile.open(output_file)

    tarinfo = f.next()
    assert tarinfo is not None
    assert "simple_1.concrete" == tarinfo.name
    assert tarinfo.isreg()
    assert tarinfo.mtime > time() - TIME_MARGIN
    assert os.stat('tests/testdata/simple_1.concrete').st_size == tarinfo.size
    assert 0o644 == tarinfo.mode
    assert login_info['uid'] == tarinfo.uid
    assert login_info['username'] == tarinfo.uname
    assert login_info['gid'] == tarinfo.gid
    assert login_info['groupname'] == tarinfo.gname

    tarinfo = f.next()
    assert tarinfo is None

    f.close()
def test_CommunicationWriterTGZ_single_file_default_name(output_file,
                                                         login_info):
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterTGZ()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert tarfile.is_tarfile(output_file)

    f = tarfile.open(output_file)

    tarinfo = f.next()
    assert tarinfo is not None
    assert comm.id + '.concrete' == tarinfo.name
    assert tarinfo.isreg()
    assert tarinfo.mtime > time() - TIME_MARGIN
    assert os.stat('tests/testdata/simple_1.concrete').st_size == tarinfo.size
    assert 0o644 == tarinfo.mode
    assert login_info['uid'] == tarinfo.uid
    assert login_info['username'] == tarinfo.uname
    assert login_info['gid'] == tarinfo.gid
    assert login_info['groupname'] == tarinfo.gname

    tarinfo = f.next()
    assert tarinfo is None

    f.close()
def main():
    usage = "%prog [options] <input path> <output path>"
    parser = optparse.OptionParser(usage=usage)
    (options, args) = parser.parse_args(sys.argv)
    if len(args) != 3:
        parser.print_help()
        sys.exit(1)
    in_path = args[1]
    out_path = args[2]
    # in_path = "/mnt/d/MyProjects/ACE2005/preprocess/ace-05-comms-ptb-anno-chunks/AFP_ENG_20030616.0715.concrete"
    # in_path = "/mnt/d/MyProjects/AFP_ENG_20030616.0715.1.concrete"
    # out_path = "/mnt/d/MyProjects/AFP_ENG_20030616.0715.json"
    if not os.path.exists(in_path):
        raise Exception("Input path doesn't exist: " + in_path)
    comm = read_communication_from_file(in_path)
    js = comm2json(comm)
    with open(out_path, "wb") as out_file:
        json.dump(js, out_file, encoding="utf-8")
    print("From %s to %s done." % (in_path, out_path))
def communication_file_to_json(communication_filename):
    """
    Takes a Communication filename, deserializes the Communication from
    the file, and returns a JSON string with the information in that
    Communication.
    """
    comm = read_communication_from_file(communication_filename)
    return communication_to_json(comm)
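# A minimal usage sketch for communication_file_to_json above. The test-data
# path is borrowed from the tests in this file; treat it as an assumption if
# your checkout lays out testdata differently.
json_str = communication_file_to_json('tests/testdata/simple_1.concrete')
print(json_str[:200])  # peek at the start of the JSON representation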
def test_CommunicationWriter_gz_fixed_point_unicode(output_file):
    input_file = 'tests/testdata/les-deux-chandeliers.concrete'
    comm = read_communication_from_file(input_file)
    with CommunicationWriter(output_file, gzip=True) as writer:
        writer.write(comm)

    with open(input_file, 'rb') as expected_f:
        expected_data = expected_f.read()
    with gzip.open(output_file, 'rb') as actual_f:
        actual_data = actual_f.read()
    assert expected_data == actual_data
def test_CommunicationWriter_fixed_point_ctx_mgr(output_file):
    input_file = 'tests/testdata/simple_1.concrete'
    comm = read_communication_from_file(input_file)
    with CommunicationWriter(output_file) as writer:
        writer.write(comm)

    with open(input_file, 'rb') as expected_f:
        expected_data = expected_f.read()
    with open(output_file, 'rb') as actual_f:
        actual_data = actual_f.read()
    assert expected_data == actual_data
def add_chunks_to_file(in_file, out_file, chunklink, fail_on_error):
    '''Reads a Communication file, adds chunking information, and writes a
    new Communication file containing the annotated version.
    '''
    # Deserialize
    comm = read_communication_from_file(in_file)
    # Add chunks
    num_chunked, num_sents = add_chunks_to_comm(comm, chunklink,
                                                fail_on_error)
    logging.info("Chunked %d / %d = %f" %
                 (num_chunked, num_sents,
                  float(num_chunked) / float(num_sents)))
    # Serialize
    write_communication_to_file(comm, out_file)
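# Usage sketch for add_chunks_to_file above. The input/output paths and the
# location of the external chunklink script are hypothetical placeholders.
add_chunks_to_file('input.concrete', 'output.concrete',
                   chunklink='/path/to/chunklink.pl', fail_on_error=False)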
def main():
    set_stdout_encoding()

    parser = argparse.ArgumentParser(
        description="Pretty Print a Concrete file")
    parser.add_argument('--concrete_type', default='communication',
                        choices=['communication', 'tokenlattice'],
                        help='Default: communication')
    parser.add_argument('--protocol', default='simple',
                        choices=['simple', 'TJSONProtocol'],
                        help='Default: simple')
    parser.add_argument('--remove-timestamps', action='store_true',
                        help="Removes timestamps from JSON output")
    parser.add_argument('--remove-uuids', action='store_true',
                        help="Removes UUIDs from JSON output")
    parser.add_argument('-l', '--loglevel', '--log-level',
                        help='Logging verbosity level threshold (to stderr)',
                        default='info')
    parser.add_argument('concrete_file',
                        help='path to input concrete communication file')
    parser.add_argument('json_file', nargs='?', default='-',
                        help='path to output json file')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    if args.protocol == 'simple':
        if args.concrete_type == 'communication':
            json_communication = communication_file_to_json(
                args.concrete_file,
                remove_timestamps=args.remove_timestamps,
                remove_uuids=args.remove_uuids
            )
        else:
            json_communication = tokenlattice_file_to_json(args.concrete_file)
    else:
        if args.concrete_type == 'communication':
            comm = read_communication_from_file(args.concrete_file)
            json_communication = TSerialization.serialize(
                comm, TJSONProtocol.TJSONProtocolFactory()).decode('utf-8')
        else:
            raise NotImplementedError

    if args.json_file == '-':
        print(json_communication)
    else:
        with codecs.open(args.json_file, 'w', encoding='utf-8') as f:
            f.write(json_communication)
def test_CommunicationWriterZip_single_file_ctx_mgr(output_file, login_info):
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterZip(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert zipfile.is_zipfile(output_file)

    f = zipfile.ZipFile(output_file)
    [zipinfo] = f.infolist()
    assert "simple_1.concrete" == zipinfo.filename
    assert timegm(zipinfo.date_time) > timegm(localtime()) - TIME_MARGIN
    assert os.stat(
        'tests/testdata/simple_1.concrete').st_size == zipinfo.file_size
    f.close()
def test_CommunicationWriter_gz_fixed_point(output_file):
    input_file = 'tests/testdata/simple_1.concrete'
    comm = read_communication_from_file(input_file)
    writer = CommunicationWriter(gzip=True)
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    with open(input_file, 'rb') as expected_f:
        expected_data = expected_f.read()
    with gzip.open(output_file, 'rb') as actual_f:
        actual_data = actual_f.read()
    assert expected_data == actual_data
def communication_file_to_json(communication_filename, remove_timestamps=False,
                               remove_uuids=False):
    """Get a "pretty-printed" JSON string representation for a Communication

    Args:
    - `communication_filename`: String specifying Communication filename
    - `remove_timestamps`: Boolean flag indicating if Concrete timestamps
      should be removed
    - `remove_uuids`: Boolean flag indicating if Concrete UUIDs should be
      removed

    Returns:
    - A string containing a "pretty-printed" JSON representation of the
      Communication
    """
    comm = read_communication_from_file(communication_filename)
    return thrift_to_json(comm, remove_timestamps=remove_timestamps,
                          remove_uuids=remove_uuids)
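# Usage sketch for the flag-aware variant above (hypothetical path): stripping
# UUIDs and timestamps yields a JSON string that is stable across
# re-serializations of the same content, which is convenient for diffing.
stable_json = communication_file_to_json('example.concrete',
                                         remove_timestamps=True,
                                         remove_uuids=True)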
def main():
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(
        description="Pretty Print a Concrete file")
    parser.add_argument('--concrete_type', default='communication',
                        choices=['communication', 'tokenlattice'],
                        help='Default: communication')
    parser.add_argument('--protocol', default='simple',
                        choices=['simple', 'TJSONProtocol'],
                        help='Default: simple')
    parser.add_argument('--remove-timestamps', action='store_true',
                        help="Removes timestamps from JSON output")
    parser.add_argument('--remove-uuids', action='store_true',
                        help="Removes UUIDs from JSON output")
    parser.add_argument('concrete_file')
    parser.add_argument('json_file', nargs='?', default='STDOUT')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    if args.protocol == 'simple':
        if args.concrete_type == 'communication':
            json_communication = communication_file_to_json(
                args.concrete_file,
                remove_timestamps=args.remove_timestamps,
                remove_uuids=args.remove_uuids
            )
        else:
            json_communication = tokenlattice_file_to_json(args.concrete_file)
    else:
        if args.concrete_type == 'communication':
            comm = read_communication_from_file(args.concrete_file)
            json_communication = TSerialization.serialize(
                comm, TJSONProtocol.TJSONProtocolFactory())
        else:
            raise NotImplementedError

    if args.json_file == 'STDOUT':
        print json_communication
    else:
        f = codecs.open(args.json_file, "w", encoding="utf-8")
        f.write(json_communication)
        f.close()
def test_CommunicationWriterZip_single_file_fixed_point(output_file,
                                                        login_info):
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterZip(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert zipfile.is_zipfile(output_file)

    f = zipfile.ZipFile(output_file)
    [zipinfo] = f.infolist()
    assert "simple_1.concrete" == zipinfo.filename
    actual_data = f.open(zipinfo).read()
    with open('tests/testdata/simple_1.concrete', 'rb') as expected_f:
        expected_data = expected_f.read()
    assert expected_data == actual_data
    f.close()
def test_CommunicationWriterZip_single_file_default_name(output_file,
                                                         login_info):
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterZip()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert zipfile.is_zipfile(output_file)

    f = zipfile.ZipFile(output_file)
    [zipinfo] = f.infolist()
    assert comm.id + '.concrete' == zipinfo.filename
    assert timegm(zipinfo.date_time) > timegm(localtime()) - TIME_MARGIN
    assert os.stat(
        'tests/testdata/simple_1.concrete').st_size == zipinfo.file_size
    f.close()
def serve():
    train_test_url = opj(args.data_dir, 'train_test.feat')
    entity_map_url = opj(args.data_dir, 'entity.map')
    feat_map_url = opj(args.data_dir, 'vocab.new')
    entity_sent_url = opj(args.data_dir, 'entities.sentences')
    guid2name = {}
    guid2id = {}
    id2guid = {}
    guid2sent = {}
    # The train_test.feat file contains some entities, such as the number
    # 1997, that have no features; their feature lines are blank. These
    # entities were removed while training the neural network architecture,
    # so to map the embeddings in NVGE back to the KB we need this alignment
    # information. The alignment is not necessary for BS, because BS can
    # easily handle entities that have no features (i.e. whose document is
    # empty).
    data_set, data_count, alignment = utils.data_set(train_test_url)
    for idx, row in enumerate(
            codecs.open(entity_map_url, 'r', 'utf-8').read().split('\n')):
        if row == '':
            continue
        dbid, canonical = row.split('\t')
        guid2name[dbid] = canonical
        if idx in alignment:
            guid2id[dbid] = alignment[idx]
            id2guid[alignment[idx]] = dbid
    GUID2SENT_PKL_FILE = opj(args.data_dir, os.path.pardir, 'guid2sent.pkl')
    try:
        print 'Loading', GUID2SENT_PKL_FILE
        guid2sent = pkl.load(open(GUID2SENT_PKL_FILE))
    except:
        print 'Could not find', GUID2SENT_PKL_FILE
        concrete_entity_files = os.listdir(args.concrete_entity_dir)
        for commidx, filename in enumerate(concrete_entity_files):
            print '%-5d\r' % ((commidx * 100) / len(concrete_entity_files)),
            comm = read_communication_from_file(
                opj(args.concrete_entity_dir, filename))
            guid = comm.id
            for sent in comm.sectionList[0].sentenceList:
                uuid = sent.uuid.uuidString
                tokens = [
                    e.text for e in sent.tokenization.tokenList.tokenList
                ]
                try:
                    guid2sent[guid].append((uuid, tokens))
                except KeyError:
                    guid2sent[guid] = [(uuid, tokens)]
        with open(GUID2SENT_PKL_FILE, 'wb') as gpf:
            print 'Dumping', GUID2SENT_PKL_FILE
            pkl.dump(guid2sent, gpf)
    # for row in codecs.open(entity_sent_url, 'r', 'utf-8').read().split('\n'):
    #     row = row.split(' ||| ')
    #     guid = row[0]
    #     for sent in row[1:]:
    #         tokens = sent.split()
    #         try:
    #             guid2sent[guid].append(tokens)
    #         except KeyError:
    #             guid2sent[guid] = [tokens]
    id2feat_data = codecs.open(feat_map_url, 'r', 'utf-8').read().split('\n')
    id2feat = dict((((sum(1 for e in id2feat_data if e != '') - 1)
                     if idx == 0 else (idx - 1)), row.split()[0])
                   for idx, row in enumerate(id2feat_data) if row != '')
    print('Checking feature size =',
          len(data_set[guid2id[":Entity_ENG_EDL_0092354"]]),
          'for', guid2name[":Entity_ENG_EDL_0092354"],
          'max(id2feat.values())', max(id2feat.keys()))

    def load(args):
        import cPickle as pkl
        with open(opj(args.data_dir, args.model_pkl), 'rb') as f:
            nnp = pkl.load(f)
        return nnp

    handler = EntitySearchProvider(
        args.language,
        NVBS(data_set=data_set,
             nnp=load(args),
             method=getattr(NVBSALGO, args.algorithm),
             opts=args,
             id2guid=id2guid,
             guid2id=guid2id,
             guid2name=guid2name,
             guid2sent=guid2sent,
             id2feat=id2feat),
        args.k_query,
        args.k_rationale)
    server = SearchServiceWrapper(handler)
    if args.serve:
        print('Starting NVBS Server')
        server.serve(args.host, args.port)
    else:
        return handler.index
def read_test_comm():
    communication_filename = "tests/testdata/serif_dog-bites-man.concrete"
    return read_communication_from_file(communication_filename)
def from_concrete_file(comm_file: str, task: str = 'argidcls') -> Document:
    def _entity_mention_to_span_indices(em: EntityMention) -> Tuple[int, int]:
        sentid: int = tok_to_sentid[em.tokens.tokenizationId.uuidString]
        start: int = doc.local_to_global(
            sent_id=sentid, local_idx=em.tokens.tokenIndexList[0])
        end: int = doc.local_to_global(
            sent_id=sentid, local_idx=em.tokens.tokenIndexList[-1])
        return start, end

    def _normalize_token(t: str) -> str:
        # For the ACE dataset: map PTB-style escaped quotes and brackets back
        # to their literal characters.
        ptb_map = {
            "''": '"', '``': '"',
            '-LRB-': '(', '-RRB-': ')',
            '-LSB-': '[', '-RSB-': ']',
            '-LCB-': '{', '-RCB-': '}',
        }
        return ptb_map.get(t, t)

    def _normalize_role(r: str) -> str:
        return 'Time' if 'Time' in r else r

    comm: Communication = read_communication_from_file(comm_file)
    tok_to_sentid: Dict[str, int] = {}
    sentences: List[List[str]] = []
    # Extract tokens to form sentences
    for sent_id, tok in enumerate(get_comm_tokenizations(comm)):
        tok_to_sentid[tok.uuid.uuidString] = sent_id
        sentences.append([
            _normalize_token(t.text) for t in tok.tokenList.tokenList
        ])
    doc: Document = Document(doc_key=str(comm.id), events=[],
                             sentences=sentences)
    # Convert each SituationMention into an Event object
    for sm in comm.situationMentionSetList[0].mentionList:
        if sm.situationType != 'EVENT':
            continue
        event: Event = Event(document=doc, kind=sm.situationKind,
                             arguments=[])
        for arg in sm.argumentList:
            if arg.entityMentionId is not None:
                arg_entity_mention = comm.entityMentionForUUID[
                    arg.entityMentionId.uuidString]
            elif arg.situationMentionId is not None:
                arg_entity_mention = comm.situationMentionForUUID[
                    arg.situationMentionId.uuidString]
            else:
                raise ValueError
            start_idx, end_idx = _entity_mention_to_span_indices(
                em=arg_entity_mention)
            if arg.role == 'TRIGGER':
                event.trigger = Trigger(start=start_idx, end=end_idx,
                                        document=doc)
            else:
                event.arguments.append(
                    Argument(start=start_idx, end=end_idx,
                             role=_normalize_role(arg.role),
                             # TODO(Yunmo): Ensure that there is only one Time
                             document=doc)
                )
        if event.trigger is None:
            if sm.tokens is None:
                start_idx, end_idx = (0, len(doc.to) - 1)
            else:
                start_idx, end_idx = _entity_mention_to_span_indices(sm)
            event.trigger = Trigger(start=start_idx, end=end_idx,
                                    document=doc)
        doc.events.append(event)

    doc.argument_mentions: List[Span] = []
    if task == 'argidcls-noisy':
        # Add all possible mentions for `argidcls`
        for em in comm.entityMentionSetList[0].mentionList:
            start_idx, end_idx = _entity_mention_to_span_indices(em=em)
            doc.argument_mentions.append(
                Span(start=start_idx, end=end_idx, document=doc))
    elif task == 'argcls' or task == 'argidcls':
        for event in doc.events:
            for arg in event.arguments:
                doc.argument_mentions.append(
                    Span(start=arg.start, end=arg.end, document=doc))
    # else:
    #     raise NotImplemented

    return doc
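# Usage sketch for from_concrete_file above (hypothetical path): build a
# Document from one ACE-style Communication and inspect the extracted events.
doc = from_concrete_file('example.concrete', task='argidcls')
for event in doc.events:
    print(event.kind, event.trigger.start, len(event.arguments))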