def test_minimal_communication_with_uuid(self):
    """A Communication carrying only uuid, type, id and metadata should validate."""
    comm = Communication()
    comm.uuid = generate_UUID()
    comm.type = "Test Communication"
    comm.id = "myID"
    comm.metadata = AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    self.assertTrue(validate_communication(comm))
def test_minimal_communication_with_uuid(self):
    """Check that a bare Communication (id, uuid, type, metadata) validates.

    NOTE(review): identically named to the method defined just above this
    one; Python binds only the last definition, so the earlier copy never
    runs. Consider removing one of the two.
    """
    c = Communication()
    c.id = "myID"
    c.type = "Test Communication"
    c.uuid = generate_UUID()
    c.metadata = AnnotationMetadata(tool="TEST",
                                    timestamp=int(time.time()))
    self.assertTrue(validate_communication(c))
def add_chunks_to_comm(comm, chunklink, fail_on_error):
    '''Converts the first constituency tree of each tokenization to chunks
    and adds them as a TokenTagging to the communication.

    comm - Communication to be annotated.
    chunklink - Path to the modified chunklink perl script.
    fail_on_error - If True, re-raise the first error encountered instead
        of merely logging it and continuing with the next tokenization.

    Returns a (num_chunked, num_sents) tuple: the number of tokenizations
    successfully chunked and the total number of tokenizations seen.
    '''
    num_sents = 0
    num_chunked = 0
    try:
        for tokenization in get_tokenizations(comm):
            num_sents += 1
            try:
                if tokenization.parseList and len(tokenization.parseList) > 0:
                    parse = tokenization.parseList[0]
                    # Convert concrete Parse to a PTB style parse string to
                    # use as stdin for chunklink.
                    ptb_str = '( ' + penn_treebank_for_parse(parse) + ' )\n'
                    ptb_str = ptb_str.encode('ascii', 'replace')
                    logging.debug("PTB string: %s", ptb_str)
                    # Run the chunklink script and capture the output.
                    try:
                        # We expect the chunklink script to be a modified
                        # version which can read a tree from stdin.
                        p = subprocess.Popen(['perl', chunklink],
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE,
                                             stdin=subprocess.PIPE)
                        chunk_str, chunk_err = p.communicate(input=ptb_str)
                        logging.debug("Chunklink stdout:\n%s", chunk_str)
                        logging.debug("Chunklink stderr:\n%s", chunk_err)
                        # BUG FIX: Popen/communicate never raise
                        # CalledProcessError, so the handler below was dead
                        # code and a failing chunklink run was silently
                        # treated as success. Check the exit status and
                        # raise explicitly.
                        if p.returncode != 0:
                            raise subprocess.CalledProcessError(
                                p.returncode, 'perl ' + chunklink)
                        chunk_tags = get_chunks(chunk_str)
                        logging.debug("Chunk tags: %s", chunk_tags)
                        # Expect exactly one chunk tag per token.
                        if len(chunk_tags) != len(
                                tokenization.tokenList.tokenList):
                            raise Exception(
                                "ERROR: incorrect number of chunks. expected=%d actual=%d"
                                % (len(tokenization.tokenList.tokenList),
                                   len(chunk_tags)))
                        metadata = concrete.AnnotationMetadata()
                        metadata.tool = "Chunklink Constituency Converter"
                        # int() works on both Python 2 and 3 (long() is
                        # Python 2 only) and matches usage elsewhere in
                        # this source.
                        metadata.timestamp = int(time.time())
                        # Extract the chunks column and create a
                        # TokenTagging from it.
                        chunks = concrete.TokenTagging()
                        chunks.uuid = concrete_uuid.generate_UUID()
                        chunks.metadata = metadata
                        chunks.taggingType = "CHUNK"
                        chunks.taggedTokenList = []
                        for i, chunk in enumerate(chunk_tags):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            tt.tag = chunk
                            chunks.taggedTokenList.append(tt)
                        # Add chunks to the list of TokenTaggings.
                        if not tokenization.tokenTaggingList:
                            tokenization.tokenTaggingList = []
                        tokenization.tokenTaggingList.append(chunks)
                        num_chunked += 1
                    except subprocess.CalledProcessError:
                        logging.error("Chunklink failed on tree: %s", ptb_str)
                        if fail_on_error:
                            # Bare raise preserves the original traceback
                            # ('raise e' would reset it on Python 2).
                            raise
            except Exception:
                # Per-tokenization guard: log and continue unless the
                # caller asked for fail-fast behavior.
                logging.exception("Chunking failed on tokenization")
                if fail_on_error:
                    raise
    except Exception:
        # Communication-level guard (e.g. get_tokenizations itself failing).
        logging.exception("Chunking failed on Communication")
        if fail_on_error:
            raise
    return num_chunked, num_sents
def add_chunks_to_comm(comm, chunklink, fail_on_error):
    '''Converts the first constituency tree of each tokenization to chunks
    and adds them as a TokenTagging to the communication.

    comm - Communication to be annotated.
    chunklink - Path to the modified chunklink perl script.
    fail_on_error - if True, re-raise any error encountered; otherwise log
        it and move on to the next tokenization.

    Returns (num_chunked, num_sents): the number of tokenizations
    successfully chunked and the total number of tokenizations seen.

    NOTE(review): this duplicates the add_chunks_to_comm defined earlier in
    this file; Python binds only the last definition, so one copy is dead.
    '''
    num_sents = 0
    num_chunked = 0
    try:
        for tokenization in get_tokenizations(comm):
            num_sents += 1
            try:
                if tokenization.parseList and len(tokenization.parseList) > 0:
                    parse = tokenization.parseList[0]
                    # Convert concrete Parse to a PTB style parse string to use as stdin for chunklink.
                    ptb_str = '( ' + penn_treebank_for_parse(parse) + ' )\n'
                    ptb_str = ptb_str.encode('ascii', 'replace')
                    logging.debug("PTB string: " + ptb_str)
                    # Run the chunklink script and capture the output.
                    try:
                        # We expect the chunklink script to be a modified version which can read a tree from stdin.
                        p = subprocess.Popen(['perl', chunklink],
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE,
                                             stdin=subprocess.PIPE)
                        stdouterr = p.communicate(input=ptb_str)
                        chunk_str = stdouterr[0]
                        chunk_err = stdouterr[1]
                        logging.debug("Chunklink stdout:\n" + chunk_str)
                        logging.debug("Chunklink stderr:\n" + chunk_err)
                        chunk_tags = get_chunks(chunk_str)
                        logging.debug("Chunk tags: " + str(chunk_tags))
                        # One chunk tag is expected per token; a mismatch
                        # means chunklink mis-parsed the tree.
                        if len(chunk_tags) != len(tokenization.tokenList.tokenList):
                            raise Exception("ERROR: incorrect number of chunks. expected=%d actual=%d" % (len(tokenization.tokenList.tokenList), len(chunk_tags)))
                        metadata = concrete.AnnotationMetadata()
                        metadata.tool = "Chunklink Constituency Converter"
                        # NOTE(review): long() is Python 2 only; elsewhere in
                        # this source int(time.time()) is used instead.
                        metadata.timestamp = long(time.time())
                        # Extract the chunks column and create a TokenTagging from it.
                        chunks = concrete.TokenTagging()
                        chunks.uuid = concrete_uuid.generate_UUID()
                        chunks.metadata = metadata
                        chunks.taggingType = "CHUNK"
                        chunks.taggedTokenList = []
                        for i, chunk in enumerate(chunk_tags):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            tt.tag = chunk
                            chunks.taggedTokenList.append(tt)
                        # Add chunks to the list of TokenTaggings.
                        if not tokenization.tokenTaggingList:
                            tokenization.tokenTaggingList = []
                        tokenization.tokenTaggingList.append(chunks)
                        num_chunked += 1
                    except subprocess.CalledProcessError as e:
                        # NOTE(review): Popen/communicate do not raise
                        # CalledProcessError and p.returncode is never
                        # checked, so this handler appears unreachable and a
                        # non-zero chunklink exit falls through as apparent
                        # success — confirm intent.
                        logging.error("Chunklink failed on tree: %s" % (ptb_str))
                        if fail_on_error:
                            raise e
            except Exception as e:
                # Per-tokenization guard: log and keep going unless the
                # caller asked for fail-fast behavior.
                logging.exception("Chunking failed on tokenization")
                if fail_on_error:
                    raise e
    except Exception as e:
        # Communication-level guard (e.g. get_tokenizations itself failing).
        logging.exception("Chunking failed on Communication")
        if fail_on_error:
            raise e
    return num_chunked, num_sents
def test_next(self): comm = Communication() comm.uuid = generate_UUID()