def create_comm_from_tweet(json_tweet_string):
    """Build a Concrete Communication from a JSON Tweet string.

    Args:
        json_tweet_string: A JSON string for a Tweet, using the JSON
            format specified by the Twitter API:
              https://dev.twitter.com/docs/platform-objects/tweets

    Returns:
        A Concrete Communication object
    """
    tweet_data = json.loads(json_tweet_string)

    uuid_gen = AnalyticUUIDGeneratorFactory().create()

    # Top-level Communication carrying the raw tweet text.
    comm = concrete.Communication()
    comm.id = "Annotation_Test_1"
    comm.metadata = concrete.AnnotationMetadata(
        tool="Annotation Example script",
        timestamp=int(time.time())
    )
    comm.text = tweet_data['text']
    comm.type = "Tweet"
    comm.uuid = next(uuid_gen)

    # Single section / single sentence structure.
    # NOTE: UUIDs are drawn in the same order as before
    # (communication, section, sentence, tokenization).
    section = concrete.Section()
    section.kind = "mySectionKind"
    section.uuid = next(uuid_gen)

    sentence = concrete.Sentence()
    sentence.uuid = next(uuid_gen)

    tokenization = concrete.Tokenization()
    tokenization.kind = concrete.TokenizationKind.TOKEN_LIST
    tokenization.metadata = concrete.AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    tokenization.tokenList = concrete.TokenList()
    tokenization.uuid = next(uuid_gen)

    # Whitespace tokenization of the tweet text.
    tokenization.tokenList.tokenList = [
        concrete.Token(tokenIndex=i, text=word)
        for i, word in enumerate(comm.text.split())
    ]

    sentence.tokenization = tokenization
    section.sentenceList = [sentence]
    comm.sectionList = [section]

    if validate_communication(comm):
        print("Created valid Communication")
    else:
        print("ERROR: Invalid Communication")

    return comm
def test_check_required_fields():
    # When a field is marked as required in a .thrift file, the
    # Python code generated by the Thrift compiler only seems to
    # capture this requirement in the generated class's validate()
    # function.  ThriftGeneratedClass.thrift_spec records field names
    # and types, but not whether a field is required.
    #
    # Here is the validate() function for the Communication class:
    #
    #    def validate():
    #        if self.id is None:
    #            raise TProtocol.TProtocolException(
    #                message='Required field id is unset!')
    #        if self.uuid is None:
    #            raise TProtocol.TProtocolException(
    #                message='Required field uuid is unset!')
    #        if self.type is None:
    #            raise TProtocol.TProtocolException(
    #                message='Required field type is unset!')
    #        return
    #
    # Because validate() raises on the FIRST missing required field,
    # the only way to discover the next missing field is to fill in
    # the current one and validate again — which is what this test
    # does, one required field at a time.

    def expect_invalid(communication, expected_log_entry):
        # Validation must fail, and exactly this log record must be
        # emitted at the root logger.
        with LogCapture() as captured:
            assert not validate_thrift_deep(communication)
        captured.check(expected_log_entry)

    comm = concrete.Communication()

    expect_invalid(
        comm,
        ('root', 'ERROR', "Communication: Required Field 'id' is unset!"))

    comm.id = "ID"
    expect_invalid(
        comm,
        ('root', 'ERROR', "Communication: Required Field 'uuid' is unset!"))

    comm.uuid = concrete.UUID(uuidString="TEST_UUID")
    expect_invalid(
        comm,
        ('root', 'ERROR',
         StringComparison(r".*TEST_UUID.*Required Field 'type' is unset!")))

    comm.metadata = concrete.AnnotationMetadata(tool="TEST",
                                                timestamp=int(time.time()))

    comm.type = "OTHER"
    assert validate_thrift_deep(comm)
def add_dictionary_tagging(comm):
    """Adds In/Out of dictionary 'POS' tags to a Communication

    Takes a Concrete Communication, adds a Part-Of-Speech tag to each
    token, where the tags record whether the token is 'In' or 'Out' of
    the system dictionary (/usr/share/dict/words).

    Args:
        comm: A Concrete Communication with tokens

    Returns:
        The same Communication object, modified in place with POS tags
        added.  (It is NOT a copy, despite what an earlier version of
        this docstring claimed.)
    """
    # Load the word list once, lowercased for case-insensitive lookup.
    # Use a context manager so the file handle is closed (previously
    # it was left open).
    with open('/usr/share/dict/words') as words_file:
        dictionary = {w.strip().lower() for w in words_file}

    augf = AnalyticUUIDGeneratorFactory(comm)
    aug = augf.create()

    if comm.sectionList:
        for section in comm.sectionList:
            if section.sentenceList:
                for sentence in section.sentenceList:
                    posTagList = concrete.TokenTagging()
                    posTagList.metadata = concrete.AnnotationMetadata(
                        tool="POS Tagger", timestamp=int(time.time()))
                    posTagList.taggingType = "POS"
                    posTagList.taggedTokenList = []
                    posTagList.uuid = next(aug)
                    tkzn = sentence.tokenization
                    # Guard against a missing tokenization (previously
                    # a None tokenization raised AttributeError here).
                    if tkzn and tkzn.tokenList:
                        for i, token in enumerate(tkzn.tokenList.tokenList):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            tt.tag = "In" if token.text.lower() in dictionary \
                                else "Out"
                            posTagList.taggedTokenList.append(tt)
                            print("%d [%s] %s" % (i, token.text, tt.tag))
                        # NOTE(review): this REPLACES any existing
                        # tokenTaggingList rather than appending to it
                        # (add_chunks_to_comm appends) — confirm this
                        # clobbering is intentional.
                        tkzn.tokenTaggingList = [posTagList]
            print()

    if validate_communication(comm):
        print("Created valid POS tagging for Communication")
    else:
        print("ERROR: Invalid POS tagging Communication")
    return comm
# ----- Example #4 -----
def add_chunks_to_comm(comm, chunklink, fail_on_error):
    '''Converts the first constituency tree of each tokenization
    to chunks and adds them as a TokenTagging to the communication.

    comm - Communication to be annotated.
    chunklink - Path to the modified chunklink perl script.
    fail_on_error - If True, re-raise any error instead of just
        logging it and moving on.

    Returns a (num_chunked, num_sents) tuple: the number of
    tokenizations successfully chunked and the number seen.
    '''
    num_sents = 0
    num_chunked = 0
    try:
        for tokenization in get_tokenizations(comm):
            num_sents += 1
            try:
                if tokenization.parseList and len(tokenization.parseList) > 0:
                    parse = tokenization.parseList[0]
                    # Convert concrete Parse to a PTB style parse string to
                    # use as stdin for chunklink.
                    ptb_str = '( ' + penn_treebank_for_parse(parse) + ' )\n'
                    # Log before encoding: "str" + bytes raised TypeError
                    # on Python 3 in the original.  Lazy %-args avoid
                    # formatting when DEBUG is off.
                    logging.debug("PTB string: %s", ptb_str)
                    ptb_bytes = ptb_str.encode('ascii', 'replace')

                    # Run the chunklink script and capture the output.
                    try:
                        # We expect the chunklink script to be a modified
                        # version which can read a tree from stdin.
                        p = subprocess.Popen(['perl', chunklink],
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE,
                                             stdin=subprocess.PIPE)

                        raw_out, raw_err = p.communicate(input=ptb_bytes)
                        # Pipes yield bytes on Python 3; decode so the
                        # downstream string handling works.
                        chunk_str = raw_out.decode('ascii', 'replace')
                        chunk_err = raw_err.decode('ascii', 'replace')
                        logging.debug("Chunklink stdout:\n%s", chunk_str)
                        logging.debug("Chunklink stderr:\n%s", chunk_err)
                        chunk_tags = get_chunks(chunk_str)
                        logging.debug("Chunk tags: %s", chunk_tags)
                        if len(chunk_tags) != len(
                                tokenization.tokenList.tokenList):
                            raise Exception(
                                "ERROR: incorrect number of chunks. expected=%d actual=%d"
                                % (len(tokenization.tokenList.tokenList),
                                   len(chunk_tags)))

                        metadata = concrete.AnnotationMetadata()
                        metadata.tool = "Chunklink Constituency Converter"
                        # int(), not long(): `long` does not exist on
                        # Python 3 and int suffices on both versions.
                        metadata.timestamp = int(time.time())
                        # Extract the chunks column and create a
                        # TokenTagging from it.
                        chunks = concrete.TokenTagging()
                        chunks.uuid = concrete_uuid.generate_UUID()
                        chunks.metadata = metadata
                        chunks.taggingType = "CHUNK"
                        chunks.taggedTokenList = []
                        for i, chunk in enumerate(chunk_tags):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            tt.tag = chunk
                            chunks.taggedTokenList.append(tt)

                        # Add chunks to the list of TokenTaggings.
                        if not tokenization.tokenTaggingList:
                            tokenization.tokenTaggingList = []
                        tokenization.tokenTaggingList.append(chunks)
                        num_chunked += 1
                    except subprocess.CalledProcessError as e:
                        # NOTE(review): Popen/communicate never raise
                        # CalledProcessError (only check_call/check_output
                        # do); kept for safety, but a non-zero exit status
                        # is effectively ignored here — confirm intent.
                        logging.error("Chunklink failed on tree: %s", ptb_str)
                        if fail_on_error: raise e
            except Exception as e:
                logging.exception("Chunking failed on tokenization")
                if fail_on_error: raise e
    except Exception as e:
        logging.exception("Chunking failed on Communication")
        if fail_on_error: raise e
    return num_chunked, num_sents
# ----- Example #5 -----
def create_dummy_annotation():
    '''Creates empty annotation to satisfy format'''
    # Minimal metadata: tool name plus the current time.
    return concrete.AnnotationMetadata(
        tool='Quora Scrape Ingest',
        timestamp=int(time.time()))
# ----- Example #6 -----
def create_dummy_annotation():
    '''Creates an empty placeholder AnnotationMetadata.'''
    metadata = concrete.AnnotationMetadata()
    metadata.tool = 'Quora Scrape Ingest'
    metadata.timestamp = int(time.time())
    return metadata