Example #1
 def test_x_prefix_no_comm(self):
     n = 1000
     augf = AnalyticUUIDGeneratorFactory()
     u = augf.comm_uuid
     for i in range(n):
         aug = augf.create()
         self.assertTrue(next(aug).uuidString.startswith(u[:8 + 1 + 4]))
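The `8 + 1 + 4` sums in these tests index into the canonical 8-4-4-4-12 UUID string: analytic UUIDs take the shape xxxxxxxx-xxxx-yyyy-yyyy-zzzzzzzzzzzz, where the x digits are shared by every generator built from one factory (or Communication), the y digits are fixed per generator, and the z digits increment by one per UUID, as the remaining tests exercise. A minimal standalone sketch of that layout (assuming `AnalyticUUIDGeneratorFactory` is importable from `concrete.util`):

from concrete.util import AnalyticUUIDGeneratorFactory  # assumed import path

augf = AnalyticUUIDGeneratorFactory()
aug = augf.create()
u1 = next(aug).uuidString
u2 = next(aug).uuidString
print(u1[:13])   # x prefix: shared by all generators from this factory
print(u1[:23])   # x + y prefix: constant within this one generator
print((int(u2[24:], 16) - int(u1[24:], 16)) % 2**48)  # 1: z increments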
Example #2
 def test_x_prefix_bad_comm_uuid(self):
     u = '7575a428a-aaf7-4c2e-929e-1e2a0ab59e16'
     comm = Duck()
     comm.uuid = Duck()
     comm.uuid.uuidString = u
     augf = AnalyticUUIDGeneratorFactory(comm)
     with self.assertRaises(ValueError):
         augf.create()
Example #3
def create_comm_from_tweet(json_tweet_string):
    """Create a Concrete Communication from a JSON Tweet string

    Args:
        json_tweet_string: A JSON string for a Tweet, using the JSON
            format specified by the Twitter API:
              https://dev.twitter.com/docs/platform-objects/tweets

    Returns:
        A Concrete Communication object
    """
    tweet_data = json.loads(json_tweet_string)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = concrete.Communication()
    comm.id = "Annotation_Test_1"
    comm.metadata = concrete.AnnotationMetadata(
        tool="Annotation Example script",
        timestamp=int(time.time())
    )
    comm.text = tweet_data['text']
    comm.type = "Tweet"
    comm.uuid = next(aug)

    comm.sectionList = [concrete.Section()]
    comm.sectionList[0].kind = "mySectionKind"
    comm.sectionList[0].uuid = next(aug)
    comm.sectionList[0].sentenceList = [concrete.Sentence()]
    comm.sectionList[0].sentenceList[0].uuid = next(aug)
    comm.sectionList[0].sentenceList[0].tokenization = concrete.Tokenization()

    tokenization = comm.sectionList[0].sentenceList[0].tokenization
    tokenization.kind = concrete.TokenizationKind.TOKEN_LIST
    tokenization.metadata = concrete.AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    tokenization.tokenList = concrete.TokenList()
    tokenization.tokenList.tokenList = []
    tokenization.uuid = next(aug)

    # Whitespace tokenization
    tokens = comm.text.split()

    for i, token_text in enumerate(tokens):
        t = concrete.Token()
        t.tokenIndex = i
        t.text = token_text
        tokenization.tokenList.tokenList.append(t)

    if validate_communication(comm):
        print("Created valid Communication")
    else:
        print("ERROR: Invalid Communication")

    return comm
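A hedged driver for the function above (a real Tweet object carries many more fields, but only `text` is consumed here; the ID string is illustrative):

tweet_json = '{"text": "Test tweet for the conversion example"}'
comm = create_comm_from_tweet(tweet_json)  # also prints a validation message
print(comm.uuid.uuidString)
print([t.text for t in
       comm.sectionList[0].sentenceList[0].tokenization.tokenList.tokenList])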
Example #4
 def test_y_prefix(self):
     m = 100
     n = 100
     augf = AnalyticUUIDGeneratorFactory()
     for i in range(m):
         aug = augf.create()
         uu = next(aug).uuidString
         for j in range(n - 1):
             self.assertTrue(next(aug).uuidString.startswith(
                 uu[:8 + 1 + 4 + 1 + 4 + 1 + 4]))
Example #5
 def test_y_prefix_spread(self):
     m = 10
     augf = AnalyticUUIDGeneratorFactory()
     # union bound: (1/16)^8 * m^2 = 2e-8
     s = set()
     for i in range(m):
         aug = augf.create()
         u = next(aug).uuidString
         s.add(u[:8 + 1 + 4 + 1 + 4 + 1 + 4])
     self.assertEqual(len(s), m)
Example #6
 def test_x_prefix_with_comm(self):
     n = 1000
     u = '7575a428-aaf7-4c2e-929e-1e2a0ab59e16'
     comm = Duck()
     comm.uuid = Duck()
     comm.uuid.uuidString = u
     augf = AnalyticUUIDGeneratorFactory(comm)
     self.assertEqual(augf.comm_uuid, u)
     for i in range(n):
         aug = augf.create()
         self.assertTrue(next(aug).uuidString.startswith(u[:8 + 1 + 4]))
Example #7
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:

    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
       The string will be whitespace-tokenized.

    Returns:

    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    toolname = "TEST"
    timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = Communication(id=comm_id,
                         metadata=AnnotationMetadata(tool=toolname,
                                                     timestamp=timestamp),
                         type=toolname,
                         uuid=next(aug))

    tokenization = Tokenization(kind=TokenizationKind.TOKEN_LIST,
                                metadata=AnnotationMetadata(
                                    tool=toolname, timestamp=timestamp),
                                tokenList=TokenList(tokenList=[]),
                                uuid=next(aug))
    token_string_list = sentence_string.split()
    for i, token_string in enumerate(token_string_list):
        tokenization.tokenList.tokenList.append(
            Token(text=token_string, tokenIndex=i))

    sentence = Sentence(textSpan=TextSpan(0, len(sentence_string)),
                        tokenization=tokenization,
                        uuid=next(aug))

    section = Section(kind="SectionKind",
                      sentenceList=[sentence],
                      textSpan=TextSpan(0, len(sentence_string)),
                      uuid=next(aug))

    comm.sectionList = [section]
    comm.text = sentence_string

    return comm
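A short usage sketch (the ID is arbitrary; the function itself logs the deprecation warning shown above):

comm = create_simple_comm('test-comm-1')
print(comm.text)  # "Super simple sentence ."
tokens = comm.sectionList[0].sentenceList[0].tokenization.tokenList.tokenList
print([t.text for t in tokens])  # ['Super', 'simple', 'sentence', '.']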
Example #8
 def test_z_increment(self):
     m = 100
     n = 100
     augf = AnalyticUUIDGeneratorFactory()
     for i in range(m):
         aug = augf.create()
         u = next(aug).uuidString
         z = int(u[8 + 1 + 4 + 1 + 4 + 1 + 4 + 1:], 16)
         for j in range(n - 1):
             u = next(aug).uuidString
             self.assertEqual(int(u[8 + 1 + 4 + 1 + 4 + 1 + 4 + 1:], 16),
                              (z + 1) % 2**48)
             z = (z + 1) % 2**48
Example #9
 def test_spread(self):
     m = 100
     n = 100
     augf = AnalyticUUIDGeneratorFactory()
     # union bound: (2m-1)*(1/16)^12 * n^2 = 7e-9
     s = set()
     for i in range(m):
         aug = augf.create()
         u = next(aug).uuidString
         s.add(u)
         for j in range(n - 1):
             u = next(aug).uuidString
             s.add(u)
     self.assertEqual(len(s), m * n)
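The union-bound comments can be checked by direct arithmetic; this quick evaluation (an editor's sketch, not part of the test suite) reproduces the quoted figure, and the 2e-8 figure in test_y_prefix_spread falls out the same way with m = 10 prefixes of 8 hex digits:

m, n = 100, 100
print((2 * m - 1) * n ** 2 / 16 ** 12)  # ~7.1e-09, the bound quoted above
print(10 ** 2 / 16 ** 8)                # ~2.3e-08, as in test_y_prefix_spread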
Example #10
def add_dictionary_tagging(comm):
    """Adds In/Out of dictionary 'POS' tags to a Communication

    Takes a Concrete Communication, adds a Part-Of-Speech tag to each
    token, where the tags record whether the token is 'In' or 'Out' of
    the system dictionary.

    Args:
        comm: A Concrete Communication with tokens

    Returns:
        The same Communication object, with POS tags added in place
    """
    dictionary = set()
    for w in open('/usr/share/dict/words'):
        dictionary.add(w.strip().lower())

    augf = AnalyticUUIDGeneratorFactory(comm)
    aug = augf.create()

    if comm.sectionList:
        for section in comm.sectionList:
            if section.sentenceList:
                for sentence in section.sentenceList:
                    posTagList = concrete.TokenTagging()
                    posTagList.metadata = concrete.AnnotationMetadata(
                        tool="POS Tagger", timestamp=int(time.time()))
                    posTagList.taggingType = "POS"
                    posTagList.taggedTokenList = []
                    posTagList.uuid = next(aug)
                    tkzn = sentence.tokenization
                    if tkzn.tokenList:
                        for i, token in enumerate(tkzn.tokenList.tokenList):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            if token.text.lower() in dictionary:
                                tt.tag = "In"
                            else:
                                tt.tag = "Out"
                            posTagList.taggedTokenList.append(tt)
                            print("%d [%s] %s" % (i, token.text, tt.tag))
                    tkzn.tokenTaggingList = [posTagList]
            print()

    if validate_communication(comm):
        print("Created valid POS tagging for Communication")
    else:
        print("ERROR: Invalid POS tagging Communication")
    return comm
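A sketch chaining this with create_comm_from_tweet from Example #3 (assumes the word list at /usr/share/dict/words that the function hard-codes; the tweet text is illustrative):

comm = create_comm_from_tweet('{"text": "Barber tells me his son plays football"}')
tagged = add_dictionary_tagging(comm)
tagging = tagged.sectionList[0].sentenceList[0].tokenization.tokenTaggingList[0]
print([(t.tokenIndex, t.tag) for t in tagging.taggedTokenList])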
Example #11
 def annotate(self, communication):
     text = ""
     for section in communication.sectionList:
         if section.kind == "content":
             text += communication.text[section.textSpan.start:section.textSpan.ending]
     scores = {languages.get(iso639_1_code=k).iso639_3_code: math.exp(v)
               for k, v in self.classifier.classify(text).items()}
     logging.info(str(scores))
     augf = AnalyticUUIDGeneratorFactory(communication)
     aug = augf.create()
     lid = LanguageIdentification(
         uuid=next(aug),
         languageToProbabilityMap=scores,
         metadata=AnnotationMetadata(tool="valid",
                                     timestamp=int(time.time()),
                                     kBest=len(scores)))
     communication.lidList.append(lid)
     return communication
Example #12
    def annotate(self, communication):
        augf = AnalyticUUIDGeneratorFactory(communication)
        aug = augf.create()

        for section in communication.sectionList:
            text = communication.text[section.textSpan.start:section.textSpan.ending]
            current_offset = section.textSpan.start
            for sent in nltk.sent_tokenize(text):
                logging.info("Found sentence %s", sent)
                initial = text.find(sent)
                s = Sentence(uuid=next(aug),
                             textSpan=TextSpan(
                                 start=current_offset + initial,
                                 ending=current_offset + initial + len(sent)))
                section.sentenceList.append(s)
                current_offset = current_offset + initial + len(sent)
                text = communication.text[current_offset:]
        return communication
Example #13
    def annotate(self, communication):
        text = communication.text
        augf = AnalyticUUIDGeneratorFactory(communication)
        aug = augf.create()
        entities = {}
        for section in communication.sectionList:
            for sentence in section.sentenceList:
                tokens = [
                    x.text for x in sentence.tokenization.tokenList.tokenList
                ]
                tags = [
                    x.tag for x in
                    sentence.tokenization.tokenTaggingList[-1].taggedTokenList
                ]
                for subtree in nltk.ne_chunk(list(zip(tokens, tags))).subtrees():
                    if subtree.label() != "S":
                        name = " ".join([x[0] for x in subtree.leaves()])
                        logging.info("Found named entity \"%s\"", name)
                        # Accumulate mentions under the (name, label) pair
                        key = (name, subtree.label())
                        entities[key] = entities.get(key, []) + [
                            EntityMention(
                                uuid=next(aug),
                                entityType=subtree.label(),
                                tokens=TokenRefSequence(
                                    tokenIndexList=[],
                                    tokenizationId=sentence.tokenization.uuid))
                        ]

        communication.entitySetList.append(
            EntitySet(uuid=next(aug),
                      metadata=AnnotationMetadata(timestamp=int(time.time()),
                                                  tool="nltk"),
                      entityList=[
                          Entity(uuid=next(aug),
                                 mentionIdList=[x.uuid for x in v],
                                 canonicalName=k[0],
                                 type=k[1]) for k, v in entities.items()
                      ]))

        communication.entityMentionSetList.append(
            EntityMentionSet(uuid=next(aug),
                             metadata=AnnotationMetadata(
                                 timestamp=int(time.time()),
                                 tool="nltk"),
                             mentionList=sum(entities.values(), [])))

        return communication
Example #14
 def annotate(self, communication):
     print(communication.id)
     augf = AnalyticUUIDGeneratorFactory(communication)
     aug = augf.create()
     for section in communication.sectionList:
         for sentence in section.sentenceList:
             text = communication.text[sentence.textSpan.start:sentence.textSpan.ending]
             sentence.tokenization = Tokenization(
                 uuid=next(aug),
                 kind=TokenizationKind.TOKEN_LIST,
                 tokenList=TokenList(tokenList=[]),
                 tokenTaggingList=[],
                 metadata=AnnotationMetadata(timestamp=int(time.time()),
                                             tool="nltk"))
             for i, token in enumerate(nltk.word_tokenize(text)):
                 logging.info("Found token %s", token)
                 sentence.tokenization.tokenList.tokenList.append(
                     Token(tokenIndex=i, text=token))
     return communication
Example #15
def create_comm(comm_id, text='',
                comm_type='article', section_kind='passage',
                metadata_tool='concrete-python',
                metadata_timestamp=None,
                annotation_level=AL_TOKEN):
    '''
    Create a simple, valid Communication from text.
    By default the text will be split by double-newlines into sections
    and then by single newlines into sentences within those sections.

    annotation_level controls the amount of annotation that is added:
      AL_NONE      add no optional annotations (not even sections)
      AL_SECTION   add sections but not sentences
      AL_SENTENCE  add sentences but not tokens
      AL_TOKEN     add all annotations, up to tokens (the default)

    If metadata_timestamp is None, the current time will be used.
    '''

    if metadata_timestamp is None:
        metadata_timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    sections = (annotation_level is not None) and (annotation_level != AL_NONE)

    return Communication(
        id=comm_id,
        uuid=next(aug),
        type=comm_type,
        text=text,
        metadata=AnnotationMetadata(
            tool=metadata_tool,
            timestamp=metadata_timestamp,
        ),
        sectionList=(
            [
                create_section(sec_text, sec_start, sec_end, section_kind,
                               aug, metadata_tool, metadata_timestamp,
                               annotation_level)
                for (sec_text, sec_start, sec_end) in _split(text, '\n\n')
            ] if text.strip() else []
        ) if sections else None,
    )
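A sketch of the splitting behavior (IDs and text are illustrative, and the AL_SECTION line assumes create_section omits sentences below AL_SENTENCE, as the docstring describes):

comm = create_comm('article-1', 'One sentence.\nTwo sentences.\n\nNew section.')
print(len(comm.sectionList))                  # 2: sections split on '\n\n'
print(len(comm.sectionList[0].sentenceList))  # 2: sentences split on '\n'

bare = create_comm('article-2', 'Some text.', annotation_level=AL_SECTION)
print(bare.sectionList[0].sentenceList)       # no sentences at this level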
Example #16
    def annotate(self, communication):
        text = communication.text
        augf = AnalyticUUIDGeneratorFactory(communication)
        aug = augf.create()
        for section in communication.sectionList:
            for sentence in section.sentenceList:
                tokens = [
                    x.text for x in sentence.tokenization.tokenList.tokenList
                ]
                sentence.tokenization.tokenTaggingList.append(
                    TokenTagging(uuid=next(aug),
                                 metadata=AnnotationMetadata(
                                     timestamp=int(time.time()),
                                     tool="nltk"),
                                 taggedTokenList=[],
                                 taggingType="Penn Treebank"))
                for i, (tok, tag) in enumerate(nltk.pos_tag(tokens)):
                    logging.info("Tagged %s as %s", tok, tag)
                    sentence.tokenization.tokenTaggingList[-1].taggedTokenList.append(
                        TaggedToken(tokenIndex=i, tag=tag))

        return communication
Example #17
    def index():
        text = request.forms.get('text')
        transport = TTransport.TFramedTransport(
            TSocket.TSocket(options.annotator_host, options.annotator_port))
        protocol = TCompactProtocol.TCompactProtocol(transport)
        client = Annotator.Client(protocol)
        transport.open()
        augf = AnalyticUUIDGeneratorFactory()
        aug = augf.create()
        c = Communication(
            id="",
            text=text,
            uuid=next(aug),
            type="user-supplied input",
            metadata=AnnotationMetadata(timestamp=int(time.time()),
                                        tool="stdin"),
            sectionList=[
                Section(uuid=next(aug),
                        sentenceList=[],
                        kind="paragraph",
                        textSpan=TextSpan(start=0, ending=len(text)))
            ],
            entitySetList=[],
            entityMentionSetList=[],
        )

        new_c = client.annotate(c)
        form = '''<form action="/" method="post">
        Enter or paste some text: <input name="text" type="text" />
        <input value="Submit" type="submit" />
        </form>
        '''
        return form + "\n".join(["<h3>%s</h3>" % text] + [
            "\n".join([
                "<br>%s %s" % (e.type, e.canonicalName) for e in es.entityList
            ]) for es in new_c.entitySetList
        ])
Example #18
def json_tweet_object_to_Communication(tweet):
    """
    """
    tweet_info = json_tweet_object_to_TweetInfo(tweet)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    if 'id_str' in tweet:
        tweet_id = tweet['id_str']
    else:
        logging.warning('Tweet has no id_str, leaving communication id blank')
        tweet_id = None
    tweet_time = datetime_to_timestamp(datetime.strptime(tweet_info.createdAt,
                                                         CREATED_AT_FORMAT))
    comm = Communication(
        communicationMetadata=CommunicationMetadata(
            tweetInfo=tweet_info),
        metadata=AnnotationMetadata(
            tool=TOOL_NAME,
            timestamp=int(time.time())),
        originalText=tweet_info.text,
        text=tweet_info.text,
        type=TWEET_TYPE,
        uuid=next(aug),
        startTime=tweet_time,
        endTime=tweet_time,
        id=tweet_id
    )

    # either this, or pass in gen as parameter to fx
    # latter is more annoying to test but slightly cleaner
    if tweet_info.lid is not None:
        tweet_info.lid.uuid = next(aug)
        lidList = [tweet_info.lid]
        comm.lidList = lidList
    return comm
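A hedged driver sketch (field values are illustrative; json_tweet_object_to_TweetInfo is expected to read created_at in the Twitter CREATED_AT_FORMAT):

import json

raw = ('{"id_str": "1234", "text": "hello world", '
       '"created_at": "Wed Aug 27 13:08:45 +0000 2008"}')
comm = json_tweet_object_to_Communication(json.loads(raw))
print(comm.id, comm.startTime, comm.text)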
Example #19
def json_tweet_object_to_Communication(tweet):
    """
    """
    tweet_info = json_tweet_object_to_TweetInfo(tweet)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    if 'id_str' in tweet:
        tweet_id = tweet['id_str']
    else:
        logging.warning('Tweet has no id_str, leaving communication id blank')
        tweet_id = None
    tweet_time = unix_time(datetime.strptime(tweet_info.createdAt,
                                             CREATED_AT_FORMAT))
    comm = Communication(
        communicationMetadata=CommunicationMetadata(
            tweetInfo=tweet_info),
        metadata=AnnotationMetadata(
            tool=TOOL_NAME,
            timestamp=int(time.time())),
        originalText=tweet_info.text,
        text=tweet_info.text,
        type=TWEET_TYPE,
        uuid=next(aug),
        startTime=tweet_time,
        endTime=tweet_time,
        id=tweet_id
    )

    # either this, or pass in gen as parameter to fx
    # latter is more annoying to test but slightly cleaner
    if tweet_info.lid is not None:
        tweet_info.lid.uuid = next(aug)
        lidList = [tweet_info.lid]
        comm.lidList = lidList
    return comm
Example #20
    parser.add_argument("-p", "--port", dest="port", type=int, default=9090)
    parser.add_argument("-H", "--host", dest="host", default="localhost")
    options = parser.parse_args()

    # Make socket
    transport = TSocket.TSocket(options.host, options.port)

    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)

    # Wrap in a protocol
    protocol = TCompactProtocol.TCompactProtocol(transport)

    # Create a client to use the protocol encoder
    client = Annotator.Client(protocol)

    # Connect!
    transport.open()

    while True:
        s = input("Write some text > ")
        if re.match(r"^\s*$", s):
            break
        else:
            augf = AnalyticUUIDGeneratorFactory()
            aug = augf.create()
            c = Communication(id="",
                              text=s,
                              uuid=next(aug),
                              type="tweet",
                              metadata=AnnotationMetadata(timestamp=0,
                                                          tool="stdin"),
                              lidList=[])

            new_c = client.annotate(c)
            print(new_c)