Example #1
def multi_token_match(stream_item, aligner_data):
    '''
    iterate through tokens looking for near-exact matches to strings
    in si.ratings...mentions
    '''
    sentences = stream_item.body.sentences.get(aligner_data['tagger_id'])
    if not sentences:
        return
    ## construct a list of tuples, where the first part of each tuple
    ## is a tuple of cleansed strings, and the second part is the
    ## Token object from which it came.
    tokens = map(
        lambda tok: (cleanse(tok.token.decode('utf8')).split(' '), tok),
        itertools.chain(*[sent.tokens for sent in sentences]))
    for annotator_id, ratings in stream_item.ratings.items():
        if annotator_id == aligner_data['annotator_id']:
            for rating in ratings:
                label = Label(annotator=rating.annotator, target=rating.target)

                num_tokens_matched = 0
                for tok in look_ahead_match(rating, tokens):
                    if aligner_data.get('update_labels'):
                        tok.labels.pop(annotator_id, None)
                    add_annotation(tok, label)
                    num_tokens_matched += 1

                if num_tokens_matched == 0:
                    logger.critical(
                        'failed multi_token_match %r:\n  mentions: %r\n  tokens: %r\n clean_html=%r',
                        stream_item.abs_url, rating.mentions, tokens,
                        stream_item.body.clean_html)
                else:
                    logger.debug('matched %d tokens for %r',
                                 num_tokens_matched, rating.target.target_id)
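
A minimal usage sketch for multi_token_match, assuming a streamcorpus.StreamItem si that already carries tagger sentences and document-level ratings; the tagger and annotator ids below are hypothetical placeholders:

## hypothetical invocation; 'my_tagger' and 'my_annotator' are placeholders,
## and si is assumed to be a streamcorpus.StreamItem with sentences and ratings
aligner_data = {
    'tagger_id': 'my_tagger',        # key into si.body.sentences
    'annotator_id': 'my_annotator',  # key into si.ratings
    'update_labels': True,           # drop stale token labels before re-adding
}
multi_token_match(si, aligner_data)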
Example #2
    def _make_stream_item(cls, path, metadata, abs_url, entities):
        '''
        build a StreamItem from a local file plus its metadata and a
        list of (entity, is_profile) pairs to rate against it
        '''
        ## Every StreamItem has a stream_time property.  It usually comes
        ## from the document creation time.
        creation_time = os.path.getctime(path)

        ## make stream item
        stream_item = streamcorpus.make_stream_item(
            creation_time,
            abs_url)

        stream_item.source = metadata.get('source')

        ## build a ContentItem for the body
        body = streamcorpus.ContentItem()
        body.media_type = magic.from_file(path, mime=True)
        
        logger.info('opening %r', path)
        with open(path) as f:
            body.raw = f.read()

        ## attach the content_item to the stream_item
        stream_item.body = body

        ## annotations
        anno = streamcorpus.Annotator()
        anno.annotator_id = metadata['annotator_id']
        anno.annotation_time = stream_item.stream_time

        num_ratings = 0
        for entity, is_profile in entities:
            num_ratings += 1

            ## pull out target id and mention tokens
            target_id = str(entity['target_id'])

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(target_id = target_id)
            rating.contains_mention = True

            if is_profile:
                rating.flags = [streamcorpus.FlagType.PROFILE]

            ## parse slots in yaml file
            slots = cls._parse_slots(entity['slots'])

            ## cleanse each slot value and use it as a separate mention
            rating.mentions = [cleanse(unicode(slot[1], 'utf-8')) for slot in slots]

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

        ## provide this stream_item to the pipeline
        logger.info('created StreamItem(num ratings=%d, abs_url=%r)',
                    num_ratings, stream_item.abs_url)
        return stream_item
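
For reference, a stripped-down sketch of the same StreamItem construction using only the streamcorpus calls shown above; the file path and URL are hypothetical and no entity ratings are attached:

## minimal sketch: build a bare StreamItem the way the reader above does
import os
import streamcorpus

path = '/tmp/example.html'  # hypothetical local file
si = streamcorpus.make_stream_item(
    os.path.getctime(path),              # document creation time (epoch seconds)
    'http://example.com/example.html')   # hypothetical abs_url
body = streamcorpus.ContentItem()
with open(path) as f:
    body.raw = f.read()
si.body = body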
Example #4
def look_ahead_match(rating, tokens):
    '''
    iterate through all tokens looking for matches of cleansed tokens,
    skipping tokens left empty by cleansing and coping with Token
    objects that produce multiple space-separated strings when
    cleansed.
    '''
    ## this ensures that all cleansed tokens are non-zero length
    clean_mentions = []
    for m in rating.mentions:
        mtoks = cleanse(m.decode('utf8')).split(' ')
        if mtoks and mtoks != ['']:
            clean_mentions.append(mtoks)
        else:
            logger.warn('got empty cleansed mention: %r\nrating=%r' %
                        (m, rating))

    for i in range(len(tokens)):
        for mtoks in clean_mentions:
            if tokens[i][0][0] == mtoks[0]:
                ## found the start of a possible match, so iterate
                ## through the tuples of cleansed strings for each
                ## Token while stepping through the cleansed strings
                ## for this mention.
                m_j = 1
                i_j = 0
                last_token_matched = 0
                matched = True
                while m_j < len(mtoks):
                    i_j += 1
                    if i_j == len(tokens[i + last_token_matched][0]):
                        i_j = 0
                        last_token_matched += 1
                        if i + last_token_matched == len(tokens):
                            matched = False
                            break
                    if mtoks[m_j] == tokens[i + last_token_matched][0][i_j]:
                        m_j += 1
                    elif tokens[i + last_token_matched][0][i_j] == '':
                        continue
                    else:
                        matched = False
                        break
                if matched:
                    ## yield each matched token only once
                    toks = set()
                    for j in xrange(last_token_matched + 1):
                        toks.add(tokens[i + j][1])
                    for tok in toks:
                        yield tok
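
A small demonstration of the (cleansed strings, Token) tuple shape that look_ahead_match consumes; FakeToken and FakeRating are stand-ins rather than streamcorpus classes, and cleanse is assumed to be the module's helper (which, per the test at the bottom of this page, lowercases plain words):

## hypothetical demonstration with stand-in Token/Rating objects
import collections

FakeToken = collections.namedtuple('FakeToken', 'token')
FakeRating = collections.namedtuple('FakeRating', 'mentions')

raw_toks = [FakeToken('John'), FakeToken('Fitzgerald'),
            FakeToken('Kennedy'), FakeToken('spoke')]
tokens = [(cleanse(t.token.decode('utf8')).split(' '), t) for t in raw_toks]
rating = FakeRating(mentions=['John Fitzgerald Kennedy'])

## expect the three name tokens to be yielded (in no particular order)
matched = set(look_ahead_match(rating, tokens))
assert matched == set(raw_toks[:3])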
Example #6
def make_chains_with_names(sentences):
    '''
    assemble in-doc coref chains by mapping equiv_id to tokens and
    their cleansed name strings

    :param sentences: iterator over token generators
    :returns dict:
        keys are equiv_ids,
        values are tuple(set of cleansed name strings, set of Token objects)
    '''
    ## if an equiv_id is -1, then the token is classified into some
    ## entity_type but has no other tokens in its chain.  We don't
    ## want these all lumped together, so we give each of them a
    ## distinct "fake" equiv_id other than -1 -- counting negatively
    ## to avoid collisions with "real" equiv_ids
    fake_equiv_ids = -2

    ## use a default dictionary
    equiv_ids = collections.defaultdict(lambda: (set(), set()))

    for tagger_id, sents in sentences.items():
        for sent in sents:
            for tok in sent.tokens:
                if tok.entity_type is not None:

                    ## get an appropriate equiv_id
                    if tok.equiv_id == -1:
                        eqid = fake_equiv_ids
                        fake_equiv_ids -= 1
                    else:
                        eqid = tok.equiv_id

                    ## store the name parts initially as a set
                    equiv_ids[eqid][0].add(cleanse(tok.token.decode('utf8')))
                    ## carry a *reference* to the entire Token object
                    equiv_ids[eqid][1].add(tok)

    return equiv_ids
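
A hedged sketch of consuming the returned mapping, assuming si is a StreamItem whose body.sentences dict is what this function expects; joining the name parts with a space is illustrative only:

## hypothetical consumption of the coref chains
chains = make_chains_with_names(si.body.sentences)
for eqid, (name_parts, toks) in chains.items():
    name = u' '.join(sorted(name_parts))    # one way to flatten the set of name parts
    print('chain %d: %r (%d tokens)' % (eqid, name, len(toks)))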
Example #8
def unpack_noun_phrases(row):
    body = cbor.loads(zlib.decompress(row['f:response.body']))
    body = make_clean_visible(body.encode('utf-8')).decode('utf-8')
    body = cleanse(body)
    return features.noun_phrases(body)
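
A sketch of building a row like the one unpack_noun_phrases expects; the column name comes from the function itself, while the payload text is made up and the cbor/zlib round-trip is an assumption about how such rows are written:

## hypothetical row: zlib-compressed, cbor-encoded response body
import zlib
import cbor

row = {'f:response.body': zlib.compress(
    cbor.dumps(u'<p>The quick brown fox jumped over the lazy dog.</p>'))}
phrases = unpack_noun_phrases(row)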
Example #11
def html_to_fc(html=None,
               clean_html=None,
               clean_visible=None,
               encoding=None,
               url=None,
               timestamp=None,
               other_features=None):
    '''`html` is expected to be a raw string received over the wire from a
    remote webserver, and `encoding`, if provided, is used to decode
    it.  Typically, encoding comes from the Content-Type header field.
    The :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.

    '''
    def add_feature(name, xs):
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            except:
                logger.warn('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = u''
            clean_html = u''
    else:
        clean_html_utf8 = u''

    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    fc[u'meta_raw'] = html and uni(html, encoding) or u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''

    fc[u'meta_url'] = uni(url)

    add_feature(u'icq', features.ICQs(clean_visible))
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(cleanse(clean_visible),
                                                  included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])

    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
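
A minimal, hypothetical call to html_to_fc; the HTML snippet, URL, and timestamp are made up, and the feature keys read back are the ones the function itself sets:

## hypothetical call with a tiny raw HTML payload
raw_html = '<html><body><p>email me at alice@example.com</p></body></html>'
fc = html_to_fc(html=raw_html, encoding='utf-8',
                url=u'http://example.com/post/1', timestamp=1400000000000)
if fc is not None:               # html_to_fc returns None when cleaning fails
    print(fc[u'meta_url'])
    print(fc[u'email'])          # StringCounter of extracted email addresses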
def test_cleanse():
    assert cleanse(u'This -LRB-big-RRB- dog has no \u1F601 Teeth'
                   ) == u'this big dog has no \u1F601 teeth'