def __call__(self, tweet):
    """Tag `tweet['text']` with CRF-predicted entity sequences.

    Tokenizes the tweet text, featurizes the tokens, runs the CRF, and
    appends one entry per contiguous non-null label run to
    `tweet['sequences']` (created if absent), each entry a dict with keys
    'text' (the label), 'start', and 'end' (character boundaries from
    zip_boundaries).

    Args:
        tweet: dict with at least a 'text' key; mutated in place.

    Returns:
        The same `tweet` dict, mutated.
    """
    tokens = token_re.findall(tweet['text'])
    tokens_features = featurize(tokens, self.feature_functions)
    null_label = 'None'
    labels = self.crf.predict([tokens_features])[0]
    tweet.setdefault('sequences', [])
    # Group consecutive identical labels into spans.
    for sequence_label, entries in itertools.groupby(
            zip_boundaries(labels), lambda tup: tup[0]):
        if sequence_label != null_label:
            # Unpack into fresh names: the original rebound `labels` here,
            # shadowing the prediction list that the zip_boundaries
            # generator being iterated was built from.
            _, starts, ends = zip(*entries)
            tweet['sequences'].append({
                'text': sequence_label,
                'start': starts[0],
                'end': ends[-1],
            })
    return tweet
def spotlight(document, confidence=0.1, support=10):
    """Annotate `document` via DBpedia Spotlight; yield labels per token.

    Joins the token sequence with single spaces, POSTs it to the Spotlight
    annotate endpoint, then for each token (with character boundaries from
    zip_boundaries) yields a list of labels: the entity URI followed by its
    types, for every returned resource whose character span overlaps the
    token's span. Tokens overlapping no entity yield an empty list.

    Args:
        document: sequence of token strings.
        confidence: Spotlight disambiguation confidence threshold.
        support: Spotlight minimum support (inlink count) threshold.

    Yields:
        list[str] of labels for each token, in token order.
    """
    document_string = u' '.join(document)
    r = requests.post(spotlight_annotate_url,
                      headers=dict(Accept='application/json'),
                      data=dict(text=document_string,
                                confidence=confidence,
                                support=support))
    resources = r.json().get('Resources', [])
    for token, token_start, token_end in zip_boundaries(document):
        labels = []
        for resource in resources:
            entity_start = int(resource['@offset'])
            entity_end = entity_start + len(resource['@surfaceForm'])
            # Interval-overlap test. The original only checked whether a
            # token *endpoint* fell inside the entity span, which missed
            # tokens that fully contain an entity. This form keeps the
            # original's inclusive bounds (so every previously-matched
            # case still matches) while also catching containment.
            # NOTE(review): inclusive-vs-exclusive end offsets depend on
            # zip_boundaries' convention -- confirm against its definition.
            if token_start <= entity_end and entity_start <= token_end:
                entity_uri = resource['@URI']
                entity_types = resource['@types'].split(',')
                labels += [entity_uri] + entity_types
        yield labels