def suggest_locations():
    """Suggest locations in a text string.

    These might be useful keywords for annotators to geolocate.

    Input (JSON request body):
        text: full message's text [string]

    Output (JSON response):
        dict with outer key "locations". Inner keys are entity types, each
        mapped to a list of entities, e.g.::

            "locations": {
                "GSP": ["Congo"]
            }

    Calls ``abort(400)`` when the request carries no JSON body or the body
    has no "text" key.

    # [TODO]
    # output: list. each item is a python dictionary:
    # - text : the text for the specific entity [string]
    # - indices : tuple of (start [int], end [int]) offset where entity is
    #   located in given full message
    # - confidence : probability from 0-to-1 [float]
    """
    payload = request.json
    # Either failure alone must reject the request, hence `or`: with `and`,
    # a missing body hit `'text' in None` (TypeError) and a body without
    # "text" slipped through to a KeyError below.
    if not payload or 'text' not in payload:
        abort(400)

    # Get all entities and only fetch GPE
    found = Machine.guess_locations(payload['text'])

    # Convert every value to a plain list before handing it to jsonify
    # (presumably the values are sets or other non-list iterables -- confirm
    # against Machine.guess_locations). `.items()` works on Python 2 and 3.
    locations = {kind: list(names) for kind, names in found.items()}
    return jsonify({'locations': locations})
def detect_language():
    """Guess the natural language of the given text.

    Input parameters (JSON request body):
        text: string

    Calls ``abort(400)`` when the request has no JSON body or the body has
    no "text" key.

    The response is a JSON object with two keys, both read from the first
    entry of ``Machine.guess_language``'s result (presumably a list ranked
    best-first -- confirm against Machine):
        language:   element 0 of that entry
        confidence: element 1 of that entry
    """
    payload = request.json
    if not (payload and 'text' in payload):
        abort(400)

    candidates = Machine.guess_language(payload['text'])
    best = candidates[0]
    response = {'language': best[0], "confidence": best[1]}
    return jsonify(response)
def extract_entities():
    """Identify entities within a text string.

    Given some text input, identify - besides location - people,
    organisations and other types of entities within the text.

    Input (JSON request body):
        text: string

    Output (JSON response):
        dict with outer key "entities". Inner keys are the lower-cased keys
        returned by ``Machine.guess_entities``, each mapped to a list of
        that key's values.

    Calls ``abort(400)`` when the request carries no JSON body or the body
    has no "text" key.
    """
    payload = request.json
    # Either failure alone must reject the request, hence `or`: with `and`,
    # a missing body hit `'text' in None` (TypeError) and a body without
    # "text" slipped through to a KeyError below.
    if not payload or 'text' not in payload:
        abort(400)

    result = Machine.guess_entities(payload['text'])

    # Lower-case the keys and turn each value into a plain list before
    # handing it to jsonify. `.items()` works on Python 2 and 3.
    entities = {key.lower(): list(value) for key, value in result.items()}
    return jsonify({'entities': entities})
def suggest_sensitive_info():
    """Suggest personally identifying information (PII) in a text string.

    PII -- such as credit card numbers, phone numbers, email, etc -- is
    useful for annotators to investigate and strip before publicly posting
    information.

    Input (JSON request body):
        text: string

    Output (JSON response):
        dict with the single key "private_info" holding whatever
        ``Machine.guess_private_info`` returns for the text.

    Calls ``abort(400)`` when the request carries no JSON body or the body
    has no "text" key.

    Planned interface from the original notes (not implemented in this
    function):
        input: options
            - custom regex for local phone numbers
            - flags or booleans to specify the type of pii
              (e.g. phone_only)
        output: list of dictionaries:
            - word
            - type (e-mail, phone, ID, person name, etc.)
            - indices (start/end offset in text)
            - confidence [todo: is possible?]
    """
    payload = request.json
    # Either failure alone must reject the request, hence `or`: with `and`,
    # a missing body hit `'text' in None` (TypeError) and a body without
    # "text" slipped through to a KeyError below.
    if not payload or 'text' not in payload:
        abort(400)

    private_info = Machine.guess_private_info(payload['text'])
    return jsonify({'private_info': private_info})