def getNER():
    """Return named-entity keywords for the text in the request's JSON body.

    The body must be a JSON object holding ``text`` (the document to
    analyse) and ``max_n`` (the maximum number of entities to take from the
    extractor). The first ``max_n`` results of ``NER.findNamedEntities`` are
    lowercased, de-duplicated, sorted, and finally passed through
    ``NER.excludeKeywords`` together with the exclusion list from ``DB``.

    Because truncation happens before de-duplication and exclusion, the
    response may contain fewer than ``max_n`` keywords.

    Returns:
        A ``(response, 201)`` pair where the response is
        ``jsonify({'keywords': [...]})``.

    Aborts with HTTP 400 when the body is empty or not a JSON object, when
    ``text`` or ``max_n`` is missing, or when ``max_n`` is not a
    non-negative integer.
    """
    # Parse the body once instead of re-reading it for every field.
    payload = request.get_json(force=True)
    if not isinstance(payload, dict) or 'text' not in payload:
        print('aborting')
        abort(400)

    # A missing or malformed max_n is a client error, not a server error.
    try:
        n = int(payload['max_n'])
    except (KeyError, TypeError, ValueError):
        n = None
    if n is None or n < 0:
        print('aborting')
        abort(400)

    # remove all nonAsciiCharacters
    istr = removeNonAscii(payload['text'])

    inclusion_list = DB.getInclusionList()
    exclusion_list = DB.getExclusionList()

    NER_results = NER.findNamedEntities(istr, inclusion_list)

    # omit repeats and return lowercase
    keywords = sorted(set(entity.lower() for entity in NER_results[0:n]))
    keywords = NER.excludeKeywords(exclusion_list, keywords)

    return jsonify({'keywords': keywords}), 201
def getTFIDF():
    """Return TF-IDF keywords for the text in the request's JSON body.

    The body must be a JSON object holding ``text`` (the document to
    analyse) and ``max_n`` (the maximum number of keywords to take from the
    extractor). The first ``max_n`` results of ``TFIDF.findTFIDFkeywords``
    are passed through ``NER.excludeKeywords`` together with the exclusion
    list from ``DB``; no lowercasing or de-duplication is applied here.

    Returns:
        A ``(response, 201)`` pair where the response is
        ``jsonify({'keywords': [...]})``.

    Aborts with HTTP 400 when the body is empty or not a JSON object, when
    ``text`` or ``max_n`` is missing, or when ``max_n`` is not a
    non-negative integer.
    """
    # Parse the body once instead of re-reading it for every field.
    payload = request.get_json(force=True)
    if not isinstance(payload, dict) or 'text' not in payload:
        print('aborting')
        abort(400)

    # A missing or malformed max_n is a client error, not a server error.
    try:
        n = int(payload['max_n'])
    except (KeyError, TypeError, ValueError):
        n = None
    if n is None or n < 0:
        print('aborting')
        abort(400)

    # remove all nonAsciiCharacters
    istr = removeNonAscii(payload['text'])

    # Only the exclusion list is needed: TF-IDF extraction takes no
    # inclusion list.
    exclusion_list = DB.getExclusionList()

    TFIDF_results = TFIDF.findTFIDFkeywords(istr)
    keywords = NER.excludeKeywords(exclusion_list, TFIDF_results[0:n])

    return jsonify({'keywords': keywords}), 201
def getKeywords():
    """Return a blend of NER and TF-IDF keywords for the request's text.

    The body must be a JSON object holding ``text`` (the document to
    analyse) and ``max_n`` (the target number of keywords). Half of the
    budget is taken from ``NER.findNamedEntities`` and half from
    ``TFIDF.findTFIDFkeywords``; if lowercasing and de-duplication leave
    fewer than ``max_n`` keywords, the remaining NER results and then the
    remaining TF-IDF results are used to top the list up. The merged list
    is finally passed through ``NER.excludeKeywords`` with the exclusion
    list from ``DB``, so the response may still hold fewer than ``max_n``
    entries.

    Returns:
        A ``(response, 201)`` pair where the response is
        ``jsonify({'keywords': [...]})``.

    Aborts with HTTP 400 when the body is empty or not a JSON object, when
    ``text`` or ``max_n`` is missing, or when ``max_n`` is not a
    non-negative integer.
    """
    # Parse the body once instead of re-reading it for every field.
    payload = request.get_json(force=True)
    if not isinstance(payload, dict) or 'text' not in payload:
        print('aborting')
        abort(400)

    # A missing or malformed max_n is a client error, not a server error.
    try:
        n = int(payload['max_n'])
    except (KeyError, TypeError, ValueError):
        n = None
    if n is None or n < 0:
        print('aborting')
        abort(400)

    # remove all nonAsciiCharacters
    istr = removeNonAscii(payload['text'])

    inclusion_list = DB.getInclusionList()
    exclusion_list = DB.getExclusionList()

    NER_results = NER.findNamedEntities(istr, inclusion_list)
    TFIDF_results = TFIDF.findTFIDFkeywords(istr)

    keywords = _merge_keywords(NER_results, TFIDF_results, n)
    keywords = NER.excludeKeywords(exclusion_list, keywords)

    return jsonify({'keywords': keywords}), 201


def _merge_keywords(ner_results, tfidf_results, n):
    """Combine NER and TF-IDF results into at most ``n`` lowercase keywords.

    Args:
        ner_results: sequence of entity strings, in extractor order.
        tfidf_results: sequence of keyword strings, in extractor order.
        n: non-negative target size of the merged list.

    Returns:
        A list of unique lowercase keywords: the sorted seed (first
        ``n // 2`` of each source) followed by any top-up entries in the
        order they were drawn (remaining NER results first, then TF-IDF).
    """
    # Drop repeated entities while keeping the extractor's order, so the
    # selection is deterministic.
    unique_ner = []
    seen_entities = set()
    for entity in ner_results:
        if entity not in seen_entities:
            seen_entities.add(entity)
            unique_ner.append(entity)

    # Floor division keeps the slice bounds integral on Python 2 and 3.
    half = n // 2

    # omit repeats and return lowercase
    seed = list(unique_ner[:half]) + list(tfidf_results[:half])
    keywords = sorted(set(word.lower() for word in seed))

    # Top up from the unused tail of each source. Iterating the slices
    # (rather than hand-maintained indices) cannot run past either list,
    # and simply stops when both are exhausted.
    seen = set(keywords)
    leftovers = list(unique_ner[half:]) + list(tfidf_results[half:])
    for candidate in leftovers:
        if len(keywords) >= n:
            break
        lowered = candidate.lower()
        if lowered not in seen:
            seen.add(lowered)
            keywords.append(lowered)

    return keywords
import NER.extractNER as NER
import TFIDF.extractTFIDF as TFIDF
import DB
import sample


def removeNonAscii(s):
    """Return ``s`` with every character whose code point is >= 128 removed."""
    return "".join(ch for ch in s if ord(ch) < 128)


# Script: run the NER + TF-IDF keyword pipeline over the bundled sample
# text and print the resulting keyword list.
inclusion_list = DB.getInclusionList()
exclusion_list = DB.getExclusionList()

# Total keyword budget; half is drawn from each extractor.
n = 10

istr = sample.sample_text
istr = removeNonAscii(istr)

NER_results = NER.findNamedEntities(istr, inclusion_list)
TFIDF_results = TFIDF.findTFIDFkeywords(istr)

# Floor division keeps the slice bounds integral on both Python 2 and 3.
keywords = NER_results[0:n // 2] + TFIDF_results[0:n // 2]

# omit repeats and return lowercase
keywords = sorted(set(i.lower() for i in keywords))

keywords = NER.excludeKeywords(exclusion_list, keywords)

# Parenthesised form prints the same text under Python 2 and Python 3.
print(keywords)