def extract_entities_textrazor(snippet):
    """Extract researcher names, organizations, and numbers (years) from a
    text snippet using the TextRazor entity extractor.

    :param snippet: free text to analyze (e.g. a short biography)
    :return: dict with keys 'RN' (researcher name), 'U' (university /
             organization) and 'Y' (numbers, e.g. years); each maps to
             {'entity': [list of whitespace-split entity ids],
              'confidenceScore': [floats]}
    """
    # NOTE(review): API key is hard-coded; consider moving it to configuration.
    textrazor.api_key = "7b4d6194cabab0a5c05bd34ad0ba423520a4a3d33a0304b9783971c9"
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    response1 = client.analyze(snippet)
    output = {'RN': {'entity': [], 'confidenceScore': []},
              'U': {'entity': [], 'confidenceScore': []},
              'Y': {'entity': [], 'confidenceScore': []}}
    for entity in response1.entities():
        if len(entity.freebase_types) > 0:
            # BUG FIX: the original compared the first type STRING against the
            # LIST ['/people/person'], which is always False, so no researcher
            # name was ever collected. Compare against the string itself.
            if entity.freebase_types[0] == '/people/person':
                output['RN']['entity'].append((entity.json['entityId']).split(' '))
                output['RN']['confidenceScore'].append(entity.confidence_score)
            elif 'organization' in entity.freebase_types[0]:
                output['U']['entity'].append((entity.json['entityId']).split(' '))
                output['U']['confidenceScore'].append(entity.confidence_score)
        else:
            # Entities without freebase types: keep bare numbers (years etc.).
            if 'type' in entity.json and entity.json['type'] == ['Number']:
                output['Y']['entity'].append((entity.json['entityId']).split(' '))
                output['Y']['confidenceScore'].append(entity.confidence_score)
    return output
def get_keywords(text):
    """Find location/arts/travel-related keywords of *text* with TextRazor.

    Note: the TextRazor free tier limits the number of queries (500).

    :param text: string to analyze
    :return: dict mapping entity id -> list of matching freebase top-level
             categories (only entities with at least one match are kept)
    """
    textrazor.api_key = "1b69ecec7f8c72d386c2c5280780e6eb6ec00510e2a221d98e246c82"
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    response = client.analyze(text)
    entities = list(response.entities())
    entities.sort(key=lambda x: x.relevance_score, reverse=True)
    # Whitelist of freebase top-level categories we care about.
    wanted = ['travel', 'projects', 'location', 'arts', 'food', 'sports',
              'media_common', 'exhibition', 'architecture', 'geography',
              'visual_art', 'protected_sites']
    seen = set()
    result = {}
    for entity in entities:
        if entity.id in seen:
            continue
        seen.add(entity.id)
        list_of_cats = []
        for ftype in entity.freebase_types:
            # BUG FIX: the original discarded the result of an initial
            # split("/") call and, for types without a leading slash, reduced
            # the type to its first CHARACTER (ftype[0]). Take the first path
            # component instead: "/people/person" -> "people",
            # "people/person" -> "people".
            parts = ftype.split("/")
            cat = parts[1] if ftype.startswith("/") else parts[0]
            if cat in wanted:
                list_of_cats.append(cat)
        if list_of_cats:
            result[entity.id] = list_of_cats
    return result
def categarize(filename):
    """Classify an uploaded document with the TextRazor newscodes classifier.

    Reads the uploaded file, extracts de-duplicated entity keywords, topics
    (score > 0.3) and categories, persists them to several MongoDB
    collections, and returns the top category in a JSON response.

    :param filename: name of a file under app.config['UPLOAD_FOLDER']
    :return: flask jsonify(result=...) response; an error message when
             classification fails
    """
    finalcat = []
    finalscore = []
    textrazor.api_key = "f8656917eff9fdb7989aafbb22a8c8e1b74ebd076f1040c75de4dfcc"
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    path = app.config['UPLOAD_FOLDER'] + '//' + filename
    client.set_classifiers(["textrazor_newscodes"])
    # FIX: context manager + binary read with explicit decode; the original
    # Python-2 file(path).read() leaked the file handle.
    with open(path, 'rb') as fh:
        input_file = fh.read().decode("utf-8")
    r = Rake()
    r.extract_keywords_from_text(input_file)
    startLines = input_file[0:100]  # stored as a preview of the document
    response = client.analyze(input_file)
    entities = list(response.entities())
    entities.sort(key=lambda x: x.relevance_score, reverse=True)
    seen = set()
    keywords = list()
    for entity in entities:
        if entity.id not in seen:
            seen.add(entity.id)
            keywords.append(entity.id)
    mydb.keywords.insert({"keywords": keywords, "name": filename})
    print("--------------------------------------------")
    topiclist = list()
    for topic in response.topics():
        if topic.score > 0.3:
            topiclist.append(topic.label)
            mydb.topic.insert({"topic": topic.label})
    print("------------------------------------------------------")
    categorylist = list()
    try:
        for category in response.categories():
            alterLabel = (category.label).split(">")
            finalcat.append(alterLabel[-1])
            finalscore.append(category.score)
            # k/s always refer to the first (best) category seen.
            k = finalcat[0]
            s = finalscore[0]
            print(category.label)
            categorylist.append(alterLabel[-1])
            mydb.category.insert({"category": alterLabel[-1]})
        mydb.doccat.insert({"classified": k, "Document": filename, "Score": s,
                            "startLines": startLines})
        mydb.record.insert(
            {"name": filename,
             "description": [{"keywords": keywords, "topic": topiclist,
                              "category": categorylist}]})
        output = "Category : " + str(k)
        return jsonify(result=output)
    except Exception:
        # FIX: narrowed from a bare except; triggered e.g. when no category
        # was found and `k` is unbound. Also repaired the broken literal
        # ("unable \nto categarize").
        return jsonify(result="unable to categarize")
def get_tags(text):
    """Build hashtag-style tags for *text* from TextRazor IAB categories
    (score > 0.5) and topics (score == 1.0).

    Returns a list of lowercase '#tag' strings, or [] when the API call fails.
    """
    textrazor.api_key = "631d67844c4e5bf22a4dfe37afcd0f08a3c330b54a8ca798a0970846"
    client = textrazor.TextRazor(extractors=["topics"])
    # Other available classifiers: textrazor_mediatopics, textrazor_newscodes,
    # textrazor_iab_content_taxonomy.
    client.set_classifiers(['textrazor_iab'])
    response = client.analyze(text)
    if not response.ok:
        print(response.error)
        print(response.message)
        return []
    tags = []
    for category in response.categories():
        if category.score > 0.5:
            # Take the leaf of the '>'-separated category path.
            normalized = re.sub(r"[>]+", "/", category.label)
            leaf = re.sub(r"[\s&]+", "_", normalized.split('/')[-1])
            if leaf:
                tags.append('#' + leaf.lower())
    for topic in response.topics():
        if topic.score == 1.0:
            label = re.sub(r"[\s&]+", "_", topic.label)
            tags.append('#' + label.lower())
    return tags
def get_wikilinks(facts):
    """Analyze each fact's text with TextRazor and print the raw JSON response.

    :param facts: iterable of objects exposing a .text attribute

    NOTE(review): exit() inside the loop terminates the whole interpreter
    after the first fact, so only one fact is ever analyzed — this looks like
    leftover debug scaffolding; confirm the intent before relying on it.
    """
    textrazor.api_key = "f87456a08da3eff12e62ebdb2bcf2a8be4baaeb4b79be19fce12f770"
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    for fact in facts:
        response = client.analyze(fact.text)
        json_response = response.json
        print(json_response)
        exit()  # kills the process after the first iteration
def textrazorAPI(df):
    """Annotate *df* (a dict-like row with an 'article' field) with raw
    TextRazor analysis results.

    Each output field is taken from the TextRazor JSON response; missing
    sections become np.nan. Field contents (per TextRazor's response schema):
      - topics / coarseTopics: list of dicts (id, label, score, wikiLink,
        wikidataId)
      - entities: list of entity dicts
      - relations: list of dicts (id, params with relation/wordPositions)
      - sentences: list of dicts (position, words with lemma/partOfSpeech/...)
      - entailments: list of dicts (contextScore, entailedTree, entailedWords,
        id, prior score, score, wordPositions)
      - properties: list of dicts (id, propertyPositions, wordPositions)

    :param df: mutable mapping with an 'article' key
    :return: the same mapping, with textrazorAPI* fields added
    """
    text_content = df['article']
    client = textrazor.TextRazor(extractors=["entities", "topics",
                                             "dependency-trees", "relations",
                                             "entailments", "senses"])
    response_json = client.analyze(text_content).json
    # Output column -> key in the TextRazor JSON response. Replaces seven
    # copy-pasted try/except blocks from the original.
    fields = [
        ('textrazorAPItopics', 'topics'),
        ('textrazorAPIcoarseTopics', 'coarseTopics'),
        ('textrazorAPIentities', 'entities'),
        ('textrazorAPIrelations', 'relations'),
        ('textrazorAPIsentences', 'sentences'),
        ('textrazorAPIentailments', 'entailments'),
        ('textrazorAPIproperties', 'properties'),
    ]
    for column, key in fields:
        try:
            df[column] = response_json['response'][key]
        except KeyError:
            df[column] = np.nan
    return df
def analyzeText(text):
    """Return up to eight TextRazor topic labels for *text*, concatenated as
    HTML fragments separated (and terminated) by '</h6> <h6>'."""
    textrazor.api_key = "b695217cdaeb234d8a4edd867e1ab59b23aa1d050fa063c5f4a3a89a"
    client = textrazor.TextRazor(extractors=["topics"])
    response = client.analyze(text)
    labels = [str(t.label) for t in response.topics()[:8]]
    # Every label is followed by the separator — including the last one —
    # matching the original output exactly.
    return "".join(label + "</h6> <h6>" for label in labels)
def __init__(self, url, username):
    """Analyze *url* with TextRazor (HTML cleanup + newscodes classifier) and
    keep the response on the instance.

    :param url: page to analyze
    :param username: stored on the instance for later use
    """
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    client.set_cleanup_mode("cleanHTML")
    client.set_cleanup_return_cleaned(return_cleaned=True)
    client.set_classifiers(["textrazor_newscodes"])
    self.url = url
    self.username = username
    self.response = client.analyze_url(url)
    # NOTE(review): if entities() builds a fresh list on each call, this sort
    # acts on a temporary and has no lasting effect — confirm against the
    # textrazor client's behavior.
    self.response.entities().sort(key=lambda x: x.relevance_score, reverse=True)
def concept_extract(text):
    """Collect the entityId of every TextRazor entity found in *text*.

    Entities without an 'entityId' field are skipped; each kept entity's raw
    JSON is printed as a side effect.
    """
    client = textrazor.TextRazor(YOUR_API_KEY, extractors=["entities"])
    response = client.analyze(text)
    concepts = []
    for ent in response.entities():
        data = ent.json
        if 'entityId' in data:
            concepts.append(data['entityId'])
            print(data)
    return concepts
def get_news_statistics(self, text):
    """Analyze *text* with TextRazor (topics + entities, newscodes
    classifier).

    :return: the raw response, or None when it contains no topics, entities,
             or categories at all
    """
    textrazor.api_key = self.textrazor_apikey
    client = textrazor.TextRazor(extractors=["topics", "entities"])
    client.set_classifiers(["textrazor_newscodes"])
    response = client.analyze(text)
    empty = (not response.topics()
             and not response.entities()
             and not response.categories())
    return None if empty else response
def get_enrichment(book_uri, synopsis, rdf_graph):
    """Run TextRazor over a book synopsis and, on success, merge the results
    into the RDF graph via response2graph.

    :param book_uri: URI of the book resource being enriched
    :param synopsis: text to analyze
    :param rdf_graph: graph that response2graph mutates
    """
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    response = client.analyze(synopsis)
    # Idiom fix: truth-test the flag instead of comparing `== True`.
    if response.ok:
        response2graph(book_uri, response, synopsis, rdf_graph)
    else:
        print('Error: ', response.error)
def get_entities_from_url(self, url):
    """Analyze the page at *url* with TextRazor and return the response.

    :param url: URL to analyze
    :return: TextRazor response object
    :raises: re-raises the original exception after logging it
    """
    try:
        textrazor.api_key = TEXT_RAZOR_API_KEY_1
        client = textrazor.TextRazor(
            extractors=[ENTITES, RELATIONS, TOPICS])
        return client.analyze_url(url)
    except Exception as e:
        # FIX: modernized Python-2 `except Exception, e` syntax, and use a
        # bare `raise` — the original `raise Exception()` discarded both the
        # error details and the traceback.
        print("ERROR For URL: {0} - {1}".format(url, str(e)))
        raise
def get_entities_from_text(self, text):
    """Analyze *text* with TextRazor and return the response.

    :param text: text to analyze
    :return: TextRazor response object
    :raises: re-raises the original exception after logging it
    """
    try:
        textrazor.api_key = TEXT_RAZOR_API_KEY_1
        client = textrazor.TextRazor(
            extractors=[ENTITES, RELATIONS, TOPICS])
        return client.analyze(text)
    except Exception as e:
        # FIX: modernized Python-2 `except Exception, e` syntax, and use a
        # bare `raise` — the original `raise Exception(str(e))` lost the
        # exception type and traceback (callers catching Exception still work).
        print(str(e))
        raise
def print_topic_score(url):
    """Fetch the page at *url*, run TextRazor topic extraction with a French
    language override, and print the ten top-ranked topics with their
    confidence scores."""
    textrazor.api_key = "fab1f5ef253a7daa2ec64726f01738f24bf84c59dde7c66f1ec1cd04"
    analyzer = textrazor.TextRazor(extractors=["topics"])
    analyzer.set_language_override('fre')
    result = analyzer.analyze_url(url)
    top_ten = result.topics()[:10]
    for entry in top_ten:
        print(entry.label, entry.score)
def detect_target_garment(content, target):
    """Return True when *target* appears among the garment / product-category
    entities TextRazor finds in *content*, else False.

    :param content: text to analyze
    :param target: entity id to look for
    :return: bool
    """
    client = textrazor.TextRazor(api_key="13b22cd6d8562948feeddee54a992ef4edfb1b9d8c3df54a70f40810",
                                 extractors=["entities"])
    # Only fashion garments and product categories are considered.
    client.set_entity_freebase_type_filters(["/fashion/garment",
                                             "/business/product_category"])
    response = client.analyze(content)
    garments = [ent.id for ent in response.entities()]
    # FIX: the original fell off the end and returned None when the target
    # was absent; return an explicit bool (backward compatible — None and
    # False are both falsy).
    return target in garments
def getTopic(string):
    """Return {topic_label: score} for the coarse topics TextRazor finds in
    *string*. An empty input returns {} without calling the API.

    :param string: text to analyze
    :return: dict mapping coarse topic label -> score
    """
    # BUG FIX: the original began with `return dict()`, which made the entire
    # body unreachable (probably a leftover stub used to disable API calls).
    topic_distri = dict()
    if string == "":
        return topic_distri
    textrazor.api_key = "528d21faef2b391e46cc77bfa8b1a9d28dd00a7f77ee562f2811520a"
    client = textrazor.TextRazor(extractors=["topics"])
    response = client.analyze(string)
    for topic in response.coarse_topics():
        topic_distri[topic.label] = topic.score
    return topic_distri
def __init__(self, aiml_instance):
    """Configure the TextRazor dependency-tree client and reset parse state.

    :param aiml_instance: AIML engine kept for later use
    """
    textrazor.api_key = "d64cc7e640600e8e2305304d8e79e6b945b575825a72ce4b853da187"
    self.client = textrazor.TextRazor(extractors=["dependency-trees"])
    self.aiml_instance = aiml_instance
    # Dependency relations treated as clause boundaries when splitting.
    self.marker = ["xcomp", "advcl", "conj"]
    # Per-parse working state.
    self.max_position = 0
    self.mark_array = []
    self.token_array = []
    self.phrase = []
def run(query):
    """Answer *query* with WolframAlpha, first normalising the query to its
    main entity via TextRazor, then printing selected answer pods.

    NOTE(review): Python 2 (print statements). Assumes the query contains at
    least one recognisable entity — response.entities()[0] raises IndexError
    otherwise; confirm callers guarantee this.
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(dir_path, 'configurations/config.txt')) as data_file:
        json_obj = json.load(data_file)
    # Initialize the API keys
    wclient = wolframalpha.Client(json_obj['wolframalpha'])
    textrazor.api_key = json_obj['textrazor']
    # Extract the true meaning of the sentence
    tclient = textrazor.TextRazor(extractors=["entities"])
    response = tclient.analyze(query)
    # Replace the raw query with the id of its first detected entity.
    query = response.entities()[0].id
    # Perform query
    response = wclient.query(query)
    # Render only the pod types we know how to format.
    for pod in response.pods:
        if pod.title=='Wikipedia summary' and pod.text != None:
            print 'Wikipedia summary : ' + pod.text
        if pod.title=='Response':
            print pod.text
        if pod.title=='Basic information':
            # First two lines only.
            print pod.text.split('\n')[0] + '\n' + pod.text.split('\n')[1]
        if pod.title=='Result' or pod.title=='Current result' or pod.title=='Approximate result' or pod.title=='Results' or pod.title=='Average result':
            print pod.text
        if pod.title=='Notable facts':
            # First three facts as a bullet list.
            print '* ' + pod.text.split('\n')[0] + '\n* ' + pod.text.split('\n')[1] + '\n* ' + pod.text.split('\n')[2]
        if pod.title=='Bordering countries/regions':
            print 'Bordering countries/regions -> ' + pod.text
        if pod.title=='Location':
            print pod.text
        if pod.title=='Capital city':
            print 'Capital city -> ' + pod.text
        if pod.title=='Currency':
            print 'Currency -> ' + pod.text.split('\n')[1]
        if pod.title=='Value':
            print pod.text.split('\n')[0]
        if pod.title=='Morse code translation':
            print pod.text
def getWikipediaLinks(textToAnnotate):
    """Return the unique, non-empty Wikipedia links of all entities TextRazor
    finds in *textToAnnotate*, preserving first-seen order."""
    client = textrazor.TextRazor(extractors=["entities"])
    analysis = client.analyze(textToAnnotate)
    links = [ent.wikipedia_link for ent in analysis.entities()]
    deduped = list(dict.fromkeys(links))       # drop duplicates, keep order
    return [link for link in deduped if link]  # drop None / empty strings
def classification(text):
    """Classify news text with the TextRazor newscodes classifier and return
    the label of the top-ranked category.

    Raises IndexError when TextRazor returns no categories.
    """
    textrazor.api_key = "2afab77eb63718df82c96d0669e0017cb0c6bcabb2c0ae4044fa58a7"
    razor = textrazor.TextRazor(extractors=["entities", "topics"])
    razor.set_classifiers(["textrazor_newscodes"])
    analyzed = razor.analyze(text)
    top_category = analyzed.categories()[0]
    return top_category.label
def textrazor(item, tool_name):
    """Annotate a German dpa text with TextRazor entities.

    :param item: dict with "text" and "dpaId" keys
    :param tool_name: annotator name stored with every annotation
    :return: [True, annotations] on success ([True, []] when no entities),
             [False, reason] when the response failed, the language is not
             German, or an HTTP error occurred
    """
    text = item["text"]
    dpaId = item["dpaId"]
    textrazor_function.api_key = api_key
    client = textrazor_function.TextRazor(extractors=["entities", "words"])
    try:
        response = client.analyze(text)
        if response.ok != True:
            output = [False, response.message]
        elif response.language != "ger":
            # Only German documents are annotated.
            output = [False, response]
        elif len(response.entities()) == 0:
            output = [True, []]
        else:
            # One timestamp for the whole batch — hoisted out of the loop;
            # the original computed an unused `timestamp` local and then
            # recomputed the same value per entity, plus an unused `position`.
            timestamp = '{:%Y-%m-%d %H:%M:%S}'.format(
                datetime.datetime.utcfromtimestamp(time.time()))
            annotation = []
            for entity in response.entities():
                if entity.wikidata_id is None:
                    uri = "QO"       # sentinel for "no wikidata entry"
                    category = "OTH"
                else:
                    uri = entity.wikidata_id
                    category = query_category(uri)
                annotation.append({
                    "start": entity.starting_position,
                    "end": entity.ending_position,
                    "label": entity.id,
                    "surface": entity.matched_text,
                    "uri": uri,
                    "category_tool": "",
                    "category": category,
                    "dpaid": dpaId,
                    "timestamp": timestamp,
                    "tool": tool_name,
                })
            output = [True, annotation]
    except TextRazorAnalysisException:
        output = [False, "http error"]
    return output
def concept_extract_save(filepath, text):
    """Extract entity ids from *text*, print each kept entity's JSON, and
    write one line per entity to *filepath* (UTF-8).

    :param filepath: output file path (opened with 'w+')
    :param text: text to analyze
    :return: list of collected entityId values (the original built this list
             but never returned it; returning it matches concept_extract)
    """
    client = textrazor.TextRazor(YOUR_API_KEY, extractors=["entities"])
    response = client.analyze(text)
    concept_set = []
    # FIX: context manager guarantees the file is closed even when a write
    # raises mid-loop (the original used bare open()/close()).
    with open(filepath, 'w+', -1, 'UTF-8') as output_file:
        for entity in response.entities():
            if 'entityId' in entity.json:
                concept_set.append(entity.json['entityId'])
                print(entity.json)
                output_file.write(str(entity.json) + '\n')
    return concept_set
def get_similarity_with_topic(transcript, speechTopic, categories):
    """Extract TextRazor topic labels from *transcript* and return them.

    :param transcript: text to analyze
    :param speechTopic: unused in the current implementation
    :param categories: unused in the current implementation
    :return: list of topic labels

    NOTE(review): the trailing triple-quoted block below is commented-out
    code ("max_similarity = ...") that appears truncated in this chunk —
    its closing quotes are not visible here; verify against the full file.
    """
    client = textrazor.TextRazor(extractors=["topics"])
    textrazor_resonse = client.analyze(transcript)
    topic_list = textrazor_resonse.topics()
    print('topic list is',topic_list)
    keyword_list = []
    for topic in topic_list:
        keyword_list.append(topic.label)
    # Categories are fetched and printed but not used in the result.
    category_list = textrazor_resonse.categories()
    print(category_list)
    return keyword_list
    '''max_similarity = -1000
def get_entities(text):
    """Return TextRazor entities whose combined relevance + confidence score
    exceeds 0.5.

    :param text: text to analyze
    :return: list of dicts with id, relevance_score, confidence_score,
             freebase_types
    """
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    response = client.analyze(text)
    ret = []
    for entity in response.entities():
        # BUG FIX: the original truncated both scores with int() before
        # summing, so fractional scores collapsed to 0 and the 0.5 threshold
        # behaved erratically; compare the raw float sum instead.
        if entity.relevance_score + entity.confidence_score > 0.5:
            ret.append({
                'id': entity.id,
                'relevance_score': entity.relevance_score,
                'confidence_score': entity.confidence_score,
                'freebase_types': entity.freebase_types,
            })
    return ret
def text_razor(repo_description):
    """Extract entity keywords from a repository description with TextRazor
    and return them as a single comma-separated string."""
    textrazor.api_key = '5f6331ac5ecb61dfe6e57d9706eeb4f9e7bceaa82a4a37b128cb0201'
    textrazor.language_override = 'en'
    analyzer = textrazor.TextRazor(extractors=['entities'])
    analysis = analyzer.analyze(repo_description)
    matched = [entity.matched_text for entity in analysis.entities()]
    return ', '.join(matched)
def getKeywordsArray(site_url, min_relevance_score, min_topic_score):
    """Append entity ids and topic labels that pass the given score
    thresholds to the `keywords` list and return it.

    :param site_url: page to analyze with TextRazor
    :param min_relevance_score: minimum entity relevance to include
    :param min_topic_score: minimum topic score to include
    :return: the (shared) keywords list

    NOTE(review): `keywords` is not defined in this function, so it must be
    a module-level list — results therefore accumulate across calls; confirm
    that is intended rather than a missing local initialisation.
    """
    textrazor.api_key = "1f0ebd1fc796a631ec72919329071930fede6007817a81744071c643"
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    response = client.analyze_url(site_url)
    for entity in response.entities():
        # De-duplicate while filtering on relevance.
        if entity.relevance_score > min_relevance_score and entity.id not in keywords:
            keywords.append(entity.id)
    for topic in response.topics():
        if topic.score > min_topic_score and topic.label not in keywords:
            keywords.append(topic.label)
    return keywords
def extract(self, text, extractors="entities,topics", lang="fr", min_confidence=0.0):
    """Run TextRazor on *text* and store each entity's raw JSON on
    self.annotations.

    :param text: text to analyze
    :param extractors: comma-separated TextRazor extractor names
    :param lang: "fr" or "en"; translated to TextRazor's codes (fre/eng)
    :param min_confidence: currently unused — kept for interface
        compatibility (NOTE(review): consider filtering entities by
        confidence_score with it)
    """
    self.lang = lang
    self.text = text
    lang = lang.replace("fr", "fre").replace("en", "eng")
    textrazor.api_key = self.api_key
    # FIX: honour the `extractors` argument; the original hard-coded
    # ["entities"] and silently ignored the parameter.
    client = textrazor.TextRazor(extractors=extractors.split(","))
    client.set_language_override(lang)
    response = client.analyze(text)
    self.annotations = [entity.json for entity in response.entities()]
def process_or_store(alltweets):
    """Classify a user's concatenated tweets with TextRazor (IAB classifier)
    and accumulate high-confidence entity ids into the module-level
    `entities_dictionary` (keyed by dbpedia type) and `score` dicts.

    NOTE(review): Python 2 (print statements). `entities_dictionary` and
    `score` are module-level mutables, so results accumulate across calls;
    `entities_dictionary[e]` also assumes every dbpedia type key already
    exists (or that it is a defaultdict) — confirm.
    """
    textrazor.api_key = "813c3fc408c749a28006cca97f5865dca86c569f77ed08728b151bc0"
    client = textrazor.TextRazor(extractors=["entities", "topics"])
    # Strip @mentions and URLs before analysis.
    alltweets = re.sub(r"(?:\@|https?\://)\S+", "", alltweets)
    client.set_classifiers(["textrazor_iab"])
    response = client.analyze(alltweets)
    print "##############################################################################################"
    print "\n\n\nEntities"
    for entity in response.entities():
        if entity.confidence_score > 0.7:
            if entity.dbpedia_types:
                for e in entity.dbpedia_types:
                    if entity.id not in entities_dictionary[e]:
                        entities_dictionary[e].append(entity.id)
                        # Count how often a new (type, entity) pair is seen.
                        if entity.id in score:
                            score[entity.id] += 1
                        else:
                            score[entity.id] = 1
    topics = ""
    for topic in response.topics():
        topics += topic.label + "\n"
    print "\n\nAfter Topics"
    # Second pass: analyze the topic labels themselves and merge the
    # resulting entities into the same dictionaries (score fixed to 1).
    response = client.analyze(topics)
    for entity in response.entities():
        if entity.confidence_score > 0.7:
            if entity.dbpedia_types:
                for e in entity.dbpedia_types:
                    if entity.id not in entities_dictionary[e]:
                        entities_dictionary[e].append(entity.id)
                        score[entity.id] = 1
    print entities_dictionary
    print score
def get(
        url='https://www.politico.com/news/2020/02/21/bernie-sanders-condemns-russian-116640'
):
    """Find news articles related to the article at *url*.

    Downloads the article, builds a search query from the noun phrases of its
    title (via TextRazor), and queries NewsAPI within +/- 2 days of the
    article's publish date (falling back to now when the date is missing).

    :param url: article URL to start from
    :return: parsed NewsAPI JSON dict, or {} when the article can't be fetched
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
    except Exception:
        # FIX: narrowed from a bare `except:`, which also swallowed
        # SystemExit and KeyboardInterrupt.
        return {}
    text = article.title
    date = article.publish_date
    days_to_subtract = 2
    try:
        d = (date - timedelta(days=days_to_subtract)).strftime('%Y-%m-%d')
        d2 = (date + timedelta(days=days_to_subtract)).strftime('%Y-%m-%d')
    except TypeError:
        # publish_date may be None; fall back to the current time.
        date = datetime.now()
        d = (date - timedelta(days=days_to_subtract)).strftime('%Y-%m-%d')
        d2 = (date + timedelta(days=days_to_subtract)).strftime('%Y-%m-%d')
    # NOTE(review): API keys are hard-coded (alt_api_key is unused); move
    # them to configuration / environment variables.
    alt_api_key = 'feca0c9db3d492ac63a83761a41d003f306c5acfff3b828b8c1319da'
    textrazor.api_key = '3db6ae4b1e8b2e04ee07657ca98d0de9eda7b885b3043dc11ab9b230'
    client = textrazor.TextRazor(extractors=["words", "phrases"])
    response = client.analyze(text)
    query = ''
    for np in response.noun_phrases():
        # Recover each noun phrase's exact surface text via word offsets.
        query += '{} '.format(
            text[np.words[0].input_start_offset:np.words[-1].input_end_offset])
    print(query)
    news_parameters = {
        'q': query,
        'from': d,
        'to': d2,
        'sortBy': 'popularity',
        'apiKey': 'a02790e5a3af4b5f8683318c276e702d'
    }
    response = requests.get('http://newsapi.org/v2/everything',
                            params=news_parameters)
    json_data = json.loads(response.text)
    return json_data
def update_api_key(api_key):
    """Store *api_key* on the manager and rebuild the shared TextRazor client
    with the full extractor set."""
    TextRazorManager.api_key = api_key
    textrazor.api_key = api_key
    extractor_names = [
        'customAnnotations',
        'coarseTopics',
        'entailments',
        'properties',
        'nounPhrases',
        'sentences',
        'categories',
        'entities',
        'topics',
        'relations',
    ]
    TextRazorManager.client = textrazor.TextRazor(extractors=extractor_names)