def saveAnnotation(id, text, db):
    print id
    if db.simDoc.find({'_id': id}).count() == 0:
        # print entity.id, entity.relevance_score, entity.confidence_score, entity.freebase_types, entity.wikidata_id
        wikidataEntities, dbpediaEntities = get_annotation_text_razor(text)
        datatxt = DataTXT(app_id='0b2b87bc', app_key='7f0ae25400535758e9ceae358b3db763')
        result = datatxt.nex(text.decode('latin-1'), include_lod=True, language='en')['annotations']
        # pprint(result)
        entityDbpediaSet = set()
        entityDbpedia = []
        print result
        for entity in result:
            print entity
            if 'lod' in entity and 'dbpedia' in entity['lod'] and entity['lod']['dbpedia'] not in entityDbpediaSet:
                entityDbpedia.append({'dbpedia_id': entity['lod']['dbpedia'],
                                      'confidence': entity['confidence']})
                entityDbpediaSet.add(entity['lod']['dbpedia'])
        # entitySetWikidata = set(map(lambda x: x['lod']['wikidata'], result))
        # pprint(entitySetDbpedia)
        print "dbpedia %s wikidata %s" % (len(entityDbpedia), len(wikidataEntities))
        db.simDoc.insert({'_id': id,
                          'text': text.decode('utf-8', 'ignore'),
                          'entities_dbpedia': entityDbpedia,
                          'entities_wikidata': wikidataEntities,
                          'entities_dbpedia_razor': dbpediaEntities})
def tok1(msg):
    lis = []
    li = []
    datatxt = DataTXT(app_id='5d504312af124377bac2f69c908dc20b',
                      app_key='5d504312af124377bac2f69c908dc20b')
    repnews = [
        'news.google.co.in', 'nytimes.com', 'timesofindia.indiatimes.com',
        'wsj.com', 'washingtonpost.com', 'bbc.com', 'moneycontrol.com',
        'economist.com', 'newyorker.com', 'economictimes.indiatimes.com',
        'ndtv.com', 'indiatoday.in', 'indianexpress.com', 'thehindu.com',
        'news18.com', 'firstpost.com', 'dnaindia.com', 'apnews.com',
        'brief.news', 'npr.org', 'scroll.in', 'reuters.com'
    ]
    tokenizer = RegexpTokenizer(r'\w+')
    a = tokenizer.tokenize(msg)
    stop = stopwords.words('english') + list(string.punctuation)
    a = [i for i in a if i not in stop]
    er = EventRegistry(apiKey="e010e4f7-343c-49d5-893d-63d4c2cfd487")
    q = QueryArticlesIter(keywords=QueryItems.OR(a), lang=["eng"], keywordsLoc="title")
    b = q.execQuery(er, sortBy="rel", maxItems=1)
    for article in b:
        if article['source']['uri'] in repnews:
            if article['title'] not in li:
                lis.append(article['title'])
    for i in range(len(lis)):
        a = datatxt.sim(msg, lis[i])
        if a['similarity'] >= 0.60:
            print(a['similarity'])
            li.append(lis[i])
    return li
def get_seed_type(self, seed_name):
    app_id = configuration.APP_ID
    app_key = configuration.API_KEY_DANDELION
    datatxt = DataTXT(app_id=app_id, app_key=app_key)
    response = datatxt.nex(seed_name, **{
        "min_confidence": 0.6,
        "include": ["types"]
    })
    return response.annotations
def nerelEn(text):
    # text = "voglio andare in bici. Che percorso mi consigliate?"
    translator = Translator()
    tr = translator.translate(text)
    text = tr.text
    datatxt = DataTXT(app_id='5cb879ebda544e2e95ce5cefd4963aca',
                      app_key='5cb879ebda544e2e95ce5cefd4963aca')
    response = datatxt.nex(text,
                           min_confidence=0.20,
                           include_types=True,
                           include_abstract=True,
                           include_lod=True,
                           include_categories=True)
    time = response['annotations']
    # print(time)
    # entity = []
    # print(time)
    index = 0
    categories = []
    entity = []
    types = []
    lods = []
    for index, row in enumerate(time):
        ca = []
        ty = []
        lo = []
        name = time[index]['spot']
        entity.append(name)
        try:
            categoria = time[index]['categories']
            ca.append(categoria)
            for r in ca:
                for o in r:
                    categories.append(o)
        except:
            print('categories not present')
            # categories.append("")
        try:
            typ = time[index]['types']
            ty.append(typ)
            for r in ty:
                for o in r:
                    types.append(o)
        except:
            print('types not present')
            # types.append("")
        try:
            lod = time[index]['lod']['dbpedia']
            lo.append(lod)
            for r in lo:
                lods.append(r)
        except:
            print('lod not present')
        # print(lo)
    return (text, entity, categories, types, lods)
def dandelion(item, tool_name):
    text = item["text"].encode('utf-8')
    dpaId = item["dpaId"]
    datatxt = DataTXT(app_id=token, app_key=token)
    response = datatxt.nex(
        text,
        include_categories=True,
        include_types=True,
        include_image=True,
        include_lod=True,
        include_alternate_labels=True,
        include_abstract=True)
    try:
        if response["lang"] != "de":
            output = [False, response]
        elif response["lang"] == "de":
            try:
                annotation = []
                t = time.time()
                for entity in response.annotations:
                    wiki = str(entity["id"])
                    uri = wiki_query(wiki)
                    category = query_category(uri)
                    surface = entity["spot"]
                    start = entity["start"]
                    end = entity["end"]
                    label = entity["title"]
                    insert_dict = {
                        "start": start,
                        "end": end,
                        "label": label,
                        "surface": surface,
                        "uri": uri,
                        "category_tool": "",
                        "category": category,
                        "dpaid": dpaId,
                        "timestamp": '{:%Y-%m-%d %H:%M:%S}'.format(
                            datetime.datetime.utcfromtimestamp(t)),
                        "tool": tool_name
                    }
                    annotation.append(insert_dict)
                output = [True, annotation]
                # import IPython
                # IPython.embed()
            except KeyError:
                output = [KeyError, response]
    except KeyError:
        output = [KeyError, response]
    return output
def nerel(text):
    datatxt = DataTXT(app_id='5cb879ebda544e2e95ce5cefd4963aca',
                      app_key='5cb879ebda544e2e95ce5cefd4963aca')
    response = datatxt.nex(text,
                           min_confidence=0.20,
                           include_abstract=True,
                           include_confidence=True,
                           include_categories=True,
                           include_image=True)
    time = response['annotations']
    mostConfidence = 0
    # print(response)
    index = 0
    entity = []
    abstracts = []
    confidences = []
    mostConf = 0
    mostimage = ""
    categories = []
    for index, row in enumerate(time):
        ca = []
        name = time[index]['spot']
        entity.append(name)
        try:
            abstract = time[index]['abstract']
            abstracts.append(abstract)
            # print(abstract)
        except:
            print('abstract not present')
            abstracts.append("abstract not present")
        try:
            confidence = time[index]['confidence']
            if confidence > mostConfidence:
                # print('ok')
                mostConfidence = confidence
                mostConf = name
                mostimage = time[index]['image']['thumbnail']
            # print(confidence)
            confidences.append(confidence)
        except:
            print('confidence not present')
            confidences.append("")
        try:
            categoria = time[index]['categories']
            ca.append(categoria)
            for r in ca:
                for o in r:
                    # print(o)
                    categories.append(o)
        except:
            print('categories not present')
            # categories.append("")
    return (entity, abstracts, confidences, categories, mostConf, mostimage)
class DandelionAnnotator:
    def __init__(self, app_id, app_key):
        self.app_id = app_id
        self.app_key = app_key
        self.datatxt = DataTXT(app_id=self.app_id, app_key=self.app_key)

    def dandelion_annotation(self, string):
        """
        Gets a string, annotates it, and returns the annotated version with the entities inside
        :param string:
        :return:
        """
        response = self.datatxt.nex(string, include_lod=True)
        annotated_string = string
        shift = 0
        for annotation in response.annotations:
            start = annotation["start"]
            end = annotation["end"]
            print(shift)
            annotated_string = annotated_string[:start + shift] + replace_dbpedia(
                annotation["lod"].dbpedia
            ) + annotated_string[shift + end:]
            print(annotated_string)
            shift = shift + len(replace_dbpedia(annotation["lod"].dbpedia)) - (
                annotation["end"] - annotation["start"])
        return annotated_string
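# --- Hedged usage sketch, not part of the original project ---
# DandelionAnnotator depends on a replace_dbpedia() helper defined elsewhere in
# its repository; the stub below is a hypothetical stand-in so the example can
# run, and the credentials are placeholders.
def replace_dbpedia(dbpedia_uri):
    # hypothetical stub: render an entity mention as its bracketed DBpedia URI
    return '[' + dbpedia_uri + ']'

if __name__ == '__main__':
    annotator = DandelionAnnotator(app_id='YOUR_APP_ID', app_key='YOUR_APP_KEY')
    print(annotator.dandelion_annotation('Mozart was born in Salzburg.'))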
class TestDatatxt(TestCase):
    def setUp(self):
        default_config['app_id'] = os.environ['APP_ID']
        default_config['app_key'] = os.environ['APP_KEY']
        self.datatxt = DataTXT()

    def test_nex(self):
        res = self.datatxt.nex('They say Apple is better than Windows')
        self.assertEqual(
            {annotation.uri for annotation in res.annotations},
            {'http://en.wikipedia.org/wiki/Apple_Inc.',
             'http://en.wikipedia.org/wiki/Microsoft_Windows'}
        )

    def test_sim(self):
        res = self.datatxt.sim(
            'Reports that the NSA eavesdropped on world leaders have "severely'
            ' shaken" relations between Europe and the U.S., German Chancellor'
            ' Angela Merkel said.',
            # --
            'Germany and France are to seek talks with the US to settle a row '
            'over spying, as espionage claims continue to overshadow an EU '
            'summit in Brussels.'
        )
        self.assertGreater(res.similarity, 0.5)

    def test_li(self):
        res = self.datatxt.li("Le nostre tre M sono: mafia, mamma, mandolino")
        self.assertEqual(
            [entry.lang for entry in res.detectedLangs],
            ['it']
        )
        self.assertGreater(res.detectedLangs[0].confidence, 0.9999)

    def test_raises_on_error(self):
        with self.assertRaises(DandelionException):
            self.datatxt.nex(text=None)

    def test_can_set_host(self):
        self.datatxt = DataTXT(host="api.dandelion.eu")
        self.test_nex()
        self.datatxt = DataTXT(host="http://api.dandelion.eu")
        self.test_nex()
def test_can_authenticate(self):
    with self.assertRaises(DandelionException) as context:
        Datagem('administrative-regions')
    self.assertEqual(
        context.exception.message,
        'Param "app_id" is required'
    )
    with self.assertRaises(DandelionException) as context:
        DataTXT()
    self.assertEqual(
        context.exception.message,
        'Param "app_id" is required'
    )
    default_config['app_id'] = os.environ['APP_ID']
    default_config['app_key'] = os.environ['APP_KEY']
    Datagem('administrative-regions')
    DataTXT()
def run(self, tweets_chunks, app_id, app_key):
    datatxt = DataTXT(app_id=app_id, app_key=app_key)
    for tweets in tweets_chunks:
        join_tweets = tweets_chunk.TweetsChunk(tweets)
        pprint.pprint(len(tweets))
        try:
            response = datatxt.nex(
                join_tweets.get_unique_string(), **{
                    "lang": tweets[0]["lang"],
                    "include": [
                        "types", "categories", "abstract", "alternate_labels"
                    ],
                    "social.hashtag": True,
                    "social.mention": True,
                    "min_confidence": 0
                })
            # print(response)
        except DandelionException as e:
            logging.error(e.code, e.message)
            continue
        join_tweets.split_annotation_each_tweet(response.annotations)
        # pprint.pprint(join_tweets.index_tweet)
        for tweet in join_tweets.index_tweet:
            # seed_id = list(self.db_manager.find("seeds", {"handle": tweet["tweet"]["user"]["screen_name"], "id_experiment": self.id_experiment}))
            # if (len(seed_id) > 0):
            #     seed_id = seed_id[0]["_id"]
            # else:
            #     pprint.pprint(tweet["tweet"]["user"]["screen_name"])
            #     continue
            seed_id = tweet["tweet"]["seed"]
            for annotation in tweet["annotations"]:
                annotation["tweet"] = tweet["tweet"]["_id"]
                annotation["seed"] = seed_id
                annotation["concrete_types"] = self.find_concrete_type(
                    annotation["types"], self.ontology)
                annotation["id_experiment"] = self.id_experiment
                # print(annotation)
                self.db_manager.write_mongo("entity", annotation)
def get_entities(self, text, lang='en', min_confidence=0.7, include='types, lod'):
    """
    Given a text, retrieves its entities.

    :param text: the text from which we want to extract entities
    :param lang: the language the text is written in
    :param min_confidence: the minimum value an extracted entity must reach in order to be returned
    :param include: lets you pass extra parameters to request more information from the Dandelion APIs. In particular:
        - types: adds information about the type (taxonomy) of the extracted entity through a list of DBpedia links.
          If lang='en', links to the English DBpedia are returned.
        - lod: adds links to the equivalent entities in DBpedia.
    :return: the list of entities extracted from the document
    """
    entities = []
    self.validate_token()
    datatxt = DataTXT(token=self._tokenList[self._indexToken])
    annotations = datatxt.nex(
        text,
        lang=lang,
        min_confidence=min_confidence,
        include=include
    ).annotations
    for annotation in annotations:
        entities.append({
            'title': annotation.title,
            'wikipediaURI': annotation.lod.wikipedia,
            'dbpediaURI': annotation.lod.dbpedia,
            'types': annotation.types
        })
    self._requests = self._requests + 1
    return entities
class ChunksTest(unittest.TestCase):
    def setUp(self):
        # Retrieve all tweets
        tweets = list(mongo_manager.MongoManager(configuration.db_name).find("tweets", {}))[10:16]
        self.datatxt = DataTXT(app_id=configuration.APP1_ID,
                               app_key=configuration.API_KEY_DANDELION1)
        self.t = tweets_chunk.TweetsChunk(tweets)

    def test_chunks(self):
        unique = self.t.get_unique_string()
        print(unique)
        response = self.datatxt.nex(
            self.t.get_unique_string(), **{
                "include": ["types", "categories", "abstract", "alternate_labels"],
                "social.hashtag": True,
                "social.mention": True
            })
        print(response.annotations)
        self.t.split_annotation_each_tweet(response.annotations)
        print(self.t.index_tweet)
class DandelionEntityExtractor(EntityExtractor):
    # http://mappings.dbpedia.org/server/ontology/classes/
    __dbpedia_type_to_entity_type = {
        'http://dbpedia.org/ontology/Person': EntityType.PERSON,
        'http://dbpedia.org/ontology/Place': EntityType.PLACE,
        'http://dbpedia.org/ontology/Organisation': EntityType.GROUP,
        'http://dbpedia.org/ontology/Group': EntityType.GROUP,
        'http://dbpedia.org/ontology/Event': EntityType.EVENT,
        'http://dbpedia.org/ontology/TimePeriod': EntityType.DATE,
        'http://dbpedia.org/ontology/Activity': EntityType.ACTIVITY,
        'http://dbpedia.org/ontology/Work': EntityType.MANMADEOBJECT
    }

    def __init__(self):
        token = os.environ.get('DANDELION_TOKEN')
        if token is None:
            raise Exception(
                'Environment variable "DANDELION_TOKEN" must be set')
        self.__datatxt = DataTXT(token=token)

    def extract_entities(self, text):
        response = self.__datatxt.nex(text, include_types=True)
        return self.__convert_entities(response.annotations)

    def __convert_entities(self, annotations):
        converted_entities = []
        for annotation in annotations:
            entity_type = self.__convert_types(annotation.types)
            converted_entity = Entity(annotation.label, entity_type,
                                      annotation.start, annotation.end)
            converted_entities.append(converted_entity)
        return converted_entities

    def __convert_types(self, types):
        entity_type = EntityType.THING
        if len(types) > 0:
            for t in types:
                if t in DandelionEntityExtractor.__dbpedia_type_to_entity_type:
                    entity_type = DandelionEntityExtractor.__dbpedia_type_to_entity_type[t]
                    break
        return entity_type
def AnalyseText(text):
    datatxt = DataTXT(app_id='cd32413268454e19a31776d33b5f0ba0',
                      app_key='cd32413268454e19a31776d33b5f0ba0')
    response = datatxt.nex(text, include="categories")
    return response.annotations
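# Hedged usage sketch (the sample sentence is my own, not from the original
# code): AnalyseText() returns the raw Dandelion annotations; with
# include="categories" each annotation should also carry a list of Wikipedia
# categories next to the usual spot/title fields.
if __name__ == '__main__':
    for annotation in AnalyseText('The Colosseum is an ancient amphitheatre in Rome.'):
        print(annotation.spot, '->', annotation.title)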
import dandelion
from dandelion import DataTXT

token = '3d86a1a88bc4456c91f82a0d6043a31f'

from dandelion import default_config
default_config['token'] = token

datatxt = DataTXT()


def analysis(t1, t2):
    # 'never' always uses the semantic algorithm
    semantic = datatxt.sim(t1, t2, bow='never')
    return round(semantic['similarity'] * 100, 2)
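# Hedged usage example (the two sentences below are made-up inputs): analysis()
# returns Dandelion's semantic similarity rescaled to a 0-100 score.
if __name__ == '__main__':
    print(analysis('The car broke down on the highway.',
                   'Our vehicle stopped working on the motorway.'))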
import API_KEYS
from dandelion import DataTXT

datatxt = DataTXT(app_id='YOUR_APP_ID', app_key='YOUR_APP_KEY')
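# Hedged usage sketch, not part of the original snippet: it assumes real
# credentials replace the placeholders above (e.g. values taken from the
# imported API_KEYS module) and shows a minimal entity-extraction call.
response = datatxt.nex('Leonardo da Vinci painted the Mona Lisa.', lang='en')
for annotation in response.annotations:
    print(annotation.spot, '->', annotation.uri)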
from __future__ import print_function
from dandelion import DataTXT
import sys
import os

client = DataTXT(app_id='9d7ee60076304802b131eccf185700c4',
                 app_key='9d7ee60076304802b131eccf185700c4')


def process(line):
    if len(line) > 0:
        response = client.nex(line, lang='en', social_hashtag='true')
        return ",".join([
            os.path.basename(annotation.uri)
            for annotation in response.annotations
        ]).encode('utf-8')
    else:
        return ""


def main():
    try:
        for line in sys.stdin:
            print(process(line.strip()))
    except:
        print("FAIL! " + line, file=sys.stderr)
        raise


if __name__ == "__main__":
    main()
    to_file.close()
    to_file_lemma.flush()
    to_file_lemma.close()
    exit(0)


if __name__ == '__main__':
    path_from = input(f'input file: ')
    path_to = input(f'output file: ')
    path_to_lemma = input(f'output file lemma: ')
    row_from = input(f'row from: ')  # 5320
    sport = input(f'sport: ')
    confidence = input(f'confidence: ')
    # count_dandelion = input(f'dandelion requests: ')
    # token = input(f'token: ')
    datatxt = DataTXT(token='')
    count_dandelion = 0
    s = sparql.Service('http://dbpedia.org/sparql', qs_encoding='utf-8')
    nlp = spacy.load("en_core_web_sm")
    lemmatizer = WordNetLemmatizer()

    # To avoid splitting on (
    prefixes = list(nlp.Defaults.prefixes)
    prefixes.remove('\\(')
    prefix_regex = spacy.util.compile_prefix_regex(prefixes)
    nlp.tokenizer.prefix_search = prefix_regex.search

    # To avoid splitting on )
    suffixes = list(nlp.Defaults.suffixes)
    suffixes.remove('\\)')
    suffix_regex = spacy.util.compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search

    infixes = (
def get_entities_from_dandelion(text):
    # TODO: move the keys to a settings file
    datatxt = DataTXT(app_id='7c418708', app_key='0043c60be84a1f471184a192fe06e540')
    result = datatxt.nex(text, include_lod=True, language='en')
    return result
def __init__(self, app_id, app_key):
    self.app_id = app_id
    self.app_key = app_key
    self.datatxt = DataTXT(app_id=self.app_id, app_key=self.app_key)
def setUp(self):
    default_config['app_id'] = os.environ['APP_ID']
    default_config['app_key'] = os.environ['APP_KEY']
    self.datatxt = DataTXT()
def get_annotation_dandelion(text):
    datatxt = DataTXT(app_id='0b2b87bc', app_key='7f0ae25400535758e9ceae358b3db763')
    result = datatxt.nex(text.decode('latin-1'), include_lod=True, language='en')
    pprint(result)
stopwords_file = arguments['--stopwords']
stopwords = read_stopword(stopwords_file)
new_annotations = arguments['--new-annotations']
different_annotations = arguments['--different-annotations']
processed_items = arguments['--processed-items']
stemmer = Stemmer('italian')

app_id = config.get('keys', 'app_id')
app_key = config.get('keys', 'app_key')
cache_dir = config.get('cache', 'cache_dir')

datatxt = DataTXT(app_id=app_id,
                  app_key=app_key,
                  cache=FileCache(cache_dir)
                  )

g = SKOSGraph()
g.parse(infile, format='xml')

query = u'SELECT DISTINCT ?a ?b WHERE { ?a skos:prefLabel ?b .}'
qres = g.query(query, initNs=dict(skos=SKOSNS))

i = 0
tot = len(qres)
print tot
for subject_url, name in qres:
    i = i + 1
    name = unicode(name)
#!/usr/bin/env python
from dotenv import load_dotenv
from dandelion import DataTXT
import speech_recognition as sr
import random
import yaml
import os

load_dotenv()

r = sr.Recognizer()
mic = sr.Microphone(device_index=0)
datatxt = DataTXT(token=os.getenv('TOKEN'))


def compare(cmd: str, cmds: map):
    best_cmd = list(cmds)[0]
    similarity = 0
    for cmd_ in cmds:
        res = datatxt.sim(cmd_, cmd, lang='en')
        print('>', cmd_, res['similarity'])
        if res['similarity'] > similarity:
            best_cmd = cmd_
            similarity = res['similarity']
    return best_cmd


def listen():
import os
import nltk
import tokenize
from nltk import ne_chunk
from nltk.parse import stanford
from nltk.parse.stanford import StanfordDependencyParser
from graphviz import Source
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.moses import MosesDetokenizer
from dandelion import DataTXT

datatxt = DataTXT(app_id='78697915c52f48f5b3bd6c7bb603b2a2',
                  app_key='78697915c52f48f5b3bd6c7bb603b2a2')

h, k = 100, 100
# num = 0
l = 9
i = 0
j = 0
main = []
trial = []
tagged = [[0 for x in range(h)] for y in range(k)]
named = [[0 for x in range(h)] for y in range(k)]
list1 = [[0 for x in range(h)] for y in range(k)]
detoken = [[0 for x in range(h)] for y in range(k)]

ini_path = 'C:\Stanford'
os.environ['STANFORD_PARSER'] = 'C:\Stanford\stanford-parser.jar'
os.environ['STANFORD_MODELS'] = 'C:\Stanford\stanford-parser-3.5.2-models.jar'
os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk1.8.0_161/'

parser = stanford.StanfordParser(
    'C:\Stanford\stanford-parser.jar',
import requests
import json
import csv
from dandelion import DataTXT
import dandelion
import pandas as pd

datatxt = DataTXT(app_id='', app_key='ENTER YOUR DANDELION API KEY')

if __name__ == '__main__':
    query1 = raw_input("Enter the input claim:")
    inputFile = raw_input("Enter the file name to be checked for:")
    inputFile = inputFile + ".csv"
    colnames = [
        "username", "date", "retweets", "favorites", "text", "geo",
        "mentions", "hashtags", "id", "permalink"
    ]
    with open(inputFile) as csvfile:
        inputReader = pd.read_csv(csvfile, sep="|", error_bad_lines=False)
        textReader = inputReader['text']
        outputFile = inputFile.split('.')[0] + "_results.csv"
        with open(outputFile, "w") as f:
            try:
                for i in range(0, inputReader.shape[0]):
                    query2 = textReader[i]
                    print(query2)
                    try:
                        response = datatxt.sim(query1, query2)
                        if response.similarity > 0.4:
                            f.write(str(i))
                            f.write('|')
                            f.write(query2)
def test_can_set_host(self):
    self.datatxt = DataTXT(host="api.dandelion.eu")
    self.test_nex()
    self.datatxt = DataTXT(host="http://api.dandelion.eu")
    self.test_nex()
import datetime
import codecs
import time
import json

from dandelion import DataTXT

with open("config.json") as fin:
    config_data = json.load(fin)

# datatxt = DataTXT(app_id=config_data['application_id'], app_key=config_data['application_key'])
datatxt = DataTXT(token=config_data['token'])


def simple_clean(text):
    text = " ".join(text.replace("’", "'").split())
    return text.lower()


# import spacy
# nlp = spacy.load('it_core_news_sm',
#                  disable=["tagger", "parser", "ner"])
# def spacy_clean(text):
#     text = " ".join(text.replace("’", "'").split())
#     doc = nlp(text)
#     tokens = [token.lemma_.strip() for token in doc if
#               not token.is_stop
#               and not nlp.vocab[token.lemma_].is_stop
#               and not token.is_punct
#               and not token.is_digit
def setUp(self):
    # Retrieve all tweets
    tweets = list(mongo_manager.MongoManager(configuration.db_name).find("tweets", {}))[10:16]
    self.datatxt = DataTXT(app_id=configuration.APP1_ID,
                           app_key=configuration.API_KEY_DANDELION1)
    self.t = tweets_chunk.TweetsChunk(tweets)
def loginDandelion():
    file = open('credentialsDandelion.json')
    keys = json.load(file)
    datatxt = DataTXT(app_id=keys['app_id'], app_key=keys['app_key'])
    return datatxt
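# Hedged usage sketch (my own example; it assumes credentialsDandelion.json
# holds valid "app_id"/"app_key" entries as read above):
if __name__ == '__main__':
    datatxt = loginDandelion()
    response = datatxt.nex('Rome is the capital of Italy.', lang='en')
    print([annotation.title for annotation in response.annotations])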
def final_score(event, keywords):
    textrazor.api_key = "9dcd16199684c470157ce02dc8ced9357b28f61dd685df6acc8dfd62"
    infocsv = pd.read_csv(event.csv_file.path, header=None)
    print("INFOOOO")
    print(infocsv.shape)
    print(infocsv.iloc[2, 2])
    dandelionclient = DataTXT(app_id='9355e03c7d5e4b879e6af9d8575159d2',
                              app_key='9355e03c7d5e4b879e6af9d8575159d2')
    # keywords = "reactjs, react.js, redux, React.js"
    a = []
    output = []
    for count in range(infocsv.shape[0]):
        applicant = Applicant()
        applicant.name = str(infocsv.iloc[count, 0])
        applicant.college = str(infocsv.iloc[count, 1])
        applicant.email = str(infocsv.iloc[count, 2])
        applicant.github_url = str(infocsv.iloc[count, 3])
        if applicant.github_url == "nan":
            applicant.delete()
            break
        applicant.quora_url = infocsv.iloc[count, 4]
        applicant.resume_link = str(infocsv.iloc[count, 5])
        applicant.number = infocsv.iloc[count, 6]
        applicant.event = event
        applicant.save()
        print("resume_link")
        print(applicant.resume_link)
        print("RESUME INFO")
        # if __name__ == "__main__":
        words = applicant.resume_link.split('/')
        file_id = words[len(words) - 2]
        print("File ID", file_id)
        destination = './' + file_id + '.pdf'
        print("Destination:", destination)
        download_file_from_google_drive(file_id, destination)
        convertapi.api_secret = 'Zgeg7qFLxqDtCAJr'
        result = convertapi.convert('txt', {'File': './' + file_id + '.pdf'})
        result.file.save('./')
        f1 = open('./' + file_id + '.txt', "r", encoding="utf8")
        resumeinfo = f1.read()
        print(resumeinfo)
        print("=" * 100)
        try:
            client = textrazor.TextRazor(extractors=["entities", "topics"])
            response = client.analyze(resumeinfo)
            related_keyword_resume = []
            for topic in response.topics():
                if topic.score > 0.7:
                    related_keyword_resume.append(topic.label)
            rel_key_resume = ', '.join(related_keyword_resume)
            print(rel_key_resume)
            r = dandelionclient.sim(rel_key_resume, keywords, lang="en", bow="one_empty")
            resumesimilarity = r.similarity * 25
        except:
            resumesimilarity = 0
        print("--" * 100)
        print("QUORA INFO")
        quorainfo = get_user_info_quora(applicant.quora_url)
        print(quorainfo)
        print("=" * 100)
        if quorainfo != "":
            try:
                client = textrazor.TextRazor(extractors=["topics"])
                response = client.analyze(quorainfo)
                related_keyword_qra = []
                for topic in response.topics():
                    if topic.score > 0.7:
                        related_keyword_qra.append(topic.label)
                rel_key_quora = ', '.join(related_keyword_qra)
                print(rel_key_quora)
                r = dandelionclient.sim(rel_key_quora, keywords, lang="en", bow="one_empty")
                quorasimilarity = r.similarity * 15
            except Exception as e:
                print(e)
                quorasimilarity = 0
        else:
            quorasimilarity = 0
        print("--" * 100)
        print("GITHUB INFO")
        gitinfo = get_user_info_git(applicant.github_url)[0]
        print(gitinfo)
        print("==" * 100)
        try:
            client = textrazor.TextRazor(extractors=["topics"])
            response = client.analyze(gitinfo)
            related_keyword_git = []
            for topic in response.topics():
                if topic.score > 0.7:
                    related_keyword_git.append(topic.label)
            rel_key_git = ', '.join(related_keyword_git)
            print(rel_key_git)
            print("--" * 100)
            r = dandelionclient.sim(rel_key_git, keywords, lang="en", bow="one_empty")
            gitsimilarity = r.similarity * 60
        except:
            gitsimilarity = 0
        print("+" * 100)
        print(quorasimilarity, resumesimilarity, gitsimilarity)
        a.append(quorasimilarity + resumesimilarity + gitsimilarity)
        applicant.score = a[-1]
        applicant.save()
        output.append(applicant)
    output.sort(key=lambda x: x.score, reverse=True)
    print(a)
    return output
import tqdm
import _pickle as cPickle

from DataReader import DataReader
from dandelion import DataTXT
from dandelion import default_config
from TweetNormalizer import normalizeTweet

# Initializing Dandelion API (can be obtained from https://dandelion.eu/)
default_config['token'] = 'INSERT TOKEN'
datatxt = DataTXT()

# Loading data
dr_tr = DataReader('./Data/olid-training-v1.tsv', 'A')
data_tr, labels_tr = dr_tr.get_labelled_data()
dr_tst = DataReader('./Data/testset-levela.tsv', 'A')
data_tst, label_tst = dr_tst.get_test_data()

data_tr = data_tr[:]
data_tst = data_tst[:]

entities_tr = []
entities_tst = []

# Entity extraction using dandelion
for line in tqdm.tqdm(data_tr):
    temp = []
    for annotation in datatxt.nex(normalizeTweet(line), lang='en').annotations:
        temp.append(annotation.title)
    entities_tr.append(temp)

for line in tqdm.tqdm(data_tst):
response = urllib2.urlopen(m.group(1))
html = response.read()

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
# print lxml.html.tostring(cleaner.clean_html(lxml.html.parse(url)))
clean = cleaner.clean_html(lxml.html.parse(url))
clean = lxml.html.tostring(clean)
soup = BeautifulSoup(clean, 'lxml')
text = soup.get_text()

datatxt = DataTXT(app_id='d40305b7', app_key='7d432531dfb0d3173212d4203f25d4b6')
# response = datatxt.sim(text, "The ultimate skel-ebration of monster mania, this year's Monster High dance will be the monster bash to end all bashes (if it happens)! And as the Monster High ghouls make new beast friends, the horror show really begins. This freaky fabulous new character is larger than unlife at 17 inches tall! And of course, she wears an over-the-tent fashion with lots of ")

paragraphs = list()
match = list()
for line in text.splitlines():
    if len(line) > 20:
        paragraphs.append(line)
paragraphs = paragraphs[0:5]

for p in paragraphs:
    response = datatxt.sim(
        p,
        "The ultimate skel-ebration of monster mania, this year's Monster High dance will be the monster bash to end all bashes (if it happens)! And as the Monster High ghouls make new beast friends, the horror show really begins. This freaky fabulous new character is larger than unlife at 17 inches tall! And of course, she wears an over-the-tent fashion with lots of "