def top20_verbs(txt_name):
    """Plot and save a bar chart of the 20 most frequent verb lemmas in a text file.

    :param txt_name: path to a UTF-8 text file to analyze.
    :returns: path of the saved PNG ("out/nlp/nlp_top20_verbs.png").
    """
    nlp = dao.spacy_load_en()
    with open(txt_name, 'r') as myfile:
        # Newlines are stripped so spaCy sees one continuous document.
        article = myfile.read().replace('\n', '')
    # Collect verb lemmas, excluding common modal/auxiliary lemmas.
    modals = {'will', 'would', 'could', 'may', 'can'}
    results = [
        token.lemma_
        for token in nlp(article)
        if token.pos_ == 'VERB' and token.lemma_ not in modals
    ]
    counts = Counter(results)
    labels, values = zip(*counts.items())
    # Indices of counts in descending order.
    indSort = np.argsort(values)[::-1]
    # BUG FIX: was indSort[:19], which kept only 19 verbs despite the name.
    if len(indSort) > 20:
        indSort = indSort[:20]
    labels = np.array(labels)[indSort]
    values = np.array(values)[indSort]
    indexes = np.arange(len(labels))
    bar_width = 0.35
    plt.figure()
    plt.bar(indexes, values)
    # add labels
    plt.xticks(indexes + bar_width, labels, rotation=45)
    # BUG FIX: save before show — plt.show() may clear the current figure,
    # leaving a blank PNG if savefig runs afterwards.
    plt.savefig("out/nlp/nlp_top20_verbs.png", dpi=100)
    plt.show()
    return "out/nlp/nlp_top20_verbs.png"
def __init__(self, language='english'):
    """Initialize NLP helper resources.

    :param language: accepted for API compatibility; only English
        resources are loaded regardless of its value — TODO confirm
        with callers whether other languages were ever intended.
    """
    # Shared SpaCy English model, loaded via the project's dao layer.
    self.nlp = dao.spacy_load_en()
    # NLTK Punkt sentence tokenizer (English pickle).
    self.sent_detector = data.load('tokenizers/punkt/english.pickle')
    self.analyzer = SentimentIntensityAnalyzer()  # for sentiment analysis
def top20_organizations(txt_name):
    """Plot and save a bar chart of the 20 most frequent ORG entities in a text file.

    :param txt_name: path to a UTF-8 text file to analyze.
    :returns: path of the saved PNG ("out/nlp/top20_organizations.png").
    """
    nlp = dao.spacy_load_en()
    with open(txt_name, 'r') as myfile:
        # Newlines are stripped so spaCy sees one continuous document.
        article = myfile.read().replace('\n', '')
    parsed_phrase = nlp(article)
    # Keep only named entities labeled as organizations.
    results = [e.text for e in parsed_phrase.ents if e.label_ == 'ORG']
    counts = Counter(results)
    labels, values = zip(*counts.items())
    # Indices of counts in descending order.
    indSort = np.argsort(values)[::-1]
    # BUG FIX: was indSort[:19], which kept only 19 entries despite the name.
    if len(indSort) > 20:
        indSort = indSort[:20]
    labels = np.array(labels)[indSort]
    values = np.array(values)[indSort]
    indexes = np.arange(len(labels))
    bar_width = 0.35
    plt.figure()
    plt.bar(indexes, values)
    # add labels
    plt.xticks(indexes + bar_width, labels, rotation=90)
    # BUG FIX: save before show — plt.show() may clear the current figure,
    # leaving a blank PNG if savefig runs afterwards.
    plt.savefig("out/nlp/top20_organizations.png", dpi=100)
    plt.show()
    return "out/nlp/top20_organizations.png"
def lemmatize(txt_name):
    """Return the contents of *txt_name* with every token replaced by its lemma.

    :param txt_name: path to a UTF-8 text file.
    :returns: single space-joined string of lemmas.
    """
    nlp = dao.spacy_load_en()
    with open(txt_name, 'r') as myfile:
        # Collapse the file to one line so spaCy processes a single document.
        flattened = myfile.read().replace('\n', '')
    return ' '.join(token.lemma_ for token in nlp(flattened))
def __init__(self, language='english'):
    """Initialize NLP helper resources and the key-verb lexicon.

    :param language: accepted for API compatibility; only English
        resources are loaded regardless of its value — TODO confirm
        with callers whether other languages were ever intended.
    """
    # Shared SpaCy English model, loaded via the project's dao layer.
    self.nlp = dao.spacy_load_en()
    # NLTK Punkt sentence tokenizer (English pickle).
    self.sent_detector = data.load('tokenizers/punkt/english.pickle')
    self.analyzer = SentimentIntensityAnalyzer()  # for sentiment analysis
    # Key verbs of interest, read from the bundled CSV resource.
    # NOTE(review): path is relative to the process CWD, not this module —
    # breaks if the app is launched from another directory; verify.
    self.keyverbs = list(
        pd.read_csv('gat/service/nlp_resources/KeyVerbs.csv')['key_verbs'])
def __init__(self, language='english', search_question='', article_count=0):
    """Initialize shared state, per-field locks, and NLP resources.

    :param language: accepted for API compatibility; only English
        resources are loaded regardless of its value — TODO confirm.
    :param search_question: query string stored for later processing.
    :param article_count: number of articles to handle; coerced to int.
    """
    super().__init__()
    # Mutable fields below are each paired with a dedicated Lock,
    # suggesting they are accessed from multiple threads — guard all
    # reads/writes with the matching lock.
    self.messages = []
    self.messages_lock = threading.Lock()
    self.result = None
    self.result_lock = threading.Lock()
    self.result_ontology = None
    self.result_ontology_lock = threading.Lock()
    # Shared SpaCy English model, loaded via the project's dao layer.
    self._nlp = dao.spacy_load_en()
    # NLTK Punkt sentence tokenizer (English pickle).
    self.__sent_detector = data.load('tokenizers/punkt/english.pickle')
    self.__analyzer = SentimentIntensityAnalyzer(
    )  # for sentiment analysis
    # Resource CSVs are resolved relative to this module's directory,
    # so loading works regardless of the process CWD.
    current_file_path = os.path.dirname(os.path.abspath(__file__))
    self.__keyverbs = list(
        pd.read_csv(os.path.join(current_file_path, 'KeyVerbs.csv'))['key_verbs'])
    self.__allcities = list(
        pd.read_csv(os.path.join(current_file_path, 'Allcities.csv'))['City'])
    self.__search_question = search_question
    self.__article_count = int(article_count)
def top5accuracy(y_true, y_pred):
    """Fraction of samples whose true label appears in its candidate predictions.

    :param y_true: sequence of true labels.
    :param y_pred: parallel sequence of prediction collections.
    :returns: hit rate as a float in [0, 1].
    """
    hits = sum(1 for truth, candidates in zip(y_true, y_pred)
               if truth in candidates)
    return hits * 1.0 / len(y_true)


# Module-level artifacts loaded once at import time: the trained classifier,
# the CAMEO code vectorization rules, the CAMEO code book, the 3000-word
# feature vocabulary, and the shared SpaCy English model.
model = joblib.load('gat/CameoPrediction/model.pkl')
vector_rule = pd.read_csv('gat/CameoPrediction/vectorize_rules.txt',
                          sep=' ', header=None)
cameo_book = pd.read_csv('gat/CameoPrediction/CAMEO_code_new.csv')
top_words = list(
    pd.read_csv('gat/CameoPrediction/top_all_words_from_analysis.txt',
                sep=' ', header=None).head(3000)[0])
nlp = dao.spacy_load_en()


def top5CAMEO(sentence):
    """Predict the five most likely CAMEO event categories for a sentence.

    :param sentence: raw text of one sentence.
    :returns: list of CAMEO 'Move' descriptions for the top-5 predictions.
    """
    lemmas = [tok.lemma_ for tok in nlp(sentence)]
    # Binary bag-of-words over the fixed 3000-word vocabulary.
    features = np.zeros(3000, dtype=int)
    for position, word in enumerate(top_words):
        if word in lemmas:
            features[position] = 1
    probabilities = model.predict_proba(features.reshape(1, -1))
    top5_vectors = top5pred(probabilities)[0]
    # Map vector indices -> CAMEO codes -> human-readable 'Move' labels.
    top5_codes = list(vector_rule[vector_rule[1].isin(top5_vectors)][0])
    return list(cameo_book[cameo_book['Code'].isin(top5_codes)]['Move'])
def loadModel(language):
    """Return the shared SpaCy language model.

    Only English is supported; *language* is accepted for API
    compatibility but ignored. Kept as a separate function because
    loading the model is computationally expensive.
    """
    return dao.spacy_load_en()