import json

# Older ibm_watson SDK (<4.0) constructor style with iam_apikey/url kwargs.
from ibm_watson import NaturalLanguageUnderstandingV1 as NLU
from ibm_watson.natural_language_understanding_v1 import (
    Features, EntitiesOptions, KeywordsOptions)


def relevancy_dict(chunk):
    # Analyze a chunk of text for entities and keywords with Watson NLU.
    service = NLU(
        version='2018-03-16',
        url='https://gateway.watsonplatform.net/natural-language-understanding/api',
        iam_apikey='#####')
    response = service.analyze(
        text=chunk,
        features=Features(
            entities=EntitiesOptions(),
            keywords=KeywordsOptions())).get_result()
    # get_result() already returns the parsed JSON response as a dict.
    return response
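# A minimal usage sketch for relevancy_dict (assumes a real IAM API key has
# replaced the '#####' placeholder above; the sample sentence is arbitrary).
# The returned dict mirrors the raw NLU JSON, so the 'keywords' and
# 'entities' lists can be read directly.
analysis = relevancy_dict('IBM Watson analyzes text for entities and keywords.')
for kw in analysis.get('keywords', []):
    print(kw['text'], kw['relevance'])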
def analyze_text(corpus_id, text, type, n_archs):
    # Analyze one document with Watson NLU and compare it against the
    # precomputed archetypes of the given corpus.
    features = Features(
        concepts=ConceptsOptions(),
        entities=EntitiesOptions(),
        keywords=KeywordsOptions(),
    )
    authenticator = IAMAuthenticator(
        current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_IAM_APIKEY'])
    service = NaLaUn(
        version=current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_VERSION'],
        authenticator=authenticator)
    service.set_service_url(
        current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_URL'])
    response = service.analyze(text=text, features=features)

    # Collect each NLU feature's results into a DataFrame.
    results = {}
    for typ in ['entities', 'concepts', 'keywords']:
        results[typ] = pd.DataFrame(response.result[typ])

    # Normalized concept-relevance vector for the test document.
    test_vec = \
        results['concepts'].set_index('text')[['relevance']].apply(norm_dot)
    archetypes = get_corpus_archetypes(corpus_id, type=type, n_archs=n_archs)

    # Select the subset of features in the corpus that cover the test vector.
    in_common = list(set(test_vec.index).intersection(
        set(archetypes.fn.columns)))
    similarities = (
        (archetypes.fn[in_common] @ test_vec.loc[in_common]) * 100
    ).applymap(int)
    similarities.columns = ['similarity %']

    # Side-by-side comparison of each archetype with the document ('DOC').
    test_vec_expanded = pd.DataFrame(
        test_vec, index=archetypes.f.columns).apply(scale).fillna(-0.1)
    compare = archetypes.f.T.apply(scale)
    compare['DOC'] = test_vec_expanded.apply(scale)

    archetype_maps = []
    for ix in archetypes.f.index:
        cmp = compare.sort_values(by=ix, ascending=True)[[ix, 'DOC']]
        cmp = cmp[cmp[ix] > 0.1]
        archetype_maps.append(cmp.applymap(np.sqrt))
    return similarities, archetype_maps
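# norm_dot and scale are project-local helpers that analyze_text assumes but
# does not define here. A plausible sketch (hypothetical, not the project's
# actual code): norm_dot rescales a Series to unit Euclidean length, so the
# `archetypes.fn @ test_vec` product above behaves like a cosine similarity,
# and scale min-max normalizes values into [0, 1].
import numpy as np

def norm_dot(v):
    # Unit (L2) normalization; leaves an all-zero vector unchanged.
    n = np.sqrt(v @ v)
    return v / n if n else v

def scale(v):
    # Min-max normalization to the [0, 1] interval.
    span = v.max() - v.min()
    return (v - v.min()) / span if span else v * 0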
def analyze_corpus(app, name, directory):
    # Analyze every .txt file in a directory and store the pickled NLU
    # results in the database, retrying each file up to three times.
    features = Features(
        concepts=ConceptsOptions(),
        entities=EntitiesOptions(),
        keywords=KeywordsOptions(),
    )
    with app.app_context():
        authenticator = IAMAuthenticator(
            app.config['NATURAL_LANGUAGE_UNDERSTANDING_IAM_APIKEY'])
        service = NaLaUn(
            version=app.config['NATURAL_LANGUAGE_UNDERSTANDING_VERSION'],
            authenticator=authenticator)
        service.set_service_url(
            app.config['NATURAL_LANGUAGE_UNDERSTANDING_URL'])

        filenames = os.listdir(directory)
        new_corpus = Corpus(name=name, status='processing')
        db.session.add(new_corpus)
        db.session.commit()
        db.session.flush()
        print('Analyzing corpus in thread. Corpus ID: ' + str(new_corpus.id))

        count = 0
        for file in filenames:
            path = os.path.join(directory, file)
            if not os.path.isfile(path) or not file.endswith('.txt'):
                continue
            with open(path) as f:
                text = f.read()  # read once; a retry would otherwise hit EOF
            for i in range(3):
                try:
                    results = service.analyze(text=text, features=features)
                    new_results = CorpusResult(
                        corpus_id=new_corpus.id,
                        name=file.replace('.txt', ''),
                        data=pickle.dumps(results))
                    db.session.add(new_results)
                    db.session.commit()
                    count += 1
                    print('Processed file #{}: {}'.format(count, file))
                except Exception as e:
                    print(e)
                    time.sleep(0.5)
                    print('Retrying...')
                else:
                    break
            else:
                print('Failed to analyze a file ({}) after '
                      'multiple attempts.'.format(file))
        new_corpus.status = 'ready'
        db.session.commit()
        print('Finished analyzing corpus.')
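# Usage sketch: the log message above suggests analyze_corpus is meant to run
# off the request thread. One hypothetical way to launch it from a Flask view
# (the corpus name and directory are placeholders):
import threading

worker = threading.Thread(
    target=analyze_corpus,
    args=(current_app._get_current_object(), 'my-corpus', './data/texts'),
    daemon=True)
worker.start()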
from typing import Any
import time

import ibm_cloud_sdk_core
import ibm_watson
import ibm_watson.natural_language_understanding_v1 as nlu


def call_nlu_with_retry(doc_html: str,
                        natural_language_understanding:
                        ibm_watson.NaturalLanguageUnderstandingV1,
                        extract_entities: bool,
                        extract_semantic_roles: bool) -> Any:
    """
    Pass a document through Natural Language Understanding, performing the
    analyses we need for the current use case. Also handles retrying with
    exponential backoff.

    :param doc_html: HTML contents of the web page
    :param natural_language_understanding: Preinitialized instance of the
     NLU Python API
    :param extract_entities: Whether to run the entities model
    :param extract_semantic_roles: Whether to run the semantic roles model
    :returns: Python object encapsulating the parsed JSON response from the
     web service.
    """
    if extract_entities and extract_semantic_roles:
        nlu_features = nlu.Features(
            entities=nlu.EntitiesOptions(mentions=True),
            semantic_roles=nlu.SemanticRolesOptions())
    elif extract_entities and not extract_semantic_roles:
        nlu_features = nlu.Features(
            entities=nlu.EntitiesOptions(mentions=True))
    elif not extract_entities and extract_semantic_roles:
        nlu_features = nlu.Features(
            semantic_roles=nlu.SemanticRolesOptions())
    else:
        raise ValueError("Must run at least one NLU model.")

    num_tries = 0
    MAX_RETRIES = 8
    RATE_LIMIT_ERROR_CODE = 429
    while num_tries < MAX_RETRIES:
        num_tries += 1
        try:
            return natural_language_understanding.analyze(
                html=doc_html,
                return_analyzed_text=True,
                features=nlu_features).get_result()
        except ibm_cloud_sdk_core.api_exception.ApiException as e:
            # Retry logic in case we hit the rate limit
            if e.code != RATE_LIMIT_ERROR_CODE:
                raise e
            sleep_time = 2 ** (num_tries - 1)
            print(f"Request failed {num_tries} times; "
                  f"retrying in {sleep_time} sec")
            time.sleep(sleep_time)
    raise Exception(f"Exceeded limit of {MAX_RETRIES} retries.")
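# Usage sketch, assuming `service` is an already-authenticated
# NaturalLanguageUnderstandingV1 instance and the HTML is a stand-in document.
# With both flags set, the response carries 'entities' (with mentions) and
# 'semantic_roles' sections alongside 'analyzed_text'.
result = call_nlu_with_retry(
    doc_html='<html><body><p>IBM acquired Red Hat in 2019.</p></body></html>',
    natural_language_understanding=service,
    extract_entities=True,
    extract_semantic_roles=True)
print(result['analyzed_text'])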
import os
import pickle

import pandas as pd
import seaborn as sns

from ibm_watson import NaturalLanguageUnderstandingV1 as NaLaUn
from ibm_watson.natural_language_understanding_v1 import (
    Features, CategoriesOptions, ConceptsOptions, EntitiesOptions,
    KeywordsOptions, RelationsOptions, SyntaxOptions)


class DocumentArchetypes:
    '''
    DocumentArchetypes performs Archetypal Analysis on a corpus consisting
    of a set of documents, for example a set of articles, books, news
    stories or medical dictations.

    Input parameters:

    PATH - Dictionary with paths to I/O
        PATH['data']    - Directory for input text files.
                          Example: './data/input_texts/'
        PATH['results'] - Directory for output.
                          Example: './data/output_nlu/'

    NLU - Dictionary with information for running Watson NLU
        NLU['apikey']   - apikey for running Watson NLU
        NLU['apiurl']   - URL for Watson NLU API
        NLU['version']  - Watson NLU version, e.g. '2019-07-12'
        NLU['features'] - Features requested from Watson NLU for each
                          document in the set, e.g.
                          Features(categories=CategoriesOptions(),
                                   concepts=ConceptsOptions(),
                                   entities=EntitiesOptions(),
                                   keywords=KeywordsOptions(),
                                   relations=RelationsOptions(),
                                   syntax=SyntaxOptions())

    Attributes:
        self.PATH
    '''

    def __init__(self, PATH, NLU):
        self.PATH = PATH
        self.NLU = NLU
        # Local Natural Language Understanding object
        self.nlu_model = NaLaUn(version=NLU['version'],
                                iam_apikey=NLU['apikey'],
                                url=NLU['apiurl'])
        self.archetypes_dic = {}

        ################
        ## PREPARE DATA
        ################
        self.filenames = os.listdir(self.PATH['data'])
        self.dictation_dic = {}  # dictionary for dictation files
        for name in self.filenames:
            self.dictation_dic[name.replace('.txt', '')] = \
                open(self.PATH['data'] + name).read()

        ###############################
        ## PERFORM WATSON NLU ANALYSIS
        ###############################
        self.watson = {}  # Watson NLU results for each dictation
        self.watson_pkl = PATH['results'] + 'all_dictations_nlu.pkl'
        pkl_exists = os.path.exists(self.watson_pkl)

        if pkl_exists:
            self.watson = pickle.load(open(self.watson_pkl, "rb"))
        else:  # perform NLU analysis on dictations
            for lbl, text in self.dictation_dic.items():
                self.watson[lbl] = self.nlu_model.analyze(
                    text=text, features=NLU['features'])
                with open(PATH['results'] + str(lbl) + '_nlu.pkl', 'wb') as f:
                    pickle.dump(self.watson[lbl], f)
            with open(self.watson_pkl, 'wb') as f:
                pickle.dump(self.watson, f)

        # Copy Watson NLU results to Pandas DataFrames
        self.watson_nlu = {}
        for lbl, result in self.watson.items():
            self.watson_nlu[lbl] = {}
            for feature, data in result.result.items():
                self.watson_nlu[lbl][feature] = pd.DataFrame(list(data))

    ##############
    # ARCHETYPAL ANALYSIS
    ##############
    def archetypes(self, typ='entities', n_archs=6, bootstrap=False,
                   bootstrap_frac=0.5):
        # Results are cached per feature type and hyperparameter tuple.
        hyperparam = (n_archs, bootstrap, bootstrap_frac)
        if typ not in self.archetypes_dic.keys():
            self.archetypes_dic[typ] = {}
        if hyperparam not in self.archetypes_dic[typ].keys():
            df = pd.DataFrame()
            for key in self.watson_nlu:
                dfx = self.watson_nlu[key][typ].copy()
                dfx['dictation'] = key
                df = pd.concat([df, dfx], sort=True)
            if typ == 'entities':
                df = df[df['type'] == 'HealthCondition']
                df.rename({'relevance': 'rel0'}, axis=1, inplace=True)
                df['relevance'] = df['rel0'] * df['confidence']
            mat = df.pivot_table(index='dictation', columns='text',
                                 values='relevance').fillna(0)
            # Archetypes is assumed to be defined elsewhere in the project.
            self.archetypes_dic[typ][hyperparam] = Archetypes(
                mat, n_archs, bootstrap=bootstrap,
                bootstrap_frac=bootstrap_frac)
        return self.archetypes_dic[typ][hyperparam]

    def display_archetype(self, typ='entities', n_archs=6, arch_nr=0,
                          var='variables', threshold=0.1):
        if var == 'variables':
            arc = self.archetypes(typ, n_archs).f.T.sort_values(
                by=arch_nr, ascending=False)
            return arc[arc[arch_nr] >= (threshold * arc[arch_nr][0])]
        elif var == 'dictations':
            return sns.clustermap(self.archetypes(typ, n_archs).o).data2d
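# Usage sketch for DocumentArchetypes (paths, credentials and the feature set
# are placeholder examples):
PATH = {'data': './data/input_texts/', 'results': './data/output_nlu/'}
NLU = {'apikey': '<apikey>',
       'apiurl': '<service-url>',
       'version': '2019-07-12',
       'features': Features(concepts=ConceptsOptions(),
                            entities=EntitiesOptions(),
                            keywords=KeywordsOptions())}
da = DocumentArchetypes(PATH, NLU)
# Top entities for archetype 0, cut off at 10% of the leading weight.
top_entities = da.display_archetype(typ='entities', n_archs=6, arch_nr=0)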
dictation_analysis = {}
dian = dictation_analysis

# If the dictation_analysis dictionary already exists - read the pickled
# file. If it does NOT already exist, perform the calculations.
dian_pkl_file = PATH['results'] + 'all_dictations_nlu.pkl'
dian_pkl_exists = os.path.exists(dian_pkl_file)

if dian_pkl_exists:
    dian = pickle.load(open(dian_pkl_file, "rb"))
else:  # perform NLU analysis on dictations
    for lbl, text in dictation_dic.items():
        dian[lbl] = nlu.analyze(text=text, features=NLU['features'])
        with open(PATH['results'] + str(lbl) + '_nlu.pkl', 'wb') as f:
            pickle.dump(dian[lbl], f)
    with open(dian_pkl_file, 'wb') as f:
        pickle.dump(dian, f)

# Transform dian to Pandas DataFrames
df_dic = {}
for lbl, result in dian.items():
    df_dic[lbl] = {}
    for feature, data in result.result.items():
        df_dic[lbl][feature] = pd.DataFrame(list(data))
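# Quick inspection sketch: df_dic is keyed by dictation label, then by NLU
# feature name ('entities', 'keywords', ...), each value a DataFrame.
# ('note1' is a hypothetical label.)
print(df_dic['note1']['keywords'].head())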
from ibm_watson import NaturalLanguageUnderstandingV1 as NaLaUn
from ibm_watson.natural_language_understanding_v1 import (
    Features, CategoriesOptions, ConceptsOptions, EntitiesOptions,
    KeywordsOptions, RelationsOptions, SyntaxOptions)


class WatsonDocumentArchetypes:
    '''
    WatsonDocumentArchetypes performs Archetypal Analysis on a corpus
    consisting of a set of documents, for example a set of articles, books,
    news stories or medical dictations.

    Input parameters:

    PATH - Dictionary with paths to I/O
        PATH['data']    - Directory for input text files.
                          Example: './data/input_texts/'
        PATH['results'] - Directory for output.
                          Example: './data/output_nlu/'

    NLU - Dictionary with information for running Watson NLU
        NLU['apikey']   - apikey for running Watson NLU
        NLU['apiurl']   - URL for Watson NLU API
        NLU['version']  - Watson NLU version, e.g. '2019-07-12'
        NLU['features'] - Features requested from Watson NLU for each
                          document in the set, e.g.
                          Features(categories=CategoriesOptions(),
                                   concepts=ConceptsOptions(),
                                   entities=EntitiesOptions(),
                                   keywords=KeywordsOptions(),
                                   relations=RelationsOptions(),
                                   syntax=SyntaxOptions())

    Attributes:
        self.PATH
    '''

    def __init__(self, PATH, NLU, train_test=False):
        self.PATH = PATH
        self.NLU = NLU
        # To randomly partition documents into train/test sets, choose the
        # relative size of the test set, train_test (1 = 100%).
        self.train_test = train_test
        # Local Natural Language Understanding object
        self.nlu_model = NaLaUn(version=NLU['version'],
                                iam_apikey=NLU['apikey'],
                                url=NLU['apiurl'])

        # Initiate X_matrix dictionaries
        self.X_matrix_dic = {}
        self.X_matrix_train_dic = {}
        self.X_matrix_test_dic = {}
        self.archetypes_dic = {}

        ################
        ## PREPARE DATA
        ################
        # ls is assumed to be a project helper that lists matching filenames.
        self.filenames = ls(self.PATH['data'] + '*.txt', name_only=True)
        self.names = [name.replace('.txt', '') for name in self.filenames]
        # Copy; if train_test, self.names will be set to self.names_train.
        self.all_names = self.names * 1
        self.dictation_dic = {}  # dictionary for dictation files
        for name in self.filenames:
            self.dictation_dic[name.replace('.txt', '')] = open(
                self.PATH['data'] + name, encoding="utf-8").read()
        self.dictation_df = pd.Series(self.dictation_dic)

        ####################
        ## TRAIN-TEST SPLIT
        ####################
        if self.train_test:
            # 0 < train_test < 1: the proportion of names to set aside as
            # 'test' (rounded downwards). random_split is a project helper.
            self.names_test, self.names_train = random_split(
                self.all_names, self.train_test)
            self.names = self.names_train

        ###############################
        ## PERFORM WATSON NLU ANALYSIS
        ###############################
        self.watson = {}  # Watson NLU results for each dictation
        self.watson_pkl = PATH['results'] + 'all_dictations_nlu.pkl'
        pkl_exists = os.path.exists(self.watson_pkl)

        if pkl_exists:
            self.watson = pickle.load(open(self.watson_pkl, "rb"))
        else:  # perform NLU analysis on dictations
            for lbl, text in self.dictation_dic.items():
                self.watson[lbl] = self.nlu_model.analyze(
                    text=text, features=NLU['features'])
                with open(PATH['results'] + str(lbl) + '_nlu.pkl', 'wb') as f:
                    pickle.dump(self.watson[lbl], f)
            with open(self.watson_pkl, 'wb') as f:
                pickle.dump(self.watson, f)

        # Copy Watson NLU results to Pandas DataFrames
        self.watson_nlu = {}
        for lbl, result in self.watson.items():
            self.watson_nlu[lbl] = {}
            for feature, data in result.result.items():
                self.watson_nlu[lbl][feature] = pd.DataFrame(list(data))

    ##############
    # ARCHETYPAL ANALYSIS
    ##############

    # CONSTRUCT X-MATRIX
    def X_matrix(self, typ='entities'):
        '''
        Construct the archetypal analysis X-matrix by pivoting the dataframe
        in the dictionary my_wda.watson_nlu that contains the Watson NLU
        analysis in question.

        X_matrix(typ)
            rows   : Dictations
            columns: Variables; keywords/entities/concepts, from Watson NLU
                     analysis
            values : Weights, from Watson NLU analysis

        The constructed X_matrix(typ) is saved as X_matrix_dic[typ].
        If my_wda.train_test has a value (not False),
        X_matrix_train_dic[typ] and X_matrix_test_dic[typ] are computed and
        added to their respective dictionaries.
        '''
        if typ not in self.X_matrix_dic.keys():
            df = pd.DataFrame()
            for key in self.names:
                dfx = self.watson_nlu[key][typ].copy()
                dfx['dictation'] = key
                df = pd.concat([df, dfx], sort=True)
            if typ == 'entities':
                df = df[df['type'] == 'HealthCondition']
                df.rename({'relevance': 'rel0'}, axis=1, inplace=True)
                df['relevance'] = df['rel0'] * df['confidence']
            self.X_matrix_dic[typ] = df.pivot_table(
                index='dictation', columns='text',
                values='relevance').fillna(0)
            if self.train_test:
                self.X_matrix_train_dic[typ] = self.X_matrix_dic[typ]
                df = pd.DataFrame()
                for key in self.names_test:
                    dfx = self.watson_nlu[key][typ].copy()
                    dfx['dictation'] = key
                    df = pd.concat([df, dfx], sort=True)
                if typ == 'entities':
                    df = df[df['type'] == 'HealthCondition']
                    df.rename({'relevance': 'rel0'}, axis=1, inplace=True)
                    df['relevance'] = df['rel0'] * df['confidence']
                self.X_matrix_test_dic[typ] = df.pivot_table(
                    index='dictation', columns='text',
                    values='relevance').fillna(0)
        return self.X_matrix_dic[typ]

    # CALCULATE ARCHETYPES
    def archetypes(self, typ='entities', n_archs=6, bootstrap=False,
                   bootstrap_frac=0.5):
        if typ not in self.archetypes_dic.keys():
            self.archetypes_dic[typ] = {}
        hyperparam = (n_archs, bootstrap, bootstrap_frac)
        self.archetypes_dic[typ][hyperparam] = Archetypes(
            self.X_matrix(typ), n_archs, bootstrap=bootstrap,
            bootstrap_frac=bootstrap_frac)
        return self.archetypes_dic[typ][hyperparam]

    def display_archetype(self, arch_nr=-1, typ='entities', n_archs=6,
                          var='variables', threshold=0.1, norm=scale):
        if var == 'variables':
            f = self.archetypes(typ=typ, n_archs=n_archs).f.T
        else:  # var == 'dictations'
            f = self.archetypes(typ=typ, n_archs=n_archs).o
        fn = f.apply(norm)
        if arch_nr == -1:
            return sns.clustermap(f).data2d
        arc = f.sort_values(by=arch_nr, ascending=False)
        return arc[arc[arch_nr] >= (threshold * arc[arch_nr][0])]
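# Usage sketch with a 20% held-out test split (PATH and NLU as in the
# DocumentArchetypes example above):
wda = WatsonDocumentArchetypes(PATH, NLU, train_test=0.2)
arch = wda.archetypes(typ='keywords', n_archs=4)   # fit on the training split
grid = wda.display_archetype(typ='keywords', n_archs=4)  # clustermap over all archetypes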