def create_index():
    """Build a TF-IDF similarity index over all articles and persist it.

    Reads every article's title and summary from the database, vectorizes
    them, and saves both the gensim Similarity index and a row-index ->
    article-id mapping to pickle files.
    """
    connection = connect()
    frame = pd.read_sql(""" SELECT id, title, summary FROM articles""",
                        connection)
    # One searchable document per article: "<title>. <summary>".
    documents = (frame['title'] + '. ' + frame['summary']).tolist()
    tfidf_corpus, corpus_dict = get_tfidf(documents)
    sim_index = Similarity('index', tfidf_corpus,
                           num_features=len(corpus_dict))
    pickle_save(sim_index, 'similarity_index.pckl')
    # Positional index in the similarity matrix -> article id.
    pickle_save(frame['id'].to_dict(), 'idx_to_arxiv.pckl')
    connection.close()
def __get_tfidf_similarity_index(texts):
    """Takes a list of strings as input. Returns a gensim.Similarity object
    for calculating cosine similarities."""
    tokenized = [__tokenize_text(text) for text in texts]
    logging.debug('Creating corpora dictionary...')
    dictionary = corpora.Dictionary(tokenized)
    logging.debug('Done creating corpora dictionary.')
    # gensim works on integer token ids, so convert each document to
    # bag-of-words form via the dictionary.
    bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized]
    # Apply the TF-IDF model to its own training corpus to obtain the
    # weighted vectors for these texts.
    tfidf_corpus = models.TfidfModel(bow_corpus, normalize=True)[bow_corpus]
    logging.debug('Creating Similarity index...')
    sim_index = Similarity(None, tfidf_corpus, num_features=len(dictionary))
    logging.debug('Done creating Similarity index.')
    return sim_index
#coding:utf-8
import os
from gensim import corpora, models
import logging
from gensim.similarities.docsim import Similarity

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

DICT_PATH = "dict/pat.dict"
CORPUS_PATH = "dict/pat.mm"

# Load the dictionary/corpus produced by the first tutorial. BUG FIX: the
# original only printed a warning when the files were missing and then fell
# through, crashing later with a NameError on `corpus`; exit explicitly.
if os.path.exists(DICT_PATH):
    dictionary = corpora.Dictionary.load(DICT_PATH)
    corpus = corpora.MmCorpus(CORPUS_PATH)
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")
    raise SystemExit(1)

if os.path.exists("model/pat_tfidf.model"):
    tfidf_model = models.TfidfModel.load("model/pat_tfidf.model")
    corpus_tfidf = tfidf_model[corpus]
    # build the index (sharded on disk; shardsize caps docs per shard)
    index = Similarity("index/sim.index", corpus_tfidf,
                       num_features=len(dictionary), shardsize=327680)
else:
    print("Please run to generate tfidf data set")
    raise SystemExit(1)
#coding:utf-8
import os
from gensim import corpora, models
import logging
from gensim.similarities.docsim import Similarity

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

DICT_PATH = "dict/pat.dict"
CORPUS_PATH = "dict/pat.mm"

# Load the dictionary/corpus produced by the first tutorial. BUG FIX: the
# original only printed a warning when the files were missing and then fell
# through, crashing later with a NameError on `corpus`; exit explicitly.
if os.path.exists(DICT_PATH):
    dictionary = corpora.Dictionary.load(DICT_PATH)
    corpus = corpora.MmCorpus(CORPUS_PATH)
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")
    raise SystemExit(1)

if os.path.exists("model/pat_tfidf.model"):
    tfidf_model = models.TfidfModel.load("model/pat_tfidf.model")
    lda_model = models.LdaModel.load("model/pat_lda.model")
    # BUG FIX: the original mixed Python 2 `print "..."` statements with
    # print() calls — a SyntaxError on Python 3; use print() throughout.
    print("TF/IDF MODEL & LDA MODEL TO INDEX MODEL BEGIN TRAIN")
    corpus_tfidf = tfidf_model[corpus]
    corpus_lda = lda_model[corpus_tfidf]
    # build the index; num_features=10 must match the LDA topic count
    index = Similarity("index_lda/sim.index", corpus_lda,
                       num_features=10, shardsize=327680)
    print("TRAIN DONE")
else:
    print("Please run to generate LDA data set")
    raise SystemExit(1)
set([len(x) for x in doc_embeddings_text]) # ### Calculate Similarities # In[25]: from gensim.similarities.docsim import Similarity # In[26]: if CALCULATE_SIMILARITIES and CALCULATE_SIMILARITIES > 0: num_best = CALCULATE_SIMILARITIES + 1 if SPACY_FLAG: print('Calculating scispacy similarities index') spacy_index = Similarity(gs_index_tempfile, doc_embeddings_text, num_features=200, num_best=num_best) print('Reading specter embeddings') specter_df = pd.read_csv(specter_file, header=None, index_col=0) print('Calculating specter similarities index') specter_index = Similarity(gs_index_tempfile, specter_df.to_numpy(), num_features=specter_df.shape[1], num_best=num_best) # In[27]: if CALCULATE_SIMILARITIES and CALCULATE_SIMILARITIES > 0: if SPACY_FLAG: print('Calculating scispacy similarities') df['sims_scispacy_idx'] = [[
def gerar_modelo(self, modelo):
    """Train the selected model, save it, then build and save the
    similarity index for the transformed corpus.

    Parameters:
        modelo (str): model name — "tfidf", "tfidf_pivot", "lsi", "lda"
            or "doc2vec".

    Returns:
        None
    """
    # Bail out early on models that were not implemented.
    if modelo not in self._modelos:
        print(f'O modelo "{modelo}" não foi implementado.')
        return
    # (The original also computed arq_model/arq_index here, but both were
    # unused — the save paths come from self._arqs below.)
    # Train the requested model.
    if modelo == 'tfidf':
        corpus_train = self.corpus.corpus(tipo='bow')
        num_features = self.corpus.num_tokens
        model = TfidfModel(corpus=corpus_train,
                           id2word=self.corpus.dicionario())
    elif modelo == 'tfidf_pivot':
        # Pivoted document-length normalization (smartirs='nfu').
        corpus_train = self.corpus.corpus(tipo='bow')
        num_features = self.corpus.num_tokens
        model = TfidfModel(corpus=corpus_train,
                           id2word=self.corpus.dicionario(),
                           smartirs='nfu',
                           pivot=self.corpus.num_tokens / self.corpus.num_docs)
    elif modelo == 'lda':
        corpus_train = self.corpus.corpus(tipo='bow')
        num_features = self._modelos[modelo]['num_topics']
        model = LdaModel(corpus=corpus_train,
                         id2word=self.corpus.dicionario(),
                         num_topics=num_features)
    elif modelo == 'lsi':
        # LSI is trained on the TF-IDF-transformed corpus.
        corpus_train = self.corpus.corpus(tipo='tfidf')
        num_features = self._modelos[modelo]['num_topics']
        model = LsiModel(corpus=corpus_train,
                         id2word=self.corpus.dicionario(),
                         num_topics=num_features)
    elif modelo == 'doc2vec':
        corpus_train = self.corpus.corpus(tipo='tagged')
        num_features = self._modelos[modelo]['vector_size']
        # BUG FIX: `mp.cpu_count() / 2` yields a float in Python 3, which
        # breaks gensim's worker-thread setup; use integer division and
        # guarantee at least one worker.
        model = Doc2Vec(vector_size=num_features,
                        workers=max(1, mp.cpu_count() // 2),
                        alpha=self._modelos[modelo]['alpha'],
                        min_alpha=self._modelos[modelo]['min_alpha'])
        # Build the vocabulary, then train Doc2Vec.
        model.build_vocab(corpus_train)
        model.train(corpus_train,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)
    else:
        print(f'O modelo "{modelo}" não foi implementado.')
        return
    # Persist the trained model.
    model.save(self._arqs['modelos'][modelo])
    # Doc2Vec needs an adapter to iterate its document vectors; the other
    # models transform the training corpus directly.
    if modelo == 'doc2vec':
        corpus = Doc2VecCorpus(model)
    else:
        corpus = model[corpus_train]
    # Build the similarity index from the transformed corpus and save it.
    index = Similarity(output_prefix=self._arqs['indices'][modelo],
                       corpus=corpus,
                       num_features=num_features)
    index.save(self._arqs['indices'][modelo])
def testar_num_topics(
        self,
        modelo,
        num_topicos=[20, 50, 100, 200, 300, 400, 500, 1000, 1500],
        perc_fichas=0.2,
        vetor_testes=None,
        tipo_teste='similaridade'):
    """Test the coherence of topic models trained for a list of topic
    counts, to find the best number of topics for this corpus.

    Parameters:
        modelo (str): model to test — "lda", "lsi" or "doc2vec".
        num_topicos (list of int): topic counts to try
            (default: [20, 50, 100, 200, 300, 400, 500, 1000, 1500]).
        perc_fichas (float): fraction of the corpus records used for the
            test (default: 0.2).
        vetor_testes (list of tuple): pairs of records for the similarity
            test; ignored when tipo_teste is "u_mass" (default: None).
        tipo_teste (str): "u_mass" or "similaridade"
            (default: "similaridade").

    Returns:
        dict keyed by topic count; each value is a dict with keys
        "medida" (the score for that topic count) and "modelo" (the
        model trained for it).

    NOTE(review): num_topicos is a mutable default argument — safe only
    while no caller mutates it.
    """
    # Reject unsupported model / test combinations up front.
    if modelo not in ['lda', 'lsi', 'doc2vec']:
        print(
            f'O modelo {modelo} ou não é de tópico ou não foi implantado.')
        return
    if tipo_teste not in ['u_mass', 'similaridade']:
        print(f'O tipo de teste {tipo_teste} não foi implementado.')
        return
    if modelo == 'doc2vec' and tipo_teste == 'u_mass':
        print(
            'O teste de coerência com u_mass não pode ser usado para o modelo doc2vec.'
        )
        return
    # Initialise the test state.
    resultado = {}
    arq_index = os.path.join(self.corpus._pastas['indices'],
                             f'{self.corpus._link_nome}_testes.idx')
    # Records named in the test pairs must be kept in the corpus slice.
    if vetor_testes:
        flat = list(zip(*vetor_testes))
        fichas_incluir = set(flat[0])
        fichas_incluir.update(flat[1])
    else:
        fichas_incluir = None
    # Build the training corpus and the partial (sliced) corpus.
    if modelo == 'lsi':
        bow = self.corpus.corpus(tipo='bow')
        corpus_parcial = bow.fatiar(perc_fichas=perc_fichas,
                                    incluir=fichas_incluir)
        # Reuse a previously trained TF-IDF model when available.
        model_tfidf = self['tfidf'] or TfidfModel(
            corpus=corpus_parcial, id2word=self.corpus.dicionario())
        corpus_train = model_tfidf[corpus_parcial]
    elif modelo == 'lda':
        bow = self.corpus.corpus(tipo='bow')
        corpus_parcial = corpus_train = bow.fatiar(perc_fichas=perc_fichas,
                                                   incluir=fichas_incluir)
    elif modelo == 'doc2vec':
        corpus_tagged = self.corpus.corpus(tipo='tagged')
        corpus_parcial = corpus_train = corpus_tagged.fatiar(
            perc_fichas=perc_fichas, incluir=fichas_incluir)
    # Map the record ids present in the partial corpus.
    if fichas_incluir:
        ids_fichas = corpus_parcial.fichas()
    else:
        ids_fichas = list(range(len(corpus_parcial)))
    # Run the test for each requested topic count.
    for num in tqdm(num_topicos):
        print(f'Criando modelo "{modelo}" para num_topics={num}')
        # Train the requested model for this topic count.
        if modelo == 'lda':
            model = LdaModel(corpus=corpus_train,
                             id2word=self.corpus.dicionario(),
                             num_topics=num)
        elif modelo == 'lsi':
            model = LsiModel(corpus=corpus_train,
                             id2word=self.corpus.dicionario(),
                             num_topics=num)
        elif modelo == 'doc2vec':
            # NOTE(review): mp.cpu_count() / 2 is a float in Python 3 and
            # gensim expects an integer worker count — confirm whether
            # `//` was intended.
            model = Doc2Vec(vector_size=num,
                            workers=mp.cpu_count() / 2,
                            alpha=self._modelos[modelo]['alpha'],
                            min_alpha=self._modelos[modelo]['min_alpha'])
            # Build the vocabulary, then train Doc2Vec.
            model.build_vocab(corpus_train)
            model.train(corpus_train,
                        total_examples=model.corpus_count,
                        epochs=model.epochs)
        # Keep the model trained for this topic count.
        resultado[num] = {'modelo': model}
        # Coherence test (u_mass).
        if tipo_teste == 'u_mass':
            print(
                f'Calculando o score de coerência do modelo "{modelo}" para num_topics={num}'
            )
            cm = CoherenceModel(model=model,
                                corpus=corpus_train,
                                coherence='u_mass')
            resultado[num]['medida'] = cm.get_coherence()
            print(f'Score u_mass = {resultado[num]["medida"]}')
        # Similarity test.
        elif tipo_teste == 'similaridade':
            # Doc2Vec needs an adapter to iterate document vectors.
            if modelo == 'doc2vec':
                corpus = Doc2VecCorpus(model)
            else:
                corpus = model[corpus_train]
            print(
                f'Calculando o score de similaridade do modelo "{modelo}" para num_topics={num}'
            )
            index = Similarity(output_prefix=arq_index,
                               corpus=corpus,
                               num_features=num)
            # NOTE(review): vetor_testes=None combined with
            # tipo_teste='similaridade' raises TypeError here — callers
            # must supply the pairs.
            medidas = []
            for ficha_query, ficha_target in vetor_testes:
                id_query = self.corpus.ficha2id(ficha_query)
                query = ids_fichas.index(id_query)
                id_target = self.corpus.ficha2id(ficha_target)
                target = ids_fichas.index(id_target)
                # Score = reciprocal rank of the target among the
                # query's neighbours.
                posicao, _ = self._obter_posicao_target(
                    index, query, target)
                medidas.append(1 / posicao)
            valores = pd.Series(medidas)
            resultado[num]['medida'] = valores.median()
            print(f'Score similaridade = {resultado[num]["medida"]}')
    return resultado
def rekomendasi(input):
    """Recommend up to 10 article ids similar to the *input* text,
    restricted to the LDA topic the input is assigned to.

    Returns a tuple (list_id_artikel, topic) where topic is 1-based.

    NOTE(review): the parameter name `input` shadows the builtin.
    """
    data = [input]
    # Load the pre-built artifacts: dictionary, corpus, metadata and the
    # pickled lemmatised documents.
    id2word = Dictionary.load('pdupt_website/id2word_new.dict')
    corpus = MmCorpus('pdupt_website/corpus_new.mm')  # NOTE(review): loaded but never used below
    df = pd.read_csv('pdupt_website/reduksifix.csv')
    with open("pdupt_website/lemma_new.txt", "rb") as fp:  # Pickling
        data_lemmatized = pickle.load(fp)
    # Indonesian + English stop words, plus domain-specific extras.
    stop_words = stopwords.words('indonesian')
    stop_words2 = stopwords.words('english')
    stop_words.extend(stop_words2)
    stop_words.extend([
        'of', 'in', 'and', 'the', 'for', 'on', 'using', 'based', 'from',
        'with', 'to', 'by', 'as', 'an', 'pengaruh', 'effect', 'analisis',
        'at', 'pre', 'pro', 'analysis', 'berbasis', 'tahun', 'between',
        'kualitas', 'method', 'metode', 'through', 'menggunakan', 'hasil'
    ])
    # Remove Numbers
    data = [re.sub(" \d+", ' ', sent) for sent in data]
    data = [re.sub('[^a-zA-Z]', ' ', sent) for sent in data]
    # Remove new line characters
    data = [re.sub('\s+', ' ', sent) for sent in data]
    # Remove distracting single quotes
    data = [re.sub("\'", "", sent) for sent in data]

    def sent_to_words(sentences):
        # Tokenise each sentence with gensim's simple_preprocess.
        for sentence in sentences:
            yield (gensim.utils.simple_preprocess(str(sentence), deacc=True)
                   )  # deacc=True removes punctuations

    data = sent_to_words(data)
    data_words = list(data)
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(
        data_words, min_count=5,
        threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Define functions for stopwords, bigrams, trigrams and lemmatization
    # from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    def remove_stopwords(texts):
        # NOTE(review): `(stop_words or stop_words2)` always evaluates to
        # stop_words (it is non-empty); harmless only because stop_words
        # already contains stop_words2.
        return [[
            word for word in simple_preprocess(str(doc))
            if word not in (stop_words or stop_words2)
        ] for doc in texts]

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    def lemmatization(texts):
        """https://spacy.io/api/annotation"""
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc])
        return texts_out

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)
    # # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)
    nlp = spacy.load('en_core_web_sm')
    data_lemmatized_search = lemmatization(data_words_bigrams)
    # Stem each lemmatised token with Sastrawi (Indonesian stemmer).
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    # NOTE(review): range(len(...) - 1) skips the last document and the
    # last token of each document — looks like an off-by-one; confirm.
    for x in range(len(data_lemmatized_search) - 1):
        for y in range(len(data_lemmatized_search[x]) - 1):
            data_lemmatized_search[x][y] = stemmer.stem(
                data_lemmatized_search[x][y])
    # import gensim
    # Load the trained LDA model and score the query document's topics.
    model = gensim.models.ldamodel.LdaModel.load(
        'pdupt_website/mallet_18_lda.mdl', mmap='r')
    new_doc_bow = id2word.doc2bow(data_lemmatized_search[0])
    hasil = model.get_document_topics(new_doc_bow)
    # Pick the topic with the highest probability.
    topic = 0
    nilai = -99
    for i, row in (hasil):
        if (row > nilai):
            topic = i
            nilai = row
    # Keep only the articles assigned to that topic.
    df_topik = df.loc[df['Topic1'] == topic]
    df_topik = df_topik.astype({"id_judul": int})
    df_topik = df_topik.reset_index(drop=True)
    # Build lemma data, corpus and dictionary for this topic's documents
    # (id_judul appears to be 1-based, hence the -1 — TODO confirm).
    res_list = [data_lemmatized[int(i) - 1] for i in df_topik.id_judul]
    # Create Dictionary
    id2word_topik = corpora.Dictionary(res_list)
    # Create Corpus
    texts = res_list  # NOTE(review): unused — corpus is built from res_list
    # Term Document Frequency
    corpus_topik = [id2word_topik.doc2bow(text) for text in res_list]
    # Index the topic corpus for cosine-similarity lookups.
    index_tmpfile = get_tmpfile("index")
    index = Similarity(index_tmpfile,
                       corpus_topik,
                       num_features=len(id2word_topik))
    # Query = BoW of the stemmed search document in the topic dictionary.
    query = id2word_topik.doc2bow(data_lemmatized_search[0])
    similarities = index[query]
    # Rank documents by similarity, most similar first; keep the top 10.
    sort_index = np.argsort(similarities)
    sort_index
    reversed_arr = sort_index[::-1]
    reversed_arr
    list_idx = reversed_arr[:10]
    list_id_artikel = list(df_topik[df_topik.index.isin(list_idx)].id_judul)
    # Topic is reported 1-based to the caller.
    return (list_id_artikel, topic + 1)
# NOTE(review): script fragment — `handle`, `dictionary`, `preprocess` and
# the imports are defined outside this chunk.
data = pickle.load(handle)  # NOTE(review): pickle is unsafe on untrusted files
query = "small character good music bright background"
query_arr = preprocess(query)
dictionary.add_documents([query_arr])
bow_q = dictionary.doc2bow(query_arr)
# presumably `data` maps token -> set of review ids; intersect to keep only
# reviews containing every query token — TODO confirm
results: set = data[query_arr[0]]
for i in query_arr[1:]:
    results = results.intersection(data[i])
conn = sqlite3.connect("game_data copy 3.db")
c = conn.cursor()
# NOTE(review): string-built SQL; also breaks when `results` has exactly one
# element (renders as "IN (5,)") or is empty — prefer parameterised
# placeholders.
q = "SELECT r_id,text from reviews WHERE r_id IN " + str(tuple(results)) + ""
c.execute(q)
rows = c.fetchall()
# Vectorise each matching review.
bows = []
indices = []
for idx, review in rows:
    pre = preprocess(review)
    bow = dictionary.doc2bow(pre)
    bows.append(bow)
    indices.append(idx)
model = gensim.models.TfidfModel(bows)  # NOTE(review): built but never used
siml = Similarity(None, bows, num_features=len(dictionary))
result_siml = siml[bow_q]
# NOTE(review): ascending sort prints the LEAST similar reviews first —
# confirm whether reverse=True was intended.
ordered = sorted(range(len(result_siml)), key=lambda k: result_siml[k])
for i in ordered:
    print(indices[i])
def search(request):
    """Django view: find papers similar to the POSTed title within its LDA
    topic and render a paginated result page.

    GET requests re-paginate the results of the previous POST, which are
    kept in module-level globals (`user_list`, `catch`, `hasiltopik`).

    NOTE(review): module-level globals make this view unsafe under
    concurrent users and raise NameError on a GET before any POST.
    """
    if request.method == 'POST':
        global catch
        catch = request.POST['title']
        data = [catch]
        # Indonesian + English stop words, plus domain-specific extras.
        stop_words = stopwords.words('indonesian')
        stop_words2 = stopwords.words('english')
        stop_words.extend(stop_words2)
        stop_words.extend([
            'of', 'in', 'and', 'the', 'for', 'on', 'using', 'based', 'from',
            'with', 'to', 'by', 'as', 'an', 'pengaruh', 'effect', 'analisis',
            'at', 'pre', 'pro', 'analysis', 'berbasis', 'tahun', 'between',
            'kualitas', 'method', 'metode', 'through', 'menggunakan', 'hasil'
        ])
        # Remove Numbers
        data = [re.sub(" \d+", ' ', sent) for sent in data]
        data = [re.sub('[^a-zA-Z]', ' ', sent) for sent in data]
        # Remove new line characters
        data = [re.sub('\s+', ' ', sent) for sent in data]
        # Remove distracting single quotes
        data = [re.sub("\'", "", sent) for sent in data]

        def sent_to_words(sentences):
            # Tokenise each sentence with gensim's simple_preprocess.
            for sentence in sentences:
                yield (gensim.utils.simple_preprocess(str(sentence),
                                                      deacc=True)
                       )  # deacc=True removes punctuations

        coba = sent_to_words(data)
        data_words = list(coba)
        # Build the bigram and trigram models
        bigram = gensim.models.Phrases(
            data_words, min_count=5,
            threshold=100)  # higher threshold fewer phrases.
        trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
        # Faster way to get a sentence clubbed as a trigram/bigram
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram_mod = gensim.models.phrases.Phraser(trigram)

        # Define functions for stopwords, bigrams, trigrams and lemmatization
        # from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
        def remove_stopwords(texts):
            # NOTE(review): `(stop_words or stop_words2)` always evaluates
            # to stop_words (non-empty); harmless only because stop_words
            # already contains stop_words2.
            return [[
                word for word in simple_preprocess(str(doc))
                if word not in (stop_words or stop_words2)
            ] for doc in texts]

        def make_bigrams(texts):
            return [bigram_mod[doc] for doc in texts]

        def make_trigrams(texts):
            return [trigram_mod[bigram_mod[doc]] for doc in texts]

        def lemmatization(texts):
            """https://spacy.io/api/annotation"""
            texts_out = []
            for sent in texts:
                doc = nlp(" ".join(sent))
                texts_out.append([token.lemma_ for token in doc])
            return texts_out

        # Remove Stop Words
        data_words_nostops = remove_stopwords(data_words)
        # # Form Bigrams
        data_words_bigrams = make_bigrams(data_words_nostops)
        nlp = spacy.load('en_core_web_sm')
        data_lemmatized = lemmatization(data_words_bigrams)
        # Stem each lemmatised token with Sastrawi (Indonesian stemmer).
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        # NOTE(review): range(len(...) - 1) skips the last item at both
        # levels — looks like an off-by-one; confirm intent.
        for x in range(len(data_lemmatized) - 1):
            for y in range(len(data_lemmatized[x]) - 1):
                data_lemmatized[x][y] = stemmer.stem(data_lemmatized[x][y])
        id2wordd = corpora.Dictionary(data_lemmatized)
        # Create Corpus
        texts = data_lemmatized
        # Term Document Frequency
        corpuss = [id2wordd.doc2bow(text) for text in texts]
        id2word = Dictionary.load('papers/id2word_new.dict')
        corpus = MmCorpus('papers/corpus_new.mm')  # NOTE(review): unused below
        # import gensim
        # Load the trained LDA model and score the query's topics.
        model = gensim.models.ldamodel.LdaModel.load(
            'papers/mallet_18_lda.mdl', mmap='r')
        new_doc_bow = id2word.doc2bow(data_lemmatized[0])
        hasil = model.get_document_topics(new_doc_bow)
        # Pick the highest-probability topic for the query document.
        topic = 0
        nilai = -99
        for i, row in (hasil):
            if (row > nilai):
                topic = i
                nilai = row
        keywords = []  # NOTE(review): collected but never used below
        for i, nilai in model.show_topic(topic):
            keywords.append(i)
        # topics = Topics.objects.filter(id_topic=topic).values_list('id_publication', flat=True)
        # load data
        df = pd.read_csv('papers/label18baru.csv')
        with open("papers/lemma_new.txt", "rb") as fp:  # Pickling
            data_lemmatizedd = pickle.load(fp)
        # init result holders (NOTE(review): never used below)
        hasil_cosine_keseluruhan = []
        hasil_cosine = []
        # Keep only the rows that belong to the detected topic.
        # topic=df
        topik = df.loc[df['Topic1'] == topic]
        # Build lemma data, corpus and dictionary for this topic's documents.
        res_list = [data_lemmatizedd[i] for i in topik.index]
        # Create Dictionary
        id2worddd = corpora.Dictionary(res_list)
        # Create Corpus
        texts = res_list
        # Term Document Frequency
        corpusss = [id2worddd.doc2bow(text) for text in res_list]
        # Cosine similarity of the query against every title in the topic.
        index_tmpfile = get_tmpfile("index")
        # NOTE(review): this Similarity index is immediately overwritten by
        # the MatrixSimilarity below — dead work; confirm which was intended.
        index = Similarity(index_tmpfile,
                           corpusss,
                           num_features=len(id2worddd))
        index = MatrixSimilarity(corpusss, num_features=len(id2worddd))
        sims = index[corpuss]
        # Rank by similarity, most similar first; keep the top 40.
        sort_index = np.argsort(sims[0])
        reversed_arr = sort_index[::-1]
        hasil = pd.DataFrame(reversed_arr)
        hasilbaru = hasil.iloc[:40, :]
        hasilmantep = hasilbaru.to_numpy()
        idfix = []
        for i in range(0, 40):
            idfix.append(hasilmantep[i][0])
        ngetest = topik.to_numpy()
        id_artikel = []
        for i in idfix:
            # presumably column 9 holds the publication id — TODO confirm
            id_artikel.append(ngetest[i][9])
        global user_list
        user_list = Papers.objects.filter(
            id_pub__in=id_artikel).order_by('id_pub')
        # Human-readable topic labels keyed by 0-based topic index.
        topic_dict = {
            '0': 'Kimia',
            '1': 'Industri',
            '2': 'Biologi-Tumbuhan',
            '3': 'Biologi-Pangan',
            '4': 'Mikrobiologi',
            '5': 'Studi-Penemuan',
            '6': 'Sosial-Masyarakat-Sejarah',
            '7': 'Habitat Makhluk Hidup',
            '8': 'Elektro-Mesin',
            '9': 'Pendidikan',
            '10': 'Sosial-Pengaruh',
            '11': 'Pertanian',
            '12': 'Data-Citra-Statistik',
            '13': 'Jawa-Indonesia',
            '14': 'Masyarakat',
            '15': 'Biokimia',
            '16': 'Kesehatan',
            '17': 'Kesehatan 2',
        }
        global hasiltopik
        hasiltopik = topic_dict.get(str(topic))
        # Paginate, 10 results per page.
        page = request.GET.get('page', 1)
        paginator = Paginator(user_list, 10)
        try:
            users = paginator.page(page)
        except PageNotAnInteger:
            users = paginator.page(1)
        except EmptyPage:
            users = paginator.page(paginator.num_pages)
        context = {
            'title': 'Halaman Utama',
            'topic': hasiltopik,
            'catch': catch,
            'users': users,
        }
        return render(request, 'papers/index.html', context)
    else:
        # GET: re-paginate the globals populated by the last POST.
        page = request.GET.get('page', 1)
        paginator = Paginator(user_list, 10)
        try:
            users = paginator.page(page)
        except PageNotAnInteger:
            users = paginator.page(1)
        except EmptyPage:
            users = paginator.page(paginator.num_pages)
        context = {
            'title': 'Halaman Utama',
            'topic': hasiltopik,
            'catch': catch,
            'users': users,
        }
        return render(request, 'papers/index.html', context)
def setSimilar(self,
               simi_name=config.SimilarlySentencePath +
               "simi_index/Similarity-index",
               corpus=None):
    """Build a gensim Similarity index for *corpus* (sharded at
    *simi_name*) and store it on the instance as ``self.similar``."""
    sim_index = Similarity(simi_name, corpus, self.num_feature)
    self.similar = sim_index