def extraeUNI(archivo, sw=False):
    # Collect the raw lines either from a list of tokenized posts or from a file on disk.
    contestacion = []
    if type(archivo) is list:
        for publicacion in archivo:
            for elemento in publicacion:
                contestacion.append(elemento)
    elif os.path.isfile(archivo):
        corpus = open(archivo, 'r')
        contestacion = corpus.readlines()
        corpus.close()
    else:
        print("Error in extraeUNI input: neither a file nor a list")

    # Tokenize each line, optionally removing function words (sw=True).
    tokens = []
    vocabulario = {}
    for i in range(0, len(contestacion)):
        if sw:
            tokens.append(norm.tokenize_text(
                norm.remove_palabrasfuncionales(
                    norm.remove_special_characters(contestacion[i].lower()))))
        else:
            tokens.append(norm.tokenize_text(
                norm.remove_special_characters(contestacion[i].lower())))

    # Count unigram frequencies and return them sorted from most to least frequent.
    for t in tokens:
        for o in t:
            if o in vocabulario:
                vocabulario[o] += 1
            else:
                vocabulario[o] = 1
    items = [(v, k) for k, v in vocabulario.items()]
    items.sort()
    items.reverse()
    return items
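# Usage sketch (hedged): extraeUNI assumes `os` is imported and that `norm` is the
# project's normalization module (whatever provides tokenize_text,
# remove_special_characters and remove_palabrasfuncionales). The module alias and
# the file path below are assumptions for illustration only.
import os
import normalization as norm  # assumed module name

unigramas = extraeUNI('corpus/posts.txt', sw=True)  # hypothetical path
for frecuencia, palabra in unigramas[:10]:
    print(palabra, frecuencia)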
def extraeN(archivo, n, sw=False):
    # Collect the raw text either from a list of tokenized posts or from a file on disk.
    contestacion = []
    espacio = ' '
    if type(archivo) is list:
        for publicacion in archivo:
            contestacion.append(espacio.join(publicacion))
    elif os.path.isfile(archivo):
        corpus = open(archivo, 'r')
        contestacion = corpus.readlines()
        corpus.close()
    else:
        print("Error in extraeN input: neither a file nor a list")

    # Tokenize each line, optionally removing function words, then build n-grams.
    tokens = []
    ngramas = []
    vocabulario = {}
    for i in range(0, len(contestacion)):
        if sw:
            tokens.append(norm.tokenize_text(
                norm.remove_palabrasfuncionales(
                    norm.remove_special_characters(contestacion[i].lower()))))
        else:
            tokens.append(norm.tokenize_text(
                norm.remove_special_characters(contestacion[i].lower())))
    for t in tokens:
        ngramas.append(ngrams(t, n))

    # Count n-gram frequencies and return them sorted from most to least frequent.
    for g in ngramas:
        for r in g:
            if r in vocabulario:
                vocabulario[r] += 1
            else:
                vocabulario[r] = 1
    items = [(v, k) for k, v in vocabulario.items()]
    items.sort()
    items.reverse()
    return items
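# Usage sketch (hedged): extraeN relies on an `ngrams` helper; here it is assumed
# to be nltk.util.ngrams, and the input is a list of already tokenized posts.
from nltk.util import ngrams

posts = [['me', 'siento', 'muy', 'cansado'], ['no', 'puedo', 'dormir']]
bigramas = extraeN(posts, 2)
print(bigramas[:5])  # (count, bigram) pairs sorted by frequency, e.g. (1, ('me', 'siento'))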
def normalize_documents(documents):
    normalized_doc_list = []
    for doc in documents:
        # convert to ascii
        doc1 = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore')
        # remove tags and escape characters
        doc2 = BeautifulSoup(doc1, 'html.parser').get_text()
        # expand contractions
        doc3 = expand_contractions(doc2, CONTRACTION_MAP)
        # lemmatize text
        doc4 = lemmatize_text(doc3)
        # remove special characters
        doc5 = remove_special_characters(doc4)
        # keep text characters
        doc6 = keep_text_characters(doc5)
        # remove stopwords
        doc7 = remove_stopwords(doc6)
        normalized_doc_list.append(doc7)
    return normalized_doc_list
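# Usage sketch (hedged): assumes the helpers used above (expand_contractions,
# lemmatize_text, remove_special_characters, keep_text_characters,
# remove_stopwords, CONTRACTION_MAP) are defined in this normalization module,
# together with the two imports below.
import unicodedata
from bs4 import BeautifulSoup

docs = ["I can't believe it's <b>already</b> summer!"]
print(normalize_documents(docs))  # prints the cleaned version of each document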
def getCleanBagofWords(archivos):
    palabras = ''
    no_post = 0
    if isinstance(archivos, list):
        for xml in archivos:
            tree = ET.parse(xml)
            root_element = tree.getroot()
            for texto in root_element.iter('TEXT'):  # get the posts
                palabras = palabras + texto.text
                no_post = no_post + 1
    elif isinstance(archivos, str):
        tree = ET.parse(archivos)
        root_element = tree.getroot()
        for texto in root_element.iter('TEXT'):  # get the posts
            palabras = palabras + texto.text
            no_post = no_post + 1

    # Clean the concatenated posts and tokenize them.
    palabras = tag_lines(palabras)
    palabras = normalization.remove_special_characters(palabras)
    palabras = normalization.remove_stopwords(palabras)
    tokens = normalization.tokenize_text(palabras)
    return (no_post, tokens)
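# Usage sketch (hedged): assumes XML files containing <TEXT> elements,
# xml.etree.ElementTree imported as ET, the project's `normalization` module,
# and a `tag_lines` helper defined elsewhere in the project. The file names
# below are hypothetical.
import xml.etree.ElementTree as ET
import normalization

n_posts, tokens = getCleanBagofWords(['subject1.xml', 'subject2.xml'])
print(n_posts, tokens[:10])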
# Corpus acquisition: register the 'dp' class, load its posts, and compute the chunk's IDLV values.
dic.initialize_class_types('dp')
IDLVdic = {}
dic.appendPost('dpp', 'dp')
dic.appendPost('dpn', 'dp')
chunk.loadVocabulary()
chunk.calcIDLV()
print('Number of depression instances', len(dic.types['dp']['rows']))
dic.types['dp']['cols'] = dic.fillOnesZeros('dp')
print(len(dic.types['dp']['cols']))
# Corpus acquisition >>>>>>>> END

# Corpus normalization >>>>>>>>>> START
norm_train_corpus = [norm.remove_special_characters(text) for text in dic.types['dp']['rows']]
train_corpus = [norm.remove_stopwords(text) for text in norm_train_corpus]
# Corpus normalization >>>>>>>>>> END

# Rank the vocabulary by IDLV score and keep each term's entry in IDLVdic.
tfs = [(v.IDLV(0), k) for k, v in chunk.vocablos.items()]
tfs.sort()
tfs.reverse()
for k, v in chunk.vocablos.items():
    IDLVdic[k] = v

from feature_extractor import bow_extractor, tfidf_transformer, tfidf_extractor
import nltk
import gensim
import numpy as np

# BOW features
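# Hedged sketch of the BOW step this snippet leads into, mirroring the pattern
# used in the 'ax' script below: bow_extractor is assumed to return a
# (vectorizer, feature_matrix) pair for the given ngram range.
bow_vectorizer, bow_train_features = bow_extractor(train_corpus, (1, 1))
print('Number of BOW features:', len(bow_vectorizer.get_feature_names()))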
import pandas as pd

book_data = pd.read_csv('data/data.csv', encoding='utf8')  # read the file
print(book_data.head())
book_titles = book_data['title'].tolist()
book_content = book_data['content']
book_content = book_content.astype(str).tolist()
print('Title:', book_titles[0])
print('Content:', book_content[0][:10])

from normalization import normalize_corpus, remove_stopwords, remove_special_characters
norm_book_content = normalize_corpus(book_content)
print(norm_book_content)

norm_book_content1 = []
for doc in norm_book_content:
    text = remove_special_characters(remove_stopwords(doc))
    norm_book_content1.append(text)
print(norm_book_content1)

# Extract tf-idf features
from feature_extractor import build_feature_matrix  # assumed location of this helper
vectorizer, feature_matrix = build_feature_matrix(norm_book_content1,
                                                  feature_type='tfidf',
                                                  ngram_range=(1, 2),
                                                  min_df=0.2, max_df=0.90)
# Check the number of features
print(feature_matrix.shape)
# Get the feature names
feature_names = vectorizer.get_feature_names()
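# Hedged follow-up: inspect the tf-idf weights of the first document. Assumes
# feature_matrix is a scipy sparse matrix, as scikit-learn vectorizers return.
import numpy as np

weights = np.asarray(feature_matrix[0].todense()).ravel()
top = weights.argsort()[::-1][:10]
for idx in top:
    print(feature_names[idx], round(weights[idx], 3))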
# Corpus acquisition: statistics for the 'ax' class and ranking of the chunk vocabulary.
print('Number of depression instances: ', len(dic.types['ax']['rows']))
print('Vocabulary in the chunk: ', len(chunk.vocablos.keys()))
dic.types['ax']['cols'] = dic.fillOnesZeros('ax')
print(len(dic.types['ax']['cols']))

# Keep the 1000 terms with the highest IDLV score.
tfs = [(v.IDLV(0), k) for k, v in chunk.vocablos.items()]
tfs.sort()
tfs.reverse()
for i in range(0, 1000):
    IDLVdic[tfs[i][1]] = tfs[i][0]
norm.thousand_words = IDLVdic.keys()
# Corpus acquisition >>>>>>>> END

# Corpus normalization >>>>>>>>>> START
corpus = [norm.remove_special_characters(text) for text in dic.types['ax']['rows']]
train_corpus = [norm.remove_stopwords(text) for text in corpus]
thousand_corpus = [norm.just_thousand(text) for text in train_corpus]
# Corpus normalization >>>>>>>>>> END

from feature_extractor import bow_extractor, tfidf_transformer, tfidf_extractor
import nltk
import gensim
import pandas as pd
import numpy as np

# BOW features
bow_vectorizer, bow_train_features = bow_extractor(thousand_corpus, (1, 1))
feature_names = bow_vectorizer.get_feature_names()
print('Number of features taken into account', len(feature_names))
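# Hedged sketch of a next step using the helpers imported above: tfidf_transformer
# is assumed to take the BOW count matrix and return a (transformer, tfidf_matrix)
# pair; this is an assumption about its signature, not confirmed by the snippet.
tfidf_trans, tfidf_train_features = tfidf_transformer(bow_train_features)
print('tf-idf matrix shape:', tfidf_train_features.shape)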