Example #1
import os  # needed for os.path.isfile; `norm` is the project's normalization module

def extraeUNI(archivo, sw=False):
    # Returns unigrams sorted by descending frequency; `archivo` may be a
    # list of tokenized posts or a path to a corpus file.
    contestacion = []
    if isinstance(archivo, list):
        for publicacion in archivo:
            for elemento in publicacion:
                contestacion.append(elemento)
    elif os.path.isfile(archivo):
        with open(archivo, 'r') as corpus:
            contestacion = corpus.readlines()
    else:
        print("Error in extraeUNI input: neither a file nor a list")
    tokens = []
    vocabulario = {}
    for linea in contestacion:
        # Optionally remove function words (palabras funcionales) before tokenizing.
        if sw:
            tokens.append(norm.tokenize_text(norm.remove_palabrasfuncionales(norm.remove_special_characters(linea.lower()))))
        else:
            tokens.append(norm.tokenize_text(norm.remove_special_characters(linea.lower())))
    # Count unigram frequencies.
    for t in tokens:
        for o in t:
            vocabulario[o] = vocabulario.get(o, 0) + 1
    # Sort as (frequency, unigram) pairs, most frequent first.
    items = [(v, k) for k, v in vocabulario.items()]
    items.sort(reverse=True)
    return items
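A minimal usage sketch; the file name is hypothetical and `norm` is assumed to be the project's normalization module:

# Hypothetical call: unigram frequencies from a corpus file, dropping function words.
unigramas = extraeUNI('corpus.txt', sw=True)
for frecuencia, palabra in unigramas[:10]:
    print(palabra, frecuencia)  # the ten most frequent unigrams

# The same function also accepts a list of already tokenized posts.
unigramas = extraeUNI([['hola', 'mundo'], ['hola', 'otra', 'vez']])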
Example #2
def extraeN(archivo, n, sw=False):
    # Like extraeUNI, but counts n-grams of order n. `archivo` may be a list
    # of tokenized posts or a path to a corpus file.
    contestacion = []
    espacio = ' '
    if isinstance(archivo, list):
        for publicacion in archivo:
            contestacion.append(espacio.join(publicacion))
    elif os.path.isfile(archivo):
        with open(archivo, 'r') as corpus:
            contestacion = corpus.readlines()
    else:
        print("Error in extraeN input: neither a file nor a list")
    tokens = []
    ngramas = []
    vocabulario = {}
    for linea in contestacion:
        # Optionally remove function words (palabras funcionales) before tokenizing.
        if sw:
            tokens.append(norm.tokenize_text(norm.remove_palabrasfuncionales(norm.remove_special_characters(linea.lower()))))
        else:
            tokens.append(norm.tokenize_text(norm.remove_special_characters(linea.lower())))
    for t in tokens:
        ngramas.append(ngrams(t, n))  # ngrams() is presumably nltk's helper
    # Count n-gram frequencies.
    for g in ngramas:
        for r in g:
            vocabulario[r] = vocabulario.get(r, 0) + 1
    # Sort as (frequency, n-gram) pairs, most frequent first.
    items = [(v, k) for k, v in vocabulario.items()]
    items.sort(reverse=True)
    return items
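A usage sketch for bigrams; the file name is hypothetical, and `ngrams` is assumed here to be nltk's n-gram helper:

from nltk.util import ngrams  # n-gram helper used inside extraeN (assumption)

# Hypothetical call: the ten most frequent bigrams in the corpus file.
bigramas = extraeN('corpus.txt', 2, sw=True)
for frecuencia, bigrama in bigramas[:10]:
    print(bigrama, frecuencia)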
Example #3
def normalize_documents(documents):
    normalized_doc_list = []
    for doc in documents:
        # convert to ascii
        doc1 = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore')

        # remove tags and escape characters
        doc2 = BeautifulSoup(doc1, 'html.parser').get_text()

        # expand contractions
        doc3 = expand_contractions(doc2, CONTRACTION_MAP)

        # lemmatize the text
        doc4 = lemmatize_text(doc3)

        # remove special characters
        doc5 = remove_special_characters(doc4)

        # keep text characters
        doc6 = keep_text_characters(doc5)

        # remove stopwords
        doc7 = remove_stopwords(doc6)
        normalized_doc_list.append(doc7)
    return normalized_doc_list
Example #4
def normalize_documents(doc_list):
    normalize_doc_list = []
    for doc in doc_list:
        doc1 = unicodedata.normalize('NFKD',doc).encode('ascii', 'ignore')
        doc2 = BeautifulSoup(doc1, 'html.parser').get_text()
        doc3 = expand_contractions(doc2, CONTRACTION_MAP)
        doc4 = lemmatize_text(doc3)
        doc5 = remove_special_characters(doc4)
        doc6 = keep_text_characters(doc5)
        doc7 = remove_stopwords(doc6)
        normalize_doc_list.append(doc7)
    return normalize_doc_list
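Examples #3 and #4 are essentially the same pipeline. A minimal usage sketch, assuming `unicodedata`, `BeautifulSoup`, and the project helpers (`expand_contractions` with `CONTRACTION_MAP`, `lemmatize_text`, `remove_special_characters`, `keep_text_characters`, `remove_stopwords`) are importable; the sample documents are made up:

# Hypothetical input: raw documents with HTML tags, contractions and stopwords.
docs = [
    "<p>He isn't going to the meetings anymore.</p>",
    "Cafés &amp; restaurants were closed, weren't they?",
]
clean_docs = normalize_documents(docs)
print(clean_docs)  # lemmatized text without tags, contractions or stopwords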
Example #5
def getCleanBagofWords(archivos):
    # Reads the <TEXT> elements of one or more XML files, cleans the text and
    # returns (number of posts, list of tokens).
    palabras = ''
    no_post = 0
    if isinstance(archivos, list):
        for xml in archivos:
            tree = ET.parse(xml)
            root_element = tree.getroot()
            for texto in root_element.iter('TEXT'):
                # Collect the posts
                palabras = palabras + texto.text
                no_post = no_post + 1
    elif isinstance(archivos, str):
        tree = ET.parse(archivos)
        root_element = tree.getroot()
        for texto in root_element.iter('TEXT'):
            # Collect the posts
            palabras = palabras + texto.text
            no_post = no_post + 1
    palabras = tag_lines(palabras)
    palabras = normalization.remove_special_characters(palabras)
    palabras = normalization.remove_stopwords(palabras)
    tokens = normalization.tokenize_text(palabras)
    return (no_post, tokens)
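A usage sketch with hypothetical XML file names; `tag_lines` and the `normalization` module are assumed to be supplied by the surrounding project:

import xml.etree.ElementTree as ET  # parser used inside getCleanBagofWords

# Hypothetical call over two XML files whose <TEXT> elements hold the posts.
no_post, tokens = getCleanBagofWords(['subject1.xml', 'subject2.xml'])
print('Posts read:', no_post)
print('First tokens:', tokens[:10])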
Example #6
# Register the 'dp' (depression) class, load its posts, and build the
# IDLV vocabulary for the current chunk.
dic.initialize_class_types('dp')
IDLVdic = {}

dic.appendPost('dpp', 'dp')
dic.appendPost('dpn', 'dp')
chunk.loadVocabulary()
chunk.calcIDLV()
print('Number of instances in depression:', len(dic.types['dp']['rows']))

dic.types['dp']['cols'] = dic.fillOnesZeros('dp')
print(len(dic.types['dp']['cols']))

# Corpus acquisition >>>>>>>> END
# Corpus normalization >>>>>>>>>> START
norm_train_corpus = [
    norm.remove_special_characters(text) for text in dic.types['dp']['rows']
]
train_corpus = [norm.remove_stopwords(text) for text in norm_train_corpus]
# Corpus normalization >>>>>>>>>> END

# Rank the vocabulary by IDLV score, highest first, and keep a copy by term.
tfs = [(v.IDLV(0), k) for k, v in chunk.vocablos.items()]
tfs.sort()
tfs.reverse()
for k, v in chunk.vocablos.items():
    IDLVdic[k] = v

from feature_extractor import bow_extractor, tfidf_transformer, tfidf_extractor
import nltk
import gensim
import numpy as np
# BOW features
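Example #6 breaks off at the "# BOW features" marker. A plausible continuation, modeled directly on Example #8 below (same `bow_extractor` call from the project's `feature_extractor` module):

# Continuation sketch modeled on Example #8: bag-of-words features over the
# normalized depression corpus.
bow_vectorizer, bow_train_features = bow_extractor(train_corpus, (1, 1))
feature_names = bow_vectorizer.get_feature_names()
print('Number of features considered:', len(feature_names))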
Example #7
import pandas as pd  # needed for pd.read_csv below

book_data = pd.read_csv('data/data.csv', encoding='utf8')  # read the data file
print(book_data.head())
book_titles = book_data['title'].tolist()
book_content = book_data['content']
book_content = book_content.astype(str).tolist()
print('Title:', book_titles[0])
print('Content:', book_content[0][:10])

from normalization import normalize_corpus, remove_stopwords, remove_special_characters

norm_book_content = normalize_corpus(book_content)
print(norm_book_content)

norm_book_content1 = []
for doc in norm_book_content:  # avoid shadowing the usual `norm` module name
    text = remove_special_characters(remove_stopwords(doc))
    norm_book_content1.append(text)
print(norm_book_content1)

# Extract tf-idf features (build_feature_matrix is assumed to come from the
# project's feature-extraction utilities).
vectorizer, feature_matrix = build_feature_matrix(norm_book_content1,
                                                  feature_type='tfidf',
                                                  ngram_range=(1, 2),
                                                  min_df=0.2,
                                                  max_df=0.90)
# Check the number of features
print(feature_matrix.shape)

# Get the feature names (on scikit-learn >= 1.0, prefer get_feature_names_out())
feature_names = vectorizer.get_feature_names()
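To sanity-check the extraction, one can inspect a few of the resulting n-gram names and the matrix dimensions (the exact output depends on the data):

# Inspect a handful of the extracted 1-gram / 2-gram features.
print(feature_names[:20])
print('tf-idf matrix:', feature_matrix.shape[0], 'documents x',
      feature_matrix.shape[1], 'features')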
Example #8
print('Number of instances in depression: ', len(dic.types['ax']['rows']))
print('Vocabulary in the chunk: ', len(chunk.vocablos.keys()))

dic.types['ax']['cols'] = dic.fillOnesZeros('ax')
print(len(dic.types['ax']['cols']))
tfs = [(v.IDLV(0), k) for k, v in chunk.vocablos.items()]
tfs.sort()
tfs.reverse()
# Keep the 1,000 highest-scoring terms and hand them to the normalizer.
for i in range(0, 1000):
    IDLVdic[tfs[i][1]] = tfs[i][0]

norm.thousand_words = IDLVdic.keys()
# Corpus acquisition >>>>>>>> END
# Corpus normalization >>>>>>>>>> START
corpus = [
    norm.remove_special_characters(text) for text in dic.types['ax']['rows']
]
train_corpus = [norm.remove_stopwords(text) for text in corpus]
thousand_corpus = [norm.just_thousand(text) for text in train_corpus]

# Corpus normalization >>>>>>>>>> END

from feature_extractor import bow_extractor, tfidf_transformer, tfidf_extractor
import nltk
import gensim
import pandas as pd
import numpy as np
# BOW features
bow_vectorizer, bow_train_features = bow_extractor(thousand_corpus, (1, 1))
feature_names = bow_vectorizer.get_feature_names()
print('Number of features taken into account:', len(feature_names))