def ProcessCorpus(V, L):
    try:
        stem = SpanishStemmer()
        for l in stdin:
            l = l.split()
            if len(l) < 3: 
                stderr.write('Warning: Short line: "%s"\n' % ' '.join(l))
                continue
            tid = l[0]
            uid = l[1]
            lv = [0 for w in V]
            for w in l[2:]:
                w = stem.stem(w.decode('utf-8'))
                d = V.get(w, None)
                if d is None: 
                    #stderr.write('Warning: \"%s\" not in the lexicon\n' % w);
                    continue
                lv[d] += 1
            if sum(lv) == 0:
                stderr.write('Warning: %s with null vector. Label: %d\n' % (tid, L[l[0]]) )
            stdout.write('%d ' % L[l[0]])
            for i in range(len(lv)):
                stdout.write('%d:%d ' % (i+1, lv[i]))
            stdout.write('# %s %s\n' % (tid, uid))
        return 0
    except Exception as ex:
        stderr.write('Exception: %s\n' % repr(ex))
        return 1
Example #2
class Tokenizer(object):
    """
    Esta clase es la encargada de obtener las palabras de los
    documentos recuperados por el `Crawler`
    """

    def __init__(self, min_long=5):
        """
        Para inicializar un `Tokenizer` es necesario saber el tamaño mínimo de caracteres
        `min_long`. que constituyen una palabra válidad
        :param min_long: un entero. Por defecto igual a cinco (5)
        """
        self.stemmer = SpanishStemmer()
        self.min_long = min_long

    def obtener_palabras(self, contenido):
        """
        Este método devuelve una lista de palabras recuperadas del `contenido`
        :param contenido: una cadena con el contenido de texto del documento
        :return: una lista de cadenas de caracteres representando las palabras
        """
        # Realizo el stemming en todo el contenido. Eliminando acentos mayusculas y dejando las raices.
        cont_stemed = self.stemmer.stem(contenido)
        # Divido el texto por palabras eliminando las repetidas
        conjunto_palabras = set(re.split(r'\W+', cont_stemed))
        # Elimino Stopwords, palabras menores a min_long y retorno lista
        return [palabra for palabra in conjunto_palabras if
                palabra not in stopwords.words('spanish') and not len(palabra) < self.min_long]
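# Hedged usage sketch (an illustration, not part of the original snippet); it assumes
# the module imports re, nltk.corpus.stopwords and nltk.stem.snowball.SpanishStemmer,
# and that the NLTK Spanish stopword data is available.
tokenizer = Tokenizer(min_long=5)
palabras = tokenizer.obtener_palabras("Los documentos recuperados por el crawler")
print(palabras)  # the words kept after stemming, stopword and length filtering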
Example #3
class ConceptComparerSpanishStem(ConceptComparerBase):
    """
    Implementation of a concept comparer based on a stemmer for Spanish.

    Parameters
    ----------
    None.

    Notes
    -----
    This is a sub-class of
    :py:class:`~lingpy.meaning.concepts.ConceptComparerBase`. It uses a simple
    match of the stem of a given (Spanish) string against a given context
    (which is supposed to be a stemmed Spanish word).

    See also
    --------
    ConceptComparerBase
    ConceptGraph
    """

    def __init__(self):
        self.stemmer = SpanishStemmer(True)
        self.re_brackets = re.compile(" ?\([^)]\)")

    def compare_to_concept(self, element, concept):
        """Compares a given element to a concept.

        Parameters
        ----------
        element : str
            The string (for example a lexical item: head or translation) to
            compare to the concept.
        concept : str or object
            The concept to compare to.

        Returns
        -------
        match : bool
            True if element matches the given concept, False otherwise.

        Notes
        -----
        The `element` is supposed to be a Spanish word, the concept a stemmed
        entry of the Spanish Swadesh list.

        See also
        --------
        spanish_swadesh_list

        """
        element = self.re_brackets.sub("", element)
        element = element.strip()
        if not " " in element:
            stem = self.stemmer.stem(element)
            if stem == concept:
                return True
        return False
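# Hedged usage sketch (illustration only); assumes the lingpy/NLTK imports this
# snippet relies on (ConceptComparerBase, SpanishStemmer, re).
comparer = ConceptComparerSpanishStem()
concept = comparer.stemmer.stem("piedra")
print(comparer.compare_to_concept("piedra", concept))         # True: stems match
print(comparer.compare_to_concept("piedra grande", concept))  # False: multi-word element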
Example #4
class OOVclassifier(object):
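    # Classifies a token as Spanish (1), English/foreign (2) or out-of-vocabulary
    # variant (0), using enchant dictionaries and extra word lists, plus either
    # stem-based affix checks or TreeTagger lemmas (see classify() below).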
    def __init__(self, stem=False):
        dictionaries = dicts()
        path = '/home/alangb/TWPP'  # path to TreeTagger installation directory
        self.english_dict = enchant.Dict("en_EN")
        self.spanish_dict = enchant.Dict("es_ES")
        self.ND = dictionaries.norm
        self.SD = dictionaries.lemario
        self.PND = dictionaries.names
        self.stem = stem
        if stem:
            self.stemmer = SpanishStemmer()
        else:
            self.tagger = TreeTagger(TAGLANG='es', TAGDIR=path)

    def dictionary_lookup(self, word):
        result = (word in self.SD or word in self.PND
                  or word in self.ND.values())
        return result

    def affix_check(self, word):
        result = False
        if word.islower() or word.istitle():
            if self.stem:
                n = len(word)
                stem = self.stemmer.stem(word)
                # compare with first substring of length n of each word in SD
                for w in [x[:n] for x in self.SD if len(x) >= n]:
                    result = (word == w)
                    if result:
                        break
            else:
                lemma = make_tags(self.tagger.tag_text(word))[0].lemma
                result = self.dictionary_lookup(lemma)
        return result

    def check(self, word):
        result = self.spanish_dict.check(word)
        if not result:
            result = self.dictionary_lookup(word) or self.affix_check(word)
        return result

    def check_NoES(self, word):
        result = False
        if len(word) > 1:
            result = self.english_dict.check(word)
        return result

    def classify(self, word):
        if self.check(word):
            result = 1
        elif self.check_NoES(word):
            result = 2
        else:
            result = 0
        return result
Example #5
class SpanishStemmer(Normalizer):

    def __init__(self, next_normalizer=None):
        super(SpanishStemmer, self).__init__(next_normalizer)
        self._stemmer = NLTKSpanishStemmer()

    def _apply_normalizer(self, data):
        if isinstance(data, (list, tuple)):
            return [self._stemmer.stem(w) for w in data]
        return self._stemmer.stem(data)
Example #6
class StemmerProcessor(DocumentAtATimeCorpusProcessor):
    def __init__(self):
        super(StemmerProcessor, self).__init__()
        self.stemmer = SpanishStemmer()

    def process_document(self, document):
        processed_document = []
        for word in document:
            processed_document.append(self.stemmer.stem(word))
        return processed_document
Example #7
class StemmerProcessor(DocumentAtATimeCorpusProcessor):
    def __init__(self):
        super(StemmerProcessor, self).__init__()
        self.stemmer = SpanishStemmer()

    def process_document(self, document):
        processed_document = []
        for word in document:
            processed_document.append(self.stemmer.stem(word))
        return processed_document
Example #8
def find_top_N_words(lang_entries, top_N, lang):
    dictionary = Lang_Dictionary({}, lang)
    for player in lang_entries:
        for chat in player.c:
            language = {'eng': 0, 'spn': 0, 'other': 0, 'tot': 0}
            sentence = player.c[chat]
            newlist = player.c[chat].strip().split(' ')
            newlist = [x.strip("''") for x in newlist]
            for word in newlist:
                language['tot'] += 1
                if word.lower() not in Lang_dicts.lang_index:
                    language['other'] += 1
                else:
                    word = Lang_dicts.lang_index[word.lower()]
                    if word == "english":
                        language['eng'] += 1
                    elif word == "spanish":
                        language['spn'] += 1
                    else:
                        language['other'] += 1
            if language['other'] < 2 * (language['spn'] + language['eng']):
                print(sentence)
                if language['spn'] > language['eng']:
                    print("SPANISH")
                    stemmer = SpanishStemmer()
                else:
                    print("ENGLISH")
                    stemmer = EnglishStemmer()
            aslist = []
            aslist += sentence
            sentence = ""
            j = ''.join(aslist)
            words = j.split(' ')
            for line in words:
                line = str(line).replace('\'', '')
                line = line.replace('""', '')
                line = line.replace('"', '')
                if len(line) > 0:
                    if language["other"] < 2 * (language['spn'] + language["eng"]):
                        sentence += stemmer.stem(line.encode(sys.stdout.encoding, errors = 'replace')) + " "
                        print(sentence)
                    ##INEFFICIENT - looking through dictionary each time?
                    if line.lower() not in dictionary.d:
                        dictionary.d[line.lower()] = 0
                    dictionary.d[line.lower()] += 1

    ###wthCounts is a list of the word and its count
    wthCounts = []
    for(w,c) in dictionary.d.iteritems():
        wthCounts += [(c,w)]
    ##wc is the wthCounts list only sorted
    wc = sorted(wthCounts, reverse=True)
    return wc[:top_N]
Example #9
def build_paragraph_inv_index(paragraphs, stem):
    p_index = {}
    stemmer = SpanishStemmer()
    for i, paragraph in enumerate(paragraphs):
        words = [word for word in paragraph.split() if word not in STOP_WORDS]
        for word in words:
            if stem:
                word = stemmer.stem(word)
            if word not in p_index:
                p_index[word] = []
            p_index[word].append(i)
    return p_index
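# Hedged usage sketch (illustration only); assumes SpanishStemmer is imported and
# uses a hypothetical stand-in for the module's STOP_WORDS collection.
STOP_WORDS = {'de', 'la', 'el'}  # hypothetical stopword list for the example
parrafos = ["robo de la casa", "sentencia sobre el robo"]
print(build_paragraph_inv_index(parrafos, stem=True))
# maps each stemmed non-stopword to the indices of the paragraphs containing it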
Example #10
def build_index_from_words(words, stem):
    '''
    Takes:
    - words, a list of strings
    - stem, a boolean; when True each word is stemmed before counting

    Returns:
    - index, a dictionary with a count of the times each word appears
      in the document
    '''
    index = {}
    stemmer = SpanishStemmer()
    for word in words:
        if word not in STOP_WORDS:
            if stem:
                word = stemmer.stem(word)
            if word not in index:
                index[word] = 0
            index[word] += 1
    return index
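# Hedged usage sketch (illustration only); assumes SpanishStemmer is imported and
# STOP_WORDS is this module's stopword collection (a hypothetical one is used here).
STOP_WORDS = {'el', 'de', 'las'}  # hypothetical stand-in
palabras = "el robo de las casas robadas".split()
print(build_index_from_words(palabras, stem=True))
# a dict counting each stemmed, non-stopword token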
def generate_stopwords(stopname='stopSpanish.pkl'):
    """ Remove stop words, and apply stemming """
    stemmer=SpanishStemmer()
    stopwords_es = set(stopwords.words('spanish'))
    stopwords_es_sw = set(get_stop_words('spanish'))
    stopSpanishBeta = list(set(stopwords_es.union(stopwords_es_sw)))

    stopSpanish = set(stopwords_es.union(stopwords_es_sw))
    for stopWord in stopSpanishBeta:
        stopSpanish.add(stemmer.stem(stopWord))

    stopSpanish = list(stopSpanish)
    stopSpanish.extend(['tra', 'd', 'desc']) # Adding stopwords not present in the standard stopwords
    stopSpanish.remove('no')  # Keep to help identify negative categories

    with open(f'{resource_path}/{stopname}', 'wb') as f:
        pickle.dump(stopSpanish, f)

    return stopSpanish
Example #12
def getfeats(fields, o):
    """ This takes the word in question and
    the offset with respect to the instance
    word """
    word = fields[0]
    stemmer = SpanishStemmer()

    with_hyphen = 0
    if "-" in word:
        with_hyphen = 1

    with_apostrophe = 0
    if "'" in word:
        with_apostrophe = 1

    o = str(o)
    features = [
        (o + "word", word),
        (o + 'pos', fields[1]),
        #(o + 'prefix1', word[:1]),
        (o + 'prefix2', word[:2]),
        (o + 'prefix3', word[:3]),
        (o + 'prefix4', word[:4]),
        #(o + 'suffix1', word[-1:]),
        (o + 'suffix2', word[-2:]),
        (o + 'suffix3', word[-3:]),
        (o + 'suffix4', word[-4:]),
        (o + 'is_upper', word.isupper()),
        (o + 'is_title', word.istitle()),
        (o + 'is_digit', word.isdigit()),
        (o + 'with_hyphen', with_hyphen),
        (o + 'with_apostrophe', with_apostrophe),
        (o + 'spanish_stem', stemmer.stem(word)),
        # (o + 'word_shape', word_shape(word))
    ]

    return features
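# Hedged usage sketch (illustration only); `fields` follows the (token, POS) layout
# this function expects, and SpanishStemmer must be importable.
for name, value in getfeats(["corriendo", "VERB"], 0):
    print(name, value)
# emits word/POS, prefix/suffix, shape and stem features keyed by the offset "0"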
Example #13
def spanish_swadesh_list(stemmed=True):
    """
    Helper function that returns a list of strings with the stems of the
    spanish Swadesh entries.

    """
    try:
        stemmer = SpanishStemmer(True)
    except Exception:
        log.warn("Spanish stemmer could not be loaded!")
        return

    swadesh_entries = []
    for line in util.read_text_file(
            util.data_path('swadesh', 'swadesh_spa.txt'), lines=True):
        line = line.strip()
        for e in line.split(","):
            e = e.strip()
            if stemmed:
                stem = stemmer.stem(e)
                swadesh_entries.append(stem)
            else:
                swadesh_entries.append(e)
    return swadesh_entries
Example #14
class WTAclassifier(object):
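    # Classifies a token into SPANISH_CLASS (a correct Spanish word), FOREIGN_CLASS
    # (found in the English dictionary) or VARIANT_CLASS (a variant to correct),
    # combining enchant dictionaries with optional lemmatization and stemming.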
    def __init__(self, lemma=False, stem=False):
        self.extra_dicts = Dicts()
        self.english_dict = enchant.Dict("en_EN")
        self.spanish_dict = enchant.Dict("es_AR")
        self.lemma = lemma
        self.stem = stem
        self.VARIANT_CLASS = 0
        self.SPANISH_CLASS = 1
        self.FOREIGN_CLASS = 2
        if lemma:
            self.lemmatizer = Lemmatizer()
        if stem:
            self.stemmer = SpanishStemmer()

    def check(self, word):
        result = self.spanish_dict.check(word)
        if not result:
            result = self.extra_dicts.is_valid(word)
        if self.lemma and not result:
            lemma = self.lemmatizer.lemmatize(word)
            result = self.extra_dicts.is_valid(lemma)
        if self.stem and not result:
            result = self.affix_check(word)
        return result

    def affix_check(self, word):
        result = False
        stem = self.stemmer.stem(word.lower())
        n = len(stem)
        # compare with first substring of length n of each word in lemario
        if stem[0] in self.extra_dicts.lemario.keys():
            for x in self.extra_dicts.lemario[stem[0]]:
                if len(x) >= n and x[:n] == stem:
                    return True
        return result

    def check_NoES(self, word):
        result = False
        if len(word) > 1:
            # Tokens like 'x' or 'q' appear in english dict and return True
            result = self.english_dict.check(word)
        return result

    def classify(self, word):
        if self.check(word):
            result = self.SPANISH_CLASS  # Correct word in spanish
        elif self.check_NoES(word):
            result = self.FOREIGN_CLASS  # Correct word in another language
        else:
            result = self.VARIANT_CLASS  # Variant word (to correct)
        return result

    def is_variant(self, class_number):
        return class_number == self.VARIANT_CLASS

    def is_correct(self, class_number):
        return class_number == self.SPANISH_CLASS

    def is_not_spanish(self, class_number):
        return class_number == self.FOREIGN_CLASS
Example #15
def steming(text: Text) -> List[Text]:
    stem = SpanishStemmer()
    ltext = [stem.stem(w) for w in cleanText(text)]
    return ltext
Example #16
    def get_vector_matrix(self, freq_floor=50, context_words=3):

        nlp = es_core_web_md.load()
        STOPWORDS = spacy.es.STOP_WORDS

        def _clean_sent(sent):
            clean_sent = []
            # remove stopwords
            for word in sent:
                word = word.lower()
                if not word in STOPWORDS:
                    if not word.isdigit():
                        clean_sent.append(word)
            return clean_sent

        def _update_feature(word, feature_name, features):
            " dirty update of features "
            counts = 1
            if word in vectors:
                if feature_name in vectors[word]:
                    counts = vectors[word][feature_name] + 1
            features[feature_name] = counts
            return features

        def _update_counts(feature_name, f_counts):
            counts = 1
            if feature_name in f_counts:
                counts = f_counts[feature_name] + 1
            f_counts[feature_name] = counts
            return f_counts

        sents = self.corpus.get_sents()
        stemmer = SpanishStemmer()

        # will use the words as keys and dict of features as values
        vectors = {}
        #freq_counts = {}
        for sent in sents:
            # TODO: PARALELLIZE!!
            #for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
            # take off stopwords && to get context_words!
            cleaned_sent = _clean_sent(sent)
            doc = nlp(' '.join(sent))
            for word_idx in range(len(doc)):
                # get the word and the pos tag
                spacy_word = doc[word_idx]
                word = spacy_word.text.lower()

                pos_tag = spacy_word.pos_

                if len(word) <= 2:
                    continue
                if word in STOPWORDS:
                    continue
                if word.isdigit():
                    continue

                # if not seen word
                if not word in vectors:
                    features = {}
                else:
                    features = vectors[word]

                # counts of frequency to normalize later
                #freq_counts = _update_counts(pos_tag, freq_counts)

                # context related (POS and words stemmed)
                features = _update_feature(word, pos_tag, features)
                if word_idx > 0:
                    prev_tag = doc[word_idx - 1].pos_
                    feature_name = prev_tag + '_pos_prev'
                    features = _update_feature(word, feature_name, features)
                if word_idx < len(sent) - 1:
                    post_tag = doc[word_idx + 1].pos_
                    feature_name = post_tag + '_pos_post'
                    features = _update_feature(word, feature_name, features)

                # dependency features. the objective of the dep is stemmed!
                dep_type = spacy_word.dep_
                if dep_type != 'ROOT':
                    dep_obj = stemmer.stem(spacy_word.head.text.lower())
                    feature_name = 'DEP:' + dep_type + '-' + dep_obj
                    features = _update_feature(word, feature_name, features)

                # get n words from context as features (stemmed...!)
                for i in range(context_words):
                    ctxt_word = random.choice(cleaned_sent)
                    feature_word = stemmer.stem(ctxt_word)
                    feature_name = feature_word + '_ctxt_word'
                    features = _update_feature(word, feature_name, features)
                # TODO: add a synset (wordnet) feature
                features['word'] = word

                # frequency counting
                features = _update_feature(word, 'freq', features)

                vectors[word] = features

        # drop words whose 'freq' is at or below freq_floor
        words_to_pop = set()
        for word, f_dict in vectors.items():
            if f_dict['freq'] <= freq_floor:
                words_to_pop.add(word)
        for word in words_to_pop:
            vectors.pop(word)

        for word, f_dict in vectors.items():
            #print(word, f_dict)
            f_dict['freq'] = 0
            vectors[word] = f_dict  # delete an irrelevant dimension!
        # normalize the POS contexts
        #for word, f_dict in vectors.items():
        #    f_dict[]

        # add the context word ... STEMMED!

        # NORMALIZE ALL THE CONTEXTS! -> a dictionary of frequencies of ... ALL the features that occurred
        self.words = list(
            vectors.keys())  # thankfully in the same order as vectors.values

        vectorizer = DictVectorizer(dtype=numpy.int32)
        vec_matrix = vectorizer.fit_transform(list(vectors.values()))
        vectors_shape = vec_matrix.get_shape()
        print(vectors_shape)
        """
        freqs_vector = vectorizer.transform(freq_counts)

        vec_matrix = vstack([freqs_vector, vec_matrix])
        print(s.get_shape)
        print(s)
        print(vectorizer.inverse_transform(s))
        """

        # normalization
        vec_matrix = normalize(vec_matrix, copy=False)

        ####### unsupervised dimensionality reduction
        # reduce dimensionality with a variance threshold
        #selector = VarianceThreshold(threshold = 0.0)
        #vec_matrix = selector.fit_transform(vec_matrix)

        # SVD (PCA)
        Trunc_svd = TruncatedSVD(n_components=1500)
        vec_matrix = Trunc_svd.fit_transform(vec_matrix)

        # reduce dimensionality with a variance percentile
        #selected = SelectPercentile(chi2, percentile = 10)
        #word_vecs_new=selected.fit_transform(new_word_vecs,target_vec)

        print(vectorizer.inverse_transform(vec_matrix))  # -> to see features!

        return self.words, vec_matrix
resenhas_categoria = {}
inputt = open('resenhas_por_categoria_dict.pk1', 'rb')
resenhas_categoria = load(inputt)
inputt.close()

categoria_polaridad = []

palabras_no_encontradas = 0
total_palabras_encontradas = 0
ss = SpanishStemmer()

for categoria in resenhas_categoria:
    valor_categoria = 0
    palabras_encontradas = 0
    for resenha in resenhas_categoria[categoria]:
        for word in resenha[0].split():
            if diccionario_polaridad.get(ss.stem(word).lower()):
                palabras_encontradas += 1
                valor_categoria += diccionario_polaridad[ss.stem(word).lower()]
            else:
                palabras_no_encontradas += 1

    total_palabras_encontradas += palabras_encontradas
    polaridad_promedio = valor_categoria / len(resenhas_categoria[categoria])
    categoria_polaridad.append((categoria, polaridad_promedio))
print()
for cp in categoria_polaridad:
    print(cp)
print()
print('>>>', total_palabras_encontradas, ' palabras encontradas.')
print('>>>', palabras_no_encontradas, ' palabras no encontradas.')
Example #18
def tokenizer_stemmer_global(document):
    stemmer = SpanishStemmer()
    my_tokenizer = RegexpTokenizer("[\w']+")
    return [
        stemmer.stem(token) for token in my_tokenizer.tokenize(document)
    ]
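# Hedged usage sketch (illustration only); assumes SpanishStemmer and
# nltk.tokenize.RegexpTokenizer are imported as above.
print(tokenizer_stemmer_global("Las casas blancas del pueblo"))
# returns a list with the stem of every token in the document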
Example #19
def stemmer_all(tweet):
    stm = SpanishStemmer()
    split_tweet = [word for word in tweet.lower().split(' ') if word.strip()]
    return ' '.join([stm.stem(word.strip())
                     for word in split_tweet])
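# Hedged usage sketch (illustration only); assumes SpanishStemmer is imported.
print(stemmer_all("Me   gustan los   libros"))
# the tweet lower-cased, split on spaces and re-joined with every word stemmed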
Example #20
# -*- coding: utf-8 -*-
"""
Created on Mon May  6 16:07:59 2019

@author: Turing
"""

from bs4 import BeautifulSoup as Soup
from _pickle import dump
from nltk.stem.snowball import SpanishStemmer

handler = open('senticon.es.xml', encoding="utf-8").read()
soup = Soup(handler, 'lxml')
diccionario_polaridad = {}
ss = SpanishStemmer()
for lemma in soup.find_all('lemma'):
    palabra = lemma.get_text()
    polaridad = float(lemma.attrs["pol"])
    diccionario_polaridad[ss.stem(palabra.replace(' ',
                                                  '')).lower()] = polaridad

output = open("diccionario_polaridades.pk1", "wb")
dump(diccionario_polaridad, output, -1)
output.close()
def remove_stopwords(text, stopSpanish):
    stemmer = SpanishStemmer()
    textList = text.split()
    textList = [word for word in textList if word not in stopSpanish]
    return ' '.join([stemmer.stem(word) for word in textList])
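# Hedged usage sketch (illustration only); stopSpanish would normally be the stemmed
# stopword list produced by generate_stopwords() above.
stop_demo = ['de', 'la', 'el']  # hypothetical, minimal stopword list
print(remove_stopwords("la defensa de los derechos humanos", stop_demo))
# the remaining words are stemmed and re-joined with spaces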
Example #22
File: __init__.py  Project: FrankNagel/qlc
from operator import attrgetter
from pylons import config

from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer(True)

# load swadesh list
swadesh_file = codecs.open(os.path.join(os.path.dirname(
    os.path.realpath(
        __file__)), "spa.txt"), "r", "utf-8")

swadesh_list = []
for line in swadesh_file:
    line = line.strip()
    for e in line.split(","):
        stem = stemmer.stem(e)
        swadesh_list.append(stem)

def init_model(engine):
    """Call me before using any of the tables or classes in the model"""
    Session.configure(bind=engine)

entry_table = schema.Table('entry', meta.metadata,
    schema.Column('id', types.Integer,
        schema.Sequence('entry_seq_id', optional=True), primary_key=True),
    schema.Column('head', types.Unicode(255)),
    schema.Column('fullentry', types.Text),
    schema.Column('is_subentry', types.Boolean),
    schema.Column('is_subentry_of_entry_id', types.Integer),
    schema.Column('dictdata_id', types.Integer, schema.ForeignKey('dictdata.id')),
    schema.Column('book_id', types.Integer, schema.ForeignKey('book.id')),
Example #23
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)

    return input_txt


# Drop the tweet author column
df = df.drop(columns=['user'])

# Remove usernames from the tweets
df['tiny_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*")

# Remove punctuation, numbers and special characters
df['tiny_tweet'] = df['tiny_tweet'].str.replace("[^a-zA-Z#]", " ")

# Remove short words (3 characters or fewer)
df['tiny_tweet'] = df['tiny_tweet'].apply(
    lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

# Tokenize the tweets
df['tokens'] = df['tiny_tweet'].apply(lambda x: x.lower().split())

# Stemming
stemmer = SpanishStemmer()
df['stems'] = df['tokens'].apply(lambda x: [stemmer.stem(i) for i in x])

# Save the cleaned dataset
df.to_csv('./processed_tweets.csv', index=False)
class ScoreParagraphs(object):
    def __init__(self, question, words, stem):
        self.question = question
        self.stem = stem
        self.stemmer = SpanishStemmer()
        self.words = words
        self.stemmed_words = self.stem_words(self.words)
        self.path_pfx = os.getcwd()

        self.inverted_index = self.load_doc_inverted_index()
        self.doc_names = self.init_doc_names()
        self.paragraph_indices = {}
        self.paragraph_inverted_indices = {}
        self.results = pd.DataFrame(columns=['text', 'law', 'score'])
        self.load_paragraph_indices()

        self.L = 23055.676666666666  #Manually obtained using bash
        self.scores = {'tf': {}, 'idf':{}, 'tfidf':{},'n_containing':{},\
              'score':{}}

    def stem_words(self, words):
        #print('Stemming {}'.format(words))
        processed = []
        if self.stem:
            for word in words:
                word = self.stemmer.stem(word)
                processed.append(word)
        return set(processed)

    def load(self, filename):
        #print('Trying to load {}'.format(filename))
        f = open(filename, 'r', encoding='utf-8')
        index = json.load(f)
        #print('Success!')
        return index

    def load_doc_inverted_index(self):
        filename = self.path_pfx + '/indices/inverted{}.json'.format(
            self.stem * '_stem')
        return self.load(filename)

    def init_doc_names(self):
        temp = [self.inverted_index[word] for word in self.stemmed_words \
          if word in self.inverted_index]
        return [i for sublist in temp for i in sublist]
        # rv = set()
        # for i, word in enumerate(self.stemmed_words):
        # 	temp = set(self.inverted_index.get(word, []))
        # 	if i == 0:
        # 		rv = rv.union(temp)
        # 	rv = rv.intersection(temp)
        # return list(rv)

    def load_paragraph_indices(self):
        for doc in self.doc_names:
            filename = self.path_pfx + '/indices/{}.json'.format(
                doc + self.stem * '_stem')
            self.paragraph_indices[doc] = self.load(filename)
            filename = self.path_pfx + '/indices/{}_p.json'.format(
                doc + self.stem * '_stem')
            self.paragraph_inverted_indices[doc] = self.load(filename)

    def where_should_i_look(self):
        '''
        words is a list of strings, for now:
        ['robo', 'algun', 'sentencia']
        '''
        #print('Working with the following documents:\n{}'.format(self.doc_names))

        for law in self.doc_names:
            filename = self.path_pfx + '/leyes/{}.txt'.format(law)
            text = get_text(filename)
            # paragraphs = re.findall(r'(Artículo|ARTÍCULO [0-9]+)(.*?)(Artículo|ARTÍCULO)',\
            # 	text, flags=re.DOTALL, overlapped=True)
            # paragraph_list = [match[1] for match in paragraphs]
            _, paragraph_list = preprocess_text_to_words(text)

            for word in self.stemmed_words:
                #print(word)
                paragraphs = self.paragraph_inverted_indices[law].get(word, [])
                results = [[paragraph_list[x], law, 0] for x in paragraphs]
                df_temp = pd.DataFrame(results,
                                       columns=['text', 'law', 'score'])
                self.results = self.results.append(
                    df_temp,
                    ignore_index=True,
                )

        return self.results

    ## http://stevenloria.com/finding-important-words-in-a-document-using-tf-idf/
    def tf(self, word, document):
        s = (word, document)
        #print(s, type(s))
        if s not in self.scores['tf']:
            self.scores['tf'][s] = document.words.count(word) / len(
                document.words)
        return self.scores['tf'][s]

    def n_containing(self, word, doclist):
        s = (word)
        #print(s, type(s))
        if s not in self.scores['n_containing']:
            self.scores['n_containing'][s] = sum(1 for doc in doclist
                                                 if word in doc.words)
        return self.scores['n_containing'][s]

    def idf(self, word, doclist):
        s = (word)
        #print(s, type(s))
        if s not in self.scores['idf']:
            self.scores['idf'][s] = math.log(
                len(doclist) / (1 + self.n_containing(word, doclist)))
        return self.scores['idf'][s]

    def tfidf(self, word, document, doclist):
        s = (word, document)
        #print(s, type(s))
        if s not in self.scores['tfidf']:
            self.scores['tfidf'][s] = self.tf(word, document) * self.idf(
                word, doclist)
        return self.scores['tfidf'][s]

    def bm25(self, word, document, doclist, k=2.0, b=0.75):
        '''
        Takes:
            word, a string
            document, a blob object
            doclist, a list of blob objects
            k, b, the BM25 tuning parameters
        Returns the BM25 score of `word` for `document`.
        '''
        s = (word, document)
        #print(s, type(s))
        if s not in self.scores['score']:
            self.scores['score'][s] = self.idf(word, doclist) * \
            self.tf(word, document) * (k + 1) / (self.tf(word, document) +\
            k * (1 - b + b * len(document)/self.L))
        return self.scores['score'][s]
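    # Note: the per-term score above is the standard BM25 weighting,
    #   idf(w) * tf(w, d) * (k + 1) / (tf(w, d) + k * (1 - b + b * len(d) / self.L)),
    # where len(d) is the blob length and self.L the length normalizer set in __init__.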

    def score_docs(self, documents, words, method, k, b):
        '''
        documents, a list of strings (paragraphs)
        words, a list of strings (words)
        '''
        blobs = [tb(paragraph) for paragraph in documents]
        rv = [0] * len(blobs)

        for word in words:
            for i, blob in enumerate(blobs):
                if method == 'word_count':
                    rv[i] += blob.words.count(word)
                elif method == 'bm25':
                    rv[i] += self.bm25(word, blob, blobs, k, b)
                elif method == 'proximity':
                    rv[i] += jaro_winkler(self.question, blob.string)
        self.results['score'] = rv
        return rv

    def drop_duplicates_and_short_paragraphs(self, min_size):
        self.results.drop_duplicates(inplace=True)
        k = lambda x: len(x.split()) > min_size
        self.results = self.results[self.results.text.apply(k)]
        # clf = retrieve_model()
        # print(clf.predict(self.results))

    def load_law_names(self, filename):
        rv = pd.read_csv(filename, header=None, names=['law', 'Law'])
        self.results = self.results.merge(rv, on='law')
        del self.results['law']

    def texts(self, top_k, method, k=2, b=0.75):
        '''

		'''
        words = self.stemmed_words
        if self.stem:
            print('Working with stemmed words: {}'.format(words))
        else:
            print('Working with words: {}'.format(words))

        self.where_should_i_look()
        self.drop_duplicates_and_short_paragraphs(4)
        paragraphs = self.results.text
        texts = [
            ' '.join(self.stem_words(paragraph.split()))
            for paragraph in paragraphs
        ]
        self.score_docs(texts, words, method, k, b)
        self.results.sort_values('score', ascending=False, inplace=True)

        law_filename = self.path_pfx + '/doc/docnames.csv'
        df_names = self.load_law_names(law_filename)

        print(self.results.head(top_k))

        return self.results.head(top_k)
Example #25
    lista_articulos_tokens.append(articulo_tokenizado)

for articulo in lista_articulos_tokens:
    print()
    print(' '.join(articulo))

pol_dict = {}
inputt = open('diccionario_polaridades_senticon.pkl', 'rb')
pol_dict = load(inputt)
inputt.close()


def lemmatize_sent(sent):
    return utils.lemmatize_text(
        utils.remove_unalphabetic_words(nltk.word_tokenize(sent)))


print('article', '\t\t', 'polarity')
for aspect_i in range(len(lista_articulos_tokens)):
    aspect_value = 0
    finded_words = 0
    for word in lista_articulos_tokens[aspect_i]:
        if pol_dict.get(ss.stem(word)):
            finded_words += 1
            aspect_value += pol_dict[ss.stem(word)]
    if finded_words > 0:
        aspect_value = aspect_value / finded_words
        print(aspect_i, '\t', aspect_value)
    else:
        print(aspect_i, 0, 0)
Example #26
def export_swadesh_entries(input_path, output_path=None):

    print("Input: {0}".format(input_path))
    print("Ouput: {0}".format(output_path))

    cr = CorpusReaderDict(input_path)
    print("Data loaded")

    files = [ "book.csv",
          "component.csv",
          "corpusversion.csv",
          "dictdata.csv",
          "language_iso.csv",
          "language_bookname.csv",
          "language_src.csv",
          "language_tgt.csv",
          "nondictdata.csv",
          "wordlistdata.csv",
          "wordlistconcept.csv"
        ]
    
    for f in files:
        shutil.copyfile(os.path.join(
            input_path, f), os.path.join(output_path, f))
    
    from nltk.stem.snowball import SpanishStemmer
    stemmer = SpanishStemmer()
    import qlc.utils

    #get stopwords
    stopwords = qlc.utils.stopwords_from_file(os.path.join(os.path.dirname(
        os.path.realpath(
            __file__)), "data", "stopwords", "spa.txt"))

    # load swadesh list
    swadesh_file = codecs.open(os.path.join(os.path.dirname(
        os.path.realpath(
            __file__)), "data", "swadesh", "spa.txt"), "r", "utf-8")

    swadesh_entries = []
    for line in swadesh_file:
        line = line.strip()
        for e in line.split(","):
            stem = stemmer.stem(e)
            swadesh_entries.append(stem)

    # find all entries that contain one of the swadesh words
    # save entry ids to list
    entry_ids = []

    dictdata_ids = cr.dictdata_string_ids
    for dictdata_id in dictdata_ids:
        src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id)
        # is there some spanish?
        if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']):
            continue

        for entry_id, head, translation in \
                cr.ids_with_heads_with_translations_for_dictdata_id(
                    dictdata_id):
            if src_language_iso == [ 'spa' ]:
                (head, translation) = (translation, head)

            translation = re.sub(" ?\([^)]\)", "", translation)
            if translation in stopwords:
                entry_ids.append(entry_id)
            else:
                translation = qlc.utils.remove_stopwords(translation, stopwords)
                phrase_stems = qlc.utils.stem_phrase(translation, stemmer, True)
                for stem in phrase_stems:
                    if stem in swadesh_entries:
                        entry_ids.append(entry_id)

    #print(len(entry_ids))
    #return

    input_entry_csv = os.path.join(input_path, "entry.csv")
    output_entry_csv = os.path.join(output_path, "entry.csv")

    input_annotation_csv = os.path.join(input_path, "annotation.csv")
    output_annotation_csv = os.path.join(output_path, "annotation.csv")

    output_annotation = codecs.open(output_annotation_csv, "w", "utf-8")

    annotation_dict = collections.defaultdict(list)

    # cache annotations for lookup
    for i, line in enumerate(fileinput.input(
            input_annotation_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output_annotation.write(line)
            continue
        data = line.strip().split("\t")
        annotation_dict[
            data[_annotation_table_columns['entry_id'] + 1]].append(line)
    
    fileinput.nextfile()

    output = codecs.open(output_entry_csv, "w", "utf-8")
    
    count_entries = 0
    for i, line in enumerate(fileinput.input(
            input_entry_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output.write(line)
            continue
        data = line.strip().split("\t")
        if data[0] in entry_ids:
            output.write(line)
            for annotation_line in annotation_dict[data[0]]:
                output_annotation.write(annotation_line)

    fileinput.nextfile()
    output.close()
    output_annotation.close()
    
    # Wordlists
    cr = CorpusReaderWordlist(input_path)
    print("Data loaded")

    # find all entries that contain one of the swadesh words
    # save entry ids to list
    wordlistdata_ids = cr.wordlistdata_string_ids
    bibtex_keys = collections.defaultdict(list)
    for wid in wordlistdata_ids:
        wordlistdata_string = cr.wordlistdata_string_ids[wid]
        bibtex_key = wordlistdata_string.split("_")[0]
        bibtex_keys[bibtex_key].append(wid)

    wordlistentry_ids = []
    for bibtex_key in bibtex_keys:
        # first collect all concepts in this book where the spanish counterpart
        # has one of the swadesh words
        concepts = []
        for wordlistentry_id in wordlistentry_ids:
            language_iso = cr.get_language_code_for_wordlistdata_id(
                wordlistdata_id)
            # is there some spanish?
            if language_iso != ['spa']:
                continue

            for entry_id, concept, counterpart in \
                    cr.ids_with_concepts_with_counterparts_for_dictdata_id(
                        dictdata_id):

                counterpart = re.sub(" ?\([^)]\)", "", counterpart)
                if counterpart in stopwords:
                    entry_ids.append(entry_id)
                else:
                    counterpart = qlc.utils.remove_stopwords(
                        counterpart, stopwords)
                    phrase_stems = qlc.utils.stem_phrase(
                        counterpart, stemmer, True)
                    for stem in phrase_stems:
                        if stem in swadesh_entries:
                            concepts.append(concept)

        # now collect the entry ids for those concepts
        for wordlistentry_id in wordlistentry_ids:

            for entry_id, concept, counterpart in \
                    cr.ids_with_concepts_with_counterparts_for_dictdata_id(
                        dictdata_id):
                if concept in concepts:
                    wordlistentry_ids.append(entry_id)
    
    input_entry_csv = os.path.join(input_path, "wordlistentry.csv")
    output_entry_csv = os.path.join(output_path, "wordlistentry.csv")

    input_annotation_csv = os.path.join(input_path, "wordlistannotation.csv")
    output_annotation_csv = os.path.join(output_path, "wordlistannotation.csv")

    output_annotation = codecs.open(output_annotation_csv, "w", "utf-8")

    annotation_dict = collections.defaultdict(list)

    for i, line in enumerate(fileinput.input(input_annotation_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output_annotation.write(line)
            continue
        data = line.strip().split("\t")
        annotation_dict[data[_wordlistannotation_table_columns['entry_id'] + 1]].append(line)
    
    fileinput.nextfile()

    output = codecs.open(output_entry_csv, "w", "utf-8")
    count_entries = 0
    for i, line in enumerate(fileinput.input(input_entry_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output.write(line)
            continue
        data = line.strip().split("\t")
        if data[0] in entry_ids:
            output.write(line)
            for annotation_line in annotation_dict[data[0]]:
                output_annotation.write(annotation_line)

    fileinput.nextfile()
    output.close()
    output_annotation.close()    
# Then we loop through all the nodes of the merged graph and add the stem nodes to each Spanish node. If the node has only one word (after stopword removal) we will use the NLTK stemmer; otherwise we just leave the phrase as it is:

# <codecell>

combined_graph_stemmed = copy.deepcopy(combined_graph)
for node in combined_graph.nodes():
    if "lang" in combined_graph.node[node] and combined_graph.node[node]["lang"] == "spa":
        e = re.sub(" ?\([^)]\)", "", node)
        e = e.strip()
        stem = e
        words = e.split(" ")
        if len(words) > 1:
            words = [w for w in words if not w in stopwords or w == ""]
        if len(words) == 1:
            stem = stemmer.stem(words[0])
            
        stem = stem + "|stem"
        combined_graph_stemmed.add_node(stem, is_stem=True)
        combined_graph_stemmed.add_edge(stem, node)

# <markdowncell>

# Again we can count the nodes and the number of connected components. We see that the number of connected components decreases, as more nodes are connected into groups now:

# <codecell>

networkx.algorithms.components.number_connected_components(combined_graph_stemmed)

# <codecell>
Example #28
def stemmer_all(tweet):
    stm = SpanishStemmer()
    split_tweet = [word for word in tweet.lower().split(' ') if word.strip()]
    return ' '.join([stm.stem(word.strip()) for word in split_tweet])
Example #29
class TextProcessor:
    lemmatizer = None
    stopEnglish = None
    stopSpanish = None
    spanishStemmer = None

    def __init__(self):
        self.lemmatizer = treetaggerwrapper.TreeTagger(TAGLANG='es')
        self.stopEnglish = stopwords.words('english')
        self.stopSpanish = stopwords.words('spanish')
        self.stopSpanish.append('y/o')
        self.spanishStemmer = SpanishStemmer()

    def _remove_numbers(self, text):
        "Elimina los números del texto"

        return ''.join([letter for letter in text if not letter.isdigit()])

    def _remove_punctuation(self, text):
        "Elimina los signos de puntuacion del texto"

        regex = re.compile('[%s]' % re.escape(string.punctuation))
        return regex.sub(' ', text)

    def preprocessText(self, text):
        text = text.lower()
        text = self._remove_punctuation(text)
        text = self._remove_numbers(text)
        return text

    def lematizeText(self, text):
        newText = ""
        firstElement = 0
        firstWord=True
        for word in text.split():
            if word not in self.stopEnglish and word not in self.stopSpanish:
                word = word.replace("\ufeff", "")
                lemmaResult = self.lemmatizer.tag_text(word)
                # Return [[word,type of word, lemma]]
                if (len(lemmaResult) != 0):
                    word = lemmaResult[firstElement].split()[2]
                    if firstWord:
                        newText += word
                        firstWord = False
                    else:
                        newText += " " + word
        return newText

    def stemText(self, text):
        newText = ""
        firstWord = True
        for word in text.split():
            if word not in self.stopEnglish and word not in self.stopSpanish:
                word = word.replace("\ufeff", "")
                wordStemmed = self.spanishStemmer.stem(word)
                if firstWord:
                    newText += wordStemmed
                    firstWord = False
                else:
                    newText += " " + wordStemmed
        return newText
Example #30
class MLAssistant(Assistant):
    def __init__(self,
                 language='en',
                 database_name='memory',
                 memory_table='memory',
                 listen_log_table='listen_log',
                 speak_log_table='speak_log'):
        super().__init__(language, database_name, memory_table,
                         listen_log_table, speak_log_table)

        try:
            json_file = open('modelo_gustos.json', 'r')
            loaded_model_json = json_file.read()
            json_file.close()

            self.model = model_from_json(loaded_model_json)
            self.model.load_weights("modelo_gustos.h5")
            self.model.compile(loss='mean_squared_error',
                               optimizer='adam',
                               metrics=['binary_accuracy'])
        except Exception:
            print('****ERROR: Error cargando modelo...****')

        self.stemmer = SpanishStemmer()
        self.words = [
            '¿qu', '?', 'peli', 'pelis', 'color', 'favorit', 'leer', 'libr',
            'novel', 'ver', 'prefier', 'gust', 'pelicul', 'jug', '¿cual',
            'prefer', 'jueg', 'com', 'plat', 'animal', 'videojueg'
        ]
        self.classes = [
            'comida', 'color', 'animal', 'juego', 'libro', 'película'
        ]

    def main(self, initial_sentence='¿Qué deseas?'):
        self.speak(initial_sentence, remember=False)
        self.listen()
        self.process_orders(self.last_recognised)
        self.adjust_for_ambient_noise()

    def process_orders(self, sentence):
        _class = self.classify_sentence(sentence)

        if not _class:
            self.speak('no estoy segura de lo que me quieres preguntar')
        else:
            if _class == 'comida':
                self.speak(
                    'Sin lugar a dudas mi comida preferida son los nachos con queso'
                )
            if _class == 'color':
                self.speak('Mi color preferido es el escarlata.')
            if _class == 'animal':
                self.speak(
                    'Me gustan mucho los grandes felinos, pero mi animal preferido es una perra que se llama'
                    ' Arale.')
            if _class == 'juego':
                self.speak('¡Me encanta Hollywood Monsters!')
            if _class == 'libro':
                self.speak(
                    'No queda muy bien decirlo, pero me han programado para decir siempre la verdad. No tengo'
                    ' tiempo para leer, y por tanto no tengo libro preferido.')
            if _class == 'película':
                self.speak(
                    'No tengo una película preferida, pero me gustan especialmente las películas de Disney y'
                    ' las del Studio Ghibli.')

    def classify_sentence(self, sentence, min_val=0.5):
        results = self._get_classification(sentence)
        if float(results[0][1]) < min_val:
            return None
        else:
            return results[0][0]

    def _clean_up_sentence(self, sentence):
        sentence_words = nltk.word_tokenize(sentence)
        sentence_words = [
            self.stemmer.stem(word.lower()) for word in sentence_words
        ]
        return sentence_words

    def _bow(self, sentence, words):
        sentence_words = self._clean_up_sentence(sentence)
        bag = [0] * len(words)
        for s in sentence_words:
            for i, w in enumerate(words):
                if w == s:
                    bag[i] = 1
        return np.array(bag)

    def _get_classification(self, sentence):
        array = [self._bow(sentence, self.words)]
        np_array = np.array(array, "float32")
        prediction = self.model.predict(np_array).round(2)[0]
        result = dict(zip(self.classes, prediction))
        return sorted(result.items(), key=operator.itemgetter(1))[::-1]
sents = []
for file_name in utils.find_all_files_in_path('*.txt',files_path):
    sents += nltk.sent_tokenize(open(file_name).read().replace('\n\n','.').replace('\n','.'))

print('aspect','\t\t','polarity')
for aspect in aspects:
    aspect_sent_avg_val = 0
    sents_of_aspect_count = 0
    for sent in sents:
        lemmatized_sent = lemmatize_sent(sent)
        if (aspect in sent) or (aspect in lemmatized_sent):
            sent_value = 0
            sents_of_aspect_count += 1
            finded_words = 0
            for word in lemmatized_sent:
                if pol_dict.get(ss.stem(word)):
                    finded_words += 1
                    sent_value += pol_dict[ss.stem(word)]
            if finded_words > 0:
                aspect_sent_avg_val += sent_value #/ finded_words
    if sents_of_aspect_count > 0:
        aspect_sent_avg_val = aspect_sent_avg_val / sents_of_aspect_count
        print(aspect, '\t',aspect_sent_avg_val)
    else:
        print(aspect, 0, 0)




Example #32
def main(argv):
    log = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    
    conf = appconfig('config:development.ini', relative_to='.')
    config = None
    if not pylons.test.pylonsapp:
        config = load_environment(conf.global_conf, conf.local_conf)

    stemmer = SpanishStemmer(True)

    # load swadesh list
    swadesh_file = codecs.open(os.path.join(os.path.dirname(
        os.path.realpath(
            __file__)), "swadesh_spa.txt"), "r", "utf-8")

    swadesh_entries = []
    for line in swadesh_file:
        line = line.strip()
        for e in line.split(","):
            stem = stemmer.stem(e)
            swadesh_entries.append(stem)

    for b in quanthistling.dictdata.books.list:
        #if b['bibtex_key'] != "thiesen1998":
        #    continue

        book = model.meta.Session.query(model.Book).filter_by(bibtex_key=b['bibtex_key']).first()
        
        if book:

            print "Filtering entries in %s..." % b['bibtex_key']

            for dictdata in book.dictdata:

                entries = model.meta.Session.query(model.Entry).filter(model.Entry.dictdata_id==dictdata.id).order_by("startpage", "pos_on_page").all()

                annotations = model.meta.Session.query(model.Annotation).join(model.Entry, model.Annotation.entry_id==model.Entry.id).filter(model.Entry.dictdata_id==dictdata.id).all()
                dict_annotations = collections.defaultdict(list)
                for a in annotations:
                    dict_annotations[a.entry_id].append(a)

                for e in entries:
                    if b['bibtex_key'] == "thiesen1998":
                        e.filtered = False
                    else:
                        e.filtered = True
                        for a in dict_annotations[e.id]:
                            if a.value == "iso-639-3" and a.string == "spa":
                                for a2 in dict_annotations[e.id]:
                                    if (a2.value == "head" or a2.value == "translation") and a2.start == a.start:
                                        phrase = re.sub(" ?\([^)]\)", "", a2.string)
                                        phrase = phrase.strip()
                                        if not " " in phrase:
                                            stem = stemmer.stem(phrase)
                                            if stem in swadesh_entries:
                                                e.filtered = False
#                                                if e.is_subentry:
#                                                    e.mainentry().filtered = False

                Session.commit()
class TokenRepository:

    terminos = {}
    tokens = []
    reglasDocumento = []
    reglasTokens = []
    reglasEntities = []
    documentos = []
    fileNameTerminos = "results/terminos.txt"
    lista_vacias = []
    stemmer = None

    def __init__(self):

        self.reglasEntities.append(EmailRegla())
        self.reglasEntities.append(UrlRegla())
        self.reglasEntities.append(FechasRegla())
        self.reglasEntities.append(TelefonosRegla())
        self.reglasEntities.append(AbreviaturasRegla())
        self.reglasEntities.append(NombresPropiosRegla())
        self.reglasEntities.append(NumerosRegla())
        self.reglasDocumento.append(MinusculasRegla())
        self.reglasDocumento.append(TranslateRegla())
        self.reglasDocumento.append(LimpiarHtmlTagsRegla())
        self.reglasDocumento.append(LimpiadoBasicoRegla())
        self.reglasTokens.append(MinMaxCaracteresRegla())
        self.stemmer = SpanishStemmer()

    def tokenizar(self, documentos, **options):
        # INIT
        self.documentos = documentos
        self.tokens = []
        self.terminos = {}
        self.lista_vacias = []
        pathVacias = options.get('pathVacias', None)

        if pathVacias != None:
            print u"ANALIZANDO PALABRAS VACIAS"
            with codecs.open(pathVacias, mode='rt',
                             encoding='utf-8') as vacias:
                content = vacias.read()
                for instancia in self.reglasDocumento:
                    content = instancia.run(content)

                palabras = content.strip().split()
                for palabra in palabras:
                    if palabra not in self.lista_vacias:
                        self.lista_vacias.append(palabra)

        # Process each document
        indexDocumento = 0
        cantidadDocumentos = len(documentos)
        for documento in documentos:
            documento.terminos = {}
            documento.tokens = []
            content = documento.content
            tokensEntities = []
            # Apply each rule defined in self.reglasEntities to extract entities
            for instancia in self.reglasEntities:
                response = instancia.run(content)
                content = response['content']
                # Add the terms to the document's terms
                documento.terminos.update(response['terminos'])
                tokensEntities += response['tokens']

            # Apply each rule defined in self.reglasDocumento to normalize the content
            for instancia in self.reglasDocumento:
                content = instancia.run(content)
            # Extract tokens from the document
            tokensAux = self.getTokens(content)
            self.tokens = self.tokens + tokensAux + tokensEntities
            documento.tokens = tokensAux + tokensEntities

            # Apply each rule defined in self.reglasTokens
            for instancia in self.reglasTokens:
                tokensAux = instancia.run(tokensAux)

            # Remove stopwords (build a new list: removing while iterating skips items)
            if pathVacias != None:
                tokensAux = [token for token in tokensAux
                             if token not in self.lista_vacias]

            # Apply stemming (entities excluded)
            tokensAux = self.stemmizar(tokensAux)

            terminosAux = self.getTerminos(tokensAux)
            documento.terminos.update(terminosAux)

            self.saveTerminosGlobal(documento)
            indexDocumento += 1
            porcentaje = (indexDocumento * 100) / cantidadDocumentos

            sys.stdout.write(u"\r" + str(int(porcentaje)).ljust(3) +
                             u"% \u258F" +
                             (u"\u2588" * int(porcentaje / 2)).ljust(50) +
                             u"\u2595")
            sys.stdout.flush()

        print '\n'
        self.saveTerminosFile()
        # Build the response
        response = {}
        response['terminos'] = self.terminos
        response['tokens'] = self.tokens
        response['documentos'] = documentos
        return response

    def getTokens(self, string):
        content = string.strip().split()
        # Return
        return content

    def getTerminos(self, tokens):
        terminos = {}
        for token in tokens:
            if token not in terminos:
                terminos[token] = {}
                terminos[token]['CF'] = 1
            else:
                terminos[token]['CF'] += 1
        return terminos

    def stemmizar(self, tokens):
        tokensAux = []
        for token in tokens:
            tokensAux.append(self.stemmer.stem(token))
        return tokensAux

    def saveTerminosGlobal(self, documento):
        # Merge the document's terms into the global dictionary, accumulating
        # the collection frequency (CF) and the list of documents (DOCS)
        for termino in documento.terminos:
            if termino not in self.terminos:
                self.terminos[termino] = {
                    'CF': documento.terminos[termino]['CF'],
                    'DOCS': [documento],
                }
            else:
                self.terminos[termino]['CF'] += \
                    documento.terminos[termino]['CF']
                if documento not in self.terminos[termino]['DOCS']:
                    self.terminos[termino]['DOCS'].append(documento)

    def saveTerminos(self, tokens, documento):
        for token in tokens:
            if token not in self.terminos:
                self.terminos[token] = {}
                self.terminos[token]['CF'] = 1
                self.terminos[token]['DOCS'] = [documento]
            else:
                self.terminos[token]['CF'] += 1
                if documento not in self.terminos[token]["DOCS"]:
                    self.terminos[token]["DOCS"].append(documento)

    def saveTerminosFile(self):
        with codecs.open(self.fileNameTerminos, mode="w",
                         encoding="utf-8") as archivo:
            index = 0
            archivo.write('ID'.ljust(6))
            archivo.write('|')
            archivo.write('TERMINO'.ljust(30))
            archivo.write('|')
            archivo.write('CF'.ljust(6))
            archivo.write('|')
            archivo.write('DF'.ljust(6))
            archivo.write('\n')
            archivo.write('-' * 50)
            archivo.write('\n')
            for termino in sorted(self.terminos.keys()):
                archivo.write(str(index).ljust(6))
                archivo.write('|')
                archivo.write(termino.ljust(30))
                archivo.write('|')
                archivo.write(str(self.terminos[termino]['CF']).ljust(6))
                archivo.write('|')
                archivo.write(
                    str(len(self.terminos[termino]['DOCS'])).ljust(6))
                archivo.write('\n')
                index += 1
# Then we loop through all the nodes of the merged graph and add the stem nodes to each Spanish node. If the node has only one word (after stopword removal) we will use the NLTK stemmer; otherwise we just leave the phrase as it is:

# <codecell>

combined_graph_stemmed = copy.deepcopy(combined_graph)
for node in combined_graph.nodes():
    if "lang" in combined_graph.node[node] and combined_graph.node[node][
            "lang"] == "spa":
        e = re.sub(r" ?\([^)]\)", "", node)
        e = e.strip()
        stem = e
        words = e.split(" ")
        if len(words) > 1:
            words = [w for w in words if w not in stopwords and w != ""]
        if len(words) == 1:
            stem = stemmer.stem(words[0])

        stem = stem + "|stem"
        combined_graph_stemmed.add_node(stem, is_stem=True)
        combined_graph_stemmed.add_edge(stem, node)

# <markdowncell>

# Again we can count the nodes and the number of connected components. We see that the number of connected components decreases, as more nodes are connected into groups now:

# <codecell>

networkx.algorithms.components.number_connected_components(
    combined_graph_stemmed)

# <codecell>
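
# A minimal companion sketch, assuming the same combined_graph_stemmed object:
# number_of_nodes() reports how many nodes the stemmed graph contains, to read
# alongside the component count above.
combined_graph_stemmed.number_of_nodes()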
예제 #35
0
# This fragment assumes earlier cells set up `fileinput`, the `regex` module,
# `unicodedata`, the sentence `tokenizer`, the Spanish `stemmer`, the
# `stopwords` set, the counters `doc_id` and `sentence_id`, the `doc` buffer
# and the set-valued maps `sentences_for_stem` and `docs_for_stem`.
for l in fileinput.input("/Users/ramon/qlc-github/data/eswiki/AA/wiki00"):
    l = l.strip()
    l = l.decode("utf-8")
    l = unicodedata.normalize("NFD", l)
    
    if l.startswith("</doc>"):
        sentences = tokenizer.tokenize(doc)
        for s in sentences:
            s = regex.sub(u"[^\p{L}\p{M}]", " ", s)
            s = s.lower()
            for w in s.split():
                if not w in stopwords:
                    stem = w
                    if len(w) > 3:
                        stem = stemmer.stem(w)
                    sentences_for_stem[stem].add(sentence_id)
                    docs_for_stem[stem].add(doc_id)
            sentence_id += 1
        doc = ""
        doc_id += 1
            
    elif not l.startswith("<doc"):
        l = regex.sub("</?a[^>]*>", "", l)
        doc += l + " "
    
    #if doc_id > 500:
    #    break
    
stem1 = stemmer.stem("continua")
stem2 = stemmer.stem("figura")
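
# A hedged sketch of an assumed follow-up step: the two stems could be
# compared through the co-occurrence sets built above, for example by
# intersecting their sentence and document id sets.
shared_sentences = (sentences_for_stem.get(stem1, set()) &
                    sentences_for_stem.get(stem2, set()))
shared_docs = (docs_for_stem.get(stem1, set()) &
               docs_for_stem.get(stem2, set()))
print(len(shared_sentences))
print(len(shared_docs))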
예제 #36
0
def do_stemmer(df, stop_language='spanish'):
    """Apply stop words and Stemmers"""
    ##  Tickets arrive in two languages, so we add stop words for both of them
    stop = get_stop_words(stop_language) + get_stop_words('english')
    ## Add our own custom words
    stop += [
        "buenas", "buenos", "cid", "dias", "gracias", "hola", "mucho", "mucha",
        "poder", "proyecto", "please", "saludo", "tardes", "www", "habia"
    ]
    stop += [
        'ahora',
        'algun',
        'alguna',
        'amanecia interrumpio',
        'amanecia interrumpio relato',
        'amanecia interrumpio relato habian',
        'amanecia interrumpio relato habian dado',
        'aquel',
        'asi',
        'aun',
        'cada',
        'vez',
        'mas',
        'cualquier',
        'cosa',
        'cuanto',
        'dado',
        'darse',
        'debe',
        'debia',
        'despues',
        'dia noche',
        'dia siguiente',
        'diez años',
        'diez mil',
        'dijo',
        'dijo',
        'dio',
        'habia',
        'mas',
        'podia',
        'podian',
        'mismo',
        'si',
        'tal',
        'tan',
        'puede',
        'pueden ser',
        'pues',
        'puso',
        'toda',
        'todas',
        'vease tambien',
        'primer lugar',
        'varias',
        'dos',
        'largo',
        'hacia',
        'uno',
        'una',
        'unos',
        'una',
        'aquella',
        'aquello',
        'aquel',
        'hace',
        'muchas',
        'mucho',
        'muchos',
        'mucha',
        'pueden',
        'puedo',
        'unas',
        'abrio puerta',
        'arriba abajo',
        'aqui alla',
        'habian',
        'doña',
        'don',
        'señor',
        'señora',
        'hizo',
        'quedo',
        'fuerza sino',
        'quedo perplejo',
        'parece haber',
        'parece ser',
        'parecia haber',
        'mayor parte',
        'mañana siguiente',
        'media hora',
        'hoy dia',
        'iba ser',
        'iii pag',
        'haber hecho',
        'habria podido',
        'hacer cosas',
        'hacia arriba',
        'hacia atras',
        'hacia puerta',
        'hacia tiempo',
        'decir verdad',
        'dejo caer',
        'demasiado tarde',
        'derecha izquierda',
        'di cuenta',
        'dia anterior',
        'dia noche',
        'dia siguiente',
        'casi siempre',
        'cierto dia',
        'cierto modo',
        'cinco años',
        'aqui alla',
        'arriba abajo',
        'aunque solo',
        'año nuevo',
        'años edad',
        'buena parte',
        'ninguna parte',
        'noche anterior',
        'noche dia',
        'nunca visto',
        'partido comunista',
        'podria haber',
        'podria ser',
        'press cambridge',
        'primer lugar',
        'quiere decir',
        'quiero decir',
        'sentido comun',
        'seria mejor',
        'tras haber',
        'tres años',
        'tres cuatro',
        'tres meses',
        'voz alta',
        'voz baja',
    ]
    stop_words_generated_tokens = [
        'abajo', 'abrio', 'alla', 'alta', 'amanecia', 'anterior', 'aqui',
        'aren', 'arriba', 'atras', 'aunque', 'año', 'años', 'baja', 'buena',
        'caer', 'cambridge', 'can', 'casi', 'cierto', 'cinco', 'comun',
        'cosas', 'couldn', 'cuatro', 'cuenta', 'decir', 'dejo', 'demasiado',
        'di', 'dia', 'didn', 'diez', 'doesn', 'edad', 'haber', 'habria',
        'hacer', 'hacia', 'hadn', 'hasn', 'haven', 'hecho', 'hora', 'hoy',
        'iba', 'iii', 'isn', 'let', 'll', 'lugar', 'mayor', 'mañana', 'media',
        'mejor', 'meses', 'modo', 'mustn', 'ninguna', 'noche', 'nuevo',
        'nunca', 'pag', 'parece', 'parecia', 'parte', 'partido', 'podido',
        'podria', 'puerta', 'quiere', 'quiero', 're', 'relato', 'sentido',
        'ser', 'seria', 'shan', 'shouldn', 'siempre', 'siguiente', 'sino',
        'solo', 'tambien', 'tarde', 'tiempo', 'tras', 'tres', 've', 'vease',
        'visto', 'wasn', 'weren', 'won', 'wouldn'
    ]
    stop += stop_words_generated_tokens
    ps = SpanishStemmer()

    # Stem each ticket's text and normalize a few domain-specific variants
    a = []
    df["stem"] = "n"
    for i, row in df.iterrows():
        a.append(
            ps.stem(row["text"]).replace('fuerza sino', '').replace(
                'acceder', 'acceso').replace('user', 'usuario').replace(
                    'access', 'acceso').replace('usuarios', 'usuario').replace(
                        'abrio puerta', '').replace('acto seguido', ''))
    df["stem"] = a
    return df, stop
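
# A hypothetical usage sketch (the sample tickets are invented; it assumes the
# module's own imports, pandas, get_stop_words and SpanishStemmer, are
# available): do_stemmer takes a DataFrame with a "text" column and returns it
# with an added "stem" column plus the combined stop word list.
import pandas as pd

sample = pd.DataFrame({"text": ["hola, no puedo acceder al proyecto",
                                "please reset my user password"]})
sample, stop = do_stemmer(sample)
print(sample["stem"].tolist())
print(len(stop))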
#------------------------------

#guardaEnArchivo("OUT_FILES\VocabularioNoStop.txt", textoTokenizadoNoStopWords)

#------------------------------
#grabaEnBD('tokens_sin_numeros', textoTokenizado)
#------------------------------

print("Tokens set sin números ni stopwords -> set(len()): " +
      str(len(set(textoTokenizadoNoStopWords))))

tokensStem = []
spanishStemm = SpanishStemmer(ignore_stopwords=False)
for token in textoTokenizado:
    #stemming = stemmer.stem(token)
    palabraStem = spanishStemm.stem(token)
    tokensStem.append(palabraStem)

#------------------------------
#grabaEnBD('tokens_stem', tokensStem, update=True)
#------------------------------

#nuevosTokens=lemmatizer("lemmatization-es.txt", textoTokenizado)
nuevosTokensLemmas = lemmatizerBD("lemmatization-es.txt",
                                  textoTokenizadoNoStopWords,
                                  'tokens_sin_stopwords_lemmas',
                                  saveToTable=True)
tokensLemmasSinStopW = []

input("Checar bd")
#grabaEnBD('tokens_lemmas', nuevosTokens)