def main():
    text = read_doc()

    text = [unescape(sent) for sent in text]

    from nltk.tokenize.regexp import WhitespaceTokenizer
    ws_tokenizer = WhitespaceTokenizer()
    text = [ws_tokenizer.tokenize(sent) for sent in text if len(sent) > 0]

    text = [[token.lower() for token in sent] for sent in text]

    text = [[
        ''.join(ch for ch in token if ch.isalpha() or ch == '\'')
        for token in sent
    ] for sent in text]

    text = [[token for token in sent if len(token) >= 2 and len(token) <= 35]
            for sent in text]

    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    text = [[token for token in sent if not token in stopwords]
            for sent in text]

    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
    text = [[stemmer.stem(token) for token in sent] for sent in text]

    from sklearn.feature_extraction.text import CountVectorizer
    vect = CountVectorizer(min_df=20, analyzer=lambda x: x)
    X = vect.fit_transform(text)

    #print(X.toarray())
    feature_names = vect.get_feature_names()
    #print(feature_names)

    from collections import Counter
    try:
        # Python 2
        from itertools import izip
    except ImportError:
        # Python 3
        izip = zip
    wfd = Counter(
        {key: value
         for (key, value) in izip(range(X.shape[1]), X.getnnz(0))})

    from itertools import combinations, chain
    bfd = Counter(
        chain.from_iterable(
            [combinations(sorted(segment.tocoo().col), 2) for segment in X]))

    N_seg = len(text)
    scores = [(mutinf(bfd[tup], wfd[tup[0]], wfd[tup[1]], N_seg), tup)
              for tup in bfd]

    print([(tup[0], feature_names[tup[1][0]], feature_names[tup[1][1]])
           for tup in sorted(scores, reverse=True)[:20]])

    pass
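read_doc and unescape are project helpers that are not shown in this listing (unescape is presumably HTML entity unescaping), and the mutinf scoring function is also undefined here. A minimal sketch of mutinf, assuming it computes pointwise mutual information from the bigram segment count, the two unigram segment counts, and the number of segments (the name binding and the PMI formulation are assumptions, not the original code):

from math import log

def mutinf(pair_count, count_a, count_b, n_segments):
    # PMI: log2( P(a,b) / (P(a) * P(b)) ), with probabilities estimated as
    # segment frequencies. Assumes all counts are non-zero.
    p_ab = pair_count / float(n_segments)
    p_a = count_a / float(n_segments)
    p_b = count_b / float(n_segments)
    return log(p_ab / (p_a * p_b), 2)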
Example #2
 def __chunk_sentence(self, sentence):
   """Tokenize the sentence into words using a whitespace parser to avoid parsing couldn't into two tokens (could and n't).
      Then chunk the tokens according to GRAMMAR.
   """
   tokenizer = WhitespaceTokenizer()
   tokens = tokenizer.tokenize(sentence)
   pos_tagged = nltk.pos_tag(tokens)
   return self.parser.parse(pos_tagged)
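GRAMMAR and self.parser are defined elsewhere in the class this method belongs to. A minimal sketch of that setup, assuming a simple noun-phrase chunk grammar (the grammar itself is a placeholder, not the original):

import nltk

# Hypothetical chunk grammar; the original GRAMMAR is not shown in this listing.
GRAMMAR = r"NP: {<DT>?<JJ>*<NN.*>+}"
parser = nltk.RegexpParser(GRAMMAR)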
Example #3
 def processPost(self, post):
     tokenizer = WhitespaceTokenizer()
     if post.text is not None and post.text != "":
         curtext = post.text.encode('utf-8')
         tokens = [word for sent in nltk.sent_tokenize(curtext) for word in tokenizer.tokenize(sent)]
         tokens = self.normalizeTokens(tokens)
         text = nltk.Text(tokens)
         self.processText(post, text)
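normalizeTokens and processText belong to the surrounding class and are not shown here. A minimal sketch of what normalizeTokens might do, mirroring the lower-casing version in the Command class near the end of this page (an assumption, not the original implementation):

    def normalizeTokens(self, tokens):
        # Lower-case every token so later counting is case-insensitive.
        return [token.lower() for token in tokens]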
Example #4
 def __chunk_sentence(self, sentence):
     """Tokenize the sentence into words using a whitespace parser to avoid parsing couldn't into two tokens (could and n't).
    Then chunk the tokens according to GRAMMAR.
 """
     tokenizer = WhitespaceTokenizer()
     tokens = tokenizer.tokenize(sentence)
     pos_tagged = nltk.pos_tag(tokens)
     return self.parser.parse(pos_tagged)
def main():
    text = read_doc()

    text = [unescape(sent) for sent in text]

    from nltk.tokenize.regexp import WhitespaceTokenizer

    ws_tokenizer = WhitespaceTokenizer()
    text = [ws_tokenizer.tokenize(sent) for sent in text if len(sent) > 0]

    text = [[token.lower() for token in sent] for sent in text]

    text = [["".join(ch for ch in token if ch.isalpha() or ch == "'") for token in sent] for sent in text]

    text = [[token for token in sent if len(token) >= 2 and len(token) <= 35] for sent in text]

    from nltk.corpus import stopwords

    stopwords = set(stopwords.words("english"))
    text = [[token for token in sent if not token in stopwords] for sent in text]

    from nltk.stem.snowball import SnowballStemmer

    stemmer = SnowballStemmer("english")
    text = [[stemmer.stem(token) for token in sent] for sent in text]

    from sklearn.feature_extraction.text import CountVectorizer

    vect = CountVectorizer(min_df=20, analyzer=lambda x: x)
    X = vect.fit_transform(text)

    # print(X.toarray())
    feature_names = vect.get_feature_names()
    # print(feature_names)

    from collections import Counter

    try:
        # Python 2
        from itertools import izip
    except ImportError:
        # Python 3
        izip = zip
    wfd = Counter({key: value for (key, value) in izip(range(X.shape[1]), X.getnnz(0))})

    from itertools import combinations, chain

    bfd = Counter(chain.from_iterable([combinations(sorted(segment.tocoo().col), 2) for segment in X]))

    N_seg = len(text)
    scores = [(mutinf(bfd[tup], wfd[tup[0]], wfd[tup[1]], N_seg), tup) for tup in bfd]

    print([(tup[0], feature_names[tup[1][0]], feature_names[tup[1][1]]) for tup in sorted(scores, reverse=True)[:20]])

    pass
class LimparTexto(object):
	def __init__(self):
		self.portugues_stemmer = RSLPStemmer()
		self.tokenizar = WhitespaceTokenizer()
		self.stopwords = stopwords.words('portuguese')
		self.mais_utilizadas = ['ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa', 'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne', 'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra', 'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate']
		self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'), ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'), ('ú', 'u'),
                 ('ç', 'c'), ('ä', 'a'), ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'), ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'), ('Ê', 'e'),
                 ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'), ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')]

	# Remove accents from the text
	def removeAccent(self, text):
		para = text
		for (lat, asc) in self.ascii_replace:
			para = para.replace(lat, asc)
		return para

	# Remove stop words, i.e. words that carry no meaning for our model.
	def removerStopWords(self, texto):		
# The decode is only needed if latin-1 was used in the mining step
		texto = ' '.join([word for word in texto.split() if word.decode('latin-1') not in self.stopwords])
		texto = ' '.join([word for word in texto.split() if word.decode('latin-1') not in self.mais_utilizadas])
#		texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.stopwords])
#		texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.mais_utilizadas])
		return texto

	# Tokenize the words on whitespace
	def tokenizarPalavras(self, texto):
		texto = self.tokenizar.tokenize(texto)
		return texto

	# Punctuation must be removed because a word followed by punctuation differs from the same word without it.
	def removerPontuacao(self, texto):
		regex = re.compile('[%s]' % re.escape(string.punctuation))
		texto = regex.sub('',texto)
		return texto
		
		
	# Remove word suffixes (stemming)
	def removerSufixo(self, para):
		text = ''
		for w in para:
#			text = text + self.portugues_stemmer.stem(w.decode('latin-1')) + ' '
			text = text + self.portugues_stemmer.stem(w) + ' '
		return text
	
	def removerAcentos(self, texto):
		texto = unicode(texto, 'latin-1')
		para = unidecode.unidecode(texto)
		return para

	def removerCaracteresRepetidos(self, texto):
		texto = re.sub(r'([a-z])\1+', r'\1', texto)
		return texto
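A possible way to chain the LimparTexto steps on a single string; the call order is an assumption, and the class relies on module-level imports (nltk, re, string, unidecode) plus the NLTK Portuguese stopwords and RSLP stemmer data being installed:

limpador = LimparTexto()
texto = "Exemplo!!! de textoooo com acentuação, pontuação e repetição"
texto = limpador.removeAccent(texto)
texto = limpador.removerPontuacao(texto)
texto = limpador.removerCaracteresRepetidos(texto)
tokens = limpador.tokenizarPalavras(texto)
print(limpador.removerSufixo(tokens))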
    def _fit(self):
        '''Tokenize the documents, build the forwards and backwards sentence lists,
        and call the _make_dictionary method'''

        tokenizer = WhitespaceTokenizer()
        # Get the sentences from the corpus
        sent_list_of_str = sent_tokenize(self.corpus_txt.lower())
        # Capitalize and save the punctuation from the end
        sent_cap = [(sent.capitalize()[:-1], sent[-1]) for sent in sent_list_of_str]
        # Word tokenize to keep contractions, add back on punc
        self.f_sent = [tokenizer.tokenize(word_tuple[0]) + [word_tuple[1]] for word_tuple in sent_cap]
        # Reverse those sentences
        self.b_sent = [list(reversed(word_list)) for word_list in self.f_sent]
        self.f_dict = self._make_dictionary(self.f_sent)
        self.b_dict = self._make_dictionary(self.b_sent)
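_make_dictionary is not shown in this listing. Judging from how f_dict and b_dict are consumed later on this page (tuple keys of gram_size words mapping to lists of follow-up tuples, with an empty tuple marking the end of a sentence), a rough sketch could look like this; the padding scheme and the gram_size default are assumptions:

    def _make_dictionary(self, sentences, gram_size=2):
        '''Map each gram_size-word tuple to the list of tuples that can follow it.'''
        from collections import defaultdict
        chain = defaultdict(list)
        for sent in sentences:
            # Pad the end with empty tuples so the chain has an explicit stop marker.
            padded = list(sent) + [()] * gram_size
            for i in range(len(sent)):
                key = tuple(padded[i:i + gram_size])
                value = tuple(padded[i + gram_size:i + 2 * gram_size])
                chain[key].append(value)
        return dict(chain)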
Example #8
    def process(self, text):
        """
            предобработка, токенизация по предложениям, удаление дублей.
            выдает список предложений (для векторного метода, на будущее)
            Args:
                text ([type]): [description]
            """

        #text = text.lower()

        # strip numbers, emails and hyperlinks

        #text = text.encode('utf-8')

        text = clear_emails(text)
        text = clear_url(text)
        text = clear_digits(text)
        text = clear_symb(text)

        # split into sentences
        sentence_tokenizer = PunktSentenceTokenizer()
        text = sentence_tokenizer.tokenize(text)

        cleaned_text = []
        stop_words = set(stopwords.words('russian'))

        # split into words; strip leftover punctuation and stopwords
        tokenizer = WhitespaceTokenizer()
        stemmer = SnowballStemmer('russian')

        for sentence in text:
            punct_cleaned_sent = clear_endings(
                sentence)  # strip end-of-sentence punctuation marks
            tokenized_sent = tokenizer.tokenize(
                punct_cleaned_sent)  # split into words, for cleaning only
            stpw_clean_sentence = [
                word for word in tokenized_sent if not word in stop_words
            ]
            stemmed_sentence = [
                stemmer.stem(word) for word in stpw_clean_sentence
            ]  # reduce each word to its stem
            clean_sentence = ' '.join(
                stemmed_sentence
            )  # join back into a sentence string for hashing

            cleaned_text.append(clean_sentence)

        return cleaned_text
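clear_emails, clear_url, clear_digits, clear_symb and clear_endings are project helpers that are not shown here. A rough sketch of regex-based equivalents; the exact patterns are assumptions:

import re

def clear_emails(text):
    return re.sub(r'\S+@\S+', ' ', text)

def clear_url(text):
    return re.sub(r'https?://\S+|www\.\S+', ' ', text)

def clear_digits(text):
    return re.sub(r'\d+', ' ', text)

def clear_symb(text):
    # Drop everything except word characters, whitespace and sentence-final marks,
    # so PunktSentenceTokenizer can still find sentence boundaries.
    return re.sub(r'[^\w\s.!?]', ' ', text)

def clear_endings(text):
    # Strip the sentence-final punctuation inside an already split sentence.
    return re.sub(r'[.!?]+', ' ', text)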
Example #9
def tokenize(s):
    """
    Tokenize string.
    Function to tokenize text into words (tokens). Downloads default NLTK
    tokenizer if not in machine.
    Args:
        - s: string with sentence to tokenize.
    Returns:
        - tokens: list of tuples (token, start-index, end-index)
    """
    text = sub(r"[,.:;'\"]", " ", s)
    tokenizer = Tokenizer()
    spans = tokenizer.span_tokenize(text)
    tokens = tokenizer.tokenize(text)
    tokens = [(t, s[0], s[1]-1) for t, s in zip(tokens, spans)]
    return tokens
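A quick usage check, assuming Tokenizer is an alias for nltk's WhitespaceTokenizer and sub comes from re:

from re import sub
from nltk.tokenize import WhitespaceTokenizer as Tokenizer

print(tokenize("Hello, world: test"))
# [('Hello', 0, 4), ('world', 7, 11), ('test', 14, 17)]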
Example #10
    def analyize(self, text):
        try:
            unitext = any2unicode(text, encoding='utf8', errors='strict')
        except:
            print ("Not utf-8")
            return []
        pass

        #convert to lower
        lowerText = unitext.lower()

        # Regex way: gives some text 'qwe (x)' as 'qwe' '(x)'
        # very aggresive regex...removes puncs and digits..keeps only alphabetic words
        tokenizer = WhitespaceTokenizer()
        regexTokens = tokenizer.tokenize(lowerText)
        p_stemmer = PorterStemmer()
        stemmedTokens = [p_stemmer.stem(i) for i in regexTokens]

        stemmedRemSingleLetterTokens = [w for w in stemmedTokens if len(w)>1]
        return stemmedRemSingleLetterTokens
Example #11
    def process(self, text, plain_text=False):
        """
        предобработка, токенизация по словам,  удаление дублей.
        выдает сплошной (plain) текст, для метода шиндлов или список токенов текста

        Args:
            text ([type]): [description]
        """
        #text = text.encode('utf-8')

        # strip numbers, emails and hyperlinks

        text = clear_emails(text)
        text = clear_url(text)
        text = clear_digits(text)
        text = clear_symb(text)

        # split into words; strip leftover punctuation and stopwords

        stop_words = set(stopwords.words('russian'))
        tokenizer = WhitespaceTokenizer()
        stemmer = SnowballStemmer('russian')

        punct_cleaned_text = clear_endings(
            text)  # strip end-of-sentence punctuation marks
        tokenized_text = tokenizer.tokenize(
            punct_cleaned_text)  # split into words, for cleaning only
        stpw_clean_text = [
            word for word in tokenized_text if not word in stop_words
        ]
        stemmed_text = [stemmer.stem(word) for word in stpw_clean_text
                        ]  # reduce each word to its stem
        clean_text = None
        if plain_text:
            clean_text = ' '.join(
                stemmed_text
            )  # join back into a sentence string for hashing
        else:
            clean_text = stemmed_text  # otherwise return the list of tokens

        return clean_text
Example #12
class TextCleaner(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('english')
        self.ascii_replace = [
            ('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'),
            ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'),
            ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'),
            ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
            ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'),
            ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
            ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')
        ]
        self.link_patterns = [('http'), ('www'), ('w3c')]
        self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                        (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                        (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                        (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                        (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                        (r'fqf', 'ff'), (r'lql', 'll')]

    # Remove consecutively repeated characters so the model is not hurt by inconsistent spelling.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.ascii_replace:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the text.
    def removeStopwords(self, text):
        text = ' '.join([
            word for word in text.split() if word not in self.cached_stopwords
        ])
        return text

    # Remove links from the text.
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite digraphs back to their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite a few words to give the model's results better semantics and readability.
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
Example #13
class NamedEntity(object):
    def __init__(self):
        self.tokenizer = WhitespaceTokenizer()

    # Remove two or more consecutive proper nouns (capitalized words) from the text.
    def removeName(self, text):
        i = 0
        j = 1
        words = text.split()
        lim = len(words) - 1
        while j <= lim:
            if not words[i].isupper() and not words[i].islower():
                if not words[j].isupper() and not words[j].islower():
                    words[i] = words[i].replace(words[i], "")
                    words[j] = words[j].replace(words[j], "")
            i += 1
            j += 1
        words = ' '.join(words)
        return words

    # Remove proper names from the text. The text is split into words, which are then POS-tagged.
    # For each word/tag pair we check whether the tag is the proper-noun tag 'NPROP'; every word
    # tagged 'NPROP' is dropped and the resulting text is returned.
    def removePersonName(self, text):
        final_text = ''
        tokenized_text = self.tokenizeWords(text)
        tagged_text = self.tagWords(tokenized_text)
        for w, t in tagged_text:
            if t != "NPROP":
                final_text = final_text + ''.join(w) + ' '
        return final_text

    # Remove Twitter user mentions, identified by the '@' character. The text is split into words,
    # every word starting with '@' is dropped, and the text is returned without the usernames.
    def removeTwitterUsername(self, text):
        # Filter instead of calling list.remove() while iterating, which skips elements.
        words = [w for w in text.split() if not w.startswith('@')]
        return ' '.join(words)

    # Tag the words of a tokenized sentence with POS tags via the tag method of a UnigramTagger.
    # Returns a list of word/tag pairs.
    def tagWords(self, tokenized_text):
        tagged_words = tagger.tag(tokenized_text)
        return tagged_words

    # Draw a tree highlighting a given grammatical pattern in the text.
    def drawNamedEntityTree(self, text):
        tokenized_text = self.tokenizeWords(text)
        tagged_text = self.tagWords(tokenized_text)
        grammar = "ENT: {<PESSOA>*}"
        cp = RegexpParser(grammar)
        res = cp.parse(tagged_text)
        res.draw()

    # Tokenize sentences into words. Returns the list of words that make up the text.
    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
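The module-level tagger used by tagWords is not shown in this listing. Since 'NPROP' is a tag from the Portuguese mac_morpho tagset, a plausible setup (an assumption, not the original code) is a UnigramTagger trained on that corpus; the 'PESSOA' tag expected by drawNamedEntityTree would require a different, NER-style tagger:

import nltk
from nltk.corpus import mac_morpho

# Requires the corpus data: nltk.download('mac_morpho')
train_sents = mac_morpho.tagged_sents()
tagger = nltk.UnigramTagger(train_sents, backoff=nltk.DefaultTagger('N'))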
def get_social_word_counts(social_var,
                           vocab,
                           comment_file,
                           meta_file,
                           comment_thresh=10):
    """
    Compute unique number of social vars 
    per word in vocab over all comments.
    Parameters:
    -----------
    social_var : str
    vocab : [str]
    Vocabulary to count.
    comment_file : str
    meta_file : str
    Tab-separated metadata file containing comment date, 
    author, thread ID, and subreddit.
    comment_thresh : int
    Minimum number of comments for a social var to be counted.
    Returns:
    --------
    social_var_counts : numpy.array
    """
    # indices in meta file corresponding to social vars
    social_var_indices = {'user': 1, 'subreddit': 3, 'thread': 2}
    social_txt = defaultdict(list)
    tokenizer = WhitespaceTokenizer()
    stopwords = get_default_stopwords()
    ngram_range = (1, 1)
    min_df = 1
    cv = CountVectorizer(encoding='utf-8',
                         lowercase=True,
                         tokenizer=tokenizer.tokenize,
                         stop_words=stopwords,
                         ngram_range=ngram_range,
                         min_df=min_df,
                         vocabulary=vocab,
                         binary=True)
    # keep it simple and store {vocab : {sub : count}}
    social_word_counts = defaultdict(Counter)
    with BZ2File(comment_file, 'r') as comments, BZ2File(meta_file,
                                                         'r') as metas:
        for i, (comment, meta) in enumerate(izip(comments, metas)):
            meta = meta.split('\t')
            social_id = meta[social_var_indices[social_var]]
            # print('got social id %s'%(social_id))
            # social_txt[social_id].append(comment)
            for w in tokenizer.tokenize(comment):
                social_word_counts[w][social_id] += 1
            if (i % 100000 == 0):
                print('processed %d comments' % (i))
            # if(i == 500000):
            #     break
    social_word_counts = {
        w: d
        for w, d in social_word_counts.iteritems() if w in vocab
    }
    social_word_counts = {
        w: {k: v
            for k, v in d.iteritems() if v >= comment_thresh}
        for w, d in social_word_counts.iteritems()
    }
    social_word_counts = {w: len(d) for w, d in social_word_counts.iteritems()}
    social_word_counts = np.array([
        social_word_counts[v] if v in social_word_counts else 0. for v in vocab
    ])

    # old code for constructing word/social dtm
    # restrict to consistent users??
    # social_txt = {k : v for k,v in social_txt.items()
    #               if len(v) >= comment_thresh}
    # # now convert to DTM
    # def get_txt_iter(social_txt):
    #     N = len(social_txt)
    #     for i, v in enumerate(social_txt.itervalues()):
    #         if(i % 1000 == 0):
    #             print('processed %d/%d social vars'%(i, N))
    #         yield ' '.join(v)
    # txt_iter = get_txt_iter(social_txt)
    # # txt_iter = (' '.join(v) for v in social_txt.values())
    # dtm = cv.fit_transform(txt_iter)
    # print('got %s dtm %s'%(social_var, dtm))
    # # save sparse matrix
    # # all_social_vals = social_txt.keys()
    # # vocab = sorted(cv.vocabulary_, key=lambda x: cv.vocabulary_[x])
    # # comment_date = re.findall(r'201[0-9]-[0-9]+', comment_file)[0]
    # # write_full_social_dtm(dtm, all_social_vals, vocab, comment_date, social_var)
    # # save unique social count for each word
    # # combine all counts per word
    # social_word_counts = np.array(dtm.sum(axis=0)).flatten()
    return social_word_counts
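A call sketch with hypothetical file paths and vocabulary; the function is Python 2 (izip, dict.iteritems) and relies on a project-level get_default_stopwords helper:

vocab = ['lol', 'imo', 'tbh']
counts = get_social_word_counts('subreddit', vocab,
                                'comments.bz2',      # hypothetical paths
                                'metadata.tsv.bz2')
print(dict(zip(vocab, counts)))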
Example #15
class TrueCase(object):
    '''True case from a corpus'''

    def __init__(self, fname):
        with open(fname, 'r') as f:
            self.corpus_txt = f.read().decode('utf-8').replace('\n', ' ')
        self.tokenizer = WhitespaceTokenizer()
        self.word_list = self.tokenizer.tokenize(self.corpus_txt)
        self.lower_word_list = [w.lower() for w in self.word_list]
        self.word_dict_count = Counter(self.word_list)

    def truecase(self, sent):
        '''Return a true_cased sentence to look well formatted'''
        if isinstance(sent, basestring):
            sent = self.tokenizer.tokenize(sent)
        output = []
        # If it appears capital more often, use that case
        for word in sent:
            capital = 0
            lower = 0
            all_caps = 0
            try:
                lower += self.word_dict_count[word.lower()]
            except:
                lower += 0
            try:
                capital += self.word_dict_count[word.capitalize()]
            except:
                capital += 0
            try:
                all_caps += self.word_dict_count[word.upper()]
            except:
                all_caps += 0

            # find max of those three options
            idx = np.argsort([all_caps, capital, lower])[-1]

            # If not found in dictionary, find original case
            if (all_caps + capital + lower) == 0:
                try:
                    i = self.lower_word_list.index(word.lower())
                    output.append(self.word_list[i])
                except:
                    try:
                        i = self.lower_word_list.index(word.lower().strip(punctuation))
                        output.append(self.word_list[i])
                    except:
                        output.append(word)
            elif idx == 0:
                output.append(word.upper())
            elif idx == 1:
                output.append(word.capitalize())
            else:
                output.append(word)

        # sometimes sentence delimiters get picked up in the middle of words
        # they should only go at the end
        sent_str = ' '.join([x.strip('!?.') for x in output[:-1]]) + ' ' + output[-1]
        sent_str = sent_str[0].upper() + sent_str[1:]

        return sent_str

    def bulk_truecase(self, list_sent):
        '''Return a list of true_cased strings from an iterable'''
        output = []
        for sent in list_sent:
            output.append(self.truecase(sent))
        return output
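A usage sketch for TrueCase; the corpus file name is hypothetical, and the class as written targets Python 2 (str.decode, basestring):

tc = TrueCase('corpus.txt')   # hypothetical plain-text corpus
print(tc.truecase("where is new york"))
print(tc.bulk_truecase(["first sentence .", "second sentence ."]))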
Example #16
class TextCleaner(object):
    def __init__(self, use_unicode=True):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.pt_stemmer = nltk.stem.RSLPStemmer()
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('portuguese')
        self.symbols = [
            u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
            u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`",
            u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&",
            u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b",
            u"\u2019", u"\u2018", u"\u00b0", u"\u30fb", u"\u00ba", u"\u200b",
            u"\u00b7", u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f",
            u"\u2794", u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf",
            u"\u25a0", u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad",
            u"\u00ab"
        ]
        self.more_stopwords = [
            'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
            'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
            'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
            'vai', 'olha', 'pois', 'rt', 'retweeted', 'fica', 'muito', 'muita',
            'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
        ]
        if use_unicode:
            self.accents = unicode_replace
        else:
            self.accents = ascii_replace
        self.link_patterns = [('http'), ('www'), ('w3c'), ('https')]
        self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'),
                       (r' ir ', '_ir '), (r'bom demal', ' bomdemais '),
                       (r'\s*insan\s*', ' insano '),
                       (r'\s*saudad\s*', ' saudade ')]
        self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                        (r'eqe', 'ee'), (r'oqo', 'oo')]

    # Remove consecutively repeated characters so the model is not hurt
    # by inconsistent spelling.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Remove special characters (e.g. ?, !, " ...).
    def removeSymbols(self, text):
        for symbol in self.symbols:
            text = text.replace(symbol, ' ')
        return text

    # Remove suffixes from Portuguese words (stemming).
    def removeSufPort(self, para):
        para = para.split()
        text = ''
        for w in para:
            text = text + self.pt_stemmer.stem(w) + ' '
        return text

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.accents:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the text.
    def removeStopwords(self, text):
        text = ' '.join([
            word for word in text.split() if word not in self.cached_stopwords
        ])
        text = ' '.join(
            [word for word in text.split() if word not in self.more_stopwords])
        return text

    # Remove links from the text.
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite digraphs back to their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite a few words to give the model's results better semantics and readability.
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
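A possible cleaning pipeline with this class; the ordering is an assumption, and the constructor expects module-level unicode_replace/ascii_replace tables that are not shown in this listing:

cleaner = TextCleaner(use_unicode=True)
texto = u"RT @user olha soooo http://t.co/abc muito bom!!!"
texto = cleaner.removeLinks(texto)
texto = cleaner.removeSymbols(texto)
texto = cleaner.removeAccent(texto)
texto = ' '.join(cleaner.removeRepChar(w) for w in cleaner.tokenizeWords(texto))
texto = cleaner.removeStopwords(texto)
texto = cleaner.removeOneCharacter(texto)
print(cleaner.removeSufPort(texto))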
Example #17
class TextCleaner(object):
    def __init__(self, use_unicode):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('english')
        self.symbols = [
            u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
            u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`",
            u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&",
            u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b\u300b",
            u"\u2019", u"\u2018", u"\u00b0", u"\u00ba", u"\u200b", u"\u00b7",
            u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f", u"\u2794",
            u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf", u"\u25a0",
            u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad", u"\u00ab"
        ]
        if use_unicode:
            self.accents = unicode_replace
        else:
            self.accents = ascii_replace
        self.link_patterns = [('http'), ('www'), ('w3c')]
        self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                        (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                        (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                        (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                        (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                        (r'fqf', 'ff'), (r'lql', 'll')]

    # Remove consecutively repeated characters so the model is not hurt
    # by inconsistent spelling.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Remove special characters (e.g. ?, /, " ...).
    def removeSymbols(self, text):
        for symbol in self.symbols:
            text = text.replace(symbol, ' ')
        return text

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.accents:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the text.
    def removeStopwords(self, text):
        text = ' '.join([
            word for word in text.split() if word not in self.cached_stopwords
        ])
        return text

    # Remove links from the text.
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite digraphs back to their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite a few words to give the model's results better semantics and readability.
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
class MarkovChain(object):
	'''Create a MarkovChain from the given dictionary and parameters,
	run() returns a sentence given a seed

	markov_dict should be a MarkovDict().api dictionary'''

	def __init__(self, markov_dict, priority_list=None, not_found_list=None, neighbor_dict=None):
		self.markov_dict = markov_dict
		self.gtype = self.markov_dict['gtype']
		self.stop_words = set(stopwords.words('english'))
		self.neighbor_dict = neighbor_dict
		self.tokenizer = WhitespaceTokenizer()
		self.word_list = self.tokenizer.tokenize(self.markov_dict['corpus_txt'])
		self.lower_word_list = [w.lower() for w in self.word_list]
		# Count of word freq, maintaining case
		self.word_dict_count = Counter(self.word_list)
		self.truecaser = TrueCase(self.markov_dict['fname'])

		# Create priority and not_found_list if none were entered
		if priority_list:
			self.priority_list = priority_list
		else:
			self._make_priority()
		if not_found_list:
			self.not_found_list = not_found_list
		else:
			self._make_not_found()

	def _make_priority(self, n=10):
		'''Return the n most common words in the corpus'''
		# Remove stop_words
		content = [w for w in self.lower_word_list if w not in self.stop_words]
		# Remove words that are only punctuation
		content_no_punc = []
		for word in content:
			tmp = False
			for char in word:
				if char not in punctuation:
					tmp = True
				else:
					continue
			if tmp:
				content_no_punc.append(word)

		priority_dict = Counter(content_no_punc)
		self.priority_list = [key for key, val in priority_dict.most_common(n)]

	def _make_not_found(self, n=15):
		'''Return the n most common sentences in the corpus'''
		not_found_dict = Counter(sent_tokenize(self.markov_dict['corpus_txt']))
		common_sent = [key for key, val in not_found_dict.most_common(n)]
		self.not_found_list = []
		# Might fill with small stuff, don't let that happen
		for sent in common_sent:
			if len(sent) > 5:
				self.not_found_list.append(sent)

	def _get_input(self, input_phrase):
		'''Take in the raw input from the user'''
		# Lowercase and remove common punc
		input_phrase = input_phrase.lower()
		input_phrase = re.sub('\?', '', input_phrase)
		input_phrase = re.sub('\.', '', input_phrase)
		input_phrase = re.sub(',', '', input_phrase)
		input_phrase = re.sub('!', '', input_phrase)

		# List of words from a potential input phrase
		word_list = input_phrase.split()

		# Make a list of words that are in priority_list
		priority_words = [w for w in word_list if w in self.priority_list]

		# If no priority words, look for non stop words
		content = [w for w in word_list if w not in self.stop_words]

		# Look for priority words first, content second, and finally random
		if priority_words:
			seed = np.random.choice(priority_words)
		elif content:
			seed = np.random.choice(content)
		else:  # Final option is a random word
		    seed = np.random.choice(word_list)

		# if the words is not in text, find neighbors
		if not self._in_text(seed):
			seed = self._get_neighbor(seed)

		return seed


	def _in_text(self, word):
		'''Return true if word is in the corpus'''
		return word.lower() in set(self.lower_word_list)

	def _get_neighbor(self, seed):
		'''Return the nearest neighbor to seed from a database'''
		if not self.neighbor_dict:
			return None

		neighbors = self.neighbor_dict[seed]

		good_neighbors = []
		for word in neighbors:
			if self._in_text(word):  # Only pick a neighbor if in text
				good_neighbors.append(word)
		if good_neighbors:
			return np.random.choice(good_neighbors)
		else:
			return None

	def _generate_key(self, seed, dir_dict):
		'''Return key from a chosen seed'''
		key_list = []
		for key in dir_dict:
			# Look at the last key_gram_size words in the key
			# First word in that key_gram_size len phrase must match seed
			if seed in key[-self.key_gram_size]:
				key_list.append(key)
		return key_list[np.random.choice(len(key_list))]

	def _run_chain(self, seed, dir_dict):
		'''Return a list of words generated from seed
		Iterate through dictionary until a period or capital is reached'''
		key = self._generate_key(seed, dir_dict)
		text = list(key[-self.key_gram_size:])

		# If not end/begin of sent, run
		while True:
			# Values is a list of lists
			values = dir_dict[key]

			# Choose a value with probability equal to distribution in corpus
			value = values[np.random.choice(len(values))]
			if (() in value) | (value == ()): # End condition
				break

			# Add a value_gram_size phrase to the text
			words_from_value = value[:self.value_gram_size]
			text += words_from_value

			# Create new lookup key
			key = tuple(text[-self.markov_dict['gram_size']:])
		return text

	def _get_sentence(self, seed):
		'''Return a sentence given a seed'''
		f_text = self._run_chain(seed, self.markov_dict['f_dict'])
		b_text = self._run_chain(seed, self.markov_dict['b_dict'])

		# b_text is backwards obviously, so turn it around
		b_text = list(reversed(b_text))

		# Only include seed once
		sent = b_text[:-1] + f_text

		return sent

	def _get_sentence_str(self, sent):
		'''Return a string representation of a list'''
		if self.gtype != 'naive':
			sent = [w[0] for w in sent]
		text = ' '.join(sent)

		punc_w_space = [' ' + x for x in punctuation]
		for i in xrange(len(text)-1):
			if text[i:i+2] in punc_w_space:
				text = text[:i] + text[i+1:]
		return text

	def run(self, input_text, key_gram_size=2, value_gram_size=1):
		'''Return a sentence based on gram_size
		Larger gram sizes give more deterministic phrases;
		key_gram_size and value_gram_size cannot be larger than gram_size'''
		self.key_gram_size = min(key_gram_size, self.markov_dict['gram_size'])
		self.value_gram_size = min(value_gram_size, self.markov_dict['gram_size'])
		while self.key_gram_size + self.value_gram_size < self.markov_dict['gram_size']:
			self.value_gram_size += 1

		seed = self._get_input(input_text)
		# If seed not in corpus and no neighbor found, return random sent
		if not seed:
			return np.random.choice(self.not_found_list)
		sent = self._get_sentence(seed)

		# Turn into string for output
		sent_str = self._get_sentence_str(sent)

		# Fix space before punc
		output = self.truecaser.truecase(sent_str)
		return output
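A usage sketch for MarkovChain; MarkovDict is referenced in the class docstring but not shown in this listing, so its constructor signature here is hypothetical:

markov_dict = MarkovDict('corpus.txt', gram_size=2).api   # hypothetical constructor
chain = MarkovChain(markov_dict)
print(chain.run("tell me about the whale", key_gram_size=2, value_gram_size=1))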
Example #19
class Command(BaseCommand):
    args = '<page_id> <method>'
    help = 'Computes graph data for the given page'

    def __init__(self, *args, **kwargs):
        super(Command, self).__init__(*args, **kwargs)
        self._log = logging.getLogger('cmd')

    def handle(self, *args, **options):
        if args is None or len(args) < 1:
            pages = Page.objects.all()
            for page in pages:
                self._log.info("Page #%s: %s" % (page.id, page.fb_page_name))
            raise CommandError('Invalid arguments. Expected: <page_id>')

        page_id = args[0]

        self._log.info('GraphCommand initializing.')

        self._log.info('Page-Id: %s' % page_id)
        page = Page.objects.get(id=page_id)

        self.allTextGraph(page)
        #self.kpGraph(page)
        #self.buildGraph(page)

        self._log.info("All done for now.")

    def getNextIndex(self):
        self.nextFreeIndex = self.nextFreeIndex + 1
        return self.nextFreeIndex - 1

    def allTextGraph(self, page):
        pageowner = page.owner
        pageposts = Post.objects.filter(page__exact=page)

        self.stop_words = None
        self.idfCache = {}

        userterms = {}

        pageusers = User.objects.filter(id__in = pageposts.exclude(createuser__exact=pageowner).values('createuser').distinct() )
        pageusers_count = len(pageusers)
        print "Calculating vectors for %s users" % pageusers_count

        self.nextFreeIndex = 0
        curuseridx = 0
        for currentuser in pageusers:
            curuseridx = curuseridx + 1
            print "tok+tf %s/%s" % (curuseridx, pageusers_count)
            terms = self.getUserTfVector(page, currentuser, pageposts)
            if not terms is None:
                userterms[currentuser.id] = terms
        print "Maximal index: %s" % self.nextFreeIndex

        self.postcount = len(pageposts)
        print "Calculating IDF, posts: %s, terms: %s" % (self.postcount, len(self.idfCache))
        curuseridx = 0
        terms_with_idf = {}
        for user_id in userterms:
            curuseridx = curuseridx + 1
            print "idf %s/%s" % (curuseridx, pageusers_count)
            tokens = self.calculateIdf(userterms[user_id])
            terms_with_idf[user_id] = tokens

        print "tfidf"
        curuseridx = 0
        for user_id in terms_with_idf:
            curuseridx = curuseridx + 1
            print "tfidf %s/%s" % (curuseridx, pageusers_count)
            tokens = self.calculateTfIdf(terms_with_idf[user_id])
            userterms[user_id] = tokens

        del terms_with_idf

        print "Terms: %s" % len(self.idfCache)
        print "Calculating term IDs"
        termIds = self.calculateTermIds(userterms)

        uservectors = self.getUserVectors(userterms, termIds, len(self.idfCache), pageusers_count)
        userswithindex, usermatrix = self.getUserMatrix(uservectors)

        print "Creating graph"
        graph = nx.Graph()

        graph.add_nodes_from(pageusers)
        for i1 in range(usermatrix.shape[0]-1):
            max_edge = None
            max_edge_val = 0.0
            for i2 in range(usermatrix.shape[0]-1):
                if i1 == i2:
                    continue
                u1 = userswithindex[i1]
                u2 = userswithindex[i2]
                u1u2val = usermatrix[i1][i2]
                if u1u2val > max_edge_val:
                    max_edge = u2
                    max_edge_val = u1u2val

            if max_edge_val > 0.0 and not max_edge is None:
                self.add_edge(graph, u1, max_edge)

        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)
        print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) )
        self.removeSingletons(graph)
        print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) )

        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)

        self.deleteClusters(page)

        print "storing"
        cpage = page
        for compidx in range(len(components)-1):
            component = components[compidx]
            newcluster = UserCluster.objects.create(page=cpage)
            newcluster.save()
            tags = {}
            tagcounts = {}
            for user_id in component:
                adduser = pageusers.filter(id__exact=user_id)[0]
                newassoc = UserClusterAssoc.objects.create(cluster = newcluster, clusteruser = adduser)
                print user_id
                newassoc.save()

                for t, tfidf in userterms[user_id]:
                    if not t in tagcounts:
                        tagcounts[t] = 1.0
                    else:
                        tagcounts[t] = tagcounts[t] + 1.0
                    if not t in tags:
                        tags[t] = tfidf
                    else:
                        tags[t] = tags[t] + tfidf
            for t in tags.keys():
                tweight = tags[t] / tagcounts[t]
                print t
                newterm = UserClusterTerm.objects.create(cluster = newcluster, clusterterm = t, termweight = tweight)
                newterm.save()

            print "Component #%s Users: %s Tags (%s): \"%s\"" % (compidx, len(component), len(tags.keys()), ",".join(tags.keys()))

    def deleteClusters(self, page):
        print "cleaning"
        delclusters = 0
        for currentcluster in UserCluster.objects.filter(page__exact=page):
            uca = UserClusterAssoc.objects.filter(cluster__exact=currentcluster)
            uca.delete()
            uct = UserClusterTerm.objects.filter(cluster__exact=currentcluster)
            uct.delete()
            currentcluster.delete()
            delclusters = delclusters + 1
        print "Deleted %s clusters" % delclusters

    def getUserMatrix(self, uservectors):
        userswithindex = uservectors.keys()
        usermatrix = np.zeros([len(userswithindex)+1, len(userswithindex)+1])

        u1idx = 0

        for u1 in userswithindex:
            u2idx = 0
            for u2 in userswithindex:
                u2idx = u2idx + 1
                if u1 == u2:
                    continue

                u1_vec = uservectors[u1][0]
                u2_vec = uservectors[u2][0]
                u1u2dot = np.dot(u1_vec, u2_vec)
                usermatrix[u1idx][u2idx] = u1u2dot

            u1idx = u1idx + 1
            print "matrix %s/%s" % (u1idx, len(userswithindex))
        return (userswithindex, usermatrix)

    def getUserVectors(self, userterms, termIds, vectorlen, pageusers_count):
        uservectors = {}

        curuseridx = 0
        for user_id in userterms.keys():
            curuseridx = curuseridx + 1
            print "vec %s/%s" % (curuseridx, pageusers_count)

            currentvector = [0.0] * vectorlen

            terms = []
            for w, tfidf in userterms[user_id]:
                terms.append(w)
                currentvector[ termIds[w] ] = tfidf

            uservectors[user_id] = (np.array(currentvector), terms)
            #print ", ".join(map(str, currentvector))
            #print ", ".join(terms)

        return uservectors

    def calculateTermIds(self, userterms):
        next_id = 0
        ids = {}
        for user_id in userterms:
            for w, tfidf in userterms[user_id]:
                if not w in ids:
                    ids[w] = next_id
                    next_id = next_id + 1
        return ids

    def getIdf(self, term):
        if term in self.idfCache:
            return float(self.postcount) / self.idfCache[term]

        print "Missing IDF: %s " % term
        exit()

    def getUserTfVector(self, page, currentuser, pageposts):
        tok = {}

        for post in pageposts.filter(createuser__exact=currentuser):
            usertokens = self.getToken(post)
            for w, tf in usertokens:
                if not w in tok:
                    tok[w] = tf
                else:
                    tok[w] = tok[w] + tf

        return [(w, tok[w]) for w in tok]

    def getToken(self, post):
        self.tokenizer = WhitespaceTokenizer()
        if post.text is not None and post.text != "":
            curtext = post.text.encode('utf-8')
            tokens = self.tokenize(curtext)
            tokens = self.normalizeTokens(tokens)
            tokens = self.stripSpecialChars(tokens)
            tokens = self.filterInvalid(tokens)
            tokens = self.calculateTf(tokens)
            return tokens
        return []

    def getTfIdf(self, w, tf, idf, tokens):
        return (tf * idf) / len(tokens)

    def calculateTfIdf(self, tokens):
        return [ (w, self.getTfIdf(w, tf, idf, tokens) ) for w, tf, idf in tokens ]

    # maximum normalized tf
    def calculateTf(self, tokens):
        if len(tokens) == 0:
            return []

        seen = {}
        max_tf = 1.0

        for w in tokens:
            if not w in seen:
                seen[w] = 1.0
                if not w in self.idfCache:
                    self.idfCache[w] = 1.0
                else:
                    self.idfCache[w] = self.idfCache[w] + 1.0
            else:
                seen[w] = seen[w] + 1.0
            if seen[w] > max_tf:
                max_tf = seen[w]

        res = []
        for w in tokens:
            res.append( (w, seen[w] / max_tf) )
        return res

    def calculateIdf(self, tokens):
        return [(w, tf, self.getIdf(w)) for w, tf in tokens]

    def filterInvalid(self, tokens):
        vt = [w for w in tokens if self.isValidTerm(w)]
        if vt is None:
            vt = []
        return vt

    def tokenize(self, curtext):
        return [word for sent in nltk.sent_tokenize(curtext) for word in self.tokenizer.tokenize(sent)]

    def is_number(self, s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def is_stop_word(self, term):
        self.read_stop_words()
        return term in self.stop_words

    def read_stop_words(self):
        if not self.stop_words is None:
            return
        res = {}
        for word in open(os.path.join(settings.STATIC_ROOT, 'stop_words'), 'rt').read().split('\r\n'):
            if not word is None and word != '' and not word in res:
                res[word] = True
        self.stop_words = res

    def isValidTerm(self, term):
        if len(term) < 2:
            return False
        for t in [".", ",", "-", "+", "%", "?", "!", "$", "&", "/", "\"", "'", "`", "`", "|", ":", ";", ")", "(", "[", "]", "{", "}"]:
            if t in term:
                return False
        if self.is_number(term):
            return False
        if self.is_stop_word(term):
            return False

        try:
            term = term.decode('ascii')
        except:
            return False

        if term.find('.') > -1: # or term.find('/') > -1 or term.find("?"): # url parts
            return False
        return True

    def normalizeTokens(self, tokens):
        return [w.lower() for w in tokens]

    def stripSpecialChars(self, tokens):
        return [w.strip("\r\n.,-+%?!$&/\\'`|:;)([]{}\t\" ") for w in tokens]

    def kpGraph(self, page):
        # initialization
        self.nextFreeIndex = 0
        self.tokenIndices = {}
        self.allTerms = []

        pageowner = page.owner
        pageposts = Post.objects.filter(page__exact=page)

        pageusers = User.objects.filter(id__in = pageposts.exclude(createuser__exact=pageowner).values('createuser').distinct() )
        pageusers_count = len(pageusers)
        print "Calculating vectors for %s users" % pageusers_count

        kp_term_method = KeyphraseMethod.objects.get(name='pos_sequence')

        userterms = {}

        curuseridx = 0
        for currentuser in pageusers:
            curuseridx = curuseridx + 1
            print "%s/%s" % (curuseridx, pageusers_count)
            (terms, ids) = self.getUserVector(page, currentuser, kp_term_method)
            if not terms is None:
                userterms[currentuser.id] = (terms, ids)

        print "Maximal index: %s" % self.nextFreeIndex

        uservectors = {}
        vectorlen = self.nextFreeIndex
        for currentuser in userterms.keys():
            terms, ids = userterms[currentuser]
            currentvector = [0.0] * vectorlen

            for i in range(len(ids)-1):
                currentvector[ids[i]] = 1.0

            uservectors[currentuser] = (np.array(currentvector), terms)
            #print ", ".join(map(str, currentvector))
            #print ", ".join(self.allTerms)

        userswithindex = uservectors.keys()
        usermatrix = np.zeros([len(userswithindex)+1, len(userswithindex)+1])

        u1idx = 0

        for u1 in userswithindex:
            u2idx = 0
            for u2 in userswithindex:
                u2idx = u2idx + 1
                if u1 == u2:
                    continue

                u1_vec = uservectors[u1][0]
                u2_vec = uservectors[u2][0]
                u1u2dot = np.dot(u1_vec, u2_vec)
                usermatrix[u1idx][u2idx] = u1u2dot

            u1idx = u1idx + 1
            print "%s/%s" % (u1idx, len(userswithindex))

        print "Creating graph"
        graph = nx.Graph()

        graph.add_nodes_from(pageusers)
        for i1 in range(usermatrix.shape[0]-1):
            max_edge = None
            max_edge_val = 0.0
            for i2 in range(usermatrix.shape[0]-1):
                if i1 == i2:
                    continue
                u1 = userswithindex[i1]
                u2 = userswithindex[i2]
                u1u2val = usermatrix[i1][i2]
                if u1u2val > max_edge_val:
                    max_edge = u2
                    max_edge_val = u1u2val

            if max_edge_val > 0.0 and not max_edge is None:
                self.add_edge(graph, u1, max_edge)

        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)
        print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) )
        self.removeSingletons(graph)
        print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) )

        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)

        for compidx in range(len(components)-1):
            component = components[compidx]
            taglist = []
            for user_id in component:
                ut = userterms[user_id][0]
                for t in ut:
                    if not t in taglist:
                        taglist.append(t)

            print "Component #%s Users: %s Tags (%s): \"%s\"" % (compidx, len(component), len(taglist), ",".join(taglist))




        return

    def getIndex(self, token):
        if not token in self.tokenIndices:
            self.allTerms.append(token)
            self.tokenIndices[token] = self.getNextIndex()
        return self.tokenIndices[token]

    def getUserVector(self, page, currentuser, kp_term_method):
        user_posts = Post.objects.filter(page__exact=page, createuser__exact=currentuser)
        user_post_parents = Post.objects.filter(id__in=user_posts.values('parent').distinct())

        user_kps = PostKeyphraseAssoc.objects.filter(post__in = user_posts, keyphrase__method__exact=kp_term_method)
        user_kp_count = len(user_kps)

        terms_all = []
        terms_split = []
        terms_n = user_kps.values('keyphrase__normalized').distinct()
        terms_t = user_kps.values('keyphrase__term').distinct()

        for term in terms_n:
            t = term['keyphrase__normalized']
            if not t in terms_all:
                terms_all.append(t)

        for term in terms_t:
            t = term['keyphrase__term']
            if not t in terms_all:
                terms_all.append(t)

        for term in terms_all:
            for term_part in term.split(" "):
                if not term_part in terms_split:
                   terms_split.append(term_part)

        terms_all = terms_split

        #if (len(terms_all) > 0):
        #    for thread_post in user_post_parents:
        #        terms_all.append("POST%s" % (thread_post.id))

        print "User: %s Posts: %s Keyphrases: %s" % ( currentuser, len(user_posts), user_kp_count )
        print "Terms: %s" % ", ".join(terms_all)

        if user_kp_count == 0:
            return (None, None)

        res_terms = []
        res_ids = []
        for term in terms_all:
            term_idx = self.getIndex(term)
            res_terms.append(term)
            res_ids.append(term_idx)

        return (res_terms, res_ids)

    def add_edge(self, graph, obj_from, obj_to, add_weight=1.0):
        if not graph.has_edge(obj_from, obj_to):
            graph.add_edge(obj_from, obj_to, weight=add_weight)
        else:
            graph[obj_from][obj_to]['weight'] = graph[obj_from][obj_to]['weight'] + add_weight

    def addPostUser(self, graph, post, added_users):
        if not post.createuser in graph:
            graph.add_node(post.createuser)
            added_users.append(post.createuser)
        # edge: post -> createuser
        self.add_edge(graph, post, post.createuser)

    def addPostParent(self, graph, post):
        if not post.parent is None:
            if not post.parent in graph:
                graph.add_node(post.parent)
                self.add_edge(graph, post, post.parent)

    def addPostKeyPhrases(self, graph, post):
        # keyphrases in this post
        for pk in PostKeyphraseAssoc.objects.filter(post__exact=post):
            graph.add_node(pk.keyphrase)
            self.add_edge(graph, post, pk.keyphrase)

    def addUserMetaCategory(self, graph, user):
        metaentries = UserMeta.objects.filter(user__exact=user)
        for metaentry in metaentries:
            if metaentry is None:
                continue
            if metaentry.fb_category is None or metaentry.fb_category == '':
                continue
            nodeval = u'CAT_' + unicode(metaentry.fb_category)
            graph.add_node(nodeval)
            self.add_edge(graph, user, nodeval)

    def addUserMeta(self, graph, user):
        metaentries = UserMeta.objects.filter(user__exact=user)
        for metaentry in metaentries:
            if metaentry is None:
                continue
            nodeval = unicode(metaentry)
            graph.add_node(nodeval)
            self.add_edge(graph, user, nodeval)

    def removeNonConnectedUsers(self, graph, dist_threshold):
        components = list(nx.connected_components(graph))
        print "Number of connected components: %s" % len(components)

        print "Removing non-connected user nodes"
        remove_nodes = []
        for component in components:
            usernodes = []
            userdists = {}
            for node in component:
                if type(node) == User:
                    usernodes.append(node)
            u1idx = 0
            ulen = len(usernodes)
            for u1 in usernodes:
                u1idx = u1idx + 1
                print "%s/%s" % (u1idx, ulen)
                if not u1.id in userdists:
                    userdists[u1.id] = 1000  # large initial value, acts as "infinity"
                for u2 in usernodes:
                    if u1 == u2:
                        continue
                    pathres = nx.dijkstra_path_length(graph, u1, u2)
                    if pathres < userdists[u1.id]:
                        userdists[u1.id] = pathres
                    if userdists[u1.id] < dist_threshold:
                        break # condition satisfied
            for user in usernodes:
                if userdists[user.id] > dist_threshold: # shortest path to every other user exceeds the threshold -> remove
                    print "Removing user %s. Dist value: %s" % (user.id, userdists[user.id])
                    remove_nodes.append(user)
        print "Removing %s user nodes" % len(remove_nodes)
        graph.remove_nodes_from(remove_nodes)
        del remove_nodes

    def removeSingletons(self, graph):
        print "Removing singletons"
        singleton_nodes = [ n for n,d in graph.degree_iter() if d==0 ]
        graph.remove_nodes_from(singleton_nodes)
        del singleton_nodes


    def buildGraph(self, page):
        print "Building graph"
        pageowner = page.owner
        pageposts = Post.objects.filter(page__exact=page)

        graph = nx.Graph()

        #pageposts = pageposts[500:700] ##########################################

        print "nodes: posts"
        graph.add_nodes_from(pageposts)

        print "edges: user -> post"
        added_users = []

        for post in pageposts:
            # post.createuser
            self.addPostUser(graph, post, added_users)

            # post->parent post
            self.addPostParent(graph, post)

            # post->postkeyphraseassoc->keyphrase
            self.addPostKeyPhrases(graph, post)

            # post.createuser->usermeta
            #self.addUserMeta(graph, post.createuser)
            #self.addUserMetaCategory(graph, post.createuser)

        print "Graph nodes: %s" % len(graph.nodes())
        print "Graph edges: %s" % len(graph.edges())

        print "Removing page owner"
        graph.remove_node(pageowner)

        print "Graph nodes: %s" % len(graph.nodes())
        print "Graph edges: %s" % len(graph.edges())


        self.removeSingletons(graph)

        components = list(nx.connected_components(graph))
        print "Number of connected components: %s" % len(components)

        print "Removing components with only 0/1 user nodes"
        remove_components = []
        for component in components:
            usercount = 0
            for node in component:
                if type(node) == User:
                    usercount = usercount + 1
            if usercount <= 1:
                remove_components.append(component)
            else:
                print "Found %s user nodes" % usercount
        print "Removing %s components" % len(remove_components)
        for component in remove_components:
            graph.remove_nodes_from(component)
        del remove_components

        components = list(nx.connected_components(graph))
        print "Number of connected components: %s" % len(components)

        print "Edges: %s" % len(graph.edges())
        remove_edges = []
        weight_threshold = 2.0
        for node_a, node_b, attr in sorted(graph.edges(data = True), key = lambda (a, b, attr): attr['weight']):
            if type(node_a) == Post or type(node_b) == Post: # exclude post connections
                continue
            if 'weight' in attr and attr['weight'] > weight_threshold:
                break
            remove_edges.append((node_a, node_b))
            #print('{a} {b} {w}'.format(a = node_a, b = node_b, w = attr['weight']))
        for node_a, node_b in remove_edges:
            graph.remove_edge(node_a, node_b)
        print "Edges: %s" % len(graph.edges())

        self.removeSingletons(graph)

        print "Graph dotfile"
        nx.write_dot(graph, '/home/double/graph_viz.dot')


        tmp = []
        for user in added_users:
            if user in graph:
                tmp.append(user)
        added_users = tmp
        print "Unique users in graph: %s" % len(added_users)

        usergraph = nx.Graph()
        usergraph.add_nodes_from(added_users)
        for user_a, user_b in combinations(added_users, 2):
            try:
                userpath = nx.shortest_path_length(graph, user_a, user_b, weight='weight')
                usergraph.add_edge(user_a, user_b, weight=userpath)
                print user_a, user_b, userpath
            except nx.NetworkXNoPath, e:
                #print e
                continue

        self.removeSingletons(usergraph)

        #print "Drawing graph"
        plt.ioff()

        #nx.draw(graph, node_size=10, font_size=8)
        #plt.savefig('/home/double/graph.png', dpi=1000)

        print "UserGraph nodes: %s" % len(usergraph.nodes())
        print "UserGraph edges: %s" % len(usergraph.edges())


        return
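
The buildGraph method above relies on Django model instances (Post, User, Keyphrase) as graph nodes and is therefore hard to run in isolation. The following minimal sketch illustrates only the final user-to-user projection step with plain networkx and made-up string nodes; the node names, weights and the expected result are illustrative assumptions, not values taken from the original code.

import networkx as nx
from itertools import combinations

graph = nx.Graph()
# user -> post and post -> keyphrase edges, mirroring the structure built in buildGraph
graph.add_edge('user_a', 'post_1', weight=1.0)
graph.add_edge('user_b', 'post_2', weight=1.0)
graph.add_edge('post_1', 'kp_python', weight=1.0)
graph.add_edge('post_2', 'kp_python', weight=1.0)
graph.add_edge('user_c', 'post_3', weight=1.0)  # user_c shares no keyphrase with the others

users = ['user_a', 'user_b', 'user_c']
usergraph = nx.Graph()
usergraph.add_nodes_from(users)
for u, v in combinations(users, 2):
    try:
        dist = nx.shortest_path_length(graph, u, v, weight='weight')
        usergraph.add_edge(u, v, weight=dist)
    except nx.NetworkXNoPath:
        continue  # unreachable pairs simply get no edge

print(usergraph.edges(data=True))
# only user_a and user_b end up connected, via post_1 -> kp_python -> post_2, with weight 4.0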
Example #20
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer


class TextCleaner(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.pt_stemmer = nltk.stem.RSLPStemmer()
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('portuguese')
        self.more_stopwords = [
            'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
            'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
            'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
            'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos',
            'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
        ]
        self.ascii_replace = [
            ('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'),
            ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'),
            ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'),
            ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
            ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'),
            ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
            ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')
        ]
        self.link_patterns = ['http', 'www', 'w3c']
        self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'),
                       (r' ir ', '_ir '), (r'bom demal', ' bomdemais '),
                       (r'\s*insan\s*', ' insano '),
                       (r'\s*saudad\s*', ' saudade ')]
        self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                        (r'eqe', 'ee'), (r'oqo', 'oo')]

    # Remove consecutively repeated characters, so the model is not hurt by inconsistent spelling.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Strip suffixes from Portuguese words (stemming).
    def removeSufPort(self, para):
        para = para.split()
        text = ''
        for w in para:
            text = text + self.pt_stemmer.stem(w) + ' '
        return text

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.ascii_replace:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the text.
    def removeStopwords(self, text):
        text = ' '.join([
            word for word in text.split() if word not in self.cached_stopwords
        ])
        text = ' '.join(
            [word for word in text.split() if word not in self.more_stopwords])
        return text

    # Remove links from the text (everything from the first link marker onward is dropped).
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite the digraphs in their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite some words to give better semantics and readability to the model's results.
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    # Drop very short tokens (two characters or fewer), despite the method name suggesting single characters.
    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
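
As a quick illustration of how these cleaning steps might be chained, the helper below runs a raw sentence through the class. The clean() function, the pipeline order and the sample sentence are assumptions for demonstration; the class itself does not prescribe an order.

# Hypothetical usage sketch (requires the NLTK 'stopwords' and 'rslp' data packages).
cleaner = TextCleaner()

def clean(raw_text):
    text = cleaner.removeLinks(raw_text.lower())   # drop links and everything after them
    text = cleaner.removeAccent(text)              # 'não' -> 'nao'
    text = ' '.join(cleaner.removeRepChar(w)       # 'muitooo' -> 'muito'
                    for w in cleaner.tokenizeWords(text))
    text = cleaner.normalizeText(text)             # e.g. keep negation attached as 'nao_...'
    text = cleaner.normalizeDigraph(text)
    text = cleaner.removeStopwords(text)
    text = cleaner.removeOneCharacter(text)
    return cleaner.removeSufPort(text).strip()

print(clean(u'Muitooo bom, não gostei do link http://example.com'))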