Example #1
def corpus_statistics():
    #train_corpus_path = "/userstore/jieg/credbank/corpus/credbank_train_corpus.txt"
    train_corpus_path = "C:\\Data\\credbank\\tweets_corpus\\shuffled_credbank_held_corpus.txt"
    with open(train_corpus_path, mode='r', encoding='utf-8') as file:
        train_corpus = file.readlines()

    from nltk.tokenize.regexp import WhitespaceTokenizer
    whitespace_tokenize = WhitespaceTokenizer().tokenize
    corpus_size = 0
    for tweet in train_corpus:
        tokens = whitespace_tokenize(tweet)
        corpus_size += len(tokens)

    print("all words (corpus size): ", corpus_size)

    from sklearn.feature_extraction.text import CountVectorizer

    #extract tokens
    text_vectorizer = CountVectorizer(analyzer='word',
                                      tokenizer=WhitespaceTokenizer().tokenize,
                                      ngram_range=(1, 1),
                                      min_df=1)
    X = text_vectorizer.fit_transform(train_corpus)
    # Vocabulary
    vocab = list(text_vectorizer.get_feature_names())
    print("vocabulary size: ", len(vocab))  # 913611
    counts = X.sum(axis=0).A1

    from collections import Counter
    freq_distribution = Counter(dict(zip(vocab, counts)))

    print("top N frequent words: ", freq_distribution.most_common(10))
Example #2
    def evaluateclassifier(self, featureselection):
        positivecount=0
        negativecount=0
        negativetweets = []
        positivetweets = []
        #print 'Evaluating Classifier'
        print featureselection
        with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv', 'rb') as f:
            #print 'Opening corpus file'
            reader = csv.reader(f)
            for row in reader:
                #Positive sentiment tweets
                if(row[0] == '4' and positivecount < self.corpuslength):
                    positivetweets.append(row[5])        
                    positivecount+=1        
                #Negative sentiment tweets
                if(row[0] == '0' and negativecount < self.corpuslength):
                    negativetweets.append(row[5])
                    negativecount+=1
        
        #print 'Generating Features' 
        self.positivefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'pos') for tweet in positivetweets]
        self.negativefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'neg') for tweet in negativetweets]
        
        poscutoff = len(self.positivefeatures)
        negcutoff = len(self.negativefeatures)
        print "Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff)
        trainfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]
        
        testfeats = self.test(featureselection) 
        #testfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]       
        print 'Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
        classifier = NaiveBayesClassifier.train(trainfeats)        
        print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
        
        #classifier.show_most_informative_features(20)
        
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set) 
        
        for i, (feats, label) in enumerate(testfeats):    
            refsets[label].add(i)    
            observed = classifier.classify(feats)  
            #print label, observed  
            testsets[observed].add(i)

        print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
        print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
        print 'pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
        print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
        print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
        print 'neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
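
The refsets/testsets bookkeeping above feeds nltk.metrics directly; a minimal, self-contained sketch with toy gold and predicted labels (not the Sentiment140 data used above) looks like this:

import collections
import nltk.metrics

# refsets holds gold-label indices, testsets holds predicted-label indices,
# exactly as in the loop above, but with toy labels.
gold = ['pos', 'pos', 'neg', 'neg']
predicted = ['pos', 'neg', 'neg', 'neg']
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (g, p) in enumerate(zip(gold, predicted)):
    refsets[g].add(i)
    testsets[p].add(i)

print('pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']))  # 1.0
print('pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']))        # 0.5
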
def main():
    text = read_doc()

    text = [unescape(sent) for sent in text]

    from nltk.tokenize.regexp import WhitespaceTokenizer
    ws_tokenizer = WhitespaceTokenizer()
    text = [ws_tokenizer.tokenize(sent) for sent in text if len(sent) > 0]

    text = [[token.lower() for token in sent] for sent in text]

    text = [[
        ''.join(ch for ch in token if ch.isalpha() or ch == '\'')
        for token in sent
    ] for sent in text]

    text = [[token for token in sent if len(token) >= 2 and len(token) <= 35]
            for sent in text]

    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    text = [[token for token in sent if not token in stopwords]
            for sent in text]

    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
    text = [[stemmer.stem(token) for token in sent] for sent in text]

    from sklearn.feature_extraction.text import CountVectorizer
    vect = CountVectorizer(min_df=20, analyzer=lambda x: x)
    X = vect.fit_transform(text)

    #print(X.toarray())
    feature_names = vect.get_feature_names()
    #print(feature_names)

    from collections import Counter
    try:
        # Python 2
        from itertools import izip
    except ImportError:
        # Python 3
        izip = zip
    wfd = Counter(
        {key: value
         for (key, value) in izip(range(X.shape[1]), X.getnnz(0))})

    from itertools import combinations, chain
    bfd = Counter(
        chain.from_iterable(
            [combinations(sorted(segment.tocoo().col), 2) for segment in X]))

    N_seg = len(text)
    scores = [(mutinf(bfd[tup], wfd[tup[0]], wfd[tup[1]], N_seg), tup)
              for tup in bfd]

    print([(tup[0], feature_names[tup[1][0]], feature_names[tup[1][1]])
           for tup in sorted(scores, reverse=True)[:20]])

    pass
Example #4
 def __init__(self,
              data_iterator,
              tokenizer=WhitespaceTokenizer(),
              char_map=None,
              word_len=30,
              sent_len=200):
     '''
     DESCRIPTIONS:
         This class converts text to numbers for the standard unicode vocabulary
         size.
     PARAMS:
         data_iterator (iterator): iterator to iterates the text strings
         word_len (int): maximum length of the word, any word of length less
             than that will be padded with zeros, any word of length more than
             that will be cut at max word length.
         sent_len (int): maximum number of words in a sentence, any sentence
             with less number of words than that will be padded with zeros,
             any sentence with more words than the max number will be cut at
             the max sentence length.
         char_map (dict): a dictionary for mapping characters to numbers.
     '''
     self.data_iterator = data_iterator
     self.word_len = word_len
     self.sent_len = sent_len
     self.char_map = char_map
     self.tokenizer = tokenizer
     self.char_zero = ' '  # character to be assigned the zero index
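
The padding and truncation rules described in the docstring can be sketched independently of the class; encode_sentence below is a hypothetical helper (not part of the original code) that cuts or zero-pads each word to word_len and each sentence to sent_len using an assumed char_map.

from nltk.tokenize.regexp import WhitespaceTokenizer

def encode_sentence(sentence, char_map, word_len=30, sent_len=200):
    # Tokenize on whitespace, as the class above does by default.
    tokens = WhitespaceTokenizer().tokenize(sentence)[:sent_len]
    encoded = []
    for word in tokens:
        # Map characters to integers; unmapped characters fall back to 0.
        ids = [char_map.get(ch, 0) for ch in word[:word_len]]
        # Pad short words with zeros up to word_len.
        ids += [0] * (word_len - len(ids))
        encoded.append(ids)
    # Pad short sentences with all-zero "words" up to sent_len.
    encoded += [[0] * word_len] * (sent_len - len(encoded))
    return encoded

char_map = {ch: i + 1 for i, ch in enumerate('abcdefghijklmnopqrstuvwxyz')}
print(len(encode_sentence("a toy sentence", char_map)))  # 200 word slots
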
Example #5
 def __init__(self, use_unicode=True):
     self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
     self.repl = r'\1\2\3'
     self.pt_stemmer = nltk.stem.RSLPStemmer()
     self.tokenizer = WhitespaceTokenizer()
     self.cached_stopwords = stopwords.words('portuguese')
     self.symbols = [
         u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
         u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`",
         u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&",
         u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b",
         u"\u2019", u"\u2018", u"\u00b0", u"\u30fb", u"\u00ba", u"\u200b",
         u"\u00b7", u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f",
         u"\u2794", u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf",
         u"\u25a0", u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad",
         u"\u00ab"
     ]
     self.more_stopwords = [
         'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
         'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
         'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
         'vai', 'olha', 'pois', 'rt', 'retweeted', 'fica', 'muito', 'muita',
         'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
     ]
     if use_unicode:
         self.accents = unicode_replace
     else:
         self.accents = ascii_replace
     self.link_patterns = [('http'), ('www'), ('w3c'), ('https')]
     self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'),
                    (r' ir ', '_ir '), (r'bom demal', ' bomdemais '),
                    (r'\s*insan\s*', ' insano '),
                    (r'\s*saudad\s*', ' saudade ')]
     self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                     (r'eqe', 'ee'), (r'oqo', 'oo')]
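
The repeat_regexp/repl pair stored above is the usual recipe for collapsing repeated characters; below is a standalone sketch of how it is typically applied (the recursive wrapper is not shown in the original class, and real implementations often consult WordNet so that legitimate doubles such as "assim" survive).

import re

repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
repl = r'\1\2\3'

def replace_repeats(word):
    # Each substitution drops one character from a repeated pair;
    # recurse until the word stops changing.
    collapsed = repeat_regexp.sub(repl, word)
    return replace_repeats(collapsed) if collapsed != word else word

print(replace_repeats('amooooor'))  # amor
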
Example #6
 def __init__(self, use_unicode):
     self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
     self.repl = r'\1\2\3'
     self.tokenizer = WhitespaceTokenizer()
     self.cached_stopwords = stopwords.words('english')
     self.symbols = [
         u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
         u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`",
         u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&",
         u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b\u300b",
         u"\u2019", u"\u2018", u"\u00b0", u"\u00ba", u"\u200b", u"\u00b7",
         u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f", u"\u2794",
         u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf", u"\u25a0",
         u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad", u"\u00ab"
     ]
     if use_unicode:
         self.accents = unicode_replace
     else:
         self.accents = ascii_replace
     self.link_patterns = [('http'), ('www'), ('w3c')]
     self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                     (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                     (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                     (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                     (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                     (r'fqf', 'ff'), (r'lql', 'll')]
Example #7
def get_ngram_counts(comment_iter, n, tokenizer=None, sample_pct=100):
    """
    Compute ngram counts from comments.
    
    Parameters:
    -----------
    comment_iter : generator
    n : int
    tokenizer : nltk.tokenize.Tokenizer
    sample_pct : float
    Optional percentage from which to subsample the data.
    
    Returns:
    --------
    counts : pandas.DataFrame
    Rows = ngrams, col = counts.
    """
    if (tokenizer is None):
        tokenizer = WhitespaceTokenizer()
    counts = Counter()
    for i, c in enumerate(comment_iter):
        if (sample_pct == 100 or random.random() * 100 < sample_pct):
            ngrams = ngram_split(c, n, tokenizer)
            for ngram in ngrams:
                ngram = [' '.join(ngram)]
                counts.update(ngram)
        if (i % 1000000 == 0):
            print('got %d unique ngrams' % (len(counts)))
    # convert to dataframe
    counts = pd.DataFrame(pd.Series(counts))
    return counts
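
ngram_split is defined elsewhere in that project, so the call above cannot be reproduced verbatim here; a self-contained sketch of the same counting idea, substituting nltk.util.ngrams, might look like this:

from collections import Counter
from nltk.tokenize.regexp import WhitespaceTokenizer
from nltk.util import ngrams
import pandas as pd

comments = ["the cat sat on the mat", "the cat sat down"]
tokenizer = WhitespaceTokenizer()

counts = Counter()
for comment in comments:
    # Join each ngram tuple into a single space-separated key, as above.
    counts.update(' '.join(gram) for gram in ngrams(tokenizer.tokenize(comment), 2))

counts = pd.DataFrame(pd.Series(counts))
print(counts.sort_values(0, ascending=False).head())
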
Example #8
 def __init__(self):
     self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
     self.repl = r'\1\2\3'
     self.pt_stemmer = nltk.stem.RSLPStemmer()
     self.tokenizer = WhitespaceTokenizer()
     self.cached_stopwords = stopwords.words('portuguese')
     self.more_stopwords = [
         'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
         'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
         'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
         'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos',
         'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
     ]
     self.ascii_replace = [
         ('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'),
         ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'),
         ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'),
         ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
         ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'),
         ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
         ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')
     ]
     self.link_patterns = [('http'), ('www'), ('w3c')]
     self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'),
                    (r' ir ', '_ir '), (r'bom demal', ' bomdemais '),
                    (r'\s*insan\s*', ' insano '),
                    (r'\s*saudad\s*', ' saudade ')]
     self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                     (r'eqe', 'ee'), (r'oqo', 'oo')]
Example #9
 def __chunk_sentence(self, sentence):
     """Tokenize the sentence into words using a whitespace parser to avoid parsing couldn't into two tokens (could and n't).
    Then chunk the tokens according to GRAMMAR.
 """
     tokenizer = WhitespaceTokenizer()
     tokens = tokenizer.tokenize(sentence)
     pos_tagged = nltk.pos_tag(tokens)
     return self.parser.parse(pos_tagged)
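
The contrast the docstring refers to is easy to check outside the class; a word-level tokenizer such as the Treebank tokenizer splits the contraction, while WhitespaceTokenizer keeps it intact:

from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize.regexp import WhitespaceTokenizer

sentence = "She couldn't attend the meeting."
print(WhitespaceTokenizer().tokenize(sentence))
# ['She', "couldn't", 'attend', 'the', 'meeting.']
print(TreebankWordTokenizer().tokenize(sentence))
# ['She', 'could', "n't", 'attend', 'the', 'meeting', '.']
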
Example #10
 def build_topn_best_words(self):
     word_fd = FreqDist()
     label_word_fd = ConditionalFreqDist()
     positivecount = 0
     negativecount = 0
     with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv', 'rb') as f:
         reader = csv.reader(f)
         for row in reader:
                 #Positive sentiment tweets
                 if(row[0] == '4' and positivecount < self.corpuslength):
                     tweet = row[5]
                     tokens = WhitespaceTokenizer().tokenize(tweet)
                     #print tweet
                     for token in tokens:                        
                         word_fd.inc(token.lower())    
                         label_word_fd['pos'].inc(token.lower()) 
                     positivecount+=1
                 #Negative sentiment tweets
                 if(row[0] == '0' and negativecount < self.corpuslength):
                     tweet = row[5]
                     tokens = WhitespaceTokenizer().tokenize(tweet)
                     #print tweet
                     for token in tokens:     
                         word_fd.inc(token.lower())    
                         label_word_fd['neg'].inc(token.lower())
                     negativecount+=1
                     
     #print word_fd
     #print label_word_fd
     
     pos_word_count = label_word_fd['pos'].N()
     neg_word_count = label_word_fd['neg'].N()
     total_word_count = pos_word_count + neg_word_count
     print "Positive Word Count:", pos_word_count, "Negative Word Count:", neg_word_count, "Total Word count:", total_word_count
     
     word_scores = {}
     for word, freq in word_fd.iteritems():    
         pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)    
         neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)    
         word_scores[word] = pos_score + neg_score
         
     best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
     self.bestwords = set([w for w, s in best])        
     print 'Best Words Count:', len(self.bestwords)#, 'Best Words Set:', self.bestwords
Example #11
 def test(self, featureselection):
     positiveTweets = [] 
     negativeTweets = []
     with open(r'..\polarityData\TweetCorpus\testdata.manual.2009.06.14.csv', 'rb') as f:
         reader = csv.reader(f)
         for row in reader:
             #Positive sentiment tweets
             if(row[0] == '4'):
                 positiveTweets.append(utils.common.processTweetBlank(row[5]))          
             #Negative sentiment tweets
             if(row[0] == '0'):
                 negativeTweets.append(utils.common.processTweetBlank(row[5]))
         
     positiveTestFeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'pos') for tweet in positiveTweets]
     negativeTestFeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'neg') for tweet in negativeTweets]
     
     poscutoff = len(positiveTestFeatures)
     negcutoff = len(negativeTestFeatures)
     print "Test Pos Cutoff: " + str(poscutoff) + " Test Neg Cutoff: " + str(negcutoff)
     testfeatures = positiveTestFeatures[:poscutoff] + negativeTestFeatures[:negcutoff]
     #print testfeatures
     return (testfeatures)
Example #12
 def __init__(self, tokenizer=WhitespaceTokenizer(), sent_len=200):
     self.sent_len = sent_len
     self.tokenizer = tokenizer
     self.w2v_dim = 300
     this_dir = os.path.dirname(os.path.realpath(__file__))
     model_dir = this_dir + '/model'
     if not os.path.exists(model_dir):
         os.makedirs(model_dir)
     pretrained_path = model_dir + '/GoogleNews-vectors-negative300.bin.gz'
     if not os.path.exists(pretrained_path):
         raise Exception('pretrained vector file does not exist: {}'.format(pretrained_path))
     print('..loading model')
     self.model = gensim.models.KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
Example #13
    def process(self, text):
        """
            предобработка, токенизация по предложениям, удаление дублей.
            выдает список предложений (для векторного метода, на будущее)
            Args:
                text ([type]): [description]
            """

        #text = text.lower()

        # remove numbers, emails, and hyperlinks

        #text = text.encode('utf-8')

        text = clear_emails(text)
        text = clear_url(text)
        text = clear_digits(text)
        text = clear_symb(text)

        # split the text into sentences
        sentence_tokenizer = PunktSentenceTokenizer()
        text = sentence_tokenizer.tokenize(text)

        cleaned_text = []
        stop_words = set(stopwords.words('russian'))

        # split into words, strip leftover punctuation and stopwords
        tokenizer = WhitespaceTokenizer()
        stemmer = SnowballStemmer('russian')

        for sentence in text:
            punct_cleaned_sent = clear_endings(
                sentence)  # strip sentence-final service characters
            tokenized_sent = tokenizer.tokenize(
                punct_cleaned_sent)  # split into words, for cleanup only
            stpw_clean_sentence = [
                word for word in tokenized_sent if not word in stop_words
            ]
            stemmed_sentence = [
                stemmer.stem(word) for word in stpw_clean_sentence
            ]  # reduce each word to its root/stem
            clean_sentence = ' '.join(
                stemmed_sentence
            )  # join back into a sentence string for hashing

            cleaned_text.append(clean_sentence)

        return cleaned_text
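
The clear_* helpers are project-specific and not shown, so process() cannot be run as-is; a standalone sketch of just the tokenize / stop-word / stem steps (it requires the NLTK stopwords corpus to be downloaded) is:

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.regexp import WhitespaceTokenizer

stop_words = set(stopwords.words('russian'))
tokenizer = WhitespaceTokenizer()
stemmer = SnowballStemmer('russian')

sentence = "мы читаем интересные книги"
tokens = tokenizer.tokenize(sentence)
# Drop stopwords, then stem what is left, mirroring the loop body above.
print(' '.join(stemmer.stem(word) for word in tokens if word not in stop_words))
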
Example #14
def get_sentences_for_text(corpus_root, filename, lang='english'):
    """Segments the given text into sentences.

  Args:
    corpus_root: Directory in which the text file is residing.
    filename: Name of the text file.
    lang: Tokenizer language. For possible values, look at:
    ${NLTK_DATA}/tokenizers/punkt

  Returns:
    Sentences in the given text. 

  """
    tokenizer_path = 'tokenizers/punkt/' + lang + '.pickle'
    text = PlaintextCorpusReader(
        corpus_root, [filename],
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader(tokenizer_path))
    return text.sents()
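
A hypothetical call, assuming the function's imports (nltk, PlaintextCorpusReader, WhitespaceTokenizer) are in scope and the punkt models have been downloaded:

import os
import tempfile

# Hypothetical document written to a temporary corpus directory.
corpus_root = tempfile.mkdtemp()
with open(os.path.join(corpus_root, 'doc.txt'), 'w') as f:
    f.write("NLTK makes tokenization easy. It ships several tokenizers.")

for sent in get_sentences_for_text(corpus_root, 'doc.txt'):
    print(sent)
# ['NLTK', 'makes', 'tokenization', 'easy.']
# ['It', 'ships', 'several', 'tokenizers.']
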
Example #15
    def analyize(self,text):
        try:
            unitext = any2unicode(text, encoding='utf8', errors='strict')
        except:
            print ("Not utf-8")
            return []
        pass

        #convert to lower
        lowerText = unitext.lower()

        # Regex way: splits 'qwe (x)' into 'qwe' and '(x)'
        # (whitespace tokenization keeps punctuation and digits attached to tokens)
        tokenizer = WhitespaceTokenizer()
        regexTokens = tokenizer.tokenize(lowerText)
        p_stemmer = PorterStemmer()
        stemmedTokens = [p_stemmer.stem(i) for i in regexTokens]

        stemmedRemSingleLetterTokens = [w for w in stemmedTokens if len(w)>1]
        return stemmedRemSingleLetterTokens
Example #16
    def process(self, text, plain_text=False):
        """
        Preprocessing, word-level tokenization, and duplicate removal.
        Returns plain text (for the shingling method) or a list of the text's tokens.

        Args:
            text ([type]): [description]
        """
        #text = text.encode('utf-8')

        # remove numbers, emails, and hyperlinks

        text = clear_emails(text)
        text = clear_url(text)
        text = clear_digits(text)
        text = clear_symb(text)

        # split into words, strip leftover punctuation and stopwords

        stop_words = set(stopwords.words('russian'))
        tokenizer = WhitespaceTokenizer()
        stemmer = SnowballStemmer('russian')

        punct_cleaned_text = clear_endings(
            text)  # strip sentence-final service characters
        tokenized_text = tokenizer.tokenize(
            punct_cleaned_text)  # split into words, for cleanup only
        stpw_clean_text = [
            word for word in tokenized_text if not word in stop_words
        ]
        stemmed_text = [stemmer.stem(word) for word in stpw_clean_text
                        ]  # reduce each word to its root/stem
        clean_text = None
        if plain_text:
            clean_text = ' '.join(
                stemmed_text
            )  # join back into a single string for hashing
        else:
            clean_text = stemmed_text  # otherwise return the list of tokens

        return clean_text
Example #17
 def __init__(self):
     self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
     self.repl = r'\1\2\3'
     self.tokenizer = WhitespaceTokenizer()
     self.cached_stopwords = stopwords.words('english')
     self.ascii_replace = [
         ('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'),
         ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'),
         ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'),
         ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
         ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'),
         ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
         ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')
     ]
     self.link_patterns = [('http'), ('www'), ('w3c')]
     self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                     (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                     (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                     (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                     (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                     (r'fqf', 'ff'), (r'lql', 'll')]
from argparse import ArgumentParser
from collections import OrderedDict
from textblob import TextBlob
from nltk.util import bigrams
from multiprocessing import Pool
from traceback import format_exc
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize.regexp import WhitespaceTokenizer
from nltk.corpus import stopwords
from boto import connect_s3
import requests
import codecs
import traceback

stemmer = EnglishStemmer()
tokenizer = WhitespaceTokenizer()
stops = stopwords.words(u'english')


def get_args():
    ap = ArgumentParser()
    ap.add_argument(u'--num-processes',
                    dest=u"num_processes",
                    default=8,
                    type=int)
    ap.add_argument(u'--solr-host',
                    dest=u"solr_host",
                    default=u"http://search-s10:8983")
    ap.add_argument(u'--outfile', dest=u'outfile', default=u'wiki_data.csv')
    ap.add_argument(u'--s3dest', dest=u's3dest')
    return ap.parse_args()
def get_social_word_counts(social_var,
                           vocab,
                           comment_file,
                           meta_file,
                           comment_thresh=10):
    """
    Compute unique number of social vars 
    per word in vocab over all comments.
    Parameters:
    -----------
    social_var : str
    vocab : [str]
    Vocabulary to count.
    comment_file : str
    meta_file : str
    Tab-separated metadata file containing comment date, 
    author, thread ID, and subreddit.
    comment_thresh : int
    Minimum number of comments for a social var to be counted.
    Returns:
    --------
    social_var_counts : numpy.array
    """
    # indices in meta file corresponding to social vars
    social_var_indices = {'user': 1, 'subreddit': 3, 'thread': 2}
    social_txt = defaultdict(list)
    tokenizer = WhitespaceTokenizer()
    stopwords = get_default_stopwords()
    ngram_range = (1, 1)
    min_df = 1
    cv = CountVectorizer(encoding='utf-8',
                         lowercase=True,
                         tokenizer=tokenizer.tokenize,
                         stop_words=stopwords,
                         ngram_range=ngram_range,
                         min_df=min_df,
                         vocabulary=vocab,
                         binary=True)
    # keep it simple and store {vocab : {sub : count}}
    social_word_counts = defaultdict(Counter)
    with BZ2File(comment_file, 'r') as comments, BZ2File(meta_file,
                                                         'r') as metas:
        for i, (comment, meta) in enumerate(izip(comments, metas)):
            meta = meta.split('\t')
            social_id = meta[social_var_indices[social_var]]
            # print('got social id %s'%(social_id))
            # social_txt[social_id].append(comment)
            for w in tokenizer.tokenize(comment):
                social_word_counts[w][social_id] += 1
            if (i % 100000 == 0):
                print('processed %d comments' % (i))
            # if(i == 500000):
            #     break
    social_word_counts = {
        w: d
        for w, d in social_word_counts.iteritems() if w in vocab
    }
    social_word_counts = {
        w: {k: v
            for k, v in d.iteritems() if v >= comment_thresh}
        for w, d in social_word_counts.iteritems()
    }
    social_word_counts = {w: len(d) for w, d in social_word_counts.iteritems()}
    social_word_counts = np.array([
        social_word_counts[v] if v in social_word_counts else 0. for v in vocab
    ])

    # old code for constructing word/social dtm
    # restrict to consistent users??
    # social_txt = {k : v for k,v in social_txt.items()
    #               if len(v) >= comment_thresh}
    # # now convert to DTM
    # def get_txt_iter(social_txt):
    #     N = len(social_txt)
    #     for i, v in enumerate(social_txt.itervalues()):
    #         if(i % 1000 == 0):
    #             print('processed %d/%d social vars'%(i, N))
    #         yield ' '.join(v)
    # txt_iter = get_txt_iter(social_txt)
    # # txt_iter = (' '.join(v) for v in social_txt.values())
    # dtm = cv.fit_transform(txt_iter)
    # print('got %s dtm %s'%(social_var, dtm))
    # # save sparse matrix
    # # all_social_vals = social_txt.keys()
    # # vocab = sorted(cv.vocabulary_, key=lambda x: cv.vocabulary_[x])
    # # comment_date = re.findall(r'201[0-9]-[0-9]+', comment_file)[0]
    # # write_full_social_dtm(dtm, all_social_vals, vocab, comment_date, social_var)
    # # save unique social count for each word
    # # combine all counts per word
    # social_word_counts = np.array(dtm.sum(axis=0)).flatten()
    return social_word_counts
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out_dir', default='../../data/frequency')
    parser.add_argument('--comment_files', nargs='+', default=None)
    parser.add_argument('--n', type=int, default=2)
    parser.add_argument('--file_suffix', default=None)
    parser.add_argument('--sample_pct', type=float, default=100)
    args = parser.parse_args()
    out_dir = args.out_dir
    comment_files = args.comment_files
    n = args.n
    file_suffix = args.file_suffix
    sample_pct = args.sample_pct
    if (comment_files is None):
        comment_files = get_all_comment_files()
        # replace with clean normalized (smaller vocab)
        comment_files = [
            f.replace('.bz2', '_clean_normalized.bz2') for f in comment_files
        ]
    # start small
    # comment_files = comment_files[:1]
    # min_df = 5
    # min_tf = 10
    min_tf = 1
    stopwords = []
    tokenizer = WhitespaceTokenizer()
    # breaking memory
    # ngram_range = (1,3)
    # ngram_range = (2,3)
    # ngram_range = (2,2)
    # ngram_range = (1,1)
    # no CountVectorizer because memory and we don't need
    # cooccurrence anyway
    # cv = CountVectorizer(min_df=min_df, tokenizer=tokenizer.tokenize,
    #                      stop_words=stopwords, ngram_range=ngram_range)
    date_format = '201[0-9]-[0-9]+'
    for f in comment_files:
        print('processing file %s' % (f))
        date_str = re.findall(date_format, f)[0]
        # for each level of ngram, recompute counts
        # for n in range(ngram_range[0], ngram_range[1]+1):
        print('computing ngram = %d' % (n))
        with BZ2File(f, 'r') as comment_file:
            # takes too long to generate full DTM...what do??
            # just compute counts
            comment_iter = make_iter(comment_file)
            counts = get_ngram_counts(comment_iter,
                                      n,
                                      tokenizer=tokenizer,
                                      sample_pct=sample_pct)

            # limit min_frequency?
            counts = counts[counts >= min_tf]
            counts.columns = [date_str]
            # write to file
            # TOO MUCH SPACE => compress?
            if (file_suffix is not None):
                out_fname = os.path.join(
                    out_dir,
                    '%s_%dgram_tf_%s.tsv' % (date_str, n, file_suffix))
            else:
                out_fname = os.path.join(out_dir,
                                         '%s_%dgram_tf.tsv' % (date_str, n))
            counts.to_csv(out_fname, sep='\t')
Example #21
 def classify(self, text):
     return (self.classifier.classify(WhitespaceTokenizer().tokenize(text)))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--comment_files', nargs='+', default=None)
    parser.add_argument('--out_dir', default='../../data/frequency/')
    parser.add_argument(
        '--social_vars',
        nargs='+',
        # default=['user', 'thread', 'subreddit'])
        # default=['user'])
        # default=['thread'])
        default=['subreddit'])
    args = parser.parse_args()
    comment_files = args.comment_files
    out_dir = args.out_dir
    social_vars = args.social_vars
    if (comment_files is None):
        # data_dir = '/mnt/new_hg190/corpora/reddit_comment_data/monthly_submission/'
        data_dir = '/mnt/new_hg190/corpora/reddit_comment_data/monthly_submission/'
        years = ['2015', '2016']
        comment_files = get_all_comment_files(data_dir, years)
        print('comment files %s' % (str(comment_files)))
        # but we actually want clean_normalized lol
        comment_files = [
            f.replace('.bz2', '_normalized.bz2') for f in comment_files
        ]
    meta_files = [f.replace('.bz2', '_meta.bz2') for f in comment_files]
    # print('got meta files %s'%(meta_files))
    # TODO: start small, eventually move to rest of files
    # comment_files = comment_files[3:]
    # comment_files = comment_files[1:]

    # for testing
    # social_vars = social_vars[:1]

    vocab = get_default_vocab()
    # chunk_size = 1000
    # chunk_size = 5000
    # chunk_size = len(vocab)
    # chunks = int(len(vocab) / chunk_size)
    # vocab_chunks = [vocab[i*chunk_size:i*chunk_size+chunk_size]
    #                 for i in xrange(chunks)]
    # start small
    # top_vocab = 1000
    top_vocab = 100000
    stopwords = get_default_stopwords()
    # already whitespace separated, so just need whitespace tokenizer
    tokenizer = WhitespaceTokenizer()
    ngram_range = (1, 1)
    min_df = 1
    cv = CountVectorizer(
        encoding='utf-8',
        lowercase=True,
        tokenizer=tokenizer.tokenize,
        stop_words=stopwords,
        ngram_range=ngram_range,
        min_df=min_df,
        # max_features=top_vocab,
        vocabulary=vocab,
        # binarize to save space b/c we only care about cooccurrence
        binary=True)
    out_dir = args.out_dir
    # min number of comments within social value
    # to make it count
    # social_comment_thresh = 10
    social_comment_thresh = 1
    for comment_file, meta_file in izip(comment_files, meta_files):
        print('processing comment file %s and meta file %s' %
              (comment_file, meta_file))
        date_str = re.findall(r'201[0-9]-[0-9]+', comment_file)[0]
        for social_var in social_vars:
            # use for full dtm
            # out_fname = os.path.join(out_dir, '%s_%s_dtm'%(date_str, social_var))
            out_fname = os.path.join(
                out_dir, '%s_%s_unique.tsv' % (date_str, social_var))
            # for each vocab chunk in list, get unique social counts!
            # for vocab in vocab_chunks:
            print('got vocab size %d' % (len(vocab)))
            social_word_counts = get_social_word_counts(
                social_var,
                vocab,
                comment_file,
                meta_file,
                comment_thresh=social_comment_thresh)
            # write to file
            social_word_counts = pd.DataFrame(social_word_counts, index=vocab)
            social_word_counts.to_csv(out_fname, sep='\t', header=False)
import nltk  # needed for the nltk.FreqDist calls below
from nltk import *
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


def getUniqueWords(allWords):
    uniqueWords = []
    for i in allWords:
        if not i in uniqueWords:
            uniqueWords.append(i)
    return uniqueWords


text_str = open('corpus.txt').read()
tokens = WhitespaceTokenizer().tokenize(text_str)
print("\nInitial Statistics of the Corpus.")
print("#token: " + str(len(tokens)))
print("#types: " + str(len(getUniqueWords(tokens))))

print("\nThe Top-10 Frequent Tokens.")
freq = nltk.FreqDist(tokens)
print(freq.most_common(10))

tokens = [token.lower() for token in tokens]
print("\nAfter Case Folding.")
print("#token: " + str(len(tokens)))
print("#types: " + str(len(getUniqueWords(tokens))))

print("\nThe Top-10 Frequent Tokens.")
freq = nltk.FreqDist(tokens)
Example #24
# -*- coding: latin-1 -*-
import re
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import PunktWordTokenizer
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer
global corpus, sent_tags, tagger

# corpus = TaggedCorpusReader('/root/adail/python/names',r'.*\.txt',word_tokenizer=PunktWordTokenizer(),sep="_")  # Linux path
corpus = TaggedCorpusReader(
    'C:/Users/jose.adail/workspace/TextProcessor/names',
    r'.*\.txt',
    word_tokenizer=WhitespaceTokenizer(),
    sep="_")
name_tags = corpus.tagged_sents()  # The sentences annotated with POS tags.
tagger = UnigramTagger(name_tags)  # The UnigramTagger is trained on these tagged sentences.

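Without the names corpus on disk the reader above cannot be built, but the same train-and-tag idea can be sketched with toy tagged sentences (the data below is invented for illustration):

from nltk.tag import UnigramTagger
from nltk.tokenize.regexp import WhitespaceTokenizer

# Invented tagged sentences standing in for the names corpus.
train_sents = [[('maria', 'NAME'), ('silva', 'NAME'), ('comprou', 'V'), ('pao', 'N')]]
toy_tagger = UnigramTagger(train_sents)
tokens = WhitespaceTokenizer().tokenize('maria comprou pao')
print(toy_tagger.tag(tokens))
# [('maria', 'NAME'), ('comprou', 'V'), ('pao', 'N')]
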

class RegexpReplacer(object):
    def __init__(self):
        self.replacement_patterns = [(r"'", ''), (r'#', 'hash'),
                                     (r'no', 'no_'), (r'not', 'not_'),
                                     (r'RT ', ''), (r'rs[rs]+', 'rs'),
                                     (r'ha[ha]+', 'haha'), (r's[s]+', 'sxs'),
                                     (r'r[r]+', 'rxr'), (r'a[a]+', 'aqa'),
Example #25
 def __init__(self):
     self.tokenizer = WhitespaceTokenizer()
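
For reference, WhitespaceTokenizer itself only splits on spaces, tabs, and newlines, leaving punctuation attached to tokens; span_tokenize additionally reports character offsets:

from nltk.tokenize.regexp import WhitespaceTokenizer

tok = WhitespaceTokenizer()
s = "Good muffins cost $3.88\nin New York."
print(tok.tokenize(s))
# ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']
print(list(tok.span_tokenize(s)))
# [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36)]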