def evaluate(row):
    record, stop_words = row
    text = nltk.word_tokenize(record['text'])

    sentList = nltk.sent_tokenize(record['text'])
    wordsInSentsPos = [nltk.pos_tag(nltk.word_tokenize(s)) for s in sentList]
    wordsInSentsWnPos = [[(w[0], penn2morphy(w[1])) for w in s
                          if w[0].lower() not in stop_words]
                         for s in wordsInSentsPos]
    # The above yields a list of sentences, where each sentence is a list of
    # (word, WordNet POS tag) tuples. Stop words are removed only after tagging,
    # since pos_tag relies on the sentence's grammatical structure while the later steps do not.

    # lemmatization: rebuild the token list from the lemmatized,
    # stop-word-filtered sentence tokens (replacing the raw tokens above)
    lmtzr = WordNetLemmatizer()
    text = []
    for sent in wordsInSentsWnPos:
        for word, pos in sent:
            if pos != '':  # if a WordNet POS exists, use the lemma
                text.append(lmtzr.lemmatize(word, pos))
            else:  # if the POS tagger failed, keep the original word
                text.append(word)

    fdist = FreqDist(text)
    unfiltered_frequencies = fdist.most_common(fdist.B())
    frequencies = [
        t for t in unfiltered_frequencies
        if t[0] not in stop_words and t[1] > 5 and len(t[0]) > 2
    ]
    return frequencies
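The evaluate() snippet above relies on a helper penn2morphy that is not included here. A minimal sketch of what such a mapping from Penn Treebank tags to WordNet POS tags could look like, returning '' when no mapping exists, as the code above expects (this helper is an assumption, not part of the original example):

from nltk.corpus import wordnet as wn

def penn2morphy(penntag):
    """Map a Penn Treebank tag to a WordNet POS constant; '' if no mapping exists."""
    # 'NN*' -> noun, 'JJ*' -> adjective, 'VB*' -> verb, 'RB*' -> adverb
    mapping = {'NN': wn.NOUN, 'JJ': wn.ADJ, 'VB': wn.VERB, 'RB': wn.ADV}
    return mapping.get(penntag[:2], '')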
Example #2
def show_stats_for_text(text):
    words = tokenize(text, clean_filter=FILTER_ALL)
    fd = FreqDist(words)
    logger.info('Total words: %s', len(words))
    logger.info('Distinct words: %s', fd.B())
    logger.info('Most common words')
    for word, count in fd.most_common(20):
        logger.info('%s\t%s', word, count)
Example #3
def evaluate(row):
    record, stop_words = row
    text = nltk.word_tokenize(record['text'])
    fdist = FreqDist(text)
    unfiltered_frequencies = fdist.most_common(fdist.B())
    frequencies = [
        t for t in unfiltered_frequencies
        if t[0] not in stop_words and t[1] > 5 and len(t[0]) > 2
    ]
    return frequencies
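The row argument is a (record, stop-word set) pair, which suggests evaluate() is meant to be mapped over a collection of records. A hypothetical driver, assuming records shaped like {'text': ...} and NLTK's English stop-word list (none of this is part of the original snippet):

import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist

stop_words = set(stopwords.words('english'))
records = [{'text': "the quick brown fox jumps over the lazy dog " * 10}]
for freqs in map(evaluate, ((rec, stop_words) for rec in records)):
    print(freqs)  # e.g. [('quick', 10), ('brown', 10), ('fox', 10), ...]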
Example #4
def calculo_frecuencias(bag_of_words):
    """Calcula frecuencias de las palabras y muestra una gráfica con las más frecuentes
    
    Args:
        bag_of_words: lista de strings
    """
    freq_dist = FreqDist(bag_of_words)
    print("Nº. objetos: %d" % freq_dist.N())
    print("Nº. objetos únicos: %d" % freq_dist.B())
    print("El objeto más frecuente es: %s" % str(freq_dist.max()))
    freq_dist.plot(50)
Example #5
    def getFreqDist(self):
        
        fieldnames = ['Word','Frequency']
        
        with open(self.csvfile, 'w', newline='') as csvf:
            writer = csv.DictWriter(csvf, fieldnames=fieldnames)

            writer.writeheader()
            
            text=self.text
        
            #set stopwords
            stopwords = set(nltk.corpus.stopwords.words('english'))
                   
            words=word_tokenize(text)
            
            #remove words if length of word is not over 1 (i.e. punctuation)
            words = [word for word in words if len(word) > 1]
            #remove numbers
            words = [word for word in words if not word.isnumeric()]
            #make all words lowercase
            words = [word.lower() for word in words]
            #remove stopwords
            words = [word for word in words if word not in stopwords]
                
            fdist= FreqDist(words)

            #number of all words
            print ('Total number of samples: %i' % fdist.N())
            
            #number of all distinct words
            print ('Total number of bins: %i' % fdist.B())
            
            #write all bins and count into CSV file
            for word, frequency in fdist.most_common(fdist.B()):
                writer.writerow({'Word':word,'Frequency': frequency})
Example #6
def evaluate(row):
    record, stop_words = row
    text = nltk.word_tokenize(record['text'])

    #stemming
    stemmer = SnowballStemmer("english")
    for indx, word in enumerate(text):
        word = stemmer.stem(word)
        text[indx] = word

    fdist = FreqDist(text)
    unfiltered_frequencies = fdist.most_common(fdist.B())
    frequencies = [
        t for t in unfiltered_frequencies
        if t[0] not in stop_words and t[1] > 5 and len(t[0]) > 2
    ]
    return frequencies
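For reference, a small illustration of the stemmer's behaviour; the outputs in the comment are typical of NLTK's English Snowball stemmer, which also lowercases its input (relevant for the stop-word comparison above):

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
print([stemmer.stem(w) for w in ["Running", "cats", "Better"]])
# expected output: ['run', 'cat', 'better'] -- stems come back lowercased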
Example #7
    def __init__(self, n, train, estimator=None):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        """
        self._n = n
        self._N = 1 + len(train) - n

        if estimator is None:
            def estimator(fdist, bins): return MLEProbDist(fdist)

        if n == 1:
            fd = FreqDist(train)
            self._model = estimator(fd, fd.B())
        else:
            cfd = ConditionalFreqDist()
            self._ngrams = set()
            self._prefix = ('',) * (n - 1)

            for ngram in ingrams(chain(self._prefix, train), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1

            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator)
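A hypothetical usage sketch for the constructor above, illustrating the estimator parameter described in the docstring. It assumes the rest of the NgramModel class and its NLTK dependencies (ingrams, MLEProbDist, ConditionalFreqDist, ConditionalProbDist) are available; the training tokens are invented:

from nltk.probability import LidstoneProbDist

train_tokens = "the cat sat on the mat and the cat ate the rat".split()
# add-0.2 smoothing instead of the default MLE estimator
est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2, bins)
lm = NgramModel(3, train_tokens, estimator=est)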
Example #8
    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        :param n: the order of the language model (ngram size)
        :type n: C{int}
        :param train: the training text
        :type train: C{iterable} of C{string} or C{iterable} of C{iterable} of C{string} 
        :param estimator: a function for generating a probability distribution---defaults to MLEProbDist
        :type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of <s>
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with </s>
        :type pad_right: bool
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        # make sure n is greater than zero, otherwise print it
        assert (n > 0), n

        # For explicitness save the check whether this is a unigram model
        self.is_unigram_model = (n == 1)
        # save the ngram order number
        self._n = n
        # save left and right padding
        self._lpad = ('<s>',) * (n - 1) if pad_left else ()
        # Need _rpad even for unigrams or padded entropy will give
        #  wrong answer because '</s>' will be treated as unseen...
        self._rpad = ('</s>',) if pad_right else ()
        self._padLen = len(self._lpad)+len(self._rpad)

        self._N=0
        delta = 1+self._padLen-n        # len(sent)+delta == ngrams in sent

        if estimator is None:
            assert (estimator_args == ()) and (estimator_kwargs == {}),\
                   "estimator_args (%s) or _kwargs supplied (%s), but no estimator"%(estimator_args,estimator_kwargs)
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # Given backoff, a generator isn't acceptable
        if not isinstance(train,collections.abc.Sequence):
          train=list(train)
        self._W = len(train)
        # Coerce to list of list -- note that this means to train charGrams,
        #  requires exploding the words ahead of time 
        if train is not None:
            if isinstance(train[0], compat.string_types):
                train = [train]
                self._W=1
            elif not isinstance(train[0],collections.abc.Sequence):
                # if you mix strings and generators, you have only yourself
                #  to blame!
                for i in range(len(train)):
                    train[i]=list(train[i])

        if n == 1:
            if pad_right:
                sents=(chain(s,self._rpad) for s in train)
            else:
                sents=train
            fd=FreqDist()
            for s in sents:
                fd.update(s)
            if not estimator_args and not estimator_kwargs:
                self._model = estimator(fd,fd.B())
            else:
                self._model = estimator(fd,fd.B(),
                                        *estimator_args, **estimator_kwargs)
            self._N=fd.N()
        else:
            cfd = ConditionalFreqDist()
            self._ngrams = set()

            for sent in train:
                self._N+=len(sent)+delta
                for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                    self._ngrams.add(ngram)
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
                    cfd[context][token]+=1
            if not estimator_args and not estimator_kwargs:
                self._model = ConditionalProbDist(cfd, estimator, len(cfd))
            else:
                self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if not self.is_unigram_model:
            self._backoff = NgramModel(n-1, train,
                                        pad_left, pad_right,
                                        estimator,
                                        *estimator_args,
                                        **estimator_kwargs)

            # Code below here in this method, and the _words_following and _alpha method, are from
            # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015"
            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context.
                # i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
                if isclose(total_observed_pr,1.0):
                    total_observed_pr=1.0
                else:
                    assert 0.0 <= total_observed_pr <= 1.0,\
                           "sum of probs for %s out of bounds: %.10g"%(ctxt,total_observed_pr)
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be GE 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                if beta!=0.0:
                    assert (0.0 <= backoff_total_pr < 1.0), \
                           "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                    alpha_ctxt = beta / (1.0 - backoff_total_pr)
                else:
                    assert ((0.0 <= backoff_total_pr < 1.0) or
                            isclose(1.0,backoff_total_pr)), \
                           "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                    alpha_ctxt = 0.0

                self._backoff_alphas[ctxt] = alpha_ctxt
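A hypothetical usage sketch for this padded variant. It assumes the complete class and its helpers are importable, including the prob() method that the backoff-alpha loop above calls but which is not shown in this excerpt; the sentences and estimator are invented:

from nltk.probability import LidstoneProbDist

sents = [["the", "cat", "sat"], ["the", "cat", "ate", "the", "rat"]]
est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2, bins)
lm = NgramModel(2, sents, pad_left=True, pad_right=True, estimator=est)
print(lm.prob("cat", ("the",)))  # P(cat | the)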
Example #9
    token_list = TweetTokenizer().tokenize(line)
    for token in token_list:
        #Excludes non-word tokens
        if re.search('\W',token) == None:
            #Excludes stopwords
            if (EXCLUDE_STOPWORDS) and (token not in stopwords.words('english')):
                text.append(token)
            elif (not EXCLUDE_STOPWORDS):
                text.append(token)

print text

#Create frequency distribution
fdist = FreqDist(text)
total_tokens = fdist.N()
unique_tokens = fdist.B()

if EXCLUDE_STOPWORDS:
    print "Stopwords are excluded"
else:
    print "Stopwords are NOT excluded"

#Print distribution properties
print "\nThe number of total tokens:", total_tokens
print "The number of unique tokens:", unique_tokens
print "Lexical density:", (unique_tokens + 0.0)/total_tokens

print "\nThe most common Words:"
print "======================="
for x in fdist.most_common(100):
    w, n = x
Example #10
allwords.count('Hamlet')

A = set(allwords)
longwords = [w for w in A if len(w) > 12]  # all words longer than 12 characters
print(sorted(longwords))

from nltk.probability import FreqDist, ConditionalFreqDist
"""
FreqDist: 创建一个所给数据的频率分布
B(): 不同单词的个数
N(): 所有单词的个数
tabulate(20): 把前20组数据以表格的形式显示出来
fd2.plot(20,cumulative=True): 参数cumulative 对数据进行累计 
"""
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
print("不同单词的个数:%d" % fd2.B())
print("所有单词的个数:%d" % fd2.N())
fd2.tabulate(20)  #把前20组数据 以表格的形式显示出来
fd2.plot(20)
fd2.plot(20, cumulative=True)
"""
freq('the')  #单词the出现的频率
ConditionalFreqDist( ): 条件频率统计的函数,研究类别之间的系统性的差异
"""
from nltk.corpus import inaugural
print(fd2.freq('the'))  # relative frequency of the word 'the'
cfd = ConditionalFreqDist((fileid, len(w)) for fileid in inaugural.fileids()
                          for w in inaugural.words(fileid)
                          if fileid > '1980' and fileid < '2010')
print(cfd.items())
cfd.plot()
Example #11
def preprocess(s, output_train=False, output_test=False):
    """
    preprocess takes a string of the form
    '(ham|spam) words words words\n(ham|spam) more words here....'
    And returns a list of processed texts. The class is removed, all words are
    lowercase, there are no stop words or punctuation, all the words are
    stemmed, and tokens that appear less than 5 times in the entire string are
    removed entirely.
    Return values:
     - list of processed SMS texts
     - nltk.probability.FreqDist of frequency of tokens
     - labels of each SMS text in order
    """

    # Removing punctuation from unicode is tricky
    # I'm doing this because the word_tokenizer gives us unicode, so we want
    # everything to be unicode
    # Anyway, use this punctuation table with unicode.translate()
    # https://stackoverflow.com/questions/11066400/remove-punctuation-from-unicode-formatted-strings#11066687
    # punctuation = dict.fromkeys(i for i in xrange(sys.maxunicode)
    #         if unicodedata.category(unichr(i)).startswith('P'))

    stopwords = set(sw.words('english'))
    punctuation = [i for i in u'{}'.format(string.punctuation)]

    # Step 1: Remove uppercase, and make utf-8 to be sure
    s = s.lower()

    # Step 2: Tokenize!
    # Split everything by line (one line for each text)
    # Tokenize it
    # Skip word 1 (the class -- spam/ham), put it to array
    # Also trim the last item (it's an empty string after the last \n)
    token_texts = [word_tokenize(text) for text in s.split('\n')][:-1]
    labels = [text[0] for text in token_texts]

    texts = [text[1:] for text in token_texts]

    if output_train:
        answer_question(
            'STEP 2.a', 'Total number of distinct tokens is ' +
            str(FreqDist([word for text in texts for word in text]).B()) + '.')

    # Step 3: Remove stop words
    # Step 4: Remove punctuation
    # Step 5: Stem all the tokens
    # Doing this all in one go for simplicity.
    stemmer = PorterStemmer()
    for i in range(len(texts)):
        texts[i] = [
            stemmer.stem(word) for word in texts[i]
            if word not in stopwords and word not in punctuation
        ]

    if output_train:
        answer_question('STEP 5.a', 'The list is ' + str(texts[10]) + '.')
    if output_test:
        answer_question('STEP 1.a', 'The list is ' + str(texts[23]) + '.')

    # Get freq distribution of the whole set
    freq = FreqDist([word for text in texts for word in text])

    # Step 6: Dump all infrequent tokens
    # Note that len(freq) can give you the number of unique tokens in the data
    texts = [[word for word in text if freq[word] >= 5] for text in texts]
    freq = FreqDist([word for text in texts for word in text])

    if output_train:
        answer_question(
            'STEP 6.a',
            'Total number of distinct tokens is ' + str(freq.B()) + '.')

    # Done!
    return texts, freq, labels
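A hypothetical call illustrating the '(ham|spam) words ...' input format described in the docstring; the sample messages are invented and far too small for the frequency >= 5 cutoff to keep much:

sample = "ham go home now please\nspam win a free prize now\n"
texts, freq, labels = preprocess(sample)
print(labels)  # ['ham', 'spam']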
Example #12
    #print("Line Bigrams:",list(bigrams(lineWords)), file=log_file)
    bgs = bgs + list(bigrams(lineWords))
    #print("bgs=",bgs, file=log_file)
    #out = " ".join(s.encode('ascii', 'ignore') for s in tokens)
    #file_tokenized.write(out+'\n')
    print('*end of line*', file=log_file)

# Write to file
for item in bgs:
    output_file1.write(str(item) + "\n")
output_file1.close()

# Compute frequency distribution for all the bigrams in the corpus
fdist1 = FreqDist(bgs)
total_bigrams = fdist1.N()
unique_bigrams = fdist1.B()
print("Number of total bigrams:", total_bigrams, file=log_file)
print("Number of unique bigrams:", unique_bigrams, file=log_file)
print("100 most frequent bigrams:", file=log_file)
mostFrequentBigramsList = fdist1.most_common(100)
print(mostFrequentBigramsList, file=log_file)

# Write to file
for item in mostFrequentBigramsList:
    #print(item[1],"\t",item[0],"\n\n")
    output_file2.write(str(item[1]) + "\t" + str(item[0]) + "\n")
output_file2.close()

# Compute frequency distribution for all the words (excluding stopwords) in the corpus
fdist2 = FreqDist(words)
total_words = fdist2.N()
Example #13
    def __init__(self, n, train, k=5, v=None,
                 liveDangerously=False, quiet=False):
        """
        Creates a Katz-thresholded Ngram language model to capture
        patterns in n consecutive words of training text.
        Uses the KGoodTuringProbDist to estimate the conditional and unigram probabilities,
        to provide coverage of Ngrams not seen during training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param k: The threshold above which counts are assumed
                  to be reliable.  Defaults to 5.
        @type  k: C{Int}
        @param v: The number of unseens of degree 1.  Defaults to the
                  number of types in the training set
        @type  v: C{Int}
        @param liveDangerously: If False, for each model check that
                                the total probability mass after all
                                adjustments is close to 1.  Defaults
                                to False.
        @type  liveDangerously: C{Boolean}
        @param quiet: Various information will be printed during model
                       construction unless this is True.  Defaults to False.
        @type  quiet: C{Boolean}
        """
        self._n = n
        self._N = 1 + len(train) - n
        fd = FreqDist(train)
        if v is None:
            v = fd.B()
        print(('v', v))
        if n == 1:
            # Treat this case specially
            self._model = KGoodTuringProbDist(fd, k, v, liveDangerously, ())
            if not quiet:
                print("%s entries for %s tokens at degree 1, %s" % (len(fd),
                                                                    fd.N(),
                                                                    self._model.status))
        else:
            def estimator(fdist, ctxt): return KGoodTuringProbDist(fdist, k, v,
                                                                   liveDangerously,
                                                                   ctxt)

            cfd = ConditionalFreqDist()

            for ngram in ingrams(train, n):
                # self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context].inc(token)

            self._model = ConditionalProbDist(cfd, estimator, True)
            if not quiet:
                statuses = {'normal': 0, 'bigSkewed': 0,
                            'weak': 0, LowHacked: 0}
                for ctx in cfd.conditions():
                    statuses[self[ctx].status] += 1
                print("%s conditions at degree %s" %
                      (len(cfd.conditions()), n))
                for s in list(statuses.keys()):
                    print(" %s %6d" % (s, statuses[s]))

            # recursively construct the lower-order models
            self._backoff = KBNgramModel(n - 1, train, k, v, liveDangerously)
Example #14
    for _, r in df:
        sentences = tokenize_sentence(r.Review)
        text_preprocessed.append([remove_stopword(tokenize_word(s), stopwords=stopwords) for s in sentences])
        marker_shopId.append(r.ShopID)

    #Observe result
    print(text_preprocessed[:10])

    #Save result
    #In binary, must be read in binary mode
    with open(r'..\data\preprocessed_{}.pickle'.format(title), 'wb') as f:
        pickle.dump((text_preprocessed, marker_shopId), f)

    return text_preprocessed

text_preprocessed = preprocess()


#--Word count (all docs)
#Word frequency distribution by nltk
fdist = FreqDist([i for i in flatten_list(text_preprocessed)])

#Observe result
print('Unique terms:', fdist.B())
print('Total terms:', fdist.N())
sorted(fdist.items(), key=operator.itemgetter(1), reverse=True) #Top terms

#Save result
with open(r'..\data\fdist.pickle', 'wb') as f:
    pickle.dump(fdist, f)
Example #15
def load_filter_list(json_file):
    filter_list = []
    with open(json_file, 'r', encoding='utf8') as filter_file:
        for line in filter_file:
            filter_list.append(json.loads(line)['word'])
    return sorted(filter_list)

start = datetime.datetime.now()

assert os.path.isdir(CORPUS_DIR)

newcorpus = PlaintextCorpusReader(CORPUS_DIR, '.*\.txt')
print ('Corpus begins with {}'.format(newcorpus.words()[:10]))
frequencies = FreqDist(newcorpus.words())
print('Samples: %d' % frequencies.N())
print('Distinct words: %d' % frequencies.B())
pattern = '^[a-zěščřžýáýíéóďťňúů]*$' if DIACRITICS else '^[a-z]*$'
candidates = {word: freq for (word, freq) in frequencies.most_common()
    if 6 < len(word) < 9 and re.match(pattern,word)}
print('Candidates: %d' % len(candidates))
print ((datetime.datetime.now() - start).total_seconds())
spell = get_spellcheck_candidates()
print('Spellcheck candidates:\nAdjectives %d\nSubstantives %d\nNouns %d'
    % (len(spell['adjectives']), len(spell['substantives']), len(spell['nouns'])))
print ((datetime.datetime.now() - start).total_seconds())
substs = [word for word in candidates.keys() if word in spell['substantives']]
substs.sort(key=lambda subst : -candidates[subst])
adjs = [word for word in candidates.keys() if word in spell['adjectives']]
adjs.sort(key=lambda adj : -candidates[adj])
nouns = [word for word in candidates.keys() if word in spell['nouns']]
nouns.sort(key=lambda noun : -candidates[noun])
Example #16
def plot_words(wordList):
    fDist = FreqDist(wordList)
    #print(fDist.most_common())
    print("单词总数: ", fDist.N())
    print("不同单词数: ", fDist.B())
    fDist.plot(10)
Example #17
class TextFeatures:

    parts_of_speech = [
        "NN", "NNS", "NNP", "NNPS", "DT", "RB", "IN", "PRP", "CC", "CD", "VB",
        "VBD", "VBN", "VBG", "JJ", "EX", "FW"
    ]
    most_common_words = [
        "the", "of", "and", "to", "a", "in", "for", "is"
        "on", "that", "by", "this", "with", "i", "you", "it", "not", "or",
        "be", "are", "from", "at", "as", "your", "all", "have", "new", "more",
        "an", "was", "we", "will", "home", "can", "us", "about", "if", "page",
        "my", "has", "search", "free"
    ]
    punctuation = [".", ",", "!", "?", ";", ":"]

    def __init__(self, text, session):
        self.session = session
        self.tokens = nltk.word_tokenize(text)
        self.text = text
        self.fdist = FreqDist()
        for token in self.tokens:
            self.fdist.inc(token.lower())
        self.tagged = nltk.pos_tag(self.tokens)
        self.counts = self.__get_word_commonality_counts(self.text.split())
        self.word_lengths = [len(word) for word in self.tokens]
        self.sentences = nltk.sent_tokenize(self.text)
        self.sentence_lengths = [len(sen.split()) for sen in self.sentences]

    def __get_word_commonality_counts(self, words):
        results = [
            self.session.query(WordCount).filter_by(word=w).first()
            for w in words
        ]
        results = [w.count for w in results if w is not None]
        if len(results) == 0:
            return [0]
        return results

    def _word_freq_to_vector(self):
        dist = self.word_freq()
        return [dist.freq(word) for word in TextFeatures.most_common_words]

    def _punctuation_freq_vector(self):
        dist = self.word_freq()
        return [dist.freq(mark) for mark in TextFeatures.punctuation]

    def _word_length_freq_to_vector(self):
        dist = self.word_length_freq()
        return [dist.freq(length) for length in range(1, 12)]

    def _POS_freq_to_vector(self):
        dist = self.POS_freq()
        return [dist.freq(pos) for pos in TextFeatures.parts_of_speech]

    def _POS_cond_freq_to_vector(self):
        dist = self.POS_cond_freq()
        freq_vector = []
        for pos0 in TextFeatures.parts_of_speech:
            for pos1 in TextFeatures.parts_of_speech:
                freq_vector.append(dist[pos0].freq(pos1))
        return freq_vector

    def _word_rarity_freq_to_vector(self):
        dist = self.word_rarity_freq()
        return [dist.freq(i) for i in range(20)]

    def to_vector(self):
        return ([
            self.avg_word_length(),
            self.std_dev_word_length(),
            float(self.max_word_length()),
            float(self.max_sentence_length()),
            float(self.min_sentence_length()),
            self.avg_sentence_length(),
            self.std_sentence_length(),
            float(self.avg_word_commonality()),
            float(self.std_word_commonality()),
            self.unique_word_freq()
        ] + self._word_rarity_freq_to_vector() + self._word_freq_to_vector() +
                self._punctuation_freq_vector() +
                self._word_length_freq_to_vector() +
                self._POS_freq_to_vector()
                #self._POS_cond_freq_to_vector()
                )

    def word_freq(self):
        return self.fdist

    def word_length_freq(self):
        return FreqDist(len(word) for word in self.tokens)

    def POS_freq(self):
        "Returns the frequency distribution of parts of speech"
        pos_dist = FreqDist()
        for pos_pair in self.tagged:
            pos_dist.inc(pos_pair[1])
        return pos_dist

    def POS_cond_freq(self):
        "Returns the conditional frequency distribution of parts of speech"
        cond_dist = ConditionalFreqDist()
        pos = [word_pos[1] for word_pos in self.tagged]
        [cond_dist[pair[0]].inc(pair[1]) for pair in pairwise(pos)]
        return cond_dist

    def word_rarity_freq(self):
        "Returns the frequency distribution of groups of word rarities"
        # Descending commonality thresholds; bucket i is incremented for the
        # first threshold exceeded, bucket 19 for anything rarer than 500000.
        thresholds = [
            500000000, 450000000, 400000000, 350000000, 300000000, 250000000,
            200000000, 150000000, 100000000, 80000000, 65000000, 50000000,
            30000000, 10000000, 8000000, 5500000, 3000000, 1000000, 500000
        ]
        rarity_dist = FreqDist()
        for common in self.counts:
            for bucket, threshold in enumerate(thresholds):
                if common > threshold:
                    rarity_dist.inc(bucket)
                    break
            else:
                rarity_dist.inc(19)
        return rarity_dist

    def avg_word_length(self):
        return numpy.average(self.word_lengths)

    def std_dev_word_length(self):
        return numpy.std(self.word_lengths)

    def max_word_length(self):
        return max(self.word_lengths)

    def unique_word_freq(self):
        return float(self.fdist.B()) / self.fdist.N()

    def max_sentence_length(self):
        return max(self.sentence_lengths)

    def min_sentence_length(self):
        return min(self.sentence_lengths)

    def avg_sentence_length(self):
        return numpy.average(self.sentence_lengths)

    def std_sentence_length(self):
        return numpy.std(self.sentence_lengths)

    def avg_word_commonality(self):
        return numpy.average(self.counts)

    def std_word_commonality(self):
        return numpy.std(self.counts)
Example #18
def frequency_analysis(mode, tokens):
    """
    Performs simple frequency analysis with options for
    minimum word length, number of words and parts-of-speech to be included.
    """
    # Variables for word selection
    num_tokens = 100
    min_token_length = 3
    max_token_length = 16
    all_pos_tags_included = True
    pos_tags_included = {
        'untagged words': True,
        'nouns': True,
        'pronouns': True,
        'verbs': True,
        'adverbs': True,
        'adjectives': True,
        'prepositions': True,
        'miscellaneous words': True
    }

    committed = False
    while not committed:
        #Determining words to use in new FreqDist
        new_tokens = []
        working_tokens = []
        # Only choose words with POS tags
        # matching the classes in pos_tags_included:
        if not all_pos_tags_included:
            for w in tokens:
                word_included = False
                for tag in pos_tags_included:
                    if pos_tags_included[tag]:
                        for t in POS_TAGS[tag]:
                            if w[1] == t:
                                #print(w[0] + ' matches ' + str(w[1]))
                                working_tokens.append(w[0])
                                word_included = True
                if not word_included:
                    if pos_tags_included['miscellaneous words']:
                        working_tokens.append(w[0])
        else:
            working_tokens = tokens.copy()
        print(working_tokens)
        for w in working_tokens:
            if mode == TokenisationMode.CHUNKS:
                token = w
            elif mode == TokenisationMode.NGRAMS:
                token = w
            elif mode == TokenisationMode.WORDS:
                token = w[0]
            if min_token_length <= len(token) <= max_token_length:
                new_tokens.append(token)

        fdist = FreqDist(new_tokens)

        prelim_results = ['\nFrequency analysis has found ']
        prelim_results.append(str(fdist.B()))
        prelim_results.append(' unique tokens of potential interest\n')
        prelim_results.append('out of a total of ')
        prelim_results.append(str(fdist.N()))
        prelim_results.append('.')
        print(''.join(prelim_results))

        intro = ['The ']
        intro.append(str(num_tokens))
        intro.append(' most frequent tokens are currently selected.\n')
        intro.append('Selected words are currently between ')
        intro.append(str(min_token_length))
        intro.append(' and ')
        intro.append(str(max_token_length))
        intro.append(' characters in length.\n')
        if not mode == TokenisationMode.NGRAMS:
            if all_pos_tags_included:
                intro.append(
                    'All parts-of-speech are included in the selection.\n')
            else:
                intro.append('Parts-of-speech included in the selection:\n')
                for tag in pos_tags_included:
                    if pos_tags_included[tag]:
                        intro.append(tag)
                        intro.append(', ')
                intro[len(intro) -
                      1] = '\n'  # replace trailing comma with linebreak
                intro.append('Parts-of-speech excluded from the selection:\n')
                for tag in pos_tags_included:
                    if not pos_tags_included[tag]:
                        intro.append(tag)
                        intro.append(', ')
                intro[len(intro) -
                      1] = '\n'  # replace trailing comma with linebreak

        intro.append('Below are the selected words, most frequent first:\n')
        print(''.join(intro))
        selected_tokens = fdist.most_common(num_tokens)
        word_string = []
        charcount = 0
        print(selected_tokens)
        for w in selected_tokens:
            if mode == TokenisationMode.CHUNKS:
                token = w[0]
            elif mode == TokenisationMode.NGRAMS:
                token = w[0]
            elif mode == TokenisationMode.WORDS:
                token = w[0]
            charcount += len(token) + 2
            if charcount > 80:
                word_string.append('\n')
                charcount = 0
            word_string.append(token)
            word_string.append(', ')
        word_string[len(word_string) - 1] = '\n'  # strip comma, add linebreak
        print(''.join(word_string))
        chosen = False
        while not chosen:
            print('Enter M below to change the minimum token length.')
            print('Enter X below to change the maximum token length.')
            print('Enter N to change the total number of tokens selected.')
            if not mode == TokenisationMode.NGRAMS:
                print('Enter P to restrict selection with PoS tagging.')
            print('Enter A to accept the current list of tokens and continue.')
            user_input = input()
            if user_input.lower() == 'n':
                num_tokens = int_input_prompt(
                    '\nHow many words do you want selected?\n')
                chosen = True
            elif user_input.lower() == 'm':
                validated = False
                while not validated:
                    min_token_length = int_input_prompt(
                        '\nEnter a new minimum word length...\n')
                    if min_token_length > max_token_length:
                        print("Minimum word length can't exceed maximum!")
                    else:
                        validated = True
                chosen = True
            elif user_input.lower() == 'x':
                validated = False
                while not validated:
                    max_token_length = int_input_prompt(
                        '\nEnter a new maximum word length...\n')
                    if max_token_length < min_token_length:
                        print("Maximum word length cannot be less than "
                              "minimum!")
                    else:
                        validated = True
                chosen = True
            elif user_input.lower() == 'p':
                if not mode == TokenisationMode.NGRAMS:
                    nothing_selected = True
                    while nothing_selected:
                        for tag in pos_tags_included:
                            print('Do you want to include ' + tag + '?')
                            pos_tag_chosen = yes_no_input_prompt()
                            pos_tags_included[tag] = pos_tag_chosen
                            if pos_tag_chosen:
                                chosen = True
                                nothing_selected = False
                            else:
                                all_pos_tags_included = False
                        if nothing_selected:
                            print('Error: you must include at least ' +
                                  'one class of POS tags.')
                            print('Restarting selection...\n')
                else:
                    print('PoS tagging not applicable to ngrams.')
                    print('Doing nothing...')
            elif user_input.lower() == 'a':
                chosen = True
                committed = True
            else:
                print('Input not recognised')

    words_string = ', '.join([w[0][0] for w in selected_tokens])
    string_to_text_file(sys.argv[3], words_string)
    print('Words/phrases successfully saved to ' + sys.argv[3])
    quit()
Example #19
# In[50]:


l=[s for ls in l for s in ls if s != '' ]


# In[81]:


fd['<unk>']=1


# In[82]:


b=fd.B()
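# Add-one (Laplace) smoothing: each unigram count is incremented by 1 and
# normalised by (n + b), where n is assumed to be the total token count
# computed earlier in the notebook and b = fd.B() is the vocabulary size
# (including the '<unk>' entry added above).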
for k in fd.keys():
    fd[k]=(fd[k]+1)/(n+b)


# In[83]:


fd


# In[79]:


def generate_unigram_model(corpus,vocab):
    fd=FreqDist(corpus)