Example No. 1
 def word_freq(self, top_n=10, group=True, bi=True):
     '''
     Function: Show word frequencies & word clouds.
     Parameters:
         group: If True, produce frequencies & a word cloud for each label.
         top_n: Number of top-frequency words to show.
         bi: Whether to also show bigram frequencies.
     '''
     if group and self.labels is None: group = False
     from nltk import FreqDist as fd
     import pandas as pd
     if not group:
         tks = [t for l in self.lst_tk_lsts for t in l if isinstance(t, str)]
         ndist = pd.Series(fd(tks)).sort_values(ascending=False)
         sm = ndist.sum()
         ndist = ndist.map(lambda x: str(round(x / sm * 100, 2)) + '%')
         word_cloud(' '.join(tks))
         print(ndist[:top_n])
         if bi:
             df_bi = self.vectorize(method='count',
                                    ngram_range=(2, 2),
                                    stop_words='english',
                                    return_df=True)
             if 'label' in df_bi.columns:
                 df_bi.drop(columns=['label'], inplace=True)
             sr = df_bi.sum().sort_values(ascending=False)
             sr = sr.map(lambda x: str(round(x / sr.sum() * 100, 1)) + '%')
             print(sr[:top_n])
         return None
     if bi:
         df_bi = self.vectorize(method='count',
                                ngram_range=(2, 2),
                                stop_words='english',
                                return_df=True)
         df = df_bi.groupby('label').sum()
     sr = pd.Series(self.lst_tk_lsts, index=self.labels)
     for i in list(set(self.labels)):
         print(i, ':')
         sr0 = sr[i].to_list()
         tks = [t for k in sr0 for t in k]
         ndist = pd.Series(fd(tks)).sort_values(ascending=False)
         sm = ndist.sum()
         ndist = ndist.map(lambda x: str(round(x / sm * 100, 2)) + '%')
         word_cloud(' '.join(tks))
         print(ndist[:top_n], '\n')
         if bi:
             sr1 = df.loc[i, :].sort_values(ascending=False)
             sr1 = sr1.map(
                 lambda x: str(round(x / sr1.sum() * 100, 1)) + '%')
             print(sr1[:top_n], '\n')
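A self-contained sketch of the pooled (group=False) unigram branch above, using plain FreqDist and pandas; the token lists here are illustrative stand-ins for self.lst_tk_lsts:

from nltk import FreqDist
import pandas as pd

# Illustrative stand-in for self.lst_tk_lsts (a list of token lists).
lst_tk_lsts = [['data', 'science', 'data'], ['science', 'rocks']]

tokens = [t for lst in lst_tk_lsts for t in lst if isinstance(t, str)]
dist = pd.Series(FreqDist(tokens)).sort_values(ascending=False)
total = dist.sum()
print(dist.map(lambda x: str(round(x / total * 100, 2)) + '%')[:10])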
Example No. 2
 def __getSpeaketDict(self):
     for speaker in self.speakerData:
         data = self.speakerData[speaker]['statement'].strip()
         self.totalTrainData += data+" "
         self.speakerData[speaker]["words"] = dict(fd(word_tokenize(data)))
         self.speakerData[speaker]["wordCount"] = sum(self.speakerData[speaker]["words"].values())
         self.speakerData[speaker]["classVocabSize"] = len(self.speakerData[speaker]["words"].keys())
         self.uniqueList.append(self.speakerData[speaker]["words"].keys())
Example No. 3
 def list_to_fd_DF(lt, t):
     lt_fd = fd(lt)
     id_ = []
     row = []
     for col in lt_fd:
         row.append([col, lt_fd[col]])
         id_.append(col)
     return (df(
         row,
         index=id_,
         columns=['FileName_{0}'.format(t), 'Count_{0}'.format(t)]))
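A minimal usage sketch for list_to_fd_DF, assuming the aliases the snippet relies on are fd = nltk.FreqDist and df = pandas.DataFrame; the token list is illustrative:

from nltk import FreqDist as fd
from pandas import DataFrame as df

tokens = ['spam', 'eggs', 'spam', 'ham']
counts = list_to_fd_DF(tokens, 'train')
print(counts)  # DataFrame indexed by token, with FileName_train and Count_train columns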
Example No. 4
def fristXpq(sDat, max_len, x=0):
    token = wt(sDat.lower())
    lt = []
    if len(sDat) < max_len:
        max_len = len(sDat)
    for i in range(1, max_len + 1):
        text = []
        if i > 1:
            text = textToWordList(token, i)
        else:
            text = token
        if x == 0:
            fdist = fd(text).most_common()
        else:
            fdist = fd(text).most_common(x)
        fdist = [[w, c] for w, c in fdist]
        #        sfdist=[[fdist[k][0],fdist[k][1]/len(text)] for tp,k in zip(fdist,range(len(fdist)))]
        lt.extend(fdist)
#        lt=fqtopq(lt,len(text))
    return (lt)
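fristXpq leans on two helpers that are not shown in this snippet; a minimal sketch, under the assumption that wt is nltk.word_tokenize and that textToWordList joins consecutive tokens into space-separated n-grams:

from nltk import word_tokenize as wt
from nltk.util import ngrams


def textToWordList(token, n):
    # Assumed helper: turn a token list into space-joined n-grams of length n.
    return [' '.join(gram) for gram in ngrams(token, n)]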
Example No. 5
def guess_lang(text):
    '''Guess the language of the text. This version covers only Spanish,
    German and English, and the text sample needs to be fairly large, but the approach could be extended.'''
    Spanish = udhr.words('Spanish-Latin1')
    German = udhr.words('German_Deutsch-Latin1')
    English = udhr.words('English-Latin1')
    spanfd = fd(Spanish)
    small_spanfd = {}
    gerfd = fd(German)
    small_gerfd = {}
    enfd = fd(English)
    small_enfd = {}

    text_fd = fd(nltk.regexp_tokenize(text.lower(), r'\w+'))

    for key in spanfd.keys():
        if key in text_fd:
            small_spanfd[key] = spanfd[key]

    for key in enfd.keys():
        if key in text_fd:
            small_enfd[key] = enfd[key]

    for key in gerfd.keys():
        if key in text_fd:
            small_gerfd[key] = gerfd[key]

    corwithspan = cor(small_spanfd, text_fd)
    corwithen = cor(small_enfd, text_fd)
    corwithger = cor(small_gerfd, text_fd)

    if abs(corwithspan) == abs(corwithen) == abs(corwithger):
        print("I don't know...")
    elif max(abs(corwithspan), abs(corwithen), abs(corwithger)) == abs(corwithspan):
        print("It's Spanish!")
    elif max(abs(corwithspan), abs(corwithen), abs(corwithger)) == abs(corwithen):
        print("It's English!")
    elif max(abs(corwithspan), abs(corwithen), abs(corwithger)) == abs(corwithger):
        print("It's German!")
Example No. 6
def arrange(devdict,option=0):
    global exportlist
    exportlist = []
    print('getting dict')
    for x,y in devdict.items(): # x is index key, y is value
        #print 'init variables'
        combrole = []
        combtag = []
        combtagfd = {}
        combtag2 = []
        tagcount = 0
        #print 'rolegrab start'
        for z in y[1:]: # grab roles
            # print 'grab roles'
            combrole.append(z[0])
            if z[1] != 'NoTag':
                tagcount += 1
            # print 'grabbing tags'
            for a in z[3:]: # grab tags
                # print a
                combtag.append(a)
        if combtag:
            # print 'getting tag fdist'
            combtagfd = fd(combtag)
            for b, c in combtagfd.items():  # b is tag, c is frequency
                # print b,c
                combtag2.append(b)
                combtag2.append(str(c))
        else:
            combtag = []
            combtag2 = []
        #print [y[0][0]]
        #print combrole
        #print y[0][1:]
        #print combtag2
        if option == 1:
            if len(combrole) > 1:
                combrole = [' | '.join(combrole)]
        elif option == 0:
            if len(combrole) < 54:
                blank = 54 - len(combrole)
                appendix = [''] * blank
                combrole.extend(appendix)
        exportlist.append([y[0][0]]+combrole+y[0][1:]+[str(tagcount)]+combtag2)
        # print 'exported'
    print('done')
Example No. 7
          mode='rb') as f:
    word_tokenized_lowered = pickle.load(f)

with open('{}/data/pos_dicts.pickle'.format(mod_path), mode='rb') as file:
    pos_tag_dict, pos_tag_dict_univ = pickle.load(file)

with open('{}/data/abbrev_dict.pickle'.format(mod_path), mode='rb') as file:
    abbrevs_orig = pickle.load(file)

with open('{}/data/sig_dict.pickle'.format(mod_path), mode='rb') as file:
    sig_dict = pickle.load(file)

brown = word_tokenized_lowered[:1161192]
brown_common = {
    word: log(1161192 / freq)
    for word, freq in fd(brown).most_common(5000)[100:]
}
words = [w for w, freq in fd(brown).most_common()]
names_lower = {w.lower() for w in names.words()}


def expand_EXPN(nsw, i, text, user_abbrevs={}):
    """Expand abbreviations to best possible match. If no close matches,
       return nsw."""
    try:
        if user_abbrevs:
            abbrevs = create_user_abbrevs(user_abbrevs)
        else:
            abbrevs = abbrevs_orig
        if nsw in ['St.', 'st.', 'St']:
            if i < len(text):
Example No. 8
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

documents = [(list(mr.words(fileid)), category)
             for category in mr.categories()
             for fileid in mr.fileids(category)]

random.shuffle(documents)

all_words = []
for w in mr.words():
    all_words.append(w.lower())

all_words = fd(all_words)
word_features = list(all_words.keys())[:3000]


def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features


featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:2500]
testing_set = featuresets[2500:]
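The snippet imports several scikit-learn classifiers but stops at the train/test split; one hedged sketch of how the feature sets might be used is NLTK's SklearnClassifier wrapper (the choice of MultinomialNB here is illustrative):

import nltk
from nltk.classify.scikitlearn import SklearnClassifier

# Wrap an sklearn estimator so it accepts NLTK-style (feature_dict, label) pairs.
mnb_classifier = SklearnClassifier(MultinomialNB())
mnb_classifier.train(training_set)
print('MultinomialNB accuracy:',
      nltk.classify.accuracy(mnb_classifier, testing_set) * 100)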
Example No. 9
 def __genBagOfWords(self):
     self.bagOfWords = dict(fd(word_tokenize(self.totalTrainData.strip())))
     self.vocabSize = len(self.bagOfWords.keys())
     self.totalwords = sum(self.bagOfWords.values())
Example No. 10
        if t.lower() not in stops:
            pos = pos_tag([t])
            clean_word = lemmatizer.lemmatize(t, pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words


# print(tweet_words[0])
clean_words_train = [(clean_tweets(tweet), sentiment)
                     for tweet, sentiment in tweet_words]
clean_words_test = [(clean_tweets(tweet)) for tweet in test_tweet_words]
print(clean_words_train)
all_words = []
for tweet in clean_words_train:
    all_words += tweet[0]
freq = fd(all_words)
common = freq.most_common(200)
features = [i[0] for i in common]


def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features


training_data = [(get_feature_dict(tweet), sentiment)
                 for tweet, sentiment in clean_words_train]
testing_data = [(get_feature_dict(tweet)) for tweet in clean_words_test]
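A possible follow-on, assuming the goal is to train a sentiment classifier on training_data; this uses NLTK's built-in Naive Bayes rather than anything shown in the original snippet:

from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(training_data)
classifier.show_most_informative_features(10)
# Predicted labels for the unlabeled test tweets:
predictions = [classifier.classify(feats) for feats in testing_data]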
Example No. 11
def createVector(filename, last):
    #Lexical
    doc = open('text/'+filename,'r')
    DocString = doc.read()
    data = []
    charactercount = len(DocString)
    alphebetcount = 0
    uppercasecount = 0
    digitcount = 0
    whitespacecount = 0
    tab_and_spacecount = 0
    # Freq Special char

    # WordBased
    totalwordcount = 0
    totalshortwords = 0
    totalcharinwordsC = 0
    avgwordlen = 0
    avg_sent_len_chars = 0
    avg_sent_len_word = 0
    total_dif_words = 0
    freq_once_ocur_words = 0  # Hapax legomena
    freq_twice_ocur_words = 0  # Hapax dislegomena
    Yule_K_measure = 0  # vocab richness def by yule
    Simpson_D_measure = 0  # vocab rich def by simpson
    Sichel_S_measure = 0  # vocab rich by Sichel
    Brunet_W_measure = 0  # vocab rich by Brunet
    Honore_R_measure = 0  # vocab rich by Honoré
    Freq_dist_Wordlen = 0  # need 20 of these
    punclist = [0,0,0,0,0,0,0,0] #, . ? ! : ; ' "
    for x in DocString:
        if x.isalpha():
            alphebetcount+=1
        if x.isdigit():
            digitcount+=1
        if x.isupper():
            uppercasecount+=1
        if x==',':
            punclist[0]+=1
        if x=='.':
            punclist[1]+=1
        if x=='?':
            punclist[2]+=1
        if x=='!':
            punclist[3]+=1
        if x==':':
            punclist[4]+=1
        if x==';':
            punclist[5]+=1
        if x=='\'':
            punclist[6]+=1
        if x=='"':
            punclist[7]+=1

    # A-Z count
    letcur = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    for line in DocString.splitlines():
        tab_and_spacecount+=line.count('\t')
        tab_and_spacecount+=line.count(' ')

        for i in range(0, 26):
            letcur[i] = letcur[i] + line.count(chr(i+65))+line.count(chr(i+97))

    words = word_tokenize(DocString, language='english')
    totalwordcount = len(words)
    temp = [x for x in words if len(x)<4]
    totalshortwords = len(temp)
    for x in words:
        totalcharinwordsC+=len(x)
    avgwordlen=totalcharinwordsC/len(words)
    sents = sent_tokenize(DocString,language='english')
    senttemp = 0
    for x in sents:
        senttemp+=len(x)
    avg_sent_len_chars=senttemp/len(sents)
    wordtemp = 0
    for x in sents:
        wordtemp+=len(word_tokenize(x,language='english'))
    avg_sent_len_word = wordtemp/len(sents)
    freqdist = fd(words)

    total_dif_words=len(freqdist)
    freq_once_ocur_words=len(freqdist.hapaxes())
    freqwords = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    for x in freqdist:
        if freqdist[x]<=20:
            if freqdist[x]>=3:
                freqwords[freqdist[x]-3]+=1
    #richness measures

    # syntactic features: freq punctuation, and function words will be loaded from the research doc


    # structural features
    data+=[charactercount,alphebetcount,uppercasecount,digitcount,whitespacecount,tab_and_spacecount]+letcur+\
          [totalwordcount,total_dif_words,totalshortwords,totalcharinwordsC,freq_twice_ocur_words,freq_once_ocur_words]\
          + freqwords+punclist
    with open('profiles/' + last + '.txt', 'a') as out:
        out.write('\n')
        for a in data:
            out.write(str(a) + ',')

        out.write(" ")
Example No. 12
        elif n:
            middle = n.group(1)
            end = n.group(2)
            exp += infer_spaces(middle) + " " + ends[end]
        else:
            return word
        return exp
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        return word


# Build a cost dict, assuming Zipf's law and cost = -math.log(probability).
brown = word_tokenized_lowered[:1161192]
words = [w for w, freq in fd(brown).most_common()]
wordcost = dict(
    (k, log((i + 1) * log(len(words)))) for i, k in enumerate(words))
maxword = max(len(x) for x in words)


def infer_spaces(s):
    """Uses dynamic programming to infer the location of spaces in a string
    without spaces."""

    # Find the best match for the i first characters, assuming cost has
    # been built for the i-1 first characters.
    # Returns a pair (match_cost, match_length).
    def best_match(i):
        candidates = enumerate(reversed(cost[max(0, i - maxword):i]))
        return min((c + wordcost.get(s[i - k - 1:i], 9e999), k + 1)