def get_word_features(wordlist):

    wordlist = FreqDist(wordlist)

    word_features = wordlist.keys()

    return word_features
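
Note that in NLTK 3, FreqDist.keys() is a plain dict view: it yields the distinct words but is no longer sorted by frequency. If ranked features are wanted, most_common is the safer call. A minimal sketch (the function name and sample list are illustrative):

from nltk import FreqDist

def get_ranked_word_features(wordlist, n=None):
    # most_common() yields (word, count) pairs in decreasing order of frequency
    freq = FreqDist(wordlist)
    return [word for word, _ in freq.most_common(n)]

print(get_ranked_word_features(["spam", "ham", "spam", "eggs", "spam"], n=2))
# ['spam', 'ham']  (ties keep first-seen order)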
Example #2
def main():
    keyword_list = ["Top Secret", "Secret Service", "Classified", "Targeted", "Assassination",
                    "Kill Program", "NSA", "wire", "CIA", "FBI", "DEA", "DOJ", "hackers",
                    "hacker", "exploit code", "Defense", "Intelligence", "Agency"]
    file_name = "tweets_output.txt"
    pickle_words_file = "words.pickle"
    pickle_words(file_name, pickle_words_file, keyword_list)
    pickle_tweets_file = "tweets.pickle"
    pickle_tweets(file_name, pickle_tweets_file)
    words = load(open("words.pickle"))
    tweets = load(open("tweets.pickle"))
    freq_dist = FreqDist(words)
    print tweets
    print("===")
    print("Conducting Frequency and Lexical Diversity Analysis of Twitter Search Space: ")
    print("===")
    print("Number of words within the twitter search space: ")
    print(len(words))
    print("Number of unique words within twitter search space: ")
    print(len(set(words)))
    print("Lexical Diversity of unique words within twitter search space: ")
    print(lexical_diversity(words))
    print("===")
    print("Conducting Native Language Processing Analysis Utilizing Python NLTK")
    print("===")
    print("Top 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[:50])
    print("===")
    print("Bottom 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[-50:])
    print("===")
Example #3
    def work_1():
        file_string = ""
        txt_file = open("trabalho1.txt", "r+")
        csv_file = open("trabalho1.csv", "w+")
        csv_manage = csv.writer(csv_file,
                                delimiter=";",
                                quoting=csv.QUOTE_MINIMAL)
        base_text = txt_file.read()
        tokens = word_tokenize(base_text)
        frequency = FreqDist(tokens)

        print("Text: {0}".format(base_text))

        print("Total words: {0}".format(frequency.N()))
        print("Total terms: {0}".format(len(frequency.keys())))
        print("")

        print("Term Frequency Table")
        print("")

        for key in frequency.keys():
            csv_manage.writerow([key, str(frequency.get(key))])
            print("Term: {0}  Total: {1}".format(key,
                                                 str(frequency.get(key))))

        pdfOutput = PdfOutput(frequency, frequency.N(), len(frequency.keys()),
                              base_text)
        servicePdfManager = ServiceManagerPdf()
        servicePdfManager.writePdf(pdfOutput)

        txt_file.close()
        csv_file.close()
Example #4
def prepare_pos_features(Language_model_set, output_file):
    corpus_root = '/home1/c/cis530/data-hw2/' + Language_model_set
    texts = PlaintextCorpusReader(corpus_root, '.*')
    text = texts.words()
    tagged_text = nltk.pos_tag(text)
    merged_tag_text = mergeTags(tagged_text)
    lists = seperate_pos(merged_tag_text)
    nouns_dist = FreqDist(lists[0])
    top_nouns = [w for w, _ in nouns_dist.most_common(200)]
    verbs_dist = FreqDist(lists[1])
    top_verbs = [w for w, _ in verbs_dist.most_common(200)]
    advs_dist = FreqDist(lists[2])
    top_advs = [w for w, _ in advs_dist.most_common(100)]
    prep_dist = FreqDist(lists[3])
    top_preps = [w for w, _ in prep_dist.most_common(100)]
    adjs_dist = FreqDist(lists[4])
    top_adjs = [w for w, _ in adjs_dist.most_common(200)]


    out = open(output_file, 'w')

    for n in top_nouns:
        out.write('NN'+ n + '\n')
    for v in top_verbs:
        out.write('VV'+ v + '\n')
    for av in top_advs:
        out.write('ADV'+ av + '\n')
    for p in top_preps:
        out.write('PREP'+ p + '\n')
    for aj in top_adjs:
        out.write('ADJ'+ aj + '\n')
Example #5
def frequent_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    freq_dist = FreqDist(all_words)
    x = transformer.transform(
        word.replace("_", " ") for word in freq_dist.keys())
    words_df = pd.DataFrame({
        'word': list(freq_dist.keys()),
        'count': list(freq_dist.values()),
        'vector': list(x)
    })
    good = []
    bad = []
    for i in range(1, len(words_df)):
        if (nb.predict(words_df.at[i, 'vector']) == 5):
            good.append([
                words_df.at[i, 'count'], words_df.at[i,
                                                     'word'].replace(" ", "_")
            ])
        else:
            bad.append([
                words_df.at[i, 'count'], words_df.at[i,
                                                     'word'].replace(" ", "_")
            ])
    good = sorted(good, key=lambda x: x[0], reverse=True)
    bad = sorted(bad, key=lambda x: x[0], reverse=True)
    return format_result(good, bad, terms)
def generate_vocab(tokens: list, min_token_len: int = 2, threshold: int = 2, remove_numbers=True):
    freq_dist = FreqDist(tokens)
    if remove_numbers:
        remove_digit_tokens(freq_dist)
    tokens = preprocess_tokens(tokens=list(freq_dist.keys()), min_token_len=min_token_len)
    removed_tokens = set(freq_dist.keys()).difference(tokens)
    for t in removed_tokens:
        freq_dist.pop(t, None)
    for t in tokens:
        if freq_dist[t] < threshold:
            freq_dist.pop(t, None)
    return freq_dist
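
The vocabulary-pruning idea above can be sketched with FreqDist alone, assuming the goal is to keep tokens of a minimum length, drop purely numeric tokens, and require at least `threshold` occurrences (a standalone illustration, not the project's helpers):

from nltk import FreqDist

def simple_vocab(tokens, min_token_len=2, threshold=2):
    # count only tokens that are long enough and not pure digits
    freq = FreqDist(t for t in tokens if len(t) >= min_token_len and not t.isdigit())
    # keep tokens that clear the frequency threshold
    return {t: c for t, c in freq.items() if c >= threshold}

print(simple_vocab(["to", "be", "or", "not", "to", "be", "42", "a"]))
# {'to': 2, 'be': 2}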
Example #7
 def extract_most_common_words(self, words, sentiment):
     word_freq = FreqDist(words)
     print("for the sentiment", sentiment)
     print("there are", len(word_freq.keys()), "different words")
     print("that were used", sum(word_freq.values()), "times")
     df = pd.DataFrame({
         f'{sentiment}_words': list(word_freq.keys()),
         f'{sentiment}_counts': list(word_freq.values())
     })
     df = df.nlargest(self.n_words, columns=f'{sentiment}_counts')
     df.reset_index(drop=True, inplace=True)
     return df, len(word_freq.keys()), sum(word_freq.values())
Example #8
    def handle(self, *args, **options):
    	fdist = FreqDist()
    	print "Analyzing raw data"
    	limit = 10
    	if args:
    		raw_datas = RawData.objects.filter(pk__in=args)
    	else:
	   		raw_datas = RawData.objects.all()[:limit]
    	tagged_data = []
    	for raw_data in raw_datas:
    		words = nltk.word_tokenize(raw_data.data)
    		tagged_data.extend(nltk.pos_tag(words))
    		for word in words:
    			word = word.strip()
    			if word:
	    			fdist.inc(word)

    	print "Anaylzed %s items" % len(raw_datas)
    	print

    	print "Top word: %s" % fdist.max()
    	print 

    	print "Top 10 words"
    	for word in fdist.keys()[:10]:
    		times = fdist[word]
    		print " -- %s occurred %s times" % (word, times)
    	print

    	
    	print "Bottom 10 words"
    	for word in fdist.keys()[-10:]:
    		times = fdist[word]
    		print " -- %s occurred %s times" % (word, times)
    	print

    	print "Words occurring between 50-100 times"
    	words = [ word for word in fdist.keys() if fdist[word] >= 50 and fdist[word] <= 100 ]
    	print ", ".join(words)


    	cfdist = ConditionalFreqDist()
    	for (word, tag) in tagged_data:
    		cfdist[tag].inc(word)
    	
    	print "Most popular noun: %s" % cfdist["NN"].max()
    	print 

    	print "Top 50 nouns"
    	for word in cfdist["NN"].keys()[:50]:
    		times = cfdist["NN"][word]
    		print " -- %s occurred %s times" % (word, times)
    	print
def term_ratio(tf1: FreqDist, tf2: FreqDist, c=None, normalize=False):
    if normalize:
        if c is None:
            c = 1e-4
        return {
            word: (tf1[word] / tf1.N()) / (tf2[word] / tf2.N() + c)
            for word in tf1.keys()
        }
    else:
        if c is None:
            c = 1
        return {word: tf1[word] / (tf2[word] + c) for word in tf1.keys()}
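
A small usage sketch for term_ratio: with normalize=True it compares relative frequencies (smoothed by c), otherwise raw counts; values above 1 flag words over-represented in tf1 relative to tf2. The two distributions below are made up.

from nltk import FreqDist

tf_spam = FreqDist("win cash now win prize".split())
tf_ham = FreqDist("see you now at the meeting".split())

ratios = term_ratio(tf_spam, tf_ham)  # raw counts, c defaults to 1
print(sorted(ratios.items(), key=lambda kv: kv[1], reverse=True)[:3])
# e.g. [('win', 2.0), ('cash', 1.0), ('prize', 1.0)]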
Example #10
def entropy(tokens):
    """
    Get the Shannon entropy of a document using its token distribution.
    :param tokens: A document represented as a list of tokens.
    :return: The Shannon entropy of the document, in bits.
    """
    doc_len = len(tokens)
    frq = FreqDist(tokens)
    for key in frq.keys():
        frq[key] /= doc_len
    ent = 0.0
    for key in frq.keys():
        ent += frq[key] * math.log(frq[key], 2)
    ent = -ent
    return ent
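
A quick check of the entropy helper above (the token lists are illustrative, and math and FreqDist are assumed to be imported for the function itself): a document whose tokens are all distinct reaches the maximum log2(N), while a document that repeats a single token carries no information.

print(entropy(["a", "b", "c", "d"]))   # 2.0 bits: four equally likely tokens
print(entropy(["a", "a", "a", "a"]))   # -0.0: one repeated token, zero entropy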
Example #12
def parse(filename):
    outfilename = filename + ".freq"
    entry_string = open(filename, 'r').read()

    # convert to lower case
    entry_string = entry_string.lower()

    # remove punctuation
    for c in string.punctuation:
        entry_string = entry_string.replace(c, " ")

    # remove everything except letters and spaces
    entry_string = re.sub("[^a-z ]", " ", entry_string)

    # strip out multiple spaces
    entry_string = re.sub(r'\s+', r' ', entry_string)

    # make the string into a list and remove stopwords from it
    entry_string_split = entry_string.split()
    entry_string_no_stopwords = remove_stopwords(entry_string_split)

    fd = FreqDist(entry_string_no_stopwords)

    fout = open(outfilename, "w")
    sys.stdout.write(outfilename + "\n")
    fout.write(" ".join(fd.keys()))
    fout.close()
Example #13
def features(word_list):
	freq = FreqDist(word_list)
	f = [w for w, _ in freq.most_common(8)]  # the eight most frequent words
	return {
		'biology': 'biolog' in word_list,
		'engineering': 'engin' in word_list,
		'animal' : 'anim' in word_list,
		'behavior': 'behavy' in word_list,
		'chemistry': 'chem' in word_list,
		'health': 'heal' in word_list,
		'physics': 'phys' in word_list,
		'math': 'math' in word_list,
		'plant': 'plant' in word_list,
		'earth': 'earth' in word_list,
		'biochemistry': 'biochem' in word_list,
		'social': 'soc' in word_list,
		'planet': 'planet' in word_list,
		'temperature': 'temperature' in word_list,
		'blood': 'blood' in word_list,
		'tube': 'tube' in word_list,
		'pyschology': 'pyscholog' in word_list,
		'protein': 'protein' in word_list,
		'gene': 'gen' in word_list,
		'most_0': f[0],
		'most_1': f[1],
		'most_2': f[2],
		'most_3': f[3],
		'most_4': f[4],
		'most_5': f[5],
		'most_6': f[6],
		'most_7': f[7],
		}
Example #14
def generate_corpus(folder_name, top, n):
    '''Generate the corpus of words used as the vocabulary. If top is True,
    the corpus is restricted to the n most frequent tokens.'''

    lower = True  # activates lowercase tokens
    subfolders = [i for i in os.listdir(folder_name)]  # iterate through the subfolders
    corpus_list = []
    for i in subfolders:
        for v in os.listdir(folder_name + "/" + i):
            text = open_text(i, folder_name, v, lower)
            corpus_list += [i for i in text]

    corpus_freqs = FreqDist(corpus_list)
    sorted_x = sorted(corpus_freqs.items(),
                      key=operator.itemgetter(1),
                      reverse=True)
    if top == True:
        topn_words = {}
        for i in sorted_x[:n]:
            topn_words[i[0]] = 0
        vocabulary = list(sorted(topn_words.keys()))
        return topn_words, vocabulary  #empty  topn dictionary to be used to populate vectors and vocabulary for columns

    else:
        vocabulary = list(sorted(corpus_freqs.keys()))
        corpus = {str(i): 0 for i in sorted(vocabulary)}
        return corpus, vocabulary
Example #15
def analyze(inputfile):

    file = open(inputfile, "rt")

    text = file.read()
    file.close()

    # split into words
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    junk_words = ['nt']
    words = [w for w in words if not w in junk_words]
    print(words[:100])

    freqDist = FreqDist(words)
    words = list(freqDist.keys())

    print(freqDist.plot(50))
def compress_term_matrix(matrix, words):
    initials = [item[0] for item in words]
    
    fdist = FreqDist(initials)
    
    letterindices = []
    for letter in sorted(fdist.keys()):
        letterindices.append((letter, fdist[letter]))
    
    indexmatrix = []
    start = 0
    for letter, occ in letterindices:
        newocc = occ / 5
        
        print letter,"  ",occ
        print " range: ", start,"  ", start+occ,"  ",newocc
        indexes = np.random.random_integers(start, start+occ, newocc)
        indexmatrix.append((letter, indexes.tolist()))
        start = start+ occ
    
    allindices = []
    for _,v in indexmatrix:
        allindices.extend(v)
    smatrix = matrix[allindices, :]
    return indexmatrix, smatrix                
Example #17
def preprocess_data(train_file):

    # Read the trainset
    data_train = pd.read_csv(train_file, header=0)
    X_train = data_train.mr.tolist()  # convert the mr part to a list
    y_train = data_train.ref.tolist()  # convert ref part to a list

    # Preprocess
    X_train_seq, y_train_seq, dico_ = mr2oh(
        X_train,
        y_train)  # convert train and test sets into lists of slots and values.
    y_train = [proc_text(y) for y in y_train_seq]  # process text
    X_train = [proc_text(y) for y in X_train_seq]  # process text
    dist = FreqDist(np.concatenate(y_train + X_train))
    i_to_w = list(dist.keys())  # create a list to convert index to words
    i_to_w.insert(0, '-PADDING-')
    i_to_w.insert(2, '<STOP>')
    w_to_i = {word: idx
              for idx, word in enumerate(i_to_w)
              }  # dictionary that converts words to their corresponding index

    X_train_oh = ref2oh(
        X_train, w_to_i)  # convert words in ref sentences into their indexes
    y_train_oh = ref2oh(
        y_train, w_to_i)  # convert words in ref sentences into their indexes

    return X_train_oh, y_train_oh, i_to_w, dico_
Example #18
def getUniqueWords(subredditname):
    wordfile_path = datadirectory + "/ProcessedData/" + subredditname + "_words" + ".txt"

    set_of_words = set()
    freq_subreddit = FreqDist()

    if not path.exists(wordfile_path):
        for datafile in getTextFileNames(subredditname):
            if path.exists(datafile):
                print("reading " + datafile)
                freq_subreddit = collectFreqData(datafile) + freq_subreddit
            else:
                print("no data for " + datafile)

        for i in freq_subreddit.most_common(20):
            print(i)

        with open(wordfile_path, "a+") as wordfile:
            for word in freq_subreddit.keys():
                word = word.strip()
                word = word.lower()
                set_of_words.add(word)
                wordfile.write(word + "\n")
        return set_of_words
    else:
        with open(wordfile_path, "r") as wordfile:
            #read line by line
            print("reading " + wordfile_path)
            for word in wordfile:
                word = word.strip()
                word = word.lower()
                set_of_words.add(word)
            return set_of_words
Example #19
def doNLTK(play):
    # Initialize NLTK object:
    toks = word_tokenize(play)
    full_text = nltk.Text(toks)
    context = nltk.text.ContextIndex(toks) # Yes, this has similar_words, this is what we need!

    allwords = []
    # print(full_text.concordance('madness')) # No need to print. Returns None, like similar().
    # print(full_text.similar('death'))
    fdist = FreqDist(full_text)
    # commons = fdist.most_common(250)
    commons = [f for f in fdist.keys() if fdist[f] > 8] # Can also check it's not a stop word here
    commons_str = '  '.join(commons)
    commons_toks = word_tokenize(commons_str)
    commons_tags = nltk.pos_tag(commons_toks)
    # Hideous -- figure out regex:
    commons_imp = [(c[0]) for c in commons_tags if (c[1] == 'NN') or (c[1] == 'NNP') or ('VB' in c[1]) or ('JJ' in c[1])]
    commons_imp_nostop = [c for c in commons_imp if c.lower() not in stop_words]
    # print(commons_imp_nostop)

    for w in commons_imp_nostop:
        # x = full_text.ContextIndex.similar_words(w) # What? Why does this return None but just print?
        x = context.similar_words(w)
        # print(x)
        for idx, x in enumerate(context.similar_words(w)):
            if x.lower() not in stop_words and x not in allwords:
                allwords.append(x.lower())
                print('{} is similar to {} by degree {}'.format(w, x.lower(), idx))
Example #20
def find_abbreviations():
    import db
    from tokenizers import es
    from nltk import FreqDist

    corpus = db.connect()
    #text = '\n'.join([a['text'] for a in corpus.articles.find().limit(10)])
    text = '\n'.join([a['text'] for a in corpus.articles.find()])
    tokens = es.tokenize(text, ignore_abbreviations=True)

    fd = FreqDist()
    fd_abbr = FreqDist()
    fd_n_abbr = FreqDist()
    n_tokens = len(tokens)
    for i in range(n_tokens):
        fd.inc(tokens[i])
        if i < (n_tokens - 1) and tokens[i + 1] == u'.':
            fd_abbr.inc(tokens[i])
        else:
            fd_n_abbr.inc(tokens[i])

    adjusted = {}
    f_avg = len(fd.keys()) / fd.N()
    for t, n in fd_abbr.iteritems():
        f = fd.get(t, 0) / fd.N()
        deviation = 1 + (f - f_avg)
        adjusted[t] = n * deviation / fd_n_abbr.get(t, 1) / len(t)

    items = adjusted.items()
    items.sort(key=lambda i: i[1], reverse=True)
    for t, n in items[:100]:
        print u'%s. %f (%d, %d)' % (t, n, fd_abbr[t], fd_n_abbr.get(t, 0))
Example #21
class FrequenceVocabulary:
    """
    Vocabulary that holds word frequencies estimated from
    word counts in the specified files.
    """
    def __init__(self, miss_f):
        """
        Construct a new vocabulary with a function that computes the word
        probability for words that are absent from the vocabulary. Example usage:

            >>> miss_f = lambda key, N: 10. / (N * 10 ** len(key))

        :param miss_f: function for estimating probability of missing words.
        """
        self.vocab = FreqDist()
        self._miss_f = miss_f

    def load_vocab(self, root='.', files='.*'):
        """
        Load new vocabulary.

        :param root: the root directory for the corpus.
        :param files: A list or regexp specifying the files in this corpus.
        """
        voc = PlaintextCorpusReader(root, files)
        for word in voc.words():
            self.vocab[word.lower()] += 1

    def p(self, key):
        """
        :param key: word to compute the probability for.
        :return: The probability estimated for key.
        """
        return (1. * self.vocab[key] / self.vocab.N() if key in self.vocab
                else self._miss_f(key, self.vocab.N()))
Example #22
def resume_skills(input_skills):
    # Bigrams and trigrams identifier
    ''.join(input_skills)
    bigrams_present = []
    trigrams_present = []
    s1 = []
    s1.append(input_skills)
    for phrase in s1:
        bigrams_present.extend([" ".join(bi) for bi in ngrams(phrase.lower().split(), 2)])
        trigrams_present.extend([" ".join(tri) for tri in ngrams(phrase.lower().split(), 3)])

    all_grams = set(bigrams_present).union(set(trigrams_present))
    soft_skills_present = soft_skills.intersection(all_grams)
    # print(soft_skills_present)


    tokenized_data = word_tokenize(input_skills)

    tokenized_data = [word for word in tokenized_data if word not in stop_words]
    tagged = pos_tag(tokenized_data)
    nouns = [word for word,pos in tagged if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS' or pos == 'JJ' or pos == 'VBP')]
    nouns.extend(list(soft_skills_present))
    test_data_freq = FreqDist(nouns)
#     print(' '.join(test_data_freq.keys()))
    skills_present = test_data_freq.keys()

    return skills_present
def count_difficult_items(items, min_length=4, min_freq=2):
    freq_dist = FreqDist(items)
    keys = freq_dist.keys()
    return len([
        key for key in keys
        if len(key) >= min_length and freq_dist[key] <= min_freq
    ])
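
count_difficult_items treats an item as difficult when it is long (at least min_length characters) and rare (at most min_freq occurrences). A small illustrative call, assuming the definition above:

words = ["serendipity", "cat", "cat", "serendipity", "perspicacious", "dog"]
print(count_difficult_items(words, min_length=4, min_freq=2))
# 2 -> 'serendipity' (length 11, frequency 2) and 'perspicacious' (length 13, frequency 1)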
Example #24
    def GetAllWords(self, content):
        ''' get all words appear in content
            - content: input string
            - Returns a set of all words
        '''
        rawTokens = nltk.word_tokenize(content)

        alphabeticalTokens = [w for w in rawTokens if w.isalpha()]
        del rawTokens
        lowerTokens = [w.lower() for w in alphabeticalTokens]
        del alphabeticalTokens
        stopwords = nltk.corpus.stopwords.words('english')
        tokens = [w for w in lowerTokens if w not in stopwords]

        del lowerTokens
        del stopwords

        lemmatizer = nltk.WordNetLemmatizer()
        lemmatizedTokens = [lemmatizer.lemmatize(t) for t in tokens]

        tokenDist = FreqDist(lemmatizedTokens)
        allWords = set(tokenDist.keys())
        if allWords == None:
            return set()
        return allWords
Example #26
def term_frequency():
    for id in doc_id:
        print("start tf : ", id)
        # seperate corpus on basis of id
        text = soup.find(id=id).get_text()
        # basic pre-processing using clear_text method
        words = clear_text(text)
        unigram = get_ngram(words, 1)
        doc_token = FreqDist(unigram)
        structure[id] = doc_token.keys()
        for x in list(unique.keys()):
            if x in list(doc_token.keys()):
                tf[id, x] = doc_token[x] / len(doc_token)
            else:
                tf[id, x] = 0
    return tf, structure
Example #27
def freq_words(x, terms = 30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()

    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})
    return words_df
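
A short usage sketch for freq_words, assuming pandas is imported as pd in the same module; x can be any iterable of text strings (the documents below are made up):

docs = ["the cat sat", "the dog sat down"]
words_df = freq_words(docs)
print(words_df.sort_values('count', ascending=False).head(3))
# 'the' and 'sat' appear twice, every other word once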
def draw_word2vec():
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load("output/data_cache.pk")
    print "Read in finished"

    ### Load pre-train word2vector model
    word2vec = get_word2vec(model="data/GoogleNews-vectors-negative300.bin", binary=True, size=300)
    print "Pretrained word2vec loaded"

    all_tokens = sum(dataloader.data.viewvalues(), [])
    print "#Tokens: " + str(len(all_tokens))
    fdist = FreqDist(all_tokens)
    tokens = fdist.keys()[1:500]
    print tokens
    tokens_has_vectors = []
    for token in tokens:
        if word2vec[token] is not None:
            tokens_has_vectors.append(token)

    print "#Unique Tokens \w Vectors: " + str(len(tokens_has_vectors))
    vectors = word2vec.encode(tokens_has_vectors)
    print "#Unique Vectors: " + str(len(vectors))

    print ("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=2000)
    # clf = manifold.Isomap(n_components=2, max_iter=100)
    vectors_mds = clf.fit_transform(vectors)
    print ("Done. Stress: %f" % clf.stress_)
    plot_embedding(vectors_mds, tokens_has_vectors, "MDS embedding of the words")
Example #29
def append_terms(doc, terms, data, minterm_doc,vector):

	tf = FreqDist(terms)
	max_tf = max(tf.values()if len(tf)>0 else [0])

	for term in tf.keys():
		normalize_tf = tf[term] / max_tf
		new_doc = {'tf': normalize_tf,
				   'weight': 0,
				   'minterm':minterm_doc}
		in_data = False
		for term_data in data:
			if term == term_data['key']:

				#update
				term_data['value']['documents'][doc] = new_doc
				in_data = True
				break

		if not in_data:
			# add
			data.append({'key': term,
						 'value': {'idf':0,
									'documents': {doc:new_doc},
									'index_in_vector': vector[term]}})
def answer_four():

    from nltk import FreqDist
    dist = FreqDist(text1)
    vocab = dist.keys()
    freqwords = sorted([w for w in vocab if len(w) > 5 and dist[w] > 150])
    return freqwords
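
answer_four assumes a token sequence named text1 is already in scope; in the usual NLTK setup this is the first example text from nltk.book (a sketch, assuming the NLTK book data has been downloaded):

from nltk.book import text1   # loads the example texts; text1 is Moby Dick

print(answer_four())  # alphabetically sorted words longer than 5 chars occurring more than 150 times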
Example #31
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories() for f in
mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print classify.accuracy(classifier, train_set)
Example #32
   def top(self, tokens, lowest_rank=50):
      ''' A list of the most frequent (non-stopword) tokens '''
      from operator import itemgetter
      content = self.words(tokens)

      fdist = FreqDist(content)
      vocab = iter(fdist.keys())

      # Forget all previous ranking
      self.lower_words = {}
      frequency = 0
      while frequency < lowest_rank:
         try:
            word = vocab.next()
         except StopIteration:
            break

         word_lower = word.lower()
         if word_lower in self.lower_words:
            self.lower_words[word_lower] = self.lower_words[word_lower] + fdist[word]
         else:
            self.lower_words[word_lower] = fdist[word]

         frequency = frequency + 1

#      return sorted(self.lower_words, key=itemgetter(1), reverse=True)
      return map(itemgetter(0), sorted(self.lower_words.items(), key=itemgetter(1), reverse=True))
Example #33
def analysis(dataset, topic_list):
    '''
        start with some data analysis on Review Text and Review Title
        applying the bag of words approach first
    '''
    # remove stopwords, punctuation and symbols
    dataset['Review Text'] = dataset['Review Text'].str.replace(
        "[^a-zA-Z#]", " ")
    # remove short words (length < 3)
    dataset['Review Text'] = dataset['Review Text'].apply(
        lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 2]))
    all_reviews = [
        remove_stop_words(words.split(" ")) for words in dataset['Review Text']
    ]
    lemmatizer = WordNetLemmatizer()
    all_words = ' '.join([lemmatizer.lemmatize(word)
                          for word in all_reviews]).split()
    '''
        Plotting the top 30 words of highest frequency 
    '''
    freq_dist = FreqDist(all_words)
    words_distribution = pd.DataFrame({
        'word': list(freq_dist.keys()),
        'count': list(freq_dist.values())
    })
    top_words_distribution = words_distribution.nlargest(
        columns='count', n=30)  # want to view top 30 words

    #plot the output
    plt.figure(figsize=(50, 10))
    ax = sns.barplot(data=top_words_distribution, x="word", y="count")
    ax.set(ylabel='Count')
    plt.show()

    return top_words_distribution, dataset
Example #34
def featureExtraction(sentence):

    # feature 1 -> tagged input
    features = {'taggedInput': tagInput(sentence), 'bow': {}}

    # bag of words

    # words not to include
    exclude_words = stopwords.words('english')
    for c in [".", "?", "!", ","]:
        exclude_words.append(c)
    arr_all_words = list(
        set([w for w in word_tokenize(sentence) if w not in exclude_words]))

    if os.path.isfile(CONSTANTS.BOW_PATH):
        for w in joblib.load(CONSTANTS.BOW_PATH):
            if w not in arr_all_words and w != None:
                arr_all_words.append(w)
        joblib.dump(arr_all_words, CONSTANTS.BOW_PATH)
    else:
        joblib.dump(arr_all_words, CONSTANTS.BOW_PATH)
    all_words = FreqDist(w.lower() for w in arr_all_words)
    word_features = [w for w, _ in all_words.most_common(2000)]
    document_words = set(word_tokenize(sentence))
    bow = {}
    for word in word_features:
        bow['contains(%s)' % word] = (word in document_words)
    features['bow'] = bow

    # add other features here... v
    return bow
Example #35
    def build_distribution_matrix(self, stems):
        distrib_matrix_filename = '{0}_distrib_matrix.txt'.format(self.db_name)
        if os.path.isfile(distrib_matrix_filename):  # load matrix from file
            self.log(
                'Loading existing distribution matrix from {0}'.format(
                    distrib_matrix_filename), logging.INFO)
            distrib_matrix = dict()
            with open(distrib_matrix_filename, 'rt') as f:
                csvrreader = csv.DictReader(f,
                                            delimiter=' ',
                                            lineterminator=self.linesep)
                for row in csvrreader:
                    distrib_matrix.update({row['w']: row['P(w|M)']})
                f.close()
        else:  # create matrix and save file
            self.log(
                'Creating new distribution matrix into {0}. Please wait, this may take some time'
                .format(distrib_matrix_filename), logging.INFO)
            distrib_matrix = FreqDist(stems)

            with open(distrib_matrix_filename, 'wt') as f:
                writer = csv.DictWriter(f,
                                        fieldnames=['w', 'P(w|M)'],
                                        delimiter=' ',
                                        lineterminator=self.linesep)
                writer.writeheader()
                for k in distrib_matrix.keys():
                    writer.writerow({'w': k, 'P(w|M)': distrib_matrix[k]})
                f.close()

        distrib_matrix = Discretizer.reduce_distribution_matrix(distrib_matrix,
                                                                cutoff=1)
        return distrib_matrix
class BrownDataset(object):
    def __init__(self, include_start=True):
        self.words = [word.lower() for word in brown.words()]
        self.total_word_cnt = len(
            self.words) + 2 * len(brown.sents())  # include START and END
        if include_start:
            self.words.append(u'START')
        self.words.append(u'END')
        self.vocab = set(self.words)

        self.vocab_len = len(self.vocab)
        self.word_to_idx = dict(zip(list(self.vocab), range(self.vocab_len)))

        self.sentences = []
        self.bigrams = []
        self.unigrams = []
        for sent in brown.sents():
            sentence = [word.lower() for word in sent]
            if include_start:
                sentence.insert(0, u'START')
            sentence.append(u'END')
            self.sentences.append(sentence)
            self.bigrams.extend(list(ngrams(sentence, 2)))
            self.unigrams.extend(sentence)

        self.unigram_freq = dict(Counter(self.unigrams))

        self.num_sentences = len(self.sentences)
        self.bigram_cnt = FreqDist(self.bigrams)
        self.bigram_len = len(self.bigram_cnt)
        self.bigram_idx = dict(
            zip(self.bigram_cnt.keys(), range(self.bigram_len)))
        self.bigram_freq = np.asarray(list(self.bigram_cnt.values()))
        self.num_bigrams = len(self.bigram_cnt)
Example #37
def bag_of_words(data, label_codebook, feature_codebook, theta):
    """"""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern="\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
                    
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist(all_words)
    word_feature = [w for w, _ in fdict.most_common()[theta:]]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)
    
    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern="\w+"))
            indice = 0
            
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list
Example #38
def bigram(DATA):
    bigramWordBook = []
    bigramDict = {}
    for comment in DATA:
        if 'text' in comment:
            # stop_words = set(stopwords.words('english'))
            # clean_text = [w for w in comment["text"] if not w in stop_words]
            # bigrams = ngrams(clean_text, 2)

            bigrams = ngrams(comment["word"], 2)
            bigramDist = FreqDist(bigrams)
            bigramWordBook.append(bigramDist)
            for key in bigramDist.keys():
                if key in bigramDict.keys():
                    bigramDict[key] = bigramDict[key] + bigramDist[key]
                else:
                    bigramDict[key] = bigramDist[key]
    bigramDict = {
        key: value
        for key, value in bigramDict.items() if value > 100
    }
    # print(len(bigramDict))
    for key in bigramDict:
        bigramDict[key] = 0
    for comment, bigrams in zip(DATA, bigramWordBook):
        Copy_BiWB = copy.deepcopy(bigramDict)
        for key in bigrams:
            if key in Copy_BiWB:
                Copy_BiWB[key] += 1
        comment['BigramWordBook'] = Copy_BiWB

    return DATA
def main():
    userInput = parser.getInput()
    fileList = parser.getFiles(userInput['train'])
    pdata = parser.parseFiles(fileList)





    allsent = ''
    for f in pdata:
        allsent += f[3]

    all_words = FreqDist(w.lower()
                    for w in word_tokenize(allsent)
                        if w not in stopwords.words('english') )

    global top_words
    top_words = all_words.keys()[:500]

    # pdata = getParseData()
    featdata = featureAggregator(pdata)







    print featdata[:10]
Example #40
def plot_postives(s=0, e=50):
    all_words = ' '.join([text for text in df['review']])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })

    # select the most frequent words (n is capped at the number of reviews)
    d = words_df.nlargest(columns="count", n=len(df['review']))
    d.reset_index(inplace=True)

    d['pos_perc'] = np.nan
    for tag in d['word'].values:
        ret = df[df['review'].str.contains(tag)]
        pos_perc = ret[ret['prediction'] ==
                       'pos'].shape[0] / ret.shape[0] * 100
        neg_perc = 100 - pos_perc
        d.loc[(d['word'] == tag), 'pos_perc'] = pos_perc
    d = d.sort_values('pos_perc', ascending=False)
    plt.figure(figsize=(20, 5))
    sns.barplot(data=d[s:e], x='word', y='pos_perc')
    if (e - s > 60):
        plt.xticks(rotation=90)
    else:
        plt.xticks(rotation=45)
    plt.xticks()
    plt.title('Percentage of Positive Reviews per tag.')
    plt.show()
    def getLongTermsRanked(self,
                           minLen=7.0,
                           numberMostCommons=30,
                           display=False):
        result = []
        resultDocuments = {}
        for seoDocument in self.seoLibrary.seoDocuments:
            tokenList = list(
                set(
                    seoDocument.getTextTokens(removeSplitter=True,
                                              lemmatize=True)))
            for token in tokenList:
                if len(token) > minLen:
                    result.append(token)
                    if token not in resultDocuments:
                        resultDocuments[token] = [seoDocument.order]
                    else:
                        resultDocuments[token].append(seoDocument.order)

        fdist = FreqDist(result)

        for token in fdist.keys():
            fdist[token] = fdist[token] * self.getRankingModifier(
                numpy.mean(resultDocuments[token])) * self.getLengthModifier(
                    len(token), minLen)

        maxValue = max(fdist.values())

        return [(word, int(metric * 100.00 / maxValue))
                for word, metric in fdist.most_common(numberMostCommons)]
Example #42
 def train_finder(self, all_listings):
     """
     Train the product identification algorithm with example data.
     """
     logging.info("Start training of recognizer for product: {0}"
                  .format(self.product_id))
     self.classifier = None
     
     #select example listings for the finder's product
     listings, n_pos, n_neg = self.filter_trainig_samples(all_listings)
     logging.info("Number listings: {l}, positive: {p}, negative: {n}; "
                  "features: {f}"
                  .format(l=len(listings), p=n_pos, n=n_neg,
                          f=self.n_features))
     if len(listings) < 30:
         logging.warn("Product {0}. Can't compute classifier. "
                      "Too few listings."
                      .format(self.product_id))
         return
     elif n_pos < 10:
         logging.warn("Product {0}. Can't compute classifier. "
                      "Too few positive listings."
                      .format(self.product_id))
         return
     elif n_neg < 10:
         logging.warn("Product {0}. Can't compute classifier. "
                      "Too few negative listings."
                      .format(self.product_id))
         return
     
     #Create list of most common words, and put it into feature extractor
     #TODO: remove stop-words
     self.feature_extractor = FeatureExtractor()
     word_freqs = FreqDist()
     for _, listing in listings.iterrows():
         words = self.feature_extractor.extract_words(listing)
         word_freqs.update(words)
     common_words = [w for w, _ in word_freqs.most_common(self.n_features)]
     self.feature_extractor = FeatureExtractor(common_words)
     logging.debug("Number individual words: {0}; hapaxes: {1}"
                   .format(len(word_freqs), len(word_freqs.hapaxes())))
     logging.debug("Most common words: {}".format(word_freqs.keys()[:100]))
     
     #Train the classifier
     train_set = self.create_labeled_features(listings)
     self.classifier = nltk.NaiveBayesClassifier.train(train_set)
     self.classifier.show_most_informative_features(20)
Example #43
    def understand_text(self, source):
        output = open(
            "Analytics_for_" + source +
            '_{:%Y_%m_%d_%H%M%S}.txt'.format(datetime.datetime.now()),
            "w", encoding="utf-8", errors="ignore")
        main = self.combine_articles_from_source(source)
        puncts = list(string.punctuation)
        article_tokens = word_tokenize(main)
        clean_tokens = []
        stop_words = set(stopwords.words("english"))
        # Remove punctuation and stop words
        for token in article_tokens:
            if token not in puncts and token not in stop_words and token != "'s" and token != "``" and token != "''":
                clean_tokens.append(token)

        print("************ANALYSING************")
        print(main)
        output.write(
            "#########################################################")
        output.write("#Analysis of all cached posts by " + source +
                     "         #")
        output.write(
            "#########################################################")
        output.write(
            "#                   Concatenated text:                  #")
        output.write(
            "#########################################################")
        output.write(main)
        output.write(
            "#########################################################")
        output.write("\n\n")
        print("*********************************")
        output.write(
            "############Detected tokens:#############################\n\n")
        fdist = FreqDist(clean_tokens)
        print("*************STATS:*****************")
        print("Detected words: ")
        words = ""
        for key in fdist.keys():
            words += key + ", "
        print(words)
        output.write(words + "\n")
        output.write(
            "\n\n#######################Top 25 words:#####################\n\n"
        )
        print("\n\n***25 Most common***:")
        for common in fdist.most_common(n=25):
            print("\"" + common[0] + "\"" + " occurances " + str(common[1]))
            output.write("\"" + common[0].encode('utf-8', 'ignore') + "\"" +
                         " occurances " + str(common[1]) + "\n")

        output.write(
            "######################COMPLETE############################")
        output.close()

        text = Text(clean_tokens)

        #    text.plot(25)

        print("************/STATS*****************")
def load_book_features(file_name):
    with open(file_name, 'r') as file_handler:
        text = file_handler.read()

    morph = pymorphy2.MorphAnalyzer()

    sentence_list = sent_tokenize(text)

    usual_book_words = []
    sentences_length_dist = []
    words_length_dist = []
    pron_dist = []
    conj_dist = []

    for sentence in sentence_list:
        if sentence != ".":
            pron_count = 0
            conj_count = 0
            sentence_words = re.findall(r"[\w]+", sentence)
            sentences_length_dist.append(len(sentence_words))

            for word in sentence_words:
                words_length_dist.append(len(word))
                if word in NOMINATIVE_PRONOUNS:
                    pron_count += 1
                if morph.parse(word)[0].tag.POS == 'CONJ':
                    conj_count += 1
                if word not in STOPWORDS:
                    usual_book_words.append(word)

            conj_dist.append(conj_count)
            pron_dist.append(pron_count)

    sentence_length_freq_dist = FreqDist(sentences_length_dist)
    sentences_length_dist = [sentence_length_freq_dist.freq(i) for i in range(1, RANGE + 1)]
    sentences_length_dist.append(1 - sum(sentences_length_dist))

    words_length_freq_dist = FreqDist(words_length_dist)
    words_length_dist = [words_length_freq_dist.freq(i) for i in range(1, RANGE + 1)]
    words_length_dist.append(1 - sum(words_length_dist))

    pron_freq_dist = FreqDist(pron_dist)
    pron_dist = [pron_freq_dist.freq(i) for i in range(0, RANGE + 1)]
    pron_dist.append(1 - sum(pron_dist))

    conj_freq_dist = FreqDist(conj_dist)
    conj_dist = [conj_freq_dist.freq(i) for i in range(0, RANGE + 1)]
    conj_dist.append(1 - sum(conj_dist))

    words_freq_dist = FreqDist(usual_book_words)

    num_unique_words = len(words_freq_dist.keys())
    num_total_words = len(usual_book_words)

    hapax = len(words_freq_dist.hapaxes()) / num_unique_words
    dis = len([item for item in words_freq_dist if words_freq_dist[item] == 2]) / num_unique_words
    richness = num_unique_words / num_total_words

    return [hapax, dis, richness, *sentences_length_dist, *words_length_dist, *pron_dist, *conj_dist]
Example #45
File: helper.py Project: nimasputri/TA
 def get_word_features(self, wordlist):
     """
     wordlist - list of words; duplicates are allowed.
     This function returns the distinct words from the list.
     """
     wordlist = FreqDist(wordlist)
     word_features = wordlist.keys()
     return word_features
def get_top_n_words(n, category=''):
	#return the most frequent n words from a category (or the entire corpus)
	if category=='':
		text=brown.words() # get the text from the entire corpus
	else:
		text=brown.words(categories=category) # get the text from the given category
	fdist=FreqDist(text)
	top_words=[w for w, _ in fdist.most_common(n)]
	return top_words
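
A usage sketch for get_top_n_words, assuming the Brown corpus has been fetched with nltk.download('brown'); the most frequent tokens are dominated by punctuation and function words such as 'the' and ','.

from nltk.corpus import brown
from nltk import FreqDist

print(get_top_n_words(5))           # over the whole corpus
print(get_top_n_words(5, 'news'))   # restricted to the 'news' category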
Example #47
File: db.py Project: ptmono/dScrapper
    def FreqDisk(self):

        fd = file('full_title_set', 'r')
        title_set = pickle.load(fd)
        fd.close()

        fdist = FreqDist(title_set)
        print "===>best 100", repr(fdist.keys()[:20])
        print "==========================="
def docTF_over_corpusTF(dirPath, lang, searchTerm):
    '''
    Returns a dictionary with tokens as keys and relative frequencies as values.
    Given a search term, 'token', the function groups all tweets with that token
    together to form a document. Then it returns the log of the frequency of 
    tokens in that document over frequency of those tokens outside the document.
    The logic is meant to be similar to tf-idf.
    '''
    searchTerm = codecs.decode(searchTerm, 'utf-8')
    docTokens=[]
    corpusTokens=[]
    for tweetFile in os.listdir(dirPath):
        # make sure tweetFile is a file, not a dir
        if os.path.isfile(dirPath+tweetFile):
            # using codecs for encoding issues, not sure if needed
            rawFile = codecs.open(dirPath + tweetFile, 'r', 'utf-8')
            # my tweet files have one tweet per line
            for rawTweet in rawFile:
                try:
                    # just look at one language
                    if rawTweet.split('\t')[4] == lang:
                        tweetText = rawTweet.split('\t')[1].lower()
                        tokens = tokenize(tweetText)
                        # look for the search term in the tweet text
                        if re.compile(searchTerm).search(tweetText):
                            docTokens.append(tokens)
                        else:
                            corpusTokens.append(tokens)
                # issues with windows ^M newline
                except:
                    pass

    # make lists of vocab for each set
    docVocab = [token for doc in docTokens for token in doc]
    corpusVocab = [token for doc in corpusTokens for token in doc]

    # make frequency distributions with nltk, excluding hapaxes from document
    docFD = {key:value for key,value in FreqDist(docVocab).items() if value > 1}
    corpusFD = FreqDist(corpusVocab)

    # calculate relative frequency for each token
    docOverCorpusTF={}
    for key in docFD.keys():
        if key in corpusFD.keys():
            docOverCorpusTF[key] = log(docFD[key]/corpusFD[key])
        else:
            docOverCorpusTF[key] = log(docFD[key]/1)

    f = codecs.open('results.txt', 'w', 'utf-8')
    f.write(str(len(docTokens)) + 
            " tweets were found *with* the search term: " + searchTerm +"\n")
    f.write(str(len(corpusTokens)) + 
            " tweets were found *without* the search term: " + searchTerm +"\n")
    for item in sorted(docOverCorpusTF, key=docOverCorpusTF.get):
        f.write("%s\n" % item)
    f.close()
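
The scoring step of docTF_over_corpusTF can be sketched in isolation: hapaxes are dropped from the document distribution, and each remaining token gets the log of its document count over its count in the rest of the corpus (the token lists below are made up):

from math import log
from nltk import FreqDist

doc_tokens = "nltk makes tokenizing tweets easy easy".split()
corpus_tokens = "tweets about anything easy or hard".split()

docFD = {w: c for w, c in FreqDist(doc_tokens).items() if c > 1}  # drop hapaxes
corpusFD = FreqDist(corpus_tokens)

scores = {w: log(docFD[w] / corpusFD[w]) if w in corpusFD else log(docFD[w] / 1)
          for w in docFD}
print(scores)  # {'easy': 0.6931...}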
def entity_helper(catname, catnum):
    document = load_files(str(catnum))
    print "Extracting Chunks"
    chunks = chunk_document(document)
    print "Extracting Entities"
    entities = extract_entities(chunks)
    fdist = FreqDist(entities)
    print "5 most common entities ({0})".format(catname)
    for i in fdist.keys()[:10]:
        print i
 def similar(self,context, word, num=20):
     word = word.lower()
     wci = context._word_to_contexts
     if word in wci.conditions():
         contexts = set(wci[word])
         fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                       if c in contexts and not w == word)
         words = [w for w, _ in fd.most_common(num)]
         del fd
         return words
     else:
         return []
    def end_reuters(self):
       """Write out the contents to a file and reset all variables."""

       from textwrap import fill
       import re
       import string

       filename = "/dev/null"
       if self.reuters_lewis_split == "TRAIN" and self.reuters_topics == "YES":
          directory = "C:\\Users\\JeffT\\University Work\\phd\\corpora\\reuters-21578\\" + category
          filename = self.doc_id
       elif self.reuters_lewis_split == "TEST" and self.reuters_topics == "YES":
          directory = "C:\\Users\\JeffT\\University Work\\phd\\corpora\\reuters-21578\\" + category
          filename = self.doc_id
       elif self.reuters_lewis_split == "NOT-USED" and (self.reuters_topics == "YES" or self.reuters_topics == "NO" or self.reuters_topics == "BYPASS"):
          filename = "junk"

       if filename != "junk" and filename != "/dev/null":
          if category in self.topics:
             fullfilepath = directory + "\\" + filename
             sys.stdout.write(fullfilepath + "\n") 
             doc_file = open(fullfilepath, "w")

             # we're only interested in the title and body, so just combine them
             all_content = self.title + self.body
             # convert to lowercase
             all_content = all_content.lower()
             #remove everything except letters and spaces
             all_content = re.sub("[^a-z ]", " ", all_content)
             #strip out multiple spaces
             all_content = re.sub(r'\s+', r' ', all_content) 

          # make the string into a list and remove stopwords from it
             all_content_split = all_content.split()
             all_content_no_stopwords = remove_stopwords(all_content_split)

             fd = FreqDist(all_content_no_stopwords)

             doc_file.write(" ".join(fd.keys()))
             doc_file.close()
#
           # Reset variables
       self.in_topics = 0
       self.in_title = 0
       self.in_body = 0
       self.reuters_lewis_split = ""
       self.reuters_topics = ""
       self.doc_id = 0
       self.topics = []
       self.title = ""
       self.body = ""
def main():
    userInput = parser.getInput()
    fileList = parser.getFiles(userInput['train'])
    parsedata = parser.parseFiles(fileList)


    allsent = ''
    for f in parsedata:
        allsent += f[3]

    all_words = FreqDist(w.lower()
                    for w in word_tokenize(allsent)
                        if w not in stopwords.words('english') )

    global top_words
    top_words = all_words.keys()[:500]


    featdata = extractor.featureAggregator(parsedata)



    # print featdata[20]




    print "Sample Data Item:\n\n"

    print "%20s %4s %4s %20s" % ("FILENAME", "LINENUM", "VOTE", "SENTENCE" )
    print "-" * 79
    print "%10s %4s %4s %20s" % (featdata[20][0], featdata[20][1], featdata[20][2], featdata[20][3])

    print "\n\nFeatures of this Data Item"
    print "-" * 79
    for key,val in featdata[20][4].items():
        print "%50s : %10s" % (key, val )
    # print  "A sample feature: %s" % (featdata[20][4])




    allacc = splitfeatdata(featdata)

    print "\n\n"
    print "-" * 60
    print "Accuracy Values: %s" % (allacc)
    print "==" * 60
    print "Overall Classifier Accuracy %4.4f " % (sum(allacc)/len(allacc))
Example #53
def nltk_test_1():
	fd = FreqDist()
	# for each token in the relevant text, increment its counter
	for word in gutenberg.words('austen-persuasion.txt'):
		fd[word.lower()] += 1
	print fd.N()	# total number of samples
	print fd.B()	# number of bins or unique samples
	# Get a list of the top 10 words sorted by frequency
	l = []
	for word in fd.keys():
		tp = (word, fd[word])
		l.append(tp)
	l.sort(key = lambda x : x[1], reverse=True)
	for itr in l[:10]:
		print itr[0], itr[1]
Example #54
	def feature_selection_freq(self,instance_list,limits):
		"""get the 2000 most frequent words"""
		#to store all the tokens
		all_words = []
		#populates the codebook
		for i in instance_list:
			if self.label_codebook.has_label(i.label) == False:
				self.label_codebook.add(i.label)
			#here we do some feature selection work by filtering the stopwords defined by NLTK.
			all_words += i.raw_data 			

		#select the 'limit' most frequent words as feature
		fdict = FreqDist(all_words)
		word_feature = [w for w, _ in fdict.most_common(limits)]
		for wd in word_feature:
			self.feature_codebook.add(wd) 
def transfer(fileDj,vocabulary):
    fo=open(fileDj,"r")
    content=fo.read()
    tokens=nltk.word_tokenize(content)
    # st=[SBStemmer.stem(t) for t in tokens]
    st=tokens
    fo.close()

    fdist=FreqDist(st)
    BOWDj = []
    for key in vocabulary:
        if key in fdist.keys():
            BOWDj.append(fdist.get(key))
        else:
            BOWDj.append(0)
    return BOWDj
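
transfer builds a bag-of-words count vector over a fixed vocabulary. Because a FreqDist returns 0 for unseen words, the membership test can be skipped entirely; a minimal standalone sketch (vocabulary and text are made up, and the punkt tokenizer data is assumed to be available):

import nltk
from nltk import FreqDist

vocabulary = ["movie", "great", "boring"]
tokens = nltk.word_tokenize("A great movie . Really great !".lower())
fdist = FreqDist(tokens)
bow = [fdist[word] for word in vocabulary]  # FreqDist returns 0 for words it has not seen
print(bow)  # [1, 2, 0]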
Example #56
def accuracy(classifier, calc):
    print("====== ESTIMATING CLASSIFIER ACCURACY ======")
    ast = Normalizer.normalize(astronomic_test)
    rel = Normalizer.normalize(religion_test)
    cou = Normalizer.normalize(countries_test)
    print("Test set generation")
    test_set = [(x, "astronomy") for x in ast] + [(x, "religion") for x in rel] + [(x, "country") for x in cou]
    del ast
    del rel
    del cou
    vocabulary = FreqDist(chain(*[n for n, tag in test_set]))
    vocabulary = list(vocabulary.keys())[:100]
    feature_set = [({i: (i in sentence) for i in vocabulary}, tag) for sentence, tag in test_set]
    print("Trained classifier estimated accuracy:", classify.accuracy(classifier, feature_set))
    if (calc):
        calculate(classifier, feature_set)
Example #57
def sort_tfidf(infilename, outfilename):
  inputfile = open(infilename)
  outputfile = open(outfilename, 'w')

  freqdist = FreqDist()
  for line in inputfile:
    line = line.strip()
    words = line.split("\t")
    freqdist[words[0]] = float(words[1])

  for word, value in freqdist.most_common():
    tmp = word + "\t" + str(value) + "\n"
    outputfile.write(tmp)

  inputfile.close()
  outputfile.close()
Example #58
def get_word_features(lines):
	""" Create a reference word feature """

	wordlist = []
	for line in lines:
		wordlist += word_tokenize(line)

	#remove stopwords
	wordlist = [w for w in wordlist if w not in stopwords.words('english')]
	
	#remove proper nouns (Penn Treebank tags NNP/NNPS)
	taglist = pos_tag(wordlist)
	wordlist = [w for (w, tag) in taglist if tag not in ("NNP", "NNPS")]
	
	wordlist = FreqDist(wordlist)
	word_features = wordlist.keys()
	return word_features
Example #59
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    common_suffixes = suffix_fdist.keys()[:100]
#    print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = DecisionTreeClassifier.train(train_set)
#    print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBay %f' % classify.accuracy(classifier, test_set)