Example #1
import os
from nltk.probability import FreqDist
# ProcessText is a project-local preprocessing helper (not shown in this snippet).

class LoadTrainingData:
    def __init__(self, pos_dir, neg_dir):
        self.pos_dir = pos_dir
        self.neg_dir = neg_dir

        text_processor = ProcessText()
        self.data = []

        # Load preprocessed training data if it exists
        if os.path.exists("training_data.data"):
            with open("training_data.data") as f:
                self.data = eval(f.read())
        # Preprocess again if it does not exist
        else:
            # Read each example in the negative training directory.
            print("Loading negatives")
            for txt_name in os.listdir(neg_dir):
                with open(os.path.join(neg_dir, txt_name)) as f:
                    text = f.read()
                    self.data.append(
                        text_processor.process_text(txt_name, text, 0))

            # Read each example in the positive training directory.
            print("Loading positives")
            for txt_name in os.listdir(pos_dir):
                with open(os.path.join(pos_dir, txt_name)) as f:
                    text = f.read()
                    self.data.append(
                        text_processor.process_text(txt_name, text, 1))

            with open("training_data.data", "w") as f:
                f.write(str(self.data))

        # Load precomputed word frequencies if they exist
        if os.path.exists("word_freqs.data"):
            with open("word_freqs.data") as freqs:
                self.words_freq = eval(freqs.read())
        else:
            print("Computing word frequencies")
            # Compute word frequencies
            all_words = []
            for review in self.data:
                all_words.extend(review["review"])
            self.words_freq = FreqDist(all_words)
            # This is slow to compute, so save the result to a file.
            with open("word_freqs.data", "w") as f:
                self.words_freq.pprint(100000000, f)
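# The str()/eval() caching above is fragile: eval() will execute whatever the
# cache file contains, and reloading word_freqs.data needs FreqDist in scope.
# A minimal alternative sketch using pickle (a sketch, not the project's actual code):
import pickle

def load_or_build(cache_path, build_fn):
    """Return cached data if the cache file exists, otherwise build it and cache it."""
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return pickle.load(f)
    data = build_fn()
    with open(cache_path, "wb") as f:
        pickle.dump(data, f)
    return data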
Example #2
File: models.py  Project: wujm2007/duality
def build_vocabulary(doc_set,
                     nlp,
                     stop_words,
                     threshold=0.8,
                     vocab_size=10000):
    # doc_set entries are (sentence, label) pairs, with labels in {0, 2, 4}
    fdist = FreqDist()
    count = 0
    log_times = 10000
    total_size = len(doc_set)
    for pair in doc_set:
        # log progress every `log_times` documents
        count += 1
        if np.mod(count, log_times) == 0:
            print('%f finished' % (count / total_size))
        for word in semantic_clean(pair[0], nlp, stop_words):
            fdist[word] += 1
    fdist.pprint(maxlen=50)
    print('Begin flushing freq dist to disk')
    util_dump(fdist, os.path.join(DIR, FREQ_DIST_PICK))
    print('Finished flushing freq dist to disk')
    return _build_vocabulary(fdist, threshold, vocab_size)
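# _build_vocabulary is not shown in this snippet. A hypothetical sketch of what
# such a helper might do, assuming it keeps the most frequent words until either
# vocab_size words are kept or `threshold` of all token occurrences are covered:
def _build_vocabulary(fdist, threshold, vocab_size):
    total = fdist.N()                       # total number of token occurrences
    vocab, covered = [], 0
    for word, count in fdist.most_common(vocab_size):
        vocab.append(word)
        covered += count
        if covered / total >= threshold:    # stop once the coverage threshold is reached
            break
    return vocab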
Example #3
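# The original snippet uses a `token` list without defining it; a minimal sketch
# of how it might be produced from raw text with NLTK (the input filename is
# hypothetical, and the 'punkt' tokenizer data must be downloaded):
from nltk.tokenize import word_tokenize
with open('sample.txt') as f:
    raw_text = f.read()
token = word_tokenize(raw_text)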
# turn all words into lowercase
lowerToken = [i.lower() for i in token]

#filters out non-alphanumeric
filToken = [i for i in lowerToken if i.isalnum()]

# lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemToken = [lemmatizer.lemmatize(i) for i in filToken]

#stop words for english filtered out of text
from nltk.corpus import stopwords
a = set(stopwords.words("english"))

finalToken = [x for x in lemToken if x not in a]

#saves list externally
with open('listfile.csv', 'a') as file:
    for i in finalToken:
        file.write('%s\n' % i)


#records the frequency of words
from nltk.probability import FreqDist
fdist = FreqDist(finalToken)

# pprint prints to stdout itself and returns None, so no outer print() is needed
fdist.pprint(500)
Example #4
"""
#Creating the frequency distribution of words

import matplotlib.pyplot as plt
import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import pandas as pd
import csv

with open("/home/verareyes/twitch_clips/fortnite/fort_01_time.txt", "r") as file:
    p = file.read()
fdist = FreqDist()
# tokens to ignore: markup, chat noise, stop words, and punctuation
noise = {"[", "]", "<", ">", "the", "to", "``", "a", "you", "?", "it", "!", "me", "and", "TO", "THIS", ":", "SPAM", "is", "HELP", "for", "i", "all", "in", "this", "on", "can", "of", "so", "please", "get", "if", "do", "that", "be", "an", "my", "but", "no", "they", "will", "THE", "are", "at", "I", "'s", "'re", "'ll", "'", ",", ".", "have", "got", "with", "YOU", "your", "(", ")", "we", "’", "was", "A", "ME", "na", "did", "IT", "im", "IS", "IF", "gon", "WE", "''", "n't", "'m"}
for sentence in nltk.tokenize.sent_tokenize(p):
    for word in nltk.tokenize.word_tokenize(sentence):
        if word not in noise:
            fdist[word] += 1


fdist.plot(30,cumulative=False)
plt.show()
fdist.pprint(800)

dataFrame = pd.DataFrame(list(fdist.items()), columns=["Time", "Frequency"])
dataFrame.to_csv('/home/verareyes/twitch_clips/fortnite/fort_01_freq.csv', index=False, header=True)
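# fdist.items() is in insertion order, not frequency order; if a frequency-sorted
# CSV is wanted instead, one option (a sketch reusing the same output path) is:
sortedFrame = pd.DataFrame(fdist.most_common(), columns=["Word", "Frequency"])
sortedFrame.to_csv('/home/verareyes/twitch_clips/fortnite/fort_01_freq.csv', index=False, header=True)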

Example #5
# download the NLTK resources the tokenizers and stop-word lists need
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation

nltk.download('punkt')
nltk.download('stopwords')

# concatstring is assumed to be the raw text built up earlier in the script
sents = sent_tokenize(concatstring)
words = word_tokenize(concatstring.lower())
_stopwords = set(stopwords.words('english') + list(punctuation))
words = [word for word in words if word not in _stopwords]
print("\n".join(words))

#let's add stemming

from nltk.probability import FreqDist
freq = FreqDist(words)
freq.pprint(200)
freqdict = dict(freq)  # plain dict of word -> count

# we could iterate over the dict directly, but the sorted view below is more useful
for word in freqdict:
    print(word, freqdict[word])

# another way to iterate: sort the whole freq dist by count, descending
s = [(k, freqdict[k]) for k in sorted(freqdict, key=freqdict.get, reverse=True)]
with open("/Users/jayers/Temp/freqdistwords.csv", "a") as myfile:
    for k, v in s[:1000]:
        myfile.write("{}, , , {}\n".format(k, v))
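# FreqDist can do the sorting itself; an equivalent sketch of the top-1000 export
# using freq.most_common and the csv module (same output path as above):
import csv
with open("/Users/jayers/Temp/freqdistwords.csv", "w", newline="") as out:
    writer = csv.writer(out)
    for word, count in freq.most_common(1000):
        writer.writerow([word, "", "", count])  # blank middle columns mirror the ", , ," layout above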
Example #6
# Assumes: import re, nltk; from nltk.probability import FreqDist, ConditionalFreqDist;
# from contextlib import redirect_stdout. unzip_corpus is a project-local helper.
def process_corpus(corpus_name):
    print(f'1. Corpus name: {corpus_name}')
    input_file = corpus_name + ".zip"
    corpus_contents = unzip_corpus(input_file)
    corpus_sentences = []
    for content in corpus_contents:
        corpus_sentences.append(re.split(r'(?<=\.) ', content))
    corpus_words = []
    allwords = []
    for sent in corpus_sentences:
        words = []
        for word in sent:
            x = nltk.word_tokenize(word)
            words.append(x)
            for w in x:
                allwords.append(w.lower())
        corpus_words.append(words)
    f = open(corpus_name + "-pos.txt", "w")
    allpos = []
    for story in corpus_words:
        for sentence in story:
            sent = nltk.pos_tag(sentence)
            for word in sent:
                f.write(word[0] + "/" + word[1] + " ")
                allpos.append(word)
        f.write("\n\n")
    f.close()
    print(f'\n2. Total words in the corpus: {len(allwords)}')
    numunique = len(set(allwords))
    print(f'\n3. Vocabulary size of the corpus: {numunique}')
    posfreq = {}
    for i in allpos:
        if i[1] in posfreq:
            posfreq[i[1]] += 1
        else:
            posfreq[i[1]] = 1
    # invert count -> tag; note that tags sharing the same count collide here
    # (see the FreqDist sketch after this example)
    inv = {v: k for k, v in posfreq.items()}
    sorted_posfreq = {k: inv[k] for k in sorted(inv)}
    l = list(sorted_posfreq.keys())
    print(
        f'\n4. The most frequent part-of-speech tag is {sorted_posfreq.get(l[-1])} with frequency {l[-1]}'
    )
    f = open(corpus_name + "-word-freq.txt", "w")
    fdist = FreqDist(word for word in allwords)
    fdist.pprint(maxlen=numunique, stream=f)
    f.close()
    cfdist = ConditionalFreqDist((word[1], word[0].lower()) for word in allpos)
    print(
        f'\n5. Frequencies and relative frequencies of all part-of-speech tags in the corpus in decreasing order of frequency are: '
    )
    for i in range(1, len(sorted_posfreq) + 1):
        print(
            f'{sorted_posfreq.get(l[-i])} tag has frequency {l[-i]} and relative frequency {round(l[-i]/len(allpos), 3)}.'
        )
    f = open(corpus_name + "-pos-word-freq.txt", "w")
    with redirect_stdout(f):
        cfdist.tabulate()
    f.close()
    text = nltk.Text(allwords)
    pos_list = ["NN", "VBD", "JJ", "RB"]
    print("\n6.")
    for pos in pos_list:
        m = cfdist[pos].max()
        print(
            f'The most frequent word in the POS {pos} is {m} and its most similar words are:'
        )
        text.similar(m)
    print(f'7. Collocations:')
    text.collocations()
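# The count -> tag inversion above silently drops tags that share a count; the
# same report can be produced directly with FreqDist (a sketch, taking the
# (word, tag) pairs collected in `allpos` as input):
def report_tag_frequencies(allpos):
    tag_fdist = FreqDist(tag for _, tag in allpos)
    total = tag_fdist.N()
    for tag, count in tag_fdist.most_common():
        print(f'{tag} tag has frequency {count} and relative frequency {round(count / total, 3)}.')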
Example #7
# try the Porter stemmer (assumes `words` is the stop-word-filtered token list from the earlier example)
import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
st = PorterStemmer()
stemmedWords = [st.stem(word) for word in words]
words = stemmedWords
# lemmatization maps words to their dictionary form and is an alternative to stemming
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
lmtzr.lemmatize('cars')
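# WordNetLemmatizer treats its input as a noun by default; passing a part of
# speech changes the result (a small sketch; needs the 'wordnet' NLTK data):
lmtzr.lemmatize('running')           # -> 'running' (treated as a noun)
lmtzr.lemmatize('running', pos='v')  # -> 'run' when lemmatized as a verb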

# add parts of speech
# tag list https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/
poswords = nltk.pos_tag(words)   # list of (word, tag) pairs
posfreq = FreqDist(poswords)     # frequency of each (word, tag) pair
posfreq.pprint(200)
# this looks good

# add some word sense disambiguation
# this will give all the definitions of words
from nltk.corpus import wordnet as wn
for ss in wn.synsets('bass'):
    print(ss, ss.definition())

# this looks at how the word is used in a sentence and returns its sense
from nltk.wsd import lesk
sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"),'bass')
print(sense1, sense1.definition())
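# A second, hedged example: lesk picks whichever 'bass' synset overlaps most with
# the context words, so a fishing sentence may select the fish sense instead.
sense2 = lesk(word_tokenize("He caught a large bass while fishing on the lake"), 'bass')
print(sense2, sense2.definition())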

# calc the frequencies
freq = FreqDist(words)