Example #1
def generate_words_grammar():
    """
    Use sentence grammar to find words that could be Rent lyrics
    :return:
    """
    # Load the corpora to search in
    gentrification = PlaintextCorpusReader(
        'corpus', '.*')  # Gentrification articles are in this directory
    gentrify_sents = gentrification.sents()
    wine_sents = nltk.corpus.webtext.sents('wine.txt')
    corpus_sents = gentrify_sents + wine_sents
    syls_1 = []
    syls_2 = []
    syls_4 = []
    syls_2_sing = []
    for sent in corpus_sents:
        parsed_sent = nltk.pos_tag(sent)
        for word in parsed_sent:
            no_syls = count_syllables(word[0])
            if word[1] == 'NNS' and len(word[0]) > 3:
                if no_syls == 1:
                    syls_1 = syls_1 + [word[0].lower()]
                elif no_syls == 2:
                    syls_2 = syls_2 + [word[0].lower()]
                elif no_syls == 4:
                    syls_4 = syls_4 + [word[0].lower()]
            if word[1] == 'NN' and len(word[0]) > 2:
                if no_syls == 2:
                    syls_2_sing = syls_2_sing + [word[0].lower()]
    return list(set(syls_1)), list(set(syls_2)), list(set(syls_4)), list(
        set(syls_2_sing))
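The count_syllables() helper called above is not part of this example. A minimal sketch of such a helper, assuming the CMU Pronouncing Dictionary that ships with NLTK (not the author's original implementation):

from nltk.corpus import cmudict

_pron = cmudict.dict()  # may require nltk.download('cmudict') once

def count_syllables(word):
    # Approximate syllable count: vowel phonemes carry a stress digit in CMUdict.
    phones = _pron.get(word.lower())
    if not phones:
        return 0  # unknown words simply get no syllable count in this sketch
    return sum(ph[-1].isdigit() for ph in phones[0])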
def align(filename):
    # filename packs two paths separated by '(' : "<raw file>(<output file>"
    files = filename.split('(')
    ripe_file = os.path.abspath(files[1])
    raw_file = os.path.abspath(files[0])
    raw_for_nltk = os.path.abspath('data/newcorpus/source.txt')

    # Copy the raw file into the corpus directory so NLTK can read it.
    with open(raw_file) as f, open(raw_for_nltk, "w") as f1:
        for line in f:
            f1.write(line)

    corpusdir = 'data/newcorpus/'
    newcorpus = PlaintextCorpusReader(
        corpusdir, '.*',
        sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/german.pickle'))

    # Flatten the tokenized sentences back into a plain string and write it out.
    temp = [newcorpus.sents(raw_for_nltk)]
    tempVal = str(temp[0])
    for junk in (",", "u'", "'", "[", "]"):
        tempVal = tempVal.replace(junk, "")
    with open(ripe_file, "w") as out:
        out.write(tempVal + os.linesep)
    return
def generateNgramModel(corpusPath, corpusName):
    corpusdir = 'corpora/'  # Directory of corpus.
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False,
                            estimator)  # bigram model with Lidstone smoothing
    return ngrammodel
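Note that NgramModel and this estimator-style API are no longer available in current NLTK releases, so the snippet above only runs on old versions. A rough equivalent with the modern nltk.lm package could look like the following sketch (an adaptation, not the original author's code); for perplexity, as used in Example #9 below, the new API expects n-gram tuples rather than a flat token list.

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline

def generate_ngram_model(corpus_path, corpus_name, order=2, gamma=0.2):
    corpus = PlaintextCorpusReader(corpus_path, corpus_name)
    # Build padded training n-grams and the vocabulary from the corpus sentences.
    train, vocab = padded_everygram_pipeline(order, corpus.sents())
    model = Lidstone(gamma, order)
    model.fit(train, vocab)
    return model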
Example #4
def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')

    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)

    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    with open(txtFileName, 'r') as f:
        return f.read()


newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

print(newCorpus.words())
print(newCorpus.sents(newCorpus.fileids()[1]))
print(newCorpus.paras(newCorpus.fileids()[0]))
Example #6
# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])

# Access sentences in the corpus. (list of list of strings)
# NOTE: the texts are flattened into sentences that contain tokens.
print newcorpus.sents()
print

# To access sentences of a specific fileid.
print newcorpus.sents(newcorpus.fileids()[0])

# Access just tokens/words in the corpus. (list of strings)
print newcorpus.words()

# To access tokens of a specific fileid.
print newcorpus.words(newcorpus.fileids()[0])
Example #7
class TextAnalizer:
    def __init__(self, my_input_file):
        self.config = configparser.ConfigParser()
        self.config.read("text_analysis.cfg")
        self.input_file = my_input_file
        self.nlp_model = self.config["DEFAULT"]["nlp_model"]
        #The output file name
        self.output_file = self.config["DEFAULT"]["output_file"]
        self.nlp = load_nlp(self.nlp_model)
        self.corpus = CorpusReader(".", self.input_file)
        self.raw_text = self.corpus.raw()
        self.nlp_text = self.nlp(self.raw_text)
        # Here, let's put together the information for text analysis with spaCy.
        self.analysis_dictionary = Counter()
        self.word_count = 0
        self.get_word_count_nltk()

    def get_paragraph(self):
        return self.corpus.paras()

    def get_sentence(self):
        return self.corpus.sents()

    def get_word(self):
        return self.corpus.words()

    def get_word_count_nltk(self):
        tokenizer = Tokenizer(r'\w+')
        counts = Counter()
        word_count = 0
        sentences = self.get_sentence()
        for sentence in sentences:
            tokens = tokenizer.tokenize(" ".join(sentence))
            word_count += len(tokens)
            filtered = [w for w in sentence if w.isalnum()]
            counts = counts + Counter(filtered)
        # Store the total so repeated calls do not double-count.
        self.word_count = word_count
        return counts, word_count

    def analize_nlp(self):
        analized_data_str = (self.config["ANALIZED"]["POS"])
        analized_data = (analized_data_str.split(","))
        result_dict = {}
        diff_str, tot_str = (
            self.config["DEFAULT"]["diff_tot_string"]).split(",")
        lemma_counter = Counter()
        pos_counter = Counter()
        tag_counter = Counter()

        for token in self.nlp_text:
            lemma_counter = lemma_counter + Counter([token.lemma_])
            pos_counter = pos_counter + Counter([token.pos_])
            tag_counter = tag_counter + Counter([token.tag_])
            my_key = token.lemma_ + "_" + token.tag_ + "_" + token.pos_
            self.analysis_dictionary[my_key] += 1
        for pos in analized_data:
            instance_counter = 0
            total_counter = 0
            for key in self.analysis_dictionary.keys():
                try:
                    my_lemma, my_tag, my_pos = key.split("_")
                except ValueError:
                    print("Warning: array has an empty line")  # add logging
                    continue
                if pos == my_pos:
                    instance_counter += 1
                    total_counter = total_counter + self.analysis_dictionary.get(
                        key)
            result_dict[pos + diff_str] = instance_counter
            result_dict[pos + tot_str] = total_counter
        #add the stuff from nltk
        diff_word, word_count = self.get_word_count_nltk()
        result_dict["WORDS" + tot_str] = word_count
        result_dict["WORDS" + diff_str] = len(diff_word)
        result_dict["PARAGRAPHS"] = len(self.get_paragraph())
        result_dict["SENTENCES"] = len(self.get_sentence())

        return result_dict

    def write_output(self):
        diff_words, _ = self.get_word_count_nltk()
        with open(self.output_file, "w+") as f:
            f.write("Number of paragraphs: " +
                    str(len(self.get_paragraph())) + "\n")
            f.write("Number of sentences: " + str(len(self.get_sentence())) +
                    "\n")
            f.write("Number of words: " + str(self.word_count) + "\n")
            f.write("Average words per sentence: " +
                    str(round(self.word_count / len(self.get_sentence()), 2)) +
                    "\n")
            f.write("Number of different words: " +
                    str(len(diff_words)) + "\n")
            f.write("Text variety (different words / total words): " + str(
                round(len(diff_words) / self.word_count, 2)) + "\n")
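The TextAnalizer class relies on several names (CorpusReader, Tokenizer, load_nlp) whose imports are not shown. A plausible header, with the aliases inferred from how the names are used (an assumption, not the original file):

import configparser
from collections import Counter
from nltk.corpus.reader.plaintext import PlaintextCorpusReader as CorpusReader
from nltk.tokenize import RegexpTokenizer as Tokenizer
from spacy import load as load_nlp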
Example #8
 def filter_token_in(self):
     corpusdir = 'data/cv_corpus'
     corpa = PlaintextCorpusReader(corpusdir,'.*',encoding='windows-1252')
     corpa_words = set(token.lemma_ for sent in corpa.sents() for token in nlp(" ".join(sent).lower()) )
     tokens = [t for t in self.tokens_in if t in corpa_words]
     return tokens
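The nlp object used here (and again in Example #27) is not defined in the excerpt; given the token.lemma_ attribute it is presumably a spaCy pipeline. A typical setup, with the model name only as an illustration:

import spacy

# Any installed spaCy model with a lemmatizer will do; "en_core_web_sm" is just an example.
nlp = spacy.load("en_core_web_sm")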
Example #9
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import LidstoneProbDist, WittenBellProbDist
from nltk.model import NgramModel
from nltk.tokenize import sent_tokenize, word_tokenize


corpusdir = 'corpora/' # Directory of corpus.
SickCorpus = PlaintextCorpusReader(corpusdir, 'sick_tweets.txt')
HealthyCorpus = PlaintextCorpusReader(corpusdir, 'healthy_tweets.txt')
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

sick_model_1 = NgramModel(1, SickCorpus.sents(), True, False, estimator)
sick_model_2 = NgramModel(2, SickCorpus.sents(), True, False, estimator)

healthy_model_1 = NgramModel(1, HealthyCorpus.sents(), True, False, estimator)
healthy_model_2 = NgramModel(2, HealthyCorpus.sents(), True, False, estimator)

tweet = "Remember when we were all diagnosed with Bieber fever ? Lol"

print "sick_model_1 is: " + str(sick_model_1.perplexity(word_tokenize(tweet)))
print "sick_model_2 is: " + str(sick_model_2.perplexity(word_tokenize(tweet)))
print "healthy_model_1 is: " + str(healthy_model_1.perplexity(word_tokenize(tweet)))
print "healthy_model_2 is: " + str(healthy_model_2.perplexity(word_tokenize(tweet)))
Example #10
# Variables

#STEP 1
# This is the variable name for the target file to read. Note: it is useful to
# copy and paste the full text of the .PDF into a .TXT file for reading.
File_to_Read = 'Sample_from_PDF.txt'

# Read file
import os
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpus = PlaintextCorpusReader(os.getcwd(), File_to_Read)
#print(corpus.raw())

# Counts total sentences in document and creates a list of words in document
sentences = corpus.sents()
print("\n Total sentences in this corpus : ", len(sentences))
print("\n Words in this corpus : ", corpus.words())

# Finds frequency distribution of words in document
course_freq_dist = nltk.FreqDist(corpus.words())
print("\n Top 30 words in the corpus : ", course_freq_dist.most_common(30))

# Calculate distribution for a specific word
print("\n Distribution for \"hydrogen\" : ", course_freq_dist.get('hydrogen'))

# Tokenization

# Read base file into raw text variable
base_file = open(os.getcwd() + "/" + File_to_Read, mode='rt', encoding='utf-8')
raw_text = base_file.read()
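The example stops right after loading raw_text. The tokenization step it announces would presumably use NLTK's standard tokenizers, along these lines (a sketch, not the truncated original):

from nltk.tokenize import sent_tokenize, word_tokenize

token_list = word_tokenize(raw_text)      # flat list of word tokens
sentence_list = sent_tokenize(raw_text)   # list of sentence strings
print("\n Total tokens in this document : ", len(token_list))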
Example #11
def try_out_some_functionalities():

    corpusdir ="/media/benzro/OS/Users/benzro/Desktop/Studium Uni/2)" \
           "ZweitesSemester/27)PCL-2/Uebungen/Uebung03/Enron/test/"
    newcorpus = PCR(corpusdir, '.*')

    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access one file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # infile = corpusdir + "0001.1999-12-10.farmer.ham.txt"  # fileids are relative to the corpus root, not full paths
    infile = "0004.1999-12-14.farmer.ham.txt"
    fin = newcorpus.open(infile)
    print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "all file ids"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.fileids()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access each file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # (reduced output: [0:2])
    for infile in sorted(newcorpus.fileids()):
        # the fileids of each file
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        print infile
        # opens the file
        fin = newcorpus.open(infile)
        # prints the content of the file
        print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access the plaintext; outputs pure string of all files"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.raw().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access paragraphs in the corpus. (list of list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
    #       nltk.tokenize.word_tokenize.
    #
    # Each element in the outermost list is a paragraph, and
    # Each paragraph contains sentence(s), and
    # Each sentence contains token(s)
    print newcorpus.paras()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access pargraphs of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.paras(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access sentences in the corpus. (list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: the texts are flattened into sentences that contain tokens.
    print newcorpus.sents()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access sentences of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.sents(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access just tokens/words in the corpus. (list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access tokens of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
Example #12
class Contract_Reader():
    def __init__(self, config):
        print('Filepath for texts = ', config.textpath)
        self.corpus = PCR(config.textpath,
                          '.*\.txt',
                          encoding='utf-16',
                          para_block_reader=read_line_block)
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        #Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    def clean(self, config, mode='sent'):
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            #paragraphs are lists of sentences each of which is a list of tokens. Reducing to list of strings.
            self.para_list = [
                list(itertools.chain.from_iterable(para))
                for para in self.corpus.paras()
            ]
            for index, paragraph in enumerate(self.para_list):
                paragraph = " ".join(paragraph)
                stop_free = " ".join(
                    [i for i in paragraph.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free
                                    if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.para_list[index] = normalized
            print(self.para_list[0])
            self.para_list = [para.split() for para in self.para_list]
            print(self.para_list[0])
        if mode == 'sent':
            #Obtain list of strings each one a sentence rather than list of lists.
            self.sents_list = [" ".join(sent) for sent in self.corpus.sents()]
            for index, sentence in enumerate(self.sents_list):
                stop_free = " ".join(
                    [i for i in sentence.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free
                                    if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.sents_list[index] = normalized
            print(self.sents_list[0])
            self.sents_list = [
                sentence.split() for sentence in self.sents_list
            ]
            print(self.sents_list[0])

    def LDA(self, num_topics, num_words):
        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
        self.ldamodel = LdaVowpalWabbit(path,
                                        doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        self.ldamodel.save('model/lda_model')
        print(self.ldamodel.print_topics(num_topics=10, num_words=num_words))

    def plot(self, num_words):
        for t in range(self.ldamodel.num_topics):
            plt.figure()
            tuples = [
                reversed(x) for x in self.ldamodel.show_topic(t, num_words)
            ]
            plt.imshow(WordCloud().fit_words(dict(tuples)))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))

    def corpus_info(self):
        """
        Summary information about the status of a corpus.
        """
        fids = len(self.corpus.fileids())
        paras = len(self.corpus.paras())
        sents = len(self.corpus.sents())
        sperp = sum(len(para) for para in self.corpus.paras()) / float(paras)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        lexdiv = float(count) / float(vocab)

        print(
            ("Text corpus contains {} files\n"
             "Composed of {} paragraphs and {} sentences.\n"
             "{:0.3f} sentences per paragraph\n"
             "Word count of {} with a vocabulary of {}\n"
             "lexical diversity is {:0.3f}").format(fids, paras, sents, sperp,
                                                    count, vocab, lexdiv))
Example #14
    #corpus = nltk.corpus.gutenberg
    #mytexts  = TextCollection([text1])
    
    APPDIR = os.path.dirname(__file__)
    corpus_root = 'D:\\INSTALL\\Python3\\PROJECTS\\SCRIPTS\\TEXTS\\corpora\\'
    corpus = PlaintextCorpusReader(corpus_root, r'.*\.txt')
    print(corpus.words())
    
    
    pprint(rank_quadgrams(corpus, QuadgramAssocMeasures.likelihood_ratio))
   

    
    # NOTE: word[0] assumes (token, tag) pairs from a tagged corpus reader; with
    # PlaintextCorpusReader's plain strings it only keeps the first character.
    tokens = [''.join(word[0]) for word in corpus.words()]
    vocab = Counter(tokens)
    sents = list([word[0] for word in sent] for sent in corpus.sents())
    trigram_counts = count_ngrams(3, vocab, sents)


    # The frequency distribution for unigrams is available from the unigrams attribute.
    print(trigram_counts.unigrams)
    # For higher-order n-grams, the conditional frequency distribution is
    # available from the ngrams attribute.
    print(trigram_counts.ngrams[3])    # <FreqDist with 88 samples and 3015993 outcomes>

    # The keys of the conditional frequency distribution show the possible
    # contexts that precede each word.
    #print(sorted(trigram_counts.ngrams[3].conditions())) # incorrect
    # The model can also return a list of possible next words:
    print(list(trigram_counts.ngrams[3][('the', 'President')]))
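The rank_quadgrams() and count_ngrams() helpers used in this example are not included here. For rank_quadgrams(), a minimal stand-in built on nltk.collocations might look like this (an assumption about the helper, not its original code):

from nltk.collocations import QuadgramCollocationFinder
from nltk.metrics.association import QuadgramAssocMeasures

def rank_quadgrams(corpus, metric, n=20):
    # Score every quadgram in the corpus with the given association metric
    # and return the top n (ngram, score) pairs.
    finder = QuadgramCollocationFinder.from_words(corpus.words())
    return finder.score_ngrams(metric)[:n]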
    print("Processing directory " + corpusdir)
    print(str(len(corpusfiles)) + " files found")
    print("Loading NLTK")
    from nltk import pos_tag_sents
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader
    counter = 0
    for fn in corpusfiles:
        counter += 1
        outputfn = fn + ".pos"
        print("Processing file " + str(counter) + " of " +
              str(len(corpusfiles)) + ": " + fn)
        if not os.path.isfile(fn):
            print("Warning: " + fn + " is not a regular file. Skipping")
        elif os.path.exists(os.path.join(corpusdir, outputfn)):
            print("Warning: " + outputfn + " already exists. Skipping")
        else:
            # read file into corpus object using the plain text reader
            corpus = PlaintextCorpusReader(corpusdir, fn)
            # POS-tag corpus object using the default NLTK tagger
            tagged = pos_tag_sents(corpus.sents())
            # write tagged corpus object
            outputpath = os.path.join(corpusdir, outputfn)
            try:
                with open(outputpath, 'w') as output:
                    for sent in tagged:
                        line = " ".join(word + "/" + tag for word, tag in sent)
                        output.write(line + '\n')
            except OSError:
                print("Error: could not write to file " + outputfn +
                      ". Skipping")
Example #16
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    with open(txtFileName, 'r') as f:
        return f.read()


# Create a new corpus directory
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

# Write the contents of the three string objects to files on disk (write mode)
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Create a plaintext corpus object from the directory where the files were saved
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words())  # print all the words (0.txt and the rest of the corpus)
print(newCorpus.sents(newCorpus.fileids()[1]))  # print the sentences of 1.txt
print(newCorpus.sents(newCorpus.fileids()[0]))  # print the sentences of 0.txt
Example #17
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
corpus = PlaintextCorpusReader(".", 'mobi_dick.txt')

for oracion in corpus.sents():
    print(oracion)
import nltk
import os
# import this module for drawing graphs
import matplotlib
# Reader from NLTK to access our own text files and treat them as a regular corpus
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
# This is the directory in which we store our text files
corpusdir = 'newcorpus/'
# this will create the directory in the folder you are working in
if not os.path.isdir(corpusdir):
    os.mkdir(corpusdir)
# accessing the files inside the directory
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')
# Now let us perform some operations using Natural Language Processing
# displaying the content of the files in the newcorpus directory
print(newcorpus.raw().strip())
# number of words in the files inside the newcorpus directory
a = len(newcorpus.words())
print("This will tell me the number of words inside the file", a)
# number of sentences in the files inside the newcorpus directory
b = len(newcorpus.sents())
print("This will tell me the number of sentences inside the file", b)
# calculating average words per sentence
aws = a / b
print("This will give me the average words per sentence", aws)
#**********************************************************************
words_displayed = newcorpus.words()
# Frequency distribution of each word in the text files
fre_dis = nltk.FreqDist(words_displayed)
# Plot each word and its frequency using the plot function
fre_dis.plot(title="Frequency Distribution")
Example #20
import random
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import bigrams, trigrams
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter, defaultdict

#create a folder for your corpus
corpusdir = 'miscme/'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
#tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#tokenizer.tokenize(newcorpus.strip())
words = newcorpus.words()
sents = newcorpus.sents()

words = [w.lower() for w in words]
sents = [[w.lower() for w in sent] for sent in sents]

trigram_counts = defaultdict(lambda: Counter())

for sentence in sents:
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        trigram_counts[(w1, w2)][w3] += 1

trigram_probs = defaultdict(lambda: Counter())
for w1_w2 in trigram_counts:
    total_count = float(sum(trigram_counts[w1_w2].values()))
    trigram_probs[w1_w2] = Counter({w3: c/total_count for w3,c in trigram_counts[w1_w2].items()})

for i in range(10):
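The final loop (for i in range(10):) is cut off in the original. A generation loop over trigram_probs typically samples a next word for each (w1, w2) context until it hits the right padding; the sketch below shows one way to do that (it is not the missing code):

import random

def generate_sentence(trigram_probs):
    # Start from the left-padding context (None, None) produced by trigrams(pad_left=True).
    w1, w2 = None, None
    out = []
    while True:
        dist = trigram_probs[(w1, w2)]
        if not dist:
            break
        # Sample the next word in proportion to its estimated probability.
        w3 = random.choices(list(dist.keys()), weights=list(dist.values()))[0]
        if w3 is None:  # right padding marks the end of the sentence
            break
        out.append(w3)
        w1, w2 = w2, w3
    return " ".join(out)

for i in range(10):
    print(generate_sentence(trigram_probs))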
Example #21
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import tf_glove

corpusdir = 'abstract/'
corpus = PlaintextCorpusReader(corpusdir, '.*')

model = tf_glove.GloVeModel(embedding_size=200,
                            context_size=10,
                            min_occurrences=25,
                            learning_rate=0.05,
                            batch_size=512)
model.fit_to_corpus(corpus.sents())
model.train(num_epochs=50, log_dir="log/example", summary_batch_interval=1000)

Example #22
    if keepWordPOS:
        return words, lemmas, [None if i == '' else i for i in poss]
    return lemmas

regex = re.compile('[_]+')

for f in corpus.fileids():
    outname = args.preprocess + "/" + f + ".out"
    fout = open(outname,"w", encoding="utf8")

splitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer.tokenize(text)

word_tokenize

    for sent in corpus.sents(f):
        s = []
        for w in sent:
                w = regex.sub('',w).lower()
                if (
                        len(w)>2
                        and not w in stop_words
                        and w.isalpha()
                   ):
                        s.append(w)
        if args.lemmatize:
                s = lemmatize_sentence(s)
        print(s)
        if len(s) > 1:
 #               fout.write (f + "\t")
                for w in s:                      

def getText(txtFileName):
    with open(txtFileName, 'r') as f:
        return f.read()


# Create the corpus directory
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):  # does the corpus directory already exist?
    os.mkdir(newCorpusDir)

# Read the files
# plain text file
txt1 = getText('./Files/sample_feed.txt')
# PDF file
txt2 = pdf.getTextPDF('./Files/sample-pdf.pdf')
# DOCX file
txt3 = word.getTextWord('./Files/sample-one-line.docx')

# Write the files to disk
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Build the custom corpus
# read every file in the directory and create a corpus from them
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

# Check that the custom corpus was built correctly
print(newCorpus.words())  # array containing all the words in the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # print the array of all sentences in 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # print the array of all paragraphs in 0.txt
Example #24
article_corpus = PlaintextCorpusReader('text_plain/', '.*\.txt', 
	sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/spanish.pickle'))

stop_words = nltk.corpus.stopwords.words('spanish') 
non_alphabetic = re.compile("\W|\d")
words = []
tags = []

# Using TreeTagger 
# 1) pip install treetaggerwrapper
# 2) put TreeTagger in %PYHOME%\Lib\site-packages\TreeTagger
# 3) put spanish-utf8.par and spanish-chunker.par in \TreeTagger\lib
# See http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/spanish-tagset.txt for tag meanings
tagger = treetaggerwrapper.TreeTagger(TAGLANG='es')
for sentence in article_corpus.sents():
	tagged_sentence = tagger.tag_text(sentence) 
	tags.extend(treetaggerwrapper.make_tags(tagged_sentence))

#TODO: create a tagger script, save the tagged files
#TODO: look at alternate taggers, compare

#TODO: profile this and see which part is taking so long
for tag in tags:
	lemma = tag[2].lower()
	if lemma not in stop_words and not non_alphabetic.search(lemma):
		words.append(lemma)

freq_dist = FreqDist(words)

with open('./frequency_distribution.txt', 'w', encoding='utf-8') as f:
Example #25
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize
import re


corpusdir = 'python/' # Directory of corpus.

newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids()[0])
print(type(newcorpus))
#print newcorpus.raw()
print newcorpus.words(newcorpus.fileids()[0])
print(len(newcorpus.words()))

tokens = word_tokenize(newcorpus.raw())
#type(tokens)
print len(tokens)
print tokens[:50]
#tokens[:10]
print newcorpus.sents()
print

#to remove comments
def removeComments(string):
    string = re.sub(re.compile("/\*.*?\*/",re.DOTALL ) ,"" ,string) # remove all occurance streamed comments (/*COMMENT */) from string fdf
    string = re.sub(re.compile("//.*?\n" ) ,"" ,string) # remove all occurance singleline comments (//COMMENT\n ) from string
    return string

print(removeComments(newcorpus.raw()))
Example #26
print(common_words)

data = []
hmap = {}
detokenized = {}

for word, frequency in common_words.items():
    datum = {'word': word, 'frequency': frequency}
    docs = []
    sents = []

    for key, fileid in enumerate(corpus.fileids()):
        if key not in hmap:
            hmap[key] = {}

        for s_id, sentence in enumerate(corpus.sents(fileid)):
            if key in hmap and s_id in hmap[key]:
                words = hmap[key][s_id]
            else:
                words = [lemmatizer.lemmatize(w.lower()) for w in sentence]
                hmap[key][s_id] = words

            if word in words:
                s_key = f'{key}-{s_id}'
                sent = ''

                if s_key in detokenized:
                    sent = detokenized[s_key]
                else:
                    sent = TreebankWordDetokenizer().detokenize(sentence)
                    detokenized[s_key] = sent
Example #27
 def cv_to_matrix(self):
     corpusdir = 'data/cv_corpus'
     corpa = PlaintextCorpusReader(corpusdir,'.*',encoding='windows-1252')
     print("Preprocessing words....")
     sents = [[token.lemma_ for token in nlp(" ".join(self.clean(sent)).lower()) if token.lemma_ not in stopset] for sent in corpa.sents()]
     print("training word vectors....")
     model = Word2Vec(sents,window=5, size=self.ncol,min_count=1, workers=4)
     fname = get_tmpfile("vectors.kv")
     model.wv.save(fname)
     print("cv_to_matrix model saved")
     return model.wv
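A short usage sketch for the vectors saved above: reload them from the temporary file with gensim's KeyedVectors and query them (the query token is only illustrative):

from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

fname = get_tmpfile("vectors.kv")  # same temporary path used by cv_to_matrix
wv = KeyedVectors.load(fname)
print(wv.most_similar("python", topn=5))  # "python" is just an example token; it must be in the vocabulary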