Example #1
def get_phrase():
    root_dir = r'E:\github_repo\python_basic\pythonbasictest\self_nltk\files'
    wordlists = PlaintextCorpusReader(root_dir,".*")
    x = nltk.Text(wordlists.words("test.txt"))
    print(x)
    
    print(x.collocations())
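# Note on Example #1: Text.collocations() prints its result and returns None, so the print()
# around it just emits "None". A minimal variant that keeps the collocations as data
# (assumes NLTK >= 3.4.5, which provides Text.collocation_list(), and the same imports as above):
def get_phrase_list(root_dir, fileid="test.txt"):
    wordlists = PlaintextCorpusReader(root_dir, ".*")
    text = nltk.Text(wordlists.words(fileid))
    pairs = text.collocation_list()  # list of collocated word pairs
    print(pairs)
    return pairs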
Example #2
def get_corpus_words():
    '''
        Returns all the words from corpus.
    '''
    reader = PlaintextCorpusReader(settings.CORPUS_ROOT,
                                   settings.CORPUS_FILES_GLOBB)
    if reader:
        return reader.words()
    return []
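# Example #2 reads its paths from a settings module that is not shown here; a minimal sketch of
# the two values it expects (the names match the code above, the values are illustrative assumptions):
# settings.py
CORPUS_ROOT = '/path/to/corpus'      # directory containing the plain-text files
CORPUS_FILES_GLOBB = r'.*\.txt'      # a regular expression matched against fileids, not a shell glob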
Example #3
    def load_corpus(self):

        if len(self.corpus) == 0:
            raise Exception('No corpus defined.')

        if os.path.isdir(self.corpusdir) is False:
            self.generate_corpus_files()

        newcorpus = PlaintextCorpusReader(self.corpusdir, '.*')

        # bard.sents = newcorpus.sents
        bard.tokens = newcorpus.words()
        print len(bard.tokens)

        # print 'init markov NLG text generator'
        self.generator = bard.generators.markov.IntelligentMarkovGenerator(bard.tokens)
Example #4
def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')

    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)

    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
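# The open() calls in Example #4 only work if newCorpusDir ends with a path separator; a safer
# sketch of the same write loop using os.path.join (same variable names, assumes `import os`):
for idx, f in enumerate(files):
    with open(os.path.join(newCorpusDir, str(idx) + '.txt'), 'w') as fout:
        fout.write(f)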
Example #5
	def _strip_tags(self, title):

		new_title = ''

		custom_corpus = PlaintextCorpusReader('../custom_corpora/', '.*')

		#For each word in the title
		for word in title.split():

			#Remove all punctuation
			noPunc = ''.join(c for c in word if c not in string.punctuation)

			#If this word isn't in stopwords and isn't just a single letter
			if noPunc.lower() not in (stopwords.words('english')) and len(noPunc) > 1:

				stripped_word = self._strip_word(word)

				if stripped_word not in (custom_corpus.words('media')) and len(stripped_word) > 1:
					new_title = ' '.join([new_title, stripped_word])

		return new_title[1:]
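# Example #5 re-reads the English stopword list and the 'media' corpus file once per word of the
# title. A sketch of the same method with both lookups hoisted into sets built once (assumes the
# original imports, the '../custom_corpora/' layout and the self._strip_word helper):
def _strip_tags(self, title):
    custom_corpus = PlaintextCorpusReader('../custom_corpora/', '.*')
    stop_set = set(stopwords.words('english'))
    media_set = set(custom_corpus.words('media'))
    kept = []
    for word in title.split():
        # remove all punctuation
        no_punc = ''.join(c for c in word if c not in string.punctuation)
        if no_punc.lower() not in stop_set and len(no_punc) > 1:
            stripped_word = self._strip_word(word)
            if stripped_word not in media_set and len(stripped_word) > 1:
                kept.append(stripped_word)
    return ' '.join(kept)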
Example #6
def corpus_reader(filepath):
    """
    takes a filepath including filename
    formats in case file is csv
    loads file into PlainTextCorpusReader
    """
    print "TEST: corpus_reader call"

    csv_file = open(filepath, 'rb') # use test_1.csv as test case
    csv_data = csv.reader(csv_file)
    global csv_read
    csv_read = open('uploads/tmp/read.tmp', 'w')
    for line in csv_data:
        line_to_write = re.sub('[\s\t]+', ' ', str(line))
        line_to_write = line_to_write.lstrip('[\'')
        line_to_write = line_to_write.rstrip('\']')
        csv_read.write(str(line_to_write) + "\n\n")
    root = 'uploads/'
    corpus = PlaintextCorpusReader(root, 'tmp/read.tmp')
    #response = corpus.paras()
    words = corpus.words()
    return words
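# Example #6 is Python 2 (print statement, CSV opened in 'rb' mode). A rough Python 3 sketch of
# the same idea, with the file handles closed by `with` and the row joined instead of str()-stripped
# (the same 'uploads/' paths are assumed to exist):
import csv
import re
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def corpus_reader_py3(filepath):
    """Flatten a CSV into a temporary text file and load it with PlaintextCorpusReader."""
    with open(filepath, newline='') as csv_file, \
         open('uploads/tmp/read.tmp', 'w') as csv_read:
        for row in csv.reader(csv_file):
            line_to_write = re.sub(r'\s+', ' ', ' '.join(row))
            csv_read.write(line_to_write + "\n\n")
    corpus = PlaintextCorpusReader('uploads/', 'tmp/read.tmp')
    return corpus.words()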
Example #7
import os
import nltk
import pickle
import zlib
import base64
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpusdir = "./text"
newcorpus = PlaintextCorpusReader(corpusdir, ".*")
labeled_names = (
    [(name, "comp") for name in newcorpus.words("comp.txt")]
    + [(name, "animal") for name in newcorpus.words("animal.txt")]
    + [(word, "ignore") for word in newcorpus.words("ignorethese.txt")]
)
features = [({n: n}, thing) for (n, thing) in labeled_names]
training = features[:]
testing = "What color is the mouse?".lower().split(" ")
classifier = NaiveBayesClassifier.train(training)
pickleclf = pickle.dumps(classifier)
compressed = base64.b64encode(zlib.compress(pickleclf, 9))
with open("PickledClassifier.txt", "wb") as outobj:
    outobj.write(compressed)
compScore = 0
animalScore = 0
for word in testing:
    # strip trailing punctuation before looking the word up as a feature
    if word[-1] == "." or word[-1] == "," or word[-1] == "?":
        word = word[:-1]
    # tally which class the classifier assigns to each test word
    guess = classifier.classify({word: word})
    if guess == "comp":
        compScore += 1
    elif guess == "animal":
        animalScore += 1
Example #8
def getText(textFileName):
    file = open(textFileName)
    return file.read()


# Create the corpus folder
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):  # does the corpus folder already exist?
    os.mkdir(newCorpusDir)

# Read the files
# Plain text file
txt1 = getText('./Files/sample_feed.txt')
# PDF file
txt2 = pdf.getTextPDF('./Files/sample-pdf.pdf')
# DOCX file
txt3 = word.getTextWord('./Files/sample-one-line.docx')

# Write the files
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Build the custom corpus
# Read every file in the folder and create a corpus from those files
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

# Check that the custom corpus was built correctly
print(newCorpus.words())  # array containing every word in the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # print the array of sentences in 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # print the array of paragraphs in 0.txt
Example #9
import nltk
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from nltk.corpus.reader.plaintext import PlaintextCorpusReader

stop_words = set(stopwords.words('english')) # not interested in stop words
stop_words.update(['.', ',', "',", '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '-']) # ... or punctuation

corpusdir = 'lyrics/' # Directory of corpus.
oLedZeppelinCorpus = PlaintextCorpusReader(corpusdir, '.*')

lNoStopWords = []
for sWord in oLedZeppelinCorpus.words():
    if sWord not in stop_words:
        lNoStopWords.append(sWord)

lNoStopWordsLength = len(lNoStopWords)
wordCounts = Counter(lNoStopWords)
wordCountsLower = Counter(i.lower() for i in lNoStopWords)

# top 25
lCountLabels, lCountValues = zip(*wordCountsLower.most_common(50)[0:25])
lCountIndexes = np.arange(len(lCountLabels))
iCountWidth = 1
barlist = plt.bar(lCountIndexes, lCountValues)
for i in range(0, len(barlist)): # all bars to black
    barlist[i].set_color('black')
plt.xticks(lCountIndexes, lCountLabels)
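# The bar chart above is built but never rendered or saved; a minimal finishing step
# (assumes an interactive matplotlib backend, otherwise use savefig):
plt.show()
# plt.savefig('top25_words.png')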
Example #10
focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
#focal_word = "plant"
#senses = ["manufacturing","life"]
corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [ wsd.BigramLeft(senses, 0), wsd.BigramRight(senses, 1), wsd.BigramScope(senses, 2, [2, 10]) ]
decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")    

i = 0
for infile in sorted(corpus.fileids()):
  print i, "/", len(corpus.fileids())
  i += 1
  
  words = corpus.words(infile)
  text = Text(words)
  c = nltk.ConcordanceIndex(text.tokens)
  offsets = c.offsets(focal_word)
  
  for offset in offsets:
    for collocation in collocations:
      tokens = collocation.get_collocation(text, offset)
      if tokens == None: continue
      sense = decision_list.get_sense(tokens, collocation.index)
      if sense == None: continue
      collocation.add_collocation(text, offset, sense)
      collocation.update_decision_list(decision_list)
      #decision_list.add_sense(sense, tokens, collocation.index, score)
      print sense
Example #11
diccionari = {}
arxiu_diccionari = codecs.open("diccionari2-cat.txt", "r", encoding="utf-8")

for entrada in arxiu_diccionari:
    entrada = entrada.rstrip()
    camps = entrada.split(" ")
    forma = camps[0]
    lema = camps[1]
    etiqueta = camps[2]
    if forma in diccionari:
        diccionari[forma] = diccionari.get(forma,
                                           "") + " " + lema + " " + etiqueta
    else:
        diccionari[forma] = lema + " " + etiqueta

segmentador = nltk.data.load("catalan-mod.pickle")
tokenitzador = RegexpTokenizer('[ldsmLDSM]\'|\w+|[^\w\s]+')

corpus = PlaintextCorpusReader(".",
                               'noticia.txt',
                               word_tokenizer=tokenitzador,
                               sent_tokenizer=segmentador)

for forma in corpus.words():
    if forma in diccionari:
        info = diccionari[forma]
    elif forma.lower() in diccionari:
        info = diccionari[forma.lower()]
    else:
        info = "DESCONEGUDA"
    print(forma + " " + info)
Example #12
import random
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import bigrams, trigrams
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter, defaultdict

#create a folder for your corpus
corpusdir = 'miscme/'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
#tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#tokenizer.tokenize(newcorpus.strip())
words = newcorpus.words()
sents = newcorpus.sents()

words = [w.lower() for w in words]
sents = [[w.lower() for w in sent] for sent in sents]

trigram_counts = defaultdict(lambda: Counter())

for sentence in sents:
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        trigram_counts[(w1, w2)][w3] += 1

trigram_probs = defaultdict(lambda: Counter())
for w1_w2 in trigram_counts:
    total_count = float(sum(trigram_counts[w1_w2].values()))
    trigram_probs[w1_w2] = Counter({w3: c/total_count for w3,c in trigram_counts[w1_w2].items()})

# generate 10 random sentences from the trigram model
for i in range(10):
    text = [None, None]          # the (pad_left) start-of-sentence context
    while True:
        w1_w2 = tuple(text[-2:])
        choices = trigram_probs[w1_w2]
        next_word = random.choices(list(choices.keys()),
                                   weights=list(choices.values()))[0]
        text.append(next_word)
        if next_word is None:    # the (pad_right) end-of-sentence marker
            break
    print(' '.join(w for w in text if w is not None))
Example #13
###############################################################################
### ATTENTION: temp files such as .DS_Store on Mac OS X must be removed from the corpus directory ###

# Reading corpus
corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/glossAnnotated/' # Directory of corpus.
#corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/test1/' # Directory of corpus.   
risco = PlaintextCorpusReader(corpusdir, '.*')
risco.fileids()

raw_text = risco.raw('gloss533.txt')
#print raw_text[0:]

# Some statistics

print 'Number of term: ', len(risco.words())
print 'Number of unique terms: ', len(set(risco.words()))

fd = nltk.FreqDist(risco.words())
print fd.freq('bem')
print fd['bem']

# presenting ngrams of the term
target_word = 'bem como'
fd = nltk.FreqDist(ng
              for ng in nltk.ngrams(risco.words(), 6)
              if target_word in ng)
for hit in fd:
    print(' '.join(hit))

txt = nltk.Text(risco.words())
Example #14
                  encoding='utf-8') as f:

            for tweet in tweets:
                text = tweet.get("full_text")
                text = re.sub(r"http\S+", "",
                              text)  # remove links from corpora
                f.write(text)


create_corpus(tweet_folder)

# Create NLTK corpus from txt files
corpus_folder = Path("./corpus/")
corpus = PlaintextCorpusReader('./corpus/', '.*')

print(corpus.words('anime.txt'))


# Task a)
def filter_corpus(corpus, file=None):
    '''
    Removes English stopwords from the corpus text
    :param corpus: corpus reader providing the raw text
    :param file: optional fileid to restrict filtering to a single file
    :return:
    '''

    if file is not None:
        tokens = corpus.raw(file).split(' ')  # Using split to keep hashtags
    else:
        tokens = corpus.raw(corpus.fileids()).split(' ')
    filtered_words = []
Example #15
def getText(textFileName):
    # read the txt file
    file = open(textFileName)
    return file.read()


# create the corpus folder
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)
# read the files
txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

print(newCorpus.words()[:20])
print(newCorpus.sents(newCorpus.fileids()[1]))
print(newCorpus.paras(newCorpus.fileids()[0]))
print(newCorpus.fileids())
print(newCorpus.words(['1.txt', '2.txt']))
word = newCorpus.words(['0.txt'])
fDist = nltk.FreqDist(word)
print(fDist.most_common(10))
Example #16
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

segmentador = nltk.data.load("catalan.pickle")
tokenitzador = RegexpTokenizer('[ldsmLDSM]\'|\w+|[^\w\s]+')

corpus = PlaintextCorpusReader(".",
                               'DOGC-2015-cat.txt',
                               word_tokenizer=tokenitzador,
                               sent_tokenizer=segmentador)
for paraula in corpus.words():
    print(paraula)
print("TOTAL PALABRAS:", len(corpus.words()))
Example #17
import nltk
import os
from os import listdir
from os.path import isfile, join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# read in corpus, find all the 3-grams above the min frequency
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Read in " + str(len(my_corpus.fileids())) + " files"
print "Finding 3-grams"
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print "Filtering out 3-grams of frequency less than", MIN_FREQUENCY
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# combine all the 3-grams meeting the PMI threshold
print "Looking for 3-grams with a PMI of at least ", MIN_3GRAM_PMI
filelist = [ join(CORPUS_ROOT,f) for f in listdir(CORPUS_ROOT) if isfile(join(CORPUS_ROOT,f)) ]
gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# now let's do the same for the 2-grams
# our previous step altered the corpus so let's read it in again
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Finding 2-grams"
finder_2gram = BigramCollocationFinder.from_words(my_corpus.words())
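# Example #17 relies on module-level constants and a processGrams() helper that are not shown;
# a sketch of plausible definitions (every value and the helper body are illustrative assumptions):
CORPUS_ROOT = 'corpus/'          # directory holding the text files
CORPUS_EXTENSION = r'.*\.txt'    # fileid regex passed to PlaintextCorpusReader
MIN_FREQUENCY = 3                # drop n-grams seen fewer times than this
MIN_3GRAM_PMI = 10               # PMI threshold for keeping a 3-gram

def processGrams(gen, filelist):
    # hypothetical stand-in: just report each collocation that met the threshold
    for gram in gen:
        print(' '.join(gram))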
Example #18
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize
import re


corpusdir = 'python/' # Directory of corpus.

newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids()[0])
print(type(newcorpus))
#print newcorpus.raw()
print newcorpus.words(newcorpus.fileids()[0])
print(len(newcorpus.words()))

tokens = word_tokenize(newcorpus.raw())
#type(tokens)
print len(tokens)
print tokens[:50]
#tokens[:10]
print newcorpus.sents()
print

#to remove comments
def removeComments(string):
    string = re.sub(re.compile("/\*.*?\*/",re.DOTALL ) ,"" ,string) # remove all occurance streamed comments (/*COMMENT */) from string fdf
    string = re.sub(re.compile("//.*?\n" ) ,"" ,string) # remove all occurance singleline comments (//COMMENT\n ) from string
    return string

print(removeComments(newcorpus.raw()))
Example #19
class Misunderstood_artificial_poet:
    def __init__(self, master):
        '''	Constructor. master is a string that names a directory (under 'masters/') containing all the works used for inspiration
		'''
        self.master = 'masters/' + master
        self.reader = PlaintextCorpusReader(self.master,
                                            r'.*',
                                            encoding='utf-8')
        self.text = self.reader.words()

    def generate_model(self, word, num=50):
        '''	Writes a text based on most probable word to appear after each word. Prone to looping
		'''
        bigrams = nltk.bigrams(self.text)
        cfdist = nltk.ConditionalFreqDist(bigrams)
        print(cfdist[word].pformat())
        for i in range(num):
            print(word, end=' ')
            word = cfdist[word].max()

    def count_foot(self, word):
        ''' Counts the number of feet (syllables) in word (doesn't account for liaisons for now)
		'''
        startsWithVowel = False
        if re.match('[aeiouyàéèùôûâêîïöüäëÿ]', word):
            startsWithVowel = True
        splitword = re.split(pattern='[aeiouyàéèùôûâêîïöüäëÿ]',
                             string=word,
                             flags=re.IGNORECASE)
        cleansplit = [el for el in splitword if el != '']
        if startsWithVowel:
            cleansplit.append('vowel')
        return (len(cleansplit))

    def check_rhyme(self, language, string, substring):
        ''' Checks if the end of a string rhymes with a substring (could this be implemented via machine learning ?)
		'''
        pattern = re.sub(
            '(.*?)([aeiouyàéèùôûâêîïöüäëÿ][zrtpmlkjhgfdsqwxcvbn]*?$)', r'\2',
            string)
        if language == 'french':
            pattern = re.sub('', '', pattern)
        elif language == 'english':
            pass

    def text_generator(self, word, num=10):
        ''' Writes a text based on a random choice of word that appear in collocation in master's work
		'''
        verse = ""
        bigrams = nltk.bigrams(self.text)
        cfdist = nltk.ConditionalFreqDist(bigrams)
        for i in range(num):
            verse += word + ' '
            word_collocates = []
            for w in cfdist[word]:
                word_collocates.append(w)
            word = random.choice(word_collocates)
        return verse

    def rhyme_generator(self, inputWord, rhyme=None, foot=12):
        ''' Writes a verse based on previous word used
		'''
        verse = ""
        counted_foot = 0
        bigrams = nltk.bigrams(self.text)
        cfdist = nltk.ConditionalFreqDist(bigrams)
        continueWriting = True
        if rhyme is None:
            rhyme = random.choice([
                word for word in self.text
                if (len(word) > 3 and re.match('[a-zA-Z]', word))
            ])[-3:]
        while continueWriting:
            word_collocates = []
            for w in cfdist[inputWord]:
                word_collocates.append(w)
            if counted_foot < foot - 2:
                word = random.choice(word_collocates)
                verse += word + ' '
            else:
                rhyming_collocates = [
                    word for word in word_collocates
                    if (word.endswith(rhyme) and self.count_foot(word) == 2
                        and word != inputWord)
                ]
                if not rhyming_collocates:
                    rhyming_collocates = [
                        word for word in self.text
                        if (word.endswith(rhyme) and self.count_foot(word) == 2
                            and word != inputWord)
                    ]
                word = random.choice(rhyming_collocates)
                verse += word + ' '
                continueWriting = False
            counted_foot += self.count_foot(word)
        verse = re.sub(' $', '', verse)
        return verse

    def compose_standard_poem(self, length):
        '''	Writes a poem with each verse starting with most commons words in master's work
		'''
        poem = ''
        all_word_dist = nltk.FreqDist(w.lower() for w in self.text)
        mostcommon = all_word_dist.most_common(length)
        for word in [
                x[0] for x in mostcommon
                if re.search('[a-zA-Z]', x[0]) is not None
        ]:
            verse = self.text_generator(word)
            poem += verse + '\n'
        return poem

    def compose_random_poem(self, length):
        '''	Writes a poem with each verse starting with random words from master's work
		'''
        poem = ''
        for word in random.sample(
            [x for x in self.text if re.search('[a-zA-Z]', x) is not None],
                length):
            verse = self.text_generator(word)
            poem += verse + '\n'
        return poem

    def compose_prose_poem(self, length):
        '''	write a text that jumps to line after every n number of words, but is composed of one block only
		'''
        final_work = ""
        first_word = random.choice([
            w.lower() for w in self.text
            if re.search('[a-zA-Z]', w) is not None
        ])
        paragraph = self.text_generator(word=first_word, num=length)
        paragraphlist = paragraph.split(' ')
        for i in range(1, len(paragraphlist)):
            final_work += paragraphlist[i] + ' '
            if i % 10 == 0:
                final_work += '\n'
        return final_work

    def compose_rhyming_poem(self, length, foot):
        ''' write a poem that rhymes. For now we use a simple rhyming technique: AABBCCDD (it's boring but it's simple)
			Also the phonetics aren't fully implemented yet, so for now rhymes are based on the last 3 letters of the previous verse
			EXTRA tricky for French with all the usual mute letters that we love so much
		'''
        final_work = []
        first_word = random.choice([
            w.lower() for w in self.text
            if re.search('[a-zA-Z]', w) is not None
        ])
        verse = self.rhyme_generator(inputWord=first_word, foot=12, rhyme=None)
        final_work.append(verse)
        first_word = verse.split(' ')[-1]
        previous_rhyme = verse[-3:]
        for i in range(2, length):
            if i % 2 != 0:
                verse = self.rhyme_generator(inputWord=first_word, rhyme=None)
                previous_rhyme = verse[-3:]
                final_work.append(verse)
            else:
                verse = self.rhyme_generator(inputWord=first_word,
                                             rhyme=previous_rhyme)
                final_work.append(verse)
        final_work = "\n".join(final_work)
        return final_work

    def find_title(self):
        '''	find the best title to capture the essence of his work, through random search into words
		'''
        first_word = random.choice([
            w.lower() for w in self.text
            if re.search('[a-zA-Z]', w) is not None
        ])
        length = random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9])
        title = self.text_generator(word=first_word, num=length)
        return title

    def draft_manuscript(self, title, func, **kwargs):
        '''	write a piece of text to a file, send it to everyone in town and wait for the letters of rejection
		'''
        masterpiece = func(**kwargs)
        with open('failed_attempts/' + title + '.txt', 'w',
                  encoding='utf-8') as manuscript:
            manuscript.write(masterpiece)
            signature = re.sub("(^[a-zA-Z])(/)([a-zA-Z])(/)(.)*?", r"\3",
                               self.master.capitalize())
            manuscript.write('\n\n\t\t\t\t' + signature)
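# One possible way to drive Example #19's class (the master name is a placeholder assumption;
# the 'masters/<name>/' and 'failed_attempts/' directories must already exist):
poet = Misunderstood_artificial_poet('baudelaire')
title = poet.find_title()
print(title)
print(poet.compose_rhyming_poem(length=8, foot=12))
poet.draft_manuscript(title, poet.compose_random_poem, length=8)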
Example #20
        print("Usage: %s <filelist> <wordlist>" % (sys.argv[0]))
        sys.exit(1)

    files = []
    with open(sys.argv[1], 'r') as fileList:
        files.extend([x for x in fileList.readlines() if not x.startswith("#")])

    ROOT = "/home/ngilbert/xspace/data/"
    docs = []
    for f in files:
        f=f.strip()
        f=f.replace(ROOT, "")
        docs.append(f+"/raw.txt")
    corpus = PlaintextCorpusReader(ROOT, docs)
    #sys.stderr.write(str(corpus.fileids())+"\n")
    unigrams = [token.lower() for token in corpus.words()]
    unigram_fd = nltk.FreqDist(unigrams)
    bigrams = nltk.bigrams(list(map(string.lower, corpus.words())))
    #sys.stderr.write(str(len(bigrams)))
    bigram_fd = nltk.FreqDist(bigrams)
    #print bigram_fd
    #print bigram_fd[("the", "emotional")]
    #print unigram_fd["the"]
    #print bigrams
    sys.stderr.write(">>>finished counting unigrams and bigrams\n")

    wordlist = []
    with open(sys.argv[2], 'r') as wordList:
        wordlist.extend([x.strip() for x in wordList.readlines()])

    #process the documents, find the documents where each of these words
Example #21
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()


newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

print(newCorpus.words())
print(newCorpus.sents(newCorpus.fileids()[1]))
print(newCorpus.paras(newCorpus.fileids()[0]))
Example #22
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()


# Create a new corpus folder (directory)
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

# Write the contents of the three string objects to files on disk (write mode)
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Create the plaintext corpus object from the directory where the files were saved
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words())  # print all words of 0.txt
print(newCorpus.sents(newCorpus.fileids()[1]))  # print the sentences of 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # print the paragraphs of 0.txt
Example #23
class DumbClusterer():
    """A rather dumb clusterer. 
    """
    def __init__(self, corpus_dir=None, mwes=[], setup_mwes=True, **kwargs):
        self.mwes = mwes
        if corpus_dir is not None:
            self.setup_corpus(corpus_dir, '.*')
            if setup_mwes:
                self.setup_mwes(**kwargs)

    def setup_corpus(self, corpus_dir, paths='.*'):
        """Setting up a corpus.

        Args:
            corpus_dir(str): Path to corpus directory.
        """
        self.corpus = PlaintextCorpusReader(corpus_dir, paths)
        return self.corpus

    def extract_expressions(self, document, features=None):
        """Returns expressions from given features and multi-word expressions.
        
        In addition to passing a document into this method, MWEs or Multi-Word Expressions
        can be given to treat some multi words as one expression.

        >>> from document import ArthurDocument
        >>> pdf_path = base_path + '/test/test.pdf'
        >>> with open(pdf_path, 'rb') as f:
        ...     document = ArthurDocument(f.read())
        >>> features = document.get_features()[730:816,:]
        >>> print(document.get_text(features)) # doctest:+ELLIPSIS
        VICTORIA'S CROWN JEWEL OF WATERFRONT ESTATES. Nestled on a quiet cove in the exclusive

        Multi-word expression should be detected:
        >>> clusterer = DumbClusterer(mwes=['crown jewel', 'waterfront estates'])
        >>> expressions = clusterer.extract_expressions(document, features)
        >>> print(expressions[2]['text'])
        CROWN JEWEL

        x position should equal x of "C" from "CROWN JEWEL" :
        >>> expressions[2]['x'] == features[11, ArthurDocument.get_feature_id('x')]
        True

        and width should equal to width of "CROWN JEWEL":
        >>> expr_width = expressions[2]['x1']-expressions[2]['x']
        >>> ftr_width = features[21, ArthurDocument.get_feature_id('x1')] - features[11, ArthurDocument.get_feature_id('x')]
        >>> expr_width == ftr_width
        True

        Args:
            document(ArthurDocument): Document to extract data fields from.
            features(list): List of features containing data fields to extract. If not given, use
                            all document features.
            mwes(list): List of Multi-Word Expressions. Example value:
                        `['property type', 'single family']`. With that list, both "property type"
                        and "single family" will each be treated as single expressions.        
        Returns:
            np.array: An array of data_fields.
        """
        mwes = self.mwes
        if features is None:
            features = document.get_features()
        text = document.get_text(features)
        for idx, mwe in enumerate(mwes):
            if isinstance(mwe, str):
                mwes[idx] = word_tokenize(mwe.lower())
            elif hasattr(mwe, '__iter__'):
                mwes[idx] = [x.lower() for x in mwe]
        tokenizer = MWETokenizer(mwes, separator=' ')
        tokenized = tokenizer.tokenize(word_tokenize(text.lower()))

        expressions = []
        pos = 0
        for token in tokenized:
            # token could be "deez nutz" but text contains multiple spaces e.g. "deez  nutz",
            # so we need to split the token and find position of first and last characters.
            words = token.split()
            start_pos = text.lower().index(words[0], pos)
            for word in words:
                ipos = text.lower().index(word, pos)
                end_pos = ipos + len(word)
            pos = end_pos
            min_x = 0
            max_x = 0
            min_y = 0
            max_y = 0
            page = 0
            if len(features[start_pos:end_pos, :]) > 0:
                min_x =  np.amin(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('x')]
                max_x =  np.amax(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('x1')]
                min_y =  np.amin(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('y')]
                max_y =  np.amax(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('y1')]
                page = features[start_pos, ArthurDocument.get_feature_id('page')]

            expressions.append({
                'text': text[start_pos:end_pos],
                'x': min_x,
                'x1': max_x,
                'y': min_y,
                'y1': max_y,
                'page': page
            })
        return expressions

    def setup_mwes(self, trigram_nbest=100, bigram_nbest=2000):
        """Create multi-word expressions by learning a corpus located in a corpus directory.

        Testing setting up mwes with custom path and setting it up twice (correct when no exception):
        >>> corpus_dir = os.path.join(base_path, 'test', 'corpus')
        >>> clusterer = DumbClusterer(corpus_dir=corpus_dir, mwes=['custom mwe'])
        >>> mwes = clusterer.setup_mwes(trigram_nbest=1000, bigram_nbest=15000)
        >>> 'custom mwe' not in mwes
        True

        >>> 'custom mwe' in clusterer.mwes
        True

        Args:
            trigram_nbest(int): Number of highest ranked trigrams to acquire.
            bigram_nbest(int): Number of highest ranked bigrams to acquire.
        Returns:
            list: List of multi-word expressions.
        """
        if self.corpus is None:
            raise Exception("Corpus not found. Run method `setup_corpus` with given corpus directory first.")

        bigram_measures = BigramAssocMeasures()
        trigram_measures = TrigramAssocMeasures()

        # Following are not used since ne chunk takes too much time.
        # Text processing before bigrams and trigrams calculated
        # words = []
        # for sent in self.corpus.sents():
        #     for chunk in nltk.ne_chunk(nltk.pos_tag(sent)):
        #         if not isinstance(chunk, nltk.Tree):
        #             w = chunk[0]
        #             # - Removal of words containing numbers or punctuations
        #             if not any((ch.isdigit() or ch in string.punctuation) for ch in w):
        #                 # - Lowercasing all words
        #                 words.append(w.lower())
        #                 print(w.lower().encode("utf-8")),

        # Text processing before bigrams and trigrams calculated
        words = []
        for w in self.corpus.words():
            # - Removal of words containing numbers or punctuations
            if not any((ch.isdigit() or ch in string.punctuation) for ch in w):
                # - Lowercasing all words
                words.append(w.lower())

        bigram_finder = BigramCollocationFinder.from_words(words)
        trigram_finder = TrigramCollocationFinder.from_words(words)
        mwes = trigram_finder.nbest(trigram_measures.pmi, trigram_nbest) + bigram_finder.nbest(bigram_measures.pmi, bigram_nbest)
        # Basically combine the two lists by turning them into sets and taking the union,
        # i.e. `set1 | set2`. self.mwes may contain strings or lists; lists are converted
        # to tuples first so they are hashable.
        set1 = {(tuple(mwe) if isinstance(mwe,list) else mwe) for mwe in self.mwes}
        set2 = set(mwes)
        self.mwes = list(set1 | set2)
        return mwes
Example #24
class Contract_Reader():
    def __init__(self, config):
        print('Filepath for texts = ', config.textpath)
        self.corpus = PCR(config.textpath,
                          '.*\.txt',
                          encoding='utf-16',
                          para_block_reader=read_line_block)
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        #Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    def clean(self, config, mode='sent'):
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            #paragraphs are lists of sentences each of which is a list of tokens. Reducing to list of strings.
            self.para_list = [
                list(itertools.chain.from_iterable(para))
                for para in self.corpus.paras()
            ]
            for index, paragraph in enumerate(self.para_list):
                paragraph = " ".join(paragraph)
                stop_free = " ".join(
                    [i for i in paragraph.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free
                                    if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.para_list[index] = normalized
            print(self.para_list[0])
            self.para_list = [para.split() for para in self.para_list]
            print(self.para_list[0])
        if mode == 'sent':
            #Obtain list of strings each one a sentence rather than list of lists.
            self.sents_list = [" ".join(sent) for sent in self.corpus.sents()]
            for index, sentence in enumerate(self.sents_list):
                stop_free = " ".join(
                    [i for i in sentence.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free
                                    if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.sents_list[index] = normalized
            print(self.sents_list[0])
            self.sents_list = [
                sentence.split() for sentence in self.sents_list
            ]
            print(self.sents_list[0])

    def LDA(self, num_topics, num_words):
        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
        self.ldamodel = LdaVowpalWabbit(path,
                                        doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        self.ldamodel.save('model/lda_model')
        print(self.ldamodel.print_topics(num_topics=10, num_words=num_words))

    def plot(self, num_words):
        for t in range(self.ldamodel.num_topics):
            plt.figure()
            tuples = [
                reversed(x) for x in self.ldamodel.show_topic(t, num_words)
            ]
            plt.imshow(WordCloud().fit_words(dict(tuples)))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))

    def corpus_info(self):
        """
        Summary information about the status of a corpus.
        """
        fids = len(self.corpus.fileids())
        paras = len(self.corpus.paras())
        sents = len(self.corpus.sents())
        sperp = sum(len(para) for para in self.corpus.paras()) / float(paras)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        lexdiv = float(count) / float(vocab)

        print(
            ("Text corpus contains {} files\n"
             "Composed of {} paragraphs and {} sentences.\n"
             "{:0.3f} sentences per paragraph\n"
             "Word count of {} with a vocabulary of {}\n"
             "lexical diversity is {:0.3f}").format(fids, paras, sents, sperp,
                                                    count, vocab, lexdiv))
Example #25
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

filecontent1 = "This is a cow"
filecontent2 = "This is a Dog"

corpusdir = 'nltk_data/'
with open(corpusdir + 'content1.txt', 'w') as text_file:
    text_file.write(filecontent1)
with open(corpusdir + 'content2.txt', 'w') as text_file:
    text_file.write(filecontent2)

text_corpus = PlaintextCorpusReader(corpusdir,
                                    ["content1.txt", "content2.txt"])

no_of_words_corpus1 = len(text_corpus.words("content1.txt"))
print(no_of_words_corpus1)
no_of_unique_words_corpus1 = len(set(text_corpus.words("content1.txt")))

no_of_words_corpus2 = len(text_corpus.words("content2.txt"))
no_of_unique_words_corpus2 = len(set(text_corpus.words("content2.txt")))
Example #26
class Corpus(object):
    def __init__(self, data_root):
        self.data_root = data_root
        self.data = PlaintextCorpusReader(data_root, '.*')
        self.words = [i for i in self.data.words() if i.isalpha()]
        self.text = Text(self.words)
        self.stop = set(stopwords.words('english')).union({
            'cid', 'et', 'al', 'also', 'and', 'editingboston', 'arxiv',
            'pages', 'trackboston', 'preprint', 'page', 'vol', 'volume',
            'march', 'boston', 'table'
        })
        with open('bib.json') as fi:
            self.bib = json.load(fi)

    def documents(self):
        """Return a list of all documents in the corpus"""
        return sorted([i for i in os.listdir(self.data_root)])

    def words_in_file(self, filename):
        """Given a file, return a list of tokenized words"""
        try:
            text = self.data.open(filename).read()
        except FileNotFoundError:
            print("The file does not exist.")
        return word_tokenize(text)

    def sentences_in_file(self, filename):
        """Given a file, return a list of sentences"""
        try:
            text = self.data.open(filename).read()
        except FileNotFoundError:
            print("The file does not exist.")
        return sent_tokenize(text)

    def tokenized_sentences_in_file(self, filename):
        """Given a file name, return a list of word tokenized sentences"""
        try:
            text = self.data.open(filename).read()
            sent = [word_tokenize(s) for s in sent_tokenize(text)]
        except FileNotFoundError:
            print("The file does not exist.")
        return sent

    def most_frequent_content_words(self, n_words):
        """Return a list with the most frequent content words and their
        frequencies in (word, frequency) pairs ordered by frequency"""
        content_words = [
            w for w in self.words
            if w.lower() not in self.stop and w.isalpha() and len(w) > 1
        ]
        content_words_dict = FreqDist(content_words)
        return content_words_dict.most_common(n_words)

    def most_frequent_bigrams(self, n_bigrams):
        """Return a list with the most frequent bigrams of content words
        in the form of pairs where the first element is the bigram and
        the second is its frequency"""
        bigram_dict = FreqDist([k for k in bigrams(self.words)if k[0].isalpha()
            and k[1].isalpha() and len(k[0])>1 and len(k[1])>1 \
            and k[0].lower() not in self.stop and k[1].lower() not in self.stop])
        return bigram_dict.most_common(n_bigrams)

    def most_frequent_trigrams(self, n_trigrams):
        trigram_dict = FreqDist([k for k in trigrams(self.words)if k[0].isalpha()
            and k[1].isalpha() and len(k[0])>1 and len(k[1])>1 \
            and k[0].lower() not in self.stop and k[1].lower() not in self.stop
            and k[2].lower() not in self.stop])
        return trigram_dict.most_common(n_trigrams)

    def get_info(self, fileID):
        """Return metadata associate with a file indexed by the following fields:
        author, title, booktitle, year, publisher, pages, location, doi, url"""
        return self.bib[fileID]

    def print_reference(self, fileID):
        """Print metadata (author, title of paper, title of book, publishing year)
        associated with each file as a reference"""
        d = self.bib[fileID]
        print("%s. %s. %s, %s" % (' '.join(
            d['author'].split('\n')), d['title'], d['booktitle'], d['year']))

    def concordance(self, word):
        self.text.concordance(word)
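# One possible way to use Example #26's Corpus class (the data directory is a placeholder
# assumption, and a bib.json file must sit next to the script because __init__ loads it):
corpus = Corpus('papers/')
print(corpus.most_frequent_content_words(10))
print(corpus.most_frequent_bigrams(5))
corpus.concordance('corpus')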
Example #27
# nltk.download()
# nltk.download('gutenberg')

# text1.concordance("water")
# print(FreqDist(text1).most_common(50))
# FreqDist(text1).plot(50, cumulative=True)
# print(set(text1))

corpus_root = '/Users/devindyson/Desktop/nltk/corpora'
corpora = PlaintextCorpusReader(corpus_root, '.*')

# print(corpora.raw('meditations.txt'))
# print(SentimentIntensityAnalyzer().polarity_scores("NLTK is pretty dope."))

print(sorted(corpora.fileids()))
print(len(corpora.words('meditations.txt')))
print(len(corpora.words('benjamin.txt')))

meditations = Text(corpora.words('meditations.txt'))
benjamin = Text(corpora.words('benjamin.txt'))


def lexical_diversity(text_data):
    word_count = len(text_data)
    vocab_size = len(set(text_data))
    diversity_score = vocab_size / word_count
    return diversity_score


print(lexical_diversity(meditations))
print(lexical_diversity(benjamin))
Example #28
corpusdir = 'newcorpus2/'
if not os.path.isdir(corpusdir):
    os.mkdir(corpusdir)
# ***************************************************************************************************************************
# Reading the content of the file which is placed inside the directory newcorpus
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')
print("This is the text file inside newcorpus directory:")
print(newcorpus.raw())
# ***************************************************************************************************************************
# Reading the content of the file which is placed inside the directory newcorpus2
newcorpus2 = PlaintextCorpusReader('newcorpus2/', '.*')
print("This is the text file inside newcorpus2 directory:")
print(newcorpus2.raw())
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
######################################################################################################################
file_1_count = newcorpus.words()
print()
print("Display of each word of the file inside the directory newcorpus:")
print(file_1_count)
# count the frequency distribution of each word in the text file
fre_count_file_1 = nltk.FreqDist(file_1_count)
print()
print("Please see the frequency distribution of each word:")
print(fre_count_file_1)
most_common_word = fre_count_file_1.most_common(2)
print()
print("See the two most commonly used words from the file:")
print(most_common_word)

#################################################################################################################
######################################################################################################################
Example #29
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

segmentador = nltk.data.load("catalan.pickle")
tokenitzador = RegexpTokenizer('[ldsmLDSM]\'|\w+|[^\w\s]+')

corpus = PlaintextCorpusReader(".",
                               'DOGC-2015-cat.txt',
                               word_tokenizer=tokenitzador,
                               sent_tokenizer=segmentador)

ocurrencies = corpus.words()
tipus = set(ocurrencies)

print("OCURRENCIES:", len(ocurrencies))
print("TIPUS:", len(tipus))
Example #30
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

segmentador = nltk.data.load("catalan.pickle")
tokenitzador = RegexpTokenizer('[ldsmLDSM]\'|\w+|[^\w\s]+')
corpus = PlaintextCorpusReader(".",
                               'DOGC-2015-cat.txt',
                               word_tokenizer=tokenitzador,
                               sent_tokenizer=segmentador)
frequencia = {}
for paraula in corpus.words():
    frequencia[paraula] = frequencia.get(paraula, 0) + 1
for clau in frequencia.keys():
    print(frequencia[clau], clau)
Example #31
def detect(request):
    # Data input
    if request.method == 'POST':
        identificacion=request.POST.get('dni')
        a=request.FILES['document']
        documento=str(a)
        datos_doc=documento.split('.')
        nombre_doc=datos_doc[0]
        tipo_doc=datos_doc[1]
        if tipo_doc=='txt':
            name=request.FILES['document'].read().lower()
            print(datos_doc)
            #mul=set(stopwords.words("spanish"))
            mul=codecs.open('mul.txt', "r", encoding='UTF-8').read()
            remove('muletillas.txt')
            discurso=(name.decode('UTF-8'))
            # Separate filler words ("muletillas") from common words
            text_completo = wordpunct_tokenize(discurso)
            m = []
            m = [w for w in text_completo if w in mul]
            
            muletillas= codecs.open('muletillas.txt', "a")
            for i in m:
                muletillas.write(i)
                muletillas.write(" ")
                
            muletillas.close()
            # Count the filler words
            tokenizador=RegexpTokenizer('\w+|[^\w\s]+')

            corpus = PlaintextCorpusReader(".", 'muletillas.txt',word_tokenizer=tokenizador, encoding='Latin-1')
            
            frecuencia=FreqDist(corpus.words())
            salida=codecs.open("muletillasR.txt","w",encoding="utf-8")
            palabras=[]
            repeticiones=[]
            # Add the extracted data to a txt file for later presentation
            for mc in frecuencia.most_common(): 
                palabra=mc[0]
                frecuencia_absoluta=mc[1]
                frecuencia_relativa=frecuencia.freq(palabra)
                cadena=str(frecuencia_absoluta)+"\t"+str(frecuencia_relativa)+"\t"+palabra  
                
                palabras.append(palabra.upper()) 
                repeticiones.append(frecuencia_absoluta)  
                salida.write(cadena+"\n")
            try:
                collection.insert_one({
                    'identificacion':identificacion,
                    'documento': documento,
                    'discurso':discurso,
                    'muletillas':palabras
                })
            except Exception as e:
                print("Error : ", type(e), e)
            # Send the data to the front end
            context={
                'documento': nombre_doc,
                'muletillas':palabras[0:10],
                'repeticiones': repeticiones[0:10]
            }
            return render(request, 'responde.html', context)
        else :
            messages.warning(request, "Verifique el tipo de archivo", extra_tags='file')
            return render(request, 'home.html')
    return render(request, 'home.html')





# class LineChartJSONView(BaseLineChartView):
#     def get_labels():
#         """Return 7 labels for the x-axis."""
#         return ["January", "February", "March", "April", "May", "June","July", "August", "September", "October"]

#     def get_providers(self):
#         """Return names of datasets."""
#         return ["Repeticiones"]

#     def get_data(self):
#         """Return 3 datasets to plot."""

#         return [[75, 44, 92, 11, 44, 95, 35, 11, 44, 95, 35]]


# line_chart = TemplateView.as_view(template_name='responde.html')
# line_chart_json = LineChartJSONView.as_view()
Example #32
corpusdir = 'files/' # Directory of corpus.
root = os.getcwd()
newcorpus = PlaintextCorpusReader(corpusdir, '.*',encoding="latin-1")
print(len(onlyfiles))

fhand = open('stopWords.txt', 'r')
stopWords = fhand.read()
stopWords = stopWords.split('\n')

is_noun = lambda pos: pos[:2] == 'NN'
is_adject = lambda pos: pos[:2] == 'JJ'


for file in onlyfiles:
    print(file)
    text = newcorpus.words(file)
    print(nltk.pos_tag(text))
    print(len(text))
    filename = root + "/coupusFiles/" + file
    print(filename)
    f = open(filename, 'w')
    for words in text:
        print(is_noun(words))
        # if is_noun(words):
        #     if words.lower() not in stopWords:
        #         f.write(words)
        #         f.write("\n")
        # if is_adject(words):
        #     if words.lower() not in stopWords:
        #         f.write(words)
        #         f.write("\n")
Example #33
import nltk
import os
import glob
from os.path import join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# read in corpus, find all the 3-grams above the min frequency
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Read in " + str(len(my_corpus.fileids())) + " files"
print "Finding 3-grams"
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print "Filtering out 3-grams of frequency less than", MIN_FREQUENCY
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# combine all the 3-grams meeting the PMI threshold
print "Looking for 3-grams with a PMI of at least ", MIN_3GRAM_PMI
filelist = [f for f in glob.glob(CORPUS_ROOT + CORPUS_OUTPUT_EXTENSION)]

gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# now let's do the same for the 2-grams
# our previous step altered the corpus so let's read it in again
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Finding 2-grams"
Example #34
#STEP 1
# This is the variable name for the target file to read. Note it is useful to copy and paste all from
# .PDF into a .TXT file to read
File_to_Read = 'Sample_from_PDF.txt'

# Read file
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Read file
corpus = PlaintextCorpusReader(os.getcwd(), File_to_Read)
#print(corpus.raw())

# Counts total sentences in document and creates a list of words in document
sentences = corpus.sents()
print("\n Total sentences in this corpus : ", len(sentences))
print("\n Words in this corpus : ", corpus.words())

# Finds frequency distribution of words in document
course_freq_dist = nltk.FreqDist(corpus.words())
print("\n Top 30 words in the corpus : ", course_freq_dist.most_common(30))

# Calculate distribution for a specific word
print("\n Distribution for \"hydrogen\" : ", course_freq_dist.get('hydrogen'))

# Tokenization

# Read base file into raw text variable
base_file = open(os.getcwd() + "/" + File_to_Read, mode='rt', encoding='utf-8')
raw_text = base_file.read()
base_file.close()
Example #35
class TextAnalizer:
    def __init__(self, my_input_file):
        self.config = configparser.ConfigParser()
        self.config.read("text_analysis.cfg")
        self.input_file = my_input_file
        self.nlp_model = self.config["DEFAULT"]["nlp_model"]
        #The output file name
        self.output_file = self.config["DEFAULT"]["output_file"]
        self.nlp = load_nlp(self.nlp_model)
        self.corpus = CorpusReader(".", self.input_file)
        self.raw_text = self.corpus.raw()
        self.nlp_text = self.nlp(self.raw_text)
        # Here, lets put together the infos for text analysis with spacy.
        self.analysis_dictionary = Counter()
        self.word_count = 0
        self.get_word_count_nltk()

    def get_paragraph(self):
        return self.corpus.paras()

    def get_sentence(self):
        return self.corpus.sents()

    def get_word(self):
        return self.corpus.words()

    def get_word_count_nltk(self):
        tokenizer = Tokenizer(r'\w+')
        counts = Counter()
        sentences = self.get_sentence()
        for sentence in sentences:
            tokens = tokenizer.tokenize(" ".join(sentence))
            self.word_count = self.word_count + len(tokens)
            filtered = [w for w in sentence if w.isalnum()]
            counts = counts + Counter(filtered)
        return counts, self.word_count

    def analize_nlp(self):
        analized_data_str = (self.config["ANALIZED"]["POS"])
        analized_data = (analized_data_str.split(","))
        result_dict = {}
        diff_str, tot_str = (
            self.config["DEFAULT"]["diff_tot_string"]).split(",")
        lemma_counter = Counter()
        pos_counter = Counter()
        tag_counter = Counter()

        for token in self.nlp_text:
            lemma_counter = lemma_counter + Counter([token.lemma_])
            pos_counter = pos_counter + Counter([token.pos_])
            tag_counter = tag_counter + Counter([token.tag_])
            my_key = token.lemma_ + "_" + token.tag_ + "_" + token.pos_
            self.analysis_dictionary[my_key] += 1
        for pos in analized_data:
            instance_counter = 0
            total_counter = 0
            for key in self.analysis_dictionary.keys():
                try:
                    my_lemma, my_tag, my_pos = key.split("_")
                except ValueError:
                    print("Warning: Array has a empty line")  # add logging
                if pos == my_pos:
                    instance_counter += 1
                    total_counter = total_counter + self.analysis_dictionary.get(
                        key)
            result_dict[pos + diff_str] = instance_counter
            result_dict[pos + tot_str] = total_counter
        #add the stuff from nltk
        diff_word, word_count = self.get_word_count_nltk()
        result_dict["WORDS" + tot_str] = word_count
        result_dict["WORDS" + diff_str] = len(diff_word)
        result_dict["PARAGRAPHS"] = len(self.get_paragraph())
        result_dict["SENTENCES"] = len(self.get_sentence())

        return result_dict

    def write_output(self):
        with open(self.output_file, "w+") as f:
            f.write("Number of paragraphes: " +
                    str(len(self.get_paragraph())) + "\n")
            f.write("Number of sentences: " + str(len(self.get_sentence())) +
                    "\n")
            f.write("Number of words: " + str(self.word_count) + "\n")
            f.write("Average words per sentence: " +
                    str(round(self.word_count / len(self.get_sentence()), 2)) +
                    "\n")
            f.write("Number of different words: " +
                    str(len(self.get_word_count_nltk())) + "\n")
            f.write("Text variety (different words/total words: " + str(
                round(len(self.get_word_count_nltk()) / self.word_count, 2)) +
                    "\n")
            f.close()
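# Example #35 expects a text_analysis.cfg next to the script; a sketch of the keys it reads,
# written via configparser so the block stays Python (all values are illustrative assumptions):
import configparser

cfg = configparser.ConfigParser()
cfg["DEFAULT"] = {
    "nlp_model": "en_core_web_sm",        # spaCy model name handed to load_nlp()
    "output_file": "analysis.txt",
    "diff_tot_string": "_different,_total",
}
cfg["ANALIZED"] = {"POS": "NOUN,VERB,ADJ"}
with open("text_analysis.cfg", "w") as f:
    cfg.write(f)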
Example #36
import nltk
import numpy as np
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Get raw text as string.
corpusdir = 'lyrics/'  # Directory of corpus.
oLedZeppelinCorpus = PlaintextCorpusReader(corpusdir, '.*')
lCorpus = oLedZeppelinCorpus.words()  # every word in the corpus


def make_pairs(lCorpus):
    for i in range(len(lCorpus) - 1):
        yield (lCorpus[i], lCorpus[i + 1])


pairs = make_pairs(lCorpus)

word_dict = {}

for word_1, word_2 in pairs:
    if word_1 in word_dict.keys():
        word_dict[word_1].append(word_2)
    else:
        word_dict[word_1] = [word_2]

first_word = np.random.choice(lCorpus)

while first_word.islower():
    first_word = np.random.choice(lCorpus)

chain = [first_word]
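# Example #36 stops after picking the first capitalised word; a minimal continuation that samples
# a fixed number of successor words from word_dict (the chain length is an arbitrary assumption):
n_words = 30
for _ in range(n_words):
    successors = word_dict.get(chain[-1], lCorpus)  # fall back to any word if no successor recorded
    chain.append(np.random.choice(successors))
print(' '.join(chain))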
Example #37
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import FreqDist

#Create a corpus
corpusdir = "/home/erdinc/nltk/cs290f_proj/tos/"
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
corpusWords = nltk.Text(newcorpus.words())
posTags = nltk.pos_tag(corpusWords)


#Total number of words in corpus
def getTotalNumberOfWords(words):
	return len(words)

#Number of unique words in corpus
def getNumberOfUniqueWords(words):
	return len(set(words))

#Most frequently used 25 words
def getMostFreqWords(words):
	fdist = FreqDist(words)
	return [w for w, _ in fdist.most_common(25)]


#Name List
def getNameList(tags):
	# collect proper nouns from the POS tags (assumption: names are the NNP-tagged tokens)
	nameList = []
	for word, tag in tags:
		if tag.startswith('NNP'):
			nameList.append(word)
	return nameList
Example #38
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk

# Might need the below line once
# nltk.download('punkt')

corpusDir = 'own_corpus/'

newCorpus = PlaintextCorpusReader(corpusDir, '.*\.txt')

for file in sorted(newCorpus.fileids()):
    words = newCorpus.words(file)
    text = nltk.Text(words)
    print(text)
Example #39
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
segmentador = nltk.data.load("catalan.pickle")
tokenitzador = RegexpTokenizer('[ldsmLDSM]\'|\w+|[^\w\s]+')

corpus = PlaintextCorpusReader(".",
                               'DOGC-2015-cat.txt',
                               word_tokenizer=tokenitzador,
                               sent_tokenizer=segmentador)

frequencia = FreqDist(corpus.words())

for mc in frequencia.most_common():
    print(mc)
Example #40
# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])

# Access sentences in the corpus. (list of list of strings)
# NOTE: The texts are flattened into sentences that contain tokens.
print newcorpus.sents()
print

# To access sentences of a specific fileid.
print newcorpus.sents(newcorpus.fileids()[0])

# Access just tokens/words in the corpus. (list of strings)
print newcorpus.words()

# To access tokens of a specific fileid.
print newcorpus.words(newcorpus.fileids()[0])