Example #1
def load_feat_data(dir_array):

    data_list = []

    for direct in dir_array:

        data = []

        corpus_dir = 'dataset/' + direct
        corpus = PlaintextCorpusReader(corpus_dir, '.*\.*')
        file_ids = corpus.fileids()

        for file in file_ids:
            text = corpus.raw(file)
            e = email.message_from_string(text)

            if e.is_multipart():
                # Concatenate the text of every part of a multipart message.
                text = ""
                for payload in e.get_payload():
                    text += payload.get_payload()
            else:
                text = e.get_payload()

            data.append(extract_features(text, corpus, file))

        data_list.extend(data)

    return data_list
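For reference, the multipart branch above can also be written with Message.walk(), which flattens nested multipart messages as well; a minimal standalone sketch on a made-up raw message:

import email

raw = "Subject: hi\nContent-Type: text/plain\n\nhello world\n"
msg = email.message_from_string(raw)

# Collect the text of every text/plain part; walk() also visits nested parts.
parts = []
for part in msg.walk():
    if part.get_content_type() == "text/plain":
        parts.append(part.get_payload())
text = "\n".join(parts)
print(text)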
Example #2
def read_corpus(corpus_path):
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader
    corpus = PlaintextCorpusReader(corpus_path, ".*\.txt")
    ctext = corpus.raw()
    #    with open('corpus.txt', 'w') as cf:
    #        cf.write(ctext.encode('utf-8'))
    return ctext
Example #3
def load_corpus(race_code=None,
                gender_code=None
                ):  #loads corpora into an array based on race and gender

    if (race_code == None):  # if none is specified, search all
        race_code = ".."
    if (gender_code == None):
        gender_code = ".."

    reader = PlaintextCorpusReader(
        corpus_root, ".*_" + race_code + "_" + gender_code +
        "\.txt")  # uses filename encoding to load specified texts
    corpora = []

    for fileid in reader.fileids(
    ):  #creates ComedyCorpus object, populates with fileid and name
        new_corpus = ComedyCorpus()
        new_corpus.set_fileid(fileid)
        try:
            new_corpus.set_text(
                reader.raw(fileid))  #gets word content based on fileid
        except UnicodeDecodeError:
            continue
        fileid = re.sub("_" + race_code + "_" + gender_code + r"\.txt", "",
                        fileid)
        #name is fileid without encoding
        fileid = fileid.replace("%20", " ")
        fileid = fileid.replace("_", "; ")
        print(fileid)
        new_corpus.set_name(fileid)
        corpora.append(new_corpus)

    return corpora
Example #4
def load_data(dir_label):

    data_list = []
    labels = []

    for dl in dir_label:

        data = []

        directory = dl[0]
        label = dl[1]

        corpus_dir = 'dataset/' + directory
        corpus = PlaintextCorpusReader(corpus_dir, '.*\.*')
        file_ids = corpus.fileids()

        for file in file_ids:

            d = []

            text = corpus.raw(file)
            e = email.message_from_string(text)

            if e.is_multipart():
                # Concatenate the text of every part of a multipart message.
                text = ""
                for payload in e.get_payload():
                    text += payload.get_payload()
            else:
                text = e.get_payload()

            feats = [
                cf.charac_feats_extractor(text),
                wf.word_feats_extractor(text),
                syf.syntac_feats_extractor(text),
                stf.struct_feats_extractor(corpus, file, text),
                fwf.funct_word_feats_extractor(text)
            ]

            for f in feats:
                d.extend(list(f.values()))

            data.append(d)
            labels.append(label)

        data_list.extend(data)

    return [data_list, labels]
Example #5
def pdf_to_corpus():
    path = 'D://Eclipse Workspace//NLP//Assignment//res//'

    for filename in glob.glob(os.path.join(path, '*.pdf')):
        print(filename)
        pdfFileObj = open(filename, 'rb')

        # creating a pdf reader object
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

        # printing number of pages in pdf file
        print(pdfReader.numPages)

        # creating a page object
        pageObj = pdfReader.getPage(0)

        # extracting text from page
        text = pageObj.extractText()

        strings_list = text.split("\n")
        # Make new dir for the corpus.
        corpusdir = 'customcorpus/'
        if not os.path.isdir(corpusdir):
            os.mkdir(corpusdir)

        # Output the files into the directory.
        file_name = filename.split("\\")[-1]

        print(file_name)
        pbar = ProgressBar(widgets=[
            'Creating Corpus',
            Bar('#', '[', ']'), ' ',
            Percentage(), ' ',
            ETA()
        ],
                           maxval=100)
        for text in pbar(strings_list):
            with open(corpusdir + '[PDF] ' + file_name + '.txt', 'ab') as fout:
                fout.write(text.encode('utf-8'))
        pbar.finish()

        #create_corpus(text)
        corpus = PlaintextCorpusReader('customcorpus/', '.*')

        print(corpus.raw())
Example #6
def token_assamese():
    # Modify these to change the location of the corpus file and the name of the corpus file
    corpus_path = "/Users/partha/All/Python/ProjectMaterials/Learned material/Arts"
    corpus_filename = 'Psychology.txt'

    newcorpus = PlaintextCorpusReader(corpus_path,
                                      corpus_filename,
                                      encoding='utf16')
    text = newcorpus.raw().strip().replace('।', '.')
    words = nltk.word_tokenize(text)

    for index, item in enumerate(words):
        if (str(item) == '.'):
            words[index] = '।'

    output_file_path = "C:/Users/HEMANT/Documents/1.Project/"
    output_filename = 'Result.txt'

    with open(output_file_path + output_filename, 'w', encoding='utf8') as f:
        for i in words:
            f.writelines(str(i) + '\n')

Example #7
def read_article(file_path):
    #file = open(file_path, "r")
    ##INSERT FILE NAME IN FUNCTION CALL BELOW######
    bcr = PlaintextCorpusReader(file_path, 'bernie.txt')
    #filedata = file.read()
    filedata = bcr.raw()
    #for word in filedata.split():
    #    if word == 'Mr.':
    #        filedata[word] = 'Mr'
    article = filedata.replace("\n\n", '. ').replace('Mr.', 'Mr').replace(
        "\r", ' ').replace('\n', ' ').split('. ')
    articlez = []
    for line in article:
        if line == '':
            continue
        if line[0] == '\n':
            line = line[1:]
        articlez.append(line)
    sentences = []
    for sentence in articlez:
        # str.replace() does not take a regex; use re.sub to strip non-letters.
        sentences.append(re.sub("[^a-zA-Z]", " ", sentence).split(" "))
    sentences.pop()

    return sentences
Example #8
    refined_used_vocab = []
    n = 0
    for word in used_vocab:
        if word not in refined_used_vocab:
            refined_used_vocab.append(word)
            n = n + 1
    print '    Used Vocab words: '
    print '\t\t\t', refined_used_vocab
    print '    No. of vocab words used: ', n, '\n'
    return n


datadir = 'just for test/moreErrors/'
train_data = PlaintextCorpusReader(datadir, '.*')
all_contents = train_data.raw().strip()
all_text = preprocess(all_contents)
bag_of_words = [word for word, word_count in Counter(all_text).most_common(20)]
print('\n Bag of Words:')
print bag_of_words
print('\n')

path1 = '/home/sudo-this/PycharmProjects/Automated Essay Marking/top_scored_essay/*.txt'
files = glob.glob(path1)
reference_collection = []
for filename in sorted(files):
    with open(filename, 'r') as f:
        f_contents = f.read()
    preprocessed_text = preprocess(f_contents)
    reference_text = preprocessed_text[:-1]
    reference_collection.append(reference_text)
Example #9
	rem_chars = "[!\"#$%&()*+,:;<=>?@[\\]^_`{|}~0123456789]" # remove these
	rep_chars = "[-./\']" # replace these
	t_temp = re.sub(rem_chars, "", t.lower())
	t_temp = re.sub(rep_chars, " ", t_temp)
	t_strip_lower_filt = [w for w in t_temp.split() if not w in stopwords.words('english')]
	return " ".join(t_strip_lower_filt)

# load the data
corpusdir = 'corpus_txt/' # Directory of corpus.
mycorp_raw = PlaintextCorpusReader(corpusdir, '.*')
file_index = mycorp_raw.fileids()

# preprocess the text (slow)
# uncomment one of the following lines for usual vs parallel processing
#mycorp_proc = nltk.Text([preprocess(mycorp_raw.raw(f)) for f in file_index])
mycorp_proc = Parallel(n_jobs=3,verbose=True)(delayed(preprocess)(mycorp_raw.raw(f)) for f in file_index)


# get ngrams (1-3)
vectorizer_ngrams = CountVectorizer(min_df = 0.05, ngram_range=(1, 3))
mat_ngrams = vectorizer_ngrams.fit_transform(mycorp_proc)
n_df = pd.DataFrame(data = mat_ngrams.A, 
	columns = vectorizer_ngrams.get_feature_names())
n_df['pt_id'] = [i[:-4] for i in file_index]
# write results to file
n_df.to_csv('ngrams_dtm.csv', index = False)

# note the following analysis is redundant in this small case, since all words 
# will be captured. In a very large dataset we might limit the capture of n-grams
# to those that only occur in, say, 5% of the documents, thus potentially excluding other
# terms of interest such as those below
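As the closing comment says, document-frequency pruning mainly matters for larger corpora; scikit-learn's min_df accepts either a float (a proportion of documents) or an int (an absolute document count), so the threshold mentioned above could be written either way (values below are illustrative):

from sklearn.feature_extraction.text import CountVectorizer

# keep n-grams that occur in at least 5% of the documents ...
vectorizer_prop = CountVectorizer(min_df=0.05, ngram_range=(1, 3))
# ... or in at least 10 documents, whichever suits the corpus size
vectorizer_abs = CountVectorizer(min_df=10, ngram_range=(1, 3))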
Example #10
import os
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def features(sentence):
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)


corpusdir = './text'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
# Build one featureset per sentence; mapping features() over raw() would iterate characters.
positive_featuresets = [features(' '.join(sent)) for sent in newcorpus.sents('comp.txt')]
unlabeled_featuresets = [features(' '.join(sent)) for sent in newcorpus.sents('animal.txt')]
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
                                                unlabeled_featuresets, .3)
print(classifier.classify(features('.')))
Example #11
def try_out_some_functionalities():

    corpusdir ="/media/benzro/OS/Users/benzro/Desktop/Studium Uni/2)" \
           "ZweitesSemester/27)PCL-2/Uebungen/Uebung03/Enron/test/"
    newcorpus = PCR(corpusdir, '.*')

    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access one file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    infile = "0004.1999-12-14.farmer.ham.txt"  # fileids are given relative to the corpus root
    fin = newcorpus.open(infile)
    print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "all file ids"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.fileids()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access each file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # (reduced output: [0:2])
    for infile in sorted(newcorpus.fileids()):
        # the fileids of each file
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        print infile
        # opens the file
        fin = newcorpus.open(infile)
        # prints the content of the file
        print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access the plaintext; outputs pure string of all files"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.raw().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access paragraphs in the corpus. (list of list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
    #       nltk.tokenize.word_tokenize.
    #
    # Each element in the outermost list is a paragraph, and
    # Each paragraph contains sentence(s), and
    # Each sentence contains token(s)
    print newcorpus.paras()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access pargraphs of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.paras(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access sentences in the corpus. (list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: That the texts are flattened into sentences that contains tokens.
    print newcorpus.sents()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access sentences of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.sents(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access just tokens/words in the corpus. (list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access tokens of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
Example #12
# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('nltkCorpusAll/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile # The fileids of each file.
    fin = newcorpus.open(infile)# Opens the file.
    print fin.read().strip() # Prints the content of the file
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print 

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and 
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])
Example #13
config = configparser.ConfigParser()
config.read("text_analysis.cfg")
#Read out configuration parameters
#Filname of text file to analyze
input_file = config["DEFAULT"]["input_file"]
#The nlp model used
nlp_model = config["DEFAULT"]["nlp_model"]
#The output file name
output_file = config["DEFAULT"]["output_file"]

#Load the nlp model
nlp = load_nlp(nlp_model)

#This Section generates a corpus (for nltk) and a string-text (for spacy)
corpus = CorpusReader(".", input_file)
my_text = corpus.raw()

#This section deals with nltk-stuff for analysis
paragraphs = corpus.paras()
sentences = corpus.sents()
words = corpus.words()

tokenizer = Tokenizer(r'\w+')
word_count = 0
counts = Counter()

for sentence in sentences:
    tokens = tokenizer.tokenize(" ".join(sentence))
    word_count = word_count + len(tokens)
    filtered = [w for w in sentence if w.isalnum()]
    counts = counts + Counter(filtered)
Example #14
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from decimal import Decimal
from math import pi

if __name__ == '__main__':
    ptcr = PlaintextCorpusReader(r'C:\Users\Jakub\Downloads\pr4\Trzeci plik', ['znormalizowane.txt', 'katy.txt'])
    data = []
    t = ptcr.raw(fileids=ptcr.fileids()[1]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data.append(float(Decimal(x)*360/315))
    print data
    data_ = []
    t = ptcr.raw(fileids=ptcr.fileids()[0]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data_.append(float(x)/100)
    print data_
Example #15
import os
#import the module nltk
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
corpusdir = 'newcorpus2/'
if not os.path.isdir(corpusdir):
    os.mkdir(corpusdir)
# ***************************************************************************************************************************
# Reading the content of the file which is placed inside the directory newcorpus
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')
print("This is the text file inside newcorpus directory:")
print (newcorpus.raw())
# ***************************************************************************************************************************
# Reading the content of the file which is placed inside the directory newcorpus2
newcorpus2 = PlaintextCorpusReader('newcorpus2/', '.*')
print("This is the text file inside newcorpus2 directory:")
print(newcorpus2.raw())
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
######################################################################################################################
file_1_count=newcorpus.words()
print()
print("Display of each word of the file inside the directory newcorpus:")
print(file_1_count)
#count the frequency distribution of each word in the text file
fre_count_file_1= nltk.FreqDist(file_1_count)
print()
print("Please see the frequency distribution of each word:")
print(fre_count_file_1)
most_common_word = fre_count_file_1.most_common(2)
print()
print("See the most two common used words from the file:")
Example #16
import nltk
import re
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import FreqDist

corpus_root = '/home/aman/entire-src/py/dir'
speeches = PlaintextCorpusReader(corpus_root, '.*\.txt')

print "Finished importing corpus"

raw = speeches.raw().lower()
tokens = nltk.word_tokenize(raw)
tgs = nltk.trigrams(tokens)
fdist = nltk.FreqDist(tgs)
for k,v in fdist.items():
    print k,v
Example #17
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize
import re


corpusdir = 'python/' # Directory of corpus.

newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids()[0])
print(type(newcorpus))
#print newcorpus.raw()
print newcorpus.words(newcorpus.fileids()[0])
print(len(newcorpus.words()))

tokens = word_tokenize(newcorpus.raw())
#type(tokens)
print len(tokens)
print tokens[:50]
#tokens[:10]
print newcorpus.sents()
print

#to remove comments
def removeComments(string):
    string = re.sub(re.compile("/\*.*?\*/",re.DOTALL ) ,"" ,string) # remove all occurance streamed comments (/*COMMENT */) from string fdf
    string = re.sub(re.compile("//.*?\n" ) ,"" ,string) # remove all occurance singleline comments (//COMMENT\n ) from string
    return string

print(removeComments(newcorpus.raw()))
Example #18
#import the module nltk
import nltk
#import this module for drawing graphs
import matplotlib
# Reader of NLTK to access our own text files and treat them as regular corpora
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
#This is the directory in which we can store our text file
corpusdir = 'newcorpus/'
#this will make the directory in the folder you are working.
if not os.path.isdir(corpusdir):
    os.mkdir(corpusdir)
#accessing the file which is inside the directory
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')
#Now let us perform some operations using Natural Language Processing
#displaying the content of the file in the newcorpus directory which has been made
print(newcorpus.raw().strip())
#displaying the number of words in the file inside the directory newcorpus
a = len(newcorpus.words())
print("This will tell me the words inside the file", a)
#displaying the number of sentences in the file inside the directory newcorpus
b = len(newcorpus.sents())
print("This will tell me the sentences inside the file", b)
#calculating average words per sentence
aws = a / b
print("This will give me average words per sentence", aws)
#**********************************************************************
words_displayed = newcorpus.words()
#This function will tell me the frequency distribution of each word in the text file
fre_dis = nltk.FreqDist(words_displayed)
#Let us plot each word and their frequency distribution using plot function.
fre_dis.plot(title="Frequency Distribution")
Example #19
import os
import datetime
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import floresta,mac_morpho
from parser_portuguese_risk import evaluateModel, splitTrainTestModel, simplify_tag
time1 = datetime.datetime.now()

###############################################################################
### ATTENTION: if there are temp files like .DS_Store in Mac OS X, we must remove them ###

# Reading corpus
corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/glossAnnotated/' # Directory of corpus.
#corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/test1/' # Directory of corpus.   
risco = PlaintextCorpusReader(corpusdir, '.*')
risco.fileids()

raw_text = risco.raw('gloss533.txt')
#print raw_text[0:]

# Some statistics

print 'Number of terms: ', len(risco.words())
print 'Number of unique terms: ', len(set(risco.words()))

fd = nltk.FreqDist(risco.words())
print fd.freq('bem')
print fd['bem']

# presenting ngrams of the term
target_word = 'bem como'
fd = nltk.FreqDist(ng
              for ng in nltk.ngrams(risco.words(), 6)
              # assumed completion - the source snippet is cut off here:
              if target_word in " ".join(ng))
Example #20
import os
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

## create the corpus of 1965 songs from html files
corpusdir = '../../data/billboard_data/1960/billboard_1965/'
bb_1965 = PlaintextCorpusReader(corpusdir, '.*')

## get the raw text from specific songs/files
help = bb_1965.raw('help.html')
desolation_row = bb_1965.raw('desolation_row.html')

## clean the raw text to remove the p tags
clean_help = nltk.clean_html(help)
clean_desolation = nltk.clean_html(desolation_row)

# word tokenize
tokens_help = nltk.word_tokenize(clean_help)
tokens_desolation = nltk.word_tokenize(clean_desolation)

# point of speech tagging
tags_help = nltk.pos_tag(tokens_help)
tags_desolation = nltk.pos_tag(tokens_desolation)

tokenizer = RegexpTokenizer(r'\w+')

## print the unique, sorted pos tags
for item in sorted(set(tags_help)):
	print 'help tags: ', item
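Note that nltk.clean_html() was removed in NLTK 3 and now raises NotImplementedError, so on current installs the tag stripping above needs an HTML parser instead; a minimal equivalent with BeautifulSoup (assuming the bs4 package is available):

from bs4 import BeautifulSoup

# Equivalent tag stripping for NLTK 3+, where nltk.clean_html() no longer works.
clean_help = BeautifulSoup(help, "html.parser").get_text()
clean_desolation = BeautifulSoup(desolation_row, "html.parser").get_text()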
Example #21
#         text_list.append(text)

# preprocessed_docs = []
# for n,t in enumerate(text_list):
#     # print sample of text before and after processing
#     #if n == (len(text_list) - 1):
#     #    print(("Doc {} (before preproc): {}").format(n, t))
#     #    print(("Doc {}: {}").format(n, p))
#     p = preprocess(t)
#     preprocessed_docs.append(p)

# print("Preprocessed docs len:", len(text_list))

texts = PlaintextCorpusReader(d, ".*\.txt")

boc_texts = [extract(texts.raw(fileid)) for fileid in texts.fileids()]

dictionary = gensim.corpora.Dictionary(boc_texts)
#dictionary = gensim.corpora.Dictionary(preprocessed_docs)
#dictionary.filter_extremes(no_below=10,no_above=.5,keep_n=100000)
#bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]
bow_corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

fileids = texts.fileids()

for idx, doc in enumerate(corpus_tfidf):
    new_file.write("Document '{}' key phrases:\n".format(fileids[idx]))
    # Get top 100 terms by TF-IDF score
    for wid, score in heapq.nlargest(100, doc, key=itemgetter(1)):
        # assumed loop body - the source snippet is cut off here
        new_file.write("{}: {}\n".format(dictionary[wid], score))
Example #22
#make a new corpus
corpusdir = 'communications/small_test_batch' #where the files are
newcorpus = PlaintextCorpusReader(corpusdir, '.*')

fileids = newcorpus.fileids() #list of fileids
j = len(fileids) #number of docs

words_list = [] #['doc', '1', 'words', 'doc', '2', 'words',...]
doc_breaks = [0] #ith entry = index of first word in doc i in words_list
keywords = set() #{'doc', '1', 'words', '2',...}

tokenizer = RegexpTokenizer('\w+') #pick out alphanumeric sequences; discard punctuation, white space

#create set of keywords and list of file texts
for id in fileids:
    raw = newcorpus.raw(id)
    raw2 = ''.join([i if ord(i)<128 else '' for i in raw]) #remove unicode characters
    raw3 = raw2.encode('ascii')
    file_words = map(str.lower,tokenizer.tokenize(raw3)) #list of cleaned words: lower-case, no punct, no whitespace
    words_list = words_list + file_words
    doc_breaks = doc_breaks + [len(file_words)+doc_breaks[len(doc_breaks)-1]]
    
doc_breaks = doc_breaks + [len(words_list)]
keywords = set(words_list)
print 'Number of keywords: ' + str(len(keywords))
print 'Number of total words: ' + str(len(words_list))

red_keywords = set() #reduced set of keywords; try to remove too common words to save matrix computation later
cutoff = 3*j

sorted_words_list = sorted(words_list)
Example #23
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

download('punkt')
download('stopwords')

corpusdir = './txts' # Directory of corpus.
all_files = PlaintextCorpusReader(corpusdir, '.*')
fileids = all_files.fileids()
print fileids
print len(fileids)
texts = []
fileindex = []
i = 0
for fileid in fileids:
	texts.append(all_files.raw(fileids=fileid))
	fileindex.append(fileid)
	i += 1

stop_words = stopwords.words('english')

def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()]
    return doc
texts_og = texts
corpus = [preprocess(text) for text in texts]

number_of_docs = len(corpus)
Example #24
import os
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

## create the corpus of 1965 songs from html files
corpusdir = 'text/'
bb_1965 = PlaintextCorpusReader(corpusdir, '.*')

## get the raw text from specific songs/files
beatles_help = bb_1965.raw('help.html')
desolation_row = bb_1965.raw('desolation_row.html')

## clean the raw text to remove the p tags
clean_help = nltk.clean_html(beatles_help)
clean_desolation = nltk.clean_html(desolation_row)

# word tokenize
tokens_help = nltk.word_tokenize(clean_help)
tokens_desolation = nltk.word_tokenize(clean_desolation)

# point of speech tagging
tags_help = nltk.pos_tag(tokens_help)
tags_desolation = nltk.pos_tag(tokens_desolation)

tokenizer = RegexpTokenizer(r'\w+')

help_tags = [item for item in sorted(set(tags_help))]
desolation_tags = [item for item in sorted(set(tags_desolation))]

print 'help_tags: ', help_tags
Example #25
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

#Loading the file you want to Train
corpusdir = r'E:\MTech' # Directory of corpus.

newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids())
dictlist=[]

#Converting from word/tag pairs to a list of two-tuples, i.e. [(word1, tag), (word2, tag)]
for i in newcorpus.fileids():
    tagged_sent=newcorpus.raw(i)
    tagged=tagged_sent.split()
    for t in tagged:
        temp1=nltk.tag.str2tuple(t)
        dictlist.append(temp1)
print(dictlist)
print("This is the length of distinct words")
print(len(set(dictlist)))
fdist=nltk.FreqDist(dictlist)
print("fdist items")
print(fdist.items())
print(fdist.max())




rawtext = '''
 '''
Example #26
# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile  # The fileids of each file.
    with newcorpus.open(infile) as fin:  # Opens the file.
        print fin.read().strip()  # Prints the content of the file
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])
Example #27
textsfile = PlaintextCorpusReader(corpus_directory, '.*')
ID_files = textsfile.fileids()
print(ID_files, len(ID_files))

##############################  Preprossesing Data  ######################################

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
Index_of_files = []
texts = []
count = 0

#file with file_ids
for fileid in ID_files:
    texts.append(textsfile.raw(fileids=fileid))
    Index_of_files.append(fileid)
    count += 1


def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return WordNetLemmatizer().lemmatize(word)
    else:
        return WordNetLemmatizer().lemmatize(lemma)


def clean_preprocessing(text):
    text = text.lower()
    doc = word_tokenize(text)
Example #28
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from nltk.stem import WordNetLemmatizer

corpusdir = 'train_data/'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')

#print newcorpus.raw().strip()

all_contents = newcorpus.raw().strip()

#print(all_contents)

tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(all_contents)
lowered_words = []
for w in words:
    lowered_words.append(w.lower())
stopwords = set(stopwords.words("english"))
filtered_contents = [w for w in lowered_words if not w in stopwords]
lemmatized_contents = []
lemmatizer = WordNetLemmatizer()
for w in filtered_contents:
    lemmatized_contents.append(lemmatizer.lemmatize(w))

#most_common_words = [word for word,word_count in Counter(lemmatized_contents).most_common(20)]
#bag_of_words = most_common_words
Example #29
from wordcloud import STOPWORDS
_stop_words = set(STOPWORDS)

stop_words = set(stopwords.words('english'))
stop_words.update(_stop_words, ('thing', 'u', 'us', 'nt'))
lemmatizer = WordNetLemmatizer()

# Read .txt files from ./docs directory into a corpus
corpus = PlaintextCorpusReader('./docs/', ".*\.txt")

# filter list of words to remove unneeded ones and punctuation
# losing U.S., which is not ideal; tried splitting sentences on spaces and preserving dots just for it

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
tokenized = tokenizer.tokenize(corpus.raw())

# drop punctuation
non_punct = list(
    filter(lambda token: nltk.tokenize.punkt.PunktToken(token).is_non_punct,
           tokenized))

# lowercase everything
lowercased = [word.lower() for word in non_punct]

# filter stop words
filtered = list(filter(lambda token: token not in stop_words, lowercased))

# lemmatize to get root of words
token_list = [lemmatizer.lemmatize(word) for word in filtered]
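The comment above notes that "U.S." gets lost during tokenization; one possible workaround (not part of the original snippet) is a RegexpTokenizer pattern that keeps dotted acronyms together:

from nltk.tokenize import RegexpTokenizer

# Match dotted acronyms such as "U.S." as single tokens, otherwise runs of word characters.
acronym_tokenizer = RegexpTokenizer(r'(?:[A-Z]\.)+|\w+')
print(acronym_tokenizer.tokenize("The U.S. economy grew."))
# ['The', 'U.S.', 'economy', 'grew']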
Example #30
class TextAnalizer:
    def __init__(self, my_input_file):
        self.config = configparser.ConfigParser()
        self.config.read("text_analysis.cfg")
        self.input_file = my_input_file
        self.nlp_model = self.config["DEFAULT"]["nlp_model"]
        #The output file name
        self.output_file = self.config["DEFAULT"]["output_file"]
        self.nlp = load_nlp(self.nlp_model)
        self.corpus = CorpusReader(".", self.input_file)
        self.raw_text = self.corpus.raw()
        self.nlp_text = self.nlp(self.raw_text)
        # Here, lets put together the infos for text analysis with spacy.
        self.analysis_dictionary = Counter()
        self.word_count = 0
        self.get_word_count_nltk()

    def get_paragraph(self):
        return self.corpus.paras()

    def get_sentence(self):
        return self.corpus.sents()

    def get_word(self):
        return self.corpus.words()

    def get_word_count_nltk(self):
        tokenizer = Tokenizer(r'\w+')
        counts = Counter()
        sentences = self.get_sentence()
        for sentence in sentences:
            tokens = tokenizer.tokenize(" ".join(sentence))
            self.word_count = self.word_count + len(tokens)
            filtered = [w for w in sentence if w.isalnum()]
            counts = counts + Counter(filtered)
        return counts, self.word_count

    def analize_nlp(self):
        analized_data_str = (self.config["ANALIZED"]["POS"])
        analized_data = (analized_data_str.split(","))
        result_dict = {}
        diff_str, tot_str = (
            self.config["DEFAULT"]["diff_tot_string"]).split(",")
        lemma_counter = Counter()
        pos_counter = Counter()
        tag_counter = Counter()

        for token in self.nlp_text:
            lemma_counter = lemma_counter + Counter([token.lemma_])
            pos_counter = pos_counter + Counter([token.pos_])
            tag_counter = tag_counter + Counter([token.tag_])
            my_key = token.lemma_ + "_" + token.tag_ + "_" + token.pos_
            self.analysis_dictionary[my_key] += 1
        for pos in analized_data:
            instance_counter = 0
            total_counter = 0
            for key in self.analysis_dictionary.keys():
                try:
                    my_lemma, my_tag, my_pos = key.split("_")
                except ValueError:
                    print("Warning: Array has a empty line")  # add logging
                if pos == my_pos:
                    instance_counter += 1
                    total_counter = total_counter + self.analysis_dictionary.get(
                        key)
            result_dict[pos + diff_str] = instance_counter
            result_dict[pos + tot_str] = total_counter
        #add the stuff from nltk
        diff_word, word_count = self.get_word_count_nltk()
        result_dict["WORDS" + tot_str] = word_count
        result_dict["WORDS" + diff_str] = len(diff_word)
        result_dict["PARAGRAPHS"] = len(self.get_paragraph())
        result_dict["SENTENCES"] = len(self.get_sentence())

        return result_dict

    def write_output(self):
        with open(self.output_file, "w+") as f:
            f.write("Number of paragraphes: " +
                    str(len(self.get_paragraph())) + "\n")
            f.write("Number of sentences: " + str(len(self.get_sentence())) +
                    "\n")
            f.write("Number of words: " + str(self.word_count) + "\n")
            f.write("Average words per sentence: " +
                    str(round(self.word_count / len(self.get_sentence()), 2)) +
                    "\n")
            f.write("Number of different words: " +
                    str(len(self.get_word_count_nltk())) + "\n")
            f.write("Text variety (different words/total words: " + str(
                round(len(self.get_word_count_nltk()) / self.word_count, 2)) +
                    "\n")
            f.close()
Example #31
import string
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import scipy.stats as stats

names = []
corpus = []

co = PlaintextCorpusReader("./election", ".*\.txt")

for fileids in co.fileids():
    names.append(fileids)
    corpus.append(co.raw(fileids))

print len(names), 'documents in the corpus'
print names[:30]

for idx in range(len(corpus) - 1, -1, -1):
    print
    print names[idx]
    print corpus[idx][:70].replace('\n', ' ')

vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
dtm = vectorizer.fit_transform(corpus)
print dtm.shape
vocab = vectorizer.get_feature_names()  # list of unique vocab, we will use this later
print len(vocab), '# of unique words'
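The unused imports above (ward, dendrogram, MDS) suggest the original example went on to cluster the documents; a minimal sketch of that step, not taken from the original source, reusing dtm and names from above:

# pairwise cosine distances between documents
dist = 1 - cosine_similarity(dtm)

# Ward hierarchical clustering with a dendrogram labelled by document name
linkage_matrix = ward(dist)
dendrogram(linkage_matrix, labels=names)
plt.tight_layout()
plt.show()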