Example #1
def create_vocabularies():
    poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*',
        cat_file='cats.txt')

    for emotion in base_emotions:
        words = poem_corpus.words(categories=[emotion])
        words = [w.lower() for w in words if w.isalpha() and w not in stopwords.words('english')]
        fdist = nltk.FreqDist(words)
        vocabulary = [w for w, _ in fdist.most_common(200)]

        vocab_file = open('./opinion-lexicon-English/%s-words.txt' % emotion, 'w')
        vocab_file.write('\n'.join(vocabulary))
        vocab_file.close()
Example #2
    def __init__(self, dir, doc):
        self.doc = doc
        self.dir = dir
        self.eng_stopw = stopwords.words('english')


        text_corpus = CategorizedPlaintextCorpusReader(
            './%s/' % self.dir,
            r'.*\.csv',  # only read files that end in .csv
            cat_pattern=r'(\w+)/*',  # the leading subdirectory name becomes the category
            encoding='latin-1'
        )

        self.text = nltk.Text(text_corpus.words(self.doc))
Example #3
 def __init__(self, raiz_corpus):
     """Cria um objeto do tipo 'CategorizedPlaintextCorpusReader',
     utilizando o diretório raiz do corpus, onde os documentos
     estão localizados, dispostos em seus respectivos subdiretórios,
     de acordo com sua categoria, sejam eles/elas quais for
     
     -->     raiz_corpus/{pos,neg,neu,...}.
     """
     reload(sys)
     sys.setdefaultencoding("utf-8")
     
     self._raiz_corpus = raiz_corpus
     self._corpus = CategorizedPlaintextCorpusReader(raiz_corpus, r'.+\.txt', cat_pattern=r'(\w+)/*',
                                                     encoding='utf-8')
     self._documentos = None
     self._palavras_frequentes = None
     self._todas_palavras = None
     self._featuresets = None
     self._train_set = None
     self._test_set = None
Example #4
    def __create_corpus(self, language, chars):
        """Create a categorized nltk.corpus from data/* where the subfolders are the different categories.

        :chars: list of characters removed, in addition to stopwords, before the statistical analysis
        :language: the newspaper language as a string
        :returns: nltk.corpus, dict mapping each category to its list of normalized words

        """

        # Create corpus from data directory
        news_corpus = CategorizedPlaintextCorpusReader('data/', r'.*\.txt', cat_pattern=r'(\w+)/*')

        # Get all german stopwords and addition chars for removal
        g_stop = stopwords.words(language)
        g_stop.extend(chars)

        # Stemmer
        snow = nlp.stem.SnowballStemmer(language, ignore_stopwords=True)

        # Dict of all words/category
        cat = news_corpus.categories()
        total_words = {}

        for news in cat:
            #Get the words
            words = news_corpus.words(categories=news)

            # Lowercase and remove stopwords (plus the extra chars)
            words = [w.lower() for w in words if w.lower() not in g_stop]

            # Stem all tokens
            words = [snow.stem(w) for w in words]

            total_words.update({news: words})

        return news_corpus, total_words
Example #5
    def init_documents(f_re, cat_re):
        logging.debug("Reading corpus")
        reports = CategorizedPlaintextCorpusReader(corpus_dir,
                                                   f_re,
                                                   cat_pattern=cat_re,
                                                   encoding='utf8')
        logging.debug("Found {} fileids".format(len(reports.fileids())))
        logging.debug("Found categories: {}".format(reports.categories()))
        logging.debug("Building docs")

        documents = [
            (tokenize(reports.words(i)), reports.categories(i)[0])
              for i in reports.fileids()]
        return documents
Example #6
def nltk():
    #### FOR TRAINING DATA ####
    stop = stopwords.words('spanish')

    # Reads the training data.
    traindir = '/Users/ruben/Desktop/Formularios_clasificados/training'
    mr = CategorizedPlaintextCorpusReader(traindir, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*', encoding='utf-8')

    # Converts training data into tuples of [(words,label), ...]
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w not in string.punctuation], i.split('/')[0]) for i
                 in mr.fileids()]
    # Extract training features.
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]
    # Assuming that you're using full data set
    # since your test set is different.
    train_set = [({i: (i in tokens) for i in word_features}, tag) for tokens, tag in documents]

    #### TRAINS THE TAGGER ####
    # Train the tagger
    classifier = NaiveBayesClassifier.train(train_set)

    #### FOR TESTING DATA ####
    # Now do the same reading and processing for the testing data.
    testdir = '/Users/ruben/Desktop/Formularios_clasificados/testing'
    mr_test = CategorizedPlaintextCorpusReader(testdir, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*', encoding='utf-8')
    # Converts testing data into tuples of [(words,label), ...]
    test_documents = [
        ([w for w in mr_test.words(i) if w.lower() not in stop and w not in string.punctuation], i.split('/')[0]) for i in
        mr_test.fileids()]
    # Reads test data into features:
    test_set = [({i: (i in tokens) for i in word_features}, tag) for tokens, tag in test_documents]

    correct = 0
    wrong = 0
    #### Evaluate the classifier ####
    for doc, gold_label in test_set:
        tagged_label = classifier.classify(doc)
        if tagged_label == gold_label:
            correct += 1
        else:
            wrong += 1

    print(correct, wrong, float(correct) / (correct + wrong))
def construct_model(copusPath, modelPath):
    mr = CategorizedPlaintextCorpusReader(copusPath, r'(?!\.).*\.txt',
                                           cat_pattern=r'(\w+)/.*', encoding='iso-8859-1')
    stop = stopwords.words('french')
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation],
                   i.split('/')[0]) for i in mr.fileids()]
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = list(word_features.keys())
    numtrain = int(len(documents) * 100 / 100)
    train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens, tag in documents[:numtrain]]
    """test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens, tag  in documents[numtrain:]]"""
    classifier = nbc.train(train_set)
    mrtest = CategorizedPlaintextCorpusReader(os.path.abspath("corpus_test"), r'(?!\.).*\.txt', cat_pattern=r'(\w+)/.*', encoding='iso-8859-1')
    documentsTest = [([w for w in mrtest.words(i) if w.lower() not in stop and w.lower() 
                   not in string.punctuation],
                   i.split('/')[0]) for i in mrtest.fileids()]
    word_features_test = FreqDist(chain(*[i for i, j in documentsTest]))
    word_features_test = list(word_features_test.keys())
    numtrain_test = int(len(documentsTest) * 100 / 100)
    test_set = [({i:(i in tokens) for i in word_features_test}, tag) for tokens, tag  in documentsTest[:numtrain_test]]
    save_classifier(classifier, modelPath)
Example #8
def display_features(num_features=1000,
                     show_features=200,
                     filepath='classifiers/nltk_nb.pkl',
                     verbose=True):
    '''
    Displays informative features from NHLCorpus
    '''
    stop_words = set(stopwords.words('english'))
    nhl = CategorizedPlaintextCorpusReader(root='data/NHLcorpus/',
                                           fileids=r'.*\.txt',
                                           cat_pattern=r'(\w+)/*')
    documents = []
    for category in nhl.categories():
        for fileid in nhl.fileids(category):
            documents.append(([
                re.sub(r'\W+', '', w.lower()) for w in nhl.words(fileid)
                if w.lower() not in stop_words
            ], category))
    all_words = nltk.FreqDist(
        re.sub(r'\W+', '', w.lower()) for w in nhl.words()
        if w.lower() not in stop_words)
    word_features = [w[0] for w in all_words.most_common(num_features)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = word in document_words
        return features

    featuresets = [(document_features(d), c) for (d, c) in documents]
    nb_clf = nltk.NaiveBayesClassifier.train(featuresets)
    if verbose:
        nb_clf.show_most_informative_features(show_features)
        print('Accuracy on training data: {}'.format(
            nltk.classify.accuracy(nb_clf, featuresets)))

    save_classifier = open(filepath, 'wb')
    pickle.dump(nb_clf, save_classifier)
    save_classifier.close()
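A short companion sketch (not part of the original example): loading the pickled
classifier back, assuming the same 'classifiers/nltk_nb.pkl' path used above.

import pickle

with open('classifiers/nltk_nb.pkl', 'rb') as f:
    nb_clf = pickle.load(f)
nb_clf.show_most_informative_features(10)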
print(positive_greg)
print(negative_greg)

positive_consolidated_list = list(pos_list) + positive_greg
negative_consolidated_list = list(neg_list) + negative_greg
print(positive_consolidated_list)
print(negative_consolidated_list)

init_notebook_mode(connected=True)
cf.set_config_file(offline=True, world_readable=True, theme='ggplot')

#%%

corpus_root = "/Users/LENOVO USER/Desktop/FedTranscript1"
data_m = CategorizedPlaintextCorpusReader(corpus_root,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin1')
data_fileids = data_m.fileids()


#%%
def corpus_Stats(crp):
    print('Total number of files: ' + str(len(crp.fileids())))
    print('Number of paragraphs: ' + str(len(crp.paras())))
    print('Number of sentences: ' + str(len(crp.sents())))
    print('Number of words: ' + str(len(crp.words())))


#corpus_Stats(data_m)
#print('\n'+'First file: '+ data_fileids[0])
#print('Last file: '+ data_fileids[-1])
Example #10
'''
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

# working dir: UN/
mydir = 'corpus/unscrs_renamed_categorized'

mr = CategorizedPlaintextCorpusReader(
    mydir,
    r'(?!\.).*\.txt',
    cat_pattern=r'(intervention|soft_action)/.*',
    encoding='utf-8')
stop = stopwords.words('english')
documents = [([
    w for w in mr.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens)
               for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
Example #11
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'

mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*', encoding='ascii')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i,j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[:numtrain]]
test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag  in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
Example #12
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.corpus import brown

# Open the documents under the given path
# Arguments
# 1. absolute path to the documents
# 2. type / extension of the documents (*.txt)
# 3. pattern for the folders that will form the categories
# all arguments are regular expressions

leitor = CategorizedPlaintextCorpusReader(
    '../Dados/mix20_rand700_tokens_cleaned/tokens/',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

# Check what was loaded
print(leitor.categories())
print(leitor.fileids())

# Split the corpus according to its categories
posFiles = leitor.fileids(categories='pos')
negFiles = leitor.fileids(categories='neg')
print('pos files:', posFiles)
print('neg files:', negFiles)

# Load the first files from each category
arqP = posFiles[0]
arqN = negFiles[1]

print("ArqP: ", arqP)
Example #13
class CorpusUtil(object):
    """Documentar
    """
    def __init__(self, raiz_corpus):
        """Cria um objeto do tipo 'CategorizedPlaintextCorpusReader',
        utilizando o diretório raiz do corpus, onde os documentos
        estão localizados, dispostos em seus respectivos subdiretórios,
        de acordo com sua categoria, sejam eles/elas quais for
        
        -->     raiz_corpus/{pos,neg,neu,...}.
        """
        reload(sys)
        sys.setdefaultencoding("utf-8")
        
        self._raiz_corpus = raiz_corpus
        self._corpus = CategorizedPlaintextCorpusReader(raiz_corpus, r'.+\.txt', cat_pattern=r'(\w+)/*',
                                                        encoding='utf-8')
        self._documentos = None
        self._palavras_frequentes = None
        self._todas_palavras = None
        self._featuresets = None
        self._train_set = None
        self._test_set = None

    def get_documentos(self):
        """Construimos uma lista de documentos, rotulados com as
        categorias apropriadas. Cada documento é representado por
        uma tupla na estrutura abaixo:
        
        (conteudo_do_documento, categoria)
        
        Retorna essa lista com todos os documentos do corpus.
        """
        """
        documentos = [(self.corpus.words(fileid), categoria)
                       for categoria in self.corpus.categories()
                       for fileid in self.corpus.fileids(categoria)]
        """
        print "-- Recuperando documentos do corpus."

        if self._documentos is None:            
            self._documentos = [Documento(" ".join(self._corpus.words(fileid)), categoria, self, fileid)
                                for categoria in self._corpus.categories()
                                for fileid in self._corpus.fileids(categoria)]

        # Shuffle the documents
        for i in range(0, 10):
            shuffle(self._documentos)

        return self._documentos

    def get_palavras_frequentes(self):
        """Documentar.
        """
        if self._palavras_frequentes is None:

            print "-- Verificando as palavras mais frequentes do corpus."

            # Teste - retorna apenas as 2000 palavras mais frequentes do corpus
            todas_palavras = [word.lower() for word in self._corpus.words()]
            freq_dist_palavras = FreqDist(todas_palavras)
            frequencia_palavras = freq_dist_palavras.most_common(2000)  # 2000 palavras mais frequentes
            
            self._palavras_frequentes = [palavra for palavra, frequencia in frequencia_palavras]
            
            # all_words = FreqDist(word.lower() for word in self.corpus.words())
            # self.word_features = list(all_words)[:2000]
        return self._palavras_frequentes

    def get_todas_palavras(self):
        if self._todas_palavras is None:
            print "-- Recuperando todas as palavras do corpus."
            self._todas_palavras = [word.lower() for word in self._corpus.words()]
            self._todas_palavras = set(self._todas_palavras)

        return self._todas_palavras

    def get_featuresets(self):
        """Configura os featuresets que são construídos na
        seguinte estrutura:
            (features_do_documento, categoria)
        
        Retorna uma lista de featuresets
        """
        if self._featuresets is None:
            
            if self._documentos is None:
                self.get_documentos()

            print "-- Recuperando featuresets."

            self._featuresets = apply_features(Documento.get_features, self._documentos)
        
        return self._featuresets

    def get_train_set(self):
        """Documentar
        """
        if self._featuresets is None:
            self.get_featuresets()

        print "-- Recuperando train_set."

        # To avoid filling all of the RAM, do not keep every
        # document's features in memory at once.
        # self._train_set = apply_features(Documento.get_features, self._documentos[100:])
        self._train_set = apply_features(Documento.get_features, self._documentos)

        return self._train_set

    def get_test_set(self):
        if self._featuresets is None:
            self.get_featuresets()

        print "-- Recuperando test_set."

        # self._test_set = apply_features(Documento.get_features, self._documentos[:100])

        return self._test_set

    def gravar_palavras_frequentes(self):
        diretorio_destino = "/home/lucas/Documents/mineracao_opiniao/palavras_frequentes_corpus"
        molde_nome_arquivo = "palavras_frequentes_%s.pickle"

        tempo_agora = str(datetime.now())
        # Replace ':' and whitespace with '.'
        tempo_agora = re.sub(ur':|\s', '.', tempo_agora)
        nome_arquivo = molde_nome_arquivo % tempo_agora

        if self._palavras_frequentes is None:
            self.get_palavras_frequentes()

        f = open(diretorio_destino + "/" + nome_arquivo, 'wb')
        pickle.dump(self._palavras_frequentes, f)
        f.close()

        return True

    @staticmethod
    def abrir_arquivo_palavras_frequentes(arquivo_path):
        f = open(arquivo_path, 'rb')
        palavras_frequentes = pickle.load(f)
        f.close()

        return palavras_frequentes
Example #14
negative_words = [word.strip() for word in neg_file.readlines() if not \
        word.startswith(';')]
pos_file.close()
neg_file.close()


# Words for all emotions
lexicon = {}
for emotion in base_emotions:
    f = open('./opinion-lexicon-English/%s-words.txt' % emotion, 'r')
    words = [word.strip() for word in f.readlines()]
    lexicon[emotion] = words
    f.close()
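
# poem_features() is used further below but is not part of this excerpt; a plausible
# sketch (an assumption, not necessarily the author's implementation) counting how
# many words from each emotion lexicon appear in a poem:
def poem_features(poem_words):
    features = {}
    for emotion, emotion_words in lexicon.items():
        features['%s_count' % emotion] = sum(1 for w in poem_words if w.lower() in emotion_words)
    return features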

# Make a classifier based on the feature sets of the poems
poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*',
        cat_file='cats.txt')

poem_set = [(fileid, category) for fileid in poem_corpus.fileids() \
        for category in poem_corpus.categories(fileid)]
random.shuffle(poem_set)

feature_set = [(poem_features(poem_corpus.words(fileids=[fileid])),
        category) for (fileid, category) in poem_set]

train_set, test_set = feature_set[2000:], feature_set[:2000]

# Initialize the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# For improving the algorithm
classifier.show_most_informative_features(20)
Example #15
This code uses the meeting records (inputs) corpus.
'''
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

# working dir: UN/
mydir = 'corpus/meeting_records_final_categorized'

mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(intervention|soft_action)/.*', encoding='utf-8')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i,j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[:numtrain]]
test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag  in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # .87 - ?!?!?!
classifier.show_most_informative_features(20)

# for word_features.keys()[:100]
def loadCorpus(category = None) :

    corpus_root = "../corpus/lyric_corpus/files/"
    cat_root = "../categories/"

    if not os.name == 'posix':
        corpus_root = "..\\corpus\\lyric_corpus\\files\\"
    # load the corpus

    # corpus = PlaintextCorpusReader(corpus_root, '.*\.txt')
    corpus = CategorizedPlaintextCorpusReader(corpus_root, '.*\.txt', cat_file=cat_root+'cat.txt', cat_delimiter='+')
    # print files in corpus
    # for file in corpus.fileids():
    # print(file)
    # access corpus

    raw = corpus.raw()
    words = corpus.words()
    # print (category)
    if(category == None):
        sents = corpus.sents()
    else:
        sents = corpus.sents(categories = category)
    # sents_pop = corpus.sents(categories="POP")
    # sents_rock = corpus.sents(categories="ROCK")

    shuffledSents = shuffleSent(sents)


    numberSents = len(shuffledSents)
    trainSize = math.floor(numberSents*0.8)
    testSize = len(shuffledSents) - trainSize
    # testSize = math.floor(numberSents*0.1)
    # devSize = len(shuffledSents)-trainSize - testSize

    trainCorpus = []
    testCorpus = []
    # devCorpus = []
    wholeCorpus = []
    testSents = []

    for i in range(numberSents):
        if(i < trainSize):
            for word in shuffledSents[i]:
                trainCorpus.append(word)
                wholeCorpus.append(word)
        # elif(i < (trainSize + testSize)):
        #     for word in shuffledSents[i]:
        #         testCorpus.append(word)
        #         wholeCorpus.append(word)
        else:
            testSents.append(shuffledSents[i])
            for word in shuffledSents[i]:
                testCorpus.append(word)
                wholeCorpus.append(word)



    # testCorpus = []
    # trainCorpus = list(words)
    # for i in range(testSize):
    #     seed = random.randrange(0,numberSents - i)
    #     testCorpus.append(trainCorpus.pop(seed))

    return wholeCorpus, trainCorpus, testCorpus, testSents
Example #17
        os.chdir(directory)
        file = open(fname, 'w')
        file.write(text)
        file.close()


doc_start = {}
doc_start[0] = "Staff Review of the Economic Situation"
doc_start[1] = re.compile('The information (reviewed|received|provided)')
doc_start[
    2] = "The Committee then turned to a discussion of the economic outlook"
doc_start[3] = re.compile('The information  (reviewed|received|provided)')

doc_end = {}
doc_end[0] = re.compile(
    '(At the conclusion of) (this|the) (discussion|meeting)')
doc_end[1] = re.compile('(?i)The Committee voted to authorize')
doc_end[2] = re.compile('(?i)The vote encompassed approval of')
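
# crop_text() is called in the __main__ block below but is not part of this excerpt.
# A minimal sketch of what such a helper might look like, assuming it keeps the text
# between the first matching start marker and the first end marker found after it
# (a hypothetical reconstruction, not the author's implementation):
def crop_text(text, doc_start, doc_end):
    start = 0
    for marker in doc_start.values():
        if isinstance(marker, str):
            pos = text.find(marker)
        else:  # compiled regular expression
            match = marker.search(text)
            pos = match.start() if match else -1
        if pos != -1:
            start = pos
            break
    end = len(text)
    for marker in doc_end.values():
        match = marker.search(text, start)
        if match:
            end = match.end()
            break
    return text[start:end]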

if __name__ == '__main__':
    corpus_root = '/Users/aaroncgw/Google Drive/fednlp/data/minutes/'
    data_m = CategorizedPlaintextCorpusReader(corpus_root,
                                              r'.*\.txt',
                                              cat_pattern=r'(\w+)/*')
    data_fileids = data_m.fileids()

    for f in data_fileids:
        year, fname = f.split('/')
        cropped_text = crop_text(data_m.raw(f), doc_start, doc_end)
        saveFile(fname, year, cropped_text)
def create_corpus():
    poem_corpus = CategorizedPlaintextCorpusReader('../poems/',
                                                   'poems_.*',
                                                   cat_file='cats.txt')
Example #19
import time
start = time.time()
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import CategorizedPlaintextCorpusReader
from sklearn.cluster import KMeans
import numpy as np
import copy
import math
import re

corpus_root = 'C:\\MyData\\PythonPractice\\IMDB\\test'  #Path of IMDB Test Data
reader = CategorizedPlaintextCorpusReader(corpus_root,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*')

r_neg = reader.fileids(categories=['neg'])
r_pos = reader.fileids(categories=['pos'])

global_shortlisted = []
TEST_GS_POS = []

for i in range(0, 12500):

    doc = reader.raw(r_pos[i:i + 1])  #doc contains the movie review
    sentences = nltk.sent_tokenize(doc)
    senlen = len(sentences)

    def decontracted(phrase):
        # specific
        phrase = re.sub(r"won't", "will not", phrase)
Example #20
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'/home/smadyastha/Projects/PythonCheck/Dataset/Reviews/tokens',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + "")
    if (w is '.'):
        print()

# /home/smadyastha/Projects/PythonCheck/Dataset/Reviews
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk import bigrams
from nltk import trigrams
from nltk.collocations import *
import nltk

corpus_root = "../corpus/lyric_corpus/files/"
cat_root = "../categories/"

# Hacky way to specify path for cat.txt. A better way would be to rewrite regex '.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, '.*\.txt', cat_file=cat_root+'cat.txt', cat_delimiter='+')
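
# Alternative to the cat_file approach above (illustrative sketch only, assuming the
# lyric files were instead laid out in per-category subfolders such as files/POP/...):
# corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt', cat_pattern=r'(\w+)/.*')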

# word lists
word_list_pop = list(corpus.words(categories="POP"))
word_list_rock = list(corpus.words(categories="ROCK"))

# bigram lists
bigram_list_pop = list(bigrams(word_list_pop))
bigram_list_rock = list(bigrams(word_list_rock))

# trigram lists
trigram_list_pop = list(trigrams(word_list_pop))
trigram_list_rock = list(trigrams(word_list_rock))

# measures
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# finders default window size is 2
bi_finder_pop = BigramCollocationFinder.from_words(word_list_pop)
bi_finder_rock = BigramCollocationFinder.from_words(word_list_rock)
Example #22
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf


# Provide path to the custom corpora

mydir = '/Users/vasilis/Desktop/Lennon/lyrics_custom_corpus'

# Read data from our custom corpora

mr = CategorizedPlaintextCorpusReader(mydir,
                                      r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*')

# Clean lyrics from the English stop words.
stop = stopwords.words('english')

documents = [(list(mr.words(fileid)), category)
             for category in mr.categories()
             for fileid in mr.fileids(category)]

classifiers_dir = '/Users/vasilis/vxm773/Lennon/pickled_classifiers'

if os.path.exists(classifiers_dir):
    shutil.rmtree(classifiers_dir)
os.makedirs(classifiers_dir)
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'D:\LEARNING\MISC\DataSet\movieCorpus\review_polarity\txt_sentoken',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
def transform(corpus: CategorizedPlaintextCorpusReader, target_root_dir):
    if not os.path.exists(target_root_dir):
        os.makedirs(target_root_dir)
    open(target_root_dir + "\\meta.info", 'w').write("tagged\nmarks.txt")
    for fileid in corpus.fileids():
        yield process(corpus, target_root_dir, fileid)
Example #25
from nltk.corpus import CategorizedPlaintextCorpusReader
import ProcessText

d1 = "judge people by what they say"

d1_processed = ProcessText.ProcessText.process(d1)

documents = [d1]

#Read documents
reader = CategorizedPlaintextCorpusReader(
    r'\Users\JoeDi\Desktop\MSC\Idioms Corpera',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

for w in reader.fileids():
    wd = reader.raw(w)
    documents.append(w + " " + wd)

print("Documents in the collection are: ")
print(documents)
print("\n")

from sklearn.feature_extraction.text import TfidfVectorizer

#build a TF/IDF matrix for each description
tfidf = TfidfVectorizer().fit_transform(documents)

print("Tf-idf weightings are:  ")
print(tfidf)
print("\n")
def read_corpus(root_dir):
    return CategorizedPlaintextCorpusReader(root_dir,
                                            FILE_PATTERN,
                                            cat_pattern=CAT_PATTERN)
Example #27
sjar = '/Users/nischikata/PycharmProjects/JabRef-2.11.1.jar'

from nltk.corpus import stopwords
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk import word_tokenize
from nltk import TreebankWordTokenizer
import nltk.data

# PLAINTEXT CORPUS READER
# http://www.nltk.org/_modules/nltk/corpus/reader/plaintext.html#CategorizedPlaintextCorpusReader
# important: The TreebankWordTokenizer separates words like "don't" into "do", "n't", consequently the main verb is correctly identified.
# For the Naive Bayes it may be better though to use WordPunctTokenizer - it is the default, so just omit the word_tokenizer param
corpus = CategorizedPlaintextCorpusReader(
    '.',
    r'(?!\.).*\.txt',
    word_tokenizer=TreebankWordTokenizer(),
    cat_pattern=r'(aggressive|not_aggressive)/.*',
    encoding='utf8')
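
# Illustration of the tokenizer difference noted above (not part of the original example):
#   TreebankWordTokenizer().tokenize("I don't know")  -> ['I', 'do', "n't", 'know']
#   WordPunctTokenizer().tokenize("I don't know")     -> ['I', 'don', "'", 't', 'know']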

# Getting RAW SENTENCES from a RAW comment, see: http://stackoverflow.com/a/4576110/4866678
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


# returns a list of raw sentences
def get_raw_sentences(fileid):  # works
    data = corpus.raw(fileid)
    return tokenizer.tokenize(data)


def get_raw_paragraph(
    fileid
Example #28
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'/Users/dechamoungsri/NLP_Learning/NLP_tutotial/mix20_rand700_tokens_cleaned/tokens/',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]

print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()
for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
Example #29
def classify_emails():
    stop_words = set(stopwords.words("english"))

    lemmatizer = WordNetLemmatizer()

    mydir = '/home/ubuntu/nltk_data/corpora/gmail'

    all_words = []
    filtered_words = []
    removedPuncuations_words = []
    lematized_words = []
    test_filter = []

    mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(hotel|flight|other)/.*', encoding='latin-1')
    stop = stopwords.words('english')
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

    word_features = FreqDist(chain(*[i for i,j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]

    def word_feats(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)

        return dict(features)

    negids = mr.fileids('hotel')
    posids = mr.fileids('flight')
    neutralids = mr.fileids('other')

    negfeats = [(word_feats(mr.words(fileids=[f])), 'hotel') for f in negids]
    posfeats = [(word_feats(mr.words(fileids=[f])), 'flight') for f in posids]
    neutralfeats = [(word_feats(mr.words(fileids=[f])), 'other') for f in neutralids]

    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    neutralcutoff = len(neutralfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + neutralfeats[:neutralcutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] + neutralfeats[neutralcutoff:]

    classifier = nltk.NaiveBayesClassifier.train(trainfeats)
    print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testfeats))*100)

    print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats)*100)


    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)

    test_sent_features = {word.lower(): (word in tokens) for word in mr.words()}

    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)
    tri_tokens = trigrams(tokens)

    cities = []
    matchedIndex = []
    tokenized = []
    addresses = []
    district = ['Akarawita','Angamuwa','Avissawella','Batawala','Battaramulla','Batugampola','Bope','Boralesgamuwa','Borella','Dedigamuwa','Dehiwala','Deltara','Habarakada','Handapangoda','Hanwella','Hewainna','Hiripitya','Hokandara','Homagama','Horagala','Kaduwela','Kahawala','Kalatuwawa','Madapatha','Maharagama','Malabe','Meegoda','Padukka','Pannipitiya','Piliyandala','Pitipana','Homagama','Polgasowita','Puwakpitiya','Ranala','Siddamulla','Slave Island','Sri Jayawardenapura','Talawatugoda','Tummodara','Waga','Watareka','Dickwella']

    for i in tokens:
        tokenized.append(i)

    pattern = re.compile("\d+")
    for i in tokenized:
        if pattern.match(i):
            matchedIndex.append(tokenized.index(i))
            print ("match"+i)
            print (tokenized.index(i))

        else:
            print ("not match")

    for t in tokenized:
        for i in district:
            if t.lower()==i.lower():
                cities.append(tokenized.index(t))

    distance= 200
    start = 0
    end = 0

    for t in cities:
        for i in matchedIndex:
            dis = t - i
            if (dis<=distance and dis>0):
                distance=dis
                start=t
                end=i
            else:
                print ("higher")

    address = ""

    for token in range(end,start+1):
        address+=tokenized[(token)]
        print (address)
        addresses.append(address)

    for address in addresses:
        try:
            search = geocoder.get(address)
        except ValueError:
            continue
        first_result = search[0]

    output =  [first_result.geometry.location.lat,first_result.geometry.location.lng]


    stri = ','.join(map(str, output))
    return stri
from nltk.corpus import CategorizedPlaintextCorpusReader

corpus_root = "./files/"
cat_root = "../categories/"

# Hacky way to specify path for cat.txt. A better way would be to rewrite regex '.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, '.*\.txt', cat_file=cat_root+'cat.txt', cat_delimiter='+')

# get all categories
cats = corpus.categories()
print(cats)

# access corpus
raw = corpus.raw()

# access words, normal and for a category
words = corpus.words()
words_pop = corpus.words(categories="POP")
words_rock = corpus.words(categories="ROCK")

# access sents, normal and for a category
sents = corpus.sents()
sents_pop = corpus.sents(categories="POP")
sents_rock = corpus.sents(categories="ROCK")

# make lists
word_list = list(words)
sents_list = list(sents)

pop_word_list = list(words_pop)
pop_sents_list = list(sents_pop)
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.classify import DecisionTreeClassifier
from nltk.corpus import CategorizedPlaintextCorpusReader
from sklearn import svm
from sklearn.svm import LinearSVC
import string
from tabulate import tabulate

corpus_root1='/Users/tianhan/Dropbox/Advanced_big_data_Project/aclImdb/train'
train=CategorizedPlaintextCorpusReader(corpus_root1,r'(pos|neg)/.*\.txt',cat_pattern=r'(pos|neg)/.*\.txt')
corpus_root2='/Users/tianhan/Dropbox/Advanced_big_data_Project/aclImdb/test'
test=CategorizedPlaintextCorpusReader(corpus_root2,r'(pos|neg)/.*\.txt',cat_pattern=r'(pos|neg)/.*\.txt')

def evaluate_classifier_Naive(featx):
    
    train_negids = train.fileids('neg')
    train_posids = train.fileids('pos')
    test_negids = test.fileids('neg')
    test_posids = test.fileids('pos')
    train_negfeats = [(featx(train.words(fileids=[f])), 'neg') for f in train_negids]
    train_posfeats = [(featx(train.words(fileids=[f])), 'pos') for f in train_posids]
    test_negfeats = [(featx(test.words(fileids=[f])), 'neg') for f in test_negids]
    test_posfeats = [(featx(test.words(fileids=[f])), 'pos') for f in test_posids]
    trainfeats = train_negfeats + train_posfeats
    testfeats = test_negfeats + test_posfeats

    Naive_classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets_Naive = collections.defaultdict(set)
Example #32
#Downloading an external corpus, load it, and access it

from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint #random

# The first line is where you are reading the corpus by calling
# the CategorizedPlaintextCorpusReader constructor.
# The three arguments from left to right are Absolute Path
# to the folder containing the corpus on your computer, all sample
# document names from the txt_sentoken folder, and the categories
# in the given corpus (in our case, 'pos' and 'neg').

reader = CategorizedPlaintextCorpusReader(r'\Users\JoeDi\Desktop\python projs\tokens', r'.*\.txt', cat_pattern=r'(\w+)/*')


print(reader.categories())
print(reader.fileids())

# Now that we've made sure that the corpus is loaded correctly, let's
# get on with accessing any one of the sample documents from both the categories.
# For that, let's first create a list, each containing samples of both the categories, 'pos' and 'neg', respectively.
# Add the following two lines of code:

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

# The next two lines select a random file, each from the set of positive
# and negative category reviews. The last two lines just print the filenames.

fileP = posFiles[randint(0,len(posFiles)-1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
import bokeh


corpus_root = "../corpus/lyric_corpus/files/"
cat_root = "../categories/"

# basic measures taken from nltk book
def lexical_diversity(text):
    return len(set(text)) / len(text)

def percentage(count, total):
    return 100 * count / total


# Hacky way to specify path for cat.txt. A better way would be to rewrite regex '.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, '.*\.txt', cat_file=cat_root+'cat.txt', cat_delimiter='+')



# NLTK Brown corpus selection
word_list_brown = brown.words()
sents_list_brown = brown.sents()
vocabulary_brown = set(word_list_brown)
brown_len_words = len(word_list_brown)
brown_len_sents = len(sents_list_brown)
brown_len_vocab = len(vocabulary_brown)
brown_richness = lexical_diversity(word_list_brown)

# Lyric corpus
cats = corpus.categories()
print(len(cats))
Example #34
        if k in emotion_of_poems and v == max_value:
            emotion = k
            print(emotion)
            emotion_correct = {"emotion": emotion}
            return emotion_correct

    return emotion_correct


def classify(poem_text):
    return classifier.classify(features_of_poem(poem_text))


corpus_of_poems = CategorizedPlaintextCorpusReader('poems/',
                                                   'poems.*',
                                                   cat_file='cats.txt')


#code for generating errors
# Return errors in order to improve algorithm
def errors_em(poem_set):
    errors = []
    for (fileid, category) in poem_set:
        poem = corpus_of_poems.words(fileids=[fileid])
        emotion_correct = features_of_poem(poem)
        guess = classifier.classify(features_of_poem(poem))

        if guess != category:
            errors.append((category, guess, poem, emotion_correct['emotion']))
# Chapter 1, Corpus and WordNet - downloading, loading, and accessing an external corpus
from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

# Read the corpus
reader = CategorizedPlaintextCorpusReader(r'/workspace/NLP_python/tokens', r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

# Print sample documents
# Lists of sample files for the pos and neg categories
posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

# Pick a random file from each of the pos and neg categories
fileP = posFiles[randint(0, len(posFiles)-1)]
fileN = negFiles[randint(0, len(negFiles)-1)]
print(fileP)
print(fileN)

# Print the selected random files, breaking the line at each sentence end
for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()
for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()

# Build corpus for specific problem set
problem = 'problemA'
problem_root = nltk.data.find('corpora/AAAC/%s' % (problem))
problem_files = PlaintextCorpusReader(problem_root, '.*\.txt')


# Categorize corpus by author
auth_map = {}
for filename in problem_files.fileids():
	a_n =  filename[:3]
	auth_map[filename] =  [a_n]

# By the entire corpus
problem_cat = CategorizedPlaintextCorpusReader(problem_root, '.*\.txt', cat_map=auth_map)
documents = [(list(problem_cat.words(fileid)), category) 
				for category in problem_cat.categories() 
				for fileid in problem_cat.fileids(category)]
random.shuffle(documents)


# Word Frequency featureset
# Word freq accross corpus
all_words = nltk.FreqDist(words.lower() for words in problem_cat.words())
key_words = [w for w, _ in all_words.most_common(2000)]


# Compares whether a word from the keywords is in a document
def doc_features(doc):
	doc_words = set(doc)
Example #37
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'

mr = CategorizedPlaintextCorpusReader(mydir,
                                      r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*',
                                      encoding='ascii')
stop = stopwords.words('english')
documents = [([
    w for w in mr.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens)
               for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens)
              for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
from pylab import *

import plotly.plotly as py
import plotly.graph_objs as go

corpus_root = "../corpus/lyric_corpus/files/"
cat_root = "../categories/"

corpus = CategorizedPlaintextCorpusReader(corpus_root, '.*\.txt', cat_file=cat_root+'cat.txt', cat_delimiter='+')
words = corpus.words()
#frequency distribution

popWords = corpus.words(categories="POP")
rockWords = corpus.words(categories="ROCK")

#print("-----All words-----")
fd = nltk.FreqDist(words)
ALL_FrequentWords = fd.most_common(104)
ALL_FrequentWords_50_100 = []
for i in range(54,104):
	ALL_FrequentWords_50_100.append(ALL_FrequentWords[i])
#print(ALL_FrequentWords)


#print("-----All POP words-----")
fd_POP = nltk.FreqDist(popWords)
POP_FrequentWords = fd_POP.most_common(60)
#print(fd1.most_common(60))
Example #39
 def __init__(self, root, fileids=FILE_PATTERN, **kwargs):
     if not any(key.startswith('cat_') for key in kwargs.keys()):
         kwargs['cat_pattern'] = CAT_PATTERN
     CategorizedPlaintextCorpusReader.__init__(
         self, root, fileids, **kwargs)
Example #40
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
# Uni
machinename = 'maj27'

j = 0
for i in range(10):
    dataset = str(i + 1)
    #mydir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Classified News/Training'
    train_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Train+val'
    test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Testing'
    #test_dir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/NA is negative old/Classified News/Criteria '+dataset+''

    preprocessed = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/data_2.p'

    train_Corpus = CategorizedPlaintextCorpusReader(train_dir,
                                                    r'(?!\.).*\.txt',
                                                    cat_pattern=r'(\w+)/*')

    train_documents = [(list(train_Corpus.words(fileid)), category)
                       for category in train_Corpus.categories()
                       for fileid in train_Corpus.fileids(category)]

    only_docs = [' '.join(doc[:1000]) for (doc, category) in train_documents]
    only_docs = [
        ' '.join(normalize_text(document, lemmatize=True, remove_stop=None))
        for document in only_docs
    ]

    #######################################################################################
    train_labels = [category for (doc, category) in train_documents]
    train_binary_labels = [1 if i == 'pos' else 0 for i in train_labels]
Example #41
#http://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/

from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

reader = CategorizedPlaintextCorpusReader(
    r'mix20_rand700_tokens_cleaned/tokens', r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]

print(fileN)
print(fileP)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
Example #42
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk
import sys
import os

mydir_train = '.\\Docs-txt\\train'
mydir_test = '.\\Docs-txt\\test'
featureVector_train = []
featureVector_test = []

mr_train = CategorizedPlaintextCorpusReader(
    mydir_train,
    r'(?!\.).*\.txt',
    cat_pattern=
    r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')
mr_test = CategorizedPlaintextCorpusReader(
    mydir_test,
    r'(?!\.).*\.txt',
    cat_pattern=
    r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')

stop = stopwords.words('english')

with open('.\\stopwords.txt') as f:
    stop = f.read().splitlines()

documents_train = [([
    w for w in mr_train.words(i)