Example #1

def main():
    articles = CategorizedPlaintextCorpusReader(corpusdir, '.*', cat_pattern = r'(.*)[/]')
    feats = {}
    trainfeats = []
    testfeats = []
    for cat in articles.categories():
        wow = len(articles.fileids(cat))  # such variable name
        print "for category", cat, ":", wow
        feats[cat] = [(word_feats(articles.words(fileids = [f])), cat) for f in articles.fileids(cat)]
        cutoff = wow - hold_back(wow)
        trainfeats.append(feats[cat][:cutoff])
        testfeats.append(feats[cat][cutoff:])

    train = [item for sublist in trainfeats for item in sublist]
    test = [item for sublist in testfeats for item in sublist]

    print 'train on %d instances, test on %d instances' % (len(train), len(test))

    classifier = NaiveBayesClassifier.train(train)
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test)
    classifier.show_most_informative_features() # I don't understand the output for more than 2 categories :(

    # load later with:
    # import pickle
    # with open('../data/classifier.pickle', 'rb') as f:
    #     classifier = pickle.load(f)
    with open('../data/classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)
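
A hedged sketch of what using the reloaded classifier could look like (word_feats is the feature helper this snippet already uses; the sample tokens are made up):

# Illustrative only: label a new document with the same feature extractor used in training.
new_doc_feats = word_feats(['some', 'unseen', 'article', 'text'])
print(classifier.classify(new_doc_feats))
print(classifier.prob_classify(new_doc_feats).max())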
Example #2

 def create_categorized_corpus(self, categories_directory):
     reader = CategorizedPlaintextCorpusReader(categories_directory, r'\.txt.*wordtype_(\w+)', cat_pattern=r'\.txt.*wordtype_(\w+)')
     # True only when every category name is non-empty
     boolean_for_categories_test = all(category != '' for category in reader.categories())
     return reader, boolean_for_categories_test
Example #3

# The opening line of this snippet is cut off in the source; it constructs the
# reader used below from a corpus directory that is not shown (corpus_root is a
# placeholder name for that missing argument).
reader = CategorizedPlaintextCorpusReader(corpus_root,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*')
from textblob.classifiers import NaiveBayesClassifier
random.seed(1)
train = [
    ('Identity', 'IdentityThreat'),
    ('identity', 'IdentityThreat'),
    ('identities', 'IdentityThreat'),
    ('identity loss', 'IdentityThreat'),
    ('insider', 'InsiderThreat'),
    ('Malware', 'Malware'),
]

# The categorized corpus reader collects each file's words under its ThreatType category
ThreatTypes = [(list(reader.words(fileid)), category)
               for category in reader.categories()
               for fileid in reader.fileids(category)]
random.shuffle(ThreatTypes)
print(reader.categories())
new_train = ThreatTypes
print(new_train)
#Naive Bayes classifiers assume that the value of a particular feature is independent of the value of
#any other feature, given the class variable.
cl = NaiveBayesClassifier(train)
# Update the classifier with the training documents collected from the categorized corpus
cl.update(new_train)
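# Given the independence assumption noted above, each word feature is scored
# separately; a quick illustrative check (the sample sentence is made up):
sample = "an insider copied identity records to a personal drive"
print(cl.classify(sample))             # most likely label
print(cl.prob_classify(sample).max())  # same label via the probability distribution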
inputpath = nltk.data.find('corpora/abc/threatdescp.txt')
f = open(inputpath, encoding='latin2')
outputpath = nltk.data.find('corpora/abc/ResultNB.txt')
ResultFile = open(outputpath, 'w', encoding='latin2')
for line in f:
Example #4

  dirList = []
  for f in os.listdir( path ) :
    if not os.path.isfile( os.path.join( path, f ) ) :
      if not f == ".DS_Store" :
        dirList.append(f)
  return dirList

###############################################
###############################################

#################
# TRAINING DATA #
#################
train_reader = CategorizedPlaintextCorpusReader('./training_data', r'.*\_.*\.txt', cat_pattern=r'.*\_(\w+)\.txt')
train_documents = [(list(train_reader.words(fileid)), category)
                   for category in train_reader.categories()
                   for fileid in train_reader.fileids(category)]
random.shuffle(train_documents)
#print train_documents

train_documents_clean = []
for i in train_documents :
  cat = i[1]
  #print cat
  newList = []
  for word in i[0] :
    #print j
    clean_word = word.encode('ascii', 'ignore')  # keep only ASCII characters
    newList.append(clean_word)
  newTup = (newList, cat)
  train_documents_clean.append(newTup)
Example #5

import nltk, random, string
from nltk.corpus.reader import CategorizedPlaintextCorpusReader 
from nltk.corpus import stopwords

reader = CategorizedPlaintextCorpusReader('./', r'.*\.txt', cat_pattern=r'(\w+)/*')
print reader.categories()
print reader.fileids()

documents = [(list(reader.words(fileid)), category)
	for category in reader.categories()
	for fileid in reader.fileids(category)]
random.shuffle(documents)

# Remove stopwords & punc from content
table = string.maketrans("","")
stopwords = nltk.corpus.stopwords.words('english')
filtered_words = [w for w in reader.words() if w not in stopwords]
filtered_words_nopunc = [w for w in filtered_words if w not in string.punctuation]
all_words = nltk.FreqDist(w.lower() for w in filtered_words_nopunc)

print all_words

word_features = all_words.keys()[:2000]




def document_features(document):
	document_words = set(document)
	features = {}
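	# (The excerpt ends here; the rest of this NLTK-book feature extractor,
	# the same body shown commented out in Example #10, would be:)
	for word in word_features:
		features['contains(%s)' % word] = (word in document_words)
	return features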
Example #6

    'test_6.txt': 'Press Release',
    'test_7.txt': 'Market Opinion'
}
art_i = []
class_i = []
#Conversion of Train Data into Single Input File
corpus_root = 'Train_set'

newcorpus = CategorizedPlaintextCorpusReader(corpus_root,
                                             r'.*\.txt',
                                             cat_pattern=r'(\w+)/*')

myfile = open('Input_Article_Data.csv', 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL, lineterminator="\n")

for category in newcorpus.categories():
    for fileid in newcorpus.fileids(category):
        #print fileid,category
        data1 = (newcorpus.raw(fileid).encode('utf-8')).replace(",", " ")
        data_list = [data1, category]
        wr.writerow(data_list)

myfile.close()

#Reading of Train Data as Lists
with open('Input_Article_Data.csv', 'r') as f:
    for line in f.readlines():
        l, name = line.strip().split(',')
        l = (re.sub('[^A-Za-z0-9.]+', ' ', l)).lower()
        # l=porter_stemmer.stem(l) #Reduces Accuracy From 50% To 37%
        if (name != "Category"):
Example #7

loc = '/Users/rmoura/nltk_data/corpora/rai/textoSimples/'
corpus1 = PlaintextCorpusReader(loc, '.*\.txt')
print(corpus1.fileids())
print(corpus1.sents())
print(corpus1.words())

# Tagged text corpus
from nltk.corpus.reader.tagged import TaggedCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoEtiquetas/'
corpus2 = TaggedCorpusReader(loc, '.*\.txt')
print(corpus2.fileids())
print(corpus2.words())
print("Palavras etiquetadas: ", corpus2.tagged_words())
print(corpus2.tagged_words('003.txt'))
print("Sentencas diretas:")
for s in corpus2.sents():
    print(' '.join(s))

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoCategorias/'
corpus3 = CategorizedPlaintextCorpusReader(loc, '.*\.txt', cat_file="categorias.txt")
print(corpus3.fileids())
print(corpus3.categories())
print(corpus3.words(categories='brasnam'))

# Stopword definition
stopwords = nltk.corpus.stopwords.words('portuguese')
fd = nltk.FreqDist(w.lower() for w in corpus3.words())
fd1 = nltk.FreqDist(w.lower() for w in corpus3.words()
                    if w.isalpha() and w not in stopwords)
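
A quick hedged check of what the stopword filtering above changes (fd and fd1 are the two distributions just built; most_common is standard nltk.FreqDist API):

print(fd.most_common(10))   # raw counts (includes stopwords and punctuation)
print(fd1.most_common(10))  # counts restricted to alphabetic, non-stopword tokens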
Example #8


def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()


# 1. Build a categorized text corpus
locPT = 'ch02/ES'
corpusPT = CategorizedPlaintextCorpusReader(locPT,
                                            '.*\.txt',
                                            cat_file="cat.txt")

print(corpusPT.fileids())
print(corpusPT.categories())
print(corpusPT.words(categories='ciencia'))
#print(corpusPT.raw())

vocab = set(w.lower() for w in corpusPT.words())
print('Tamanho Vocabulario:', len(vocab))
corpusCom = corpusPT.raw()
corpusComList = corpusCom.split()
print('Tamanho Total de palabras:', len(corpusComList))

# 2. Compute simple statistical measures
'''
Measures: average word length, average sentence length, and the average number of
times each vocabulary item appears in the text (lexical diversity score)
'''
print(
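
The excerpt is cut off at the final print; a minimal sketch of the three measures described in the docstring, reusing the corpusPT reader defined above:

# Sketch only: average word length, average sentence length, lexical diversity.
num_chars = len(corpusPT.raw())
num_words = len(corpusPT.words())
num_sents = len(corpusPT.sents())
num_vocab = len(set(w.lower() for w in corpusPT.words()))
print('Average word length:', round(num_chars / num_words, 2))
print('Average sentence length:', round(num_words / num_sents, 2))
print('Lexical diversity score:', round(num_words / num_vocab, 2))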
Example #9

    number_free = remove_numebrs(user_free)
    hashtag_free = remove_hashtags(number_free)
    twitter_words = [
        term.lower() for term in tweet_tokenizer.tokenize(hashtag_free)
        if term.lower() not in stop_words
    ]
    twitter_words_with_hashtags = [
        term.lower() for term in tweet_tokenizer.tokenize(number_free)
        if term.lower() not in stop_words
    ]
    return twitter_words, twitter_words_with_hashtags


corpus_tokens = []

for category in reader.categories():
    for file in reader.fileids(categories=category):
        without_hashtags, with_hashtags = tokenize_tweets(file)

        # c
        fdist_category = nltk.FreqDist(without_hashtags)
        print("Most common words in", category, ":",
              fdist_category.most_common(10))

        # d
        hashtags = [word for word in with_hashtags if word.startswith("#")]
        fdist_category_hashtag = nltk.FreqDist(hashtags)
        print("Most common hashtags in", category, ":",
              fdist_category_hashtag.most_common(10))

        corpus_tokens += without_hashtags
Example #10

#!/usr/bin/env python
# coding: utf-8

import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

corpus_root = '/Users/athessen/nltk_data/corpora/eco'
reader = CategorizedPlaintextCorpusReader(corpus_root, r'(?:lion|shark)\d*\.txt', cat_file='cats.txt')
print reader.fileids()

print reader.categories()


"""
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000] [1]

def document_features(document): [2]
    document_words = set(document) [3]
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
"""
Example #11

class PolarityDataReader(object):
    """
    PolarityDataReader:
        Reader for POS/NEG Categorized Sentiword data

    uses:
        nltk.corpus.reader.CategorizedPlaintextCorpusReader

    usage:
        
        dataReader = PolarityDataReader([rootLocation],[readerObject])
        dataReader.getDocuments()
        dataReader.setTerms([No:ofTerms])

        featuresets = dataReader.getTermDocMatrix()

    """
    def __init__(self, rootLocation=config.POLARITY_DATASET, reader=None):
        super(PolarityDataReader, self).__init__()
        if reader == None:
            self.reader = Reader(rootLocation,
                                 r'.*/.*',
                                 cat_pattern=r'(.*)/.*')
        else:
            self.reader = reader
        self.setStopWords()
        self.documents = None
        self.terms = None

    def getDocuments(self):
        if not self.documents:
            self.documents = [(list(self.reader.words(fileid)), category)
                              for category in self.reader.categories()
                              for fileid in self.reader.fileids(category)]
        return self.documents

    def setStopWords(self, fileLocation=config.STOP_WORDS_FILE):
        stopfile = open(fileLocation, 'r')
        self.stopwords = stopfile.read().split()

    def removeStopWords(self, wordList):
        """ Remove common words which have no search value """
        return [word for word in wordList if word not in self.stopwords]

    def setTerms(self, size=2000, featureSelection='PD', removeStopWords=True):
        if featureSelection == 'PD':
            self.__setTermsPD__(size)
            print "Feature Selection : PD :done "

        elif featureSelection == 'CHI_SQUARE':
            self.__setTermsCHISQUARE__(size)
            print "Feature Selection : CHI_SQUARE :done "

        else:
            """
            getting the most frequent words
            """
            all_words = [w.lower() for w in self.reader.words()]
            if removeStopWords:
                all_words = self.removeStopWords(all_words)
            all_words = FreqDist(w for w in all_words)
            self.terms = all_words.keys()[:size]
            print "Feature Selection: frequent Words :done "

    def documentFeatures(self, document, sentiwordnet=False):
        document_words = set(document)
        features = {}
        if sentiwordnet:
            pass
            #TODO
        else:
            for word in self.terms:
                features[word] = (word in document_words)
        return features

    def getTermDocMatrix(self):
        return [(self.documentFeatures(document), category)
                for (document, category) in self.documents]

    def __setTermsPD__(self, size):
        """
        score=|(posDF-negDF)|/(posDF+negDF)
        """
        posWord = {}
        negWord = {}

        for word in self.reader.words(categories=['pos']):
            inc(posWord, word.lower())
        for word in self.reader.words(categories=['neg']):
            inc(negWord, word.lower())

        wordScores = {}
        for word in self.reader.words():
            try:
                posScore = posWord[word]
            except KeyError, e:
                posScore = 0
            try:
                negScore = negWord[word]
            except KeyError, e:
                negScore = 0
            totalScore = posScore + negScore
            if totalScore <= 10:  # min total count
                wordScores[word] = 0.1
            else:
                wordScores[word] = float(abs(posScore - negScore)) / totalScore
Example #12

    if proption == "Y" or proption == "y":
        prchoice = True
    elif proption == "N" or proption == "n":
        prchoice = False

    rocoption = raw_input("ROC curve plot: (Y/N) ")

    if rocoption == "Y" or rocoption == "y":
        rocchoice = True
    elif rocoption == "N" or rocoption == "n":
        rocchoice = False

    confmatplot = raw_input("Confusion matrix plot: (Y/N) ")

    print("\nStarting the classifier...\n")
    classify(traindata,
             testdata,
             classifier=clf,
             learncurve=learnchoice,
             prcurve=prchoice,
             roccurve=rocchoice)

    if confmatplot == "Y" or confmatplot == "y":
        # Plot
        cnf_matrix = confusion_matrix(ytest, predictions)
        class_names = testcorpus.categories()
        plot_confusion_matrix(cnf_matrix, classes=class_names)
        plt.show()

    print("\nFinished!")
Example #13

    return features


if __name__ == '__main__':
    #set up path to data
    data_folder_name = sys.argv[1]
    data_path = os.path.join(os.getcwd(), '', data_folder_name)

    #make article object to read in files
    article = CategorizedPlaintextCorpusReader(data_path,
                                               r'.*\.*\.txt',
                                               cat_pattern=r'(\w+).*\.txt')

    #make list of all articles with labels based on what folder the file is in
    english_stopwords = set(stopwords.words('english'))
    all_articles = []
    for category in article.categories():
        for fileid in article.fileids(category):
            #lowercase words and drop stopwords
            process = list(
                w.lower() for w in article.words(fileid)
                if w.isalpha() and w not in english_stopwords)
            entry = [process, category]
            all_articles.append(entry)

    random.shuffle(all_articles)

    #make bigrams for every article
    word_bigrams = [(nltk.bigrams(all_articles[i][0]))
                    for i in range(len(all_articles))]

    #create frequency distribution for all words and select top 2000 for features
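    # (The excerpt is cut off here; a sketch of that step, assuming the features
    # are drawn from the shuffled articles built above:)
    all_words = nltk.FreqDist(w for words, _ in all_articles for w in words)
    word_features = [w for w, _ in all_words.most_common(2000)]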
Example #14

def fetch_news(dir):
    base = 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/{}/rss.xml'

    for category in ['world', 'technology']:
        rss = fp.parse(base.format(category))

        for i, entry in enumerate(rss.entries):
            fname = '{0}_bbc_{1}.txt'.format(i, category)
            fname = os.path.join(dir, fname)

            if not dl.conf.file_exists(fname):
                store_txt(entry.link, fname, entry.title)


if __name__ == "__main__":
    dir = os.path.join(dl.data.get_data_dir(), 'bbc_news_corpus')

    if not os.path.exists(dir):
        os.mkdir(dir)

    fetch_news(dir)
    reader = CategorizedPlaintextCorpusReader(dir,
                                              r'.*bbc.*\.txt',
                                              cat_pattern=r'.*bbc_(\w+)\.txt')
    printer = dl.log_api.Printer(nelems=3)
    printer.print('Categories', reader.categories())
    printer.print('World fileids', reader.fileids(categories=['world']))
    printer.print('Technology fileids',
                  reader.fileids(categories=['technology']))
Example #15

        words = [re.sub(r'[^a-zA-Z0-9_]','_', w) for w in words]
    
    if remove_stopwords:
        sw = set(nltk.corpus.stopwords.words("english"))
        words = [w for w in words if not w in sw]

    if stem:
        porter = nltk.PorterStemmer()
        words = [porter.stem(w) for w in words]
    
    return words

documents = [((fileid, category), preprocess(my_corpus.words(fileid), 
               to_lowercase = True, remove_punctuation = True, remove_digits = True, 
               remove_odd_chars = True, remove_stopwords=True, stem = False)) \
             for category in my_corpus.categories() \
             for fileid in my_corpus.fileids(category)]



def dummy_fun(doc):
    return doc

bow_gen = sklearn.feature_extraction.text.CountVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    ngram_range=(1, 2),
    min_df = 150, # changed from 100
    max_df = 0.85)
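
A hedged sketch of applying the vectorizer to the pre-tokenized documents built above (the identity dummy_fun preprocessor/tokenizer is what lets CountVectorizer accept token lists directly):

# Illustrative only: fit the bag-of-words model on the already-tokenized documents.
X = bow_gen.fit_transform([tokens for (_, tokens) in documents])
print(X.shape)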
Example #16

# Removing oversized collections: hathi, nypl; Also, chunking them out:
# First batch represents what was completed on 4/10-4/11. 
colls = ["searches"]
#colls = ["artstor","biodiv","rumsey","commonwealth","georgia","harvard",
#        "ia","getty","kentucky","minnesota","missouri","mwdl","nara","nocar",
#        "smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

#data = {}
stats = {}
common = {}

for coll in colls:
    print(reader.categories(coll+".txt"))
    stats[coll] = {}
    # 'kay. Can't pickle words. It's a stream reader.
    # But maybe you can if you tokenize with a regex,
    # which also pulls out punctuation.
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll+'.txt'))
    pickle.dump( words, open( "/media/storage/dpla-data/pickles/new/"+coll+"_words.p", "wb"))
    #words = reader.words(coll+".txt")
    #data[coll]["words"] = reader.words(coll+".txt")
    print("getting count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["wc"] = len(words)
    print(stats[coll]["wc"])
    print("getting uniq " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["uniq"] = len(set([w.lower() for w in words]))
    print(stats[coll]["uniq"])
Example #17

# Removing oversized collections: hathi, nypl; Also, chunking them out:
# First batch represents what was completed on 4/10-4/11. 
#colls = ["searches"]
colls = ["artstor","biodiv","rumsey","commonwealth","georgia","harvard",
        "ia","getty","kentucky","minnesota","missouri","mwdl","nara","nocar",
        "smiths","socar","texas","gpo","illinois","usc","virginia","nocoll",
        "hathi","nypl"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

#data = {}
stats = {}
common = {}

for coll in colls:
    print(reader.categories(coll+".txt"))
    stats[coll] = {}
    # 'kay. Can't pickle words. It's a stream reader.
    # But maybe you can if you tokenize with a regex,
    # which also pulls out punctuation.
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll+'.txt'))
    pickle.dump( words, open( "/media/storage/dpla-data/words/colls.oct/pickles/"+coll+"_words.p", "wb"))
    #words = reader.words(coll+".txt")
    #data[coll]["words"] = reader.words(coll+".txt")
    print("getting count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["wc"] = len(words)
    print(stats[coll]["wc"])
    print("getting uniq " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["uniq"] = len(set([w.lower() for w in words]))
    print(stats[coll]["uniq"])
Example #18

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.tokenize.casual import TweetTokenizer

from normalization import normalizeTwitterWordsWithExtraFeatures, normalizeTwitterWordsWithNegationHandle
import pickle, nltk

tweetTokenizer = TweetTokenizer(reduce_len=True, preserve_case=True, strip_handles=False)
corpus = CategorizedPlaintextCorpusReader('corpus/2-step/polar', r'(\w+)-tweet[0-9]+\.txt', cat_pattern=r'(\w+)-tweet[0-9]+\.txt', word_tokenizer=tweetTokenizer)

normalizationFunction = normalizeTwitterWordsWithNegationHandle

wordsTaggedToCategory = []

i = 1
for category in corpus.categories():
    for fileid in corpus.fileids(category):
        words = corpus.words(fileids=[fileid])
        normalizedWords = normalizationFunction(words)
        extraNormalizedWords = normalizeTwitterWordsWithExtraFeatures(words)
        wordsTagged = nltk.pos_tag(normalizedWords)
        wordsTaggedToCategory += [(wordsTagged, category)]
        print(i)
        i += 1

with open("wordsTaggedToCategory-polar", 'wb') as fileout:
    pickle.dump(wordsTaggedToCategory, fileout)
Example #19

class PolarityDataReader(object):
    """
    PolarityDataReader:
        Reader for POS/NEG Categorized Sentiword data

    uses:
        nltk.corpus.reader.CategorizedPlaintextCorpusReader

    usage:
        
        dataReader = PolarityDataReader([rootLocation],[readerObject])
        dataReader.getDocuments()
        dataReader.setTerms([No:ofTerms])

        featuresets = dataReader.getTermDocMatrix()

    """
    
    def __init__(self, rootLocation=config.POLARITY_DATASET, reader=None):
        super(PolarityDataReader, self).__init__()
        if reader is None:
            self.reader = Reader(rootLocation, r'.*/.*', cat_pattern=r'(.*)/.*')
        else:
            self.reader = reader
        self.setStopWords()
        self.documents = None
        self.terms = None


    def getDocuments(self):
        if not self.documents:
            self.documents = [(list(self.reader.words(fileid)), category) 
                              for category in self.reader.categories()
                              for fileid in self.reader.fileids(category)]
        return self.documents

    def setStopWords(self,fileLocation = config.STOP_WORDS_FILE):
        stopfile = open(fileLocation, 'r')
        self.stopwords = stopfile.read().split()

    def removeStopWords(self,wordList):
        """ Remove common words which have no search value """
        return [word for word in wordList if word not in self.stopwords]

    def setTerms(self,size=2000,featureSelection='PD',removeStopWords=True):
        if featureSelection == 'PD':
            self.__setTermsPD__(size)
            print "Feature Selection : PD :done "
            
        elif featureSelection == 'CHI_SQUARE':
            self.__setTermsCHISQUARE__(size)
            print "Feature Selection : CHI_SQUARE :done "
        elif featureSelection == 'SWNSS':
            self.__setTermsSWNSS__(size)
            print "Feature Selection : SWNPD :done "
        else:
            """
            geting most frequent Words
            """
            all_words = [w.lower() for w in self.reader.words()];
            if removeStopWords:
                all_words = self.removeStopWords(all_words);
            all_words = FreqDist(w for w  in all_words)
            self.terms = all_words.keys()[:size]
            print "Feature Selection: frequent Words :done "


    def documentFeatures(self, document, sentiwordnet=False):
        document_words = set(document)
        features = {}
        if sentiwordnet:
            pass
            #TODO
        else:
            for word in self.terms:
                features[word] = (word in document_words)
        return features
                

    def getTermDocMatrix(self):
        return [(self.documentFeatures(document), category) 
                for (document,category) in self.documents]

    def __setTermsPD__(self, size):
        """
        score=|(posDF-negDF)|/(posDF+negDF)
        """
        posWord = {}
        negWord = {}

        for word in self.reader.words(categories=['pos']):
            inc(posWord, word.lower())
        for word in self.reader.words(categories=['neg']):
            inc(negWord, word.lower())
                
        wordScores = {}
        for word in self.reader.words():
            try:
                posScore = posWord[word]
            except KeyError, e:
                posScore = 0
            try:
                negScore = negWord[word]
            except KeyError, e:
                negScore = 0
            totalScore = posScore + negScore
            if totalScore <= 10:  # min total count
                wordScores[word] = 0.1
            else:
                wordScores[word] = float(abs(posScore - negScore)) / totalScore
                # removeStopWords does not affect accuracy
        termScore = sorted(wordScores.items(), key=lambda (w, s): s, reverse=True)[:size]
        self.terms = [w for (w, s) in termScore]
Example #20

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import nltk

d = nltk.data.find('corpora/cookbook')
reader = CategorizedPlaintextCorpusReader(d, r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt')
print(reader.categories())
print(reader.fileids(categories='neg'))
print(reader.fileids(categories='pos'))

# from nltk.corpus import brown
# print(brown.categories())
Example #21

for word, cat in Feature_Set.keys():
    file.write(str(word))
    file.write("\t\t")
    file.write((str(cat)))
    file.write("\t\t")
    file.write(str(Feature_Set[word,cat]))
    file.write("\n")

file.close()

Classification_Accuracy=0
    
for file in Testing_Corpus.fileids():
    pos_prob=1
    neg_prob=1
    real_category=Testing_Corpus.categories([file])
    
    for word, cat in Feature_Set:
        if word in Testing_Corpus.words([file]):
            if cat=="pos":
                pos_prob=Feature_Set[word, cat]*float(pos_prob)*10000
            else:    
                neg_prob=Feature_Set[word, cat]*float(neg_prob)*10000
        
    if float(pos_prob)>=float(neg_prob):
        derived_category="['pos']"
    else:
        derived_category="['neg']"
        
    if str(real_category)==str(derived_category):
        Classification_Accuracy=Classification_Accuracy + 1
Example #22


import random
import nltk as nltk
#nltk.download()
from nltk.corpus import stopwords
import os, os.path
path = os.path.expanduser('~/nltk_data')
if not os.path.exists(path):
    os.mkdir(path)
os.path.exists(path)
import nltk.data
path in nltk.data.path
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader('.', r'.*_news_.*\.csv', cat_pattern=r'.*_news_(\w+)\.csv')
reader.categories()

def bag_of_words(words):
    return dict([(word, True) for word in words if word[0].isalpha()])
import collections
def bag_of_words_not_in_set(words, badwords):
    return bag_of_words(set(words)-set(badwords))

def bag_of_non_stopwords(words, stopfile='english'):
    badwords = stopwords.words(stopfile)
    return bag_of_words_not_in_set(words, badwords)
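
# e.g. bag_of_non_stopwords(['the', 'market', 'fell']) should give
# {'market': True, 'fell': True}: 'the' is dropped as a stopword.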

from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder

def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=2000):
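    # (The excerpt breaks off at this def; the body below is a sketch of the usual
    # cookbook recipe, using the BigramCollocationFinder imported above.)
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
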
data = get_data()
print(len(data))
evrth, maindict = tags_assignment(data)

# Save new final dictionary as well as the mapping for categories-numbers
listingssss = json.dumps(evrth)
with open("FinalCleanJuly1.json", "w") as f:
    f.write(listingssss)
dictionaries = json.dumps(maindict)
with open("CorpusCatMapJuly1.json", "w") as f:
    f.write(dictionaries)

#### This is IMPORTANT - CHOOSE! ##### default is key2
#### Choose the label you want to use for naming!
### two options:
### 1) key1 with format: docID + _(i), where i is the index of the category, e.g. -doc-_cr14021.txt
### 2) key2 with format: country name + year + _(i), e.g. Albania2015_1.txt
### if you want to change --> line 90: "key2: taglist" to key1
### line 121: filename=evrth[i]['key2'] to key1
create_corpus(evrth)

#### Check if working
reader = CategorizedPlaintextCorpusReader('corpusCategory/',
                                          r'\w+\d+_.*\.txt',
                                          cat_map=maindict)
print(reader.categories())  #print all categories in a list
print(reader.fileids(categories=['Fiscal']))  #check docIDs in fiscal category

#Good reference - https://www.packtpub.com/books/content/python-text-processing-nltk-20-creating-custom-corpora
#It covers creating chunked corpora (by words, sentences, paragraphs, even customized paragraphs), tagged corpora, etc.