Example #1
def train():
    """Train the clasifier"""
    words = {}

    for user in User.objects.all():
        print 'Training for User ' + str(user.id),
        for subscription in user.subscriptions:
            interested_titles_list = []
            unlabeled_titles_list = []
            for article in Article.objects(interested_users=user.id,
                                           feed_id=subscription.feed_id):
                interested_titles_list.append(article.features.title)
                unlabeled_titles_list.append(article.features.title)
            for article in Article.objects(uninterested_users=user.id,
                                           feed_id=subscription.feed_id):
                unlabeled_titles_list.append(article.features.title)
            words = map(get_words_in_title, interested_titles_list)
            print interested_titles_list
            classifier = PositiveNaiveBayesClassifier.train(
                words, map(get_words_in_title, unlabeled_titles_list))
            subscription.classifier_object = pickle.dumps(classifier)
        try:
            user.save()
        except Exception as e:
            print 'Failed: %s' % e
        print 'Classifier Saved'
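Example #1 stores each trained model with pickle but never shows the prediction side. A minimal sketch of the matching inference step, assuming get_words_in_title returns a {word: True} featureset exactly as in the training code above:

import pickle

def predict_interest(subscription, title):
    # Restore the per-subscription classifier pickled during train()
    classifier = pickle.loads(subscription.classifier_object)
    # PositiveNaiveBayesClassifier labels are booleans: True means the
    # positive ("interested") class
    return classifier.classify(get_words_in_title(title))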
Example #2
	def test_classifier(self):
		bgram_doc = list(self.ft_db.find({"subreddit" : "UkrainianConflict"},to_tuples=True,field="bigrams"))[0]
		allbgram_doc = list(self.ft_db.find({"subreddit" : "all"}, to_tuples=True, field='bigrams'))[0]

		pos_fts = { d[0]:True for d in bgram_doc["bigrams"] }
		neu_fts = { d[0]:True for d in allbgram_doc["bigrams"] }
		
		ukr = []
		neu = []

		for doc, fts in self.source.find_ft({"subreddit" : "UkrainianConflict"}):
			# drop features that never appear in the positive bigram set
			nomore = [key for key in fts.keys() if key not in pos_fts]
			for n in nomore:
				del fts[n]
			if len(fts.keys()) > 0:
				ukr.append(fts)

		for doc, fts in self.source.find_ft(limit=6000):
			neu.append(fts)

		nvb = PositiveNaiveBayesClassifier.train(ukr,neu)
		for doc, fts in self.source.find_ft(skip=6000,limit=10):
			print(nvb.classify(fts))
		nvb.show_most_informative_features()

		"""ukr = []
Example #3
File: bayes.py Project: StevenLOL/detie
def train(spam_words, unlabeled_words):

    spams = list(map(features, spam_words))
    unlabeled = list(map(features, unlabeled_words))

    model = PositiveNaiveBayesClassifier.train(spams, unlabeled, 0.5)
    data = PickleData('bayesmodel.pickle')
    data.write(model)
    return model
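The third positional argument here is NLTK's positive_prob_prior: the prior probability that an unlabeled example belongs to the positive class (the default is 0.5). A minimal sketch of how the prior is passed, using made-up sentences and a bag-of-words features helper like the ones in the other examples:

from nltk.classify import PositiveNaiveBayesClassifier

def features(sentence):
    # Bag-of-words featureset, as used throughout these examples
    return dict((w, True) for w in sentence.lower().split())

spams = [features(s) for s in ['win cash now', 'cheap pills online', 'win a free prize']]
unlabeled = [features(s) for s in ['meeting at noon', 'win the game', 'cash the check']]

# A lower prior (e.g. 0.2) tells the model to assume that fewer of the
# unlabeled examples are actually positive
model = PositiveNaiveBayesClassifier.train(spams, unlabeled, 0.2)
print(model.classify(features('cheap cash now')))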
Example #4
def main():
	positive_featuresets = list(map(features, matches))
	unlabeled_featuresets = list(map(features, nomatches))
	classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, unlabeled_featuresets)
	cursor.rewind()
	events = []
	for doc in cursor:
		body = sent_tokenize(doc['body'])
		for sent in body:
			blob = TextBlob(sent)
			if (classifier.classify(features(blob))) is True:
				# to print what is written in the finalout file
				#print ( '"subreddit_id":' + json.dumps(doc['subreddit_id']) + ',"features":' + json.dumps(commonfeatures(str(blob))))
				events.append('{ "subreddit_id":' + json.dumps(doc['subreddit_id']) + ',"features":' + json.dumps(commonfeatures(str(blob))) + '}')
				
				
	# join the per-sentence JSON objects into one valid JSON array
	f = open(finaloutput, 'w')
	f.write('[' + ','.join(events) + ']')
	f.write('\n')
	f.close()
Example #5
File: train.py Project: AkarshES/Readless
def train():
    """Train the clasifier"""
    words = {}

    for user in User.objects.all():
        print "Training for User " + str(user.id),
        for subscription in user.subscriptions:
            interested_titles_list = []
            unlabeled_titles_list = []
            for article in Article.objects(interested_users=user.id, feed_id=subscription.feed_id):
                interested_titles_list.append(article.features.title)
                unlabeled_titles_list.append(article.features.title)
            for article in Article.objects(uninterested_users=user.id, feed_id=subscription.feed_id):
                unlabeled_titles_list.append(article.features.title)
            words = map(get_words_in_title, interested_titles_list)
            print interested_titles_list
            classifier = PositiveNaiveBayesClassifier.train(words, map(get_words_in_title, unlabeled_titles_list))
            subscription.classifier_object = pickle.dumps(classifier)
        try:
            user.save()
        except Exception as e:
            print "Failed: %s" % e
        print "Classifier Saved"
Example #6
       Function: Gets the article-only version of the URL using Instapaper.
       Extracts the text in the article and removes any non-alphanumeric characters in the text.
       Returns a featureset dict of the words in the article at the URL.'''
    html_data = BeautifulSoup(urllib.urlopen(
                     "http://www.instapaper.com/m?%s" % urllib.urlencode({'u': url})).read())  # URL-encode the url to pass it to Instapaper
    html_data = html_data.find("body")      # Use only the contents of the HTML <body> tag; avoids Javascript being treated as text.
    pattern = re.compile('[\W_ ]+')         # Compile regex for non-alphanumeric characters and spaces (for multiword strings).
    words = html_data.findAll(text=True)    # text=True extracts only the text inside <body>
    word_list = []                          # Stores the list of words
    for word in words[30:]:                 # Skip redundant content from Instapaper Mobilizer headers
        for w in word.split(" "):           # Split on space for multiword strings
            wd = pattern.sub('', w.lower()) # Replace non-alphanumeric characters with ''
            if len(wd) > 1:
                word_list.append(wd)        # Exclude strings shorter than 2 characters
    filtered_words = [w for w in word_list if w not in nltk.corpus.stopwords.words('english')]
    return dict((word, True) for word in filtered_words)

if __name__ == '__main__':
    print get_article_snippet("sduhfuihsejdsddsfsdfsdf<p>njksnn</p><a>snjkksfbksdbf</a>ksjdfn",15)
    parser = argparse.ArgumentParser(description = "Accepts a URL")
    parser.add_argument("--url",dest = "url") #Extracts url from command line, if available
    urls = parser.parse_args()
    if urls.url is None:
        print ("No URL Specified")
        sys.exit()
    positive_examples = map(get_words_in_article, ['http://www.engadget.com/2012/11/16/htc-droid-dna-review/', 'http://www.engadget.com/2012/10/08/samsung-galaxy-note-ii-review/', 'http://www.engadget.com/2012/11/16/htc-desire-x-review/', 'http://www.engadget.com/2012/11/16/htc-desire-x-review/'])
    misc_examples = map(get_words_in_article, ['http://www.engadget.com/2012/11/16/sharp-aquos-sh930w-reviewed-early-in-russia-with-1080p-display/', 'http://www.engadget.com/2012/11/15/nexus-4-backordered/', 'http://www.engadget.com/2012/11/16/htc-windows-phone-8x-t-mobile-review/', 'http://www.engadget.com/2012/11/16/distro-issue-66-holiday-gift-guide/', 'http://www.engadget.com/2012/10/29/apple-macbook-pro-with-retina-display-review-13-inch/', 'http://www.engadget.com/2012/11/17/skydrive-sdk-net-windows-phone-8/'])
    classifier = PositiveNaiveBayesClassifier.train(positive_examples,misc_examples)

    print classifier.classify(get_words_in_article(urls.url))
    classifier.show_most_informative_features()
Example #7
def getClassifier(tweetfile,cfg):
    degreesToUse = cfg['NLPnGrams']
    print "DEBOOOOO", degreesToUse, type(degreesToUse)
    classMode = cfg['NLPMode'].replace('-',' ').replace('_',' ')
    shortClass = classMode.replace(' ','').lower()
    loadNeeded = True 

    if 'NLPTEST' not in cfg.keys():
        degreeString = '-'.join([str(degree) for degree in degreesToUse])
        pickleFile = 'nlpTrainers/'+tweetfile.replace('.csv','.'+shortClass+degreeString+'.pickle')
        if isfile(pickleFile):
            print "Loading pickled", shortClass, "classifier"
            fileIn = open(pickleFile)
            classifier = cPickle.load(fileIn)
            fileIn.close()
            loadNeeded = False

    if loadNeeded:
        if 'NLPTEST'in cfg.keys():
            content = prepText(tweetfile)
            categorized = prepClassifications(content)
            NGrammized = collectNGrams(categorized,degreesToUse,cfg)
        else:
            print "Loading content & preparing text"
            content = prepText(loadFile(tweetfile))
            print "Categorizing contents"
            categorized = prepClassifications(content)
            print "Deriving NGrams of length(s)", degreesToUse
            NGrammized = collectNGrams(categorized,degreesToUse,cfg)
            print "Compiling Results"
        readyToSend = []
        allCats = [str(key) for key in NGrammized.keys()]
        for category in allCats:
            readyToSend += NGrammized[category]
            
        print "Attempting Classification by mode", classMode, degreesToUse
        if classMode == 'naive bayes':
            from nltk.classify import NaiveBayesClassifier
            classifier = {'class':NaiveBayesClassifier.train(readyToSend),'mode':'nb'}
        elif classMode == 'positive naive bayes':
            from nltk.classify import PositiveNaiveBayesClassifier
            classifier = {'class':PositiveNaiveBayesClassifier.train(readyToSend),'mode':'pnb'}
        elif classMode == 'max ent':
            #import nltk.classify
            #from sklearn.linear_model import LogisticRegression
            #from nltk.classify import SklearnClassifier
            #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'}
            from nltk.classify import MaxentClassifier
            classifier = {'class':MaxentClassifier.train(readyToSend,algorithm='iis'),'mode':'me'}
        elif classMode == 'decision tree':
            from nltk.classify import DecisionTreeClassifier
            classifier = {'class':DecisionTreeClassifier.train(readyToSend),'mode':'dt'}
        elif classMode == 'svm':
            if "SVMOrder" in cfg.keys():
                priority =  cfg['SVMOrder']
            else:
                priority =  "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210"
            if type(priority) is str:
                priority = list(priority)
            priority = [entry for entry in priority if entry in allCats]
            preppedSVM = prepSVMAll(readyToSend,priority,allCats,cfg)
            classifier = {'class':preppedSVM,'mode':'svm','priority':priority}
        else:
            from nltk.classify import NaiveBayesClassifier
            classifier = {'class':NaiveBayesClassifier.train(readyToSend),'mode':'nb'}
        
        if 'NLPTEST' not in cfg.keys():
            print "Pickling Classifier"
            fileOut = open(pickleFile, 'wb')
            cPickle.dump(classifier, fileOut)
            fileOut.close() 
              
    if 'NLPTEST' not in cfg.keys():
        if classMode != 'svm':
            classifier['class'].show_most_informative_features(n=150)
        """else:
		for key in classifier['class'].keys():
			print classifier
			print classifier.keys()
			classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))"""
    
    return classifier
Example #8
def get_data(path, label):
    examples = glob.glob(path)
    to_text = lambda fname: features(open(fname).read())
    return map(to_text, examples)

def features(sentence):
    words = sentence.lower().split()
    return dict((w, True) for w in words)

print "Extracting relevant and irrelevant examples..."
relevant_examples = get_data("data/relevant/*", "relevant")
irrelevant_examples = get_data("data/irrelevant/*", "irrelevant")

print "Creating training set..."
featuresets = relevant_examples + irrelevant_examples
print "Featuresets: " + str(len(featuresets))

N = 65000
train_set, test_set = featuresets[N:], featuresets[:N]
print "Train set: " + str(len(train_set))
print "Test set: " + str(len(test_set))

print "Training in progress..."
classifier = PositiveNaiveBayesClassifier.train(irrelevant_examples, featuresets)
print "Finished training!"

classifier.show_most_informative_features()
# accuracy = nltk.classify.util.accuracy(classifier, test_set)
# print "Accuracy: " + str(accuracy)
Example #9
File: indix_tc.py Project: akdeepak/Indix
def features(sentence):
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)

data_425_sentences = topicList
various_sentences = [ 'The President did not comment',
                       'I lost the keys',
                       'The team won the game',
                       'Sara has two kids',
                       'The ball went off the court',
                       'They had the ball for the whole game',
                       'The show is over' ]

data_425_featuresets = list(map(features, data_425_sentences))
unlabeled_featuresets = list(map(features, various_sentences))
classifier = PositiveNaiveBayesClassifier.train(data_425_featuresets,
                                                unlabeled_featuresets)
                                                 
classifier.classify(features('The cat is on the table'))                                                 
classifier.classify(features('sata cable'))

#############################################################

c_filename =  "C:\\DEEPAK\\INDIX\\classification_blind_set_corrected\\classification_blind_set_corrected.tsv"

c_df = pd.read_csv(c_filename ,sep="\t",low_memory = False)
c_data = defaultdict(list)


for c_row in c_df.itertuples():
    c_data[c_row[2]].append(c_row[1])
Example #10
positive_featuresets = map(features, sports_sentences)

print '\n ' 'positive_featuresets' ' full list: \n', positive_featuresets
print '\n positive_featuresets:'
for ii in positive_featuresets:
    print 'answer:', ii

# unlabeled_featuresets - A list of featuresets whose label is unknown.
unlabeled_featuresets = map(features, various_sentences)

print '\n unlabeled_featuresets:'
for ii in unlabeled_featuresets:
    print 'answer:', ii

# To train, pass in a list of 'true' dictionaries for POS and for NEG
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
                                                unlabeled_featuresets)

# Is the following sentence about sports?
print '\n', 'The cat is on the table --', classifier.classify(
    features('The cat is on the table'))

# What about this one?
print 'My team lost the game --', classifier.classify(
    features('My team lost the game'))

# Output

#  positive_featuresets full list:
# [{'the': True, 'dominated': True, 'game': True, 'team': True}, {'the': True, 'ball': True, 'lost': True, 'they': True}, {'the': True, 'was': True, 'game': True, 'intense': True}, {'the': True, 'ball': True, 'goalkeeper': True, 'catched': True}, {'the': True, 'other': True, 'controlled': True, 'ball': True, 'team': True}]

#  positive_featuresets:
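Examples #10, #12, and #13 are all cut off before their setup. For reference, a self-contained version of this sports demo, reconstructed from the PositiveNaiveBayesClassifier example in the NLTK documentation (which the featuresets printed above match):

from nltk.classify import PositiveNaiveBayesClassifier

sports_sentences = ['The team dominated the game',
                    'They lost the ball',
                    'The game was intense',
                    'The goalkeeper catched the ball',
                    'The other team controlled the ball']
various_sentences = ['The President did not comment',
                     'I lost the keys',
                     'The team won the game',
                     'Sara has two kids',
                     'The ball went off the court',
                     'They had the ball for the whole game',
                     'The show is over']

def features(sentence):
    # Plain bag-of-words featureset, matching the output printed above
    return dict((word, True) for word in sentence.lower().split())

positive_featuresets = [features(s) for s in sports_sentences]
unlabeled_featuresets = [features(s) for s in various_sentences]
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
                                                unlabeled_featuresets)
print(classifier.classify(features('The cat is on the table')))  # False
print(classifier.classify(features('My team lost the game')))    # True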
Example #11
import os
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def features(sentence):
	words = sentence.lower().split()
	return dict(('contains(%s)' % w, True) for w in words)

corpusdir = './text'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
# raw() returns one string (so map() would iterate over characters);
# build sentence strings from the corpus reader instead
positive_featuresets = list(map(features, (' '.join(s) for s in newcorpus.sents('comp.txt'))))
unlabeled_featuresets = list(map(features, (' '.join(s) for s in newcorpus.sents('animal.txt'))))
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, 
	unlabeled_featuresets, .3)
print classifier.classify(features('.'))
Example #12
                     'I lost the keys',
                     'The team won the game',
                     'Sara has two kids',
                     'The ball went off the court',
                     'They had the ball for the whole game',
                     'The show is over'
                     ]
                    
def features(sentence):
    sentence = ' '.join([word for word in sentence.split() if word.lower() not in cachedStopWords])
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)
    
positive_featuresets = list(map(features, sports_sentences))
unlabeled_featuresets = list(map(features, various_sentences))
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, unlabeled_featuresets)

positive_sentiments = list(map(features, posTrainFeatures))
negative_sentiments = list(map(features, negTrainFeatures))
sentimentClassifier = PositiveNaiveBayesClassifier.train(positive_sentiments, negative_sentiments)

#print (classifier.classify(features('The cat is on the table')))
#print (classifier.classify(features('My team lost the game')))

referenceSets = collections.defaultdict(set)
testSets = collections.defaultdict(set)
positives = 0
negatives = 0
tp = 0
tn = 0
fp = 0
Example #13
def main():
        positive_featuresets = list(map(features, sports_sentences))
        unlabeled_featuresets = list(map(features, various_sentences))
        classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,unlabeled_featuresets)
        print classifier.classify(features('My team lost the game'))
Example #14
def positive_naive_bayes(pos_cursor,unlabeled_cursor):
	"""send over entire documents"""
	return PositiveNaiveBayesClassifier.train(list(map(feature, pos_cursor)), list(map(feature, unlabeled_cursor)))
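Example #14 only wraps the training call. A hypothetical usage sketch, assuming feature is a bag-of-words helper and that any two iterables of raw documents can stand in for the Mongo cursors:

def feature(doc):
    # Assumed helper: bag-of-words over the whole document
    return dict((w, True) for w in doc.lower().split())

pos_cursor = iter(['troops crossed the border', 'shelling near the front line'])
unlabeled_cursor = iter(['the match ended in a draw', 'troops paraded downtown'])

nvb = positive_naive_bayes(pos_cursor, unlabeled_cursor)
print(nvb.classify(feature('troops near the border')))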
Example #15
def getClassifier(tweetfile, cfg):
    degreesToUse = cfg['NLPnGrams']
    print "DEBOOOOO", degreesToUse, type(degreesToUse)
    classMode = cfg['NLPMode'].replace('-', ' ').replace('_', ' ')
    shortClass = classMode.replace(' ', '').lower()
    loadNeeded = True

    if 'NLPTEST' not in cfg.keys():
        degreeString = '-'.join([str(degree) for degree in degreesToUse])
        pickleFile = 'nlpTrainers/' + tweetfile.replace(
            '.csv', '.' + shortClass + degreeString + '.pickle')
        if isfile(pickleFile):
            print "Loading pickled", shortClass, "classifier"
            fileIn = open(pickleFile)
            classifier = cPickle.load(fileIn)
            fileIn.close()
            loadNeeded = False

    if loadNeeded:
        if 'NLPTEST' in cfg.keys():
            content = prepText(tweetfile)
            categorized = prepClassifications(content)
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
        else:
            print "Loading content & preparing text"
            content = prepText(loadFile(tweetfile))
            print "Categorizing contents"
            categorized = prepClassifications(content)
            print "Deriving NGrams of length(s)", degreesToUse
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
            print "Compiling Results"
        readyToSend = []
        allCats = [str(key) for key in NGrammized.keys()]
        for category in allCats:
            readyToSend += NGrammized[category]

        print "Attempting Classification by mode", classMode, degreesToUse
        if classMode == 'naive bayes':
            from nltk.classify import NaiveBayesClassifier
            classifier = {
                'class': NaiveBayesClassifier.train(readyToSend),
                'mode': 'nb'
            }
        elif classMode == 'positive naive bayes':
            from nltk.classify import PositiveNaiveBayesClassifier
            classifier = {
                'class': PositiveNaiveBayesClassifier.train(readyToSend),
                'mode': 'pnb'
            }
        elif classMode == 'max ent':
            #import nltk.classify
            #from sklearn.linear_model import LogisticRegression
            #from nltk.classify import SklearnClassifier
            #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'}
            from nltk.classify import MaxentClassifier
            classifier = {
                'class': MaxentClassifier.train(readyToSend, algorithm='iis'),
                'mode': 'me'
            }
        elif classMode == 'decision tree':
            from nltk.classify import DecisionTreeClassifier
            classifier = {
                'class': DecisionTreeClassifier.train(readyToSend),
                'mode': 'dt'
            }
        elif classMode == 'svm':
            if "SVMOrder" in cfg.keys():
                priority = cfg['SVMOrder']
            else:
                priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210"
            if type(priority) is str:
                priority = list(priority)
            priority = [entry for entry in priority if entry in allCats]
            preppedSVM = prepSVMAll(readyToSend, priority, allCats, cfg)
            classifier = {
                'class': preppedSVM,
                'mode': 'svm',
                'priority': priority
            }
        else:
            from nltk.classify import NaiveBayesClassifier
            classifier = {
                'class': NaiveBayesClassifier.train(readyToSend),
                'mode': 'nb'
            }

        if 'NLPTEST' not in cfg.keys():
            print "Pickling Classifier"
            fileOut = open(pickleFile, 'wb')
            cPickle.dump(classifier, fileOut)
            fileOut.close()

    if 'NLPTEST' not in cfg.keys():
        if classMode != 'svm':
            classifier['class'].show_most_informative_features(n=150)
        """else:
		for key in classifier['class'].keys():
			print classifier		
			print classifier.keys()
			classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))"""

    return classifier
Example #16
def features(sentence):
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)


data_425_sentences = topicList
various_sentences = [
    'The President did not comment', 'I lost the keys',
    'The team won the game', 'Sara has two kids',
    'The ball went off the court', 'They had the ball for the whole game',
    'The show is over'
]

data_425_featuresets = list(map(features, data_425_sentences))
unlabeled_featuresets = list(map(features, various_sentences))

classifier = PositiveNaiveBayesClassifier.train(data_425_featuresets,
                                                unlabeled_featuresets)

classifier.classify(features('The cat is on the table'))
classifier.classify(features('sata cable'))

#############################################################


def c_read_fileData(dataFormatter):
    c_data = defaultdict(list)
    for row in dataFormatter.itertuples():
        c_data[row[2]].append(row[1])
    text = c_data[425]
    print(text)
    return c_data
Example #17
train_unknown_responses = unknown_responses.loc[:, ['2_x']]
positive = train_pos_responses['2_x'].tolist()
unlabelled = train_unknown_responses['2_x'].tolist()


def create_features(text):
    # Remove all the punctuations.
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)
    words = text.lower().split()
    # Create Bag of words.
    dictionary_words = dict(
        ('contains(%s)' % w, True) for w in words if len(w) > 2)
    return dictionary_words


pos_features = list(map(create_features, positive))
unknown_features = list(map(create_features, unlabelled))

# Learn the model just based on positive Naive Bayes Classifier.
classifier = PositiveNaiveBayesClassifier.train(pos_features, unknown_features)
#print(classifier.classify(create_features()))

# collect one predicted label per tip, then attach them as a new column
tip_classes = []
for tip in tips.iloc[:, 2].tolist():
    try:
        tip_classes.append(classifier.classify(create_features(tip)))
    except AttributeError:
        tip_classes.append(None)
tips["class"] = tip_classes
print(tips)
print(np.unique(tips.loc[:, 'class']))
Example #18
if __name__ == '__main__':
    print get_article_snippet(
        "sduhfuihsejdsddsfsdfsdf<p>njksnn</p><a>snjkksfbksdbf</a>ksjdfn", 15)
    parser = argparse.ArgumentParser(description="Accepts a URL")
    parser.add_argument(
        "--url", dest="url")  #Extracts url from command line, if available
    urls = parser.parse_args()
    if urls.url is None:
        print("No URL Specified")
        sys.exit()
    positive_examples = map(get_words_in_article, [
        'http://www.engadget.com/2012/11/16/htc-droid-dna-review/',
        'http://www.engadget.com/2012/10/08/samsung-galaxy-note-ii-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/',
        'http://www.engadget.com/2012/11/16/htc-desire-x-review/'
    ])
    misc_examples = map(get_words_in_article, [
        'http://www.engadget.com/2012/11/16/sharp-aquos-sh930w-reviewed-early-in-russia-with-1080p-display/',
        'http://www.engadget.com/2012/11/15/nexus-4-backordered/',
        'http://www.engadget.com/2012/11/16/htc-windows-phone-8x-t-mobile-review/',
        'http://www.engadget.com/2012/11/16/distro-issue-66-holiday-gift-guide/',
        'http://www.engadget.com/2012/10/29/apple-macbook-pro-with-retina-display-review-13-inch/',
        'http://www.engadget.com/2012/11/17/skydrive-sdk-net-windows-phone-8/'
    ])
    classifier = PositiveNaiveBayesClassifier.train(positive_examples,
                                                    misc_examples)

    print classifier.classify(get_words_in_article(urls.url))
    classifier.show_most_informative_features()