Example No. 1
File: Util.py Project: hseran/IAR
# Module-level imports assumed for these Util.py snippets:
import random
from Review import Review

def shuffleReviews(input_file, output_file):
    """Shuffle the reviews read from input_file and write them to output_file."""
    reviewList = Review.readReviewsFromXML(input_file)
    if reviewList is None or len(reviewList) == 0:
        print "No reviews in input file"
        return

    random.shuffle(reviewList)
    Review.serializeToXML(reviewList, output_file)
Example No. 2
File: Util.py Project: hseran/IAR
def countLabeledReviews(file):
    """Count and print the number of reviews that already carry a polarity label."""
    reviewList = Review.readReviewsFromXML(file)
    count = 0
    for review in reviewList:
        if review.getReviewPolarity().strip() != '':
            count += 1
    print count
Example No. 3
File: Util.py Project: hseran/IAR
def seperateByRating(input_file, output_dir):
    """Split reviews by star rating: 5.0 -> positive, 1.0 and 2.0 -> negative, rest -> medium (unlabeled)."""
    reviewList = Review.readReviewsFromXML(input_file)
    high5 = []
    low1 = []
    medium = []
    low2 = []
    for review in reviewList:
        if str(review.getReviewRating()) == '5.0':
            review.setPolarity('1')
            review.setConfidence('1')
            high5.append(review)
        elif str(review.getReviewRating()) == '1.0':
            review.setPolarity('-1')
            review.setConfidence('1')
            low1.append(review)
        elif str(review.getReviewRating()) == '2.0':
            review.setPolarity('-1')
            review.setConfidence('1')
            low2.append(review)
        else:
            medium.append(review)

    Review.serializeToXML(high5, output_dir + "/high.xml")
    Review.serializeToXML(low1, output_dir + "/low1.xml")
    Review.serializeToXML(low2, output_dir + "/low2.xml")
    Review.serializeToXML(medium, output_dir + "/medium.xml")
    print "5: " + str(len(high5))
    print "1: " + str(len(low1))
    print "2: " + str(len(low2))
Example No. 4
File: Util.py Project: hseran/IAR
def separateLabeledAndUnlabeled(file, output_dir):
    """Split reviews into labeled and unlabeled sets and write each to its own XML file."""
    reviewList = Review.readReviewsFromXML(file)
    labeled = []
    unlabeled = []

    for review in reviewList:
        if review.getReviewPolarity().strip() != '':
            labeled.append(review)
        else:
            unlabeled.append(review)
    Review.serializeToXML(labeled, output_dir + "/labeled-neu.xml")
    Review.serializeToXML(unlabeled, output_dir + "/unlabeled-neu.xml")
Example No. 5
File: Util.py Project: hseran/IAR
def siftReviewsByPolarity(input_file, output_file, polarity):
    '''
    output_file will contain all reviews from input_file
    except the ones labeled with the given polarity
    '''
    reviewList = Review.readReviewsFromXML(input_file)
    if reviewList is None or len(reviewList) == 0:
        print "No reviews in input file"
        return

    outList = []
    for review in reviewList:
        if str(review.getReviewPolarity()) == str(polarity):
            continue
        outList.append(review)
    Review.serializeToXML(outList, output_file)
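A hypothetical call (the file names are illustrative, not taken from the project) showing how the filter above might be used to drop reviews already labeled as neutral ('0'):

# Illustrative usage only; paths are made up
siftReviewsByPolarity('../all-reviews.xml', '../non-neutral-reviews.xml', '0')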
Example No. 6
File: Util.py Project: hseran/IAR
def labelTestFile(xml_test_file, weka_csv_results_file, output_file):
    '''
    Takes the reviews XML file and the Weka results in CSV format,
    applies polarity and confidence to each review, and writes the
    resulting XML to output_file.
    '''
    reviewList = Review.readReviewsFromXML(xml_test_file)

    results_file = open(weka_csv_results_file, "r")
    resultsList = results_file.readlines()
    results_file.close()

    if len(reviewList) != len(resultsList):
        print 'Different number of reviews and results'
        return

    counter = 0
    for review in reviewList:
        result = resultsList[counter].strip().split(',')
        counter += 1
        review.setPolarity(Util.getNumericLabel(result[2].split(':')[1]))
        review.setConfidence('0.9' if result[4] == '1' else result[4])

    print 'writing labelled test data to ' + output_file
    Review.serializeToXML(reviewList, output_file)
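A minimal sketch of the per-line parsing above, assuming the usual Weka prediction CSV layout (inst#, actual, predicted, error, prediction confidence); the sample row is made up for illustration and is not from the project:

# Made-up sample row in the assumed Weka layout:
# inst#, actual, predicted, error, prediction-confidence
sample = "1,1:-1,2:1,+,0.85"
fields = sample.strip().split(',')
predicted_label = fields[2].split(':')[1]               # '1', the value passed to Util.getNumericLabel above
confidence = '0.9' if fields[4] == '1' else fields[4]   # fully confident predictions are capped at 0.9
print predicted_label, confidence                       # prints: 1 0.85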
Example No. 7
from Review import Review

if __name__ == '__main__':
    # Round-trip: read the reviews and serialize them back out unchanged.
    Review.serializeToXML(Review.readReviewsFromXML('../low-rating-reviews.xml'), '../test.xml')
Example No. 8
File: Util.py Project: hseran/IAR
def printCount(file):
    reviewList = Review.readReviewsFromXML(file)
    print len(reviewList)
Example No. 9
            return
        
        trainingData = {}
        validationData = {}
        self.generateKFolds(outdir, trainingData, validationData)        
        
        for i in range(1,self.k+1):
            print "generating features for fold " + str(i)          
            
            trainCorpus = Corpus(trainingData[str(i)], lemmatizer, POS_tagging)
            '''this dictionary will be used for both training and validation data'''
            dictionary = Dictionary(trainCorpus)
            generator = FeatureGenerator(trainCorpus, dictionary, outdir + '/train' + str(i) + '.csv', 
                                         weightScheme, includeRating, includeDocLength)
            generator.generateFeatures()
            
            validCorpus = Corpus(validationData[str(i)], lemmatizer, POS_tagging)
            generator = FeatureGenerator(validCorpus, dictionary, outdir + '/valid' + str(i) + '.csv', 
                                         weightScheme, includeRating, includeDocLength)
            generator.generateFeatures()

            
if __name__ == '__main__':
    reviews = Review.readReviewsFromXML("../old-training-shuffled.xml")
    lemmatizer = nltk.WordNetLemmatizer()
    print 'reviews: ' + str(len(reviews))
    kfg = KFoldGenerator(reviews, 10)
    kfg.generateFolds("../kfolds/linearSVM/unigrams-lemma-POS-tf-no-stop", lemmatizer, 
                      POS_tagging = True, weightScheme = FeatureWeight.TF,
                      includeRating=False, includeDocLength=False)
Example No. 10
'''
Created on Apr 15, 2013

This is where we invoke modules to generate features for training and test data

@author: naresh
'''
from Review import Review
import nltk
from Corpus import Corpus
from Dictionary import Dictionary
from FeatureGenerator import FeatureGenerator
from FeatureWeight import FeatureWeight

if __name__ == '__main__':
    trainingreviews = Review.readReviewsFromXML("../old-training-shuffled.xml")
    lemmatizer = nltk.WordNetLemmatizer()
    testReviews = Review.readReviewsFromXML("../old-test-data.xml")
    
    trainCorpus = Corpus(trainingreviews, lemmatizer, POS_tagging = True)
    '''this dictionary will be used for both training and test data'''
    dictionary = Dictionary(trainCorpus)
    generator = FeatureGenerator(trainCorpus, dictionary, '../train.csv', weightScheme= FeatureWeight.TFIDF)
    generator.generateFeatures()
    
    testCorpus = Corpus(testReviews, lemmatizer, POS_tagging = True)
    generator = FeatureGenerator(testCorpus, dictionary, '../test.csv',weightScheme= FeatureWeight.TFIDF)
    generator.generateFeatures()
Example No. 11
	
	#output files
	unlabeled_file='../test-data.xml'
	labeled_file='../traning-data.xml'
	
	#lists for labeled and unlabeled reviews
	unlabeled=[]
	labeled=[]
	labeled_high=[]
	labeled_low=[]
	labeled_mid=[]

	for each_file in review_files:	
		
		#call the readReviewsFromXML
		reviews = Review.readReviewsFromXML(each_file)

		for each_review in reviews:

			# convert reviewId to int, which helps when sorting before saving to disk
			each_review.reviewId = int(each_review.getReviewId())

			# append each review to the appropriate list based on its polarity label
		
			if (each_review.getReviewPolarity() == ""):
				unlabeled.append(each_review)
			elif (each_review.getReviewPolarity() == "-1"):
				labeled_low.append(each_review)
			elif(each_review.getReviewPolarity() == "0"):
				labeled_mid.append(each_review)