def shuffleReviews(input_file, output_file):
    '''
    Write the reviews of input_file to output_file in random order.

    The reviews are loaded with Review.readReviewsFromXML, shuffled in
    place with random.shuffle and written with Review.serializeToXML.

    If the loader returns None or an empty list, a message is printed and
    the function returns without writing output_file.
    '''
    reviewList = Review.readReviewsFromXML(input_file)
    if not reviewList:
        print("No reviews in input file")
        # Stop here: random.shuffle(None) would raise a TypeError.
        return
    random.shuffle(reviewList)
    Review.serializeToXML(reviewList, output_file)
def separateLabeledAndUnlabeled(file, output_dir):
    '''
    Split the reviews of an XML file by whether they carry a polarity.

    A review whose polarity, once stripped of surrounding whitespace, is
    not the empty string goes to output_dir + "/labeled-neu.xml"; every
    other review goes to output_dir + "/unlabeled-neu.xml".
    '''
    with_label = []
    without_label = []
    for entry in Review.readReviewsFromXML(file):
        has_polarity = entry.getReviewPolarity().strip() != ''
        bucket = with_label if has_polarity else without_label
        bucket.append(entry)
    Review.serializeToXML(with_label, output_dir + "/labeled-neu.xml")
    Review.serializeToXML(without_label, output_dir + "/unlabeled-neu.xml")
def seperateByRating(input_file, output_dir):
    '''
    Bucket the reviews of input_file by rating and write one XML per bucket.

    Ratings are compared through their str() form:
      '5.0' -> polarity '1',  confidence '1', written to /high.xml
      '1.0' -> polarity '-1', confidence '1', written to /low1.xml
      '2.0' -> polarity '-1', confidence '1', written to /low2.xml
      anything else is left unmodified and written to /medium.xml

    The sizes of the three labeled buckets are printed at the end.
    '''
    five_star, one_star, two_star, others = [], [], [], []
    # rating text -> (polarity to assign, bucket that collects the review)
    routing = {
        '5.0': ('1', five_star),
        '1.0': ('-1', one_star),
        '2.0': ('-1', two_star),
    }
    for entry in Review.readReviewsFromXML(input_file):
        target = routing.get(str(entry.getReviewRating()))
        if target is None:
            others.append(entry)
            continue
        polarity, bucket = target
        entry.setPolarity(polarity)
        entry.setConfidence('1')
        bucket.append(entry)

    outputs = (
        (five_star, "/high.xml"),
        (one_star, "/low1.xml"),
        (two_star, "/low2.xml"),
        (others, "/medium.xml"),
    )
    for bucket, file_name in outputs:
        Review.serializeToXML(bucket, output_dir + file_name)

    print("5: " + str(len(five_star)))
    print("1: " + str(len(one_star)))
    print("2: " + str(len(two_star)))
def siftReviewsByPolarity(input_file, output_file, polarity):
    '''
    output_file will contain all reviews from input_file other than
    the ones labeled as polarity.

    Polarities are compared through their str() form, so polarity may be
    passed as a number or as a string.

    If input_file yields no reviews (None or an empty list), a message is
    printed and the function returns without writing output_file.
    '''
    reviewList = Review.readReviewsFromXML(input_file)
    if not reviewList:
        print("No reviews in input file")
        # Stop here: iterating over None would raise a TypeError.
        return
    unwanted = str(polarity)
    outList = [review for review in reviewList
               if str(review.getReviewPolarity()) != unwanted]
    Review.serializeToXML(outList, output_file)
def generateKFolds(self, location="./", trainingData=None, validationData=None):
    '''
    Write every cross-validation fold of self.reviews to XML files.

    For fold number i (starting at 1) yielded by
    self.k_fold_cross_validation(), the training part is written to
    location + "/train<i>.xml" and the validation part to
    location + "/valid<i>.xml".

    location       -- output directory; falls back to "./" when it is not
                      an existing directory.
    trainingData   -- optional dict filled in place with str(i) -> training
                      fold; a fresh dict is used when omitted.
    validationData -- optional dict filled in place with str(i) ->
                      validation fold; a fresh dict is used when omitted.

    Prints a message and returns early when self.reviews is None or empty.
    '''
    if not self.reviews:
        print('No data to work on')
        return

    # Fresh dicts per call: a {} default would be shared by every call
    # that omits the argument and would keep growing.
    if trainingData is None:
        trainingData = {}
    if validationData is None:
        validationData = {}

    import os
    if not os.path.isdir(location):
        location = "./"

    folds = enumerate(self.k_fold_cross_validation(), 1)
    for fold_number, (training, validation) in folds:
        key = str(fold_number)
        Review.serializeToXML(training, location + "/train" + key + ".xml")
        Review.serializeToXML(validation, location + "/valid" + key + ".xml")
        trainingData[key] = training
        validationData[key] = validation
def labelTestFile(xml_test_file, weka_csv_results_file, output_file):
    '''
    this method takes the reviews xml file, weka results in CSV format
    applies polarity and confidence to reviews and write the resultant xml
    to output_file

    The n-th line of the results file is applied to the n-th review:
      * field 2 (comma separated, zero based) is split on ':' and its
        second part is converted with Util.getNumericLabel to become the
        polarity;
      * field 4 becomes the confidence, except that a value of exactly
        '1' is stored as '0.9'.

    If the number of reviews differs from the number of result lines, a
    message is printed and nothing is written.
    '''
    reviewList = Review.readReviewsFromXML(xml_test_file)
    # Read inside a with-block so the file handle is always closed.
    with open(weka_csv_results_file, "r") as results_file:
        resultsList = results_file.readlines()

    if len(reviewList) != len(resultsList):
        print('Different number of reviews and results')
        return

    for review, line in zip(reviewList, resultsList):
        result = line.strip().split(',')
        review.setPolarity(Util.getNumericLabel(result[2].split(':')[1]))
        review.setConfidence('0.9' if result[4] == '1' else result[4])

    print('writing labelled test data to ' + output_file)
    Review.serializeToXML(reviewList, output_file)
from Review import Review

if __name__ == '__main__':
    # Read the low-rating reviews and write them straight back out to
    # ../test.xml.
    loaded_reviews = Review.readReviewsFromXML('../low-rating-reviews.xml')
    Review.serializeToXML(loaded_reviews, '../test.xml')
reviewObj.setReviewRating(rating) #global variables file_location = "../reviews.xml" if __name__ == '__main__': hotel_url= ['http://www.yelp.com/biz/morimoto-new-york'] #variable to loop through pages i=0 #variable to assign doc id to reviews objCount = 1 #we store our reviews temporarily in this before we write to file buffer = [] #crawl in a loop while(i<=1000): web_page= parse(hotel_url[0]+'?start='+str(i)).getroot() for review in web_page.cssselect('#bizReviews .externalReview'): obj = Review(objCount) myparser(obj, review) buffer.append(obj) objCount += 1 i=i+40 print objCount #if we crawl too fast, site comes up with captcha time.sleep(10) Review.serializeToXML(buffer, file_location)
elif (each_review.getReviewPolarity() == "-1"): labeled_low.append(each_review) elif(each_review.getReviewPolarity() == "0"): labeled_mid.append(each_review) elif(each_review.getReviewPolarity() == "1"): labeled_high.append(each_review) #reviews from 3 files are appended to lists, but they are unsorted. Hence sorting them here. unlabeled.sort(key = operator.attrgetter('reviewId')) labeled_low.sort(key = operator.attrgetter('reviewId')) labeled_mid.sort(key = operator.attrgetter('reviewId')) labeled_high.sort(key = operator.attrgetter('reviewId')) labeled.extend(labeled_low) labeled.extend(labeled_mid) labeled.extend(labeled_high) #Saving to disk Review.serializeToXML(unlabeled,unlabeled_file) Review.serializeToXML(labeled,labeled_file) #Comment if not required. print "Labeled-low: " +str(len(labeled_low)) print "Labeled-mid: " +str(len(labeled_mid)) print "Labeled-high: " +str(len(labeled_high)) print "Total Labeled :"+str(len(labeled)) print "Unlabeled :"+str(len(unlabeled))