def option3_top_n_pos_common_phrases(business_ids): limit = int(input("Please enter the number of n: ")) list_of_common_phrase = list() for business_id in business_ids: review = Review("", "", "", "", "", "", "", "", "", "") # geit all positive reviews for one business_id positive_reviews = review.get_all_reviews_by_business_id_and_sentiment( business_id.business_id, 'pos') list_of_words = list() word_dictionary = dict() #split review text into a list of words for each_review in positive_reviews: words = split_text(each_review.text) list_of_words.extend(words) #construction of common phrases dictionary for positive reviews word_dictionary = dictionary_construction(list_of_words, word_dictionary) # sort the dictionary by value, turn it into a tuple sort_word_dictionary = sorted(word_dictionary.items(), key=lambda x: x[1], reverse=True) index = 0 # print result for word in sort_word_dictionary: if index >= limit: break print word[0], word[1] index += 1
def option6_neg_examples(business_ids): for business_id in business_ids: review = Review("", "", "", "", "", "", "", "", "", "") negative_reviews = review.get_reviews_by_business_id_and_sentiment( business_id.business_id, 'neg', 5) index = 1 for negative_review in negative_reviews: print str(index) + ".", negative_review.text.rstrip() index += 1
def option5_pos_examples(business_ids): for business_id in business_ids: review = Review("", "", "", "", "", "", "", "", "", "") positive_reviews = review.get_reviews_by_business_id_and_sentiment( business_id.business_id, 'pos', 5) index = 1 for positive_review in positive_reviews: print str(index) + ".", positive_review.text.rstrip( ) # adjust the format index += 1
def getReviewList(self, asin): reviewsResult = [] totalPage=0 baseUrl = 'http://www.amazon.com/product-reviews/' html = MyHtml.getHtml(baseUrl + asin, ffhead=True) pageNumberList = html.xpath(".//ul[@class='a-pagination']//li") if len(pageNumberList)>0: countOfListItems=len(pageNumberList) indexOfPageTotal=countOfListItems-2 listitem=pageNumberList[indexOfPageTotal] totalPage=int(listitem.xpath('.//a')[0].text.strip()) else: totalPage=1 sortBy = 'recent' pageNumber = 1 foundKnownReview = False while pageNumber <= totalPage: url = baseUrl + asin + \ '?pageNumber={}&sortBy={}'.format(str(pageNumber), sortBy) html = MyHtml.getHtml(url, ffhead=True) isCount=html.xpath('.//div[@id="cm_cr-product_info"]/div/div[1]/div[2]/span') if isCount is not None and len(isCount)>0: countOfReviews=int(isCount[0].text.strip()) else: countofReviews=0 print countOfReviews,"count of reviews" if countOfReviews>0: divWholeReviewList = html.xpath('.//div[@id="cm_cr-review_list"]')[0] divReviewList = divWholeReviewList.xpath('./div[@id]') for divReview in divReviewList: reviewID = divReview.attrib['id'] if (reviewID in reviewsResult) or (reviewID in self.reviewList): foundKnownReview = True break aReview = Review.Review() aReview.reviewID = reviewID Review.saveReview(review=aReview) reviewsResult.append(reviewID) # end of for pageNumber += 1 if foundKnownReview: break else: pageNumber+=1 # end of while return reviewsResult[::-1]
def printReviews(self, ct=None):
    """Write every review in self.reviewList to the day's review.txt.

    If no CommonTool *ct* is supplied, one is created together with the
    output file (header row written on first creation, append mode
    otherwise) and the file is closed at the end; otherwise the caller's
    *ct* and its file are used as-is.
    """
    fout = ''
    flag = False
    if ct is None:
        # flag marks that we own the output file and must close it below
        flag = 1
        if not os.path.exists("../data/" + self.fetchDate.isoformat() +
                              "/review/"):
            os.makedirs(
                "../data/" + self.fetchDate.isoformat() + "/review/")
        ct = CommonTool()
        if not os.path.exists("../data/" + self.fetchDate.isoformat() +
                              "/review/review.txt"):
            fout = open(
                "../data/" + self.fetchDate.isoformat() +
                "/review/review.txt", "w")
            print "writing reviews in new file"
            ct.setFout(fout)
            # brand-new file: start with the table header line
            ct.writeln(Review.Review.tableHead)
        else:
            fout = open(
                "../data/" + self.fetchDate.isoformat() +
                "/review/review.txt", "a")
            ct.setFout(fout)
    for reviewID in self.reviewList:
        print "printing reviews"
        review = Review.loadReview(reviewID, self.fetchDate)
        review.printData(ct)
        review.insertReviewDataIntoTable()
    if flag:
        fout.close()
def CekBayar(ListReview, GameName):
    """Simulate the payment-code e-mail check before writing a review.

    Shows a fake purchase e-mail containing a random payment code, then
    loops until the user types a valid code (proceeds to the game review
    prompt) or enters "2" to cancel (returns None).
    """
    codes = [
        "XXYY12", "BQQPR1", "MMR20K", "MMR900", "MMR1DI", "MMRR69", "69A420"
    ]
    while True:
        # display the simulated e-mail with a randomly chosen code
        print(
            "\n>>>Misalkan user mendapatkan email yang berisi kode pembayarannya<<<\n"
        )
        print(">>>Di dalam email user:<<<\n")
        print("Thank you for purchase, congratulations on your new game.")
        print("We hope you can enjoy the game to its fulless.")
        print("This is your payment code:",
              codes[random.randint(0, len(codes) - 1)])
        print("\n>>>Akhir dari email user<<<\n")
        time.sleep(5)
        print("Please check your email for your payment code.")
        code = input(
            "\nPlease enter the payment code (enter 2 to cancel): ").upper()
        # "2" cancels; it can never collide with a real code
        if code == "2":
            return None
        if code in codes:
            Review.PendapatGame(ListReview, GameName)
            break
        print("The payment code is wrong.")
def getReviews(self): print 'getReviews' if self.numOfReviews > 0: if self.reviewList is None: self.reviewList = self.getReviewList(self.asin) else: for review in self.getReviewList(self.asin): if review not in self.reviewList: self.reviewList.append(review) else: self.reviewList = [] print self.reviewList self.calcReviewTopPercent() Review.getReviews(self.asin, numOfReviews=self.numOfReviews, bookPublishDate=self.publishDate)
def parse_metadata_and_vocabulary(self, file, dir):
    """Parse a raw review dump into metadata records and a vocabulary.

    Reads *file* line by line accumulating the product, helpfulness,
    score and text fields; a blank line terminates one review, which is
    written to the metadata file under *dir* and its words collected.
    Afterwards duplicates are removed and the vocabulary is written.
    """
    words_list = list()
    words_indexes_list = list()
    with open(file, 'r', encoding='ISO-8859-1') as data:
        # iterate the file lazily instead of materialising readlines()
        for line in data:
            # raw strings (r'...') fix the invalid-escape warning the
            # original '^\s*$' pattern produced
            if re.match(r'^product', line):
                prod = line.split(': ')[1].strip('\n')
            elif re.match(r'^review/helpfulness', line):
                helpfulness = line.split(': ')[1].strip('\n')
            elif re.match(r'^review/score', line):
                score = line.split(': ')[1].strip('\n')
            elif re.match(r'^review/text', line):
                text = line.split(': ')[1].strip('\n')
            elif re.match(r'^\s*$', line):
                # blank line: the current review record is complete
                product = p.Product(prod)
                review = r.Review(product.get_product_id(), helpfulness,
                                  score, text)
                # write review to the metadata file
                self.write_to_metadata(review, dir)
                words_list.extend(review.get_text())
    print('Created Metadata file')
    words_list = self.remove_duplicates(words_list)
    # append words and indexes to the lists
    wordlist_asa_string = self.append_to_wordlonglist(
        words_list, words_indexes_list)
    # write data to vocabulary
    self.write_to_vocabulary(wordlist_asa_string, words_indexes_list, dir)
def books(id):
    """Book detail page: show existing reviews, accept a new one on POST.

    *id* is the book ISBN.  Requires a logged-in session (KeyError on
    the session lookup falls through to the login page).  A user who
    has already rated the book is shown an error instead of a second
    insert.
    """
    try:
        # raises KeyError when not logged in -> handled by except below
        user = session["email"]
        result = db.session.query(Books).filter(Books.isbn == id).first()
        data = Review.query.all()
        r = Review.query.filter_by(isbn=id).all()
        if request.method == 'POST':
            reviewdata = Review(id, user, request.form['comment'],
                                request.form['rating'])
            # NOTE(review): `user` is rebound here from the e-mail string
            # to the existing Review row (or None) — confusing reuse of
            # the name, also passed to the template below.
            user = Review.query.filter_by(email=user, isbn=id).first()
            data = Review.query.all()
            if user is not None:
                print("User had already given rating.")
                var1 = "Error: User had already given rating."
                return render_template("Book_Page.html", user=user,
                                       Book_details=result, var1=var1,
                                       comments=r, allreviewdata=data)
            db.session.add(reviewdata)
            db.session.commit()
            var1 = "Review submitted"
            flash(var1)
            return redirect(url_for('books', id=id))
        else:
            return render_template("Book_Page.html", user=user,
                                   Book_details=result, comments=r,
                                   allreviewdata=data)
    except Exception as e:
        print(e)
        var1 = "You must log in to view the homepage"
        return render_template("reg.html", var1=var1)
def review(self):
    """Let the user pick a saved file and open it in a ReviewWindow."""
    caption = self.MultiLang.find_correct_word("Open File")
    fileName, _ = QFileDialog.getOpenFileName(
        self, caption, "saves", "All Files (*);;Python Files (*.py)")
    # guard clause: the dialog returns an empty string on cancel
    if not fileName:
        return
    print(fileName)
    window = Review.ReviewWindow(self, fileName)
    window.show()
def make_review_list(path):
    """Bucket the reviews parsed from *path* into five lists keyed by
    their overall rating (1-5)."""
    buckets = [[] for _ in range(5)]
    for record in parse(path):
        review = Review.Review(record['reviewText'], record['overall'])
        # rating 1..5 maps to bucket index 0..4
        buckets[review.get_overall() - 1].append(review)
    return buckets
def guess_review(text):
    """Classify *text* by a k-nearest-neighbour majority vote over the
    stored review points."""
    global review_points_list, n
    # lazily load the reference data on first use
    if not review_points_list[0]:
        load_tuple_data()
    query_points = Review.Query(text).get_points()
    neighbors = find_nearest_neighbors(query_points, review_points_list, n)
    return get_most_occurring(neighbors)
def getReviews():
    """Load every review document from the MongoDB `reviews` collection
    into the module-level reviewList."""
    db = getConnection()
    collection = db.reviews
    uniqueId = 0
    for document in collection.find():
        reviewList.append(rv.Review(document))
        # counts processed documents (value is otherwise unused)
        uniqueId += 1
def jsonToObject(self, rev):
    """Convert a review JSON dict *rev* into a Review and store it.

    Expects the keys: product, date, email, rating, reviewText.
    """
    product = rev["product"]
    date = rev["date"]
    email = rev["email"]
    rating = rev["rating"]
    text = rev["reviewText"]
    # BUG FIX: the original passed the undefined name `reviewText`
    # (NameError at runtime); the extracted local is `text`.
    revObj = Review.Review(product, date, email, rating, text)
    self.reviewList.append(revObj)
def solveReviewSummary(self):
    """Record whether the product page has a quotes table and persist
    every review referenced from it."""
    quotesTable = self.html.xpath(".//table[@id='quotesTable']")
    if len(quotesTable) > 0:
        self.hasQuoteTable = 1
        quotes = quotesTable[0].xpath(
            "./td/a[@class='a-link-normal a-text-normal a-color-base']")
        for quote in quotes:
            # the review ID is the 5th path segment of the quote's href
            words = quote.attrib["href"].split("/")
            # print words
            reviewID = words[4]
            # print reviewID
            #try:
            review = Review.loadReview(reviewID, self.fetchDate)
            #review.setQuoteTable(1)
            Review.saveReview(review)
            #except :
            # NOTE(review): this stderr write runs for EVERY quote because
            # the try/except above is commented out — presumably it was
            # meant to fire only on a lookup failure; confirm intent.
            sys.stderr.write(
                'quotesTable review not found: {0} {1}\n'.format(
                    self.asin, reviewID))
    else:
        self.hasQuoteTable = 0
def submitreview():
    """API endpoint: add a review for a book, return the book + reviews.

    Expects a JSON body with `rating`, `comment` and `email`, plus an
    `isbn` query argument.  Responds 400 on a non-JSON request, 404 on
    unknown ISBN, 409 when the user already reviewed the book, 500 on
    database failure, and 200 with the full book record on success.
    """
    if not request.is_json:
        message = "Invalid request format"
        return jsonify(message), 400
    isbn = request.args.get('isbn')
    try:
        result = db.session.query(Books).filter(Books.isbn == isbn).first()
    except Exception:
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed
        message = "Please Try again Later"
        return jsonify(message), 500
    if result is None:
        message = "Please enter valid ISBN"
        return jsonify(message), 404
    # parse the JSON body once instead of three times
    body = request.get_json()
    rating = body['rating']
    comment = body['comment']
    email = body['email']
    user = Review.query.filter_by(email=email, isbn=isbn).first()
    if user is not None:
        message = "Sorry you can't review this book again"
        return jsonify(message), 409
    reviewdata = Review(isbn, email, comment, rating)
    try:
        db.session.add(reviewdata)
        db.session.commit()
    except Exception:
        message = "Please Try Again "
        return jsonify(message), 500
    # re-read the book and its (now updated) reviews for the response
    try:
        result = db.session.query(Books).filter(Books.isbn == isbn).first()
        r = Review.query.filter_by(isbn=isbn).all()
    except Exception:
        message = "Please Try again Later"
        return jsonify(message), 500
    print(result)
    if result is None:
        message = "No book found"
        return jsonify(message), 404
    reviews = [{
        "email": review.email,
        "rating": review.rating,
        "comment": review.comment,
    } for review in r]
    response = {
        'isbn': result.isbn,
        'title': result.title,
        'author': result.author,
        'year': result.year,
        'reviews': reviews,
    }
    return jsonify(response), 200
def parse_reviews_HTML(reviews, data):
    """Extract structured fields from interview-review soup elements.

    For each review element pull the date, role, up to three outcome
    badges (offer / experience / difficulty), the process length, the
    interview details and the interview questions, then build a Review
    and append it to *data*.  Missing fields default to "-".
    Returns *data*.
    """
    for review in reviews:
        # defaults used when a field is absent from the markup
        length = "-"
        gotOffer = "-"
        experience = "-"
        difficulty = "-"
        date = review.find("time", { "class" : "date" }).getText().strip()
        role = review.find("span", { "class" : "reviewer"}).getText().strip()
        outcomes = review.find_all("div", { "class" : ["tightLt", "col"] })
        if (len(outcomes) > 0):
            gotOffer = outcomes[0].find("span", { "class" : "middle"}).getText().strip()
        #endif
        if (len(outcomes) > 1):
            experience = outcomes[1].find("span", { "class" : "middle"}).getText().strip()
        #endif
        if (len(outcomes) > 2):
            difficulty = outcomes[2].find("span", { "class" : "middle"}).getText().strip()
        #endif
        appDetails = review.find("p", { "class" : "applicationDetails"})
        if (appDetails):
            appDetails = appDetails.getText().strip()
            # pull the duration out of "... took N weeks." phrasing
            tookFormat = appDetails.find("took ")
            if (tookFormat >= 0):
                start = appDetails.find("took ") + 5
                length = appDetails[start :].split('.', 1)[0]
            #endif
        else:
            appDetails = "-"
        #endif
        details = review.find("p", { "class" : "interviewDetails"})
        if (details):
            s = details.find("span", { "class" : ["link", "moreLink"] })
            if (s):
                s.extract()  # Remove the "Show More" text and link if it exists
            #endif
            details = details.getText().strip()
        #endif
        questions = []
        qs = review.find_all("span", { "class" : "interviewQuestion"})
        if (qs):
            for q in qs:
                s = q.find("span", { "class" : ["link", "moreLink"] })
                if (s):
                    s.extract()  # Remove the "Show More" text and link if it exists
                #endif
                questions.append(q.getText().encode('utf-8').strip())
            #endfor
        #endif
        r = Review.Review(date, role, gotOffer, experience, difficulty,
                          length, details, questions)
        data.append(r)
    #endfor
    return data
def findReviewRank(asin, reviewID, fetchDate=None):
    """Return the position of *reviewID* in the pickled book's review list.

    *fetchDate* defaults to today's date (resolved at call time).  Falls
    back to Review.getReviewIndex when no pickled book file exists for
    that date.
    """
    # BUG FIX: the original signature was `fetchDate=date.today()`, which
    # is evaluated ONCE at import time — in a long-running process the
    # default silently goes stale.  Resolve "today" per call instead.
    if fetchDate is None:
        fetchDate = date.today()
    try:
        path = getPath(fetchDate)
        with open(path + asin, "rb") as fin:
            book = pickle.load(fin)
        return book.reviewList.index(reviewID)
    except IOError:
        return Review.getReviewIndex(asin, reviewID)
def option4_top_n_neg_common_phrases(business_ids): limit = int(input("Please enter the number of n: ")) list_of_common_phrase = list() for business_id in business_ids: review = Review("", "", "", "", "", "", "", "", "", "") negative_reviews = review.get_all_reviews_by_business_id_and_sentiment( business_id.business_id, 'neg') list_of_words = list() word_dictionary = dict() for each_review in negative_reviews: words = split_text(each_review.text) list_of_words.extend(words) word_dictionary = dictionary_construction(list_of_words, word_dictionary) sort_word_dictionary = sorted(word_dictionary.items(), key=lambda x: x[1], reverse=True) index = 0 for word in sort_word_dictionary: if index >= limit: break print word[0], word[1] index += 1
def calcReviewTopPercent(self):
    """Tag each review with its chronological rank plus a top-1/5/10%
    flag, then persist it via Review.saveReview."""
    print 'calcReviewTopPercent'
    import math
    maxRank = len(self.reviewList)
    # rank cut-offs; ceil so small lists still get a non-empty top bucket
    top1Percent = int(math.ceil(maxRank / 100.0))
    top5Percent = int(math.ceil(maxRank / 20.0))
    top10Percent = int(math.ceil(maxRank / 10.0))
    for rank, reviewID in enumerate(self.reviewList):
        aReview = Review.loadReview(reviewID)
        try:
            aReview.timeRank = rank
        except AttributeError, e:
            # a malformed review object is fatal: log and abort
            sys.stderr.write(str(e) + '\n')
            sys.stderr.write('reviewID' + reviewID)
            sys.exit(-1)
        # NOTE(review): rank is 0-based while the cut-offs are 1-based
        # counts, so each bucket admits one extra review — confirm
        # whether `<` was intended instead of `<=`.
        if rank <= top1Percent:
            aReview.top1Percent = 1
        elif rank <= top5Percent:
            aReview.top5Percent = 1
        elif rank <= top10Percent:
            aReview.top10Percent = 1
        Review.saveReview(aReview)
def parse(file, dir):
    """Parse a raw review dump into metadata, index and binary files.

    Reads *file* line by line accumulating product/helpfulness/score/
    text fields; a blank line terminates one review, which is written
    to the metadata file under *dir*, its words collected and mapped to
    document ids.  Afterwards the vocabulary/index files and the
    word<->document binary files are produced.
    """
    words_list = list()
    words_indexes_list = list()
    wordid_docid = list()
    with open(file, 'r') as data:
        # iterate the file lazily instead of materialising readlines()
        for line in data:
            # raw strings (r'...') fix the invalid-escape warning the
            # original '^\s*$' pattern produced
            if re.match(r'^product', line):
                prod = line.split(': ')[1].strip('\n')
            elif re.match(r'^review/helpfulness', line):
                helpfulness = line.split(': ')[1].strip('\n')
            elif re.match(r'^review/score', line):
                score = line.split(': ')[1].strip('\n')
            elif re.match(r'^review/text', line):
                text = line.split(': ')[1].strip('\n')
            elif re.match(r'^\s*$', line):
                # blank line: the current review record is complete
                product = p.Product(prod)
                review = r.Review(product.get_product_id(), helpfulness,
                                  score, text)
                # delete doubles and add to wordlist.
                words_list.extend(review.get_text())
                # write review to the metadata file
                write_to_metadata(review, dir)
                # set tuples with the words and their docids
                wordid_docid.extend(make_word_docid_tuples(review))
    print('Created Metadata file')
    words_list = remove_duplicates(words_list)
    # append words and indexes to the lists
    wordlist_asa_string = append_to_wordlonglist(words_list,
                                                words_indexes_list)
    # write words and indexes to the index file
    write_to_index_file(wordlist_asa_string, words_indexes_list, dir)
    # replace the words with word ids
    wordid_docid = make_wordid_docid_tuples(wordlist_asa_string,
                                            words_indexes_list,
                                            wordid_docid)
    # create the binary files:
    create_word_to_docs_binary_file(wordid_docid, dir)
    create_doc_to_words_binary_file(wordid_docid, dir)
def parse_metadata_and_vocabulary(self, file, dir):
    """Parse a raw review dump into a CSV metadata file and a vocabulary.

    Reads *file* line by line accumulating product/helpfulness/score/
    text fields; a blank line terminates one review, whose metadata row
    is written to reviews_metadata.csv under *dir* and whose words are
    collected.  Afterwards duplicates are removed and the vocabulary is
    written.
    """
    words_list = list()
    words_indexes_list = list()
    # `with` guarantees the csv file is closed even if parsing raises
    # (the original leaked the handle on any exception)
    with open(dir + 'reviews_metadata.csv', 'w', newline='') as metadata:
        writer = csv.writer(metadata, delimiter=' ', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        with open(file, 'r', encoding='ISO-8859-1') as data:
            # iterate the file lazily instead of readlines()
            for line in data:
                # raw strings fix the invalid-escape warning of '^\s*$'
                if re.match(r'^product', line):
                    prod = line.split(': ')[1].strip('\n')
                elif re.match(r'^review/helpfulness', line):
                    helpfulness = line.split(': ')[1].strip('\n')
                elif re.match(r'^review/score', line):
                    score = line.split(': ')[1].strip('\n')
                elif re.match(r'^review/text', line):
                    text = line.split(': ')[1].strip('\n')
                elif re.match(r'^\s*$', line):
                    # blank line: the current review record is complete
                    product = p.Product(prod)
                    review = r.Review(product.get_product_id(),
                                      helpfulness, score, text)
                    # write review to the metadata file
                    writer.writerow([
                        review.id, review.product_id, review.helpfulness,
                        review.score, review.num_of_words
                    ])
                    words_list.extend(review.get_text_without_doubles())
    print('Created Metadata file')
    words_list = self.remove_duplicates(words_list)
    # append words and indexes to the lists
    wordlist_asa_string = self.append_to_wordlonglist(
        words_list, words_indexes_list)
    # write data to vocabulary
    self.write_to_vocabulary(wordlist_asa_string, words_indexes_list, dir)
def __init__(self, idRole):
    """Review menu for a logged-in user.

    *idRole* is a sequence whose first element is the user id supplied
    by the login module.  Loops presenting the add/view options until
    the user declines to continue or enters an invalid choice.
    """
    # fetch userid from login module
    self.Userid = idRole[0]
    while True:
        # take the user's menu choice
        userAction = input(
            "1. Add Review/Ratings\n2. View reviews/ratings of a movie\n")
        # BUG FIX: the original tested `userAction in string.digits`,
        # which is a SUBSTRING test — it accepted the empty string and
        # sent multi-digit input to the wrong error branch.  isdigit()
        # is the correct "all characters are digits" check.
        if userAction.isdigit():
            if userAction == "1":
                # fetch movie reviews for the user and allow the user
                # to input/update reviews
                self.checkReview(self.Userid)
                moreActions = input("Do you still want to continue?[y]: ")
                if moreActions.lower() == "y":
                    continue
                else:
                    break
            elif userAction == "2":
                # fetch movie reviews in visually attractive format
                # (graph or tabular forms)
                review = Review.Review()
                moreActions = input("Do you still want to continue?[y]: ")
                if moreActions.lower() == "y":
                    continue
                else:
                    break
            else:
                # a digit other than 1/2 ends the session
                print(
                    "No such action available! Please login again to enter the correct input."
                )
                break
        else:
            print("Special characters or alphabets are not for input.")
            continue
def solveReview(self):
    """Dump every known review to the day's review.txt, registering any
    newly seen reviewers in self.reviewerList."""
    with open("../data/" + self.fetchDate.isoformat() + "/review.txt", "w")\
            as fout:
        ct = CommonTool(fout)
        # header row first
        fout.write(Review.Review.tableHead)
        fout.write('\n')
        fout.flush()
        # ct.writeln(Review.Review.tableHead)
        for i, reviewID in enumerate(self.reviewList):
            # skip blank placeholder entries
            if reviewID == '':
                continue
            print 'solve review {0} of {1}: {2}'.format(
                i, len(self.reviewList), reviewID)
            review = Review.loadReview(reviewID)
            review.printData(ct)
            if review.isNew:
                print 'isNewReview'
                # first time this reviewer appears: remember them
                if review.reviewerID not in self.reviewerList:
                    self.reviewerList.append(review.reviewerID)
                review.isNew = False
def createUserReview():
    """Create a review for the logged-in user and store it in GoFit.db.

    On a valid POST the review dict is loaded from the shelve (missing
    key means first-ever review), the new review stored under the
    user's id, and the user redirected to their profile; otherwise the
    form is re-rendered.
    """
    create_user_form = CreateUserReview(request.form)
    if request.method == 'POST' and create_user_form.validate():
        review_dict = {}
        db = shelve.open('GoFit.db', 'c')
        try:
            review_dict = db['Review']
        except KeyError:
            # narrowed from a bare `except:` — a missing 'Review' key
            # just means nothing has been stored yet
            print("Error in retrieving Users from Gofit.db.")
        review = Review.userReview(session['user_id'],
                                   session['first_name'],
                                   session['last_name'],
                                   review=create_user_form.review.data)
        review_dict[review.get_user_id()] = review
        db['Review'] = review_dict
        db.close()
        return redirect(url_for('to_profile'))
    return render_template("userReview.html", form=create_user_form)
def create_user_rev():
    """Create a review from the standalone form and store it in GoFit.db.

    On a valid POST the review dict is loaded from the shelve (missing
    key means first-ever review), the new review stored under its id,
    and the user redirected to the review page; otherwise the form is
    re-rendered.
    """
    create_user_form = CreateReview(request.form)
    if request.method == 'POST' and create_user_form.validate():
        staff_dict = {}
        db = shelve.open('GoFit.db', 'c')
        try:
            staff_dict = db['Review']
        except KeyError:
            # narrowed from a bare `except:` — a missing 'Review' key
            # just means nothing has been stored yet
            print("Error in retrieving Users from Gofit.db.")
        staff = Review.userReview(first_name=create_user_form.first_name.data,
                                  last_name=create_user_form.last_name.data,
                                  review=create_user_form.review.data)
        staff_dict[staff.get_user_id()] = staff
        db['Review'] = staff_dict
        db.close()
        return redirect(url_for('review'))
    return render_template('createReview.html', form=create_user_form)
def main(): review = Review("", "", "", "", "", "", "", "", "", "") # this will call your constructor # get 50 results from databases reviews = review.get_reviews("50") for a_review in reviews: #construction of sentiment table blob = TextBlob(a_review.text, analyzer=NaiveBayesAnalyzer()) text_sentiment = blob.sentiment text_sentiment = text_sentiment[ SENTIMENT_TYPE] #text_sentiment will either be pos (for positive) or neg (for negative) #here is where we create a Sentiment object sentiment = Sentiment(a_review.review_id, a_review.business_id, text_sentiment) sentiment.insert( ) #this will insert information into the sentiment table #construction of review_stats table review_stats = Review_stats("", "", "", "", "") review_stats.insert( ) #insert positive and negative reviews' information to review_stats table #construction of common_phrases table business = Business("", "", "", "", "", "", "", "", "", "", "") business_ids = business.get_all_business_ids( ) #acquire all business_ids from sentiment table Reviews = list() words = list() list_of_words = list() word_dictionary = dict() for business_id in business_ids: review = Review("", "", "", "", "", "", business_id.business_id, "", "", "") Reviews = review.get_reviews_by_business_id( ) # get all reviews by business_id insert_words(business_id.business_id, Reviews, word_dictionary) # insert data into common_phrases table
def movie_extractor(directory_path, aspects, set_of_movies, k):
    """Instantiate Movie objects from per-review XML files and score
    their aspects.

    Walks *directory_path*; each directory (except the reviews root)
    becomes a Movie named after its last four path characters.  Every
    file inside is parsed into a Review, then each aspect's score is
    accumulated as count * KL-relevance * review sentiment, averaged
    over the movie's review count.

    Args:
        directory_path (str): directory holding the single-review xml files.
        aspects (dict): maps each aspect to its KL relevance number.
        set_of_movies (list): receives the instantiated movie objects.
        k (int): number of top aspects to keep per movie.

    Returns:
        None.
    """
    for dirpath, dirnames, files in os.walk(directory_path):
        # skip the ".../reviews" root itself; movie dirs end in an id
        if dirpath[-4:] != "iews":
            new_movie = Movie.Movie(dirpath[-4:])
            for file in files:
                file_name = os.path.join(dirpath, file)
                new_review = Review.Review(file_name)
                new_review.review_extractor(aspects)
                #new_review.review_extractor(aspects)
                new_movie.reviews.append(new_review)
                new_movie.number_of_reviews += 1
            for review in new_movie.reviews:
                # aspects_in_review is a counter; it contains
                # the counting of aspects in the current review:
                aspects_in_review = review.occurrences_of_each_aspect
                # iterate through each aspect present in the review:
                for current_aspect in aspects_in_review:
                    # number of occurrences of the current aspect
                    current_aspect_count = aspects_in_review[current_aspect]
                    # the KL value of the current aspect
                    aspect_KL_rel = aspects[current_aspect]
                    review_sent = review.average_sentiment
                    current_aspect_score = current_aspect_count * aspect_KL_rel * review_sent
                    # debug trace for the "film" aspect only
                    if current_aspect == "film":
                        with open("debug_score" + new_movie.xml + ".txt",
                                  'a+', encoding="utf-8") as f:
                            print(current_aspect, file=f)
                            print("current_aspect_count: ",
                                  current_aspect_count, file=f)
                            print("aspect_KL_rel: ", aspect_KL_rel, file=f)
                            print("review_sent: ", review_sent, file=f)
                            print("current_aspect_score: ",
                                  current_aspect_score, file=f)
                            print("new_movie.number_of_reviews: ",
                                  new_movie.number_of_reviews, file=f)
                            print("------------", file=f)
                    # average the contribution over the review count
                    current_aspect_score = current_aspect_score / new_movie.number_of_reviews
                    #total = new_movie.temp_acumulator_aspect[current_aspect] + current_aspect_score
                    if current_aspect not in list(
                            new_movie.aspects_score.keys()):
                        new_movie.aspects_score[current_aspect] = 0
                    score_acumulator = new_movie.aspects_score[
                        current_aspect] + current_aspect_score
                    new_movie.aspects_score[current_aspect] = score_acumulator
            new_movie.top_k_aspects_evaluation(k)
            print(new_movie.aspects_score["film"])
            set_of_movies.append(new_movie)
def setUpClass(cls):
    """Load the first row of excel.csv as the fixture Review object."""
    cls.reader = csv.DictReader(open('excel.csv'))
    # next(reader) instead of reader.next(): works on both Python 2.6+
    # and Python 3 (the .next() method was removed in Python 3)
    cls.review = Review(next(cls.reader))
def main():
    """End-to-end pipeline: clean review text, select tf-idf features,
    then train and evaluate helpfulness-prediction regressions.

    Reads raw reviews from data.txt, keeps those with a non-zero
    helpfulness denominator, cleans stop words/punctuation, builds a
    Collection, computes df/tf-idf and relevant words, splits 90/10
    into train/test, scores each review by the mean tf-idf of its
    relevant words, and feeds the sorted scores to linear and SVR
    regressions.
    """
    # --------------------------------------------------------------- #
    # Cleaning
    # --------------------------------------------------------------- #
    # FIX: use a context manager — the original opened data.txt and
    # never closed it
    with open("data.txt", "r") as f:
        contents = f.readlines()
    nbOfDoc = 0
    stopWords = ('i', 'the', 'a', 'an', 'to', 'it', 'as', 'and', 'is',
                 'does', 'not', 'was', 'so', 'than', 'of', 'for', 'my',
                 'you', 'we', 'they', 'this', 'that', 'with', 'are',
                 'were', 'your', 'their', 'no', 'yes', 'or', 'them',
                 'did', 'had', 'will', 'may', 'mine', '', 's', 've',
                 'd', 'can', 'on', 'up', 'down', 'but', 'or', 'me',
                 'out', ',', 'if', 'by', "don't", "i've", 're', 'be',
                 'in', 'd', 'have', 'all', 'got', 'go', 'much', '.',
                 'on', 'one', 'should', 'have', 'these')
    collection = Collection()
    # reading all lines one by one
    for line in contents:
        # xy contains the 2 helpfulness scores
        xy = ExtractHelpful(line)
        # if not useless, we look at the line
        if xy[1] != "0":
            score = int(xy[0]) / int(xy[1])
            # the raw review is split on whitespace only, so clean the
            # strings of punctuation symbols and stop words
            rawReview = ExtractReview(line.lower())
            cleanedReview = CleanStopWord(rawReview, stopWords)
            cleanedReview = RemovePunct(cleanedReview, stopWords)
            cleanedReview = RemovePunct(cleanedReview, stopWords)
            # collection is a list of perfectly cleaned reviews and
            # their scores
            review = Review(cleanedReview, score)
            # FIX: `!= 0` instead of the original `is not 0` — identity
            # comparison with an int literal is implementation-defined
            # (SyntaxWarning on modern Python)
            if review.nbOfWords != 0:
                collection.AddReview(review)
                nbOfDoc = nbOfDoc + 1
    # --------------------------------------------------------------- #
    # Feature Selection
    # --------------------------------------------------------------- #
    # dft maps every word encountered in any document to the number of
    # documents that contain it
    dft = collection.SetDFT()
    print("\n dft : ")
    print(dft)
    tfidf = collection.SetTFIDF()
    print("\n tfidf : ")
    print(tfidf)
    relWordsScores = collection.SetRelevantWords(0.5)
    print("\n relevant words and scores : ")
    print(relWordsScores)
    relWords = collection.relWords
    print("\n relevant words only : ")
    print(relWords)
    # we now have all our sorted relevant words stocked in relWords
    # --------------------------------------------------------------- #
    # Training & Predictions
    # --------------------------------------------------------------- #
    allReviews = collection.GetListOfReviews()
    trainingColl = Collection()
    testColl = Collection()
    i = 1
    # split 90/10: every 10th review goes to the test collection
    for review in allReviews:
        if i % 10 == 0:
            testColl.AddReview(review)
        else:
            trainingColl.AddReview(review)
        i += 1
    print("\n test and train set are done")
    trainSorter = PriorityQueue()
    testSorter = PriorityQueue()
    # xTrain/xTest feed the regressions; the *List/y* variables feed
    # plotting and training targets
    xTrain = np.ndarray((1, trainingColl.nbOfReviews))
    xTest = np.ndarray((1, testColl.nbOfReviews))
    xTrainList = []
    yTrain = []
    xTestList = []
    yTest = []
    # for every review compute the mean tf-idf of its relevant words and
    # pair it with the review's helpfulness class; the priority queue
    # sorts the pairs by score
    for review in trainingColl.listOfReviews:
        nbOfRelWords = 0
        reviewClass = review.GetScore()
        reviewScore = 0
        for word in review.GetSetOfWords():
            if word in collection.relWords:
                nbOfRelWords += 1
                reviewScore += tfidf[word]
        # reviewScore /= review.nbOfWords
        if reviewScore != 0:
            reviewScore /= nbOfRelWords
        trainSorter.put((reviewScore, reviewClass))
    print("all training reviews are treated")
    for review in testColl.listOfReviews:
        nbOfRelWords = 0
        reviewClass = review.GetScore()
        reviewScore = 0
        for word in review.GetSetOfWords():
            if word in collection.relWords:
                nbOfRelWords += 1
                reviewScore += tfidf[word]
        # reviewScore /= review.nbOfWords
        if reviewScore != 0:
            reviewScore /= nbOfRelWords
        testSorter.put((reviewScore, reviewClass))
    print("all testing reviews are treated")
    # drain the queues in score order into the arrays/lists
    i = 0
    while not trainSorter.empty():
        info = trainSorter.get()
        xTrain[0][i] = info[0]
        xTrainList.append(info[0])
        yTrain.append(info[1])
        i += 1
    print("x and y built")
    j = 0
    while not testSorter.empty():
        info = testSorter.get()
        xTest[0][j] = info[0]
        xTestList.append(info[0])
        yTest.append(info[1])
        j += 1
    xTrain2 = np.reshape(xTrain, (-1, 1))
    xTest2 = np.reshape(xTest, (-1, 1))
    testLinRegPred = LinearRegPredictions(xTrain2, xTrainList, yTrain,
                                          xTest2, xTestList, yTest)
    testSvrPred = SVRPrediction(xTrain2, xTrainList, yTrain, xTest2,
                                xTestList, yTest)
    # --------------------------------------------------------------- #
    # Extra
    # --------------------------------------------------------------- #
    # xTrainExtra/xTestExtra will hold the average relevant-word scores
    # AND the word counts of each review
    xTrainExtra = np.ndarray((2, trainingColl.nbOfReviews))
    xTestExtra = np.ndarray((2, testColl.nbOfReviews))
    xTrainList = []
    yTrain = []
    xTestList = []
    yTest = []
def main():
    """CLI entry point: run the pipeline stages selected by the flags.

    Each flag triggers one stage (selection, counting, merging, debug
    collection, review UI, plotting, base data, homology filtering,
    feature extraction, classification); --test switches several stages
    to their test configuration files, --features picks the feature set
    for extraction/classification.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--select', action='store_true')
    parser.add_argument('-t', '--test', action='store_true')
    parser.add_argument('-c', '--count', action='store_true')
    parser.add_argument('-m', '--merge', '--combine', action='store_true')
    parser.add_argument('-e', '--extract', action='store_true')
    parser.add_argument('-d', '--debuginput', action='store_true')
    parser.add_argument('-r', '--review', action='store_true')
    parser.add_argument('--replace-debug', action='store_true')
    parser.add_argument('-p', '--plot-data', action='store_true')
    parser.add_argument('--base-data', action='store_true')
    parser.add_argument('--features', default='original')
    parser.add_argument('-f', '--homology-filter', action='store_true')
    parser.add_argument('-y', '--classify', action='store_true')
    parser.add_argument('--grid-search', action='store_true')
    parser.add_argument('--plot', action='store_true')
    parser.add_argument('--fit', action='store_true')
    parser.add_argument('--count-total-number-of-genes', action='store_true')
    args = parser.parse_args()
    if args.select:
        if args.test:
            selector = Selector("config/Test/selection_config.json")
        else:
            selector = Selector("config/selection_config.json")
        selector.select()
        selector.selected_to_folder()
    if args.count:
        if args.test:
            counter = Counter("config/Test/counter_config.json")
        else:
            counter = Counter("config/counter_config.json")
        counter.count_all_viruses()
    if args.merge:
        if args.test:
            combiner = Combiner("config/Test/combiner_config.json")
        else:
            combiner = Combiner("config/combiner_config.json")
        combiner.combine_all_viruses()
    if args.debuginput:
        debug_input_collector = DebugInfoCollector("config/debug_info_collector_config.json")
        if args.replace_debug:
            debug_input_collector.collect(True)
        else:
            debug_input_collector.collect()
    if args.review:
        # imported lazily so the review tool's dependencies are only
        # required when this stage actually runs
        import Review
        Review.run()
    if args.plot_data:
        data_plotter = DataPlotter("config/data_plotter_config.json")
        data_plotter.plot()
    if args.base_data:
        base_data = BaseData("config/base_data_config.json")
        base_data.create_data()
    if args.homology_filter:
        homology_filter = HomologyFilter('config/homology_filter.json')
        homology_filter.filter()
    if args.extract:
        feature_extractor = FeatureExtraction("config/feature_extraction_config.json")
        feature_extractor.extract(args.features)
    if args.count_total_number_of_genes:
        combiner = Combiner("config/combiner_config.json")
        combiner.print_number_of_genes()
    if args.classify:
        if args.grid_search:
            # hyper-parameter grids for the three classifier flavours;
            # the commented entries are alternative search spaces
            MLgrid = [
                {
                    "booster": ["gblinear"],
                    # "lambda": [0, 0.0001, 0.001],
                    "lambda": [0],
                    # "updater": ["shotgun", "coord_descent"],
                    "updater": ["coord_descent", "shotgun"],
                    # "feature_selector": ["cyclic", "shuffle", "random", "greedy", "thrifty"]
                    "feature_selector": ["shuffle"]
                }
                # {
                #     "booster": ["gbtree"],
                #     # "max_depth": range(3, 10, 2),
                #     # "min_child_weight": range(1, 6, 2)
                # }
            ]
            _1vsAgrid = [
                {
                    "estimator__booster": ["gblinear"],
                    "estimator__lambda": [0.1],
                    "estimator__updater": ["coord_descent"],
                    "estimator__feature_selector": ["shuffle"]
                },
                # {
                #     "estimator__booster": ["gbtree"],
                #     "estimator__max_depth": range(3, 10, 2),
                #     "estimator__min_child_weight": range(1, 6, 2)
                # }
            ]
            RRgrid = [
                {
                    "estimator__booster": ["gblinear"],
                    "estimator__lambda": [0.1],
                    "estimator__updater": ["coord_descent"],
                    "estimator__feature_selector": ["shuffle"]
                },
                # {
                #     "estimator__booster": ["gbtree"]
                #     # "estimator__max_depth": range(3, 10, 2),
                #     # "estimator__min_child_weight": range(1, 6, 2)
                # }
            ]
            classification = Classification('config/classification_config.json', args.features)
            classification.grid_search('ML', 'XGBoost', MLgrid, 200, 'no-pca')
        else:
            if args.fit:
                classification = Classification('config/classification_config.json', args.features)
                classification.fit_all()
            if args.plot:
                cp = ClassificationPlotter('config/classification_config.json', args.features)
                cp.plot_all()
def pushReviewbtnClicked(self):
    """Slot: open the Review dialog modally when the button is clicked."""
    review_dialog = Review()
    review_dialog.exec_()
def printData(self):
    """Dump one tab-separated row per review — joining review, reviewer
    and book fields — to dataAll.txt for the current fetch date."""
    with open("../data/" + self.fetchDate.isoformat() + "/dataAll.txt",
              "w") as fout:
        ct = CommonTool(fout)
        # header row first
        fout.write("\t".join(self.tableHeadList))
        fout.write('\n')
        fout.flush()
        for i, reviewID in enumerate(self.reviewList):
            # skip blank placeholder entries
            if reviewID == '':
                continue
            print 'solve review {0} of {1}: {2}'.format(
                i, len(self.reviewList), reviewID)
            review = Review.loadReview(reviewID)
            book = Book.loadBookByAsin(review.asin)
            reviewer = Reviewer.loadReviewer(review.reviewerID)
            # identity columns
            ct.write(reviewID)
            ct.write(review.asin)
            ct.write(review.reviewerID)
            # reviewer badge columns
            ct.write(reviewer.rName)
            ct.write(reviewer.tRev1)
            ct.write(reviewer.tRev10)
            ct.write(reviewer.tRev50)
            ct.write(reviewer.tRev100)
            ct.write(reviewer.tRev500)
            ct.write(reviewer.tRev1000)
            ct.write(reviewer.tRevHall)
            ct.write(reviewer.vVoice)
            # review columns
            ct.write(review.verified)
            ct.write(review.rate)
            ct.write(review.title)
            ct.write(review.date)
            ct.write(review.fetchDate)
            ct.write(review.reviewBookDate)
            ct.write(review.elapsedDate)
            ct.write(review.helpful)
            ct.write(review.total)
            ct.write(review.helpfulness)
            ct.write(review.helpfulRank)
            ct.write(review.timeRank)
            #ct.write(review.top1Percent)
            #ct.write(review.top5Percent)
            #ct.write(review.top10Percent)
            ct.write(review.description)
            ct.write(review.numOfComments)
            ct.write(review.comment)
            #ct.write(review.isQuoteTable)
            ct.write(review.lastReviewRank)
            # book columns
            ct.write(book.url)
            ct.write(book.tag)
            ct.write(book.allowPreview)
            ct.write(book.binding)
            ct.write(book.publishDate)
            ct.write(book.author)
            ct.write(book.authorInfo)
            ct.write(book.rate)
            ct.write(book.numOfReviews)
            ct.write(book.kindlePrice)
            ct.write(book.hardcoverPrice)
            ct.write(book.paperbackPrice)
            ct.write(book.bookDsc)
            ct.write(book.listPrice)
            ct.write(book.pages)
            ct.write(book.isbn10)
            ct.write(book.isbn13)
            ct.write(book.subrank)
            ct.write(book.hasEditorialReview)
            ct.write(book.editorialReview)
            #ct.write(book.hasQuoteTable)
            # reviewer profile columns
            ct.write(reviewer.email)
            ct.write(reviewer.webPage)
            ct.write(reviewer.hasPhoto)
            ct.write(reviewer.rNum)
            ct.write(reviewer.helpRate)
            ct.write(reviewer.hVote)
            ct.write(reviewer.tVote)
            ct.write(reviewer.avgRate)
            ct.write(reviewer.fRevTime)
            ct.write(reviewer.lRevTime)
            ct.write(reviewer.duration)
            # presence flags: 1 when the profile field is filled in
            if reviewer.rReal == "N/A":
                ct.write(0)
            else:
                ct.write(1)
            if reviewer.location == "N/A":
                ct.write(0)
            else:
                ct.write(1)
            if reviewer.aboutMe == "N/A":
                ct.write(0)
            else:
                ct.write(1)
            if reviewer.interest == "N/A":
                ct.write(0)
            else:
                ct.write(1)
            ct.write(review.fromFormat)
            # if review.fromFormat == "Hardcover":
            #     ct.write(0)
            # elif review.fromFormat == "Paperback":
            #     ct.write(1)
            # else:
            #     ct.write(2)
            # reviewer rank: 0 stands in for "N/A"
            if reviewer.rRank == "N/A":
                ct.write(0)
            else:
                ct.write(reviewer.rRank)
            # book rank is the last column, terminating the row
            ct.writeln(book.rank)