def median_approach(llimit,ulimit,isphrase,pathname):
    """Compute per-review proximity median vectors for the [llimit, ulimit) slice of
    positive and negative reviews, reduce them to one median vector per class, and
    dump the result to a JSON file."""
    posmedlist=[]
    negmedlist=[]
    medians=[]
    lpcount=0
    totalcount=ulimit-llimit
    cnt_var=0
    print '\nNo of +ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['pos'])[llimit:ulimit]:
        testmed=proximity_tagger.medianlist(movie_reviews.abspath(fid),isphrase,cnt_var,0,pathname)
        posmedlist.append(testmed)
        lpcount=lpcount+1
        cnt_var+=1
        print 'Training +ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'
    lpcount=0
    cnt_var=0
    print '\nNo of -ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['neg'])[llimit:ulimit]:
        testmed=proximity_tagger.medianlist(movie_reviews.abspath(fid),isphrase,cnt_var,1,pathname)
        negmedlist.append(testmed)
        lpcount=lpcount+1
        cnt_var+=1
        print 'Training -ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'
    # Column-wise medians across all positive and all negative reviews
    medians.append([numpy.median(x) for x in itertools.izip(*posmedlist)])
    medians.append([numpy.median(x) for x in itertools.izip(*negmedlist)])
    # os.path.join instead of a hard-coded backslash so the path also works outside Windows
    f = open(os.path.join('train_result','proximity_median_train_result_'+str(isphrase)),'w')
    json.dump(medians,f)
    f.close()
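# Illustrative usage sketch (the review slice, phrase flag, and tag-file path below
# are assumptions, not values fixed by this module):
#
#   median_approach(0, 100, 0, 'tag_files')
#
# The dumped per-class median vectors can then be reloaded; load_median_train_result
# is a hypothetical helper name.
def load_median_train_result(isphrase):
    import json
    import os
    f = open(os.path.join('train_result', 'proximity_median_train_result_' + str(isphrase)))
    pos_medians, neg_medians = json.load(f)
    f.close()
    return pos_medians, neg_medians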
def bins_svm_approach(llimit,ulimit,isphrase,pathname):
    """Build proximity bin feature vectors for the [llimit, ulimit) slice of positive
    and negative reviews and dump the (features, labels) training set to a JSON file
    for later SVM training."""
    posbinlist=[]
    negbinlist=[]
    trainingdata=[]
    trainingclass=[]
    bin_train_set=[]
    totalcount=ulimit-llimit
    lpcount=0
    cnt_var=0
    print '\nNo of +ve reviews scanned for training : '
    for fid in movie_reviews.fileids(categories=['pos'])[llimit:ulimit]:
        testbin=proximity_tagger.bin_list(movie_reviews.abspath(fid),isphrase,cnt_var,0,pathname)
        posbinlist.append(testbin)
        lpcount+=1
        cnt_var+=1
        print 'Scanning +ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'
    lpcount=0
    cnt_var=0
    print '\nNo of -ve reviews scanned for training : '
    for fid in movie_reviews.fileids(categories=['neg'])[llimit:ulimit]:
        testbin=proximity_tagger.bin_list(movie_reviews.abspath(fid),isphrase,cnt_var,1,pathname)
        negbinlist.append(testbin)
        lpcount+=1
        cnt_var+=1
        print 'Scanning -ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'
    # Positive reviews get class label 1
    lpcount=0
    totalcount=len(posbinlist)
    print '\nNo of +ve reviews trained : '
    trainingdata.extend(posbinlist)
    for i in range(totalcount):
        trainingclass.append(1)
        lpcount+=1
        print 'Training +ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'
    # Negative reviews get class label 0
    lpcount=0
    totalcount=len(negbinlist)
    print '\nNo of -ve reviews trained : '
    trainingdata.extend(negbinlist)
    for i in range(totalcount):
        trainingclass.append(0)
        lpcount+=1
        print 'Training -ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'
    bin_train_set.append(trainingdata)
    bin_train_set.append(trainingclass)
    # os.path.join instead of a hard-coded backslash so the path also works outside Windows
    f = open(os.path.join('train_result','proximity_bin_train_result_'+str(isphrase)),'w')
    json.dump(bin_train_set,f)
    f.close()
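# Illustrative sketch only: this function just dumps the (features, labels) pair.
# The SVM library used downstream is not defined here, so scikit-learn's SVC is an
# assumption, as is the helper name fit_bin_svm.
def fit_bin_svm(isphrase):
    import json
    import os
    from sklearn import svm
    f = open(os.path.join('train_result', 'proximity_bin_train_result_' + str(isphrase)))
    trainingdata, trainingclass = json.load(f)
    f.close()
    clf = svm.SVC()
    clf.fit(trainingdata, trainingclass)
    return clf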
def phrase_analysis_call(llimit,ulimit):
    """Run phrase analysis over the [llimit, ulimit) slice of the corpus and store the
    results in 'phrase_analysis_part_file', appending to an existing part file so the
    analysis can resume across runs."""
    from nltk.corpus import movie_reviews
    lpcount=0
    totalcount=ulimit-llimit
    testmed=[]
    phrase_medlist=[]
    file_exist=0
    # Resume from a previous partial run if the part file already exists
    if os.path.isfile('phrase_analysis_part_file'):
        fid = open('phrase_analysis_part_file')
        phrase_medlist=json.load(fid)
        fid.close()
        file_exist=1
    print '\nNo of +ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['pos'])[llimit:ulimit]:
        if file_exist:
            phrase_medlist[0].append(proximity_tagger.phrase_analysis(movie_reviews.abspath(fid)))
        else:
            testmed.append(proximity_tagger.phrase_analysis(movie_reviews.abspath(fid)))
        lpcount=lpcount+1
        print 'Training +ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'
    if not file_exist:
        phrase_medlist.append(testmed)
    lpcount=0
    testmed=[]
    print '\nNo of -ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['neg'])[llimit:ulimit]:
        if file_exist:
            phrase_medlist[1].append(proximity_tagger.phrase_analysis(movie_reviews.abspath(fid)))
        else:
            testmed.append(proximity_tagger.phrase_analysis(movie_reviews.abspath(fid)))
        lpcount=lpcount+1
        print 'Training -ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'
    if not file_exist:
        phrase_medlist.append(testmed)
    fid = open('phrase_analysis_part_file','w')
    json.dump(phrase_medlist,fid)
    fid.close()
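# Illustrative driver sketch: because phrase_analysis_call appends to an existing
# 'phrase_analysis_part_file', the corpus slice can be processed in resumable chunks.
# The helper name and the chunk parameters are assumptions, not part of this module.
def phrase_analysis_in_chunks(total, step):
    for start in range(0, total, step):
        phrase_analysis_call(start, start + step)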
def imdb_reviews_df():
    """Load the NLTK movie_reviews corpus into a single pandas DataFrame with a
    binary label column (1 = positive, 0 = negative)."""
    pos_texts = parse_text_files(movie_reviews.abspath("pos"))
    neg_texts = parse_text_files(movie_reviews.abspath("neg"))

    pos_df = pd.DataFrame(pos_texts)
    pos_df["label"] = [1 for d in pos_texts]

    neg_df = pd.DataFrame(neg_texts)
    neg_df["label"] = [0 for d in neg_texts]

    combined_df = pd.concat([pos_df, neg_df])
    return combined_df
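# Illustrative usage sketch: shuffle the combined frame and split it into train and
# test partitions. The helper name, the 80/20 split, and the fixed random_state are
# assumptions, not part of this module.
def split_reviews_df(test_fraction=0.2):
    df = imdb_reviews_df().sample(frac=1, random_state=0)  # shuffle all rows
    n_test = int(len(df) * test_fraction)
    return df.iloc[n_test:], df.iloc[:n_test]              # (train_df, test_df)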
def findTrainingMedian():
    """Compute the median feature list for the first 50 positive and first 50 negative
    reviews and return them as [posMedianList, negMedianList]."""
    posMedianList = []
    negMedianList = []
    posFileIdList = mr.fileids(categories = 'pos')[0:50]
    negFileIdList = mr.fileids(categories = 'neg')[0:50]

    i = 1
    print '\nTraining with Positive Reviews....'
    for fid in posFileIdList:
        print 'Review ' + str(i)
        temporaryMedian = findMedian(mr.abspath(fid))
        posMedianList.append(temporaryMedian)
        i += 1

    i = 1
    print '\nTraining with Negative Reviews....'
    for fid in negFileIdList:
        print 'Review ' + str(i)
        temporaryMedian = findMedian(mr.abspath(fid))
        negMedianList.append(temporaryMedian)
        i += 1

    trainingMedianList = [posMedianList, negMedianList]
    return trainingMedianList
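# Illustrative sketch: collapse the per-review medians returned by findTrainingMedian
# into one median vector per class, as median_approach above does. It assumes that
# findMedian returns a list of feature values per review; the helper name is hypothetical.
def training_class_medians():
    import numpy
    posMedianList, negMedianList = findTrainingMedian()
    posMedian = [numpy.median(col) for col in zip(*posMedianList)]
    negMedian = [numpy.median(col) for col in zip(*negMedianList)]
    return posMedian, negMedian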