def transform_review(self, input_review=None):
    """Tokenise reviews into cleaned, vocabulary-restricted sentences.

    Each review is split into sentences with ``sent_tokenize``. Every
    sentence is lower-cased, runs of non-word characters are replaced by a
    single space, and the result is passed to
    ``lemmatizer().lemmatize_sentence``. Tokens are then dropped when they
    are English stopwords, consist only of digits, or are missing from
    ``self.vocab``. A sentence left with no tokens becomes ``['<pad>']``.

    Args:
        input_review: Iterable of review strings. When ``None`` (the
            default) ``self.reviews`` is processed instead.

    Returns:
        A tuple ``(all_in_sentence, reviews2)``:
        ``reviews2`` is a list with one entry per review, each a list of
        token lists (one per sentence); ``all_in_sentence`` is the same
        token lists flattened across reviews, in order. Both structures
        reference the same token-list objects.
    """
    # A set gives O(1) membership tests; the token filter runs per word.
    stopword = frozenset(stopwords.words('english'))

    # Identity check: ``== None`` is evaluated element-wise by array-like
    # inputs and then cannot be used as a plain condition.
    source = self.reviews if input_review is None else input_review

    reviews2 = []
    for review in source:
        cleaned_sentences = []
        for sentence in sent_tokenize(review):
            text = re.sub(r'\W+', ' ', str(sentence.lower())).strip()
            # assumes lemmatize_sentence yields string tokens -- the
            # filters below call ``isdigit`` on each of them.
            tokens = [
                token
                for token in lemmatizer().lemmatize_sentence(text)
                if token not in stopword
                and not token.isdigit()
                and token in self.vocab
            ]
            if not tokens:
                tokens.append('<pad>')
            cleaned_sentences.append(tokens)
        reviews2.append(cleaned_sentences)

    all_in_sentence = [
        tokens for cleaned_sentences in reviews2 for tokens in cleaned_sentences
    ]
    return all_in_sentence, reviews2
def build_part_of_speech(review):
    """Collect every part-of-speech tag observed for each word.

    Each element of ``review`` is lemmatised with
    ``lemmatizer().lemmatize_sentence`` and the result is tagged with
    ``pos_tag``.

    Args:
        review: Sequence of sentences to tag.

    Returns:
        Dict mapping each word to the list of tags it received, in order of
        occurrence; a tag is repeated once per occurrence of the word.
    """
    tags_by_word = {}
    for sentence in review:
        tagged = pos_tag(lemmatizer().lemmatize_sentence(sentence))
        for token, tag in tagged:
            # First sighting creates the list, later ones extend it.
            tags_by_word.setdefault(token, []).append(tag)
    return tags_by_word
def sentence_convert(self):
    """Convert every entry of ``self.reviews`` into vocabulary indices.

    Each entry is lemmatised with ``lemmatizer().lemmatize_sentence``.
    Tokens are dropped when they are in ``self.stopword``, consist only of
    digits, or are missing from ``self.vocab``. If nothing survives, the
    entry is represented by the single token ``'<pad>'``, which must
    therefore be a key of ``self.vocab``.

    Returns:
        A tuple ``(review_in_indice, max_length)`` where
        ``review_in_indice`` holds one list of vocabulary indices per entry
        of ``self.reviews`` and ``max_length`` is the length of the longest
        of those lists (``0`` when ``self.reviews`` is empty).
    """
    review_in_indice = []
    for sentence in self.reviews:
        all_word = [
            token
            for token in lemmatizer().lemmatize_sentence(sentence)
            if token not in self.stopword
            and not token.isdigit()
            and token in self.vocab
        ]
        if not all_word:
            all_word.append('<pad>')
        review_in_indice.append([self.vocab[word] for word in all_word])

    # ``default=0`` keeps an empty review set from raising ValueError.
    longest = max((len(indices) for indices in review_in_indice), default=0)
    return review_in_indice, longest
def create_all_in_sentence(reviews, stopword=False):
    """Flatten reviews into one list of cleaned, tokenised sentences.

    Each review is split with ``sent_tokenize``. Every sentence is
    lower-cased, runs of non-word characters are replaced by a single
    space, and the result is passed to ``lemmatizer().lemmatize_sentence``.
    Digit-only tokens are always removed; English stopwords are removed
    only when ``stopword`` is truthy.

    Args:
        reviews: Iterable of review strings.
        stopword: When truthy, also drop NLTK English stopwords.

    Returns:
        List of token lists, one per sentence, in review order. A sentence
        with no remaining tokens is returned as ``['<pad>']``.
    """
    # Keep the boolean flag and the lookup collection in separate names;
    # a set makes the per-token membership test O(1).
    excluded = frozenset(stopwords.words('english')) if stopword else frozenset()

    all_in_sentence = []
    for review in reviews:
        for sentence in sent_tokenize(review):
            text = re.sub(r'\W+', ' ', str(sentence.lower())).strip()
            tokens = [
                token
                for token in lemmatizer().lemmatize_sentence(text)
                if token not in excluded and not token.isdigit()
            ]
            if not tokens:
                # If empty after cleaning, add '<pad>', which is zero in vocab.
                tokens.append('<pad>')
            all_in_sentence.append(tokens)
    return all_in_sentence
def sentiment_split(self, review1):
    """Split each aspect's sentences into positive and negative groups.

    Every sentence is lemmatised, filtered (stopwords other than the
    negations below, digit-only tokens and out-of-vocabulary tokens are
    removed) and mapped to vocabulary indices; a sentence with nothing left
    is represented by ``[0]``. ``self.sentence_average`` turns the index
    lists into one vector per sentence (average sentence embeddings). For
    each polarity, the seed words in
    ``self.sentiment_seedword[aspect][polarity]`` are embedded the same way
    and shifted by the aspect word's row of ``self.emb.embedding_matrix``.
    A sentence is labelled by whichever polarity gives the larger maximum
    ``self.cosin`` score (presumably cosine similarity -- confirm against
    its definition).

    Args:
        review1: Mapping from aspect to a list of sentences (the
            high-detected reviews, already split into sentences). Sentences
            are used as dictionary keys in the result, so they must be
            hashable.

    Returns:
        A tuple ``(output_review, grade)``:
        ``output_review`` maps each aspect to its positive sentences
        followed by its negative sentences; ``grade`` maps each aspect to a
        dict assigning ``5`` to positive and ``1`` to negative sentences.
        Sentences whose two scores are equal appear in neither.
    """
    # Negation words are kept: only the remaining stopwords are filtered.
    kept_negations = {'no', 'not', 'nor', 't'}
    stopword = {
        word for word in stopwords.words('english')
        if word not in kept_negations
    }

    def to_indices(sentence):
        """Map one sentence to vocabulary indices, or ``[0]`` if none survive."""
        indices = [
            self.vocab[token]
            for token in lemmatizer().lemmatize_sentence(sentence)
            if token not in stopword
            and not token.isdigit()
            and token in self.vocab
        ]
        return indices or [0]

    def seed_anchors(polarity):
        """Build aspect -> seed vectors for ``polarity``, shifted by the aspect vector."""
        anchors = {}
        for aspect in self.sentiment_seedword:
            seed_indices = [
                [self.vocab[word]]
                for word in self.sentiment_seedword[aspect][polarity]
            ]
            anchors[aspect] = (
                self.sentence_average(seed_indices)
                + self.emb.embedding_matrix[self.vocab[aspect]]
            )
        return anchors

    def best_similarity(anchors):
        """For every sentence vector, take its highest score against any anchor row."""
        best = {}
        for aspect, vectors in sentence_average.items():
            # ``anchors[aspect]`` is only looked up when the aspect has at
            # least one sentence vector.
            best[aspect] = [
                max(
                    self.cosin(vectors[i], anchors[aspect][j])
                    for j in range(anchors[aspect].shape[0])
                )
                for i in range(len(vectors))
            ]
        return best

    review_in_index = {
        aspect: [to_indices(sentence) for sentence in review1[aspect]]
        for aspect in review1
    }
    # Average sentence embeddings.
    sentence_average = {
        aspect: self.sentence_average(indices)
        for aspect, indices in review_in_index.items()
    }

    ps1 = seed_anchors('positive')
    ns1 = seed_anchors('negative')
    pos_sim = best_similarity(ps1)
    neg_sim = best_similarity(ns1)

    aspect_pos = {aspect: [] for aspect in review1}
    aspect_neg = {aspect: [] for aspect in review1}
    for aspect in review1:
        for i, sentence in enumerate(review1[aspect]):
            positive_score = pos_sim[aspect][i]
            negative_score = neg_sim[aspect][i]
            if positive_score > negative_score:
                aspect_pos[aspect].append(sentence)
            elif negative_score > positive_score:
                aspect_neg[aspect].append(sentence)
            # Equal scores: the sentence is left out of both groups.

    output_review = {
        aspect: aspect_pos[aspect] + aspect_neg[aspect]
        for aspect in aspect_pos
    }

    grade = {}
    for aspect in aspect_pos:
        scores = {sentence: 5 for sentence in aspect_pos[aspect]}
        # Negative grades are written second, as in the original ordering.
        scores.update({sentence: 1 for sentence in aspect_neg[aspect]})
        grade[aspect] = scores
    return output_review, grade