Exemplo n.º 1
0
    def transform_review(self, input_review=None):
        """Tokenise reviews into cleaned, vocabulary-filtered sentences.

        Every review is split into sentences with ``sent_tokenize``.  Each
        sentence is lower-cased, has runs of non-word characters replaced by
        a single space, is lemmatised with ``lemmatizer().lemmatize_sentence``
        and is then stripped of English stop words, all-digit tokens and
        tokens that are not in ``self.vocab``.  A sentence left with no
        tokens becomes ``['<pad>']``.

        Args:
            input_review: Iterable of review strings.  When ``None`` (the
                default) ``self.reviews`` is processed instead.

        Returns:
            A ``(all_in_sentence, reviews2)`` tuple.  ``reviews2`` holds one
            list of token lists per review; ``all_in_sentence`` contains the
            very same token lists flattened into one list, in review order.
        """
        # Identity test on purpose: ``== None`` is evaluated element-wise by
        # array-like inputs (NumPy arrays, pandas Series) and can then raise
        # when used as an ``if`` condition.
        source = self.reviews if input_review is None else input_review

        # A set gives O(1) membership tests instead of scanning the whole
        # stop-word list for every token.
        stopword = frozenset(stopwords.words('english'))

        reviews2 = []
        for review in source:
            cleaned_sentences = []
            for sentence in sent_tokenize(review):
                text = re.sub(r'\W+', ' ', sentence.lower()).strip()
                # NOTE(review): a new lemmatizer is built per sentence, as in
                # the original; hoist it out of the loops if its constructor
                # is known to be side-effect free.
                tokens = [
                    j for j in lemmatizer().lemmatize_sentence(text)
                    if j not in stopword
                    and not j.isdigit()
                    and j in self.vocab
                ]
                # Never leave a sentence empty: fall back to the pad token.
                cleaned_sentences.append(tokens or ['<pad>'])
            reviews2.append(cleaned_sentences)

        all_in_sentence = [
            sentence for review in reviews2 for sentence in review
        ]
        return all_in_sentence, reviews2
def build_part_of_speech(review):
    """Collect, for every lemmatised word, the POS tags assigned to it.

    Each element of ``review`` is lemmatised with
    ``lemmatizer().lemmatize_sentence`` and the result is tagged with
    ``pos_tag``.  Tags are gathered per word in order of occurrence, so a
    word seen several times contributes one tag per occurrence (duplicates
    are kept).

    Returns:
        dict mapping each word to the list of POS tags it received.
    """
    tags_by_word = {}
    for sentence in review:
        tagged = pos_tag(lemmatizer().lemmatize_sentence(sentence))
        for word, pos in tagged:
            # First sighting creates the list, later ones extend it.
            tags_by_word.setdefault(word, []).append(pos)
    return tags_by_word
Exemplo n.º 3
0
 def sentence_convert(self):
     """Convert every sentence in ``self.reviews`` to vocabulary indices.

     Each sentence is lemmatised with ``lemmatizer().lemmatize_sentence``;
     tokens that are in ``self.stopword``, consist only of digits, or are
     missing from ``self.vocab`` are dropped.  A sentence with no remaining
     tokens is represented by the index of ``'<pad>'``, which therefore
     has to be a key of ``self.vocab``.

     Returns:
         A ``(review_in_indice, longest)`` tuple: one list of vocabulary
         indices per sentence, and the length of the longest such list
         (``0`` when ``self.reviews`` is empty).
     """
     # Set lookup is O(1) per token; assumes ``self.stopword`` is a
     # collection of hashable tokens (e.g. a list of strings).
     stopword = frozenset(self.stopword)

     review_in_indice = []
     for sentence in self.reviews:
         # NOTE(review): a new lemmatizer is built per sentence, as in the
         # original; hoist it if its constructor is side-effect free.
         all_word = [
             i for i in lemmatizer().lemmatize_sentence(sentence)
             if i not in stopword and not i.isdigit() and i in self.vocab
         ]
         if not all_word:
             all_word.append('<pad>')
         review_in_indice.append([self.vocab[word] for word in all_word])

     # ``default`` keeps an empty review collection from raising ValueError.
     longest = max((len(k) for k in review_in_indice), default=0)
     return review_in_indice, longest
def create_all_in_sentence(reviews, stopword=False):
    """Split reviews into cleaned, lemmatised sentences.

    Every review is split with ``sent_tokenize``.  Each sentence is
    lower-cased, has runs of non-word characters replaced by a single
    space, is lemmatised with ``lemmatizer().lemmatize_sentence`` and has
    its all-digit tokens removed.  A sentence left with no tokens becomes
    ``['<pad>']``.

    Args:
        reviews: Iterable of review strings.
        stopword: When truthy, tokens found in NLTK's English stop-word
            list are removed as well.  Defaults to ``False``.

    Returns:
        A flat list with one token list per sentence, in input order.
    """
    # Keep the flag and the word collection in separate names, and use a
    # set so each membership test is O(1) instead of a list scan.
    if stopword:
        stop_set = frozenset(stopwords.words('english'))
    else:
        stop_set = frozenset()

    all_in_sentence = []
    for review in reviews:
        for sentence in sent_tokenize(review):
            text = re.sub(r'\W+', ' ', sentence.lower()).strip()
            # NOTE(review): a new lemmatizer is built per sentence, as in
            # the original; hoist it if its constructor is side-effect free.
            tokens = [
                j for j in lemmatizer().lemmatize_sentence(text)
                if j not in stop_set and not j.isdigit()
            ]
            # If empty after cleaning, fall back to '<pad>' (index zero in
            # the vocab according to the original comment).
            all_in_sentence.append(tokens or ['<pad>'])
    return all_in_sentence
Exemplo n.º 5
0
    def sentiment_split(self, review1):
        """Label each aspect's sentences as positive or negative.

        For every aspect, each sentence is lemmatised, cleaned and turned
        into vocabulary indices, then passed through
        ``self.sentence_average`` (average sentence embeddings, per the
        original comment).  The positive and negative seed words of the
        aspect are embedded the same way and shifted by the aspect word's
        row of ``self.emb.embedding_matrix``.  A sentence is positive when
        its best ``self.cosin`` score against the positive seeds exceeds
        its best score against the negative seeds, and negative in the
        opposite case.  Sentences whose two scores are equal end up in
        neither group and are absent from both return values.

        Args:
            review1: Mapping from aspect to a sequence of sentences
                (``high_detected_review`` in the original comment, already
                split into sentences).  Every aspect must also be a key of
                ``self.sentiment_seedword`` and of ``self.vocab``.

        Returns:
            A ``(output_review, grade)`` tuple.  ``output_review`` maps each
            aspect to its positive sentences followed by its negative ones;
            ``grade`` maps each aspect to ``{sentence: 5}`` for positive and
            ``{sentence: 1}`` for negative sentences.
        """
        # These four words are exempt from stop-word removal.  A set makes
        # each membership test O(1).
        kept_words = frozenset(['no', 'not', 'nor', 't'])
        stopword = frozenset(stopwords.words('english')) - kept_words

        def to_indices(sentence):
            # Lemmatise, drop stop words / numbers / out-of-vocab tokens and
            # map the rest to vocabulary indices; [0] stands for "empty".
            # NOTE(review): a new lemmatizer is built per sentence, as in
            # the original; hoist it if its constructor is side-effect free.
            indices = [
                self.vocab[tok]
                for tok in lemmatizer().lemmatize_sentence(sentence)
                if tok not in stopword
                and not tok.isdigit()
                and tok in self.vocab
            ]
            return indices or [0]

        def seed_vectors(polarity):
            # Embed each seed word of the given polarity on its own and add
            # the embedding row of the aspect word itself.
            vectors = {}
            for aspect in self.sentiment_seedword:
                seed_ids = [
                    [self.vocab[word]]
                    for word in self.sentiment_seedword[aspect][polarity]
                ]
                aspect_row = self.emb.embedding_matrix[self.vocab[aspect]]
                vectors[aspect] = self.sentence_average(seed_ids) + aspect_row
            return vectors

        def best_similarity(sentence_vectors, seeds_by_aspect):
            # For every sentence vector keep the highest cosine score over
            # all seed rows of its aspect.
            best = {}
            for aspect, vectors in sentence_vectors.items():
                seeds = seeds_by_aspect[aspect]
                best[aspect] = [
                    max(
                        self.cosin(vector, seeds[j])
                        for j in range(seeds.shape[0])
                    )
                    for vector in vectors
                ]
            return best

        sentence_average = {
            aspect: self.sentence_average(
                [to_indices(sentence) for sentence in review1[aspect]]
            )
            for aspect in review1
        }

        pos_sim = best_similarity(sentence_average, seed_vectors('positive'))
        neg_sim = best_similarity(sentence_average, seed_vectors('negative'))

        output_review = {}
        grade = {}
        for aspect in review1:
            positive, negative = [], []
            # Index into the score lists (rather than zip) so a length
            # mismatch still surfaces as an IndexError.
            for i, sentence in enumerate(review1[aspect]):
                if pos_sim[aspect][i] > neg_sim[aspect][i]:
                    positive.append(sentence)
                elif neg_sim[aspect][i] > pos_sim[aspect][i]:
                    negative.append(sentence)

            output_review[aspect] = positive + negative
            scores = {sentence: 5 for sentence in positive}
            scores.update({sentence: 1 for sentence in negative})
            grade[aspect] = scores

        return output_review, grade