def pos_tagging(self, data):
    """Record a part-of-speech tag and a stem for every distinct token.

    The lowercased ``'doc'`` texts of all items are joined and tokenized
    with ``helpers.tokenize``. The distinct tokens are passed to
    ``helpers.execute_mystem`` in a single call and its output is parsed
    as JSON.

    Args:
        data: collection of mappings, each providing a ``'doc'`` text.

    Returns:
        None. For every entry of the parsed output,
        ``self.pos_vocab[entry['text']]`` is set to
        ``{"POS": ..., "stem": ...}``; ``stem`` is ``None`` when the entry's
        ``'analysis'`` list is empty.
    """
    if not data:
        # Nothing to tag: skip tokenizing and the analyzer call entirely.
        return

    text = " ".join(doc['doc'].lower() for doc in data)
    # Sorted so the analyzer receives the same input on every run; plain set
    # iteration order for strings can differ between interpreter runs.
    unique_tokens = sorted(set(helpers.tokenize(text)))
    if not unique_tokens:
        return

    # The second value returned by execute_mystem is not inspected here.
    output, _errors = helpers.execute_mystem(" ".join(unique_tokens).encode())

    for entry in json.loads(output.decode()):
        pos = helpers.get_pos(entry, not_recognized="NOT_RECOGNIZED")
        analysis = entry['analysis']
        # Only the first analysis variant supplies the stem.
        stem = analysis[0]['lex'] if analysis else None
        self.pos_vocab[entry['text']] = {"POS": pos, "stem": stem}
def predict(self, data):
    """Assign a polarity label to every document in ``data``.

    Each document starts from a copy of ``self.p_class``. For every class in
    ``self.POLARITY`` the log of the add-one smoothed token probability
    ``(count + 1) / (words_amount + vocab_length)`` is added per token. A
    class whose ``words_amount`` is zero receives no token contribution.

    Args:
        data: iterable of mappings with a ``'text'`` entry (the document
            text) and an ``'id'`` entry.

    Returns:
        list of ``{'id': ..., 'polarity': ...}`` dicts in input order.
        ``polarity`` is ``self.POSITIVE`` only when its score is strictly
        greater than the negative score; ties go to ``self.NEGATIVE``.
    """
    labels = []
    for doc in data:
        # Tokenize once per document instead of once per class; materialized
        # because the tokens are walked again for every class.
        tokens = list(helpers.tokenize(doc['text'].lower()))
        score = self.p_class.copy()

        for cl in self.POLARITY:
            total_words = self.words_amount[cl]
            if not total_words:
                # Every token would contribute log(1) == 0 for this class.
                continue
            denominator = total_words + self.vocab_length[cl]
            for token in tokens:
                # NOTE(review): self.words looks like an auto-vivifying
                # mapping (train() increments unseen tokens without
                # initialising them). .get() keeps scoring from inserting
                # unseen tokens into the trained model; they count as 0.
                counts = self.words.get(token)
                count = counts[cl] if counts is not None else 0
                score[cl] += math.log((count + 1) / denominator)

        if score[self.POSITIVE] > score[self.NEGATIVE]:
            polarity = self.POSITIVE
        else:
            polarity = self.NEGATIVE
        labels.append({'id': doc['id'], 'polarity': polarity})
    return labels
def train(self, data, positive_label, occurrence=0, delete_pos=None):
    """Accumulate counts from ``data`` and recompute the model statistics.

    Per document, the class is ``self.POSITIVE`` when ``doc['polarity']``
    equals ``positive_label`` and ``self.NEGATIVE`` otherwise. The class's
    entry in ``self.docs`` is incremented, and every token of the lowercased
    ``doc['doc']`` increments ``self.words[token][cl]``.

    Args:
        data: iterable of mappings with ``'polarity'`` and ``'doc'`` entries.
        positive_label: value of ``doc['polarity']`` that marks a positive
            document.
        occurrence: when non-zero, every per-class token count less than or
            equal to this value is reset to 0.
        delete_pos: when truthy, every token whose ``"POS"`` in
            ``self.pos_vocab`` equals this value is removed from
            ``self.words``.

    Returns:
        None. Updates ``self.docs`` and ``self.words`` and, for each class in
        ``self.POLARITY``, sets ``self.p_class`` (log prior),
        ``self.vocab_length`` (number of tokens with a non-zero count) and
        ``self.words_amount`` (sum of token counts).
    """
    # TODO: fix stemming
    for doc in data:
        cl = self.POSITIVE if doc['polarity'] == positive_label else self.NEGATIVE
        self.docs[cl] += 1
        for token in helpers.tokenize(doc['doc'].lower()):
            self.words[token][cl] += 1

    if delete_pos:
        for token, info in self.pos_vocab.items():
            if info["POS"] == delete_pos:
                self.words.pop(token, None)

    if occurrence:
        for counts in self.words.values():
            for cl in self.POLARITY:
                if counts[cl] <= occurrence:
                    counts[cl] = 0

    # TODO: delete this using part of speech and make new

    # Priors are normalised by the accumulated document total, not by the
    # size of this batch: self.docs keeps growing across train() calls, so
    # dividing by len(data) would give fractions above 1 on later calls.
    total_docs = self.docs[self.POSITIVE] + self.docs[self.NEGATIVE]
    for cl in self.POLARITY:
        class_docs = self.docs[cl]
        # A class without documents gets prior 1, i.e. a log prior of 0.
        prior = class_docs / total_docs if class_docs and total_docs else 1
        self.p_class[cl] = math.log(prior)
        self.vocab_length[cl] = sum(
            1 for counts in self.words.values() if counts[cl])
        self.words_amount[cl] = sum(
            counts[cl] for counts in self.words.values())