def pos_tagging(self, data):
    """Populate ``self.pos_vocab`` with POS tag and stem for every distinct token.

    Joins all documents into one lowercase string, runs mystem over the set of
    unique tokens, and stores ``{"POS": ..., "stem": ...}`` per surface form.
    Tokens mystem cannot analyze get POS "NOT_RECOGNIZED" and stem ``None``.
    """
    corpus = " ".join(entry['doc'].lower() for entry in data)
    unique_tokens = set(helpers.tokenize(corpus))
    raw_output, _ = helpers.execute_mystem(" ".join(unique_tokens).encode())
    for entry in json.loads(raw_output.decode()):
        analysis = entry['analysis']
        self.pos_vocab[entry['text']] = {
            "POS": helpers.get_pos(entry, not_recognized="NOT_RECOGNIZED"),
            "stem": analysis[0]['lex'] if analysis else None,
        }
 def predict(self, data):
     labels = []
     for doc in data:
         score = self.p_class.copy()
         for cl in self.POLARITY:
             tokens = helpers.tokenize(doc['text'].lower())
             for token in tokens:
                 count = self.words[token][cl]
                 probability = (count+1) / (self.words_amount[cl]+self.vocab_length[cl]) \
                     if self.words_amount[cl] else 1
                 if self.USE_POS:
                     pass
                 score[cl] += math.log(probability)
         polarity = self.POSITIVE if score[self.POSITIVE] > score[self.NEGATIVE] \
             else self.NEGATIVE
         labels.append({'id': doc['id'], 'polarity': polarity})
     return labels
    def train(self, data, positive_label, occurrence=0, delete_pos=None):
        """Fit the naive-Bayes counts, priors, and smoothing statistics.

        :param data: iterable of dicts with keys ``'doc'`` (text) and
            ``'polarity'`` (label).
        :param positive_label: polarity value mapped to ``self.POSITIVE``;
            everything else becomes ``self.NEGATIVE``.
        :param occurrence: if non-zero, per-class counts less than or equal to
            this threshold are zeroed out (rare-word pruning).
        :param delete_pos: if set, drop every token whose POS tag in
            ``self.pos_vocab`` equals this value.
        """
        # TODO: fix stemming — the old mystem-based stemming and POS-weighting
        # experiments that used to live here (commented out) were removed;
        # see pos_tagging() for the current POS vocabulary builder.

        # Per-class document counts and per-token, per-class word frequencies.
        for doc in data:
            cl = self.POSITIVE if doc['polarity'] == positive_label else self.NEGATIVE
            self.docs[cl] += 1
            for token in helpers.tokenize(doc['doc'].lower()):
                self.words[token][cl] += 1

        # Remove every token tagged with the unwanted part of speech.
        if delete_pos:
            for token, info in self.pos_vocab.items():
                if info["POS"] == delete_pos:
                    self.words.pop(token, None)

        # Prune rare words: zero per-class counts at or below the threshold.
        if occurrence:
            for counts in self.words.values():
                for cl in self.POLARITY:
                    if counts[cl] <= occurrence:
                        counts[cl] = 0

        for cl in self.POLARITY:
            # Log prior; guarded to log(1) == 0 when the class (or the data
            # set) is empty, avoiding math.log(0) / division by zero.
            self.p_class[cl] = math.log(
                self.docs[cl] / len(data) if self.docs[cl] and data else 1)
            # Number of distinct tokens with a non-zero count for this class.
            self.vocab_length[cl] = sum(
                1 for counts in self.words.values() if counts[cl])
            # Total token occurrences observed for this class.
            self.words_amount[cl] = sum(
                counts[cl] for counts in self.words.values())