Example #1
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

# FeatureExtractor and InputReader are project-local helpers (not shown here).


class ToySentimentClassifier(object):
    def __init__(self):
        self.feature_extractor = FeatureExtractor()

    def extract_features(self, doc):
        all_features = {}
        for i in range(1, 4):  # n-gram orders (1 to 3) to extract
            all_features.update(
                self.feature_extractor.extract_word_ngrams(
                    doc, i))  # returns the word n-gram features
        for i in range(1, 4):
            all_features.update(
                self.feature_extractor.extract_lemma_ngrams(doc, i))
        for i in range(3, 6):
            all_features.update(self.feature_extractor.compute_n_chars(doc, i))
        all_features.update(
            self.feature_extractor.compute_document_length(doc))

        # 'embeddings' is assumed to be a word-embedding model loaded
        # elsewhere (it is not defined anywhere in this snippet).
        we_feats = self.feature_extractor.compute_embeddings(doc, embeddings)
        if we_feats:
            all_features.update(we_feats)
        return all_features

    def train(self, model_name, input_file_name):
        reader = InputReader(input_file_name)
        all_docs = []
        for doc in reader.generate_documents():
            doc.features = self.extract_features(doc)
            all_docs.append(doc)  # collect documents with their features
        print(len(all_docs))  # number of training documents read

        # Encoding of samples
        all_collected_feats = [doc.features for doc in all_docs]
        # Turn each document's feature dict into a vector: one row per
        # document, one column per feature, for the whole collection.
        X_dict_vectorizer = DictVectorizer(
            sparse=True
        )  # sklearn vectorizer over dicts of {feature id: feature value}
        encoded_features = X_dict_vectorizer.fit_transform(
            all_collected_feats
        )  # builds the matrix (sparse=True keeps a sparse representation)

        # Scale to improve performance and reduce training time.
        # We want every feature to contribute on a comparable scale, so each
        # feature is divided by its standard deviation (with_mean=False
        # avoids centering, which would densify the sparse matrix).
        scaler = preprocessing.StandardScaler(with_mean=False).fit(
            encoded_features
        )  # computes the scaling parameters
        encoded_scaled_features = scaler.transform(
            encoded_features)  # applies the scaling to the features

        # Encoding of labels (Y)
        # Class labels are strings; LabelEncoder maps them to integer ids.
        label_encoder = preprocessing.LabelEncoder()
        label_encoder.fit([doc.label for doc in all_docs])
        encoded_labels = label_encoder.transform(
            [doc.label for doc in all_docs])

        # Classifier Algorithm
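        # Note: 'precision', 'recall' and 'f1' are binary-classification
        # scorers; with multi-class labels use the '*_weighted' variants.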
        scoring = ['accuracy', 'precision', 'recall', 'f1']
        clf = SVC(kernel='linear', C=1e3)
        # Cross validation
        cross_val_scores = cross_validate(clf,
                                          encoded_scaled_features,
                                          encoded_labels,
                                          cv=10,
                                          scoring=scoring)

        print('accuracy\tprecision\trecall\tf1\n')
        print(
            str(np.average(cross_val_scores['test_accuracy'])) + '\t' +
            str(np.average(cross_val_scores['test_precision'])) + '\t' +
            str(np.average(cross_val_scores['test_recall'])) + '\t' +
            str(np.average(cross_val_scores['test_f1'])) + '\n')
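
A minimal usage sketch for the example above; the file name is a placeholder,
since InputReader's expected input format is project-specific and not shown here.

classifier = ToySentimentClassifier()
# Runs feature extraction and 10-fold cross-validation over the labeled
# corpus; the model_name argument is accepted but unused in this example.
classifier.train('sentiment.model', 'training_data.tsv')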
Example #2
import csv
import os
import tarfile

import joblib
import sklearn.metrics
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

# FeatureExtractor, InputReader and the evaluate() function are
# project-local helpers (not shown here).


class ToySentimentClassifier(object):
    def __init__(self):
        self.feature_extractor = FeatureExtractor()

    def extract_features(self, doc):
        all_features = {}
        for i in range(1, 3):
            all_features.update(
                self.feature_extractor.extract_word_ngrams(doc, i))
        for i in range(1, 3):
            all_features.update(
                self.feature_extractor.extract_lemma_ngrams(doc, i))
        for i in range(1, 3):
            all_features.update(self.feature_extractor.compute_n_chars(doc, i))
        all_features.update(
            self.feature_extractor.compute_document_length(doc))
        return all_features

    def train(self, model_name, input_file_name):
        reader = InputReader(input_file_name)
        all_docs = []
        for doc in reader.generate_documents():
            doc.features = self.extract_features(doc)
            all_docs.append(doc)

        # Encoding of samples
        all_collected_feats = [doc.features for doc in all_docs]
        X_dict_vectorizer = DictVectorizer(sparse=True)
        encoded_features = X_dict_vectorizer.fit_transform(all_collected_feats)

        # Scale to improve performance and reduce training time
        scaler = preprocessing.StandardScaler(
            with_mean=False).fit(encoded_features)
        encoded_scaled_features = scaler.transform(encoded_features)

        # Encoding of labels
        label_encoder = preprocessing.LabelEncoder()
        label_encoder.fit([doc.label for doc in all_docs])
        encoded_labels = label_encoder.transform(
            [doc.label for doc in all_docs])

        # Classifier Algorithm
        clf = LinearSVC()

        # Cross validation
        cross_val_scores = cross_val_score(clf,
                                           encoded_scaled_features,
                                           encoded_labels,
                                           scoring='f1_weighted')
        print("Average F1 Weighted: %s" %
              (reduce(lambda x, y: x + y, cross_val_scores) /
               len(cross_val_scores), ))

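        # Retrain on the full dataset before persisting the final model.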
        clf.fit(encoded_scaled_features, encoded_labels)

        # Save model: pickle each component, then bundle them into a tar.
        joblib.dump(clf, 'clf.pkl')
        joblib.dump(scaler, "scaler.pkl")
        joblib.dump(label_encoder, "label_encoder.pkl")
        joblib.dump(X_dict_vectorizer, "vectorizer.pkl")
        tar = tarfile.open(model_name, "w")
        for fname in [
                'clf.pkl', "scaler.pkl", "label_encoder.pkl", "vectorizer.pkl"
        ]:
            tar.add(fname)
            os.remove(fname)
        tar.close()

    def evaluate_sentipolc(self, docs):
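        # Dump predictions and gold labels in the SENTIPOLC CSV layout,
        # then hand both files to the project-local evaluate() helper.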
        def clz_to_opos_oneg(clz):
            # Map the four-way sentiment label onto SENTIPOLC's
            # (opos, oneg) pair of binary polarity fields.
            mapping = {
                "POS": (1, 0),
                "NEG": (0, 1),
                "O": (0, 0),
                "POS_NEG": (1, 1),
            }
            return mapping[clz]

        predicted_csv_file = open("predicted.csv", 'w', newline='')
        field_names = [
            "id", "sub", "opos", "oneg", "iro", "lpos", "lneg", "top"
        ]
        writer = csv.DictWriter(predicted_csv_file, fieldnames=field_names)
        for doc in docs:
            opos, oneg = clz_to_opos_oneg(doc.labeled_prediction)
            writer.writerow({'id': doc.id, 'opos': opos, 'oneg': oneg})
        predicted_csv_file.close()

        # Generate gold file
        gold_csv_file = open("gold.csv", 'w', newline='')
        writer = csv.DictWriter(gold_csv_file, fieldnames=field_names)
        for doc in docs:
            opos, oneg = clz_to_opos_oneg(doc.label)
            writer.writerow({'id': doc.id, 'opos': opos, 'oneg': oneg})
        gold_csv_file.close()

        # Evaluation
        evaluate("gold.csv", "predicted.csv")

    def load_model(self, model_name):
        tar = tarfile.open(model_name, 'r')
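        # Unpickle each saved component from the bundled model archive.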
        for tarinfo in tar:
            f = tar.extractfile(tarinfo)
            if tarinfo.name == "clf.pkl":
                self.classifier = joblib.load(f)
            if tarinfo.name == "scaler.pkl":
                self.scaler = joblib.load(f)
            if tarinfo.name == "label_encoder.pkl":
                self.label_encoder = joblib.load(f)
            if tarinfo.name == "vectorizer.pkl":
                self.vectorizer = joblib.load(f)

    def parse(self, input_file_name):
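        # Predict a label for every document in the input file, print a
        # classification report, and run the SENTIPOLC evaluation.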
        reader = InputReader(input_file_name)
        all_docs = []
        original_labels = []
        predicted_labels = []
        for doc in reader.generate_documents():
            doc.features = self.extract_features(doc)
            all_docs.append(doc)

            # Encode and scale this document's features, then predict
            encoded_features = self.vectorizer.transform(doc.features)
            encoded_scaled_features = self.scaler.transform(encoded_features)
            predictions = self.classifier.predict(encoded_scaled_features)
            labeled_prediction = self.label_encoder.inverse_transform(
                predictions)[0]
            original_labels.append(doc.label)
            predicted_labels.append(labeled_prediction)
            doc.labeled_prediction = labeled_prediction
        print(
            sklearn.metrics.classification_report(original_labels,
                                                  predicted_labels))
        self.evaluate_sentipolc(all_docs)
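
A minimal end-to-end sketch for this second example. Both file names are
placeholders; the input format is whatever the project-local InputReader expects.

# Train, persist the model bundle, reload it, then predict and evaluate.
classifier = ToySentimentClassifier()
classifier.train('sentiment.model', 'training_data.tsv')

fresh = ToySentimentClassifier()
fresh.load_model('sentiment.model')  # restores clf, scaler, encoder, vectorizer
fresh.parse('test_data.tsv')  # prints a report and runs the SENTIPOLC eval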