示例#1
0
    def process(self):
        """Train and persist a doc2vec model and a linear model built on it.

        The doc2vec corpus is the Semeval articles read from
        ``self.labeled_articles_file_path`` followed by the Veriday articles
        read from ``self.articles_source_file_path``. The doc2vec model is
        trained for ``self.shuffle_count`` shuffle-and-train rounds and saved
        to ``self.doc2vec_model_file_path``. The linear model is trained on
        whatever ``scikit_ml_helper.extract_training_parameters`` derives
        from the doc2vec model and the scores returned alongside the Semeval
        articles, and is written to ``self.ml_model_file_path``.
        """
        log.info("Commencing execution")

        # Veriday articles are only used to enlarge the doc2vec corpus.
        log.info("Getting tagged Veriday articles ... ")
        veriday_corpus = doc2vec_helper.get_tagged_articles_veriday(
            file_helper.get_articles_list(self.articles_source_file_path))

        # Semeval articles come with the scores used to fit the ML model.
        log.info("Getting tagged Semeval articles ... ")
        semeval_raw = file_helper.get_articles_list(
            self.labeled_articles_file_path)
        corpus, scores_by_document = \
            doc2vec_helper.get_tagged_articles_scores(semeval_raw)

        # From here on `corpus` holds both article sets (Semeval first).
        corpus.extend(veriday_corpus)

        log.info("Initializing the doc2vec model ...")
        model = doc2vec_helper.init_model(corpus)

        log.info("Training the doc2vec model ...")
        # Count down so the log shows how many rounds are still to run.
        for remaining in range(self.shuffle_count, 0, -1):
            log.info("Shuffles remaining: " + str(remaining))
            doc2vec_helper.shuffle_and_train_articles(model, corpus)

        model.save(self.doc2vec_model_file_path)

        features, targets = scikit_ml_helper.extract_training_parameters(
            model, scores_by_document)
        log.info("Training the ML model ...")
        regressor = scikit_ml_helper.train_linear_model(features, targets)

        scikit_ml_helper.persist_model_to_disk(
            regressor, self.ml_model_file_path)

        log.info("Completed execution")
示例#2
0
    def process(self):
        """Train and persist a doc2vec model and a GNB classifier built on it.

        Only the Semeval articles read from
        ``self.labeled_articles_file_path`` are used. The doc2vec model is
        trained for ``self.shuffle_count`` shuffle-and-train rounds and saved
        to ``self.doc2vec_model_file_path``; the classifier returned by
        ``scikit_ml_helper.train_gnb_classifier`` is written to
        ``self.ml_model_file_path``.
        """
        log.info("Commencing execution")

        log.info("Getting Semeval articles ... ")
        raw_articles = file_helper.get_articles_list(
            self.labeled_articles_file_path)
        corpus, sentiment_classes = \
            doc2vec_helper.get_tagged_semeval_articles(raw_articles)

        # Build the model (and its vocabulary) from the tagged corpus.
        log.info("Initializing the doc2vec model ...")
        model = doc2vec_helper.init_model(corpus)

        log.info("Training the doc2vec model ...")
        # Count down so the log shows how many rounds are still to run.
        for remaining in range(self.shuffle_count, 0, -1):
            log.info("Shuffles remaining: " + str(remaining))
            doc2vec_helper.shuffle_and_train_articles(model, corpus)

        model.save(self.doc2vec_model_file_path)

        features, targets = scikit_ml_helper.extract_training_parameters(
            model, sentiment_classes)
        log.info("Training the ML model ...")
        # Alternative: scikit_ml_helper.train_linear_model(features, targets)
        classifier = scikit_ml_helper.train_gnb_classifier(features, targets)

        scikit_ml_helper.persist_model_to_disk(
            classifier, self.ml_model_file_path)

        log.info("Completed execution")
示例#3
0
 def __init__(self, source):
     """Set up the instance from ``source``.

     ``source`` is forwarded unchanged to the parent class constructor and
     is also passed to ``file_helper.get_articles_list`` to load the
     articles kept on ``self.semeval_articles``.
     """
     super(SemevalTaggedLineDocument, self).__init__(source)
     # Articles loaded eagerly from the same `source` given to the parent.
     self.semeval_articles = file_helper.get_articles_list(source)
     # NOTE(review): starts at -1, presumably so that the first increment
     # yields index 0 -- confirm against the code that advances it.
     self.counter = -1
    def process(self):
        """Train TF-IDF based classifiers and evaluate them on labelled articles.

        Steps:

        1. Load the JSON mapping at ``self.classification_sources_file_path``.
           Each key is used as the label of every line in the review file
           its value points to; these lines form the training set.
        2. Load the articles at ``self.labeled_articles_file_path``. Every
           article with a non-empty ``title`` becomes an evaluation document
           with its ``label`` as ground truth.
        3. Fit one ``TfidfVectorizer`` over training and evaluation documents
           together, then split the resulting matrix back into the two sets.
        4. Train logistic-regression, naive-Bayes and linear-SVM models on
           the training rows and persist them under
           ``self.ml_model_file_path`` with the suffixes ``.tfidf.log_reg``,
           ``.tfidf.nb`` and ``.tfidf.svm_linear``.
        5. Log accuracy and the confusion matrix of each model on the
           evaluation rows. This step is skipped when no article had a
           usable title.
        """
        log.info("Commencing execution")

        with open(self.classification_sources_file_path) as source_cfg:
            sources_dict = json.load(source_cfg)

        x_documents = list()
        y_scores = list()

        log.info("Parsing documents")
        for sentiment, review_file_path in sources_dict.items():
            with open(review_file_path) as review_file:
                for line in review_file:
                    x_documents.append(line)
                    y_scores.append(sentiment)

        # Everything appended to x_documents after this index belongs to the
        # evaluation set.
        training_count = len(x_documents)

        semeval_classified_articles = file_helper.get_articles_list(
            self.labeled_articles_file_path)

        y_true = list()
        for semeval_classified_article in semeval_classified_articles:
            # The title is used as the document text ('articleText' is the
            # other candidate field).
            article_text = semeval_classified_article['title']
            if not article_text:
                continue

            x_documents.append(article_text)
            y_true.append(semeval_classified_article['label'])

        log.info("Initiating training for document vectors")
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        x_vectors = vectorizer.fit_transform(x_documents)

        # Split on an explicit boundary index. Negative slices would break
        # when there are no evaluation documents: -0 == 0, so x[-0:] is the
        # whole matrix and x[:-0] is empty.
        training_docs = x_vectors[:training_count]
        labelled_docs = x_vectors[training_count:]

        log.info("Vectors have been trained")

        log.info("Training the ML models")
        ml_model_logreg = scikit_ml_helper.train_logistic_reg_classifier(
            training_docs, y_scores)
        ml_model_nb = scikit_ml_helper.train_gnb_classifier_dense(
            training_docs, y_scores)
        ml_model_svm_linear = scikit_ml_helper.train_svm_classifier(
            training_docs, y_scores)

        log.info("Saving the ML models to disk")
        scikit_ml_helper.persist_model_to_disk(
            ml_model_logreg, self.ml_model_file_path + ".tfidf.log_reg")
        scikit_ml_helper.persist_model_to_disk(
            ml_model_nb, self.ml_model_file_path + ".tfidf.nb")
        scikit_ml_helper.persist_model_to_disk(
            ml_model_svm_linear, self.ml_model_file_path + ".tfidf.svm_linear")

        if not y_true:
            # Without ground truth there is nothing to score; the trained
            # models have already been saved above.
            log.info("No labelled articles with a title; skipping evaluation")
            log.info("Completed execution")
            return

        predictions_logreg = ml_model_logreg.predict(labelled_docs)
        predictions_linearsvm = ml_model_svm_linear.predict(labelled_docs)
        predictions_nb = ml_model_nb.predict(labelled_docs)

        accuracy_logreg = sklearn.metrics.accuracy_score(
            y_true=y_true, y_pred=predictions_logreg,
            normalize=True, sample_weight=None)
        accuracy_linearsvm = sklearn.metrics.accuracy_score(
            y_true=y_true, y_pred=predictions_linearsvm,
            normalize=True, sample_weight=None)
        accuracy_nb = sklearn.metrics.accuracy_score(
            y_true=y_true, y_pred=predictions_nb,
            normalize=True, sample_weight=None)

        log.info("accuracy_logreg: " + str(accuracy_logreg))
        log.info("accuracy_linearsvm: " + str(accuracy_linearsvm))
        log.info("accuracy_nb: " + str(accuracy_nb))

        log.info("\ncm_logreg\n" + str(
            scikit_ml_helper.get_confusion_matrix(y_true, predictions_logreg)))
        log.info("\ncm_linearsvm\n" + str(
            scikit_ml_helper.get_confusion_matrix(y_true,
                                                  predictions_linearsvm)))
        log.info(
            "\ncm_nb\n" +
            str(scikit_ml_helper.get_confusion_matrix(y_true, predictions_nb)))

        log.info("Completed execution")
# Evaluation script fragment (Python 2). It relies on names defined above
# this point in the script: doc2vec_model_file_path,
# logreg_ml_model_file_path and svm_linear_ml_model_file_path.

# Hard-coded location of the third persisted classifier
# (presumably Naive Bayes, going by the ".nb" suffix -- confirm).
nb_ml_model_file_path = \
    "/home/v2john/Documents/amazon/models/books_ml_docvec.model.docvec.nb"

# Load the previously trained doc2vec model from disk.
doc2vec_model = Doc2Vec.load(doc2vec_model_file_path)

# Load the three previously persisted classifiers.
logreg_model = scikit_ml_helper.get_model_from_disk(logreg_ml_model_file_path)
svm_model = scikit_ml_helper.get_model_from_disk(svm_linear_ml_model_file_path)
nb_model = scikit_ml_helper.get_model_from_disk(nb_ml_model_file_path)


# Hard-coded location of the classified Semeval articles (JSON file).
semeval_classified_articles_file = \
    "/home/v2john/Dropbox/Personal/Academic/Masters/UWaterloo/Academics/ResearchProject/semeval_task/" + \
    "semeval-2017-task-5-subtask-2/semeval_combined_fulltext_classified.json"


semeval_classified_articles = file_helper.get_articles_list(semeval_classified_articles_file)

# Accumulators used by the article loop that follows.
article_vectors = list()
count = 0
y_true = list()

for semeval_classified_article in semeval_classified_articles:

    print 'working on article ' + str(count)
    count += 1

    article_text = semeval_classified_article['articleText']
    # article_text = semeval_classified_article['title']
    if not article_text:
        continue