Example #1
def startAnalysis(folder, S1_path, S2_path):

    fetcher = PageFetcher()
    S1 = fetcher.fetchPages(folder, S1_path)
    S2 = fetcher.fetchPages(folder, S2_path)

    # We use a document representation based on the TF-IDF model.
    TF_IDF = Vectorizer()
    S1_HTML = TF_IDF.fit_transform(S1)
    S2_HTML = TF_IDF.transform(S2)  # reuse the vocabulary fitted on S1 so both matrices share one feature space
    pageAllignament = PageAllignament()
    S1S2_Pairs = pageAllignament.allignSources(S1_HTML, S2_HTML)

    print("Stats of: " + str(S1_path) + " and " + str(S2_path))
    evaluation_pipeline(S1S2_Pairs)
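
Page alignment compares documents across the two sources, so both matrices must live in one TF-IDF feature space: the model is fitted once on S1 and only applied to S2. A minimal sketch of that pattern, assuming Vectorizer wraps a scikit-learn-style TF-IDF model (the toy inputs are hypothetical):

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
S1_docs = ["first source page", "another page"]  # hypothetical toy inputs
S2_docs = ["second source page"]
S1_matrix = tfidf.fit_transform(S1_docs)  # learn the vocabulary on S1
S2_matrix = tfidf.transform(S2_docs)      # reuse it so the columns line up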
Example #2
def vectorize_jobs(df_jobs, vectorizer_path, tfidfs_path, debug=False):
    # Initialize the TF-IDF vectorizer.
    if debug:
        print('[Job Vectorization 2/5] Initializing Vectorizer \n')
    vectorizer = Vectorizer()

    if debug:
        print('[Job Vectorization 3/5] Transforming/Vectorizing data \n')
    tfidf_jobs = vectorizer.fit_transform(
        df_jobs['text'])  # fit the vectorizer and transform the job texts

    if debug:
        print('[Job Vectorization 4/5] Saving vectorizer to {path} \n'.format(
            path=vectorizer_path))
    vectorizer.save_vectorizer(vectorizer_path)

    if debug:
        print('[Job Vectorization 5/5] Saving tfidf to {path} \n'.format(
            path=tfidfs_path))
    vectorizer.save_tfidfs(tfidf_jobs, tfidfs_path)
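
The snippet assumes a project-specific Vectorizer with save_vectorizer and save_tfidfs methods. A minimal sketch of what such a wrapper might look like, assuming it is built on scikit-learn's TfidfVectorizer and joblib (the real class is not shown on this page):

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

class Vectorizer(object):
    """Thin TF-IDF wrapper that can persist itself and its matrices."""

    def __init__(self):
        self._tfidf = TfidfVectorizer()

    def fit_transform(self, texts):
        return self._tfidf.fit_transform(texts)

    def save_vectorizer(self, path):
        joblib.dump(self._tfidf, path)  # persist the fitted model

    def save_tfidfs(self, matrix, path):
        joblib.dump(matrix, path)       # persist the sparse TF-IDF matrix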
Example #3
def main():
    # Load the training data
    with timer("train data load"):
        df = load_data_from_gcs()

    # Preprocessing
    with timer("preprocess"):
        df = preprocess(df)
        vectorizer = Vectorizer()

    X_train = df.drop(columns="price")
    y_train = df["price"]

    with timer("training"):
        X_train = vectorizer.fit_transform(X_train)

        # Train the model
        base_params = {
            'input_dropout': 0.2,
            'hidden_layers': 3,
            'hidden_units': 256,
            'hidden_activation': 'relu',
            'hidden_dropout': 0.2,
            'batch_norm': 'before_act',
            'optimizer': {
                'type': 'adam',
                'lr': 5e-5
            },
            'batch_size': 64,
        }

        model = ModelMLP(base_params)
        model.fit(X_train, y_train)

    with timer("save model"):
        # Save the model and the pipeline
        vectorizer.save_vectorizer()
        model.save_model()
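
ModelMLP is project-specific and not shown here. A minimal sketch of how base_params could map onto a Keras regression MLP, assuming 'before_act' means batch normalization between the linear layer and its activation (an assumption about the architecture, not the project's actual code):

from tensorflow import keras
from tensorflow.keras import layers

def build_mlp(params, input_dim):
    """Builds a regression MLP from a base_params-style dictionary."""
    model = keras.Sequential()
    model.add(layers.InputLayer(input_shape=(input_dim,)))
    model.add(layers.Dropout(params['input_dropout']))
    for _ in range(params['hidden_layers']):
        model.add(layers.Dense(params['hidden_units']))
        if params['batch_norm'] == 'before_act':
            model.add(layers.BatchNormalization())  # BN before the activation
        model.add(layers.Activation(params['hidden_activation']))
        model.add(layers.Dropout(params['hidden_dropout']))
    model.add(layers.Dense(1))  # single output: the predicted price
    model.compile(
        optimizer=keras.optimizers.Adam(
            learning_rate=params['optimizer']['lr']),
        loss='mean_squared_error')
    return model

# Hypothetical usage: model = build_mlp(base_params, input_dim=X_train.shape[1])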
Example #4
class Trainer(object):
    """Trains the classifier with training data and does the cross validation.
    """

    def __init__(self):
        """Initializes the datastructures required.
        """
        # The actual text extraction object (does text to vector mapping).
        self.vectorizer = Vectorizer()

        # A list of already hand classified tweets to train our classifier.
        self.data = None

        # A list containing the classification to each individual tweet
        # in the tweets list.
        self.classification = None

        self.classifier = None
        self.scores = None

    def initialize_training_data(self):
        """Initializes all types of training data we have.
        """
        with open(os.path.join(datasettings.DATA_DIRECTORY,
                               'full-corpus.csv')) as corpus_file:
            classification, tweets = parse_training_corpus(corpus_file)

        reviews_positive = parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'positive'))

        num_positive_reviews = len(reviews_positive)
        class_positive = ['positive'] * num_positive_reviews

        reviews_negative = parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'negative'))
        num_negative_reviews = len(reviews_negative)
        class_negative = ['negative'] * num_negative_reviews

        self.data = tweets
        self.classification = classification

        #self.date_time = date_time
        #self.retweet = retweets
        #self.favorited = favorited

    def initial_fit(self):
        """Initializes the vectorizer by doing a fit and then a transform.
        """
        # We map the sentiments to the values specified in the SENTIMENT_MAP.
        # For any sentiment that is not part of the map we give a value 0.
        classification_vector = numpy.array(
            map(lambda s: SENTIMENT_MAP.get(s.lower(), 0),
                self.classification))

        feature_vector = self.vectorizer.fit_transform(self.data)

        return (classification_vector, feature_vector)

    def build_word_dict(self):
        """Builds the sentiment dictionary and a vector of per-tweet weights.
        """
        # Merge both AFINN word lists into a single word -> score dictionary.
        wordDict = {}
        for filename in ('AFINN-96.txt', 'AFINN-111.txt'):
            fileIn = open(os.path.join(datasettings.DATA_DIRECTORY, filename))
            for line in fileIn:
                word, value = line.split('\t')
                wordDict[word] = int(value)
            fileIn.close()

        # Score each tweet as the sum of the AFINN scores of its known words.
        word_dict_vector = []
        for tweet in self.data:
            score = 0
            for word in tweet.split():
                if word in wordDict:
                    score += wordDict[word]
            word_dict_vector.append(score)

        return word_dict_vector

    def transform(self, test_data):
        """Performs the transform using the already initialized vectorizer.
        """
        return self.vectorizer.transform(test_data)

    def score_func(self, true, predicted):
        """Score function for the validation.
        """
        # Score only the three sentiment labels and macro-average the results.
        return metrics.precision_recall_fscore_support(
            true, predicted,
            labels=[
                SENTIMENT_MAP['positive'],
                SENTIMENT_MAP['negative'],
                SENTIMENT_MAP['neutral'],
                ],
            average='macro')

    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Args:
            k: The number of folds for cross validation.
        """
        self.scores = []

        # Validate the arrays and build the CV iterator (old scikit-learn
        # cross_validation API).
        X, y = check_arrays(self.feature_vector,
                            self.classification_vector,
                            sparse_format='csr')
        cv = cross_validation.check_cv(
            k, self.feature_vector, self.classification_vector,
            classifier=True)

        for train, test in cv:
            self.classifier1.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            self.classifier2.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            self.classifier3.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            classification1 = self.classifier1.predict(
                self.feature_vector[test])
            classification2 = self.classifier2.predict(
                self.feature_vector[test])
            classification3 = self.classifier3.predict(
                self.feature_vector[test])

            # Majority vote across the three classifiers; a three-way tie
            # falls back to the first classifier's prediction.
            classification = []
            for predictions in zip(classification1, classification2,
                                   classification3):
                neutral_count = predictions.count(0)
                positive_count = predictions.count(1)
                negative_count = predictions.count(-1)
                if (neutral_count == negative_count and
                        negative_count == positive_count):
                    classification.append(predictions[0])
                elif (neutral_count > positive_count and
                        neutral_count > negative_count):
                    classification.append(0)
                elif (positive_count > neutral_count and
                        positive_count > negative_count):
                    classification.append(1)
                elif (negative_count > neutral_count and
                        negative_count > positive_count):
                    classification.append(-1)
            classification = numpy.array(classification)

            self.scores.append(self.score_func(y[test], classification))

    def train_and_validate(self, cross_validate=False, mean=False,
                           serialize=False):
        """Trains the SVC with the training data and validates with the test data.

        We do a K-Fold cross validation with K = 10.
        """
        self.classification_vector, self.feature_vector = self.initial_fit()

        self.classifier1 = naive_bayes.MultinomialNB()
        self.classifier2 = naive_bayes.BernoulliNB()
        self.classifier3 = svm.LinearSVC(loss='l2', penalty='l1',
                                         C=1000, dual=False, tol=1e-3)

        if cross_validate:
            self.cross_validate(k=cross_validate)
        else:
            self.classifier1.fit(self.feature_vector,
                                 self.classification_vector)
            self.classifier2.fit(self.feature_vector,
                                 self.classification_vector)
            self.classifier3.fit(self.feature_vector,
                                 self.classification_vector)

        if serialize:
            # Use context managers so the pickle files are flushed and closed.
            with open(os.path.join(datasettings.DATA_DIRECTORY,
                                   'classifiers.pickle'), 'wb') as classifiers_file:
                cPickle.dump([self.classifier1,
                              self.classifier2,
                              self.classifier3], classifiers_file)
            with open(os.path.join(datasettings.DATA_DIRECTORY,
                                   'vectorizer.pickle'), 'wb') as vectorizer_file:
                cPickle.dump(self.vectorizer, vectorizer_file)

        return self.scores

    def build_ui(self, mean=False):
        """Prints out all the scores calculated.
        """
        for i, score in enumerate(self.scores):
            print "Cross Validation: %d" % (i + 1)
            print "*" * 40
            if mean:
                print "Mean Accuracy: %f" % (score)
            else:
                print "Precision\tRecall\t\tF-Score"
                print "~~~~~~~~~\t~~~~~~\t\t~~~~~~~"
                precision = score[0]
                recall = score[1]
                f_score = score[2]
                print "%f\t%f\t%f" % (precision, recall, f_score)


            print
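
The hand-rolled majority vote in cross_validate can also be expressed with scikit-learn's VotingClassifier. A minimal sketch under modern scikit-learn (not the original project's code; note that VotingClassifier breaks ties by ascending class-label order rather than deferring to the first classifier):

from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

ensemble = VotingClassifier(
    estimators=[
        ('mnb', MultinomialNB()),
        ('bnb', BernoulliNB()),
        ('svc', LinearSVC(C=1000, dual=False, tol=1e-3)),
    ],
    voting='hard')  # hard voting = majority rule over predicted labels

# 10-fold cross validation over the TF-IDF features and sentiment labels:
# scores = cross_val_score(ensemble, feature_vector, classification_vector, cv=10)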
Example #5
        else:
            res += pred

        # next_hidden = sess.run(tensors['next_hidden'], feed_dict=feed_dict)
        # initial_state = np.vstack((initial_state, next_hidden))[1:]

    return res


if __name__ == '__main__':
    print 'Loading data...'
    with open('../../data/smalldata.txt', 'r') as f:
        data = [line.strip() for line in f]
    vectorizer = Vectorizer(seq_length=25)
    print 'Fitting Vectorizer...'
    X_data, y_data = vectorizer.fit_transform(data)

    with open('vectorizer.pkl', 'wb') as f:  # pickle needs a binary-mode file
        pickle.dump(vectorizer, f)

    N, seq_length, input_dim = X_data.shape
    hidden_dim = 128
    output_dim = input_dim

    X = tf.placeholder(tf.float32, [None, seq_length, input_dim], 'X')
    y = tf.placeholder(tf.float32, [None, output_dim], 'y')
    initial_state = tf.placeholder(tf.float32, [None, 2 * hidden_dim], 'initial_state')
    
    lstm, next_hidden = lstm_layer(X, input_dim, seq_length, hidden_dim, 
                                   output_dim, initial_state, 'lstm')
    with tf.name_scope('predictions'):
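
X_data has shape (N, seq_length, input_dim), which suggests a character-level one-hot encoding with next-character targets. A minimal sketch of such a vectorizer (hypothetical; the project's actual Vectorizer is not shown on this page):

import numpy as np

class CharVectorizer(object):
    """One-hot encodes sliding character windows and next-char targets."""

    def __init__(self, seq_length):
        self.seq_length = seq_length

    def fit_transform(self, lines):
        text = '\n'.join(lines)
        self.chars = sorted(set(text))
        self.index = {c: i for i, c in enumerate(self.chars)}
        dim = len(self.chars)
        X, y = [], []
        for i in range(len(text) - self.seq_length):
            window = text[i:i + self.seq_length]
            one_hot = np.zeros((self.seq_length, dim), dtype=np.float32)
            for j, c in enumerate(window):
                one_hot[j, self.index[c]] = 1.0
            X.append(one_hot)
            target = np.zeros(dim, dtype=np.float32)
            target[self.index[text[i + self.seq_length]]] = 1.0
            y.append(target)
        return np.array(X), np.array(y)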
Example #6
        output_nonlinearity=softmax,
        update=nesterov_momentum,
        update_learning_rate=0.1,
        update_momentum=0.9,
        # update=adam,
        # update_learning_rate=0.01,
        max_epochs=10000,
        on_epoch_finished=[SaveBestModel('rnn', vectorizer)],
        batch_iterator_train=BatchIterator(batch_size),
        train_split=TrainSplit(eval_size=0.0),
        regression=False,
        verbose=2)
    return net


if __name__ == '__main__':
    print 'Loading data...'
    with open('data/data.txt', 'r') as f:
        data = [line.strip() for line in f]
    vectorizer = Vectorizer(seq_length=25)
    print 'Fitting Vectorizer...'
    X, y = vectorizer.fit_transform(data)
    with open('vectorizer.pkl', 'wb') as f:  # binary mode for pickle
        pickle.dump(vectorizer, f)
    print 'Training Model...'
    net = build_net(vectorizer)
    try:
        net.fit(X, y)
    except KeyboardInterrupt:
        pass