Example #1
from pyspark.sql import SparkSession
from featurizer.mult_featurizer import MultiFeaturizer
from featurizer.bert_featurizer import BertFeaturizer
from featurizer.tfidf_featurizer import TfidfFeaturizer
from featurizer.onehot_featurizer import OneHotFeaturizer
from featurizer.word2vec_featurizer import Word2VecFeaturizer  # path inferred from the sibling imports
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# load_as_df, shape_df, and twitter_schema are project-local helpers whose
# imports were omitted in the original excerpt

if __name__ == '__main__':

    spark = SparkSession.builder \
        .appName('Spark SQL and DataFrame') \
        .getOrCreate()
    # Load training data
    # dataPath = '../../example_data/twitter/20190528sentences_data_integrated.csv'
    dataPath = '../../example_data/twitter_2020-03-10.csv'
    df = load_as_df(dataPath, twitter_schema)
    converted_df = shape_df(spark, df).drop("age")
    converted_df.show(3)
    # model_path = "../../param/word2vec/entity_vector/entity_vector.model.bin"
    # wv = Word2VecFeaturizer(spark, model_path)
    # feat_df = wv.featurize(converted_df)
    model_path = "../../param/word2vec/twitter_model/w2v_gensim/word2vec_tweet.model"
    wv_tweet = Word2VecFeaturizer(spark, model_path, False)
    # feat_df = wv_tweet.featurize(converted_df)
    # model_path = "../../param/word2vec/niconico_model/nico_vec.bin"
    # wv_nico = Word2VecFeaturizer(spark, model_path, False)
    # feat_df = wv_nico.featurize(converted_df)
    # model_path = "../../param/bert/Japanese_L-24_H-1024_A-16_E-30_BPE_WWM_transformers"
    # bert = BertFeaturizer(spark, model_path)
    # feat_df = bert.featurize(converted_df)
    # Re-tokenize with nagisa, filtering out the '補助記号' (symbol/punctuation) POS
    converted_df2 = shape_df(spark, df, 'nagisa', ['補助記号']).drop("age")
    tfidf = TfidfFeaturizer(spark)
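
Example #1 is cut off after the TfidfFeaturizer is constructed. Judging from the featurize() pattern the other featurizers follow in these examples, the continuation presumably looks like this (a sketch, not the original code):

# Hypothetical continuation: TfidfFeaturizer is assumed to expose the same
# featurize() -> (label, features) DataFrame interface as the other featurizers
feat_df = tfidf.featurize(converted_df2)
feat_df.show(3)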
Example #2
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from featurizer.word2vec_featurizer import Word2VecFeaturizer  # path inferred from Example #1
# load_as_df, shape_df, and twitter_schema are project-local helpers whose
# imports were omitted in the original excerpt

if __name__ == '__main__':

    spark = SparkSession.builder \
        .appName('Spark SQL and DataFrame') \
        .getOrCreate()
    # Load training data
    # dataPath = '../../example_data/twitter/20190528sentences_data_integrated.csv'
    dataPath = '../../example_data/twitter_2020-03-10.csv'
    df = load_as_df(dataPath, twitter_schema)
    converted_df = shape_df(spark, df).drop("age")
    converted_df.show(3)
    # model_path = "../../param/word2vec/entity_vector/entity_vector.model.bin"
    # wv = Word2VecFeaturizer(spark, model_path)
    # feat_df = wv.featurize(converted_df)
    model_path = "../../param/word2vec/twitter_model/w2v_gensim/word2vec_tweet.model"
    wv_tweet = Word2VecFeaturizer(spark, model_path, False)
    feat_df = wv_tweet.featurize(converted_df)
    # model_path = "../../param/bert/Japanese_L-24_H-1024_A-16_E-30_BPE_WWM_transformers"
    # bert = BertFeaturizer(spark, model_path)
    # feat_df = bert.featurize(converted_df)
    # Split the data into training and test sets (20% held out for testing)
    (trainingData, testData) = feat_df.randomSplit([0.8, 0.2], seed=2)

    gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=15)
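
The snippet ends where the GBTClassifier is created. A minimal sketch of training and evaluating it with the imports the snippet already declares (variable names beyond those above are illustrative, not from the original):

# Sketch: fit the boosted-tree model and score it on the held-out split
gbt_model = gbt.fit(trainingData)
predictions = gbt_model.transform(testData)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
print("Test accuracy = %g" % evaluator.evaluate(predictions))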
Example #3
            for word in node_list:
                tmp_list.append(word)
            if len(tmp_list) != 0:
                label_list.append(float(data[0]))
                # Wrap the tokens with BERT's special markers, then map them to vocabulary ids
                bert_tokens = bert_tokenizer.tokenize(
                    " ".join(["[CLS]"] + tmp_list + ["[SEP]"]))
                token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
                tokens_tensor = torch.tensor(token_ids).unsqueeze(0)  # add a batch dimension
                all_outputs = bert_model(tokens_tensor)
                # Mean-pool the token embeddings into one fixed-length sentence vector
                embedding = all_outputs[-2].detach().numpy()[0]
                vec = np.mean(embedding, axis=0).tolist()
                vec_list.append(Vectors.dense(vec))
        zip_list = zip(label_list, vec_list)
        new_df = self.spark.createDataFrame(zip_list, ("label", "features"))
        return new_df


if __name__ == '__main__':
    spark = SparkSession.builder\
        .appName('Spark SQL and DataFrame')\
        .getOrCreate()
    df = spark.createDataFrame([
        (21, "male", "友達が作ってくれたビネの白ドレス可愛すぎてたまらん😍"),
        (30, "female", "できればダブりたくないが初期の方のLRは避けたい"),
        (40, "male", "だから一生孤独でも構わんよ親にも作れと言われているけど"),
    ], ("age", "sex", "sentence"))
    converted_df = shape_df(spark, df).drop('age')
    data_path = "../param/bert/Japanese_L-24_H-1024_A-16_E-30_BPE_WWM_transformers"
    bert = BertFeaturizer(spark, data_path)
    result_df = bert.featurize(converted_df)
    result_df.show(3)
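
The (label, features) DataFrame that featurize() returns plugs straight into Spark ML estimators. A minimal sketch, using LogisticRegression as an illustrative (not original) downstream model:

from pyspark.ml.classification import LogisticRegression

# Sketch: train a classifier on the BERT sentence vectors
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
lr_model = lr.fit(result_df)
lr_model.transform(result_df).select("label", "prediction").show(3)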
Example #4
                tmp_list.append(word)
            if len(tmp_list) != 0:
                label_list.append(data[0])
                wakati_list.append(tmp_list)

        # Register every tokenized document, then encode each one against the
        # accumulated vocabulary (vector length == dictionary size)
        self.global_dict.add_documents(wakati_list)
        dim = len(self.global_dict)
        vec_list = []
        for wakati in wakati_list:
            vec = [0 for _ in range(dim)]
            for word in wakati:
                vec[self.global_dict.token2id[word]] = 1  # presence flag, not a count
            vec_list.append(Vectors.dense(vec))
        zip_list = zip(label_list, vec_list)
        new_df = self.spark.createDataFrame(zip_list, ("label", "features"))
        return new_df


if __name__ == '__main__':
    spark = SparkSession.builder\
        .appName('Spark SQL and DataFrame')\
        .getOrCreate()
    df = spark.createDataFrame([
        (21, "male", "友達が作ってくれたビネの白ドレス可愛すぎてたまらん😍"),
        (30, "female", "できればダブりたくないが初期の方のLRは避けたい"),
        (40, "male", "だから一生孤独でも構わんよ親にも作れと言われているけど"),
    ], ("age", "sex", "sentence"))
    converted_df = shape_df(spark, df).drop('age')
    oneHot = OneHotFeaturizer(spark)
    result_df = oneHot.featurize(converted_df)
    result_df.show(3)
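
The encoding above relies on gensim's Dictionary for the token-to-index mapping. A self-contained sketch of that mechanism (standalone example data, not from the original repo):

from gensim.corpora import Dictionary

# Sketch: Dictionary assigns an integer id per token; the featurizer then
# flips the matching positions of a zero vector to 1
docs = [["apple", "banana"], ["banana", "cherry"]]
d = Dictionary()
d.add_documents(docs)
vec = [0] * len(d)                # vector length == vocabulary size
for word in docs[0]:
    vec[d.token2id[word]] = 1     # presence flag, not a count
print(vec)                        # e.g. [1, 1, 0], depending on id assignment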