import math
import os

from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

# Project-local modules; the exact import paths are assumed from the sibling
# scripts in this repo. concat_vectors is expected to be defined alongside.
from parser import csv_parser
from schema.twitter_schema import twitter_schema
import featurizer


def lr_example():
    min_freq = 1
    n_common = 10
    pwd = os.path.dirname(os.path.abspath(__file__))
    path = pwd + '/example_data/twitter_2020-03-10_slim.csv'
    print(path)
    df = csv_parser.load_as_df(path, twitter_schema)
    df.show(3)

    # Keep only rows that have both a label (age) and a feature vector.
    converted = featurizer.convert_df_to_feature(df, n_common, min_freq).filter(
        lambda row: row['age'] is not None and row['feature'] is not None)
    # (age, sex, feature) -> LabeledPoint(label=age, features=concatenated vector)
    converted = converted.map(
        lambda row: LabeledPoint(row['age'], concat_vectors(row['feature'])))
    converted = converted.zipWithIndex()

    # Even indices form the training split, odd indices the test split; test
    # points whose dimension disagrees with the training set are dropped.
    train_rdd = converted.filter(lambda x: x[1] % 2 == 0).map(lambda x: x[0])
    feature_dim = len(train_rdd.first().features)
    test_rdd = converted.filter(lambda x: x[1] % 2 == 1) \
        .map(lambda x: x[0]) \
        .filter(lambda x: len(x.features) == feature_dim) \
        .collect()

    print("confirming dim of train rdd")
    for e in train_rdd.take(3):
        print(e.features)
        print(len(e.features))

    lrm = LinearRegressionWithSGD.train(train_rdd)

    # Test: accumulate squared error over the held-out points.
    n = len(test_rdd)
    mse = 0
    for lp in test_rdd:
        gt = lp.label
        pred = lrm.predict(lp.features)
        print(gt, pred)
        mse += (pred - gt) * (pred - gt)
    rmse = math.sqrt(mse / n)
    print('Root mean square error: ' + str(rmse))
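
# The manual RMSE loop above can also be phrased with pyspark's built-in
# RegressionMetrics. A minimal sketch, not part of the original script: it
# assumes `lrm` and the collected `test_rdd` from lr_example(), plus an active
# SparkContext `sc` to re-parallelize the collected test points.
from pyspark.mllib.evaluation import RegressionMetrics


def rmse_with_metrics(sc, lrm, test_points):
    # Pair (prediction, ground truth) for every held-out point.
    pred_and_label = sc.parallelize(
        [(float(lrm.predict(lp.features)), lp.label) for lp in test_points])
    return RegressionMetrics(pred_and_label).rootMeanSquaredError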
'''
Workflow for building the classification model
(references:
 https://qiita.com/MahoTakara/items/b3d719ed1a3665730826,
 https://qiita.com/Hironsan/items/2466fe0f344115aff177)

1. DONE: word segmentation
2. DONE: morphological analysis
3. DONE: cleaning
4. DONE: normalization (including stop-word removal)
5. DONE: dictionary construction (mapping words to IDs)
   https://qiita.com/tatsuya-miyamoto/items/f505dfa8d5307f8c6e98
   looks straightforward
6. DONE: vectorization (embeddings, or one-hot from IDs)
7. TODO: sentence feature extraction (e.g. sentence length)
8. TODO: proposed feature extraction (follower/followee features)
9. TODO: implement the classification model
10. TODO: implement evaluation metrics
11. TODO: run the experiments
'''

if __name__ == "__main__":
    import os
    pwd = os.path.dirname(os.path.abspath(__file__))

    from parser.csv_parser import load_as_df
    from schema.twitter_schema import twitter_schema

    path = pwd + '/../example_data/20190528sentences_data_integrated.csv'
    df = load_as_df(path, twitter_schema)
    df.show(3)
    # convert_df_to_feature is assumed to be defined earlier in this module.
    converted = convert_df_to_feature(df)
    for e in converted.take(3):
        print(e)
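
# Steps 5-6 of the checklist above (dictionary construction and vectorization)
# can be sketched with Spark ML's CountVectorizer. Illustrative only, not the
# project's own featurizer; the toy tokens and column names are invented.
from pyspark.ml.feature import CountVectorizer


def vectorize_sketch(spark):
    # Toy tokenized sentences standing in for the segmented tweets.
    toy = spark.createDataFrame(
        [(0, ['hello', 'world']), (1, ['hello', 'spark'])], ['id', 'words'])
    # fit() learns the word->ID dictionary (step 5); transform() emits the
    # corresponding sparse count vectors (step 6).
    cv = CountVectorizer(inputCol='words', outputCol='feature', minDF=1.0)
    model = cv.fit(toy)
    model.transform(toy).show(truncate=False)
    print(model.vocabulary)  # the learned word->ID mapping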
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from featurizer.word2vec_featurizer import Word2VecFeaturizer
from featurizer.mult_featurizer import MultiFeaturizer
from featurizer.bert_featurizer import BertFeaturizer
from featurizer.tfidf_featurizer import TfidfFeaturizer
from featurizer.onehot_featurizer import OneHotFeaturizer
from parser.csv_parser import load_as_df
from schema.twitter_schema import twitter_schema
# shape_df is assumed to come from a project-local preprocessing module.

if __name__ == '__main__':
    spark = SparkSession.builder \
        .appName('Spark SQL and DataFrame') \
        .getOrCreate()

    # Load training data.
    # dataPath = '../../example_data/twitter/20190528sentences_data_integrated.csv'
    dataPath = '../../example_data/twitter_2020-03-10.csv'
    df = load_as_df(dataPath, twitter_schema)
    converted_df = shape_df(spark, df).drop("age")
    converted_df.show(3)

    # model_path = "../../param/word2vec/entity_vector/entity_vector.model.bin"
    # wv = Word2VecFeaturizer(spark, model_path)
    # feat_df = wv.featurize(converted_df)

    model_path = "../../param/word2vec/twitter_model/w2v_gensim/word2vec_tweet.model"
    wv_tweet = Word2VecFeaturizer(spark, model_path, False)
    # feat_df = wv_tweet.featurize(converted_df)

    # model_path = "../../param/word2vec/niconico_model/nico_vec.bin"
    # wv_nico = Word2VecFeaturizer(spark, model_path, False)
    # feat_df = wv_nico.featurize(converted_df)

    # model_path = "../../param/bert/Japanese_L-24_H-1024_A-16_E-30_BPE_WWM_transformers"
    # bert = BertFeaturizer(spark, model_path)
    # feat_df = bert.featurize(converted_df)

    # Second preprocessing pass: tokenize with nagisa and drop 補助記号
    # (auxiliary symbols such as punctuation).
    converted_df2 = shape_df(spark, df, 'nagisa', ['補助記号']).drop("age")
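
# MulticlassClassificationEvaluator is imported above but not yet wired in.
# Below is a sketch of how it could score a classifier on one of the
# featurized frames; it assumes `feat_df` carries a numeric 'label' column and
# the 'feature' vector column, which the snippet above does not confirm.
from pyspark.ml.classification import LogisticRegression


def evaluate_f1(feat_df):
    # Hold out 20% of the featurized rows for evaluation.
    train, test = feat_df.randomSplit([0.8, 0.2], seed=42)
    clf = LogisticRegression(featuresCol='feature', labelCol='label')
    predictions = clf.fit(train).transform(test)
    evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction', metricName='f1')
    return evaluator.evaluate(predictions)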