# NOTE(review): the opening of the first statement was truncated in the
# source; it is reconstructed here from the identical extraction statement
# that appears in the next cell.

# Keep only headlines from the five selected publishers, and only the two
# columns used downstream.
df = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek',
                                  'Contactmusic.com', 'Daily Mail']),
            ['TITLE', 'CATEGORY']]

# 80/10/10 train/valid/test split, stratified on the label so every split
# keeps the same category proportions; fixed random_state for reproducibility.
train, valid_test = train_test_split(
    df, test_size=0.2, shuffle=True, random_state=123,
    stratify=df['CATEGORY'])
valid, test = train_test_split(
    valid_test, test_size=0.5, shuffle=True, random_state=123,
    stratify=valid_test['CATEGORY'])

# Mapping from category letter to integer class id.
category_dict = {'b': 0, 't': 1, 'e': 2, 'm': 3}


def _titles_to_features(titles):
    """Stack one word2vec sentence vector per title into a single tensor."""
    return torch.stack([transform_w2v(title) for title in titles])


def _categories_to_labels(categories):
    """Encode category letters as integer class ids in a tensor."""
    return torch.tensor(
        categories.map(lambda letter: category_dict[letter]).values)


# Feature matrices (one vector per headline).
X_train = _titles_to_features(train['TITLE'])
X_valid = _titles_to_features(valid['TITLE'])
X_test = _titles_to_features(test['TITLE'])

# Label vectors.
y_train = _categories_to_labels(train['CATEGORY'])
y_valid = _categories_to_labels(valid['CATEGORY'])
y_test = _categories_to_labels(test['CATEGORY'])

# Wrap each (features, labels) pair as a Dataset for use with a DataLoader.
dataset_train = NewsDataset(X_train, y_train)
dataset_valid = NewsDataset(X_valid, y_valid)
dataset_test = NewsDataset(X_test, y_test)
from torch import nn

# Load the news aggregator corpus (tab-separated, no header row).
df = pd.read_csv(
    './../chapter06/data/NewsAggregatorDataset/newsCorpora_re.csv',
    header=None, sep='\t',
    names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY',
           'HOSTNAME', 'TIMESTAMP'])

# Keep only headlines from the five selected publishers, and only the two
# columns used downstream.
df = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek',
                                  'Contactmusic.com', 'Daily Mail']),
            ['TITLE', 'CATEGORY']]

# 80/10/10 train/valid/test split, stratified on the label; fixed
# random_state for reproducibility.
train, valid_test = train_test_split(
    df, test_size=0.2, shuffle=True, random_state=123,
    stratify=df['CATEGORY'])
valid, test = train_test_split(
    valid_test, test_size=0.5, shuffle=True, random_state=123,
    stratify=valid_test['CATEGORY'])

# Training feature matrix: one word2vec sentence vector per headline.
X_train = torch.stack([transform_w2v(text) for text in train['TITLE']])


class SGLNet(nn.Module):
    """Single-layer network: one bias-free fully-connected layer."""

    def __init__(self, input_size, output_size):
        super().__init__()
        # Single linear projection with no bias term.
        self.fc = nn.Linear(input_size, output_size, bias=False)
        # Initialise the weights from a standard normal distribution N(0, 1).
        nn.init.normal_(self.fc.weight, 0.0, 1.0)

    def forward(self, x):
        # The forward pass is just the linear projection.
        return self.fc(x)