def main():
    path = os.path.join('..', '..', 'dataset', 'eRISK2020_T1_training_data', 'td')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path)
    corpus_reader_train.load()
    print("Corpus Reader for training created")

    path = os.path.join('..', '..', 'dataset', 'T1_test_data', 'td')
    gt_name = 'T1_erisk_golden_truth.txt'
    corpus_reader_test = CorpusReader(path, gt_name)
    corpus_reader_test.load()

    # Concatenate every post of each training subject into one document.
    all_texts = [
        ''.join(map(str, subject.posts))
        for subject in corpus_reader_train.subjects
    ]
    all_gt = [subject.gt for subject in corpus_reader_train.subjects]

    # Bag-of-words features over unigrams and bigrams.
    count_vectorizer = CountVectorizer(analyzer='word', token_pattern=r'\w+',
                                       ngram_range=(1, 2))
    bow = dict()
    bow["train"] = (count_vectorizer.fit_transform(all_texts), all_gt)

    lr_classifier = LogisticRegression(solver='liblinear')
    lr_classifier.fit(*bow["train"])

    matrix = Matrix(len(corpus_reader_test.subjects), corpus_reader_test.subjects)
    args = {'matrix': matrix, 'vec': count_vectorizer, 'class': lr_classifier}
    matrix = run_simulation(args)
    print(matrix)

    # Analyse the results.
    precision = measures.calc_precision(corpus_reader_test.subjects, matrix)
    recall = measures.calc_recall(corpus_reader_test.subjects, matrix)
    f1 = measures.calc_f1(precision, recall)
    ERDE = measures.calc_ERDE(corpus_reader_test.subjects, matrix)
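# A sketch of the metric that measures.calc_ERDE is assumed to compute: the
# standard ERDE_o (Early Risk Detection Error) from the eRisk lab. The flat
# list interface, the function name `erde`, and the default costs are
# assumptions -- the real measures module works on the Matrix object, and
# c_fp is usually set to the positive-class prior of the collection.
import math

def erde(gold, decisions, delays, o=50, c_fp=0.13, c_fn=1.0, c_tp=1.0):
    """Mean ERDE_o: gold/decisions are 0/1 labels, delays[i] is the number
    of posts seen before subject i was flagged."""
    errors = []
    for g, d, k in zip(gold, decisions, delays):
        if d == 1 and g == 0:
            errors.append(c_fp)                      # false positive
        elif d == 0 and g == 1:
            errors.append(c_fn)                      # false negative
        elif d == 1 and g == 1:
            # A correct alert still pays a cost that grows with the delay.
            errors.append((1 - 1 / (1 + math.exp(k - o))) * c_tp)
        else:
            errors.append(0.0)                       # true negative
    return sum(errors) / len(errors)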
def run(self):
    corpus_reader = CorpusReader(self.path)
    corpus_reader.load()
    analyser = SentimentIntensityAnalyzer()
    num_subs = len(corpus_reader.subjects)
    for done, sub in enumerate(corpus_reader.subjects):
        print(f"Number of subjects left: {num_subs - done}")
        for post in sub.posts:
            score = analyser.polarity_scores(str(post))
            if abs(score['compound']) > self.threshold:
                words = str(post).split()
                # Scan every n-gram of length 1 to 3 and keep those whose
                # compound sentiment also exceeds the threshold.
                for n in range(1, 4):
                    for i in range(len(words) - n + 1):
                        ngram = ' '.join(words[i:i + n])
                        if abs(analyser.polarity_scores(ngram)['compound']) > self.threshold:
                            if ngram not in self.imp_words:
                                self.imp_words.append(ngram)
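# Hypothetical usage of the extractor above. The class name ImportantWords
# and its constructor signature are assumptions; the method only requires
# `path`, `threshold` and an `imp_words` list on the instance.
extractor = ImportantWords(path=os.path.join('..', 'data', 'erisk-2021-t2'),
                           threshold=0.5)   # |compound| > 0.5 = strongly polar
extractor.run()
print(extractor.imp_words[:20])             # first 20 polar n-grams found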
def train_model4(classifier):
    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data',
                         'T1')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")
    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token("yake")

    # Sentence embeddings from a pretrained SentenceTransformer model.
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')
    bert_transformer = BigBird(bert_model)
    sentiment = Sentiment()

    print("Initializing Training")
    model = Pipeline([
        ('emojis', emo),
        ('tokenizer', token),
        ('union', FeatureUnion(transformer_list=[
            ("vectorizer", bert_transformer),
            ("sentiment", sentiment),
        ])),
        ("classifier", classifier),
    ])

    # Split the subjects into batches of 40 users.
    batch_size = 40
    num_users = len(corpus_reader_train.subjects)
    all_texts = list()
    all_gt = list()
    for start in range(0, num_users, batch_size):
        batch = corpus_reader_train.subjects[start:start + batch_size]
        all_texts.append([subject.posts for subject in batch])
        all_gt.append([subject.gt for subject in batch])

    for i in range(len(all_texts)):
        model.fit(all_texts[i], all_gt[i])

    # (An identical batching loop over corpus_reader_test.subjects was left
    # commented out here; the labelled eRisk 2020 test collection can serve
    # as additional training data.)

    print("End of training")

    # It is important to open the file in binary mode for pickling.
    with open(f'model4_{classifier.__class__.__name__}.sav', 'wb') as dbfile:
        pickle.dump(model, dbfile)
    return model
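# The batch loop above does not accumulate knowledge: Pipeline.fit()
# retrains every step from scratch on each call, so only the last batch
# determines the final model. A minimal sketch of truly incremental
# training, assuming the FeatureUnion yields a numeric matrix and swapping
# the classifier for one that supports partial_fit (SGDClassifier with
# logistic loss; not the author's method):
import numpy as np
from sklearn.linear_model import SGDClassifier

features = Pipeline([
    ('emojis', Emojis()),
    ('tokenizer', Token("yake")),
    ('union', FeatureUnion(transformer_list=[
        ("vectorizer", BigBird(SentenceTransformer('paraphrase-mpnet-base-v2'))),
        ("sentiment", Sentiment()),
    ])),
])
incremental = SGDClassifier(loss='log_loss')
for texts, gt in zip(all_texts, all_gt):      # batches built as above
    X = features.fit_transform(texts, gt)     # note: refits transformers per batch
    incremental.partial_fit(X, gt, classes=np.array([0, 1]))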
def train_model1(classifier):
    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data',
                         'T1')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")
    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token("normal")

    # Sentence embeddings from a pretrained SentenceTransformer model.
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')
    bert_transformer = BigBird(bert_model)
    sentiment = Sentiment()

    print("Initializing Training")
    model = Pipeline([
        ('emojis', emo),
        ('tokenizer', token),
        ('union', FeatureUnion(transformer_list=[
            ("vectorizer", bert_transformer),
            # ("sentiment", sentiment),  # sentiment features disabled in model1
        ])),
        ("classifier", classifier),
    ])

    batch_size = 40
    num_users = len(corpus_reader_train.subjects)
    # Train on growing prefixes of each user's post history (50, 100, ...).
    # NOTE: fit() retrains from scratch on every call, so only the last
    # batch of the last prefix length determines the final model.
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        for start in range(0, num_users, batch_size):
            batch = corpus_reader_train.subjects[start:start + batch_size]
            all_texts.append([subject.posts[0:j] for subject in batch])
            all_gt.append([subject.gt for subject in batch])
        for i in range(len(all_texts)):
            model.fit(all_texts[i], all_gt[i])

    # The eRisk 2020 test collection ships with golden-truth labels, so it
    # is used here as additional training data.
    num_users = len(corpus_reader_test.subjects)
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        for start in range(0, num_users, batch_size):
            batch = corpus_reader_test.subjects[start:start + batch_size]
            all_texts.append([subject.posts[0:j] for subject in batch])
            all_gt.append([subject.gt for subject in batch])
        for i in range(len(all_texts)):
            model.fit(all_texts[i], all_gt[i])

    print("End of training")

    # It is important to open the file in binary mode for pickling.
    with open(f'model1_{classifier.__class__.__name__}.sav', 'wb') as dbfile:
        pickle.dump(model, dbfile)
    return model
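# Reloading a pickled pipeline and scoring a single user's post history.
# The filename assumes an SVC classifier was passed in; predict_proba
# additionally assumes it was built with SVC(probability=True).
with open('model1_SVC.sav', 'rb') as dbfile:
    model = pickle.load(dbfile)
some_subject = corpus_reader_test.subjects[0]       # any loaded subject
decision = model.predict([some_subject.posts])      # hard 0/1 decision
scores = model.predict_proba([some_subject.posts])  # [[p_negative, p_positive]]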
def train6():
    # Start a fresh log file.
    with open("log.txt", 'w') as f:
        pass

    path1 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_T1_training_data',
                         'eRISK2020_training_data')
    path2 = os.path.join('..', 'data', 'erisk-2021-t2', 'training',
                         'eRisk2020_T1_test_data', 'eRisk2020_T1_test_data',
                         'T1')
    print("Creating Corpus Reader for training")
    corpus_reader_train = CorpusReader(path1)
    corpus_reader_train.load()
    print("Corpus Reader for training created")
    corpus_reader_test = CorpusReader(path2)
    corpus_reader_test.load()
    print("Corpus Reader for testing created")

    emo = Emojis()
    token = Token()

    # Sentence embeddings from a pretrained SentenceTransformer model.
    bert_model = SentenceTransformer('paraphrase-mpnet-base-v2')
    bert_transformer = BigBird(bert_model)
    sentiment = Sentiment()

    print("Initializing Training")
    # Grid search over the gradient-boosting hyperparameters.
    parameters = {
        'classifier__n_estimators': [50, 100, 500, 1000],
        'classifier__learning_rate': [0.001, 0.01, 0.1, 1.0],
        'classifier__max_depth': [1, 3, 5, 10],
    }
    classifier = GradientBoostingClassifier()
    model = Pipeline([
        ('emojis', emo),
        # ('tokenizer', token),  # tokenizer step disabled in this variant
        ('union', FeatureUnion(transformer_list=[
            ("vectorizer", bert_transformer),
            ("sentiment", sentiment),
        ])),
        ("classifier", classifier),
    ])
    clf = GridSearchCV(model, parameters)

    batch_size = 40
    num_users = len(corpus_reader_train.subjects)
    # NOTE: each clf.fit() call reruns the whole grid search from scratch,
    # so only the final batch of the final prefix length is kept.
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        for start in range(0, num_users, batch_size):
            batch = corpus_reader_train.subjects[start:start + batch_size]
            all_texts.append([subject.posts[0:j] for subject in batch])
            all_gt.append([subject.gt for subject in batch])
        for i in range(len(all_texts)):
            clf.fit(all_texts[i], all_gt[i])

    # The labelled eRisk 2020 test collection is used as extra training data.
    num_users = len(corpus_reader_test.subjects)
    for j in range(50, 2000, 50):
        all_texts = list()
        all_gt = list()
        for start in range(0, num_users, batch_size):
            batch = corpus_reader_test.subjects[start:start + batch_size]
            all_texts.append([subject.posts[0:j] for subject in batch])
            all_gt.append([subject.gt for subject in batch])
        for i in range(len(all_texts)):
            clf.fit(all_texts[i], all_gt[i])

    print("End of training")
    return clf
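# Once train6() has run, the fitted GridSearchCV object exposes the winning
# hyper-parameter combination, its mean cross-validated score, and a
# refitted copy of the whole pipeline:
clf = train6()
print(clf.best_params_)             # e.g. {'classifier__max_depth': ..., ...}
print(clf.best_score_)              # mean CV score of the best combination
best_model = clf.best_estimator_    # refitted Pipeline, ready for predict()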
#mod4 = pickle.load(open(MODEL4_NAME, 'rb'))
#mod5 = pickle.load(open(MODEL5_NAME, 'rb'))
#mod6 = pickle.load(open(MODEL6_NAME, 'rb'))
#mod7 = pickle.load(open(MODEL7_NAME, 'rb'))

path = os.path.join('..', 'data', 'erisk-2021-t2')
gt_name = 'golden_truth.txt'  # (not passed to CorpusReader here)
corpus_reader_test = CorpusReader(path)
corpus_reader_test.load()

# Dump the subject ids and their golden-truth labels.
with open("file.txt", 'w') as f:
    for sub in corpus_reader_test.subjects:
        f.write("{} - {}\n".format(sub.id, sub.gt))

filename = "RESULTS_TEST_more_model3_no_token_param.txt"
# Truncate the results file.
with open(filename, 'w') as file:
    pass

# Find the greatest number of posts over all test subjects.
posts_max = max(len(s.posts) for s in corpus_reader_test.subjects)
print(posts_max)
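# A sketch of the round-by-round evaluation loop that posts_max presumably
# bounds: in each round every subject releases one more post, the model
# scores the history seen so far, and the decision is appended to the
# results file. MODEL3_NAME and the output format are assumptions inferred
# from the filename above.
mod3 = pickle.load(open(MODEL3_NAME, 'rb'))
for round_no in range(posts_max):
    with open(filename, 'a') as out:
        for sub in corpus_reader_test.subjects:
            history = sub.posts[:round_no + 1]        # posts seen so far
            decision = int(mod3.predict([history])[0])
            out.write(f"{sub.id} {round_no} {decision}\n")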