import pickle

import pandas as pd

from classifier_config import ClassifierConfig
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier

# Configuration for the Dutch test-set run: data is read from 'data/test', no class
# balancing is requested, no hold-out split is carved off (testset_ratio=0.0) and the
# input is flagged as test data. The first argument is presumably a spaCy model name.
# NOTE(review): the variable is called ``german_config`` although it configures Dutch;
# the name is kept because code outside this excerpt may still refer to it.
german_config = ClassifierConfig(
    'nl_core_news_sm',
    "dutch",
    'data/test',
    balancing_strategy="none",
    testset_ratio=0.0,
    logger='dutch_testset',
    is_testdata=True,
)

# Select the features to compute. Each builder call is chained on the result of the
# previous one, so the value bound here is whatever the last call returns.
# Parentheses replace the backslash continuations, which break as soon as anything
# (even a space) follows the backslash.
feature_extractor = (
    FeatureExtractor()
    .first_word()
    .similarity()
    .diff_pos_count()
    .tfidf()
    .ont_hot_pos()
    .matching_lemma()
    .count_each_pos()
    .cosine()
    .jaccard()
    .difference_in_length()
)

# The trainer receives the configuration and the logger taken from that configuration.
model_trainer = ModelTrainer(german_config, german_config.logger)
from model_trainer import ModelTrainer, BaseClassifier from wsa_classifier import WordSenseAlignmentClassifier def configure(): pd.set_option('display.max_colwidth', -1) logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) if __name__ == '__main__': configure() german_config = ClassifierConfig('de_core_news_md', "german", 'data/train', balancing_strategy="swap", testset_ratio=0.2, logger='de_all_features_swap', with_testset=True) feature_extractor = FeatureExtractor() \ .first_word() \ .similarity() \ .diff_pos_count() \ .tfidf() \ .ont_hot_pos() \ .matching_lemma() \ .count_each_pos() \ .cosine() \ .jaccard() \ .difference_in_length()\
from model_trainer import ModelTrainer from wsa_classifier import WordSenseAlignmentClassifier def configure(): pd.set_option('display.max_colwidth', -1) logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) if __name__ == '__main__': configure() english_config = ClassifierConfig('en_core_web_lg', "english", 'data/train', balancing_strategy="none", testset_ratio=0.0, with_wordnet=True, dataset='english_nuig', logger='en_nuig_split_biggest') feature_extractor = FeatureExtractor() \ .diff_pos_count() \ .ont_hot_pos() \ .matching_lemma() \ .count_each_pos() \ .avg_count_synsets() \ .difference_in_length()\ .similarity_diff_to_target()\ .max_dependency_tree_depth() \ .target_word_synset_count()\ .token_count_norm_diff()\
import pickle

import pandas as pd

from classifier_config import ClassifierConfig
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier

# Configuration for the German test-set run: data is read from 'data/test', no class
# balancing is requested, no hold-out split is carved off (testset_ratio=0.0) and the
# input is flagged as test data. The first argument is presumably a spaCy model name.
german_config = ClassifierConfig(
    'de_core_news_md',
    "german",
    'data/test',
    balancing_strategy="none",
    testset_ratio=0.0,
    logger='de_testset',
    is_testdata=True,
)

# Select the features to compute. Each builder call is chained on the result of the
# previous one, so the value bound here is whatever the last call returns.
# Parentheses replace the backslash continuations, which break as soon as anything
# (even a space) follows the backslash.
feature_extractor = (
    FeatureExtractor()
    .first_word()
    .similarity()
    .diff_pos_count()
    .tfidf()
    .ont_hot_pos()
    .matching_lemma()
    .count_each_pos()
    .cosine()
    .jaccard()
    .difference_in_length()
)

# The trainer receives the configuration and the logger taken from that configuration.
model_trainer = ModelTrainer(german_config, german_config.logger)
import pickle import pandas as pd from classifier_config import ClassifierConfig from feature_extractor import FeatureExtractor from model_trainer import ModelTrainer from wsa_classifier import WordSenseAlignmentClassifier german_config = ClassifierConfig('en_core_web_lg', "english", 'data/test', balancing_strategy="split_biggest", testset_ratio=0.0, with_wordnet=True, dataset='english_nuig', logger='en_nuig', is_testdata=True) feature_extractor = FeatureExtractor() \ .first_word() \ .similarity() \ .diff_pos_count() \ .tfidf() \ .ont_hot_pos() \ .matching_lemma() \ .count_each_pos() \ .cosine() \ .jaccard() \ .avg_count_synsets() \ .difference_in_length() \
from classifier_config import ClassifierConfig from feature_extractor import FeatureExtractor from model_trainer import ModelTrainer from wsa_classifier import WordSenseAlignmentClassifier def configure(): pd.set_option('display.max_colwidth', -1) logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) if __name__ == '__main__': configure() german_config = ClassifierConfig('de_core_news_md', "german", 'data/train', balancing_strategy="none",testset_ratio=0.2, logger = 'de_all_features_nonebalance') feature_extractor = FeatureExtractor() \ .first_word() \ .similarity() \ .diff_pos_count() \ .tfidf() \ .ont_hot_pos() \ .matching_lemma() \ .count_each_pos() \ .cosine() \ .jaccard() \ .difference_in_length() lr = {'estimator': LogisticRegression(), 'parameters': {
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier


def configure():
    """Prepare pandas display settings and logging for a run.

    Lifts the pandas column-width limit and initialises logging at the level named by
    the ``LOGLEVEL`` environment variable, defaulting to ``"INFO"`` when it is unset.
    """
    try:
        # ``None`` is the documented "no limit" value; the old ``-1`` sentinel has been
        # deprecated since pandas 1.0.
        pd.set_option('display.max_colwidth', None)
    except ValueError:
        # Presumably a pre-1.0 pandas that only accepts integers here; fall back to
        # the legacy sentinel the script originally used.
        pd.set_option('display.max_colwidth', -1)
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


if __name__ == '__main__':
    configure()

    # Configuration for the Dutch training run: data is read from 'data/train', no
    # class balancing is requested and testset_ratio is 0.2.
    # NOTE(review): the variable is called ``german_config`` although it configures
    # Dutch; the name is kept because code outside this excerpt may refer to it.
    german_config = ClassifierConfig(
        'nl_core_news_sm',
        "dutch",
        'data/train',
        balancing_strategy="none",
        testset_ratio=0.2,
        logger='dutch_all_features_nonebalance',
    )

    # Select the features to compute. Each builder call is chained on the result of
    # the previous one; parentheses replace the fragile backslash continuations.
    feature_extractor = (
        FeatureExtractor()
        .first_word()
        .similarity()
        .diff_pos_count()
        .tfidf()
        .ont_hot_pos()
        .matching_lemma()
        .count_each_pos()
        .cosine()
        .jaccard()
        .difference_in_length()
    )
from classifier_config import ClassifierConfig
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier


def configure():
    """Prepare pandas display settings and logging for a run.

    Lifts the pandas column-width limit and initialises logging at the level named by
    the ``LOGLEVEL`` environment variable, defaulting to ``"INFO"`` when it is unset.
    """
    try:
        # ``None`` is the documented "no limit" value; the old ``-1`` sentinel has been
        # deprecated since pandas 1.0.
        pd.set_option('display.max_colwidth', None)
    except ValueError:
        # Presumably a pre-1.0 pandas that only accepts integers here; fall back to
        # the legacy sentinel the script originally used.
        pd.set_option('display.max_colwidth', -1)
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


if __name__ == '__main__':
    configure()

    # Configuration for the English training run: data is read from '../data/train',
    # no class balancing is requested, testset_ratio is 0.2 and WordNet support is
    # switched on. No logger name is passed in this variant.
    english_config = ClassifierConfig(
        'en_core_web_lg',
        "english",
        '../data/train',
        balancing_strategy="none",
        testset_ratio=0.2,
        with_wordnet=True,
    )

    # Select the features to compute. Each builder call is chained on the result of
    # the previous one; parentheses replace the fragile backslash continuations.
    feature_extractor = (
        FeatureExtractor()
        .first_word()
        .similarity()
        .diff_pos_count()
        .tfidf()
        .ont_hot_pos()
        .matching_lemma()
        .count_each_pos()
        .cosine()
        .jaccard()
        .similarity_diff_to_target()
        .avg_count_synsets()
        .difference_in_length()
    )
from classifier_config import ClassifierConfig from feature_extractor import FeatureExtractor from model_trainer import ModelTrainer from wsa_classifier import WordSenseAlignmentClassifier def configure(): pd.set_option('display.max_colwidth', -1) logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) if __name__ == '__main__': configure() english_config = ClassifierConfig('en_core_web_lg', "english", 'data/train', balancing_strategy="divide", testset_ratio=0.0, with_wordnet= True, dataset='english_nuig', logger = 'en_nuig_overfithandling') feature_extractor = FeatureExtractor() \ .first_word() \ .similarity() \ .diff_pos_count() \ .tfidf() \ .ont_hot_pos() \ .matching_lemma() \ .count_each_pos() \ .cosine() \ .jaccard() \ .avg_count_synsets() \ .difference_in_length()\ .similarity_diff_to_target()\ .max_dependency_tree_depth() \
from feature_extractor import FeatureExtractor from model_trainer import ModelTrainer, BaseClassifier from wsa_classifier import WordSenseAlignmentClassifier def configure(): pd.set_option('display.max_colwidth', -1) logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) if __name__ == '__main__': configure() german_config = ClassifierConfig('de_core_news_md', "german", 'data/train', balancing_strategy="oversampling", testset_ratio=0.0, logger='de_all_features_oversampling') feature_extractor = FeatureExtractor() \ .first_word() \ .similarity() \ .diff_pos_count() \ .tfidf() \ .ont_hot_pos() \ .matching_lemma() \ .count_each_pos() \ .cosine() \ .jaccard() \ .difference_in_length()\