def prepare_dataset(X, y, pipeline, y_one_hot=True, y_as_is=False):
    """Transform raw observations and labels into model-ready form.

    Args:
        X: iterable of raw observations, passed to ``pipeline.fit_transform``.
        y: iterable of labels.
        pipeline: object exposing ``fit_transform`` (e.g. an sklearn Pipeline).
        y_one_hot: when True (and ``y_as_is`` is False), encode labels as
            one-hot vectors; otherwise return category indices.
        y_as_is: when True, skip label encoding entirely and return the labels
            untouched (cast to a float array when they are numeric).

    Returns:
        Tuple ``(X_transformed, y_encoded)``.
    """
    # Best effort: printing stats must never abort dataset preparation.
    # (Was a bare ``except:``, which also swallowed KeyboardInterrupt.)
    try:
        print_dataset_statistics(y)
    except Exception:
        pass

    X = pipeline.fit_transform(X)

    if y_as_is:
        # Prefer a float array, but fall back to the raw labels when they
        # are not numeric (e.g. string class names). asarray raises
        # TypeError/ValueError on non-convertible input.
        try:
            return X, numpy.asarray(y, dtype=float)
        except (TypeError, ValueError):
            return X, y

    # 1 - Labels to categories
    y_cat = labels_to_categories(y)

    if y_one_hot:
        # 2 - Labels to one-hot vectors
        return X, categories_to_onehot(y_cat)

    return X, y_cat
def __init__(self, word_indices, text_lengths, loading_data=True, datafolder="", preprocess_typ="ekphrasis", **kwargs):
    """Build the embeddings-extraction pipeline and optionally load cached data.

    Args:
        word_indices: vocabulary mapping passed to ``EmbeddingsExtractor``.
        text_lengths: maximum sequence length(s) for padding/truncation.
        loading_data: when True, unpickle train/test splits from ``datafolder``.
        datafolder: directory prefix for the pickle files (may be "").
        preprocess_typ: suffix identifying the preprocessing variant used
            when the pickles were produced (e.g. "ekphrasis").
        **kwargs: ``y_one_hot`` (bool, default True) is read here.
    """
    self.word_indices = word_indices
    self.y_one_hot = kwargs.get("y_one_hot", True)

    self.pipeline = Pipeline([
        ('ext', EmbeddingsExtractor(word_indices=word_indices,
                                    max_lengths=text_lengths,
                                    add_tokens=True,
                                    unk_policy="random"))
    ])

    if loading_data:
        print("Loading data...")

        def _load(name):
            # The original used pickle.load(open(...)) and leaked the file
            # handles; close them deterministically with a context manager.
            # NOTE(review): pickle.load is unsafe on untrusted files — these
            # are assumed to be locally produced caches.
            path = "{}{}_{}.pickle".format(datafolder, name, preprocess_typ)
            with open(path, "rb") as f:
                return pickle.load(f)

        self.X_train = _load("X_train")
        self.X_test = _load("X_test")
        self.y_train = _load("y_train")
        self.y_test = _load("y_test")

        print(
            "-------------------\ntraining set stats\n-------------------")
        print_dataset_statistics(self.y_train)
        print("-------------------")
def __init__(self, word_indices, text_lengths, subtask="A", silver=False, **kwargs):
    """Build the text-preprocessing + embeddings pipeline and load SemEval data.

    Args:
        word_indices: vocabulary mapping passed to ``EmbeddingsExtractor``.
        text_lengths: maximum sequence length(s) for padding/truncation.
        subtask: SemEval subtask identifier ("A", "B", ...).
        silver: when True, additionally load the silver (weakly labeled) set
            into ``self.silver_X`` / ``self.silver_y``.
        **kwargs: ``filter_classes`` (iterable of labels to keep, optional)
            and ``y_one_hot`` (bool, default True) are read here.
    """
    self.word_indices = word_indices
    filter_classes = kwargs.get("filter_classes", None)
    self.y_one_hot = kwargs.get("y_one_hot", True)

    self.pipeline = Pipeline([
        ('preprocess', CustomPreProcessor(
            TextPreProcessor(
                # Fix: 'url' was listed twice in the original backoff list.
                backoff=['url', 'email', 'percent', 'money', 'phone',
                         'user', 'time', 'date', 'number'],
                include_tags={"hashtag", "allcaps", "elongated", "repeated",
                              'emphasis', 'censored'},
                fix_html=True,
                segmenter="twitter",
                corrector="twitter",
                unpack_hashtags=True,
                unpack_contractions=True,
                spell_correct_elong=False,
                tokenizer=SocialTokenizer(lowercase=True).tokenize,
                dicts=[emoticons]))),
        ('ext', EmbeddingsExtractor(
            word_indices=word_indices,
            max_lengths=text_lengths,
            # NOTE(review): non-"A" subtasks pass a (False, True) flag pair
            # instead of True — confirm semantics in EmbeddingsExtractor.
            add_tokens=(False, True) if subtask != "A" else True,
            unk_policy="random"))
    ])

    # loading data
    print("Loading data...")
    dataset = SemEvalDataLoader(verbose=False).get_data(task=subtask,
                                                        years=None,
                                                        datasets=None,
                                                        only_semeval=True)
    # Fixed seed so the shuffled ordering is reproducible across runs.
    random.Random(42).shuffle(dataset)

    if filter_classes:
        dataset = [d for d in dataset if d[0] in filter_classes]

    # Each observation is a (label, text) pair.
    self.X = [obs[1] for obs in dataset]
    self.y = [obs[0] for obs in dataset]
    print("total observations:", len(self.y))

    print("-------------------\ntraining set stats\n-------------------")
    print_dataset_statistics(self.y)
    print("-------------------")

    if silver:
        print("Loading silver data...")
        dataset = SemEvalDataLoader().get_silver()
        self.silver_X = [obs[1] for obs in dataset]
        self.silver_y = [obs[0] for obs in dataset]
        print("total observations:", len(self.silver_y))