def load_instances(config, instances):
    for instance_config in config["REST_instances"]:
        instance = Instance(instance_config["name"],
                            instance_config["language"],
                            instance_config["embeddings_path"],
                            instance_config["preprocessing_style"],
                            instance_config["model_path"],
                            instance_config["labels"])
        instance.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter=instance_config["preprocessing_style"],
            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector=instance_config["preprocessing_style"],
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words
            # select a tokenizer. You can use SocialTokenizer, or pass your own;
            # the tokenizer should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text
            # with other expressions. You can pass more than one dictionary.
            dicts=[emoticons])
        instance.itos, instance.stoi, instance.vectors, instance.embeddings_size = \
            load_embeddings(instance.embeddings_path)
        instance.text = data.Field()
        instance.text.build_vocab([instance.itos])
        instance.text.vocab.set_vectors(instance.stoi, instance.vectors,
                                        instance.embeddings_size)
        instance.model = torch.load(
            instance.model_path,
            map_location='cpu' if not cuda_available else None)
        instance.model = instance.model.eval()
        instances[instance_config["name"]] = instance
def __init__(
        self,
        liwc_path: str = '',
        emolex_path: str = 'english_emolex.csv',
        estimator_path: str = 'english_twitter_politeness_estimator.joblib',
        feature_defn_path: str = 'english_twitter_additional_features.pickle',
        countVectorizer_path: str = '') -> None:
    # Preload LIWC dictionary:
    if liwc_path:
        liwc_df = pd.read_csv(liwc_path)
        liwc_df['*'] = liwc_df['term'].str.endswith('*')
        liwc_df['t'] = liwc_df['term'].str.rstrip('*')
        self.liwc_prefx = liwc_df[liwc_df['*']].groupby('category')['t'].apply(set)
        self.liwc_whole = liwc_df[~liwc_df['*']].groupby('category')['t'].apply(set)
        self.use_liwc = True
    # Preload EmoLex dictionary:
    emolex_df = pd.read_csv(emolex_path, index_col=0)
    self.emolex = emolex_df.apply(lambda s: set(s[s == 1].index))
    # Preload additional feature rules:
    pltlex = pd.read_pickle(feature_defn_path)
    types = pltlex.apply(type)
    self.pltlex_ptn = pltlex[types == re.Pattern].to_dict()
    self.pltlex_set = pltlex[types == set].to_dict()
    # Initialize Tokenizer:
    self.text_processor = TextPreProcessor(
        # terms that will be normalized:
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated:
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        # perform word segmentation on hashtags:
        unpack_hashtags=False,
        # unpack contractions (can't -> can not):
        unpack_contractions=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
    )
    # preload classifier:
    self.clf = joblib.load(estimator_path)
    if countVectorizer_path:
        self.counter = joblib.load(countVectorizer_path)
        self.use_cntVec = True
def __init__(self):
    self.label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
    self.emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}
    self.emoticons_additional = {
        '(^・^)': '<happy>', ':‑c': '<sad>', '=‑d': '<happy>',
        ":'‑)": '<happy>', ':‑d': '<laugh>', ':‑(': '<sad>',
        ';‑)': '<happy>', ':‑)': '<happy>', ':\\/': '<sad>',
        'd=<': '<annoyed>', ':‑/': '<annoyed>', ';‑]': '<happy>',
        '(^�^)': '<happy>', 'angru': 'angry', "d‑':": '<annoyed>',
        ":'‑(": '<sad>', ":‑[": '<annoyed>', '(�?�)': '<happy>',
        'x‑d': '<laugh>',
    }
    self.text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=True,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own;
        # the tokenizer should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons, self.emoticons_additional])
def __init__(self, args):
    if args.datastories:
        tokenizer = SocialTokenizer(lowercase=True)
    else:
        tokenizer = TweetTokenizer()

    self.RAW = data.RawField()
    self.TEXT = data.Field(batch_first=True, include_lengths=True,
                           lower=True, tokenize=tokenizer.tokenize)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    self.train, self.dev, self.test = datasets.EMO.splits(
        args, self.RAW, self.TEXT, self.LABEL,
        args.train_data_path, args.valid_data_path, args.test_data_path)

    self.TEXT.build_vocab(self.train, self.dev, self.test,
                          vectors=GloVe(name='840B', dim=300))

    if args.fasttext:
        self.FASTTEXT = data.Field(batch_first=True, include_lengths=True,
                                   lower=True, tokenize=tokenizer.tokenize)
        self.FASTTEXT.vocab = copy.deepcopy(self.TEXT.vocab)
        self.FASTTEXT.vocab.set_vectors(self.FASTTEXT.vocab.stoi,
                                        vectors=FastText(language='en'),
                                        dim=300)

    self.LABEL.build_vocab(self.train)

    self.train_iter, self.dev_iter, self.test_iter = \
        data.BucketIterator.splits((self.train, self.dev, self.test),
                                   batch_size=args.batch_size,
                                   device=args.device,
                                   repeat=False)

    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
    # for <pad>
    self.char_vocab = {'': 0}
    # for <unk> and <pad>
    self.characterized_words = [[0] * self.max_word_len,
                                [0] * self.max_word_len]

    if args.char_emb:
        self.build_char_vocab()

    filehandler = open('./data/vocab.obj', 'wb')
    pickle.dump(self.TEXT.vocab, filehandler)
    filehandler = open('./data/label.obj', 'wb')
    pickle.dump(self.LABEL.vocab, filehandler)
def __init__(self, word_indices, text_lengths, **kwargs):
    self.word_indices = word_indices
    filter_classes = kwargs.get("filter_classes", None)
    self.y_one_hot = kwargs.get("y_one_hot", True)

    self.pipeline = Pipeline([
        ('preprocess', CustomPreProcessor(TextPreProcessor(
            backoff=['url', 'email', 'percent', 'money', 'phone', 'user',
                     'time', 'date', 'number'],
            include_tags={"hashtag", "allcaps", "elongated", "repeated",
                          'emphasis', 'censored'},
            fix_html=True,
            segmenter="twitter",
            corrector="twitter",
            unpack_hashtags=True,
            unpack_contractions=True,
            spell_correct_elong=False,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            dicts=[emoticons]))),
        ('ext', EmbeddingsExtractor(word_indices=word_indices,
                                    max_lengths=text_lengths,
                                    add_tokens=True,
                                    unk_policy="random"))
    ])

    # loading data
    print("Loading data...")
    dataset = DataLoader(verbose=False).get_data(years=None, datasets=None)
    random.Random(42).shuffle(dataset)

    if filter_classes:
        dataset = [d for d in dataset if d[0] in filter_classes]

    self.X = [obs[1] for obs in dataset]
    self.y = [obs[0] for obs in dataset]
    print("total observations:", len(self.y))

    print("-------------------\ntraining set stats\n-------------------")
    print_dataset_statistics(self.y)
    print("-------------------")
def tokenizer(tweet):
    """Tokenize a sentence with a tokenizer specially designed for social
    network content, which can handle complex emoticons, emojis and other
    unstructured expressions such as dates, times and more.

    Args:
        tweet (str): the original tweet.

    Returns:
        tokenized_tweet (str): the tokenized tweet.
    """
    social_tokenizer = SocialTokenizer(lowercase=False).tokenize
    return " ".join(s for s in social_tokenizer(tweet))
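# A minimal usage sketch (not part of the original source): the sample tweet
# is illustrative, and exact token boundaries depend on the installed
# ekphrasis version. The function returns the tokens re-joined with spaces.
sample = "OMG!!! can't wait for #SemEval2018 :-) http://example.com"
print(tokenizer(sample))  # same text, space-separated tokens; emoticon and URL kept intact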
def twitter_preprocessor():
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'phone', 'user'],
        annotate={"hashtag", "elongated", "allcaps", "repeated",
                  'emphasis', 'censored'},
        all_caps_tag="wrap",
        fix_text=False,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize).pre_process_doc
    return preprocessor
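# A hedged usage sketch (assumed, not from the original repo): the returned
# callable is ekphrasis' pre_process_doc, so it yields a list of tokens with
# normalization/annotation tags such as <user>, <url>, <allcaps>.
preprocess = twitter_preprocessor()
tokens = preprocess("@user LOVE this!!! #nlproc http://example.com")
print(tokens)  # e.g. ['<user>', '<allcaps>', 'love', '</allcaps>', ...] (version-dependent)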
def emotion_and_split():
    text_process = TextPreProcessor(
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    )
    return text_process
def __init__(self):
    self.root_dir = "CrisisLexT26/"
    self.count = 0
    self.natural_disasters = []
    self.non_natural_disasters = []
    self.prep_natural_disasters = []
    self.prep_non_natural_disasters = []
    self.nat_labels = []
    self.non_natural_labels = []
    self.en_prep_nat_tweets = []
    self.en_prep_non_nat_tweets = []
    self.text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=True,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own;
        # the tokenizer should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])
def __init__(self, **kwargs):
    self.text_processor = TextPreProcessor(
        omit=kwargs.get('normalize', []),
        normalize=kwargs.get('normalize',
                             ['url', 'email', 'phone', 'user', 'time', 'date']),
        annotate=kwargs.get('annotate', {}),
        fix_html=kwargs.get('fix_html', True),
        segmenter=kwargs.get('segmenter', "twitter"),
        corrector=kwargs.get('corrector', "twitter"),
        unpack_hashtags=kwargs.get('unpack_hashtags', True),
        unpack_contractions=kwargs.get('unpack_contractions', True),
        spell_correct_elong=kwargs.get('fix_elongation', True),
        spell_correction=kwargs.get('spell_correction', True),
        fix_bad_unicode=kwargs.get('fix_bad_unicode', True),
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])
def datastories_processor(x):
    from ekphrasis.dicts.emoticons import emoticons
    from ekphrasis.classes.tokenizer import SocialTokenizer
    from ekphrasis.classes.preprocessor import TextPreProcessor

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own;
        # the tokenizer should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])

    x = [text_processor.pre_process_doc(sent) for sent in x]
    temp = []
    for sent in x:
        context = ''
        for word in sent:
            context = context + ' ' + word
        temp.append(context)
    return temp
def bow_model(task, max_features=10000):
    if task == "clf":
        algo = LogisticRegression(C=0.6, random_state=0,
                                  class_weight='balanced')
    elif task == "reg":
        algo = SVR(kernel='linear', C=0.6)
    else:
        raise ValueError("invalid task!")

    word_features = TfidfVectorizer(ngram_range=(1, 1),
                                    tokenizer=lambda x: x,
                                    analyzer='word',
                                    min_df=5,
                                    # max_df=0.9,
                                    lowercase=False,
                                    use_idf=True,
                                    smooth_idf=True,
                                    max_features=max_features,
                                    sublinear_tf=True)

    preprocessor = TextPreProcessor(
        backoff=['url', 'email', 'percent', 'money', 'phone', 'user',
                 'time', 'date', 'number'],
        include_tags={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    model = Pipeline([
        ('preprocess', CustomPreProcessor(preprocessor, to_list=True)),
        ('bow-feats', word_features),
        ('normalizer', Normalizer(norm='l2')),
        ('clf', algo)
    ])

    return model
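# Hypothetical usage of bow_model (X_train, y_train, X_dev are illustrative
# names, not from the original code): the pipeline expects raw strings, since
# the CustomPreProcessor step tokenizes them before the TF-IDF vectorizer.
model = bow_model("clf")
model.fit(X_train, y_train)          # X_train: list of raw tweets, y_train: labels
predictions = model.predict(X_dev)   # X_dev: list of raw tweets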
class Const:
    MODEL_NAME = "cardiffnlp/twitter-roberta-base"
    MAX_TOKEN_LEN = 128
    SPECIAL_TOKENS = [
        "<head>", "</head>", "<tail>", "</tail>",
        "<url>", "<user>", "<date>", "<number>", "<money>", "<email>",
        "<percent>", "<phone>", "<time>", "<hashtag>", "</hashtag>",
    ]
    NORMALIZE = [
        "url", "email", "percent", "money", "phone",
        "user", "time", "date", "number",
    ]
    TEXT_PROCESSOR_ARGS = dict(
        normalize=NORMALIZE,
        annotate={"hashtag"},
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons],
    )
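# A small sketch (an assumption, not part of the original class) showing how
# TEXT_PROCESSOR_ARGS is intended to be consumed: the dict is unpacked
# straight into ekphrasis' TextPreProcessor.
text_processor = TextPreProcessor(**Const.TEXT_PROCESSOR_ARGS)
tokens = text_processor.pre_process_doc("Check https://example.com #nlp")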
def preprocess_(dataset):
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "elongated", "allcaps", "repeated",
                  'emphasis', 'censored'},
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    ).pre_process_doc
    return [preprocessor(x) for x in dataset]
def twitter_preprocess():
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "elongated", "allcaps", "repeated",
                  'emphasis', 'censored'},
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    ).pre_process_doc

    def preprocess(name, dataset):
        desc = "PreProcessing dataset {}...".format(name)
        data = [None for _ in range(len(dataset))]
        N = len(data)
        for i, x in tqdm(enumerate(dataset), desc=desc, total=N):
            data[i] = preprocessor(x)
        return data

    def parallel_preprocess(name, dataset):
        N = len(dataset)
        batchsize = 1000
        n_splits = N // batchsize + (1 if N % batchsize > 0 else 0)
        batches = (dataset[i * batchsize:(i + 1) * batchsize]
                   for i in range(n_splits))
        data = []
        with Pool(processes=6) as p:
            for result in tqdm(p.imap(preprocess_, batches), total=n_splits):
                data += result
        return data

    # return preprocess
    return parallel_preprocess
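# Hedged usage sketch (not from the original source): twitter_preprocess()
# returns the parallel variant, which batches the corpus and maps the
# module-level preprocess_ helper over a multiprocessing Pool, so it should be
# invoked under a main guard on platforms that spawn worker processes.
if __name__ == "__main__":
    preprocess = twitter_preprocess()
    processed = preprocess("train", ["first example tweet ...", "second example tweet ..."])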
def create_preprocessor(self):
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "allcaps", "elongated",
                  'emphasis', 'censored'},
        fix_html=True,
        segmenter='twitter',
        corrector='twitter',
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])
    return preprocessor
def ekphrasis_config(text):
    social_tokenizer = SocialTokenizer(lowercase=True).tokenize
    str_list = social_tokenizer(text)
    # print(str_list)
    # for index in range(len(str_list)):
    #     str_list[index] = sp.correct(str_list[index])

    # for index in range(len(str_list)):
    #     if str_list[index] in EMOTICONS_TOKEN.keys():
    #         str_list[index] = EMOTICONS_TOKEN[str_list[index]]

    # for index in range(len(str_list)):
    #     if str_list[index] in EMOTICONS_TOKEN.keys():
    #         str_list[index] = EMOTICONS_TOKEN[str_list[index]][1:len(EMOTICONS_TOKEN[str_list[index]]) - 1]
    #
    # for index in range(len(str_list)):
    #     if str_list[index] in LOGOGRAM.keys():
    #         str_list[index] = LOGOGRAM[str_list[index]]
    return str_list
def __init__(self):
    self.text_processor_options = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        unpack_contractions=False,
        annotate={"allcaps", "elongated", "repeated", 'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation and correction
        segmenter="english",
        corrector="english",
        unpack_hashtags=False,  # perform word segmentation on hashtags
        spell_correct_elong=False,  # spell correction for elongated words
        # the tokenizer should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text
        dicts=[emoticons]
    )
def __new__(cls, with_vinai=False):
    if cls.__singleton is None:
        cls.__singleton = super(Tokenizer, cls).__new__(cls)
        if with_vinai:
            cls.__tokenizer = normalizeTweet
        else:
            cls.__tokenizer = TextPreProcessor(
                # terms that will be normalized
                normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                           'time', 'date', 'number'],
                # terms that will be annotated
                annotate={"hashtag", "allcaps", "elongated", "repeated",
                          'emphasis', 'censored'},
                fix_html=True,  # fix HTML tokens
                # corpus from which the word statistics are going to be used
                # for word segmentation
                segmenter="twitter",
                # corpus from which the word statistics are going to be used
                # for spell correction
                corrector="twitter",
                unpack_hashtags=True,  # perform word segmentation on hashtags
                unpack_contractions=True,  # Unpack contractions (can't -> can not)
                spell_correct_elong=False,  # spell correction for elongated words
                # select a tokenizer. You can use SocialTokenizer, or pass your own;
                # the tokenizer should take as input a string and return a list of tokens
                tokenizer=SocialTokenizer(lowercase=True).tokenize,
                # list of dictionaries, for replacing tokens extracted from the text
                # with other expressions. You can pass more than one dictionary.
                dicts=[emoticons]).pre_process_doc
    return cls.__singleton
def preprocess_through_ekphrasis(train_file_path, test_file_path,
                                 trial_file_path):
    text_processor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        spell_correction=True,
        all_caps_tag="wrap",
        fix_bad_unicode=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    for file_path in [train_file_path, test_file_path, trial_file_path]:
        with open(file_path, 'r', newline='') as file:
            new_sentences = list()
            labels = list()
            for line in file:
                labels.append(line.split('\t')[0])
                new_sentences.append(" ".join(
                    text_processor.pre_process_doc(line.split('\t')[1])))
        with open(file_path[:-4] + "_ekphrasis.csv", 'w', newline='') as new_file:
            for label, sentence in zip(labels, new_sentences):
                new_file.write("{}\t{}\n".format(
                    label,
                    sentence.replace("[ <hashtag> triggerword </hashtag> #]",
                                     "[#TRIGGERWORD#]")
                            .replace("[ <allcaps> newline </allcaps> ]",
                                     "[NEWLINE]")))
def nbow_model(task, embeddings, word2idx):
    if task == "clf":
        algo = LogisticRegression(C=0.6, random_state=0,
                                  class_weight='balanced')
    elif task == "reg":
        algo = SVR(kernel='linear', C=0.6)
    else:
        raise ValueError("invalid task!")

    embeddings_features = NBOWVectorizer(aggregation=["mean"],
                                         embeddings=embeddings,
                                         word2idx=word2idx,
                                         stopwords=False)

    preprocessor = TextPreProcessor(
        backoff=['url', 'email', 'percent', 'money', 'phone', 'user',
                 'time', 'date', 'number'],
        include_tags={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    model = Pipeline([
        ('preprocess', CustomPreProcessor(preprocessor, to_list=True)),
        ('embeddings-feats', embeddings_features),
        ('normalizer', Normalizer(norm='l2')),
        ('clf', algo)
    ])

    return model
def __init__(self, text, **kwargs):
    self.text = text
    self.text_processor = TextPreProcessor(
        # terms that will be normalized, e.g. an email address becomes <email>
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated, e.g. <hashtag>#test</hashtag>
        annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis'},
        fix_html=True,  # fix HTML tokens
        unpack_hashtags=True,  # perform word segmentation on hashtags
        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # if the text is not tokenized on whitespace;
        # the tokenizer should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])
def get_preprocessed_data(raw_data, sentence_segmentation=False,
                          pre_lang_check=True):
    text_processor = TextPreProcessor(
        omit=['url', 'email', 'user'],
        normalize=['url', 'email', 'user'],
        annotate={"elongated", "repeated", 'emphasis', 'censored'},
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        spell_correction=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize)

    processed_data = []
    for data_type in raw_data:
        if len(raw_data[data_type]) >= MAX_SEQ:
            chunks = int(len(raw_data[data_type]) / MAX_SEQ)
            for j in range(chunks):
                processed_data += TextPreprocessing(
                    raw_data[data_type][MAX_SEQ * j:MAX_SEQ * (j + 1)],
                    text_processor, sentence_segmentation, pre_lang_check,
                    get_mode(data_type))
            if MAX_SEQ * chunks != len(raw_data[data_type]):
                processed_data += TextPreprocessing(
                    raw_data[data_type][MAX_SEQ * chunks:],
                    text_processor, sentence_segmentation, pre_lang_check,
                    get_mode(data_type))
        else:
            processed_data += TextPreprocessing(
                raw_data[data_type], text_processor, sentence_segmentation,
                pre_lang_check, get_mode(data_type))
    return processed_data
def __init__(self, args):
    if args.datastories:
        tokenizer = SocialTokenizer(lowercase=True)
    else:
        tokenizer = TweetTokenizer()

    self.RAW = data.RawField()
    self.TEXT = data.Field(batch_first=True, include_lengths=True,
                           lower=True, tokenize=tokenizer.tokenize)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    filehandler = open('./data/vocab.obj', 'rb')
    self.TEXT.vocab = pickle.load(filehandler)
    filehandler = open('./data/label.obj', 'rb')
    self.LABEL.vocab = pickle.load(filehandler)

    self.test = datasets.EMO.getTestData(args, self.RAW, self.TEXT,
                                         args.test_data_path)

    self.test_iter = \
        data.Iterator(self.test,
                      batch_size=args.batch_size,
                      device=args.device,
                      shuffle=False,
                      sort=False,
                      repeat=False)

    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
    # for <pad>
    self.char_vocab = {'': 0}
    # for <unk> and <pad>
    self.characterized_words = [[0] * self.max_word_len,
                                [0] * self.max_word_len]

    if args.char_emb:
        self.build_char_vocab()
def build_vocab(dataset):
    # use the text processing tool to do word normalization, annotation,
    # segmentation, tokenization, and spell correction,
    # and return a vocabulary set
    vocabulary_set = set()
    text_processor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    )
    for text_tensor, _ in dataset:
        text = str(text_tensor.numpy()[0], 'utf-8')
        some_tokens = text_processor.pre_process_doc(text)
        vocabulary_set.update(some_tokens)
    return vocabulary_set
def twitter_preprocess(self):
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "elongated", "allcaps", "repeated",
                  'emphasis', 'censored'},
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    text = self.data
    cache_file = os.path.join('./', "cached",
                              "preprocessed_" + self.name + ".pkl")
    preprocessed = None
    if os.path.isfile(cache_file):
        with open(cache_file, 'rb') as f:
            preprocessed = pickle.load(f)
    else:
        preprocessed = [
            preprocessor.pre_process_doc(x)
            for x in tqdm(text, desc="Preprocessing dataset...")
        ]
        with open(cache_file, 'wb') as f:
            pickle.dump(preprocessed, f)
    return preprocessed
annotate={ "hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored' }, fix_html=True, # fix HTML tokens # corpus from which the word statistics are going to be used # for word segmentation segmenter="twitter", # corpus from which the word statistics are going to be used # for spell correction corrector="twitter", unpack_hashtags=True, # perform word segmentation on hashtags unpack_contractions=True, # Unpack contractions (can't -> can not) spell_correct_elong=True, # spell correction for elongated words # select a tokenizer. You can use SocialTokenizer, or pass your own # the tokenizer, should take as input a string and return a list of tokens tokenizer=SocialTokenizer(lowercase=True).tokenize, # list of dictionaries, for replacing tokens extracted from the text, # with other expressions. You can pass more than one dictionaries. dicts=[emoticons, emoticons_additional]) def tokenize(text): print("in tokenize") text = " ".join(text_processor.pre_process_doc(text)) return text def preprocessData(dataFilePath, mode): print("in preprocess data") conversations = [] labels = []
def yelpInstanceConstructFromTrain(self, paramFpathInTrainTxt,
                                   paramFpathOutToken2IndexDict,
                                   paramFpathOutIndex2TokenDict,
                                   paramFpathOutTrainParams,
                                   paramFpathOutTrainInstance):
    '''
    combine reviews with stars, reshuffle reviews, and split into two sets
    ===================================================
    parameters:
    -----------
    paramFpathInTrainTxt: review-text train file
    paramFpathOutToken2IndexDict: map token to index
    paramFpathOutIndex2TokenDict: map index to token
    paramFpathOutTrainParams: the parameters needed for training
    paramFpathOutTrainInstance: the constructed training instances

    return:
    -----------
    None
    '''
    # read in the train.txt
    fpointerInTrainTxt = open(paramFpathInTrainTxt, 'rt', encoding='utf8')

    def __function4map(elem4map):
        '''
        strip elem
        ===================================================
        parameters:
        -----------
        elem4map

        return:
        -----------
        mapped elem
        '''
        elemstriped = elem4map.strip()
        return elemstriped

    listTrainTxt = list(map(__function4map, fpointerInTrainTxt.readlines()))
    fpointerInTrainTxt.close()

    # ----------initialize TextPreProcessor
    text_processor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  "emphasis", "censored"},
        fix_html=True,
        segmenter="english",
        corrector="english",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])
    # ----------Initialize TextPreProcessor

    listTrainTxtTokenized = \
        list(text_processor.pre_process_docs(listTrainTxt))
    listTrainTxt = None

    # ----------save the vocabulary table,
    # calculate and save the parameters
    # filter top 20,000 tokens
    dictVocabulary2Freq = dict()
    for listTokens in listTrainTxtTokenized:
        for aToken in listTokens:
            if aToken in dictVocabulary2Freq:
                dictVocabulary2Freq[aToken] += 1
            else:
                dictVocabulary2Freq[aToken] = 1
    itemgetter1 = operator.itemgetter(1)
    list_k_v_top_20000 = sorted(dictVocabulary2Freq.items(),
                                key=itemgetter1,
                                reverse=True)[0:20000]
    dict_k_v_top_20000 = {k: v for k, v in list_k_v_top_20000}
    dictVocabulary2Freq = None
    list_k_v_top_20000 = None

    # calculate maxDocumentSize and vocabularySize
    maxDocumentSize = 0
    vocabularySize = 0
    dictVocabulary2Index = dict()
    dictIndex2Vocabulary = dict()
    tokenCurrentIndex = 0
    for listTokens in listTrainTxtTokenized:
        if maxDocumentSize < len(listTokens):
            maxDocumentSize = len(listTokens)
        for aToken in listTokens:
            # filter rare words, reduce vocabulary size
            if aToken not in dict_k_v_top_20000:
                continue
            if aToken in dictVocabulary2Index:
                pass
            else:
                dictVocabulary2Index[aToken] = tokenCurrentIndex
                dictIndex2Vocabulary[tokenCurrentIndex] = aToken
                tokenCurrentIndex += 1
    vocabularySize = tokenCurrentIndex
    assert vocabularySize == len(dictVocabulary2Index)
    # trim doc_size to 0.5 maxDocSize
    # trimmed_doc_size = maxDocumentSize * 0.5

    # write the token/index dictionaries as JSON
    fp4jsonoutput = open(paramFpathOutToken2IndexDict, 'wt', encoding='utf8')
    json.dump(dictVocabulary2Index, fp4jsonoutput, ensure_ascii=False)
    fp4jsonoutput.close()
    fp4jsonoutput = open(paramFpathOutIndex2TokenDict, 'wt', encoding='utf8')
    json.dump(dictIndex2Vocabulary, fp4jsonoutput, ensure_ascii=False)
    fp4jsonoutput.close()
    # dictVocabulary2Index = None
    dictIndex2Vocabulary = None

    fpointerOutParams = open(paramFpathOutTrainParams, 'wt', encoding='utf8')
    str4write = 'TrainingInstances: %d\n' % len(listTrainTxtTokenized) \
        + 'DocumentSeqLen: %d\n' % maxDocumentSize \
        + 'VocabularySize: %d\n' % vocabularySize
    fpointerOutParams.write(str4write)
    fpointerOutParams.close()
    # ----------calculate and save the parameters

    # ----------construct training instances and perform padding
    print('Hello1')

    def __function_tokenlist_to_traininstance(tokenlist):
        '''
        from tokenlist to padded instance list
        adding subsampling
        '''
        tokenlist_size = len(tokenlist)
        traininginstance = list()
        for n in range(tokenlist_size):
            # ----------split tokenlist section
            tokenlist_section = None
            if n - HALF_WINDOW_SIZE < 0:
                if n + HALF_WINDOW_SIZE >= tokenlist_size:
                    tokenlist_section = tokenlist
                else:
                    tokenlist_section = tokenlist[:n + HALF_WINDOW_SIZE]
            else:
                if n + HALF_WINDOW_SIZE >= tokenlist_size:
                    tokenlist_section = tokenlist[n - HALF_WINDOW_SIZE:]
                else:
                    tokenlist_section = tokenlist[n - HALF_WINDOW_SIZE:
                                                  n + HALF_WINDOW_SIZE]
            # ----------calculate tokenlist multiterm
            countlist_vocab = [0 for i in range(vocabularySize)]
            countlist_vocab[dictVocabulary2Index[tokenlist[n]]] += 1
            traininginstance.append(countlist_vocab)
            countlist_vocab = [0 for i in range(vocabularySize)]
            for atoken in tokenlist_section:
                countlist_vocab[dictVocabulary2Index[atoken]] += 1
            traininginstance.append(countlist_vocab)
        # ----------padding
        for n in range(tokenlist_size, maxDocumentSize):
            fullzero_vocab = [0 for i in range(vocabularySize)]
            traininginstance.append(fullzero_vocab)
            fullzero_vocab = [0 for i in range(vocabularySize)]
            traininginstance.append(fullzero_vocab)
        return traininginstance

    def __function_traininstance_to_string(traininstance):
        '''
        from traininstance to a string
        '''
        str_training_instance = ''
        for acountlist_vocab in traininstance:
            acountlist_vocab = list(map(str, acountlist_vocab))
            str_acountlist_vocab = ' '.join(acountlist_vocab)
            str_training_instance += ' ' + str_acountlist_vocab
        str_training_instance += '\n'
        return str_training_instance

    fpointerOutTrainInstance = open(paramFpathOutTrainInstance, 'wt',
                                    encoding='utf8')
    for aTrainTxtTokenized in listTrainTxtTokenized:
        aTrainInstance = __function_tokenlist_to_traininstance(
            aTrainTxtTokenized)
        aStrTrainInstance = __function_traininstance_to_string(
            aTrainInstance)
        fpointerOutTrainInstance.write(aStrTrainInstance)
    fpointerOutTrainInstance.close()
    return None
DATA_PATH_ITA = args.trainSet
DATA_PATH_TEST_ITA = args.testSet
OUTPUT_DIR = args.odir
preproc = args.preproc

data_train = pd.read_csv(DATA_PATH_ITA, sep=';', encoding='utf-8', engine='c')
data_test = pd.read_csv(DATA_PATH_TEST_ITA, sep=";", encoding='utf_8')

if args.doShuffle == True:
    data_train = data_train.reindex(np.random.permutation(data_train.index))

if preproc == 'mirko':
    text_processor = TextPreProcessor(
        remove=['email', 'percent', 'money', 'phone', 'time', 'date', 'number'],
        annotate={},
        fix_html=True,
        unpack_hashtags=False,
        tokenizer=SocialTokenizer(lowercase=args.doLower).tokenize,
        dicts=[emoticons])
    data_train.text.astype(str)
    data_test.text.astype(str)
    data_train['text_preprocessed'] = data_train.apply(
        lambda row: mirkoPreprocessing(row, args, text_processor), axis=1)
    data_test['text_preprocessed'] = data_test.apply(
        lambda row: mirkoPreprocessing(row, args, text_processor), axis=1)

if preproc == 'raw':
    data_train['text_preprocessed'] = data_train['text']
    data_test['text_preprocessed'] = data_test['text']

pd.set_option('display.max_colwidth', 800)
print('***** TRAIN HEAD ***')
print(data_train.head())
print('***** TEST HEAD ***')
print(data_test.head())
text_processor = TextPreProcessor(
    normalize=['url', 'email', 'percent', 'money', 'phone',
               'time', 'date', 'number'],
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=True,
    tokenizer=SocialTokenizer(lowercase=False).tokenize,
    dicts=[emoticons])

REMOVE_TAGS = [
    "<emphasis>", "<kiss>", "<repeated>", "<laugh>", "<allcaps>",
    "</allcaps>", "<angel>", "<elongated>", "<tong>", "<annoyed>",
    "<censored>", "<happy>", "<percent>", "<wink>", "<headdesk>",
    "<surprise>", "<date>", "<time>", "<url>", "<sad>", "<email>",
    "<phone>", "<hashtag>", "</hashtag>"
]

ADD_TO_GLOVE = ["<number>", "<money>"]


# Try removing punctuations as well
def pre_process_single(tweet, t_id):