def build_vocab(dataset):
    vocabulary_set = set()
    text_processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])
    for text_tensor, _ in dataset:
        text = str(text_tensor.numpy()[1], 'utf-8')
        some_tokens = text_processor.pre_process_doc(text)
        vocabulary_set.update(some_tokens)

    return vocabulary_set
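A minimal usage sketch for build_vocab, assuming the input is a TensorFlow tf.data.Dataset whose text element is a two-column string tensor with the tweet in column 1 (matching the text_tensor.numpy()[1] access above); the sample rows are invented:

import tensorflow as tf

# each text row is [tweet_id, tweet_text]; build_vocab reads index 1
texts = tf.constant([
    ["id1", "CANT WAIT for the new season of #TwinPeaks :)"],
    ["id2", "I saw the new #johndoe movie and it suuuuucks!!!"],
])
labels = tf.constant([0, 1])
dataset = tf.data.Dataset.from_tensor_slices((texts, labels))

vocab = build_vocab(dataset)
print(len(vocab), "unique tokens")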
Example #2
    def __init__(self):

        # Define a Text Pre-Processing pipeline
        # You can easily define a preprocessing pipeline, by using the TextPreProcessor.
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                'time', 'url', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            
            # corpus from which the word statistics are going to be used 
            # for word segmentation 
            segmenter="english", 
            
            # corpus from which the word statistics are going to be used 
            # for spell correction
            corrector="english", 
            
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words
            
            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            
            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionary.
            dicts=[emoticons])
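A usage sketch for the pipeline above, assuming the __init__ belongs to some wrapper class (called Preprocessor here purely for illustration); the sample tweet is one of the test sentences quoted later in this listing:

pre = Preprocessor()  # hypothetical class name for the __init__ above
sentence = "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! :-D"
tokens = pre.text_processor.pre_process_doc(sentence)
print(" ".join(tokens))  # user/date normalized, hashtag and allcaps/elongated annotated, emoticon replaced via the dict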
Example #3
 def __init__(self, args, lang):
     self.others = Strategy(args.others)  # valid values: 0, 1, 2 // done
     self.emoji = Strategy(args.emoji)  # 0, 1 emoji, 2 (emoji), 3, 4, 5 (translation)
     self.emoticon = Strategy(args.emoticon)  # 0, 1 emoticon, 2 (emoticon), 3, 4, 5 (translation)
     self.url = Strategy(args.url)  # 0, 1, 2, 3
     self.hashtag = Strategy(args.hashtag)  # 0, 1 = #hashtag, 2, 3 (#hashtag), 4, 5
     self.punctuation = Strategy(args.punctuation)  # valid values: 0, 3
     self.mention = Strategy(args.mention)  # 0, 1, 2, 3
     self.lower = args.lower  # True or False
     self.lang = lang  # EN or IT
     self.ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
     if self.lang == 'IT':
         self.lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')
     else:
         self.lm = None
     self.text_processor = TextPreProcessor(
         remove=[
             'email',  # raw or normalize
             'percent',  # raw or normalize: EN: percentage, IT: percentuale
             'money',  # raw or normalize: EN: money, IT: soldi; check whether currencies are caught
             'phone',  # raw or normalize: EN: phone, IT: telefono
             'time',  # raw or normalize: EN: time, IT: ore
             'date',  # raw or normalize: EN: date, IT: data
             'number'  # raw or normalize: EN: number, IT: numero
         ],
         annotate={},
         fix_html=True,
         unpack_hashtags=False,
         tokenizer=SocialTokenizer(lowercase=self.lower).tokenize,
         dicts=[emoticons])
Example #4
    def __init__(self,
                 prep=prep_default
                 ):
        """
        Constructor of clean functions over extracted texts/tweets
        :param prep: paramter settings of the text-preprocessor
        """

        # check existence of the keys within prep dict, which needs to be a list
        for k in self.prep_default.keys():
            if not k in prep.keys():
                prep[k] = self.prep_default[k]

        self.prep = prep
        self.omit = list(emoticons.keys()) + list(emoticons.values())
        self.text_processor = TextPreProcessor(
            fix_html=True,
            normalize=[],
            segmenter='twitter',
            corrector='twitter',
            fix_text=True,
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=prep['spell'],  # Unpack contractions (can't -> can not)
            spell_correction=prep['spell'],
            spell_correct_elong=prep['spell'],
            tokenizer=SocialTokenizer(lowercase=prep['lowercase']).tokenize,
            dicts=[{}],
            omit=list(emoticons.keys()) + list(emoticons.values()),
        )
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp_sent = English()  # just the language with no model
        sentencizer = self.nlp_sent.create_pipe("sentencizer")
        self.nlp_sent.add_pipe(sentencizer)
Example #5
        def build_vocab_list(dataframe):
            vocab_set = set()
            sentences = []

            text_processor = TextPreProcessor(
                normalize=[
                    'url', 'email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'number'
                ],
                annotate={
                    "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                    'censored'
                },
                fix_html=True,
                segmenter="twitter",
                corrector="twitter",
                unpack_hashtags=True,
                unpack_contractions=True,
                spell_correct_elong=False,
                tokenizer=SocialTokenizer(lowercase=True).tokenize,
                dicts=[emoticons])

            for index in range(dataframe.shape[0]):
                tweet = dataframe["tweet"][index]
                tok = text_processor.pre_process_doc(tweet)
                sentences.append(" ".join(tok))
                vocab_set.update(tok)

            df_sentences = pd.DataFrame(sentences, columns=['content'])
            return vocab_set, df_sentences
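A hedged usage sketch for build_vocab_list with a tiny hand-made DataFrame; only the "tweet" column name comes from the code above, the rows are invented:

import pandas as pd

df = pd.DataFrame({"tweet": [
    "he's aaaaaaaaand rt CANT WAIT for the new season of #TwinPeaks :)",
    "I saw the new #johndoe movie and it suuuuucks!!! #badmovies :/",
]})
vocab_set, df_sentences = build_vocab_list(df)
print(len(vocab_set), "tokens in the vocabulary")
print(df_sentences["content"].tolist())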
Example #6
def preprocess_data(genuine_filepath, bot_filepath):
    """
    Preprocess data and normalize tweets.
    """
    # Open csv file and get the tweet part of the csv.
    # Strip out newlines and quotes around text.
    with codecs.open(bot_filepath, 'r', encoding='utf-8',
                     errors='ignore') as bots_file:
        bot_sentences = [
            x.split(',')[1].strip('\n').strip('"').lower()
            if len(x.split(',')) > 1 else '' for x in bots_file.readlines()
        ]
    bot_sentences = bot_sentences[1:]

    with codecs.open(genuine_filepath, 'r', encoding='utf-8',
                     errors='ignore') as genuine_file:
        genuine_sentences = [
            x.split(',')[1].strip('\n').strip('"').lower()
            if len(x.split(',')) > 1 else '' for x in genuine_file.readlines()
        ]
    genuine_sentences = genuine_sentences[1:]

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])
    bot_sentences = [text_processor.pre_process_doc(s) for s in bot_sentences]
    genuine_sentences = [
        text_processor.pre_process_doc(s) for s in genuine_sentences
    ]

    return genuine_sentences, bot_sentences
Example #7
def preprocess_dataset(tweets, y):
    """uses ekphrasis API to preprocess the tweets"""

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        spell_correction=False,
        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])
    ynew = []
    filter_tweets = []
    for t in range(0, len(tweets)):
        tokens = text_processor.pre_process_doc(tweets[t])
        newtokens = []
        i = 0
        while (i < len(tokens)):
            try:
                if (tokens[i] == "pic" and tokens[i + 1] == "."
                        and tokens[i + 2] == "twitter"):
                    break
                elif (tokens[i] in [
                        "<url>", "<email>", "<user>", "<money>", "<percent>",
                        "<phone>", "<time>", "<date>", "<number>"
                ]):
                    i += 1
                    continue
                elif (tokens[i] == "<" and tokens[i + 1] == "emoji"):
                    while (tokens[i] != ">"):
                        i += 1
                    i += 1
                else:
                    newtokens.append(tokens[i])
                    i += 1
            except IndexError:  # ran past the end of the token list
                break
        if (len(newtokens) != 0):
            filter_tweets.append(" ".join(newtokens))
            ynew.append(y[t])
    return filter_tweets, ynew  # tokenized tweets with placeholder and emoji tokens removed, plus their labels
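A quick usage sketch for preprocess_dataset; the tweets and labels are invented, and the second tweet only demonstrates that inputs which reduce to nothing after placeholder removal are dropped together with their label:

tweets = [
    "CANT WAIT for the new season of #TwinPeaks :)))",
    "@someuser http://sentimentsymposium.com/",
]
y = [1, 0]
filtered_tweets, filtered_y = preprocess_dataset(tweets, y)
# the second tweet becomes only <user> and <url> placeholders, so it is dropped
print(filtered_tweets, filtered_y)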
Example #8
    def __init__(self):
        self.label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
        self.emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}

        self.emoticons_additional = {
            '(^・^)': '<happy>',
            ':‑c': '<sad>',
            '=‑d': '<happy>',
            ":'‑)": '<happy>',
            ':‑d': '<laugh>',
            ':‑(': '<sad>',
            ';‑)': '<happy>',
            ':‑)': '<happy>',
            ':\\/': '<sad>',
            'd=<': '<annoyed>',
            ':‑/': '<annoyed>',
            ';‑]': '<happy>',
            '(^�^)': '<happy>',
            'angru': 'angry',
            "d‑':": '<annoyed>',
            ":'‑(": '<sad>',
            ":‑[": '<annoyed>',
            '(�?�)': '<happy>',
            'x‑d': '<laugh>',
        }

        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",
            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=True,  # spell correction for elongated words
            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionary.
            dicts=[emoticons, self.emoticons_additional])
Example #9
    def __init__(
            self,
            liwc_path: str = '',
            emolex_path: str = 'english_emolex.csv',
            estimator_path: str = 'english_twitter_politeness_estimator.joblib',
            feature_defn_path: str = 'english_twitter_additional_features.pickle',
            countVectorizer_path: str = '') -> None:
        # Preload LIWC dictionary:
        if liwc_path:
            liwc_df = pd.read_csv(liwc_path)
            liwc_df['*'] = liwc_df['term'].str.endswith('*')
            liwc_df['t'] = liwc_df['term'].str.rstrip('*')
            self.liwc_prefx = liwc_df[liwc_df['*']].groupby(
                'category')['t'].apply(set)
            self.liwc_whole = liwc_df[~liwc_df['*']].groupby(
                'category')['t'].apply(set)
            self.use_liwc = True
        else:
            self.use_liwc = False

        # Preload EmoLex dictionary:
        emolex_df = pd.read_csv(emolex_path, index_col=0)
        self.emolex = emolex_df.apply(lambda s: set(s[s == 1].index))

        # Preload additional feature rules:
        pltlex = pd.read_pickle(feature_defn_path)
        types = pltlex.apply(type)
        self.pltlex_ptn = pltlex[types == re.Pattern].to_dict()
        self.pltlex_set = pltlex[types == set].to_dict()

        # Initialize Tokenizer:
        self.text_processor = TextPreProcessor(
            # terms that will be normalized:
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated:
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            # perform word segmentation on hashtags:
            unpack_hashtags=False,
            # Unpack contractions (can't -> can not):
            unpack_contractions=True,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
        )
        # preload classifier:
        self.clf = joblib.load(estimator_path)

        if countVectorizer_path:
            self.counter = joblib.load(countVectorizer_path)
            self.use_cntVec = True
        else:
            self.use_cntVec = False
Example #10
class TextPreprocessor():

    def __init__(self):

        self.text_processor_options = TextPreProcessor(
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            unpack_contractions=False,
            annotate={"allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation and correction
            segmenter="english",
            corrector="english",
            unpack_hashtags=False,  # perform word segmentation on hashtags
            spell_correct_elong=False,  # spell correction for elongated words
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text,
            dicts=[emoticons]
        )

    def do_ekphrasis_preprocessing(self, sentences):
        if isinstance(sentences, str):
            return self.text_processor_options.pre_process_doc(sentences)

        assert (type(sentences).__module__ == np.__name__)
        preprocessed = [self.text_processor_options.pre_process_doc(s) for s in sentences]
        return np.array(preprocessed)

    def do_decontraction(self, sentences):
        if isinstance(sentences, str):
            sentences = np.array([sentences])
        assert(type(sentences).__module__ == np.__name__)
        preprocessed = []
        for s in sentences:
            # does not deal with 'd as it is ambiguous ('d -> would / had)
            s = re.sub(r"[Ww]on't", "will not", s)
            s = re.sub(r"[Cc]an't", "can not", s)
            s = re.sub(r"[Cc]annot", "can not", s)
            s = re.sub(r"n't", " not", s)
            s = re.sub(r"'re", " are", s)
            s = re.sub(r"[Hh]e's", "he is", s)
            s = re.sub(r"[Ss]he's", "she is", s)
            s = re.sub(r"[Ii]t's", "it is", s)
            s = re.sub(r"'ll", " will", s)
            s = re.sub(r"'ve", " have", s)
            s = re.sub(r"'m", " am", s)
            s = re.sub(r"[Dd]idn't", "did not", s)
            preprocessed.append(s)
        return np.array(preprocessed)
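A short usage sketch for the two methods above:

tp = TextPreprocessor()
print(tp.do_decontraction("I can't believe they won't come"))
# -> numpy array containing "I can not believe they will not come"
print(tp.do_ekphrasis_preprocessing("I LOVE #TwinPeaks soooo much :)"))
# -> list of lower-cased tokens with allcaps/elongated annotations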
Example #11
def twitter_preprocess():
    preprocessor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
            'date', 'number'
        ],
        annotate={
            "hashtag", "elongated", "allcaps", "repeated", 'emphasis',
            'censored'
        },
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]).pre_process_doc

    def preprocess(name, dataset):
        desc = "PreProcessing dataset {}...".format(name)

        # data = []
        # with multiprocessing.Pool(processes=4) as pool:
        #     iterator = pool.imap_unordered(preprocessor, X, 1000)
        #     for i, result in enumerate(tqdm(iterator, total=len(X))):
        #         pass

        data = [preprocessor(x) for x in tqdm(dataset, desc=desc)]
        return data

    return preprocess
Example #12
def get_text_processor(word_stats='twitter'):
    return TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'phone', 'user'],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter=word_stats,

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector=word_stats,
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])
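Typical usage of get_text_processor, mirroring the ekphrasis README:

text_processor = get_text_processor(word_stats="twitter")
sentences = [
    "CANT WAIT for the new season of #TwinPeaks!!! #davidlynch #tvseries :)))",
    "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
]
for s in sentences:
    print(" ".join(text_processor.pre_process_doc(s)))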
Example #13
def clean_then_tokenize_text(data):
    text_all = []
    text_processor = TextPreProcessor(normalize=['user', 'url'])
    for key in data:
        text = data[key]
        a = []
        for line in text:
            line = text_processor.pre_process_doc(line)
            temp = " ".join(text_to_word_sequence(line))
            a.append(temp)
        data[key]['cln_text'] = a
        text_all += a
    return text_all
Example #14
def twitter_preprocess():
    """
		ekphrasis-social tokenizer sentence preprocessor.
		Substitutes a series of terms by special coins when called
		over an iterable (dataset)
	"""
    norm = [
        'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date',
        'number'
    ]
    ann = {
        "hashtag", "elongated", "allcaps", "repeated", "emphasis", "censored"
    }
    preprocessor = TextPreProcessor(
        normalize=norm,
        annotate=ann,
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]).pre_process_doc

    def preprocess(name, dataset):
        description = "  Ekphrasis-based preprocessing dataset "
        description += "{}...".format(name)
        data = [preprocessor(x) for x in tqdm(dataset, desc=description)]
        return data

    return preprocess
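Usage sketch: twitter_preprocess returns a preprocess(name, dataset) callable, so it can be applied to any list of strings (the sample data below is a placeholder):

preprocess = twitter_preprocess()
train_texts = ["@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! :-D"]
tokenized = preprocess("train", train_texts)
print(tokenized[0])  # list of tokens for the first tweet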
Example #15
def twitter_preprocess():
    preprocessor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
            'date', 'number'
        ],
        annotate={
            "hashtag", "elongated", "allcaps", "repeated", 'emphasis',
            'censored'
        },
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]).pre_process_doc

    def preprocess(name, dataset):
        desc = "PreProcessing dataset from nlp.py:33 {}...".format(name)

        data = [preprocessor(x) for x in tqdm(dataset, desc=desc)]
        return data

    return preprocess
Example #16
def datastories_processor(x):
    from ekphrasis.dicts.emoticons import emoticons
    from ekphrasis.classes.tokenizer import SocialTokenizer
    from ekphrasis.classes.preprocessor import TextPreProcessor

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])

    x = [text_processor.pre_process_doc(sent) for sent in x]
    temp = []
    for sent in x:
        context = ''
        for word in sent:
            context = context + ' ' + word
        temp.append(context)

    return temp
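Usage sketch: datastories_processor takes a list of raw sentences and returns one preprocessed, space-joined string per sentence:

processed = datastories_processor([
    "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! :-D",
])
print(processed[0])  # note the leading space left by the word-by-word concatenation above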
Example #17
 def __init__(self, **kwargs):
     self.text_processor = TextPreProcessor(
         omit=kwargs.get('normalize', []),
         normalize=kwargs.get(
             'normalize',
             ['url', 'email', 'phone', 'user', 'time', 'url', 'date']),
         annotate=kwargs.get('annotate', {}),
         fix_html=kwargs.get('fix_html', True),
         segmenter=kwargs.get('segmenter', "twitter"),
         corrector=kwargs.get('corrector', "twitter"),
         unpack_hashtags=kwargs.get('unpack_hashtags', True),
         unpack_contractions=kwargs.get('unpack_contractions', True),
         spell_correct_elong=kwargs.get('fix_elongation', True),
         spell_correction=kwargs.get('spell_correction', True),
         fix_bad_unicode=kwargs.get('fix_bad_unicode', True),
         tokenizer=SocialTokenizer(lowercase=True).tokenize,
         dicts=[emoticons])
Example #18
def get_tweet_processor(additional_dictionary_list=None):

    dicts = [emoticons]
    #print (dicts)
    print(len(dicts))
    if additional_dictionary_list:
        dicts.extend(additional_dictionary_list)

    print(len(dicts))
    '''
    Test with this code block:

    sentences = [
    "he's aaaaaaaaand rt CANT WAIT for the ijwts new season of #TwinPeaks \(^o^)/!!! #davidlynch #tvseries :)))",
    "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
    "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/."
    ]

    for s in sentences:
    print(" ".join(text_processor.pre_process_doc(s)))

    '''

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=True,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionary.
        dicts=dicts)

    return text_processor
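For example, an extra (hypothetical) slang dictionary can be passed alongside the built-in emoticons dict:

slang = {"ijwts": "i just want to say"}  # hypothetical extra replacement dictionary
tweet_processor = get_tweet_processor(additional_dictionary_list=[slang])
print(" ".join(tweet_processor.pre_process_doc("ijwts #TwinPeaks is back :)")))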
Example #19
    def __init__(self, verbose: int=0, omit=None,
                 normalize=None, annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis'},
                 segmenter="twitter", corrector="twitter", unpack_hashtags=False, unpack_contractions=True,
                 spell_correct_elong=True, spell_correction=True, tokenizer=Tokenizer(lowercase=True),
                 dicts=None):
        super().__init__(name="EkhprasisPreprocessor", verbose=verbose)
        if dicts is None:
            dicts = [others, emoticons_original]
        if normalize is None:
            normalize = ['number']
        if omit is None:
            omit = ['email', 'percent', 'money', 'phone', 'user', 'time', 'url', 'date']
        logging.info("{} loading...".format(self._name))
        self.tweet_processor = TextPreProcessor(
            # omit terms
            omit=omit,
            # terms that will be normalized
            normalize=normalize,
            # terms that will be annotated
            annotate=annotate,

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter=segmenter,

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector=corrector,

            unpack_hashtags=unpack_hashtags,  # perform word segmentation on hashtags
            unpack_contractions=unpack_contractions,  # Unpack contractions (can't -> can not)
            spell_correct_elong=spell_correct_elong,  # spell correction for elongated words
            spell_correction=spell_correction,  # spell correction

            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=tokenizer.tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionary.
            dicts=dicts
        )
Example #20
def load_instances(config, instances):
    for instance_config in config["REST_instances"]:
        instance = Instance(instance_config["name"],
                            instance_config["language"],
                            instance_config["embeddings_path"],
                            instance_config["preprocessing_style"],
                            instance_config["model_path"],
                            instance_config["labels"])

        instance.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            fix_html=True,  # fix HTML tokens

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter=instance_config["preprocessing_style"],

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector=instance_config["preprocessing_style"],
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words

            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionary.
            dicts=[emoticons])

        instance.itos, instance.stoi, instance.vectors, instance.embeddings_size = \
            load_embeddings(instance.embeddings_path)

        instance.text = data.Field()
        instance.text.build_vocab([instance.itos])
        instance.text.vocab.set_vectors(instance.stoi, instance.vectors,
                                        instance.embeddings_size)

        instance.model = torch.load(
            instance.model_path,
            map_location='cpu' if not cuda_available else None)
        instance.model = instance.model.eval()
        instances[instance_config["name"]] = instance
Example #21
    def __init__(self):

        self.text_processor_options = TextPreProcessor(
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            unpack_contractions=False,
            annotate={"allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation and correction
            segmenter="english",
            corrector="english",
            unpack_hashtags=False,  # perform word segmentation on hashtags
            spell_correct_elong=False,  # spell correction for elongated words
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text,
            dicts=[emoticons]
        )
Example #22
def tokenize(
    data, 
    is_lower=True, 
    remove_stopwords=True, 
    remove_puncts=True, 
    remove_num=True, 
    remove_currency=True
):

    text_processor = TextPreProcessor(
        annotate=['hashtag'],
        fix_html=True,  # fix HTML tokens
        
        # corpus from which the word statistics are going to be used 
        # for word segmentation 
        segmenter="english", 
        
        # corpus from which the word statistics are going to be used 
        # for spell correction
        corrector="english", 
        
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct=True,
    )

    tokenized_corpus = []

    for sentence in data:

        tokenized_sentence = []
        # processed_sentence = text_processor.pre_process_doc(sentence)
        # clean_sentence = clean(processed_sentence, **clean_text_param)
        spacy_doc = nlp(sentence)

        for token in spacy_doc:
            processed_token = token
            if remove_stopwords and processed_token.is_stop:
                continue
            elif remove_puncts and processed_token.is_punct:
                continue
            elif remove_num and processed_token.is_digit:
                continue
            elif remove_currency and processed_token.is_currency:
                continue
            elif is_lower:
                tokenized_sentence.append(token.lower_)
            else:
                tokenized_sentence.append(token.text)

        tokenized_corpus.append(tokenized_sentence)

    return tokenized_corpus
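A usage sketch for tokenize; note that the function relies on a module-level spaCy pipeline named nlp, which is assumed to be loaded elsewhere:

import spacy

nlp = spacy.load("en_core_web_sm")  # the function above assumes `nlp` exists at module level

corpus = ["The movie cost $10 and it was GREAT, but the 3 sequels were not!"]
print(tokenize(corpus, remove_stopwords=True, remove_puncts=True))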
Example #23
def preprocess_through_ekphrasis(train_file_path, test_file_path,
                                 trial_file_path):
    text_processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        spell_correction=True,
        all_caps_tag="wrap",
        fix_bad_unicode=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    for file_path in [train_file_path, test_file_path, trial_file_path]:
        with open(file_path, 'r', newline='') as file:
            new_sentences = list()
            labels = list()
            for line in file:
                labels.append(line.split('\t')[0])
                new_sentences.append(" ".join(
                    text_processor.pre_process_doc(line.split('\t')[1])))
        with open(file_path[:-4] + "_ekphrasis.csv", 'w',
                  newline='') as new_file:
            for label, sentence in zip(labels, new_sentences):
                new_file.write("{}\t{}\n".format(
                    label,
                    sentence.replace("[ <hashtag> triggerword </hashtag> #]",
                                     "[#TRIGGERWORD#]").replace(
                                         "[ <allcaps> newline </allcaps> ]",
                                         "[NEWLINE]")))
Example #24
    def __init__(self):
        self.transformations = []
        self.text_processor = TextPreProcessor(
            fix_html=True,  # fix HTML tokens

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="english",

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="english",
            unpack_hashtags=False,  # perform word segmentation on hashtags
            unpack_contractions=False,  # Unpack contractions (can't -> can not)
            spell_correct_elong=True,  # spell correction for elongated words
        )

        self.punct = "[\.,:;\(\)\[\]@\-\$£]"

        nltk.download('stopwords')
        self.stops = stopwords.words('english')

        self.nlp = spacy.load('en_core_web_lg')
Example #25
    def __init__(self, text, **kwargs):
        self.text = text
        self.text_processor = TextPreProcessor(
            # terms that will be normalized, e.g. an email address becomes <email>
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'date', 'number'
            ],

            # terms that will be annotated e.g. <hashtag>#test</hashtag>
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis'
            },
            fix_html=True,  # fix HTML tokens
            unpack_hashtags=True,  # perform word segmentation on hashtags

            # select a tokenizer. You can use SocialTokenizer, or pass your own if not text tokenized on whitespace
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionary.
            dicts=[emoticons])
Example #26
    def __init__(self, word_indices, text_lengths, **kwargs):

        self.word_indices = word_indices

        filter_classes = kwargs.get("filter_classes", None)
        self.y_one_hot = kwargs.get("y_one_hot", True)

        self.pipeline = Pipeline([
            ('preprocess',
             CustomPreProcessor(
                 TextPreProcessor(
                     backoff=[
                         'url', 'email', 'percent', 'money', 'phone', 'user',
                         'time', 'url', 'date', 'number'
                     ],
                     include_tags={
                         "hashtag", "allcaps", "elongated", "repeated",
                         'emphasis', 'censored'
                     },
                     fix_html=True,
                     segmenter="twitter",
                     corrector="twitter",
                     unpack_hashtags=True,
                     unpack_contractions=True,
                     spell_correct_elong=False,
                     tokenizer=SocialTokenizer(lowercase=True).tokenize,
                     dicts=[emoticons]))),
            ('ext',
             EmbeddingsExtractor(word_indices=word_indices,
                                 max_lengths=text_lengths,
                                 add_tokens=True,
                                 unk_policy="random"))
        ])

        # loading data
        print("Loading data...")
        dataset = DataLoader(verbose=False).get_data(years=None, datasets=None)
        random.Random(42).shuffle(dataset)

        if filter_classes:
            dataset = [d for d in dataset if d[0] in filter_classes]

        self.X = [obs[1] for obs in dataset]
        self.y = [obs[0] for obs in dataset]
        print("total observations:", len(self.y))

        print("-------------------\ntraining set stats\n-------------------")
        print_dataset_statistics(self.y)
        print("-------------------")
Example #27
    def twitter_preprocess(self):
        preprocessor = TextPreProcessor(
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'date', 'number'
            ],
            annotate={
                "hashtag", "elongated", "allcaps", "repeated", 'emphasis',
                'censored'
            },
            all_caps_tag="wrap",
            fix_text=True,
            segmenter="twitter_2018",
            corrector="twitter_2018",
            unpack_hashtags=True,
            unpack_contractions=True,
            spell_correct_elong=False,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            dicts=[emoticons])

        text = self.data
        cache_file = os.path.join('./', "cached",
                                  "preprocessed_" + self.name + ".pkl")
        preprocessed = None
        if os.path.isfile(cache_file):
            with open(cache_file, 'rb') as f:
                preprocessed = pickle.load(f)
        else:
            preprocessed = [
                preprocessor.pre_process_doc(x)
                for x in tqdm(text, desc="Preprocessing dataset...")
            ]
            with open(cache_file, 'wb') as f:
                pickle.dump(preprocessed, f)

        return preprocessed
Example #28
def twitter_preprocessor():
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'phone', 'user'],
        annotate={
            "hashtag", "elongated", "allcaps", "repeated", 'emphasis',
            'censored'
        },
        all_caps_tag="wrap",
        fix_text=False,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize).pre_process_doc
    return preprocessor
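Usage sketch for the preprocessor factory above:

preprocess = twitter_preprocessor()
print(preprocess("@user I LOVE #TwinPeaks soooo much!!!"))
# -> list of tokens with the user handle normalized and hashtag/allcaps/elongated annotations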
Example #29
def emotion_and_split():

    text_process = TextPreProcessor(
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    )

    return text_process
Example #30
    def __init__(self):
        self.root_dir = "CrisisLexT26/"
        self.count = 0
        self.natural_disasters = []
        self.non_natural_disasters = []

        self.prep_natural_disasters = []
        self.prep_non_natural_disasters = []

        self.nat_labels = []
        self.non_natural_labels = []

        self.en_prep_nat_tweets = []
        self.en_prep_non_nat_tweets = []

        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            fix_html=True,  # fix HTML tokens

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=True,  # spell correction for elongated words

            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionary.
            dicts=[emoticons])