Example No. 1
    def tokenize(self, text):

        text = _replace_html_entities(text)

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        text = " " + text + " "

        # split quotes and contractions (e.g. "don't" -> "do n't", "cannot" -> "can not")
        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)
        for regexp in self.CONTRACTIONS:
            text = regexp.sub(r' \1 \2 ', text)

        # handle emoticons: insert a space before any emoticon that is glued
        # to the preceding text (iterate right-to-left so match positions stay valid)
        for emoticon in list(EMOTICON_RE.finditer(text))[::-1]:
            pos = emoticon.span()[0]
            if text[pos - 1] != ' ':
                text = text[:pos] + ' ' + text[pos:]

        return text.split()
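
For orientation, a small usage sketch of this method; it assumes ReviewTokenizer is the class sketched in Example No. 14 below, so the exact token boundaries depend on its quote and punctuation rules:

tokenizer = ReviewTokenizer()
print(tokenizer.tokenize("I don't like it... :)"))
# roughly: ['I', 'do', "n't", 'like', 'it', '...', ':)']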
def tokenize(sentence, word_tokenizer=ReviewTokenizer(), stemmer=None, lower=False,
             remove_punc=False, remove_stopwords=False, remove_emoji=False,
             convert_neg=False):

    tokens = word_tokenizer.tokenize(sentence)

    # convert tokens to lowercase
    if lower:
        tokens = [token.lower() for token in tokens]

    # remove stopword tokens
    if remove_stopwords:
        tokens = [token for token in tokens if token not in STOPWORDS]

    # remove emoji tokens
    if remove_emoji:
        tokens = [token for token in tokens if not EMOTICON_RE.search(token)]

    # convert tokens inside a negation scope (handled by _convert_neg)
    if convert_neg:
        tokens = _convert_neg(tokens)

    # remove punctuation tokens
    if remove_punc:
        tokens = [token for token in tokens if token not in PUNCTUATION]

    # stem tokens
    if stemmer:
        tokens = [stemmer.stem(token) for token in tokens]

    # map special tokens to their replacements (e.g. "n't" -> "not")
    tokens = [SPECIAL_TOKEN_DICT.get(token, token) for token in tokens]

    return tokens
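
A hedged usage sketch of the wrapper, assuming the module-level constants (STOPWORDS, PUNCTUATION, SPECIAL_TOKEN_DICT, EMOTICON_RE) and the ReviewTokenizer class defined in Examples No. 13 and 14 are in scope; the stemmer is optional:

from nltk.stem.porter import PorterStemmer

tokens = tokenize("This phone isn't great... :(",
                  stemmer=PorterStemmer(),
                  lower=True,
                  remove_punc=True,
                  remove_stopwords=True)
# a list of stemmed, lower-cased word tokens with stopwords and
# punctuation-only tokens removed; emoticons are kept unless remove_emoji=True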
Example No. 3
def strip_emoticon(tweets):
    # remove emoticons from each token, then collapse leftover whitespace
    no_emoticon = [[
        re.sub(r'(?!\n)\s+', ' ', EMOTICON_RE.sub('', token))
        for token in tweet
    ] for tweet in tweets]

    # drop tokens that became empty after stripping
    return [[token for token in tweet if token] for tweet in no_emoticon]
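
For illustration, a hypothetical call on a list of pre-tokenized tweets (the nested-list shape is assumed from the comprehension above):

tweets = [["nice", "weather", ":)"], ["so", "tired", ":-("]]
print(strip_emoticon(tweets))
# expected: [['nice', 'weather'], ['so', 'tired']]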
Example No. 4
 def preprocess(self, text: str) -> str:
     get_emoticon = self._emoticon_mapping.get
     text = URLS_RE.sub("<url>", text)
     text = USERNAMES_RE.sub("<user>", text)
     text = HASHTAGS_RE.sub("<hashtag>", text)
     text = NUMBERS_RE.sub("<number>", text)
     text = EMOTICON_RE.sub(lambda m: get_emoticon(m.group()) or m.group(), text)
     return text
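
A self-contained sketch of the same normalization idea; the regexes and the emoticon mapping below are illustrative stand-ins for the class attributes the method relies on (URLS_RE, USERNAMES_RE, HASHTAGS_RE, NUMBERS_RE, _emoticon_mapping), not their actual definitions:

import re

URLS_RE = re.compile(r"https?://\S+")
USERNAMES_RE = re.compile(r"@\w+")
HASHTAGS_RE = re.compile(r"#\w+")
NUMBERS_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
EMOTICON_MAPPING = {":)": "<smile>", ":(": "<sadface>"}  # illustrative only

def preprocess(text):
    # replace URLs, handles, hashtags and numbers with placeholder tokens,
    # then map known emoticons to sentiment placeholders
    text = URLS_RE.sub("<url>", text)
    text = USERNAMES_RE.sub("<user>", text)
    text = HASHTAGS_RE.sub("<hashtag>", text)
    text = NUMBERS_RE.sub("<number>", text)
    for emo, tag in EMOTICON_MAPPING.items():
        text = text.replace(emo, tag)
    return text

print(preprocess("@bob check https://example.com #deals :)"))
# -> "<user> check <url> <hashtag> <smile>"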
Example No. 5
    def exec_simple(self, text):
        tokenizer = TweetTokenizer()
        # simple preprocessing: lower-case the raw text
        new_text = text.lower()

        # tokenizing
        tokens = tokenizer.tokenize(new_text)

        # removing emoticons
        new_tokens = []
        for c in tokens:
            if EMOTICON_RE.fullmatch(c) is not None:  # deleting emoticons
                continue

            if c.startswith('#') or c.startswith('@'):  # deleting user names and hashtags
                continue

            if re.sub(r"(http[s]?|ftp):\S+", "", c) == '':  # removing urls
                continue

            new_tokens.append(c)

        # negating from 'not' to next punctuation sign
        for i in range(0, len(new_tokens)):
            if (new_tokens[i] == 'not' or new_tokens[i] == 'dont'
                    or new_tokens[i].endswith("n't")):
                for j in range(i + 1, len(new_tokens)):
                    if new_tokens[j] not in string.punctuation:
                        new_tokens[j] = new_tokens[j] + '_neg'
                    else:
                        break

        # removing punctuation signs: drop tokens whose first character is a
        # punctuation sign
        punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~'
        new_text = [c for c in new_tokens if not (c and c[0] in punctuation)]

        # run each processing module over the cleaned tokens
        for mod in self.queue:
            tokens = mod(new_text).process()

        # join the processed tokens back into a single string
        return ' '.join(tokens)
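
The negation step above follows a common sentiment-analysis heuristic: every token after a negation cue ('not', 'dont', or any token ending in "n't") gets a '_neg' suffix until the next punctuation token, so a sequence like "not good at all ." roughly becomes "not good_neg at_neg all_neg .". This lets a downstream bag-of-words model distinguish negated from non-negated occurrences of the same word.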
Example No. 6
def _lowerize(word, keep_all_upper=False):
    if EMOTICON_RE.search(word):
        return word
    elif word.isupper() and keep_all_upper:
        return word
    elif word == 'URL':
        return word
    elif word == '@USER':
        return word
    else:
        return word.lower()
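
A quick illustration of the intended behaviour (the outputs shown are expectations, assuming EMOTICON_RE is NLTK's emoticon regex):

print(_lowerize("Great"))                     # -> "great"
print(_lowerize(":D"))                        # -> ":D"   (emoticons are preserved)
print(_lowerize("WOW", keep_all_upper=True))  # -> "WOW"
print(_lowerize("@USER"))                     # -> "@USER"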
Example No. 7
def segment_sent(text, emoji_tokenizer=TweetTokenizer()):
    text = sanitise(text)
    sentences = []
    for sentence in sent_tokenize(text):
        if EMOTICON_RE.search(sentence):
            new_sentences = []
            tokens = emoji_tokenizer.tokenize(sentence)
            new_sentence = []
            for token in tokens:
                new_sentence.append(token)
                if EMOTICON_RE.search(token) or token in '.?!':
                    new_sentences.append(' '.join(new_sentence))
                    new_sentence = []
            if new_sentence:
                new_sentences.append(' '.join(new_sentence))
            sentences += new_sentences
        else:
            sentences.append(sentence)

    if len(sentences) != 0:
        if sentences[-1] in ['.', '!', '?']:
            sentences[-2] = sentences[-2] + sentences[-1]
            sentences = sentences[:-1]
    return sentences
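
A hypothetical call, assuming sanitise and NLTK's sent_tokenize, TweetTokenizer and EMOTICON_RE are available as imported by the surrounding module:

text = "Loved the service :) will come back! The food was average."
for s in segment_sent(text):
    print(s)
# expected: the first sentence is split again after the emoticon, e.g.
#   "Loved the service :)"
#   "will come back !"
#   "The food was average."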
Example No. 8
def classifyEmoticons(text):

    # find all emoticons
    emoticons = EMOTICON_RE.findall(text)

    pos = any([emo in POS_EMOTICONS
               for emo in emoticons]) or bool(POS_EMOJIS_RE.search(text))
    neg = any([emo in NEG_EMOTICONS
               for emo in emoticons]) or bool(NEG_EMOJIS_RE.search(text))

    if pos and neg:
        return 'N/A'
    elif pos and not neg:
        return 'pos'
    elif neg and not pos:
        return 'neg'
    elif not pos and not neg:
        return None
def emoticonList(corpus):
    emolist = []
    emocount = []
    emoTorF = []
    for a in corpus:
        ct = []  # store count
        TorF = []  # positive or negative
        all_emoticons = EMOTICON_RE.findall(a)
        ct.append(len(all_emoticons))
        if (len(all_emoticons) != 0):
            ed = EmoticonDetector()
            all_emoticons.sort()
            tf = (-1, 1)[ed.is_positive(all_emoticons[0])]
            TorF.append(tf)
        else:
            TorF.append(0)
        emolist.append(all_emoticons)
        emocount.append(ct)
        emoTorF.append(TorF)
    return emocount, emoTorF, emolist
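
A small sketch of classifyEmoticons in use; POS_EMOTICONS/NEG_EMOTICONS and the emoji regexes are assumed to be defined elsewhere in the module, so the labels below are only the expected outcomes:

print(classifyEmoticons("great day :)"))         # expected: 'pos'
print(classifyEmoticons("awful service :("))     # expected: 'neg'
print(classifyEmoticons("happy :) but sad :("))  # expected: 'N/A' (mixed signals)
print(classifyEmoticons("no emoticons here"))    # expected: None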
Example No. 10
 def tokenize(self, text):
     """
     :param text: str
     :rtype: list(str)
     :return: a tokenized list of strings; concatenating this list returns\
     the original string if `preserve_case=False`
     """
     # Fix HTML character entities:
     text = _replace_html_entities(text)
     # Remove username handles
     if self.strip_handles:
         text = remove_handles(text)
     # Normalize word lengthening
     if self.reduce_len:
         text = reduce_lengthening(text)
     # Shorten problematic sequences of characters
     safe_text = HANG_RE.sub(r"\1\1\1", text)
      # Tokenize with a custom pattern that also matches emote-style tokens
      # (e.g. :name:, <:name:id>, <a:name:id>) in addition to the standard REGEXPS:
     custom_Re = regex.compile(
         r"""(%s)"""
         % "|".join(
             (
                 r":[^:\s]+:",
                 r"<:[^:\s]+:[0-9]+>",
                 r"<a:[^:\s]+:[0-9]+>",
                 r"<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>",
             )
             + REGEXPS
         ),
         regex.VERBOSE | regex.I | regex.UNICODE,
     )
     words = custom_Re.findall(safe_text)
      # Possibly alter the case, but avoid changing emoticons like :D into :d
     if not self.preserve_case:
         words = list(
             map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
         )
     return words
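
As a rough usage sketch, assuming this method lives on a TweetTokenizer-like class (here hypothetically named DiscordTokenizer) with preserve_case, strip_handles and reduce_len attributes:

tok = DiscordTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
print(tok.tokenize("soooo good :D <:pepega:123456789> :custom_emote:"))
# expected: emote-style tokens such as <:pepega:123456789> and :custom_emote:
# survive as single tokens, and :D keeps its case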
Example No. 11
def json2csv_preprocess(json_file,
                        outfile,
                        fields,
                        encoding='utf8',
                        errors='replace',
                        gzip_compress=False,
                        skip_retweets=True,
                        skip_tongue_tweets=True,
                        skip_ambiguous_tweets=True,
                        strip_off_emoticons=True,
                        remove_duplicates=True,
                        limit=None):
    """
    Convert a JSON file to a CSV file, preprocessing each row to obtain a suitable
    dataset for tweet sentiment analysis.

    :param json_file: the original json file containing tweets.
    :param outfile: the output csv filename.
    :param fields: a list of fields that will be extracted from the json file and
        kept in the output csv file.
    :param encoding: the encoding of the files.
    :param errors: the error handling strategy for the output writer.
    :param gzip_compress: if True, create a compressed GZIP file.

    :param skip_retweets: if True, remove retweets.
    :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P"
        emoticons.
    :param skip_ambiguous_tweets: if True, remove tweets containing both happy
        and sad emoticons.
    :param strip_off_emoticons: if True, strip off emoticons from all tweets.
    :param remove_duplicates: if True, remove tweets appearing more than once.
    :param limit: an integer to set the number of tweets to convert. After the
        limit is reached the conversion will stop. It can be useful to create
        subsets of the original tweets json data.
    """
    with codecs.open(json_file, encoding=encoding) as fp:
        (writer, outf) = outf_writer_compat(outfile, encoding, errors,
                                            gzip_compress)
        # write the list of fields as header
        writer.writerow(fields)

        if remove_duplicates == True:
            tweets_cache = []
        i = 0
        for line in fp:
            tweet = json.loads(line)
            row = extract_fields(tweet, fields)
            try:
                text = row[fields.index('text')]
                # Remove retweets
                if skip_retweets == True:
                    if re.search(r'\bRT\b', text):
                        continue
                # Remove tweets containing ":P" and ":-P" emoticons
                if skip_tongue_tweets == True:
                    if re.search(r'\:\-?P\b', text):
                        continue
                # Remove tweets containing both happy and sad emoticons
                if skip_ambiguous_tweets == True:
                    all_emoticons = EMOTICON_RE.findall(text)
                    if all_emoticons:
                        if (set(all_emoticons) & HAPPY) and (set(all_emoticons)
                                                             & SAD):
                            continue
                # Strip off emoticons from all tweets
                if strip_off_emoticons == True:
                    row[fields.index('text')] = re.sub(
                        r'(?!\n)\s+', ' ', EMOTICON_RE.sub('', text))
                # Remove duplicate tweets
                if remove_duplicates == True:
                    if row[fields.index('text')] in tweets_cache:
                        continue
                    else:
                        tweets_cache.append(row[fields.index('text')])
            except ValueError:
                pass
            writer.writerow(row)
            i += 1
            if limit and i >= limit:
                break
        outf.close()
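
A hedged example of how this converter might be invoked; the input filename and field list are placeholders, and outf_writer_compat/extract_fields are assumed to be the helpers the module already imports. Note that fields must include 'text' for the preprocessing filters to apply:

fields = ['id', 'text']
json2csv_preprocess('tweets.json', 'tweets_sentiment.csv', fields,
                    limit=10000,
                    skip_retweets=True,
                    skip_ambiguous_tweets=True,
                    strip_off_emoticons=True,
                    remove_duplicates=True)
# writes a header row plus up to 10000 preprocessed tweets to the CSV file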
Example No. 13
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 10)

DEFAULT_DATA_FILE = "./data/sample_data.json"
IMAGES_DIRECTORY = './images'
DEFAULT_SEED = 42
STOPWORDS = set(
    stopwords.words('english') +
    ["'ve", "'s", "one", "use", "would", "get", "also"]) - {
        'not', 'no', 'won', 'more', 'above', 'very', 'against', 'again'
    }
PUNCTUATION = string.punctuation + '...'
SPECIAL_TOKEN_DICT = {"n't": 'not'}

flatten = lambda l: [item for sublist in l for item in sublist]
is_word = lambda token: not (EMOTICON_RE.search(token) or
                             token in string.punctuation or
                             token in STOPWORDS)

NEG_SENT_BOUND_RE = re.compile(EMOTICON_RE.pattern + '|' +
                               '|'.join([r'\.', r'\:', r'\;', r'\!', r'\?', r'\,']))
NEG_WORD_RE = re.compile(r"(?:^(?:never|no|nothing|nowhere|noone|none|not)$)")


def main(data_file, seed):

    # set seed
    np.random.seed(seed)

    # load in a pd.df
    data = [json.loads(line) for file in data_file for line in file]
    df = pd.DataFrame.from_dict(data)
Example No. 14
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 10)

DEFAULT_DATA_FILE = "./data/sample_data.json"
IMAGES_DIRECTORY = './images'
DEFAULT_SEED = 42
STOPWORDS = set(stopwords.words('english') + ["'ve", "'d", "'s", "one", "use", "would", "get", "also"]) - {'not', 'no', 'won', 'more', 'above', 'very', 'against', 'again'}
PUNCTUATION = string.punctuation + '...'
SPECIAL_TOKEN_DICT = {"n't": 'not'}
boundary_punc = '.:;!?,'
NEG_SENT_BOUND_RE = re.compile(EMOTICON_RE.pattern + '|' + '|'.join([re.escape(punc) for punc in boundary_punc]))
NEG_WORD_RE = re.compile(r"(?:^(?:never|no|nothing|nowhere|noone|none|not)$)")

flatten = lambda l: [item for sublist in l for item in sublist]
is_word = lambda token: not(EMOTICON_RE.search(token) or token in string.punctuation or token in STOPWORDS)

class ReviewTokenizer(TreebankWordTokenizer):

    _contractions = MacIntyreContractions()
    CONTRACTIONS = list(map(re.compile, _contractions.CONTRACTIONS2 + _contractions.CONTRACTIONS3))

    PUNCTUATION = [
        (re.compile(r'([,])([^\d])'), r' \1 \2'),
        (re.compile(r'([,])$'), r' \1 '),
        (re.compile(r'\.\.\.'), r' ... '),
        (re.compile(r'[;@#&]'), r' \g<0> '),
        # Handles the final period
        (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r'\1 \2\3 '),
        (re.compile(r'[?!]'), r' \g<0> '),
        (re.compile(r"([^'])' "), r"\1 ' "),
Example No. 15
i = 0
url_re = re.compile(URLS, re.VERBOSE | re.I | re.UNICODE)
hashtag_re = re.compile('(?:^|\s)[##]{1}(\w+)', re.UNICODE)
#mention_re = re.compile('(?:^|\s)[@@]{1}([^\s#<>[\]|{}]+)', re.UNICODE) # To include more complete names
mention_re = re.compile('(?:^|\s)[@@]{1}(\w+)', re.UNICODE)

with open(text_file, 'r') as text_reader, open(
        words_file, 'w', encoding='utf-8') as words_writer, open(
            emo_file, 'w', encoding='utf-8') as emo_writer, open(
                hash_file, 'w', encoding='utf-8') as hash_writer, open(
                    at_file, 'w', encoding='utf-8') as at_writer, open(
                        link_file, 'w', encoding='utf-8') as link_writer:
    for line in text_reader:
        line = line.rstrip().lower()
        hashs = hashtag_re.findall(line)
        ats = mention_re.findall(line)
        links = url_re.findall(line)
        line = clean(line, hashs, ats, links)
        emoticons = emo_re.findall(line)
        emojis = [w for w in line if w in emoji.UNICODE_EMOJI]
        words = re.findall('[a-záéíóúñ][a-záéíóúñ_-]+',
                           line)  # TODO: revise to also remove mentions, hashtags and links

        words_writer.write(' '.join(w for w in words) + '\n')
        emo_writer.write(' '.join(w for w in emoticons + emojis) + '\n')
        hash_writer.write(' '.join(w for w in hashs) + '\n')
        at_writer.write(' '.join(w for w in ats) + '\n')
        link_writer.write(' '.join(w for w in links) + '\n')
        i += 1
        print(i)