def tokenize(sentence, word_tokenizer=ReviewTokenizer(), stemmer=None, lower=False,
             remove_punc=False, remove_stopwords=False, remove_emoji=False, convert_neg=False):

    tokens = word_tokenizer.tokenize(sentence)

    # convert tokens to lowercase
    if lower:
        tokens = [token.lower() for token in tokens]

    # remove stopword tokens
    if remove_stopwords:
        tokens = [token for token in tokens if token not in STOPWORDS]

    # remove emoji tokens
    if remove_emoji:
        tokens = [token for token in tokens if not EMOTICON_RE.search(token)]

    # mark tokens that fall inside a negation scope (handled by _convert_neg)
    if convert_neg:
        tokens = _convert_neg(tokens)

    # remove punctuation tokens
    if remove_punc:
        tokens = [token for token in tokens if token not in PUNCTUATION]

    # stem tokens
    if stemmer:
        tokens = [stemmer.stem(token) for token in tokens]

    # map special tokens to their replacements (e.g. "n't" -> "not")
    tokens = [SPECIAL_TOKEN_DICT.get(token, token) for token in tokens]

    return tokens
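# Illustrative usage sketch (not in the original source): assumes the module-level
# ReviewTokenizer, PUNCTUATION and SPECIAL_TOKEN_DICT defined further down in this
# file are in scope.
print(tokenize("This phone is GREAT !!", lower=True, remove_punc=True))
# -> roughly ['this', 'phone', 'is', 'great']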
Example #2
def _lowerize(word, keep_all_upper=False):
    # keep emoticons, optionally all-caps words, and the special placeholder
    # tokens 'URL' and '@USER' as-is; lowercase everything else
    if EMOTICON_RE.search(word):
        return word
    elif word.isupper() and keep_all_upper:
        return word
    elif word in ('URL', '@USER'):
        return word
    else:
        return word.lower()
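# Illustrative behaviour (not in the original source; assumes EMOTICON_RE matches
# classic smileys such as ':D'):
#   _lowerize('GREAT')                      -> 'great'
#   _lowerize('GREAT', keep_all_upper=True) -> 'GREAT'
#   _lowerize(':D')                         -> ':D'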
Example #3
def segment_sent(text, emoji_tokenizer=TweetTokenizer()):
    text = sanitise(text)
    sentences = []
    for sentence in sent_tokenize(text):
        if EMOTICON_RE.search(sentence):
            new_sentences = []
            tokens = emoji_tokenizer.tokenize(sentence)
            new_sentence = []
            for token in tokens:
                new_sentence.append(token)
                if EMOTICON_RE.search(token) or token in '.?!':
                    new_sentences.append(' '.join(new_sentence))
                    new_sentence = []
            if new_sentence:
                new_sentences.append(' '.join(new_sentence))
            sentences += new_sentences
        else:
            sentences.append(sentence)

    # merge a trailing lone punctuation "sentence" back into the previous one
    if len(sentences) > 1:
        if sentences[-1] in ['.', '!', '?']:
            sentences[-2] = sentences[-2] + sentences[-1]
            sentences = sentences[:-1]
    return sentences
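# Illustrative sketch (not in the original source): assumes sanitise() and
# EMOTICON_RE from this module and nltk's sent_tokenize are available, and that
# sanitise() leaves this text unchanged.
#   segment_sent("Great phone :) battery could be better.")
#   -> roughly ['Great phone :)', 'battery could be better .']
# i.e. emoticons act as extra sentence boundaries on top of sent_tokenize.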
Example #4
def tokenize(self, text):
    """
    :param text: str
    :rtype: list(str)
    :return: a tokenized list of strings; concatenating this list returns\
    the original string if `preserve_case=False`
    """
    # Fix HTML character entities:
    text = _replace_html_entities(text)
    # Remove username handles
    if self.strip_handles:
        text = remove_handles(text)
    # Normalize word lengthening
    if self.reduce_len:
        text = reduce_lengthening(text)
    # Shorten problematic sequences of characters
    safe_text = HANG_RE.sub(r"\1\1\1", text)
    # Tokenize, matching emoji shortcodes and Discord-style custom emotes
    # ahead of the standard REGEXPS patterns:
    custom_Re = regex.compile(
        r"""(%s)"""
        % "|".join(
            (
                r":[^:\s]+:",
                r"<:[^:\s]+:[0-9]+>",
                r"<a:[^:\s]+:[0-9]+>",
                r"<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>",
            )
            + REGEXPS
        ),
        regex.VERBOSE | regex.I | regex.UNICODE,
    )
    words = custom_Re.findall(safe_text)
    # Possibly alter the case, but avoid changing emoticons like :D into :d
    if not self.preserve_case:
        words = list(
            map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
        )
    return words
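# Quick, illustrative check (not in the original source) of three of the extra
# emote patterns added above; the stdlib re module is enough for this:
import re
emote_re = re.compile(r":[^:\s]+:|<:[^:\s]+:[0-9]+>|<a:[^:\s]+:[0-9]+>")
print(emote_re.findall("nice :thumbsup: <:kappa:123456> <a:wave:987654>"))
# -> [':thumbsup:', '<:kappa:123456>', '<a:wave:987654>']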
Example #5
# imports assumed by this snippet (EMOTICON_RE is taken from nltk.tokenize.casual)
import json
import re
import string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize.casual import EMOTICON_RE
from nltk.tokenize.treebank import MacIntyreContractions, TreebankWordTokenizer

pd.set_option('display.max_colwidth', None)  # None rather than the deprecated -1
pd.set_option('display.max_columns', 10)

DEFAULT_DATA_FILE = "./data/sample_data.json"
IMAGES_DIRECTORY = './images'
DEFAULT_SEED = 42
STOPWORDS = set(
    stopwords.words('english') +
    ["'ve", "'d", "'s", "one", "use", "would", "get", "also"]
) - {'not', 'no', 'won', 'more', 'above', 'very', 'against', 'again'}
PUNCTUATION = string.punctuation + '...'
SPECIAL_TOKEN_DICT = {"n't": 'not'}

flatten = lambda l: [item for sublist in l for item in sublist]
is_word = lambda token: not (EMOTICON_RE.search(token) or token in string.punctuation or token in STOPWORDS)

boundary_punc = '.:;!?,'
NEG_SENT_BOUND_RE = re.compile(EMOTICON_RE.pattern + '|' +
                               '|'.join(re.escape(punc) for punc in boundary_punc))
NEG_WORD_RE = re.compile(r"(?:^(?:never|no|nothing|nowhere|noone|none|not)$)")
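# NEG_WORD_RE matches a token that is itself a negation word ('not', 'never', ...),
# while NEG_SENT_BOUND_RE matches tokens that close a negation scope (sentence
# punctuation or an emoticon); presumably both feed the _convert_neg step
# referenced in the tokenize() example above.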


def main(data_file, seed):

    # set seed
    np.random.seed(seed)

    # load the newline-delimited JSON records into a DataFrame
    # (data_file is expected to be an iterable of open file handles)
    data = [json.loads(line) for file in data_file for line in file]
    df = pd.DataFrame.from_dict(data)

class ReviewTokenizer(TreebankWordTokenizer):

    _contractions = MacIntyreContractions()
    CONTRACTIONS = list(map(re.compile, _contractions.CONTRACTIONS2 + _contractions.CONTRACTIONS3))

    PUNCTUATION = [
        (re.compile(r'([,])([^\d])'), r' \1 \2'),
        (re.compile(r'([,])$'), r' \1 '),
        (re.compile(r'\.\.\.'), r' ... '),
        (re.compile(r'[;@#&]'), r' \g<0> '),
        # Handles the final period
        (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r'\1 \2\3 '),
        (re.compile(r'[?!]'), r' \g<0> '),
        (re.compile(r"([^'])' "), r"\1 ' "),