Example #1
class MySentences(object):
    def __init__(self, listings, gzpFiles):
        self.listings = listings
        self.gzip_files = gzpFiles
        self.tknzr = TweetTokenizer()

    def __iter__(self):
        for files in self.listings:
            file_done = False
            counter = 0
            for (fname) in files:
                if file_done:
                    break
                for line in open(fname, 'rb'):
                    if counter >= MAX_TW_LANG:
                        file_done = True
                        break

                    counter += 1
                    tweet = line.split('\t')[-1]
                    tweet = preprocess_tweet(tweet)
                    tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                    yield filter(lambda word: ' ' not in word, tweet)

        counter = 0
        for (fname) in self.gzip_files:
            for line in gzip.open(fname, 'rb'):
                if counter >= MAX_TW_LANG:
                    return

                counter += 1
                tweet = line.split('\t')[-1]
                tweet = preprocess_tweet(tweet)
                tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                yield filter(lambda word: ' ' not in word, tweet)
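
# Usage sketch (not part of the original snippet): an iterable like MySentences
# is typically streamed into gensim's Word2Vec so the tweet corpus never has to
# fit in memory.  preprocess_tweet, MAX_TW_LANG, gzip and TweetTokenizer are
# assumed to be defined/imported elsewhere in the source module; the file names
# below are placeholders and the keyword arguments follow the gensim 3.x API
# (gensim 4 renames size to vector_size).
from gensim.models import Word2Vec

sentences = MySentences(listings=[['tweets_en.tsv']], gzpFiles=['tweets_de.tsv.gz'])
model = Word2Vec(sentences, size=52, window=5, min_count=5, workers=4)
model.save('tweet_w2v.model')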
Example #2
def token(X_train, X_test):
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    x_train = []
    word_dict = {}
    word_index = 1

    for doc in X_train:
        word_seq = []
        for word in tknzr.tokenize(doc):
            if word not in word_dict:
                word_dict[word] = word_index
                word_index += 1
            word_seq.append(word_dict[word])
        x_train.append(word_seq)

    x_train = sequence.pad_sequences(x_train, maxlen=200, padding='post')
    word_dict['unknown-words-in-test'] = 0

    x_test = []
    for doc in X_test:
        word_seq = []
        for word in tknzr.tokenize(doc):
            if word in word_dict:
                word_seq.append(word_dict[word])
            else:
                word_seq.append(0)
        x_test.append(word_seq)

    x_test = sequence.pad_sequences(x_test, maxlen=200, padding='post')

    return x_train, x_test, word_dict
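
# Usage sketch (assumptions): `sequence` above refers to
# keras.preprocessing.sequence and TweetTokenizer comes from nltk.tokenize.
# Index 0 is reserved for words that only appear in the test set.
from nltk.tokenize import TweetTokenizer
from keras.preprocessing import sequence

X_train = ["I love this movie!", "Worst film ever :("]
X_test = ["I love popcorn"]
x_train, x_test, word_dict = token(X_train, X_test)
print(x_train.shape, x_test.shape)  # (2, 200) and (1, 200) after padding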
Example #3
class SentenceParser:
    def __init__(self):
        self.tokenizer = TweetTokenizer()
        self.emo_parser = NRC_AffectIntensity()

    def parse_sent(self,
                   str_response,
                   expressiveness=0.3):  # TODO:Remove later on
        response_list = []
        for sent in re.split('[?.!]', str_response):
            word_list = [word for word in self.tokenizer.tokenize(sent)]
            if word_list:
                d = {"word_list": word_list, "expressiveness": expressiveness}
                response_list.append(d)
        return response_list

    def return_emotions(self, word_list):
        emotion_list = []
        for word in word_list:
            # {'value': data[k]['value'], 'emotion': data[k]['emotion']}
            emotion_list.append(self.emo_parser.get_affect(word))
        return emotion_list

    def return_mood(self, word_list, mood):
        emotion_list = [None] * len(word_list)
        emotion_list[int(len(word_list) / 2)] = {
            'value': mood[1],
            'emotion': mood[0]
        }
        return emotion_list

    def parse_emo_sent(self, str_response, expressiveness=0.3):
        response_list = []
        d = {
            "word_list": [],
            "expressiveness": expressiveness,
            "emotion_list": []
        }
        for sent in re.split('[?.!]', str_response):
            for word in self.tokenizer.tokenize(sent):
                d["word_list"].append(word)
                d["emotion_list"].append(self.emo_parser.get_affect(word))
            d["word_list"].append(' . ')
        if d["word_list"]:
            response_list.append(d)
        return response_list

    def parse_mood_sent(self,
                        str_response,
                        expressiveness=0.3,
                        mood=('joy', 1.0)):
        # This makes all sentence with a certain mood, regardless of word based sentiment
        responses = self.parse_sent(str_response,
                                    expressiveness=expressiveness)
        for response in responses:
            response['emotion_list'] = self.return_mood(
                response['word_list'], mood)
        return responses
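
# Usage sketch (assumptions): re and TweetTokenizer are imported at module
# level, and NRC_AffectIntensity is the project's wrapper around the NRC
# Affect Intensity lexicon, exposing get_affect(word) -> {'value', 'emotion'}.
# A stand-in stub is used here so the sketch runs on its own.
class NRC_AffectIntensity:  # stand-in for the project's lexicon wrapper
    def get_affect(self, word):
        return {'value': 0.0, 'emotion': None}

parser = SentenceParser()
for sent in parser.parse_mood_sent("What a day. I am thrilled!", mood=('joy', 1.0)):
    # every sentence gets the same forced mood at its middle token
    print(sent['word_list'], sent['emotion_list'])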
Example #4
class LanguageModel:
    """
    N-gram model
    """
    def __init__(self, n_gram=2, missed_value=0.99):
        """

        :param n_gram: length of n-gram
        :param missed_value: default value for all unseen n-gram
        """
        self.n = n_gram
        self.n_grams = {}
        self.context = {}
        self.sentence_tokenizer = SentenceTokenizer()
        self.tokenizer = Tokenizer()
        self.missed_value = missed_value

    def build_model(self, text):
        sentences = self.sentence_tokenizer.tokenize(text)
        words = [
            list(
                filter(
                    lambda s: s.isalpha(),
                    self.tokenizer.tokenize(sentence.strip())
                )
            ) for sentence in sentences
        ]
        for sentence in words:
            if len(sentence) < self.n:
                key = " ".join(sentence)
                self.context.update({key: self.context.get(key, 0) + 1})
            else:
                for i in range(len(sentence) - self.n + 1):
                    context_key = " ".join(sentence[i:i + self.n - 1])
                    n_gram_key = " ".join(sentence[i:i + self.n])
                    self.context.update({context_key: self.context.get(context_key, 0) + 1})
                    self.n_grams.update({n_gram_key: self.n_grams.get(n_gram_key, 0) + 1})

    def calculate_proba(self, sentence):
        words = list(
            filter(
                lambda s: s.isalpha(),
                self.tokenizer.tokenize(sentence.strip())
            )
        )
        result = 1
        for i in range(min(self.n - 2, len(words) - 1), len(words)):
            if i < self.n - 1:
                size = sum([val for key, val in self.context.items() if len(key.split(" ")) == i+1])
                result *= self.context.get(" ".join(words[:i+1]), self.missed_value if i == self.n - 2 else 0) / size
            elif i > self.n - 2:
                context_key = " ".join(words[i-self.n+1:i])
                n_gram_key = " ".join(words[i-self.n+1:i+1])
                context_val = self.context.get(context_key, self.missed_value)
                n_gram_val = self.n_grams.get(n_gram_key, self.missed_value)
                p = n_gram_val / context_val
                result *= p
        return result
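
# Usage sketch (assumptions): SentenceTokenizer and Tokenizer are the project's
# own wrappers; any objects exposing .tokenize(text) -> list of strings fit, so
# stand-ins built on nltk are used here.
from nltk.tokenize import sent_tokenize, word_tokenize

class SentenceTokenizer:  # stand-in for the project's own class
    def tokenize(self, text):
        return sent_tokenize(text)

class Tokenizer:  # stand-in for the project's own class
    def tokenize(self, text):
        return word_tokenize(text)

lm = LanguageModel(n_gram=2, missed_value=0.99)
lm.build_model("The cat sat on the mat. The dog sat on the rug.")
print(lm.calculate_proba("The cat sat"))  # probability of the sentence under the bigram model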
Example #5
def clean_tweets(classifier, df, stop_words):
    tknzr = TweetTokenizer()
    for i in df.iterrows():
        # print('tweet: '+df['tweet_text'][i[0]])
        tokens = tknzr.tokenize(
            i[1]['tweet_text'])  # using NLTK tweet tokenizer

        custom_tokens = remove_noise(tokens, stop_words)
        # use .at to set a single cell; chained indexing like df['tokens'][i[0]]
        # raises SettingWithCopyWarning (see the pandas caveats on returning a
        # view versus a copy:
        # https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy)
        df.at[i[0], 'tokens'] = custom_tokens

        score = classifier.classify(
            dict([token, True] for token in custom_tokens))
        df.at[i[0], 'sentiment'] = score

    return df
Example #6
def search():

    # validate screen_name
    screen_name = request.args.get("screen_name", "")
    if not screen_name:
        return redirect(url_for("index"))
    positives = os.path.join(sys.path[0], "positive-words.txt")
    negatives = os.path.join(sys.path[0], "negative-words.txt")
    # get screen_name's tweets
    tweets = helper.get_user_timeline(screen_name)

    # TODO
    analyzer = Analyzer(positives, negatives)

    s = str(tweets)
    # analyze word
    tw = TweetTokenizer()
    #print(tw.tokenize(s))
    p = tw.tokenize(s)
    score = analyzer.analyze2(p)

    positive = float(score[0])
    negative = abs(float(score[1]))
    neutral = score[2]

    # generate chart
    chart = helper.chart(positive, negative, neutral)

    # render results
    return render_template("search.html", chart=chart, screen_name=screen_name)
Example #7
def normalize_tweet(tweet):
    # convert the tweet to lower case
    tweet = tweet.lower()
    # convert all urls to the string "URL"
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)

    # correct all multiple white spaces and punctuations to a single white space/punctuation
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    tweet = re.sub(r'[\s]+', ' ', tweet)
    tweet = re.sub(r'\!{2,}', '!', tweet)

    # convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

    # Extracting words(tokens) from the tweet
    twt_token = TweetTokenizer(strip_handles=True)
    token = twt_token.tokenize(tweet)

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    word_list = [tkn for tkn in token if tkn not in stop_words]

    # Using Rule Based Stemmer to find word stems
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in word_list]

    # Creating a sentence from the stems
    norm_tweet = " ".join(stems)

    return norm_tweet
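
# Usage sketch (assumptions): re, TweetTokenizer, stopwords and PorterStemmer
# are imported at module level, e.g.:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

print(normalize_tweet("Wow!!! Check this out... @user https://t.co/abc #NLProc"))
# roughly: 'wow ! check url nlproc'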
Example #8
def render_wordcloud(form, **kwargs):
    session = Session()
    results = search.search(session, **form.values())
    # Create the corpus from the results
    tknzr = TweetTokenizer()
    texts = []
    for r in results:
        tokens = []
        for sent in sent_tokenize(r.text.strip()):
            tokens += [
                w for w in tknzr.tokenize(sent.strip())
                if w.lower() not in stopwords_en
            ]
        texts.append(tokens)
    corpus = nltk.TextCollection(texts)
    corpus.collocations(100)
    # noinspection PyProtectedMember
    results = {
        'vocabulary': [list(i) for i in corpus.vocab().most_common(1000)],
        'collocations': corpus._collocations,
    }
    view = render_template('./templates/search/results_wordcloud.html',
                           form=form,
                           results=results,
                           **kwargs)
    session.close()
    return view
Example #9
 def load(self):
     # Load dictionary
     inBody = False
     with open(self.dict_path, 'r', encoding='utf-8') as r:
         next(r)
         for line in r:
             if inBody:
                 segs = line.strip().split('\t')
                 token = segs[0]
                 for cate_id in segs[1:]:
                     self.token_category[token].append(int(cate_id))
             else:
                 if line.startswith('%'):
                     inBody = True
                 else:
                     self.category_num += 1
     tokenizer = TweetTokenizer()
     with open(self.data_path, 'r', encoding='utf-8') as r:
         for line in r:
             tid, tweet, _ = line.rstrip().split('\t')
             tokens = tokenizer.tokenize(tweet)
             tokens = [t.replace('#', '').lower() for t in tokens]
             category_count = [0] * self.category_num
             for token in tokens:
                 # check the full token first, then progressively shorter prefixes
                 for i in range(min(len(token), 5)):
                     prefix = token if i == 0 else token[:-i]
                     if prefix in self.token_category:
                         for cate in self.token_category[prefix]:
                             category_count[cate - 1] += 1
                         break
             if len(tokens) > 0:
                 category_count = [c / len(tokens) for c in category_count]
             self.tid_vector[tid] = torch.FloatTensor(category_count)
Example #10
 def text_total_counts(self):
     with codecs.open(self._filepath + ".txt", "r", "latin-1") as f:
         lines = f.read()
         tknzr = TweetTokenizer()
         tknz_lines = tknzr.tokenize(lines)
         self._totalcount = len(tknz_lines)
     return self._totalcount
Example #11
 def ngrams(self):
     #        name = re.findall("\w+$",self._filepath)
     name = str(input("choose a seed: "))
     with codecs.open(self._filepath + ".txt", "r", "latin-1") as f:
         lines = f.read()
         tknzr = TweetTokenizer()
         tknz_lines = tknzr.tokenize(lines)
     emptylist = []
     maxhistory = int(input("Choose n for ngram, preferably 2 or 3: "))
     for i in range(2, maxhistory + 1):
         emptylist += nltk.ngrams(tknz_lines, i)
     cfd = ConditionalFreqDist([(tuple(a), b) for *a, b in emptylist])
     seed = [name]
     for i in range(100):
         for j in range(maxhistory - 1, 0, -1):
             if tuple(seed[-j:]) in cfd:
                 valuesum = sum(cfd[tuple(seed[-j:])].values())
                 value = random.randint(0, valuesum)
                 for key in cfd[tuple(seed[-j:])].keys():
                     value -= cfd[tuple(seed[-j:])][key]
                     if value <= 0:
                         seed.append(key)
                         break
                 break
             else:
                 continue
     return seed
Example #12
def tokenize_tweets(tweet_dict):
    tokenized_tweets = {}
    tknzr = TweetTokenizer()
    for k, v in tweet_dict.iteritems():
        tokenized_tweet = tknzr.tokenize(v)
        tokenized_tweets[k] = tokenized_tweet
    return tokenized_tweets
Example #13
class NltkTweetTokenizer(Tokenizer):
    def __init__(self) -> None:
        super().__init__()
        self._base_tokenizer = TweetTokenizer()

    def tokenize_text(self, text: str) -> List[str]:
        return self._base_tokenizer.tokenize(text)
def preprocess_text(tweet_text):
    tweet_tokenizer = TweetTokenizer()

    tokens = [
        token.lower().lstrip("@").lstrip("#")
        for token in tweet_tokenizer.tokenize(tweet_text)
    ]
    tokens_no_contra = [
        contractions[token].split() if token in contractions else [token]
        for token in tokens
    ]
    flat_list = [item for sublist in tokens_no_contra for item in sublist]
    tokens_semi_final = [
        token for token in flat_list
        if token not in punctuations and token not in en_stopwords
    ]
    final_t = [
        token.replace("'s", "") for token in tokens_semi_final
        if not re.match('((www\.[^\s]+)|(https?://[^\s]+))', token)
    ]

    text = []
    wnl = WordNetLemmatizer()
    tagged = pos_tag(final_t)
    for word, tag_prior in tagged:
        tag = nltk_tag_to_wordnet_tag(tag_prior)
        word = "not" if word == "n't" else word
        if tag:
            text.append(wnl.lemmatize(word.lower(), tag))
        else:
            text.append(wnl.lemmatize(word.lower()))

    return text
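
# The snippet above relies on several module-level names.  A minimal sketch of
# plausible definitions (assumptions, not the original author's code):
import re
import string
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

punctuations = set(string.punctuation)
en_stopwords = set(stopwords.words('english'))
contractions = {"can't": "can not", "won't": "will not", "it's": "it is"}  # illustrative subset


def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map a Penn Treebank POS tag to the WordNet POS constant used by the lemmatizer."""
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    return None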
Example #15
def tokenize_tweets(texts, segment=True, segment_vocab=None):
    tknzr = TweetTokenizer()
    token_x = [tknzr.tokenize(t) for t in texts]
    if not segment:
        return token_x

    # if segmentation is needed
    if segment_vocab is None:
        segment_vocab = set()
    wordsegment.load()
    tokens = []
    for line in token_x:
        tokens += line
    counter = Counter(tokens)
    # identify segment-able words
    segmented = {}
    for word in counter:
        if word not in segment_vocab:
            segment = wordsegment.segment(word)
            if len(segment) > 1:
                segmented[word] = segment
    # reconstruct the list
    _token_x = []
    for line in token_x:
        _line = []
        for token in line:
            if token in segmented.keys():
                _line += segmented[token]
            else:
                _line.append(token)
        _token_x.append(_line)
    return _token_x
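
# Usage sketch (assumptions): Counter comes from collections, wordsegment is the
# `wordsegment` package (load()/segment()), and segment_vocab is the set of
# words already known downstream, so only out-of-vocabulary tokens such as
# hashtag blobs get split.
from collections import Counter
from nltk.tokenize import TweetTokenizer
import wordsegment

known = {'this', 'is', 'great'}
tokens = tokenize_tweets(["this is #absolutelygreat"], segment=True, segment_vocab=known)
print(tokens)  # the hashtag token is likely split into ['absolutely', 'great']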
Example #16
def tokenize_tweets(input_file_name, out_file_name, type_file):
    outf = open(out_file_name, 'w')
    infn = open(input_file_name, 'r')
    tknzr = TweetTokenizer()

    while 1:
        lines = infn.readlines(100000)
        if not lines:
            break
        for line in lines:
            # ignore blank lines
            if not line.strip():
                continue
            if type_file == 'split':
                # test, dev, train tokenization
                tweetId, startPos, endPos, mention, screenName, tweet, mediaURL = line.strip().split('\t')
            elif type_file == 'kb':
                # timeline tokenization
                x, y, tweet, mediaURL = line.strip().split('\t')
            else:
                sys.exit("set type param from {split,kb}")

            tweet = tknzr.tokenize(str(tweet))
            # if not 6 < len(tweet) < 110:
            #     continue
            if len(tweet) < 6:
                continue
            tweet = preprocess_tweet(' '.join(tweet))
            # out_fs.write(id+'\t'+timestamp+'\t'+username+'\t'+tweet+'\n')
            # out_fs.write(str(tweetId) + '\t' + str(startPos) + '\t' + str(endPos) + '\t' + mention + '\t' + str(screenName) + '\t' + str(tweet) + '\t' + str(mediaURL) + '\n')
            outf.write(str(tweet) + '\n')
Example #17
def clean_tweets(tweet):
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in emoticons and  # remove emoticons
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean
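
# clean_tweets() relies on several module-level names.  A plausible sketch of
# those assumptions (not the original author's definitions):
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()
emoticons = {':)', ':-)', ':(', ':-(', ':D', ';)'}  # illustrative subset
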
def tokenize_tweet(txt, *args, **kwargs):
    tokenizer = Tokenizer()
    if len(args) > 1:
        db = MySQLInterface(*args)
    else:
        db = args[0]

    txt_res = '\n'.join(
        tokenizer.tokenize(txt.replace('\n', ' ').replace('\\n', ' ').lower()))
    wordids = []
    for token in txt_res.split('\n'):
        #add the word to the table of integerized words if it doesn't already exist there
        index = db.query(
            'SELECT ID FROM WORDOCCURRENCES WHERE WORD=%s LIMIT 1', token)
        word_id = None
        if index is not None:
            if len(index) >= 1:
                word_id = int(index[0][0])
        if word_id is None:
            word_id = int(
                db.query('SELECT COUNT(DISTINCT WORD) FROM WORDOCCURRENCES')[0]
                [0])

        db.execute('INSERT INTO WORDOCCURRENCES VALUES (%i,%%s)' % (word_id),
                   token)
        wordids.append((word_id))
    return wordids
Example #19
class SpaceSeparatedWordsMixIn(AbstractLanguage, metaclass=abc.ABCMeta):
    """Language in which words are separated by spaces."""

    def __init__(self):
        super().__init__()
        self.__tokenizer = TweetTokenizer(preserve_case=False)

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Splits a sentence into words using spaces (for Latin languages)."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        # Normalize apostrophe so that "it’s" and "it's" get treated identically
        sentence = sentence.replace("’", "'")

        tokens = self.__tokenizer.tokenize(text=sentence)

        def is_word(token_: str) -> bool:
            """Returns True if token looks like a word."""
            if re.match(pattern=r'\w', string=token_, flags=re.UNICODE):
                return True
            else:
                return False

        # TweetTokenizer leaves punctuation in-place
        tokens = [token for token in tokens if is_word(token)]

        return tokens
Example #20
class Tokeniser(BaseEstimator, TransformerMixin):
    def __init__(self, return_flags=False):
        self.tokeniser = TweetTokenizer()
        self.return_flags = return_flags

    def fit(self, *args, **kwargs):
        return self

    def tokenise(self, sequence):
        flag = ""
        ix = 0
        tokens, positions = [], []
        for t in self.tokeniser.tokenize(sequence):
            ix = sequence.find(t, ix)
            if len(t) == 1 and ord(t) >= 127462:  # this is the code for 🇦
                if not self.return_flags:
                    continue
                if flag:
                    tokens.append(flag + t)
                    positions.append(ix - 1)
                    flag = ""
                else:
                    flag = t
            else:
                tokens.append(t)
                positions.append(ix)
            ix += 1  # advance the search start past the current match
        return tokens, positions

    def transform(self, x, y=None):
        return [self.tokenise(sequence) for sequence in x]
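
# Usage sketch (assumptions): BaseEstimator and TransformerMixin come from
# sklearn.base and TweetTokenizer from nltk.tokenize.  The tokeniser returns
# (tokens, character offsets), which lets downstream code map spans back onto
# the raw tweet.
tok = Tokeniser()
tokens, positions = tok.tokenise("Good morning @everyone!")
print(list(zip(tokens, positions)))  # each token paired with its start offset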
Example #21
    def convertDataToVec(self, data, labels, batchSize=5000):
        if len(data) - self.indexTracking < batchSize:
            batchSize = len(data) - self.indexTracking
            self.batchFlag = True

        clf = Word2Vec.load("w2v.model")
        d = np.array([])
        counts = 0
        for line in data[self.indexTracking:]:
            if counts == batchSize:
                break
            counts += 1
            tmp = np.array([0] * 300)
            tk = TweetTokenizer()
            l = tk.tokenize(self.normalizeSentence(line))
            count = 0
            for w in l:
                count += 1
                try:
                    s = clf.wv.get_vector(w)
                    s = np.array(s)
                    tmp = np.add(tmp, s)
                except:
                    continue

            tmp = tmp / count
            d = np.concatenate((d, tmp))

        l = self.convertLabelToVec(labels, batchSize)
        self.indexTracking += batchSize

        return l, d
Example #22
    def removeHighAndLowFrequencyWords(self, lines, percentage=0.4):
        tk = TweetTokenizer()
        dictionary = OrderedDict()

        # create dictionary
        for line in lines:
            l = tk.tokenize(self.normalizeSentence(line))
            self.lines.append(l)
            for token in l:
                if len(token) > 1 or re.search('\w', token):
                    if dictionary.get(token) is None:
                        dictionary[token] = 1
                    else:
                        dictionary[token] += 1

        # remove high frequency and low frequency words
        dictionary = sorted(dictionary.items(),
                            key=operator.itemgetter(1),
                            reverse=False)

        while dictionary[0][1] < 5:
            del dictionary[0]

        index = math.floor(len(dictionary) * percentage)
        for i in range(index):
            del dictionary[0]
            del dictionary[-1]
        self.dictionary = dictionary
Example #23
class MySentences(object):
    def __init__(self, files):
        self.files = files
        self.tknzr = TweetTokenizer()

    def max_reached(self, language_tags):
        all_max = True
        for lang in max_for_lang.keys():
            for sent in ['positive', 'negative']:
                tag = '{}_{}'.format(lang, sent)
                curr_is_max = language_tags[tag] >= max_for_lang[lang]
                all_max &= curr_is_max
        return all_max

    def __iter__(self):
        language_tags = defaultdict(lambda: 0)
        for (fname) in self.files:
            for line in open(fname, 'rb'):
                if self.max_reached(language_tags):
                    return

                splits = line.split('\t')
                lang_tag = splits[0].strip()
                sent_tag = splits[4].strip()
                tag = '{}_{}'.format(lang_tag, sent_tag)
                if language_tags[tag] < max_for_lang[lang_tag]:
                    language_tags[tag] += 1
                    tweet = line.split('\t')[-1]
                    tweet = preprocess_tweet(tweet)
                    tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                    yield filter(lambda word: ' ' not in word, tweet)
Example #24
def main(model_file, out_tsv_file, out_labels_file, data_file_path, vocab_file_path):
    model = load_keras_model(model_file)

    uid = uuid4().hex
    os.makedirs(uid)

    samples = load_samples(data_file_path)
    train_samples, val_samples = train_val_split(samples)
    val_provider = TripletProvider(val_samples, shuffle=True)

    tokenizer = TweetTokenizer()
    tokenized_samples = [tokenizer.tokenize(sample.text) for sample in train_samples]

    vocabulary = joblib.load(vocab_file_path)
    vocabulary.fit((c for tokens in tokenized_samples for token in tokens for c in token))

    transformer = HierarchicalTripletTransformer(vocabulary)

    max_document_length, max_token_length = get_max_length(tokenized_samples)
    val_generator = TripletBatchGenerator(val_provider, transformer, max_document_length, max_token_length,
                                          len(vocabulary), 1)

    vectors = []
    labels = []
    for sample in val_generator:
        X, y, triplet = sample
        for xi in X:
            prediction = model.predict(xi)
            vectors.append(prediction)
            labels.append(y)

    np.savetxt(out_tsv_file, vectors, delimiter='\t')
Example #25
def pre_process():
    data = []
    emotions = []
    word_dict = {}
    sentence = []

    with open('../data/text_emotion.csv') as csvDataFile:
        csv_reader = csv.reader(csvDataFile)
        for row in csv_reader:
            emotions.append(row[1])
            data.append(row[3])

    tknzr = TweetTokenizer()
    for d in data:
        tokens = tknzr.tokenize(d)
        sentence.append(tokens)

        # print(tokens)

    for s in sentence:
        for i in s:
            if i.lower() in word_dict:
                word_dict[i.lower()] += 1
            else:
                word_dict[i.lower()] = 1

    return [word_dict, sentence, emotions]
Example #26
def getTopics(tweets, count=10):
    stop_words = set(stopwords.words("english"))
    stop_words.update([
        "rt", "anybody", "anyone", "anything", "everybody", "everyone",
        "everything", "nobody", "noone", "nothing", "somebody", "someone",
        "something", "thing", "things"
    ])

    tknzr = TweetTokenizer()

    trimmed_tweets = [[
        word for (word, pos) in pos_tag(tknzr.tokenize(tweet)) if len(word) > 1
        and word.casefold() not in stop_words and pos[0] == 'N'
    ] for tweet in tweets]

    t = trimmed_tweets
    t[:] = [[
        word.lower() if not match(r"\b[A-Z]{2,}\b", word) else word
        for word in wordlist
    ] for wordlist in trimmed_tweets]

    trimmed_tweets_counts = [Counter(wordlist) for wordlist in t]

    topics = Counter()
    for c in trimmed_tweets_counts:
        topics.update(c)

    # Counter dict `topics` can be very important. We can put preferences on twitter handles
    # they are complete nouns as opposed to parts of broken-down noun phrases like "graphic"
    # and "novel" which individually do not give the idea of the original phrase.
    # A large number of handles might mean they are connected to their followers better, interactive, etc.

    return topics.most_common(count)
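
# Usage sketch (assumptions): stopwords and pos_tag come from nltk, Counter
# from collections and match from re, all imported at module level.
sample_tweets = [
    "RT loving the new graphic novel from @artist",
    "that graphic novel deserves every award",
]
print(getTopics(sample_tweets, count=3))  # e.g. [('novel', 2), ...]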
Example #27
def cleanText(x):
    # x = json.loads(x)
    # tmp = x
    # x = x["text"]
    #
    if len(x) != 0:
        # remove literal unicode escape sequences such as "\u2026"
        regex03 = r'\\u[a-zA-Z0-9]{4}'
        k = re.sub(regex03, '', str(x))
        text = re.sub(r"http\S+", "", str(k))
        text = text.decode('utf-8')

        # removes emoticons and other symbols
        try:
            # UCS-4
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # UCS-2
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        text = highpoints.sub('', text)

        tknzr = TweetTokenizer(reduce_len=True)
        a = tknzr.tokenize(text)

        # punctuation remover: keep alphanumeric tokens only
        c = [i for i in a if i.isalnum()]
        c = " ".join(c)
        # c = {"id" : tmp["id"], "text" : c}
        return c
Example #28
def preprocess(docs, sentiments, n):
    """
    Filters <br> tags, URLs and twitter handles
    :param docs: Document list
    :param sentiments: Sentiment list
    :param n: Number of documents
    :return: Processed corpus
    """
    processed_tweets = list()
    processed_sentiments = list()
    tok = TweetTokenizer()


    for i, doc in enumerate(docs):
        if i > n:
            return processed_tweets, processed_sentiments

        if not pd.isna(sentiments[i]):
            #print(doc)
            #print(type(doc))
            #tokens = list(filter(lambda a: not a.startswith('<br' or '@' or 'http'), tok.tokenize(doc))) #tokenize and filter out <br>
            tokens = tok.tokenize(doc)
            tweet_new = ' '.join(tokens)
            processed_tweets.append(tweet_new)
            processed_sentiments.append(str(sentiments[i]))

    return processed_tweets, processed_sentiments
Example #29
def main():
    HOME_DIR = "semeval_parsed"
    np.random.seed(123)
    input_fname = '200M'
    embedding = 'custom'
    type = '200M'
    ndim = 52

    data_dir = HOME_DIR + '_' + input_fname
    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))

    tknr = TweetTokenizer()
    alphabet = cPickle.load(open(fname_vocab))
    words = alphabet.keys()
    tok_words = {}
    words = []
    for word, idx in alphabet.iteritems():
        tok_word = tknr.tokenize(word.decode('utf-8'))
        tok_words[idx] = tok_word
        words.extend(tok_word)

    print len(tok_words)
    print len(words)
    print "Vocab size", len(alphabet)
    fname, delimiter, ndim = (
        'embeddings/updated_embeddings_custom_200M'.format(type, str(ndim)),
        ' ', ndim)

    word2vec = load_glove_vec(fname, words, delimiter, ndim)

    print 'len', len(word2vec)
    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')

    for idx, tok_word in tok_words.iteritems():
        isrand = 1
        word_vec = np.zeros(ndim)
        for tok in tok_word:
            if tok in word2vec.keys():
                word_vec += word2vec[tok]
                isrand = 0

        if isrand:
            word_vec = np.random.uniform(-0.25, 0.25, ndim)
            random_words_count += 1
        vocab_emb[idx] = word_vec.astype(np.float32) / len(tok_word)
    print "Using zero vector as random"
    print 'random_words_count', random_words_count

    svd = TruncatedSVD(n_components=5)
    vocab_emb = svd.fit_transform(vocab_emb).astype(np.float32)
    print vocab_emb.shape
    fname = 'embeddings/smiley_tweets_embedding_{}'.format('topic')
    outfile = os.path.join(data_dir,
                           'emb_{}.npy'.format(os.path.basename(fname)))
    print outfile
    np.save(outfile, vocab_emb)
Example #30
class SpaceSeparatedWordsMixIn(AbstractLanguage, metaclass=abc.ABCMeta):
    """Language in which words are separated by spaces."""
    def __init__(self):
        super().__init__()
        self.__tokenizer = TweetTokenizer(preserve_case=False)

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Splits a sentence into words using spaces (for Latin languages)."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        # Normalize apostrophe so that "it’s" and "it's" get treated identically
        sentence = sentence.replace("’", "'")

        tokens = self.__tokenizer.tokenize(text=sentence)

        def is_word(token_: str) -> bool:
            """Returns True if token looks like a word."""
            if re.match(pattern=r'\w', string=token_, flags=re.UNICODE):
                return True
            else:
                return False

        # TweetTokenizer leaves punctuation in-place
        tokens = [token for token in tokens if is_word(token)]

        return tokens
Example #31
def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    tknzr = TweetTokenizer()
    tagger = PerceptronTagger()

    fout = (
        'embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname))
    fname, delimiter, ndim = (
        'embeddings/smiley_tweets_embedding_{}'.format(input_fname), ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    tagdict = tagger.tagdict
    tagidx = {}
    nRows = len(word2vec)
    nCols = len(tagdict)

    print nRows, ':', nCols

    counter = 0
    for tag in tagdict.keys():
        tagidx[tag] = counter
        counter += 1

    exp_wemb = {}
    for word in word2vec.keys():
        exp_wemb[word] = np.zeros(nCols)

    print tagidx

    train = "semeval/task-B-train-plus-dev.tsv.gz"
    test = "semeval/task-B-test2014-twitter.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    test15 = "semeval/task-B-test2015-twitter.tsv.gz"
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    it = 0
    files = [train, test, dev, test15, smiley_pos]
    for filen in files:
        for tweet in gzip.open(filen, 'rb'):
            tweet = tknzr.tokenize(tweet.decode('utf-8'))
            tags = _pos_tag(tweet, None, tagger)
            for (word, tag) in tags:
                if word in exp_wemb.keys() and tag in tagidx.keys():
                    idx = tagidx[tag]
                    exp_wemb[word][idx] = 1
            if (it % 10) == 0:
                print 'Progress:', it
            it += 1

    f = open(fout, 'wb')
    for word in exp_wemb:
        f.write(word)
        tags = exp_wemb[word]
        for i in np.nditer(tags):
            f.write(' {}'.format(i))
        f.write("\n")
Example #32
def main():
    HOME_DIR = "semeval_parsed"
    np.random.seed(123)
    input_fname = '200M'
    embedding = 'custom'
    type = '200M'
    ndim = 52

    data_dir = HOME_DIR + '_' + input_fname
    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))

    tknr = TweetTokenizer()
    alphabet = cPickle.load(open(fname_vocab))
    words = alphabet.keys()
    tok_words = {}
    words = []
    for word,idx in alphabet.iteritems():
        tok_word = tknr.tokenize(word.decode('utf-8'))
        tok_words[idx] = tok_word
        words.extend(tok_word)

    print len(tok_words)
    print len(words)
    print "Vocab size", len(alphabet)
    fname,delimiter,ndim = ('embeddings/updated_embeddings_custom_200M'.format(type,str(ndim)),' ',ndim)

    word2vec = load_glove_vec(fname,words,delimiter,ndim)

    print 'len',len(word2vec)
    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim),dtype='float32')

    for idx,tok_word in tok_words.iteritems():
        isrand = 1
        word_vec = np.zeros(ndim)
        for tok in tok_word:
            if tok in word2vec.keys():
                word_vec += word2vec[tok]
                isrand = 0

        if isrand:
          word_vec = np.random.uniform(-0.25, 0.25, ndim)
          random_words_count += 1
        vocab_emb[idx] = word_vec.astype(np.float32)/len(tok_word)
    print "Using zero vector as random"
    print 'random_words_count', random_words_count

    svd = TruncatedSVD(n_components=5)
    vocab_emb = svd.fit_transform(vocab_emb).astype(np.float32)
    print vocab_emb.shape
    fname = 'embeddings/smiley_tweets_embedding_{}'.format('topic')
    outfile = os.path.join(data_dir, 'emb_{}.npy'.format(os.path.basename(fname)))
    print outfile
    np.save(outfile, vocab_emb)
Example #33
def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    tknzr = TweetTokenizer()
    tagger = PerceptronTagger()

    fout = ('embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname))
    fname,delimiter,ndim = ('embeddings/smiley_tweets_embedding_{}'.format(input_fname),' ',52)
    word2vec = load_glove_vec(fname,{},delimiter,ndim)

    tagdict = tagger.tagdict
    tagidx = {}
    nRows = len(word2vec)
    nCols = len(tagdict)

    print nRows,':',nCols

    counter = 0
    for tag in tagdict.keys():
        tagidx[tag] = counter
        counter += 1

    exp_wemb = {}
    for word in word2vec.keys():
        exp_wemb[word] = np.zeros(nCols)

    print tagidx

    train = "semeval/task-B-train-plus-dev.tsv.gz"
    test = "semeval/task-B-test2014-twitter.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    test15 = "semeval/task-B-test2015-twitter.tsv.gz"
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    it = 0
    files = [train,test,dev,test15,smiley_pos]
    for filen in files:
        for tweet in gzip.open(filen,'rb'):
            tweet = tknzr.tokenize(tweet.decode('utf-8'))
            tags = _pos_tag(tweet, None, tagger)
            for (word,tag) in tags:
                if word in exp_wemb.keys() and tag in tagidx.keys():
                    idx = tagidx[tag]
                    exp_wemb[word][idx] = 1
            if (it%10) == 0:
                print 'Progress:',it
            it += 1

    f = open(fout,'wb')
    for word in exp_wemb:
        f.write(word)
        tags = exp_wemb[word]
        for i in np.nditer(tags):
            f.write(' {}'.format(i))
        f.write("\n")
class MySentences(object):
    def __init__(self, files):
        self.files = files
        self.tknzr = TweetTokenizer()

    def __iter__(self):
        for fname in self.files:
            for line in gzip.open(fname, 'rb'):
                tweet = preprocess_tweet(line)
                tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                yield filter(lambda word: ' ' not in word, tweet)
Example #35
def load_data(fname):
    tid,topics,tweets,sentiments = [],[],[],[]
    tknzr = TweetTokenizer(reduce_len=True)
    n_not_available = 0
    with open(fname) as f:
        for line in f:
            splits = line.split('\t')
            tweet = splits[3]
            sentiment = convertSentiment(splits[2])
            if tweet != "Not Available\n":
                tid.append(splits[0])
                topic = pts.preprocess_tweet(splits[1])
                topic_tok = tknzr.tokenize(topic.decode('utf-8'))
                topics.append(splits[1])

                tweet = pts.preprocess_tweet(tweet)
                tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
                tweets.append(tweet_tok)
                sentiments.append(int(sentiment))
            else:
                n_not_available += 1

    print "Number of not availalbe tweets:", n_not_available
    return tid,topics,tweets,sentiments
Example #36
def load_data(fname,pos):
    tid,tweets,sentiments = [],[],[]
    tknzr = TweetTokenizer(reduce_len=True)
    n_not_available = 0
    with open(fname) as f:
        for line in f:
            splits = line.split('\t')
            tweet = splits[pos + 1]
            sentiment = convertSentiment(splits[pos])

            tid.append(splits[0])
            tweet = pts.preprocess_tweet(tweet)
            tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
            tweets.append(tweet_tok)
            sentiments.append(int(sentiment))

    return tid,tweets,sentiments
Example #37
def tokenize_tweets(filename, dest_folder):
    basename = os.path.basename(filename)
    dest = os.path.join(dest_folder, basename + '.tok')
    print("processing %s" % basename)
    tknzr = TweetTokenizer()
    with codecs.open(dest, 'w', "utf-8") as out_fs:
        with open(filename, 'r', encoding="utf-8") as in_fs:
            for line in in_fs:
                try:
                    language, id, timestamp, username, tweet = line.strip().split('\t')
                except:
                    print("could not parse line.")
                    continue
                if language != 'en':
                    continue
                tweet = tknzr.tokenize(tweet)
                if not 6 < len(tweet) < 110:
                    continue
                tweet = preprocess_tweet(' '.join(tweet))
                out_fs.write(id+'\t'+timestamp+'\t'+username+'\t'+tweet+'\n')
Example #38
from collections import defaultdict

tokenizer = TweetTokenizer()
csvfile = open('trainingandtestdata/testdata.manual.2009.06.14.csv', 'rb')
reader = csv.reader(csvfile, delimiter=',')
rownum = 0
sentiments = []
tokens = [[]]
for row in reader:
    colnum = 0
    for col in row:
        if colnum == 0:
            sentiments.insert(rownum,int(col))
        if colnum == 5:
            raw = col #.read().decode('utf8')
            tokens.insert(rownum,tokenizer.tokenize(raw))
##            print("tokens contents:", end='')
##            for word in tokens[rownum]:
##                print(word, end = " ")
##            print()
        colnum += 1
    rownum += 1
csvfile.close()

#Divide into training and test data - randomly allocate 4/5 to training and 1/5 to test
position = []
posPosts = []
negPosts = []
neuPosts = []
for posts in range(0, len(sentiments)):
    # randint is inclusive on both ends, so 0..4 gives the one-in-five split described above
    position.insert(posts, random.randint(0, 4))