Example #1
def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    tknzr = TweetTokenizer()
    tagger = PerceptronTagger()

    fout = ('embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname))
    fname,delimiter,ndim = ('embeddings/smiley_tweets_embedding_{}'.format(input_fname),' ',52)
    word2vec = load_glove_vec(fname,{},delimiter,ndim)

    tagdict = tagger.tagdict
    tagidx = {}
    nRows = len(word2vec)
    nCols = len(tagdict)

    print nRows,':',nCols

    counter = 0
    for tag in tagdict.keys():
        tagidx[tag] = counter
        counter += 1

    exp_wemb = {}
    for word in word2vec.keys():
        exp_wemb[word] = np.zeros(nCols)

    print tagidx

    train = "semeval/task-B-train-plus-dev.tsv.gz"
    test = "semeval/task-B-test2014-twitter.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    test15 = "semeval/task-B-test2015-twitter.tsv.gz"
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    it = 0
    files = [train,test,dev,test15,smiley_pos]
    for filen in files:
        for tweet in gzip.open(filen,'rb'):
            tweet = tknzr.tokenize(tweet.decode('utf-8'))
            tags = _pos_tag(tweet, None, tagger)
            for (word,tag) in tags:
                if word in exp_wemb.keys() and tag in tagidx.keys():
                    idx = tagidx[tag]
                    exp_wemb[word][idx] = 1
            if (it%10) == 0:
                print 'Progress:',it
            it += 1

    f = open(fout,'wb')
    for word in exp_wemb:
        f.write(word)
        tags = exp_wemb[word]
        for i in np.nditer(tags):
            f.write(' {}'.format(i))
        f.write("\n")
    f.close()
Example #2
def main():
    HOME_DIR = "semeval_parsed"
    np.random.seed(123)
    input_fname = '200M'
    embedding = 'custom'
    type = '200M'
    ndim = 52

    data_dir = HOME_DIR + '_' + input_fname
    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))

    tknr = TweetTokenizer()
    alphabet = cPickle.load(open(fname_vocab))
    tok_words = {}
    words = []
    for word,idx in alphabet.iteritems():
        tok_word = tknr.tokenize(word.decode('utf-8'))
        tok_words[idx] = tok_word
        words.extend(tok_word)

    print len(tok_words)
    print len(words)
    print "Vocab size", len(alphabet)
    fname,delimiter,ndim = ('embeddings/updated_embeddings_custom_200M',' ',ndim)

    word2vec = load_glove_vec(fname,words,delimiter,ndim)

    print 'len',len(word2vec)
    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim),dtype='float32')

    for idx,tok_word in tok_words.iteritems():
        isrand = 1
        word_vec = np.zeros(ndim)
        for tok in tok_word:
            if tok in word2vec.keys():
                word_vec += word2vec[tok]
                isrand = 0

        if isrand:
          word_vec = np.random.uniform(-0.25, 0.25, ndim)
          random_words_count += 1
        vocab_emb[idx] = word_vec.astype(np.float32)/len(tok_word)
    print "Using zero vector as random"
    print 'random_words_count', random_words_count

    svd = TruncatedSVD(n_components=5)
    vocab_emb = svd.fit_transform(vocab_emb).astype(np.float32)
    print vocab_emb.shape
    fname = 'embeddings/smiley_tweets_embedding_{}'.format('topic')
    outfile = os.path.join(data_dir, 'emb_{}.npy'.format(os.path.basename(fname)))
    print outfile
    np.save(outfile, vocab_emb)
Example #3
def cosineSimilarity(text_a, text_b):
    # Tokenize sentences
    tknzr = TweetTokenizer()
    word_list_a = tknzr.tokenize(text_a)
    word_list_b = tknzr.tokenize(text_b)

    keys = list(set(word_list_a + word_list_b))
    vector_size = len(keys)
    vector_a = [0] * vector_size
    vector_b = [0] * vector_size

    for i in range(vector_size):
        vector_a[i] = word_list_a.count(keys[i])
        vector_b[i] = word_list_b.count(keys[i])

    return dot(vector_a, vector_b) / (norm(vector_a) * norm(vector_b))
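A quick usage check for the cosineSimilarity helper above, assuming dot and norm come from NumPy (numpy.dot and numpy.linalg.norm), which the snippet relies on but does not show importing:

from numpy import dot
from numpy.linalg import norm
from nltk.tokenize import TweetTokenizer

# Two of the three tokens overlap, so the similarity is 2 / (sqrt(3) * sqrt(3)) = 2/3.
print(cosineSimilarity("I love NLP", "I love tweets"))  # ~0.67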
Example #4
def main():
    # x, y = load_dataset("datasets/sentiment_uci/yelp_labelled.txt")
    x, y = load_datasets(["../datasets/sentiment_uci/yelp_labelled.txt"])

    stopwords = set()
    with open('../stopwords.txt', 'r') as f:
        for w in f:
            stopwords.add(w.strip())

    tok = TweetTokenizer()
    stemmer = EnglishStemmer()
    vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True, binary=True, preprocessor=stemmer.stem,
                                 tokenizer=tok.tokenize, ngram_range=(1, 2))

    accu_p = np.zeros(shape=(2,))
    accu_r = np.zeros(shape=(2,))
    accu_f = np.zeros(shape=(2,))
    accu_a = 0.0
    folds = 10
    for train_idx, test_idx in StratifiedKFold(y=y, n_folds=folds, shuffle=True):
        train_x, train_y = x[train_idx], y[train_idx]
        test_x, test_y = x[test_idx], y[test_idx]

        cls = tree.DecisionTreeClassifier()

        # train
        train_x = vectorizer.fit_transform(train_x).toarray()

        cls.fit(train_x, train_y)

        # test
        test_x = vectorizer.transform(test_x).toarray()

        pred_y = cls.predict(test_x)

        # evaluate
        p, r, f, _ = precision_recall_fscore_support(test_y, pred_y)
        a = accuracy_score(test_y, pred_y)
        accu_p += p
        accu_r += r
        accu_f += f
        accu_a += a

        print("Evaluating classifier:")
        print("\tAccuracy: {}".format(a))
        print("\tPrecision[0]: {}".format(p[0]))
        print("\tPrecision[1]: {}".format(p[1]))
        print("\tRecall[0]: {}".format(r[0]))
        print("\tRecall[1]: {}".format(r[1]))
        print("\tF1-score[0]: {}".format(f[0]))
        print("\tF1-score[1]: {}".format(f[1]))

    print("Average evaluation")
    print("\tAccuracy: {}".format(accu_a / folds))
    print("\tPrecision[0]: {}".format(accu_p[0] / folds))
    print("\tPrecision[1]: {}".format(accu_p[1] / folds))
    print("\tRecall[0]: {}".format(accu_r[0] / folds))
    print("\tRecall[1]: {}".format(accu_r[1] / folds))
    print("\tF1-score[0]: {}".format(accu_f[0] / folds))
    print("\tF1-score[1]: {}".format(accu_f[1] / folds))
Example #5
def tokenize_with(kwargs):
    tokenizer = TweetTokenizer(**kwargs)

    def tweet_tokenizer(data):
        return [' '.join(tokenizer.tokenize(tweet)) for tweet in data]

    return tweet_tokenizer
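A small usage sketch for tokenize_with (assuming TweetTokenizer is imported as in the snippet): the returned closure tokenizes each tweet and re-joins the tokens with single spaces.

tweet_tokenizer = tokenize_with({'preserve_case': False, 'reduce_len': True})
print(tweet_tokenizer(["Sooooo happy!!! @NLTK_org"]))
# -> ['sooo happy ! ! ! @nltk_org']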
Example #6
class NltkTweetTokenizer(Tokenizer):
    def __init__(self) -> None:
        super().__init__()
        self._base_tokenizer = TweetTokenizer()

    def tokenize_text(self, text: str) -> List[str]:
        return self._base_tokenizer.tokenize(text)
Example #7
class SpaceSeparatedWordsMixIn(AbstractLanguage, metaclass=abc.ABCMeta):
    """Language in which words are separated by spaces."""
    def __init__(self):
        super().__init__()
        self.__tokenizer = TweetTokenizer(preserve_case=False)

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Splits a sentence into words using spaces (for Latin languages)."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        # Normalize apostrophe so that "it’s" and "it's" get treated identically
        sentence = sentence.replace("’", "'")

        tokens = self.__tokenizer.tokenize(text=sentence)

        def is_word(token_: str) -> bool:
            """Returns True if token looks like a word."""
            if re.match(pattern=r'\w', string=token_, flags=re.UNICODE):
                return True
            else:
                return False

        # TweetTokenizer leaves punctuation in-place
        tokens = [token for token in tokens if is_word(token)]

        return tokens
Example #8
class Tokeniser(BaseEstimator, TransformerMixin):
    def __init__(self, return_flags=False):
        self.tokeniser = TweetTokenizer()
        self.return_flags = return_flags

    def fit(self, *args, **kwargs):
        return self

    def tokenise(self, sequence):
        flag = ""
        ix = 0
        tokens, positions = [], []
        for t in self.tokeniser.tokenize(sequence):
            ix = sequence.find(t, ix)
            if len(t) == 1 and ord(t) >= 127462:  # this is the code for 🇦
                if not self.return_flags:
                    continue
                if flag:
                    tokens.append(flag + t)
                    positions.append(ix - 1)
                    flag = ""
                else:
                    flag = t
            else:
                tokens.append(t)
                positions.append(ix)
            ix += 1
        return tokens, positions

    def transform(self, x, y=None):
        return [self.tokenise(sequence) for sequence in x]
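A usage sketch for the Tokeniser transformer above (assuming scikit-learn is installed for the base classes): each input string becomes a (tokens, character offsets) pair.

tok = Tokeniser()
print(tok.transform(["hello world :)"]))
# -> [(['hello', 'world', ':)'], [0, 6, 12])]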
Example #9
class MySentences(object):
    def __init__(self, files):
        self.files = files
        self.tknzr = TweetTokenizer()

    def max_reached(self, language_tags):
        all_max = True
        for lang in max_for_lang.keys():
            for sent in ['positive', 'negative']:
                tag = '{}_{}'.format(lang, sent)
                curr_is_max = language_tags[tag] >= max_for_lang[lang]
                all_max &= curr_is_max
        return all_max

    def __iter__(self):
        language_tags = defaultdict(lambda: 0)
        for (fname) in self.files:
            for line in open(fname, 'rb'):
                if self.max_reached(language_tags):
                    return

                splits = line.split('\t')
                lang_tag = splits[0].strip()
                sent_tag = splits[4].strip()
                tag = '{}_{}'.format(lang_tag, sent_tag)
                if language_tags[tag] < max_for_lang[lang_tag]:
                    language_tags[tag] += 1
                    tweet = line.split('\t')[-1]
                    tweet = preprocess_tweet(tweet)
                    tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                    yield filter(lambda word: ' ' not in word, tweet)
Example #10
class SpaceSeparatedWordsMixIn(AbstractLanguage, metaclass=abc.ABCMeta):
    """Language in which words are separated by spaces."""

    def __init__(self):
        super().__init__()
        self.__tokenizer = TweetTokenizer(preserve_case=False)

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Splits a sentence into words using spaces (for Latin languages)."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        # Normalize apostrophe so that "it’s" and "it's" get treated identically
        sentence = sentence.replace("’", "'")

        tokens = self.__tokenizer.tokenize(text=sentence)

        def is_word(token_: str) -> bool:
            """Returns True if token looks like a word."""
            if re.match(pattern=r'\w', string=token_, flags=re.UNICODE):
                return True
            else:
                return False

        # TweetTokenizer leaves punctuation in-place
        tokens = [token for token in tokens if is_word(token)]

        return tokens
Example #11
    def boolenModel(self, freq, onlyfiles):

        self.comboBox_4.clear()
        self.comboBox_4.addItem(' ')
        requete = self.plainTextEdit_2.toPlainText()
        requete = requete.lower()

        req = TweetTokenizer().tokenize(requete)

        for file in onlyfiles:

            reqtemp = []
            for mot in req:
                mot = mot.lower()
                if (mot in ['and', 'or', '(', ')', 'not']):
                    reqtemp.append(mot)
                    reqtemp.append(' ')
                else:
                    listfile = self.indexmotSimple(mot)
                    if (file in listfile):
                        reqtemp.append('1')
                        reqtemp.append(' ')
                    else:
                        reqtemp.append('0')
                        reqtemp.append(' ')
            evaluation = eval(''.join(reqtemp))
            if (evaluation == 1):

                self.comboBox_4.addItem(file)
Example #12
class PartsOfSpeechExtractor(BaseEstimator, TransformerMixin):

    IGNORE_TAGS = ['PUNCT', 'CCONJ']
    _vectorizer = None
    _tokenizer = TweetTokenizer(reduce_len=True)
    _pos_helper = PartsOfSpeechHelper()

    def __init__(self):
        pass

    def transform(self, data, y=None):
        result = []

        for tweet in data:
            result.append(self.pos_tag(tweet))

        if self._vectorizer == None:
            self._vectorizer = DictVectorizer(sparse=False)
            self._vectorizer.fit(result)

        return self._vectorizer.transform(result)

    def pos_tag(self, tweet):
        tokens = self._tokenizer.tokenize(tweet)
        pos_tweet = self._pos_helper.pos_tag(tokens)
        return Counter([t for w, t in pos_tweet if t not in self.IGNORE_TAGS])

    def fit(self, df, y=None):
        return self
Example #13
def run(dataset, hyperparameters, metrics, fname=None):
    # # Load Resources
    word2vec = None
    if hyperparameters['model'] != 'rand':
        word2vec = load_word2vec()
    # # Load Dataset
    df = load_dataset(dataset[0], **dataset[1])
    # # Preprocess
    df['clean_tweets'] = df.tweet.apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)
    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df.tokens, df.label)
    # # Train
    clf = NeuralNetClassifier(module=TextCNN,
                              corpus=df.tokens,
                              word_vectors=word2vec,
                              metrics=metrics,
                              **hyperparameters)
    clf.fit(X_train, y_train, validation_data=(X_dev, y_dev))
    # # Predict
    y_pred = clf.predict(X_test)
    # # Evaluate
    pprint(
        dict(dataset=dataset,
             hyperparameters=hyperparameters,
             scores={
                 scorer: get_score_func(scorer)(y_test, y_pred)
                 for scorer in metrics
             }))
    # # Save to file
    X_test['pred'] = y_pred
    X_test.to_excel(scratch_path('predictions_%s.xlsx' % fname))
Example #14
    def word2vec_feature_from_tweets(self,
                                     glove_input_file,
                                     embedd_dim,
                                     name='default'):
        # --- loaded saved features if it's exist ? ---
        features_path = 'saved_objects/features/train/embedding_features-' + name + '.pkl'
        if (os.path.exists(features_path)):
            file = open(features_path, 'rb')
            return pickle.load(file)

        # --- otherwise generate embedding features ---
        word2vec = KeyedVectors.load_word2vec_format(glove_input_file,
                                                     unicode_errors='ignore',
                                                     binary=False)

        # get tfidf from each word required in embedding features
        _, _, tfidf_scores = self.tfidf_from_tweets()
        tfidf = dict(zip(tfidf_scores.get_feature_names(), tfidf_scores.idf_))

        # ---weighted-average tweet2vec. ---
        def build_average_Word2vec(tokens, size):
            vec = np.zeros(size)
            count = 0.
            for word in tokens:
                try:
                    vec += word2vec[word] * tfidf[word]
                    count += 1.
                except KeyError:
                    continue
            if count != 0:
                vec /= count
            return vec

        tokenizer = TweetTokenizer()
        embedd_table = {}
        for _, row in self.norm_df.iterrows():  # self.norm_test_df.iterrows()
            tweet2vec = build_average_Word2vec(tokenizer.tokenize(
                row['norm_tweets']),
                                               size=embedd_dim)
            embedd_table[row['tweet_id']] = tweet2vec

        # ----- saving embedding features to disk --------
        file = open(features_path, 'wb')
        pickle.dump(embedd_table, file)
        file.close()

        return embedd_table
Example #15
	def preprocess_tweet(self, tweet):
		"""Pre-process a tweet and/or profile description.
		The following pre-processing operations are done on the text:
		- Replace emojis like: "Python is :thumbs_up:"
		- Replace repeated character sequences of length 3 or greater with sequences of length 3
		- Lowercase
		- Replace all URLs and username mentions with the following tags:
			URL		    <URLURL>
			@Username   <UsernameMention>
		Args:
			tweet: String
		Returns:
			The pre-processed tweet as String
		IMPROVEMENTS TO MAKE:
		- Instead of tokenizing and detokenizing, which is messy, the strings should be directly replaced using regex.
		"""

		replaced_urls = []  # Create an empty list
		replaced_mentions = []  # Create an empty list

		# Replace emojis
		tweet = emoji.demojize(tweet)

		# Tokenize using NLTK
		tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
		tokens = tokenizer.tokenize(tweet)

		# Iterate over tokens
		for index, token in enumerate(tokens):
			# Replace URLs
			if token[0:4] == "http":
				replaced_urls.append(token)
				tokens[index] = "<URLURL>"
				# ↳ *tokens[index]* will directly modify *tokens*, whereas any changes to *token* will be lost.

			# Replace mentions (Twitter handles; usernames)
			elif token[0] == "@" and len(token) > 1:
				# ↳ Skip the single '@' tokens
				replaced_mentions.append(token)
				tokens[index] = "<UsernameMention>"

		# Detokenize using NLTK's Treebank Word Detokenizer
		detokenizer = TreebankWordDetokenizer()
		processed_tweet = detokenizer.detokenize(tokens)

		# *replaced_urls* and *replaced_mentions* will contain all of the replaced URLs and Mentions of the input string.
		return processed_tweet
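The method above does not touch any instance state, so the same pipeline can be sketched as a standalone function for quick experimentation. This is a hedged rewrite, not the original project's code; it assumes the emoji package and NLTK are installed.

import emoji
from nltk.tokenize import TweetTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

def preprocess_tweet_demo(tweet):
    # demojize, tokenize (lowercased, elongations reduced), then swap URLs/mentions for tags
    tokens = TweetTokenizer(preserve_case=False, reduce_len=True).tokenize(emoji.demojize(tweet))
    tokens = ["<URLURL>" if t[0:4] == "http"
              else "<UsernameMention>" if t[0] == "@" and len(t) > 1
              else t
              for t in tokens]
    return TreebankWordDetokenizer().detokenize(tokens)

print(preprocess_tweet_demo("Sooooo happy, thanks @alice! https://t.co/abc"))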
Example #16
def main():
    x, y = load_datasets(["../datasets/sentiment_uci/yelp_labelled.txt"])
    stopwords = set()
    with open('../stopwords.txt', 'r') as f:
        for w in f:
            stopwords.add(w.strip())

    tok = TweetTokenizer()

    x = [remove_stopwords(tok.tokenize(s.lower()), stopwords) for s in x]
    x = np.array(x)

    accumulate = dict()
    folds = 10
    for train_idx, test_idx in StratifiedKFold(y=y, n_folds=folds, shuffle=True):
        train_x, train_y = x[train_idx], y[train_idx]
        test_x, test_y = x[test_idx], y[test_idx]

        train_docs = [(sent, label) for sent, label in zip(train_x, train_y)]
        test_docs = [(sent, label) for sent, label in zip(test_x, test_y)]

        cls = SentimentAnalyzer()

        # train
        words_with_neg = cls.all_words([mark_negation(a) for a in train_x])
        unigram_feats = cls.unigram_word_feats(words_with_neg)
        bigram_feats = cls.bigram_collocation_feats(train_x)

        cls.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats, handle_negation=True)
        cls.add_feat_extractor(extract_bigram_feats, bigrams=bigram_feats)

        training_set = cls.apply_features(train_docs, labeled=True)

        cls.train(PositiveNaiveBayesClassifier.train, training_set)

        # test & evaluate
        test_set = cls.apply_features(test_docs)

        for key, value in sorted(cls.evaluate(test_set).items()):
            print('\t{0}: {1}'.format(key, value))
            accumulate.setdefault(key, 0.0)
            accumulate[key] += value

    print("Averages")
    for key, value in sorted(accumulate.items()):
        print('\tAverage {0}: {1}'.format(key, value/folds))
Example #17
def remove_common_words(data, proportion):
    """Removes the top words of a sample by a give proportion.

    Parameters
    ----------
    data: np.array
        Corpus of text where each phrase is a separate array.

    proportion: float
        The proportion of words that you would like removed.

    Returns
    -------
    top_words_removed: np.array
        Returns the corpus back with the top words removed.
    """
    tokenizer = TweetTokenizer()

    # tokenize the data
    tokenized_data = []
    for s in data:
        try:
            tokenized_data.append(tokenizer.tokenize(s))
        except TypeError:
            pass

    # flatten and remove punctuation
    tokens = [word.lower() for phrase in tokenized_data for word in phrase]
    tokens = [word for word in tokens if word not in set(string.punctuation)]

    # count token occurrences
    token_counts = Counter(tokens)

    # find the number for removal
    n_top = round(len(token_counts.keys()) * proportion)

    top_tokens = [t[0] for t in token_counts.most_common(n_top)]

    top_words_removed = []
    for phrase in tokenized_data:
        top_words_removed.append(" ".join(
            [word for word in phrase if word.lower() not in top_tokens]))

    top_words_removed = np.array(top_words_removed)

    return top_words_removed
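A small usage sketch for remove_common_words, assuming numpy is imported as np as in the function above:

corpus = np.array([
    "the cat sat on the mat",
    "the dog sat on the log",
])
# With proportion=0.25, the two most frequent tokens are dropped
# (here "the" and, by tie-breaking on first appearance, "sat").
print(remove_common_words(corpus, 0.25))
# -> roughly ['cat on mat' 'dog on log']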
Example #18
 def __init__(self, reduce_len=True, preserve_case=False, stopwords=[]):
     """Initialize a Preprocessor object.
     arguments:
        reduce_len: Whether repeated occurrences of letters in words should
                    be shortened to at most 3 letters, e.g. hellooooooo -> hellooo
        preserve_case: Whether the case of words should be preserved.
        stopwords: List of words that should be filtered out of the tokenized
                   tweets.
     """
     self.tokenizer = TweetTokenizer(reduce_len=reduce_len,
                                     preserve_case=preserve_case)
     self.stopwords = stopwords
     self.url_token = '<url>'
     self.user_token = '<user>'
     self.email_token = '<email>'
     self.tag_token = '<tag>'
     self.number_token = '<number>'
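For reference, this is what the two tokenizer flags used above do on their own, shown directly on NLTK's TweetTokenizer (independent of the surrounding Preprocessor class):

from nltk.tokenize import TweetTokenizer

tok = TweetTokenizer(reduce_len=True, preserve_case=False)
print(tok.tokenize("HELLOOOOOOO Twitter!!!"))
# -> ['hellooo', 'twitter', '!', '!', '!']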
Example #19
    def tokenize_2(self, text, ngrams_sizes=(3, 2), remove_stopwords=True):
        tknzr = TweetTokenizer()
        text = text.lower()
        if ngrams_sizes:
            for i in ngrams_sizes:
                # join ngrams with '_'
                tokens = tknzr.tokenize(text)
                ngs = ngrams(tokens, i)
                for ng in ngs:
                    phrs = "_".join(ng)
                    if phrs in self.dictionary:
                        text = text.replace(" ".join(ng), phrs)

        tokens = tknzr.tokenize(text)
        if remove_stopwords:
            tokens = [t for t in tokens if t not in self.stopwords]
        return tokens
Example #20
def tokenize_tweets(filename, dest_folder):
    basename = os.path.basename(filename)
    dest = os.path.join(dest_folder, basename + '.tok')
    print("processing %s" % basename)
    tknzr = TweetTokenizer()
    with codecs.open(dest, 'w', "utf-8") as out_fs:
        with open(filename, 'r', encoding="utf-8") as in_fs:
            for line in in_fs:
                # tokenize, preprocess, and write out each input line
                tweet = tknzr.tokenize(line.strip())
                tweet = preprocess_tweet(' '.join(tweet))
                out_fs.write(tweet + '\n')
Example #21
def load_data(fname,pos):
    tid,tweets,sentiments = [],[],[]
    tknzr = TweetTokenizer(reduce_len=True)
    n_not_available = 0
    with open(fname) as f:
        for line in f:
            splits = line.split('\t')
            tweet = splits[pos + 1]
            sentiment = convertSentiment(splits[pos])

            tid.append(splits[0])
            tweet = pts.preprocess_tweet(tweet)
            tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
            tweets.append(tweet_tok)
            sentiments.append(int(sentiment))

    return tid,tweets,sentiments
Example #22
def cargar_twitts():
    direc = "data_set/"
    files = os.listdir(direc)
    archivos = [direc + twitt for twitt in files]
    twitts = []
    for a in archivos:
        fp = open(a, "r")
        lineas = fp.readlines()[1:]
        for x in lineas:
            palabras = ''.join(
                [c for c in x.split('	')[1] if c not in non_words])
            tt = TweetTokenizer()
            twitt = tt.tokenize(palabras)
            twitts.append([twitt, x.split('	')[2]])
        fp.close()

    return twitts
Example #23
 def analyze(self, text):
     s = 0
     li = TweetTokenizer().tokenize(text)
     for w in li:
         w = w.lower()
         if (w in d):
             s = s + d[w]
     return s
Example #24
    def lineNormalization(self, line):
        tknzr = TweetTokenizer()
        norm = w2vAndGramsConverter()

        line = re.sub(r"#\s+", "", line)
        tmp = line.split(" ")
        st = ""
        for t in tmp:
            #reStart = time.time()
            links = re.findall(
                r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
                t)
            #print("reg time spend: " +  str(time.time() - reStart))
            if links.__len__() > 0:
                #etStart = time.time()
                dom = tldextract.extract(links[0]).domain
                #print("first ext time spend: " + str(time.time() - etStart))
                if dom == 'bit':
                    if self.linkHash.get(links[0]) is None:
                        try:
                            #exreStart = time.time()
                            t = tldextract.extract(
                                self.sess.head(
                                    links[0]).headers['location']).domain

                            #print("request and extract time spend:" + str(time.time() - exreStart))
                        except:
                            t = "invaildURL"

                        self.linkHash[links[0]] = t

                    else:
                        t = self.linkHash.get(links[0])
                else:
                    t = dom

            st += t + " "

        line = st

        line = norm.normalizeSentence(line)
        line = self.removePuncu(line)
        line = line.lower()
        tokens = tknzr.tokenize(line)
        return tokens
Example #25
def load_twetts():
    direc = "data_set_eng/"
    files = os.listdir(direc)
    files = [direc + twitt for twitt in files]
    twetts = []
    for a in files:
        fp = open(a, "r")
        lines = fp.readlines()[1:]
        for x in lines:
            words = ''.join(
                [c for c in x.split('\t')[1] if c not in non_words])
            words = clean_text(words)
            tt = TweetTokenizer()
            twitt = tt.tokenize(words)
            twetts.append([twitt, x.split('\t')[2]])
        fp.close()

    return twetts
Example #26
def tokenize_alexa(filename, dest_folder):
    basename = os.path.basename(filename)
    dest = os.path.join(dest_folder, basename + '.tok')
    print("processing %s" % basename)
    tknzr = TweetTokenizer()
    with codecs.open(dest, 'w', "utf-8") as out_fs:
        with open(filename, 'r', encoding="utf-8") as in_fs:
            for line in in_fs:
                try:
                    service_desc = line  #.strip().split('\t')
                except:
                    print("could not parse line.")
                    continue
                tweet = tknzr.tokenize(service_desc)
                if not 6 < len(tweet) < 110:
                    continue
                tweet = preprocess_tweet(' '.join(tweet))
                out_fs.write(tweet + '\t' + '\n')
Example #27
def featureVecotrize(batch):
    # global debug_procces_words
    tknzr = TweetTokenizer()
    vect = [0] * (feature_len - 6)
    tokenized_batch = tknzr.tokenize(batch)
    sent_text = nltk.sent_tokenize(batch)
    # ordered_dict_listed = tuple(OrderedDict(analyze_text.unigrams_dict).keys())
    ordered_dict_listed = tuple(OrderedDict(top_500_in_dict).keys())
    len_sum = 0
    num_of_words = 0
    shell_nouns_count = 0
    references_count = 0
    function_words_count = 0
    for word in tokenized_batch:
        if word in ordered_dict_listed:
            vect[ordered_dict_listed.index(word)] += 1 / len(all_ages_batched[0])
        if word in SHELL_NOUNS:
            shell_nouns_count += 1
        if word in REFERENCES:
            references_count += 1
        if word in FUNCTION_WORDS:
            function_words_count += 1
        if len(word) < 2 or (len(word) == 1 and "." in word or "," in word):
            continue
        else:
            num_of_words +=1
            len_sum += len(word)


    avg_sent = sum([len(sent.replace(","," ").split()) for sent in sent_text])/ len(sent_text)
    batch_wo_punc = batch.replace(",","")
    unique_words = len(set((batch_wo_punc.split())))

    avg_word = len_sum/num_of_words

    vect.append(avg_sent)
    vect.append(avg_word)
    vect.append(shell_nouns_count/len(all_ages_batched[0]))
    vect.append(references_count/len(all_ages_batched[0]))
    vect.append(function_words_count/len(all_ages_batched[0]))
    vect.append(unique_words/len(all_ages_batched[0]))
    # print(avg_sent,avg_word,shell_nouns_count,references_count,function_words_count,unique_words,file=debug_file)

    return vect
Example #28
    def tweetLength(self, line):
        # normalize the line
        w2vLib = w2vAndGramsConverter()
        line = w2vLib.normalizeSentence(line)

        # tokenize sentence
        tnz = TweetTokenizer()
        tokens = tnz.tokenize(line)

        if tokens.__len__() <= 10:
            return 1
        elif tokens.__len__() <= 20:
            return 2
        elif tokens.__len__() <= 30:
            return 3
        elif tokens.__len__() <= 40:
            return 4
        else:
            return 5
Example #29
def get_sentence_embeddings(sentences, phrase = True, ngram='bigrams', model='concat_wiki_twitter'):
    """ Returns a numpy matrix of embeddings for one of the published models. It
    handles tokenization and can be given raw sentences.
    Arguments:
        - ngram: 'unigrams' or 'bigrams'
        - model: 'wiki', 'twitter', or 'concat_wiki_twitter'
        - sentences: a list of raw sentences ['Once upon a time', 'This is another sentence.', ...]
    """
    wiki_embeddings = None
    twitter_embbedings = None
    tokenized_sentences_NLTK_tweets = None
    tokenized_sentences_SNLP = None
    if model == "wiki" or model == 'concat_wiki_twitter':
        tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
        #print("sentences",sentences) 
        #print(tknzr)
        s = ' <delimiter> '.join(sentences) #just a trick to make things faster
        #print("S",s)
        tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
        assert(len(tokenized_sentences_SNLP) == len(sentences))

        if phrase:
            temp = extract_keyphrase_candidates(str(tokenized_sentences_SNLP))
            #print("Temp = ", temp)
            temp1 = list([' '.join(x) for x in temp if type(x) == list])
            temp2 = list([x for x in temp if type(x) == str])
            tokenized_sentences_SNLP = [*temp1, *temp2]
            
        #print("Senetences = ", sentences, "\n")
        if ngram == 'unigrams':
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                     MODEL_WIKI_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                     MODEL_WIKI_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "twitter" or model == 'concat_wiki_twitter':
        tknzr = TweetTokenizer()
        tokenized_sentences_NLTK_tweets = tokenize_sentences(tknzr, sentences)
        if ngram == 'unigrams':
            twitter_embbedings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_NLTK_tweets, \
                                     MODEL_TWITTER_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            twitter_embbedings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_NLTK_tweets, \
                                     MODEL_TWITTER_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "twitter":
        return twitter_embbedings
    elif model == "wiki":
        if phrase:
            return tokenized_sentences_SNLP, wiki_embeddings
        else:
            return wiki_embeddings
    elif model == "concat_wiki_twitter":
        return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)
    sys.exit(-1)
Example #30
def clean_text(phrase):
    # remove hyperlinks
    phrase = re.sub(r'https?:\/\/.*[\r\n]*', '', phrase)

    # remove hashtags
    # only removing the hash # sign from the word
    phrase = re.sub('#[\w\.\-]+', '', phrase)
    phrase = re.sub('@[\w\.\-]+', '', phrase)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    phrase_tokens = tokenizer.tokenize(phrase)
    phrase_clean = []
    for word in phrase_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation and
                word not in emoticons):
            stem_word = stemmer.stem(word)  # stemming word
            phrase_clean.append(stem_word)
    return phrase_clean
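A hedged usage sketch for clean_text. The names stopwords_english, stemmer and emoticons are module-level values the snippet expects but does not define here, so plausible stand-ins are used; they may differ from the original project's.

import re, string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

stopwords_english = set(stopwords.words('english'))  # assumed definition
stemmer = PorterStemmer()                            # assumed; the project may use another stemmer
emoticons = {':)', ':(', ':-)', ':-(', ';)', ':D'}   # assumed small emoticon set

print(clean_text("Loving the new #NLTK release! https://www.nltk.org :)"))
# -> something like ['love', 'new', 'releas'] (exact stems depend on the stemmer)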
Example #31
class PartsOfSpeechPatternExtractor(BaseEstimator, TransformerMixin):

    POS_PATTERNS = [('NOUN','ADJ'), ('NOUN','NOUN'), ('ADJ','NOUN'), ('VERB','NOUN'), ('AUX','NOUN'),
                    ('NOUN','PRON','NOUN'), ('VERB','PRON','NOUN'), ('AUX','PRON','NOUN')]
    IGNORE_TAGS = ['PUNCT']

    _vectorizer = None
    _tokenizer = TweetTokenizer(reduce_len=True)
    _processor = Preprocessor(stemming=True)
    _pos_helper = PartsOfSpeechHelper()

    def __init__(self):
        pass
    
    def transform(self, data, y=None):
        result = []

        for tweet in data:
            result.append(self.get_patterns(tweet))

        if self._vectorizer == None :
            self._vectorizer = DictVectorizer(sparse=False)
            self._vectorizer.fit(result)

        return self._vectorizer.transform(result)

    def get_patterns(self, tweet):
        result = []
        tokens = self._tokenizer.tokenize(tweet)
        pos_tags = self._pos_helper.pos_tag(tokens)
        if len(pos_tags) > 1:
            pos_tags = [p for p in pos_tags if p[1] not in self.IGNORE_TAGS]
            words, tags = zip(*pos_tags)

            for pattern in self.POS_PATTERNS:
                found = self.find_sublist(list(pattern), list(tags))
                for i,j in found:
                    # Added patterns instead of tokens
                    result.append('_'.join(list(pattern)))
                    # result.append(self._processor.preprocess(' '.join(words[i:j])))

        return Counter(result)

    def fit(self, df, y=None):
        return self

    def find_sublist(self, sl, l):
        results = []
        sll = len(sl)
        for ind in (i for i, e in enumerate(l) if e == sl[0]):
            if l[ind:ind + sll] == sl:
                results.append((ind, ind + sll))

        return results
Example #32
def processTweet(tweet):
    _stopwords = set(
        stopwords.words('english') + list(punctuation) +
        ['AT_USER', 'URL', 'RT', 'rt', 'at_user', 'url'])
    tweet = tweet.lower()  # convert text to lower-case
    tweet = expandContractions(
        tweet)  # expand the contractions to remove the stop words
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL',
                   tweet)  # remove URLs
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
    tokenizer = TweetTokenizer(
        strip_handles=True,
        reduce_len=True)  # tokenize the tweet and remove the handle
    tweet = tokenizer.tokenize(tweet)
    # remove the stop words from the tokenzied tweet
    tweet = [word for word in tweet if word not in _stopwords]
    # perform lemmatization on the words which helps to find the root of the word
    # The Lemmatization will not be performed for NER as the Lexicons are derived from Twitter
    # tweet = [WordNetLemmatizer().lemmatize(w, get_wordnet_pos(w)) for w in tweet]
    return tweet
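A hedged usage sketch for processTweet. expandContractions belongs to the surrounding project, so a trivial stand-in is defined below only to make the call runnable; it is not the project's implementation.

import re
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

def expandContractions(text):
    return text.replace("can't", "cannot")  # stand-in only

print(processTweet("I can't wait for #PyCon https://pycon.org @guido"))
# the URL placeholder, stopwords and the @handle are dropped; the hashtag keeps its word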
Example #33
class MySentences(object):
    def __init__(self, files):
        self.files = files
        self.tknzr = TweetTokenizer()

    def __iter__(self):
        for fname in self.files:
            for line in gzip.open(fname,'rb'):
                tweet = preprocess_tweet(line)
                tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                yield filter(lambda word: ' ' not in word, tweet)
Example #34
class LexiconExtractor(BaseEstimator, TransformerMixin):

    NGRAM_LENGTH = 3
    REVERSE_WORDS = ['no', 'ni', 'tampoc', 'ningun']

    _tokenizer = TweetTokenizer()
    _preprocessor = Preprocessor(twitter_features=Preprocessor.REMOVE,
                                 stemming=True)

    def __init__(self):
        self._neg_words = self.file_to_list('lexicon/negative_words.txt')
        self._pos_words = self.file_to_list('lexicon/positive_words.txt')

    def transform(self, data, y=None):
        result = []

        for tweet in data:
            tweet = self._preprocessor.preprocess(tweet)
            result.append(self.count_polarity_words(tweet))

        return preprocessing.normalize(result)

    def count_polarity_words(self, text):
        num_pos_words = 0
        num_neg_words = 0

        list_ngrams = list(
            ngrams(self._tokenizer.tokenize(text),
                   self.NGRAM_LENGTH,
                   pad_left=True))

        for ngram in list_ngrams:
            pre_words = ngram[:self.NGRAM_LENGTH - 1]
            word = ngram[self.NGRAM_LENGTH - 1]

            if word in self._pos_words:
                if any(w in pre_words for w in self.REVERSE_WORDS):
                    num_neg_words += 1
                else:
                    num_pos_words += 1

            elif word in self._neg_words:
                if any(w in pre_words for w in self.REVERSE_WORDS):
                    num_pos_words += 1
                else:
                    num_neg_words += 1

        return [num_pos_words, num_neg_words]

    def fit(self, df, y=None):
        return self

    def file_to_list(self, filename):
        return io.open(filename).read().splitlines()
Example #35
def tokenize_tweets(filename, dest_folder):
    basename = os.path.basename(filename)
    dest = os.path.join(dest_folder, basename + '.tok')
    print("processing %s" % basename)
    tknzr = TweetTokenizer()
    with codecs.open(dest, 'w', "utf-8") as out_fs:
        with open(filename, 'r', encoding="utf-8") as in_fs:
            for line in in_fs:
                try:
                    language, id, timestamp, username, tweet = line.strip().split('\t')
                except:
                    print("could not parse line.")
                    continue
                if language != 'en':
                    continue
                tweet = tknzr.tokenize(tweet)
                if not 6 < len(tweet) < 110:
                    continue
                tweet = preprocess_tweet(' '.join(tweet))
                filter(lambda word: ' ' not in word, tweet)
                out_fs.write(id+'\t'+timestamp+'\t'+username+'\t'+tweet+'\n')
Example #36
def load_data(fname):
    tid,topics,tweets,sentiments = [],[],[],[]
    tknzr = TweetTokenizer(reduce_len=True)
    n_not_available = 0
    with open(fname) as f:
        for line in f:
            splits = line.split('\t')
            tweet = splits[3]
            sentiment = convertSentiment(splits[2])
            if tweet != "Not Available\n":
                tid.append(splits[0])
                topic = pts.preprocess_tweet(splits[1])
                topic_tok = tknzr.tokenize(topic.decode('utf-8'))
                topics.append(splits[1])

                tweet = pts.preprocess_tweet(tweet)
                tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
                tweets.append(tweet_tok)
                sentiments.append(int(sentiment))
            else:
                n_not_available += 1

    print "Number of not availalbe tweets:", n_not_available
    return tid,topics,tweets,sentiments
Example #37
 def __init__(self):
     super().__init__()
     self.__tokenizer = TweetTokenizer(preserve_case=False)
Example #38
from __future__ import print_function
import nltk
#nltk.download()
from nltk import TweetTokenizer
import csv
import random
from collections import defaultdict

tokenizer = TweetTokenizer()
csvfile = open('trainingandtestdata/testdata.manual.2009.06.14.csv', 'rb')
reader = csv.reader(csvfile, delimiter=',')
rownum = 0
sentiments = []
tokens = [[]]
for row in reader:
    colnum = 0
    for col in row:
        if colnum == 0:
            sentiments.insert(rownum,int(col))
        if colnum == 5:
            raw = col #.read().decode('utf8')
            tokens.insert(rownum,tokenizer.tokenize(raw))
##            print("tokens contents:", end='')
##            for word in tokens[rownum]:
##                print(word, end = " ")
##            print()
        colnum += 1
    rownum += 1
csvfile.close()

#Divide into training and test data - randomly allocate 4/5 to training and 1/5 to test
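A minimal sketch of the 4/5 to 1/5 split described by the comment above; this is an assumption about what follows, not the original author's code.

# Pair each tweet's tokens with its sentiment label, then shuffle and split 80/20.
examples = list(zip(tokens[:rownum], sentiments[:rownum]))
random.shuffle(examples)
cutoff = (len(examples) * 4) // 5
training_data = examples[:cutoff]
test_data = examples[cutoff:]
print("training:", len(training_data), "test:", len(test_data))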
Example #39
 def __init__(self, files):
     self.files = files
     self.tknzr = TweetTokenizer()