class MySentences(object):
    def __init__(self, listings, gzpFiles):
        self.listings = listings
        self.gzip_files = gzpFiles
        self.tknzr = TweetTokenizer()

    def __iter__(self):
        for files in self.listings:
            file_done = False
            counter = 0
            for fname in files:
                if file_done:
                    break
                for line in open(fname, 'rb'):
                    if counter >= MAX_TW_LANG:
                        file_done = True
                        break
                    counter += 1
                    tweet = line.split('\t')[-1]
                    tweet = preprocess_tweet(tweet)
                    tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                    yield filter(lambda word: ' ' not in word, tweet)

        counter = 0
        for fname in self.gzip_files:
            for line in gzip.open(fname, 'rb'):
                if counter >= MAX_TW_LANG:
                    return
                counter += 1
                tweet = line.split('\t')[-1]
                tweet = preprocess_tweet(tweet)
                tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                yield filter(lambda word: ' ' not in word, tweet)

def token(X_train, X_test):
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    x_train = []
    word_dict = {}
    word_index = 1
    for doc in X_train:
        word_seq = []
        for word in tknzr.tokenize(doc):
            if word not in word_dict:
                word_dict[word] = word_index
                word_index += 1
            word_seq.append(word_dict[word])
        x_train.append(word_seq)
    x_train = sequence.pad_sequences(x_train, maxlen=200, padding='post')

    word_dict['unknown-words-in-test'] = 0
    x_test = []
    for doc in X_test:
        word_seq = []
        for word in tknzr.tokenize(doc):
            if word in word_dict:
                word_seq.append(word_dict[word])
            else:
                word_seq.append(0)
        x_test.append(word_seq)
    x_test = sequence.pad_sequences(x_test, maxlen=200, padding='post')
    return x_train, x_test, word_dict

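# A minimal usage sketch for the `token` helper above. Assumptions: the snippet's own
# imports are in scope (`sequence` from keras.preprocessing and NLTK's TweetTokenizer),
# and the two tweet lists below are made up for illustration.
X_train = ["@user I love this!", "So tired today..."]
X_test = ["love it", "completely unseen words"]
x_train, x_test, word_dict = token(X_train, X_test)
# Both arrays should be padded to length 200, e.g. shapes (2, 200);
# words unseen during the training pass map to index 0 in x_test.
print(x_train.shape, x_test.shape)
print(word_dict.get("love"))
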
class SentenceParser:
    def __init__(self):
        self.tokenizer = TweetTokenizer()
        self.emo_parser = NRC_AffectIntensity()

    def parse_sent(self, str_response, expressiveness=0.3):
        # TODO: Remove later on
        response_list = []
        for sent in re.split('[?.!]', str_response):
            word_list = [word for word in self.tokenizer.tokenize(sent)]
            if word_list:
                d = {"word_list": word_list, "expressiveness": expressiveness}
                response_list.append(d)
        return response_list

    def return_emotions(self, word_list):
        emotion_list = []
        for word in word_list:
            # {'value': data[k]['value'], 'emotion': data[k]['emotion']}
            emotion_list.append(self.emo_parser.get_affect(word))
        return emotion_list

    def return_mood(self, word_list, mood):
        emotion_list = [None] * len(word_list)
        emotion_list[int(len(word_list) / 2)] = {
            'value': mood[1],
            'emotion': mood[0]
        }
        return emotion_list

    def parse_emo_sent(self, str_response, expressiveness=0.3):
        response_list = []
        d = {
            "word_list": [],
            "expressiveness": expressiveness,
            "emotion_list": []
        }
        for sent in re.split('[?.!]', str_response):
            for word in self.tokenizer.tokenize(sent):
                d["word_list"].append(word)
                d["emotion_list"].append(self.emo_parser.get_affect(word))
            d["word_list"].append(' . ')
        if d["word_list"]:
            response_list.append(d)
        return response_list

    def parse_mood_sent(self, str_response, expressiveness=0.3, mood=('joy', 1.0)):
        # Assigns the same mood to every sentence, regardless of word-based sentiment
        responses = self.parse_sent(str_response, expressiveness=expressiveness)
        for response in responses:
            response['emotion_list'] = self.return_mood(response['word_list'], mood)
        return responses

class LanguageModel:
    """
    N-gram model
    """

    def __init__(self, n_gram=2, missed_value=0.99):
        """
        :param n_gram: length of n-gram
        :param missed_value: default value for all unseen n-grams
        """
        self.n = n_gram
        self.n_grams = {}
        self.context = {}
        self.sentence_tokenizer = SentenceTokenizer()
        self.tokenizer = Tokenizer()
        self.missed_value = missed_value

    def build_model(self, text):
        sentences = self.sentence_tokenizer.tokenize(text)
        words = [
            list(
                filter(
                    lambda s: s.isalpha(),
                    self.tokenizer.tokenize(sentence.strip())
                )
            )
            for sentence in sentences
        ]
        for sentence in words:
            if len(sentence) < self.n:
                key = " ".join(sentence)
                self.context.update({key: self.context.get(key, 0) + 1})
            else:
                for i in range(len(sentence) - self.n + 1):
                    context_key = " ".join(sentence[i:i + self.n - 1])
                    n_gram_key = " ".join(sentence[i:i + self.n])
                    self.context.update({context_key: self.context.get(context_key, 0) + 1})
                    self.n_grams.update({n_gram_key: self.n_grams.get(n_gram_key, 0) + 1})

    def calculate_proba(self, sentence):
        words = list(
            filter(
                lambda s: s.isalpha(),
                self.tokenizer.tokenize(sentence.strip())
            )
        )
        result = 1
        for i in range(min(self.n - 2, len(words) - 1), len(words)):
            if i < self.n - 1:
                size = sum([val for key, val in self.context.items() if len(key.split(" ")) == i + 1])
                result *= self.context.get(" ".join(words[:i + 1]),
                                           self.missed_value if i == self.n - 2 else 0) / size
            elif i > self.n - 2:
                context_key = " ".join(words[i - self.n + 1:i])
                n_gram_key = " ".join(words[i - self.n + 1:i + 1])
                context_val = self.context.get(context_key, self.missed_value)
                n_gram_val = self.n_grams.get(n_gram_key, self.missed_value)
                p = n_gram_val / context_val
                result *= p
        return result

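# A minimal usage sketch for the bigram model above. Assumption: `SentenceTokenizer` and
# `Tokenizer` resolve to whatever sentence/word tokenizers the project wires in (for example
# a punkt-style sentence splitter and TweetTokenizer); the training string is made up.
lm = LanguageModel(n_gram=2)
lm.build_model("the cat sat on the mat. the dog sat on the rug.")
print(lm.calculate_proba("the cat sat"))    # product of relative bigram frequencies
print(lm.calculate_proba("the zebra sat"))  # unseen n-grams fall back to missed_value
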
def clean_tweets(classifier, df, stop_words):
    tknzr = TweetTokenizer()
    for i in df.iterrows():
        # print('tweet: ' + df['tweet_text'][i[0]])
        tokens = tknzr.tokenize(i[1]['tweet_text'])  # using NLTK tweet tokenizer
        custom_tokens = remove_noise(tokens, stop_words)
        # df.at assigns to a single cell by row label and avoids the SettingWithCopyWarning
        # raised by chained assignment such as df['tokens'][i[0]] = ...
        # (see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy)
        df.at[i[0], 'tokens'] = custom_tokens
        score = classifier.classify(dict([token, True] for token in custom_tokens))
        df.at[i[0], 'sentiment'] = score
    return df

def search():
    # validate screen_name
    screen_name = request.args.get("screen_name", "")
    if not screen_name:
        return redirect(url_for("index"))

    positives = os.path.join(sys.path[0], "positive-words.txt")
    negatives = os.path.join(sys.path[0], "negative-words.txt")

    # get screen_name's tweets
    tweets = helper.get_user_timeline(screen_name)  # TODO
    analyzer = Analyzer(positives, negatives)
    s = str(tweets)

    # analyze words
    tw = TweetTokenizer()
    # print(tw.tokenize(s))
    p = tw.tokenize(s)
    score = analyzer.analyze2(p)
    positive = float(score[0])
    negative = abs(float(score[1]))
    neutral = score[2]

    # generate chart
    chart = helper.chart(positive, negative, neutral)

    # render results
    return render_template("search.html", chart=chart, screen_name=screen_name)

def normalize_tweet(tweet):
    # convert the tweet to lower case
    tweet = tweet.lower()
    # convert all urls to the string "URL"
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # collapse repeated punctuation and whitespace to a single white space/punctuation
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    tweet = re.sub(r'[\s]+', ' ', tweet)
    tweet = re.sub(r'\!{2,}', '!', tweet)
    # convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # extract words (tokens) from the tweet
    twt_token = TweetTokenizer(strip_handles=True)
    token = twt_token.tokenize(tweet)
    # remove stop words
    stop_words = set(stopwords.words('english'))
    word_list = [tkn for tkn in token if tkn not in stop_words]
    # use a rule-based stemmer to find word stems
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in word_list]
    # create a sentence from the stems
    norm_tweet = " ".join(stems)
    return norm_tweet

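# A quick illustrative call to normalize_tweet, assuming the NLTK imports used above
# (TweetTokenizer, stopwords, PorterStemmer) plus `re` are in scope; the tweet is made up.
print(normalize_tweet("Loving the new #NLTK release!! https://nltk.org @nltk_org"))
# expected output along the lines of: "love new nltk releas ! url"
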
def render_wordcloud(form, **kwargs):
    session = Session()
    results = search.search(session, **form.values())

    # Create the corpus from the results
    tknzr = TweetTokenizer()
    texts = []
    for r in results:
        tokens = []
        for sent in sent_tokenize(r.text.strip()):
            tokens += [
                w for w in tknzr.tokenize(sent.strip())
                if w.lower() not in stopwords_en
            ]
        texts.append(tokens)
    corpus = nltk.TextCollection(texts)
    corpus.collocations(100)

    # noinspection PyProtectedMember
    results = {
        'vocabulary': [list(i) for i in corpus.vocab().most_common(1000)],
        'collocations': corpus._collocations,
    }
    view = render_template('./templates/search/results_wordcloud.html',
                           form=form, results=results, **kwargs)
    session.close()
    return view

def load(self):
    # Load dictionary
    inBody = False
    with open(self.dict_path, 'r', encoding='utf-8') as r:
        next(r)
        for line in r:
            if inBody:
                segs = line.strip().split('\t')
                token = segs[0]
                for cate_id in segs[1:]:
                    self.token_category[token].append(int(cate_id))
            else:
                if line.startswith('%'):
                    inBody = True
                else:
                    self.category_num += 1

    tokenizer = TweetTokenizer()
    with open(self.data_path, 'r', encoding='utf-8') as r:
        for line in r:
            tid, tweet, _ = line.rstrip().split('\t')
            tokens = tokenizer.tokenize(tweet)
            tokens = [t.replace('#', '').lower() for t in tokens]
            category_count = [0] * self.category_num
            for token in tokens:
                # try the full token first, then progressively shorter prefixes
                for i in range(min(len(token), 5)):
                    prefix = token if i == 0 else token[:-i]
                    if prefix in self.token_category:
                        for cate in self.token_category[prefix]:
                            category_count[cate - 1] += 1
                        break
            if len(tokens) > 0:
                category_count = [c / len(tokens) for c in category_count]
            self.tid_vector[tid] = torch.FloatTensor(category_count)

def text_total_counts(self):
    with codecs.open(self._filepath + ".txt", "r", "latin-1") as f:
        lines = f.read()
        tknzr = TweetTokenizer()
        tknz_lines = tknzr.tokenize(lines)
        self._totalcount = len(tknz_lines)
        return self._totalcount

def ngrams(self):
    # name = re.findall("\w+$", self._filepath)
    name = str(input("choose a seed: "))
    with codecs.open(self._filepath + ".txt", "r", "latin-1") as f:
        lines = f.read()
    tknzr = TweetTokenizer()
    tknz_lines = tknzr.tokenize(lines)
    emptylist = []
    maxhistory = int(input("Choose n for ngram, preferably 2 or 3: "))
    for i in range(2, maxhistory + 1):
        emptylist += nltk.ngrams(tknz_lines, i)
    cfd = ConditionalFreqDist([(tuple(a), b) for *a, b in emptylist])
    seed = [name]
    for i in range(100):
        # back off from the longest available history to the shortest
        for j in range(maxhistory - 1, 0, -1):
            if tuple(seed[-j:]) in cfd:
                valuesum = sum(cfd[tuple(seed[-j:])].values())
                value = random.randint(0, valuesum)
                for key in cfd[tuple(seed[-j:])].keys():
                    value -= cfd[tuple(seed[-j:])][key]
                    if value <= 0:
                        seed.append(key)
                        break
                break
            else:
                continue
    return seed

def tokenize_tweets(tweet_dict):
    tokenized_tweets = {}
    tknzr = TweetTokenizer()
    for k, v in tweet_dict.iteritems():
        tokenized_tweet = tknzr.tokenize(v)
        tokenized_tweets[k] = tokenized_tweet
    return tokenized_tweets

class NltkTweetTokenizer(Tokenizer):
    def __init__(self) -> None:
        super().__init__()
        self._base_tokenizer = TweetTokenizer()

    def tokenize_text(self, text: str) -> List[str]:
        return self._base_tokenizer.tokenize(text)

def preprocess_text(tweet_text):
    tweet_tokenizer = TweetTokenizer()
    tokens = [
        token.lower().lstrip("@").lstrip("#")
        for token in tweet_tokenizer.tokenize(tweet_text)
    ]
    tokens_no_contra = [
        contractions[token].split() if token in contractions else [token]
        for token in tokens
    ]
    flat_list = [item for sublist in tokens_no_contra for item in sublist]
    tokens_semi_final = [
        token for token in flat_list
        if token not in punctuations and token not in en_stopwords
    ]
    final_t = [
        token.replace("'s", "")
        for token in tokens_semi_final
        if not re.match(r'((www\.[^\s]+)|(https?://[^\s]+))', token)
    ]
    text = []
    wnl = WordNetLemmatizer()
    tagged = pos_tag(final_t)
    for word, tag_prior in tagged:
        tag = nltk_tag_to_wordnet_tag(tag_prior)
        word = "not" if word == "n't" else word
        if tag:
            text.append(wnl.lemmatize(word.lower(), tag))
        else:
            text.append(wnl.lemmatize(word.lower()))
    return text

def tokenize_tweets(texts, segment=True, segment_vocab=None):
    tknzr = TweetTokenizer()
    token_x = [tknzr.tokenize(t) for t in texts]
    if not segment:
        return token_x

    # if we need to segment
    wordsegment.load()
    tokens = []
    for line in token_x:
        tokens += line
    counter = Counter(tokens)

    # identify segment-able words
    segmented = {}
    for word in counter:
        if word not in segment_vocab:
            pieces = wordsegment.segment(word)
            if len(pieces) > 1:
                segmented[word] = pieces

    # reconstruct the list
    _token_x = []
    for line in token_x:
        _line = []
        for token in line:
            if token in segmented.keys():
                _line += segmented[token]
            else:
                _line.append(token)
        _token_x.append(_line)
    return _token_x

def tokenize_tweets(input_file_name, out_file_name, type_file):
    outf = open(out_file_name, 'w')
    infn = open(input_file_name, 'r')
    tknzr = TweetTokenizer()
    while 1:
        lines = infn.readlines(100000)
        if not lines:
            break
        for line in lines:
            # ignore blank lines
            if not line.strip():
                continue
            if type_file == 'split':
                tweetId, startPos, endPos, mention, screenName, tweet, mediaURL = line.strip().split('\t')  # test,dev,train tokenization
            elif type_file == 'kb':
                x, y, tweet, mediaURL = line.strip().split('\t')  # timeline tokenization
            else:
                sys.exit("set type param from {split,kb}")
            tweet = tknzr.tokenize(str(tweet))
            # if not 6 < len(tweet) < 110:
            #     continue
            if len(tweet) < 6:
                continue
            tweet = preprocess_tweet(' '.join(tweet))
            # out_fs.write(id+'\t'+timestamp+'\t'+username+'\t'+tweet+'\n')
            # out_fs.write(str(tweetId) + '\t' + str(startPos) + '\t' + str(endPos) + '\t' + mention + '\t' + str(screenName) + '\t' + str(tweet) + '\t' + str(mediaURL) + '\n')
            outf.write(str(tweet) + '\n')
    outf.close()
    infn.close()

def clean_tweets(tweet):
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and       # remove stopwords
                word not in emoticons and           # remove emoticons
                word not in string.punctuation):    # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)          # stemming word
            tweets_clean.append(stem_word)
    return tweets_clean

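# Illustrative setup for the module-level names clean_tweets relies on (stopwords_english,
# emoticons, stemmer), followed by one made-up example call; the emoticon set here is a
# placeholder, the real list is project-specific.
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

stopwords_english = stopwords.words('english')
emoticons = {':)', ':(', ':D', ':-('}
stemmer = PorterStemmer()

print(clean_tweets("RT @user: Loving the $GE rally!!! :) #stocks https://example.com"))
# expected output along the lines of: ['love', 'ralli', 'stock']
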
def tokenize_tweet(txt, *args, **kwargs):
    tokenizer = Tokenizer()
    if len(args) > 1:
        db = MySQLInterface(*args)
    else:
        db = args[0]
    txt_res = '\n'.join(
        tokenizer.tokenize(txt.replace('\n', ' ').replace('\\n', ' ').lower()))
    wordids = []
    for token in txt_res.split('\n'):
        # add the word to the table of integerized words if it doesn't already exist there
        index = db.query(
            'SELECT ID FROM WORDOCCURRENCES WHERE WORD=%s LIMIT 1', token)
        word_id = None
        if index is not None:
            if len(index) >= 1:
                word_id = int(index[0][0])
        if word_id is None:
            word_id = int(
                db.query('SELECT COUNT(DISTINCT WORD) FROM WORDOCCURRENCES')[0][0])
            db.execute('INSERT INTO WORDOCCURRENCES VALUES (%i,%%s)' % (word_id), token)
        wordids.append(word_id)
    return wordids

class SpaceSeparatedWordsMixIn(AbstractLanguage, metaclass=abc.ABCMeta):
    """Language in which words are separated by spaces."""

    def __init__(self):
        super().__init__()
        self.__tokenizer = TweetTokenizer(preserve_case=False)

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Splits a sentence into words using spaces (for Latin languages)."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        # Normalize apostrophe so that "it’s" and "it's" get treated identically
        sentence = sentence.replace("’", "'")

        tokens = self.__tokenizer.tokenize(text=sentence)

        def is_word(token_: str) -> bool:
            """Returns True if token looks like a word."""
            if re.match(pattern=r'\w', string=token_, flags=re.UNICODE):
                return True
            else:
                return False

        # TweetTokenizer leaves punctuation in-place
        tokens = [token for token in tokens if is_word(token)]

        return tokens

class Tokeniser(BaseEstimator, TransformerMixin):
    def __init__(self, return_flags=False):
        self.tokeniser = TweetTokenizer()
        self.return_flags = return_flags

    def fit(self, *args, **kwargs):
        return self

    def tokenise(self, sequence):
        flag = ""
        ix = 0
        tokens, positions = [], []
        for t in self.tokeniser.tokenize(sequence):
            ix = sequence.find(t, ix)
            if len(t) == 1 and ord(t) >= 127462:  # this is the code for 🇦
                if not self.return_flags:
                    continue
                if flag:
                    tokens.append(flag + t)
                    positions.append(ix - 1)
                    flag = ""
                else:
                    flag = t
            else:
                tokens.append(t)
                positions.append(ix)
                ix += 1
        return tokens, positions

    def transform(self, x, y=None):
        return [self.tokenise(sequence) for sequence in x]

def convertDataToVec(self, data, labels, batchSize=5000):
    if len(data) - self.indexTracking < batchSize:
        batchSize = len(data) - self.indexTracking
        self.batchFlag = True

    clf = Word2Vec.load("w2v.model")
    d = np.array([])
    counts = 0
    for line in data[self.indexTracking:]:
        if counts == batchSize:
            break
        counts += 1
        tmp = np.array([0] * 300)
        tk = TweetTokenizer()
        l = tk.tokenize(self.normalizeSentence(line))
        count = 0
        for w in l:
            count += 1
            try:
                s = clf.wv.get_vector(w)
                s = np.array(s)
                tmp = np.add(tmp, s)
            except KeyError:
                # word is not in the Word2Vec vocabulary
                continue
        tmp = tmp / count
        d = np.concatenate((d, tmp))

    l = self.convertLabelToVec(labels, batchSize)
    self.indexTracking += batchSize
    return l, d

def removeHighAndLowFrequencyWords(self, lines, percentage=0.4):
    tk = TweetTokenizer()
    dictionary = OrderedDict()

    # create dictionary
    for line in lines:
        l = tk.tokenize(self.normalizeSentence(line))
        self.lines.append(l)
        for token in l:
            if len(token) > 1 or re.search(r'\w', token):
                if dictionary.get(token) is None:
                    dictionary[token] = 1
                else:
                    dictionary[token] += 1

    # remove high frequency and low frequency words
    dictionary = sorted(dictionary.items(), key=operator.itemgetter(1), reverse=False)
    while dictionary[0][1] < 5:
        del dictionary[0]
    index = math.floor(len(dictionary) * percentage)
    for i in range(index):
        del dictionary[0]
        del dictionary[-1]
    self.dictionary = dictionary

class MySentences(object):
    def __init__(self, files):
        self.files = files
        self.tknzr = TweetTokenizer()

    def max_reached(self, language_tags):
        all_max = True
        for lang in max_for_lang.keys():
            for sent in ['positive', 'negative']:
                tag = '{}_{}'.format(lang, sent)
                curr_is_max = language_tags[tag] >= max_for_lang[lang]
                all_max &= curr_is_max
        return all_max

    def __iter__(self):
        language_tags = defaultdict(lambda: 0)
        for fname in self.files:
            for line in open(fname, 'rb'):
                if self.max_reached(language_tags):
                    return
                splits = line.split('\t')
                lang_tag = splits[0].strip()
                sent_tag = splits[4].strip()
                tag = '{}_{}'.format(lang_tag, sent_tag)
                if language_tags[tag] < max_for_lang[lang_tag]:
                    language_tags[tag] += 1
                    tweet = line.split('\t')[-1]
                    tweet = preprocess_tweet(tweet)
                    tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                    yield filter(lambda word: ' ' not in word, tweet)

def main(model_file, out_tsv_file, out_labels_file, data_file_path, vocab_file_path):
    model = load_keras_model(model_file)

    uid = uuid4().hex
    os.makedirs(uid)

    samples = load_samples(data_file_path)
    train_samples, val_samples = train_val_split(samples)
    val_provider = TripletProvider(val_samples, shuffle=True)

    tokenizer = TweetTokenizer()
    tokenized_samples = [tokenizer.tokenize(sample.text) for sample in train_samples]

    vocabulary = joblib.load(vocab_file_path)
    vocabulary.fit((c for tokens in tokenized_samples for token in tokens for c in token))

    transformer = HierarchicalTripletTransformer(vocabulary)

    max_document_length, max_token_length = get_max_length(tokenized_samples)
    val_generator = TripletBatchGenerator(val_provider, transformer, max_document_length,
                                          max_token_length, len(vocabulary), 1)

    vectors = []
    labels = []
    for sample in val_generator:
        X, y, triplet = sample
        for xi in X:
            prediction = model.predict(xi)
            vectors.append(prediction)
            labels.append(sample.text)

    np.savetxt('vectors_out.tsv', vectors, delimiter='\t')

def pre_process():
    data = []
    emotions = []
    word_dict = {}
    sentence = []
    with open('../data/text_emotion.csv') as csvDataFile:
        csv_reader = csv.reader(csvDataFile)
        for row in csv_reader:
            emotions.append(row[1])
            data.append(row[3])

    tknzr = TweetTokenizer()
    for d in data:
        tokens = tknzr.tokenize(d)
        sentence.append(tokens)
        # print(tokens)

    for s in sentence:
        for i in s:
            if i.lower() in word_dict:
                word_dict[i.lower()] += 1
            else:
                word_dict[i.lower()] = 1

    return [word_dict, sentence, emotions]

def getTopics(tweets, count=10):
    stop_words = set(stopwords.words("english"))
    stop_words.update([
        "rt", "anybody", "anyone", "anything", "everybody", "everyone",
        "everything", "nobody", "noone", "nothing", "somebody", "someone",
        "something", "thing", "things"
    ])
    tknzr = TweetTokenizer()
    trimmed_tweets = [[
        word for (word, pos) in pos_tag(tknzr.tokenize(tweet))
        if len(word) > 1 and word.casefold() not in stop_words and pos[0] == 'N'
    ] for tweet in tweets]
    t = trimmed_tweets
    t[:] = [[
        word.lower() if not match(r"\b[A-Z]{2,}\b", word) else word
        for word in wordlist
    ] for wordlist in trimmed_tweets]
    trimmed_tweets_counts = [Counter(wordlist) for wordlist in t]
    topics = Counter()
    for c in trimmed_tweets_counts:
        topics.update(c)
    # Counter dict `topics` can be very important. We can put preferences on twitter handles:
    # they are complete nouns as opposed to parts of broken-down noun phrases like "graphic"
    # and "novel" which individually do not give the idea of the original phrase.
    # A large number of handles might mean they are connected to their followers better, interactive, etc.
    return topics.most_common(count)

def cleanText(x):
    # x = json.loads(x)
    # tmp = x
    # x = x["text"]
    # if len(x) != 0:

    # Unicode escape remover
    regex03 = 'u[a-zA-Z0-9]{4}'
    k = re.sub(regex03, '', str(x))
    text = re.sub(r"http\S+", "", str(k))
    text = text.decode('utf-8')

    # removes emoticons and other symbols
    try:
        # UCS-4
        highpoints = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # UCS-2
        highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    text = highpoints.sub('', text)

    tknzr = TweetTokenizer(reduce_len=True)
    a = tknzr.tokenize(text)

    # Punctuation remover
    c = [i for i in a if i not in removal_list]
    c = " ".join(c)
    c = [i for i in a if i.isalnum()]  # not in removal_list]
    c = " ".join(c)
    # c = {"id": tmp["id"], "text": c}
    return c

def preprocess(docs, sentiments, n):
    """
    Filters <br> tags, URLs and twitter handles
    :param docs: Document list
    :param sentiments: Sentiment list
    :param n: Number of documents
    :return: Processed corpus
    """
    processed_tweets = list()
    processed_sentiments = list()
    tok = TweetTokenizer()
    for i, doc in enumerate(docs):
        if i > n:
            return processed_tweets, processed_sentiments
        if not pd.isna(sentiments[i]):
            # print(doc)
            # print(type(doc))
            # tokens = list(filter(lambda a: not a.startswith('<br' or '@' or 'http'), tok.tokenize(doc)))
            # tokenize and filter out <br>
            tokens = tok.tokenize(doc)
            tweet_new = ' '.join(tokens)
            processed_tweets.append(tweet_new)
            processed_sentiments.append(str(sentiments[i]))
    return processed_tweets, processed_sentiments

def main():
    HOME_DIR = "semeval_parsed"
    np.random.seed(123)

    input_fname = '200M'
    embedding = 'custom'
    type = '200M'
    ndim = 52

    data_dir = HOME_DIR + '_' + input_fname
    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))

    tknr = TweetTokenizer()
    alphabet = cPickle.load(open(fname_vocab))
    words = alphabet.keys()
    tok_words = {}
    words = []
    for word, idx in alphabet.iteritems():
        tok_word = tknr.tokenize(word.decode('utf-8'))
        tok_words[idx] = tok_word
        words.extend(tok_word)

    print len(tok_words)
    print len(words)
    print "Vocab size", len(alphabet)

    fname, delimiter, ndim = (
        'embeddings/updated_embeddings_custom_200M'.format(type, str(ndim)), ' ', ndim)
    word2vec = load_glove_vec(fname, words, delimiter, ndim)
    print 'len', len(word2vec)
    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')
    for idx, tok_word in tok_words.iteritems():
        isrand = 1
        word_vec = np.zeros(ndim)
        for tok in tok_word:
            if tok in word2vec.keys():
                word_vec += word2vec[tok]
                isrand = 0
        if isrand:
            word_vec = np.random.uniform(-0.25, 0.25, ndim)
            random_words_count += 1
        vocab_emb[idx] = word_vec.astype(np.float32) / len(tok_word)
    print "Using zero vector as random"
    print 'random_words_count', random_words_count

    svd = TruncatedSVD(n_components=5)
    vocab_emb = svd.fit_transform(vocab_emb).astype(np.float32)
    print vocab_emb.shape
    fname = 'embeddings/smiley_tweets_embedding_{}'.format('topic')
    outfile = os.path.join(data_dir, 'emb_{}.npy'.format(os.path.basename(fname)))
    print outfile
    np.save(outfile, vocab_emb)

def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    tknzr = TweetTokenizer()
    tagger = PerceptronTagger()
    fout = 'embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname)
    fname, delimiter, ndim = (
        'embeddings/smiley_tweets_embedding_{}'.format(input_fname), ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    tagdict = tagger.tagdict
    tagidx = {}
    nRows = len(word2vec)
    nCols = len(tagdict)
    print nRows, ':', nCols

    counter = 0
    for tag in tagdict.keys():
        tagidx[tag] = counter
        counter += 1

    exp_wemb = {}
    for word in word2vec.keys():
        exp_wemb[word] = np.zeros(nCols)
    print tagidx

    train = "semeval/task-B-train-plus-dev.tsv.gz"
    test = "semeval/task-B-test2014-twitter.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    test15 = "semeval/task-B-test2015-twitter.tsv.gz"
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    it = 0
    files = [train, test, dev, test15, smiley_pos]
    for filen in files:
        for tweet in gzip.open(filen, 'rb'):
            tweet = tknzr.tokenize(tweet.decode('utf-8'))
            tags = _pos_tag(tweet, None, tagger)
            for (word, tag) in tags:
                if word in exp_wemb.keys() and tag in tagidx.keys():
                    idx = tagidx[tag]
                    exp_wemb[word][idx] = 1
            if (it % 10) == 0:
                print 'Progress:', it
            it += 1

    f = open(fout, 'wb')
    for word in exp_wemb:
        f.write(word)
        tags = exp_wemb[word]
        for i in np.nditer(tags):
            f.write(' {}'.format(i))
        f.write("\n")
    f.close()

class MySentences(object):
    def __init__(self, files):
        self.files = files
        self.tknzr = TweetTokenizer()

    def __iter__(self):
        for fname in self.files:
            for line in gzip.open(fname, 'rb'):
                tweet = preprocess_tweet(line)
                tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                yield filter(lambda word: ' ' not in word, tweet)

def load_data(fname):
    tid, topics, tweets, sentiments = [], [], [], []
    tknzr = TweetTokenizer(reduce_len=True)
    n_not_available = 0
    with open(fname) as f:
        for line in f:
            splits = line.split('\t')
            tweet = splits[3]
            sentiment = convertSentiment(splits[2])
            if tweet != "Not Available\n":
                tid.append(splits[0])
                topic = pts.preprocess_tweet(splits[1])
                topic_tok = tknzr.tokenize(topic.decode('utf-8'))
                topics.append(splits[1])
                tweet = pts.preprocess_tweet(tweet)
                tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
                tweets.append(tweet_tok)
                sentiments.append(int(sentiment))
            else:
                n_not_available += 1

    print "Number of not available tweets:", n_not_available
    return tid, topics, tweets, sentiments

def load_data(fname, pos):
    tid, tweets, sentiments = [], [], []
    tknzr = TweetTokenizer(reduce_len=True)
    n_not_available = 0
    with open(fname) as f:
        for line in f:
            splits = line.split('\t')
            tweet = splits[pos + 1]
            sentiment = convertSentiment(splits[pos])
            tid.append(splits[0])
            tweet = pts.preprocess_tweet(tweet)
            tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
            tweets.append(tweet_tok)
            sentiments.append(int(sentiment))
    return tid, tweets, sentiments

def tokenize_tweets(filename, dest_folder):
    basename = os.path.basename(filename)
    dest = os.path.join(dest_folder, basename + '.tok')
    print("processing %s" % basename)
    tknzr = TweetTokenizer()
    with codecs.open(dest, 'w', "utf-8") as out_fs:
        with open(filename, 'r', encoding="utf-8") as in_fs:
            for line in in_fs:
                try:
                    language, id, timestamp, username, tweet = line.strip().split('\t')
                except:
                    print("could not parse line.")
                    continue
                if language != 'en':
                    continue
                tweet = tknzr.tokenize(tweet)
                if not 6 < len(tweet) < 110:
                    continue
                tweet = preprocess_tweet(' '.join(tweet))
                filter(lambda word: ' ' not in word, tweet)
                out_fs.write(id + '\t' + timestamp + '\t' + username + '\t' + tweet + '\n')

import csv
import random
from collections import defaultdict

from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()

csvfile = open('trainingandtestdata/testdata.manual.2009.06.14.csv', 'rb')
reader = csv.reader(csvfile, delimiter=',')

rownum = 0
sentiments = []
tokens = [[]]
for row in reader:
    colnum = 0
    for col in row:
        if colnum == 0:
            sentiments.insert(rownum, int(col))
        if colnum == 5:
            raw = col  # .read().decode('utf8')
            tokens.insert(rownum, tokenizer.tokenize(raw))
            ## print("tokens contents:", end='')
            ## for word in tokens[rownum]:
            ##     print(word, end=" ")
            ## print()
        colnum += 1
    rownum += 1
csvfile.close()

# Divide into training and test data - randomly allocate 4/5 to training and 1/5 to test
position = []
posPosts = []
negPosts = []
neuPosts = []
for posts in range(0, len(sentiments)):
    position.insert(posts, random.randint(0, 5))