def _write_semeval_split(in_path, fout, tid_uid_map, split_id):
    """Append the positive/negative tweets of one SemEval TSV file to ``fout``.

    Each input line is tab-separated: column 0 is the tweet id, column 2 the
    label and column 3 the tweet text. ``split_id`` is written verbatim as the
    last field of every output row.
    """
    label_codes = {"positive": "1", "negative": "0"}
    with open(in_path, "rb") as f:
        for line in f:
            parts = line.strip().split("\t")
            label = parts[2]
            tweet = parts[3]
            userid = tid_uid_map.get(parts[0], "unknown")
            # Tweets that could not be downloaded carry this placeholder text.
            if tweet == "Not Available":
                continue
            tweet = clean_tweet(tweet)
            tweet = clean_tweet_toks(tokenize(tweet))
            code = label_codes.get(label)
            # Any label other than positive/negative is dropped.
            if code is not None:
                fout.write(tweet + "|" + code + "," + userid + "," + split_id + "\n")


def process_semeval(data_folder, outfname, idfname):
    """
    Process dataset of SemEval 2013 task 2 subtask B.

    Only positive and negative tweets are kept.
    Output format: tweet|label,user_id,split\n
    where label is 1 (positive) or 0 (negative) and split is 1 (train),
    2 (validation) or 3 (test).

    :param data_folder: indexable holding the train, validation and test
        file paths at positions 0, 1 and 2.
    :param outfname: path of the output file (overwritten).
    :param idfname: whitespace-separated file mapping tweet id -> user id;
        tweets without an entry get the user id "unknown".
    """
    logger.info("start processing tweets for SemEval 2013 task 2 subtask B")
    tid_uid_map = {}
    with open(idfname, "rb") as f:
        for line in f:
            parts = line.strip().split()
            tid_uid_map[parts[0]] = parts[1]
    train_file, val_file, test_file = data_folder[0], data_folder[1], data_folder[2]
    # The context manager guarantees the output is flushed and closed even if
    # one of the input files is missing or malformed.
    with open(outfname, "w") as fout:
        for split_id, in_path in (("1", train_file), ("2", val_file), ("3", test_file)):
            _write_semeval_split(in_path, fout, tid_uid_map, split_id)
    logger.info("finish processing data")
def clean_str(string):
    """
    Lower-case and normalise a raw tweet, returning space-joined tokens.

    Steps: tokenize the lower-cased text, blank out tokens that start with
    '@' or 'http', cap every run of a repeated character at two, normalise a
    few quote/comma escapes, then tokenize again.

    Inputs shorter than two characters after the first pass (e.g. an empty
    string or a lone mention) are handled instead of raising IndexError.
    """
    tokens = twokenize.tokenize(string.lower())
    # Mentions and links become empty strings; slicing keeps this safe even
    # if the tokenizer were to hand back an empty token.
    tokens = ["" if tok[:1] == '@' or tok[0:4] == "http" else tok
              for tok in tokens]
    string = " ".join(tokens)

    # Reduce repeated characters: a character is dropped when the two
    # characters already kept before it are identical to it.
    kept = []
    for ch in string:
        if len(kept) >= 2 and kept[-1] == ch and kept[-2] == ch:
            continue
        kept.append(ch)
    string = "".join(kept)

    string = string.replace("`","\'")
    string = string.replace("\u002c",",")
    string = string.replace("\u2019","\'")
    string = string.replace("\\\"\"","\"")
    return " ".join(twokenize.tokenize(string))
def process_omd(fname, outfname, idfname):
    """
    Process the OMD dataset: each tweet carries three votes (1: neg, 2: pos).

    A tweet is kept only when at least two of its three votes agree on
    negative or on positive.
    Output format: tweet|label,user_id\n  (label 0 = negative, 1 = positive)

    :param fname: tab-separated input file; column 0 is the tweet id,
        column 2 the tweet text and the last three columns the votes.
    :param outfname: path of the output file (overwritten).
    :param idfname: whitespace-separated file mapping tweet id -> user id;
        tweets without an entry get the user id "unknown".
    """
    logger.info("start processing tweets for OMD")
    tid_uid_map = {}
    with open(idfname, "rb") as f:
        for line in f:
            parts = line.strip().split()
            tid_uid_map[parts[0]] = parts[1]
    # Both handles are closed even if a malformed row raises part-way through.
    with open(outfname, "w") as fout:
        with open(fname, "rb") as f:
            for line in f:
                parts = line.strip().split("\t")
                votes = sorted([int(parts[-1]), int(parts[-2]), int(parts[-3])])
                if votes[0] == 1 and votes[1] == 1:
                    label = "0"
                elif votes[1] == 2 and (votes[0] == 2 or votes[2] == 2):
                    label = "1"
                else:
                    continue
                tweet = parts[2]
                userid = tid_uid_map.get(parts[0], "unknown")
                # startswith/endswith tolerate an empty tweet, where indexing
                # tweet[0] / tweet[-1] would raise IndexError.
                if tweet.startswith("\""):
                    tweet = tweet[1:].strip()
                if tweet.endswith("\""):
                    tweet = tweet[:-1].strip()
                tweet = clean_tweet(tweet)
                tweet = clean_tweet_toks(tokenize(tweet))
                fout.write(tweet + "|" + label + "," + userid + "\n")
    logger.info("finish processing data")
def perprocessing(tdic):
    """Normalise every tweet in ``tdic`` and collect the lemmatizer's second output.

    ``tdic`` maps a key to a sequence whose item 0 is passed through
    unchanged and whose item 1 is the raw tweet text. Returns
    ``(new_dic, features)`` where ``new_dic[key]`` is ``(item0, cleaned_text)``
    and ``features`` is an array built from the second element returned by
    ``lemma`` for each tweet (presumably POS features, judging by the original
    variable name -- confirm against ``lemma``).
    """
    cleaned = {}
    pos_rows = []
    for key in tdic:
        record = tdic[key]
        gold = record[0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(record[1]))
        normalized = twokenize.normalizeTextForTagger(raw)
        tokens = twokenize.tokenize(normalized)
        print(tokens)
        lowered = [tok.lower() for tok in tokens]
        lemma_out = lemma(lowered)
        pos_rows.append(lemma_out[1])
        joined = ' '.join(lemma_out[0])
        # replaceall rewrites special spans (the original comment mentions
        # URLINK and SADFACE placeholders).
        joined = textPreprocessor01.replaceall(joined)
        print(joined)
        cleaned[key] = gold, joined
    return cleaned, np.array(pos_rows)
def kshinglize(s, k=KSHINGLES, stopwords=STOPWORDS):
    """
    Turn string ``s`` into k-shingles.

    The string is stripped and lower-cased, tokenized with twokenize,
    filtered through ``filterstopwords`` and handed to
    ``tokens_to_kshingles`` together with ``k``; that helper's result is
    returned as-is.
    """
    normalized = s.strip().lower()
    kept = filterstopwords(twokenize.tokenize(normalized), stopwords)
    return tokens_to_kshingles(kept, k)
def __init__(self, testData):
    """Load tab-separated labelled tweets from the file at ``testData``.

    Each line is split on tabs; field 6 is replaced by its twokenize tokens
    joined with single spaces. The resulting field lists are stored in
    ``self.labeledTweets``.
    """
    self.labeledTweets = []
    # Use a context manager so the handle is closed deterministically.
    with open(testData) as handle:
        for line in handle:
            fields = line.rstrip('\n').split('\t')
            fields[6] = ' '.join(twokenize.tokenize(fields[6]))
            self.labeledTweets.append(fields)
def prepare_and_tokenize(text, url_scheme='st', strip_word_padding=False, alphanumeric_only=False):
    """
    Prepare raw tweet text for tokenisation, then tokenize it with twokenize
    (https://github.com/ianozsvald/ark-tweet-nlp-python).

    :param url_scheme: 'leave' keeps URLs untouched; any other value ('st',
        single token) replaces each URL with "hyperlinktoken".
    :param strip_word_padding: remove matches of ``word_pad_re``
        (TODO in the original: this does not work properly yet).
    :param alphanumeric_only: remove matches of ``alphanumeric_only_re``.
    :return: whatever ``tokenize`` returns for the prepared text.
    """
    prepared = text
    if url_scheme != 'leave':
        prepared = re.sub(url_re, "hyperlinktoken", prepared)
    prepared = re.sub(mention_re, "mentiontoken", prepared)

    # Optional clean-ups, applied in this fixed order.
    if strip_word_padding:
        prepared = re.sub(word_pad_re, "", prepared)
    if alphanumeric_only:
        prepared = re.sub(alphanumeric_only_re, "", prepared)

    # Surround each distinct emoji with spaces so it becomes its own token.
    distinct_emoji = set(emoji_re.findall(prepared))
    for emoji in distinct_emoji:
        prepared = prepared.replace(emoji, " " + emoji + " ")
    return tokenize(prepared)
def preprocess(m, sep_emoji=False):
    """Lower-case, normalise and tokenize message ``m``; return a string.

    Mentions become '@user', URLs become 'url', and the twokenize tokens are
    joined with spaces. With ``sep_emoji`` the text is tokenized again with
    ``twk``; because that tokenizer splits repeated punctuation, identical
    adjacent punctuation tokens are glued back together.
    """
    text = max_reps(m.lower())
    # replace user mentions with token '@user'
    text = re.sub(r".?@.+?( |$)|<@mention>", " @user ", text, flags=re.I)
    # replace urls with token 'url'
    text = re.sub(twokenize.url, " url ", text, flags=re.I)
    tokenized_msg = ' '.join(twokenize.tokenize(text)).strip()

    if not sep_emoji:
        return tokenized_msg.lstrip()

    coarse = tokenized_msg.split()
    fine = twk.tokenize(tokenized_msg)
    if len(fine) != len(coarse):
        is_punct = [tok in twk.punctuation for tok in fine]
        if any(is_punct):
            pieces = [fine[0]]
            for pos in range(1, len(fine)):
                same_punct_run = (is_punct[pos] and is_punct[pos - 1]
                                  and fine[pos] == fine[pos - 1])
                # Repeated punctuation is concatenated, everything else is
                # separated by a single space.
                pieces.append(fine[pos] if same_punct_run else " " + fine[pos])
            tokenized_msg = "".join(pieces)
    return tokenized_msg.lstrip()
def main(argv):
    """POS-tag a text file line by line and write the formatted tags.

    Reads the input and output paths from ``sys.argv`` (the ``argv``
    parameter is accepted but not consulted, as in the original). Every input
    line is tokenized with the Twitter tokenizer, tagged with a
    ``PerceptronTagger`` and written through ``format_tagged``.
    """
    if len(sys.argv) != 3:
        print("Usage:> python getTaggedFile.py infile.txt outfile.txt")
        sys.exit()
    infile_name = str(sys.argv[1])
    outfile_name = str(sys.argv[2])
    # Both files are closed even when tagging raises part-way through.
    with open(infile_name, 'r') as infile:
        with open(outfile_name, 'w') as outfile:
            tagger = PerceptronTagger()
            tagset = None
            print("Reading file...")
            for line in infile:
                # Use Twokenizer for twitter parser
                tokens = tokenize(line)
                tags = nltk.tag._pos_tag(tokens, tagset, tagger)
                outfile.write(format_tagged(tags))
    print("Finished tagging... Closing files.")
def Extract(self, text):
    """Build a sparse "id:value" feature string describing capitalisation in ``text``.

    The text is tokenized with twokenize. Binary features (value 1) are
    emitted for every 'upperViolated=<word>' hit and for the final
    'iCapitalized=<bool>' flag; four ratio features (nAllCaps, nCapitalized,
    nCapLowerViolated, nCapUpperViolated) are each divided by nWords.
    Feature ids come from ``self.fVocab.GetID``; the order of those calls is
    preserved exactly because the ids may depend on it.

    NOTE(review): SOURCE arrived with its indentation stripped; the nesting
    below (in particular the nWords check living inside the
    "not sentence-initial" branch) is a reconstruction -- confirm against
    the upstream file.
    """
    features = []
    words = twokenize.tokenize(text)
    #hand-crafted features
    iCapitalized = True
    # Counters start at 0.1 rather than 0, which keeps the divisions by
    # nWords below well-defined for empty input.
    nCapitalized = 0.1
    nAllCaps = 0.1
    nCapLowerViolated = 0.1
    nCapUpperViolated = 0.1
    nWords = 0.1
    for i in range(len(words)):
        # Initial capital, or a lower-case letter followed by a capital.
        capitalized = re.search(r'^([A-Z]|[a-z][A-Z])', words[i])
        # Words at position 0 or right after ., ?, !, a mention, an http
        # link, ':' or '"' are skipped by the two checks below.
        if capitalized and not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
            nCapitalized += 1.0
        if not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
            # capDict value '1' / '0' is compared against how the word is
            # actually written; missing words default to '1'.
            if capitalized and self.capDict.get(words[i].lower(), '1') != '1':
                nCapUpperViolated += 1.0
                features.append(self.fVocab.GetID('upperViolated=%s' % words[i].lower()))
            elif not capitalized and re.match(r'[a-z]+', words[i]) and self.capDict.get(words[i].lower(), '1') != '0':
                nCapLowerViolated += 1.0
                #features.append(self.fVocab.GetID('lowerViolated=%s' % words[i].lower()))
            if re.match(r'\w+', words[i][0:1]):
                nWords += 1
        # re.match anchors only at the start, so any token beginning with
        # a lower-case "i" or "u" clears the flag.
        if re.match(r"i|i'm|im|u", words[i]):
            iCapitalized = False
        if re.match(r"[A-Z]{2,}", words[i]):
            nAllCaps += 1
    features.append(self.fVocab.GetID('iCapitalized=%s' % iCapitalized))
    return ' '.join(["%s:1" % x for x in features]) + " %s:%s" % (self.fVocab.GetID('nAllCaps'), nAllCaps/nWords) + " %s:%s" % (self.fVocab.GetID('nCapitalized'), nCapitalized/nWords) + " %s:%s" % (self.fVocab.GetID('nCapLowerViolated'), nCapLowerViolated/nWords) + " %s:%s" % (self.fVocab.GetID('nCapUpperViolated'), nCapUpperViolated/nWords)
def take_into_account_negation(self, tweet):
    """Mark the scope of the first negation in ``tweet`` with a ``_neg`` suffix.

    The scope starts at the first negation cue (the cue itself included) and
    runs up to, but not including, the next clause-level punctuation mark
    (one of ``. : ; ! ?``) or the end of the tweet. Each twokenize token in
    the scope is emitted as ``<token>_neg `` (note the trailing space); text
    before the cue and from the punctuation mark onwards is kept unchanged.
    Tweets without a negation cue are returned untouched.

    The previous clause pattern was anchored with ``^...$`` and therefore
    could never match inside a tweet, so the scope always ran to the end.

    NOTE(review): the cue pattern has no word boundaries, so e.g. "no"
    also matches inside "know"; left as-is to avoid changing which tweets
    are treated as negated.
    """
    neg_pattern = re.compile(
        'never|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint|no|' +
        'n\'t|haven\'t|haven\'t|hasn\'t|hadn\'t|can\'t|couldn\'t|shouldn\'t|won\'t|wouldn\'t|don\'t|doesn\'t|didn\'t|isn\'t|aren\'t',
        re.IGNORECASE)
    clause_pattern = re.compile(r'[.:;!?]')

    neg = neg_pattern.search(tweet)
    if neg is None:
        return tweet

    scope = tweet[neg.start():]
    end = clause_pattern.search(scope)
    cut = len(scope) if end is None else end.start()
    negated = ''.join(w + '_neg ' for w in twokenize.tokenize(scope[:cut]))
    return tweet[:neg.start()] + negated + scope[cut:]
def preprocess(m, sep_emoji=False):
    """Return ``m`` lower-cased, normalised and tokenized as one string.

    User mentions are replaced by '@user' and URLs by 'url' before
    twokenize runs. When ``sep_emoji`` is set, ``twk`` re-tokenizes the
    result to split emoji; runs of identical punctuation that ``twk`` split
    apart (e.g. '!!!' -> '!', '!', '!') are concatenated again.
    """
    lowered = max_reps(m.lower())
    user_regex = r".?@.+?( |$)|<@mention>"
    without_mentions = re.sub(user_regex, " @user ", lowered, flags=re.I)
    without_urls = re.sub(twokenize.url, " url ", without_mentions, flags=re.I)
    tokenized_msg = ' '.join(twokenize.tokenize(without_urls)).strip()

    if sep_emoji:
        m_toks = tokenized_msg.split()
        n_toks = twk.tokenize(tokenized_msg)
        if len(n_toks) != len(m_toks):
            # Pair every token with its "is punctuation" flag.
            flagged = [(tok, tok in twk.punctuation) for tok in n_toks]
            if any(flag for _, flag in flagged):
                new_m = n_toks[0]
                for (prev_tok, prev_flag), (tok, flag) in zip(flagged, flagged[1:]):
                    if flag and prev_flag and tok == prev_tok:
                        # same punctuation repeated: concatenate
                        new_m += tok
                    else:
                        new_m += " " + tok
                tokenized_msg = new_m
    return tokenized_msg.lstrip()
def build_dict(self, corpus, word=True, which_grams=None):
    """
    Build the n-gram dictionary for ``corpus`` and store it on the instance.

    :param corpus: iterable of texts.
    :param word: True (default) builds word n-grams from twokenize tokens
        and stores the result in ``self.word_ngram``; False builds character
        n-grams and stores it in ``self.char_ngram``.
    :param which_grams: the n values to build. When None, defaults to
        [1, 2, 3] for words and [2, 3, 4, 5] for characters. (Previously an
        explicit value was silently overwritten by these defaults.)
    :return: the populated ``UnigramDictionary``.
    """
    dct = UnigramDictionary()
    if which_grams is None:
        which_grams = [1, 2, 3] if word else [2, 3, 4, 5]

    for text in corpus:
        tokens = twokenize.tokenize(text) if word else list(text)
        for n in which_grams:
            for token in find_ngrams(tokens, n):
                dct.add(token)

    if word:
        self.word_ngram = dct
    else:
        self.char_ngram = dct
    return dct
def perprocessing(tdic):
    """Normalise, lower-case and Porter-stem every tweet in ``tdic``.

    ``tdic`` maps a key to a sequence whose item 0 is passed through
    unchanged and whose item 1 is the raw tweet text. Returns a dict mapping
    each key to ``(item0, cleaned_text)``.
    """
    # One stemmer for the whole run; the original built a new one per word.
    stemmer = nltk.stem.PorterStemmer()
    new_dic = {}
    for key in tdic:
        gt = tdic[key][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[key][1]))
        text = twokenize.normalizeTextForTagger(raw)
        stems = [stemmer.stem(word.lower()) for word in twokenize.tokenize(text)]
        newtext = textPreprocessor01.replaceall(' '.join(stems))
        new_dic[key] = gt, newtext
    return new_dic
def main(argv):
    """Tokenize and POS-tag ``line`` with a fresh ``PerceptronTagger``.

    NOTE(review): ``line`` is not defined inside this function, so it must
    exist as a module-level name or the call raises NameError; the value
    returned by ``format_tagged`` is discarded. ``argv`` is unused.
    """
    tagger = PerceptronTagger()
    tags = nltk.tag._pos_tag(tokenize(line), None, tagger)
    format_tagged(tags)
def learn_terms(self, tweets_file_object, learn_lemmas=True, cache_size=1000000):
    """Assign ids to terms, count document frequencies and write tweet vectors.

    Rows are read as CSV (comma-delimited, double-quote quoting): column 0
    is the integer tweet id and the last column the tweet text. Every
    lower-cased, UTF-8 encoded token (optionally lemmatized with
    ``self.lmtz``) gets an id in order of first appearance. ``term_freq``
    counts each term at most once per tweet. Tweet vectors are flushed
    through ``self.write_tweet_vectors`` whenever ``cache_size`` of them
    have accumulated, and once more at the end; the term map and counts are
    finally passed to ``self.write_terms``.
    """
    reader = csv.reader(tweets_file_object, delimiter=",", quotechar="\"")
    term_freq = Counter()
    term_id_map = dict()
    tweet_vectors = []
    for row in reader:
        tweet_id = int(row[0])
        tweet_text = row[-1]
        terms = [t.lower().encode("utf-8") for t in twokenize.tokenize(tweet_text)]
        if learn_lemmas:
            terms = [self.lmtz.lemmatize(term) for term in terms]
        tweet_sp_vector = []
        # A set gives O(1) "already counted in this tweet" checks.
        counted_ids = set()
        for term in terms:
            # New terms receive the next free id (the current map size).
            term_id = term_id_map.setdefault(term, len(term_id_map))
            if term_id not in counted_ids:
                term_freq[term_id] += 1
                counted_ids.add(term_id)
            tweet_sp_vector.append(term_id)
        tweet_vectors.append((tweet_id, tweet_sp_vector))
        if len(tweet_vectors) >= cache_size:
            self.write_tweet_vectors(tweet_vectors)
            tweet_vectors = []
    self.write_tweet_vectors(tweet_vectors)
    self.write_terms(term_id_map, term_freq)
def get_sentences(line, content_type):
    """Split ``line`` into sentences according to ``content_type``.

    * 'SocialMediaPosting': the line minus its last character is tokenized
      with twokenize and returned as a single sentence, i.e. a one-element
      list holding ``Token(token, offset)`` objects, where ``offset`` is the
      position of the token in ``line``.
    * 'Blog', 'NewsArticle', 'Post': returns ``.sents`` of the spaCy
      document. The global ``spacy_en`` model is loaded lazily on first use.
    * Anything else: returns None.
    """
    global spacy_en
    if content_type == 'SocialMediaPosting':
        start_offset = 0
        sent = []
        for token in twokenize.tokenize(line[:-1]):
            # Search from the end of the previous token so repeated tokens
            # map to successive occurrences.
            idx = line.index(token, start_offset)
            sent.append(Token(token, idx))
            start_offset = idx + len(token)
        return [sent]
    elif content_type in ('Blog', 'NewsArticle', 'Post'):
        try:
            spacy_doc = spacy_en(line)
        except (NameError, TypeError):
            # spacy_en is undefined or a non-callable placeholder: load the
            # model once and retry. Other errors (and Ctrl-C) propagate
            # instead of triggering a model load.
            spacy_en = spacy.load('en')
            print('**** Loaded spacy en')
            spacy_doc = spacy_en(line)
        return spacy_doc.sents
    return None
def extract_more_decep_tech_features(self, tweets, vocab_file):
    """Build the decep_tech/decep_type feature matrix for ``tweets``.

    The matrix is the horizontal stack of five hand-crafted columns and a
    binary unigram bag-of-words restricted to the training vocabulary:

    0. number of elongated-letter matches (a letter repeated 3+ times)
    1. number of all-caps matches
    2. number of punctuation-run matches
    3. number of hashtags
    4. emoticon mood code (NA=0, SAD=1, HAPPY=2, BOTH_HS=4)

    :param tweets: sequence of tweet strings.
    :param vocab_file: path to a file with one vocabulary term per line;
        the line number (from 0) is the term's column index.
    :return: sparse matrix with one row per tweet.
    """
    train_vocab = {}
    with open(vocab_file) as vocab_handle:
        for k, line in enumerate(vocab_handle):
            train_vocab[line.strip()] = k

    # The keyword is `vocabulary`; the garbled name used before made this
    # call raise TypeError.
    cv = CountVectorizer(ngram_range=(1, 1), binary=True, vocabulary=train_vocab)
    train_features_bow = cv.fit_transform(tweets)

    hash_pattern = re.compile(r"\#+[\w_]+[\w'_\-]*[\w_]+")
    elong_pattern = re.compile(r"([a-zA-Z])\1{2,}")
    caps_pattern = re.compile(r"[A-Z][A-Z\d]+")
    punc_pattern = re.compile(r"([.,!?]+)")
    # Unknown moods are passed through unchanged, as before.
    mood_codes = {'NA': 0, 'HAPPY': 2, 'SAD': 1, 'BOTH_HS': 4}

    add_decep_tech_matrix = []
    for tweet in tweets:
        n_elong = len(elong_pattern.findall(tweet))
        n_caps = len(caps_pattern.findall(tweet))
        n_rep_punct = len(punc_pattern.findall(tweet))
        n_hashtag = len(hash_pattern.findall(tweet))
        emoticon_mood = emoticons.analyze_tweet(tweet.strip())
        emoticon_mood = mood_codes.get(emoticon_mood, emoticon_mood)
        add_decep_tech_matrix.append(
            [n_elong, n_caps, n_rep_punct, n_hashtag, emoticon_mood])

    sa = sparse.csr_matrix(add_decep_tech_matrix)
    return hstack([sa, train_features_bow])
def process(self,text):
    """Normalise a tweet and return it as a lower-cased, stemmed string.

    Pass 1 (whitespace words): '#' is replaced by a space and any word
    containing an entry of ``self.remove`` is dropped.
    Pass 2 (twokenize tokens): stop words and pure digits are skipped; each
    remaining token goes through a chain of fallbacks -- emoticon table,
    known-word check, acronym table, contraction table, spelling
    correction -- and anything containing '@' becomes '@user'.
    Finally every word not in ``self.stop`` is stemmed with ``stemmer``.

    NOTE(review): SOURCE arrived with its indentation stripped; the nesting
    of the try/except chain and of the two statements after it is a
    reconstruction -- confirm against the upstream file.
    """
    tTweet = ""
    for word in text.split():
        if "#" in word:
            word = word.replace("#"," ")
        # f flags words that contain any substring listed in self.remove.
        f=0
        for tt in self.remove:
            if tt in word:
                f=1
        if f==1:
            continue
        tTweet = " ".join([tTweet,word])
        tTweet = tTweet.strip()
    tempTweet = ""
    for word in twokenize.tokenize(tTweet):
        if word != " " and word not in self.stop and not word.isdigit():
            word = word.strip().lower()
            # Over-long tokens are cut to their first 27 characters.
            if len(word) > 26:
                word=word[:27]
            #### Normalize Emoticons
            try:
                word = self.emoticons[word]
            except:
                #Normalize Acronyms
                try:
                    try:
                        # Lookup only: a word present in wordDict is kept
                        # as-is; a miss falls through to the acronym table.
                        if self.wordDict[word] ==1:
                            word = word
                    except:
                        word = self.acronyms[word]
                except:
                    #Normalize Contractions
                    try:
                        word = self.contractions[word]
                    except:
                        #Normalize words (Spell)
                        try:
                            if self.wordDict[word] == 1:
                                word = word
                        except:
                            CW = self.correct(word)
                            # Mentions and hashtags are never spell-corrected;
                            # a correction of exactly "a" is ignored.
                            if "@" in word or "#" in word:
                                word = word
                            else:
                                if CW != "a":
                                    word = CW
            if "@" in word:
                word="@user"
            tempTweet = " ".join([tempTweet,word.strip()])
            tempTweet = tempTweet.lower().strip()
    tempTweet = " ".join(stemmer.stem(w) for w in tempTweet.split(" ") if w not in self.stop)
    #print(tempTweet.encode("utf-8"))
    return(tempTweet)
##Usage
# pre = Preprocess()
# pre.process("lol god pls help with my hw :) :(:D")
def process_line(s, clean_string=True):
    """Tokenize ``s``, run NE chunking, and return the processed tokens.

    When ``clean_string`` is true the text first goes through ``clean_str``.
    Chunks and tokens are paired positionally (shorter side padded with
    None) and each pair is passed to ``process_token``; results are
    lower-cased and UTF-8 encoded.
    """
    text = clean_str(s) if clean_string else s
    tokens = tokenize(text)
    tagged = nltk.pos_tag(tokens)
    tree = nltk.ne_chunk(tagged, binary=False)
    processed = []
    for chunk, token in map(None, tree, tokens):
        processed.append(process_token(chunk, token).lower().encode('UTF-8'))
    return processed
def splitTweet2Sents(tweet):
    """Return the sentences of ``tweet`` after quote normalisation and cleaning.

    Curly double quotes are replaced by straight ones, non-ASCII characters
    are dropped, the tokens are re-joined with single spaces, and the result
    is passed through ``filterTweetText`` and ``sentSplitter``.
    """
    for curly_quote in (u'\u201c', u'\u201d'):
        tweet = re.sub(curly_quote, '\"', tweet)
    ascii_tweet = tweet.encode('ascii', 'ignore')
    spaced = u" ".join(tokenize(ascii_tweet))
    return sentSplitter(filterTweetText(spaced))
def tokenize(text):
    """Tokenize ``text`` with twokenize, drop punctuation tokens, and stem.

    A token is dropped when it is exactly one character from
    ``string.punctuation``. The remaining tokens are stemmed via
    ``stem_tokens`` with a ``PorterStemmer``.
    """
    stemmer = PorterStemmer()
    # Build the lookup set once instead of once per token.
    punctuation = set(string.punctuation)
    tokens_clean = [tok for tok in twokenize.tokenize(text)
                    if tok not in punctuation]
    return stem_tokens(tokens_clean, stemmer)
def preprocess(m):
    """Lower-case ``m``, mask mentions and URLs, and return twokenize tokens.

    ``max_reps`` is applied after lower-casing; mentions are replaced with
    ' @user ' and URLs (``twokenize.url``) with ' url '.
    """
    text = max_reps(m.lower())
    # replace user mentions with token '@user'
    text = re.sub(r".?@.+?( |$)|<@mention>", " @user ", text, flags=re.I)
    # replace urls with token 'url'
    text = re.sub(twokenize.url, " url ", text, flags=re.I)
    return twokenize.tokenize(text)
def all_tokens(tweetreader):
    """Yield every token of the last field of the first 50000 rows.

    Iteration stops right after the tokens of row number 50000 have been
    yielded, without requesting another row from ``tweetreader``.
    """
    for row_number, row in enumerate(tweetreader, 1):
        for token in tokenize(row[-1]):
            yield token
        if row_number >= 50000:
            return
def process_line(s, clean_string=True):
    """
    Process a line by calling ``process_token`` on every (chunk, token) pair.

    The line is optionally cleaned with ``clean_str``, tokenized, POS-tagged
    and NE-chunked; chunks and tokens are paired positionally (padding the
    shorter side with None). Returns lower-cased, UTF-8 encoded strings.
    """
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    chunks = nltk.ne_chunk(nltk.pos_tag(tokens), binary=False)
    paired = map(None, chunks, tokens)
    return [process_token(chunk, tok).lower().encode('UTF-8')
            for chunk, tok in paired]
def run_all():
    """Collect, per zone, the tweets that mention one of the zone's top words.

    Reads the lat/lon-bin -> neighbourhood map (point_map.csv), the
    neighbourhood -> zone map (zone_map.csv) and the per-zone top words
    (outputs/zone_words.json), scans every row of the ``tweet_pgh`` table,
    and writes ``{zone: {word: ["user: tweet", ...]}}`` to
    outputs/tweets_per_zoneword.json.
    """
    csv.field_size_limit(sys.maxsize)
    psql_conn = psycopg2.connect("dbname='tweet'")
    psycopg2.extras.register_hstore(psql_conn)
    pg_cur = psql_conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

    # Build up the bins-to-nghds mapping so we can easily translate.
    bins_to_nghds = {}
    with open('point_map.csv') as point_file:
        for line in DictReader(point_file):
            bins_to_nghds[(float(line['lat']), float(line['lon']))] = line['nghd']

    # get nghd to zone mapping
    nghds_to_zones = {}
    with open('zone_map.csv') as zone_file:
        for line in DictReader(zone_file):
            nghds_to_zones[line['nghd']] = line['zone']

    with open('outputs/zone_words.json') as words_file:
        words_per_zone = json.load(words_file)
    top10words = {}
    tweets_per_word = defaultdict(lambda: defaultdict(list))
    for zone in words_per_zone:
        top10words[zone] = words_per_zone[zone]["top words"]

    pg_cur.execute("SELECT text, ST_ASGEOJSON(coordinates), user_screen_name " +
                   "FROM tweet_pgh;")
    counter = 0
    for row in pg_cur:
        counter += 1
        if (counter % 10000) == 0:
            print(str(counter) + ' tweets processed')
        # GeoJSON stores [lon, lat]; round_latlon takes (lat, lon).
        coords = json.loads(row[1])['coordinates']
        latlon_bin = util.util.round_latlon(coords[1], coords[0])
        tweet_nghd = bins_to_nghds.get(latlon_bin, 'Outside Pittsburgh')
        if tweet_nghd in nghds_to_zones:
            zone = "Zone " + nghds_to_zones[tweet_nghd]
        else:
            zone = tweet_nghd
        tweet = row[0]
        tweet = tweet.replace('“','"').replace('”','"')
        tweet = unicode(tweet, errors='ignore')
        username = row[2]
        # A set makes the per-word membership test O(1).
        words_in_tweet = set(tok.lower() for tok in twokenize.tokenize(tweet))
        for word in top10words[zone]:
            if word in words_in_tweet:
                tweets_per_word[zone][word].append(username + ": " + tweet)

    print("writing to JSON file")
    with open('outputs/tweets_per_zoneword.json','w') as outfile:
        json.dump(tweets_per_word,outfile, indent=2)
def tokenize_str(istring):
    """Tokenize ``istring`` line by line and return the re-joined text.

    Each line's twokenize tokens are joined with single spaces and UTF-8
    encoded. A line that fails is reported (exception, then the line) and
    left out of the result.
    """
    encoded_lines = []
    for line in istring.split('\n'):
        try:
            joined = u" ".join(twokenize.tokenize(line[:]))
            encoded_lines.append(joined.encode('utf-8'))
        except Exception as e:
            print(e)
            print(line)
    return '\n'.join(encoded_lines)
def process_statuses(self, statuses):
    """Tokenize each status and re-attach apostrophe-initial tokens.

    Every status's lower-cased ``text`` is tokenized with twokenize; any
    token (other than the first) that starts with an apostrophe is
    concatenated onto the token before it, e.g. ["don", "'t"] -> ["don't"].

    :param statuses: iterable of objects with a ``text`` attribute.
    :return: list of token lists, one per status.
    """
    merged_statuses = []
    for status in statuses:
        tokens = twokenize.tokenize(status.text.lower())
        # Single left-to-right pass; the first token is never merged away.
        merged = list(tokens[:1])
        for token in tokens[1:]:
            if token.startswith("'"):
                merged[-1] = merged[-1] + token
            else:
                merged.append(token)
        merged_statuses.append(merged)
    return merged_statuses
def get_idx_from_sent(sent, word_idx_map, k):
    """
    Transform a sentence into a list of word indices.

    Words missing from ``word_idx_map`` are mapped to the index stored under
    ``UNK_TOKEN``. No padding is applied; ``k`` is accepted but unused.
    """
    return [
        word_idx_map[word] if word in word_idx_map else word_idx_map[UNK_TOKEN]
        for word in tokenize(sent)
    ]
def test(text):
    """Run the cleaning pipeline on ``text`` and return twokenize tokens.

    The text is lower-cased, passed through the replace* helpers in a fixed
    order, stripped, and tokenized.
    """
    pipeline = (
        replaceURLs,
        replaceUserMentions,
        replacenonalpha,
        replacenumbers,
        replaceshort,
        replacespace,
    )
    cleaned = text.lower()
    for step in pipeline:
        cleaned = step(cleaned)
    return twokenize.tokenize(cleaned.strip())
def vectorize(self, text, embeddings=True, ngrams=True):
    """
    Return the feature vector for a given text.

    :param embeddings: include the mean of the word vectors of all tokens
        found in ``self.word_vectors`` (``self.avg_embd`` when none is found).
    :param ngrams: include word n-gram features followed by character
        n-gram features (n = 2..5).
    :return: the horizontally stacked features (n-grams first, embedding
        last), or None when both flags are False. (That case used to print
        the message and then fail with UnboundLocalError.)
    """
    word_tokens = twokenize.tokenize(text)
    char_tokens = list(text)

    if not (embeddings or ngrams):
        print("Do you not want anything?")
        return None

    feature_vect = None
    if ngrams:
        word_features = find_ngram_ft_vec(word_tokens, self.word_ngram)
        char_features = find_ngram_ft_vec(char_tokens, self.char_ngram,
                                          which_grams = [2,3,4,5])
        feature_vect = hstack((word_features, char_features))

    if embeddings:
        local_w_vects = [self.word_vectors[w] for w in word_tokens
                         if w in self.word_vectors]
        if not local_w_vects:
            # No token has a vector: fall back to the stored average.
            word_embding = self.avg_embd
        else:
            word_embding = csr_matrix(np.mean(local_w_vects, axis=0))
        if ngrams:
            feature_vect = hstack((feature_vect, word_embding))
        else:
            feature_vect = word_embding

    return feature_vect
def tokenize(istring, ostring):
    """Tokenize the file at ``istring`` line by line into the file ``ostring``.

    Each input line's twokenize tokens are joined with single spaces, UTF-8
    encoded and written followed by a newline. A line that cannot be
    processed is printed and skipped (best effort, as before).
    """
    # Context managers close both files even if an unexpected error escapes.
    with open(istring, 'r') as ifile:
        with open(ostring, 'w') as ofile:
            for line in ifile:
                try:
                    ofile.write(
                        u" ".join(twokenize.tokenize(line[:])).encode('utf-8') + '\n')
                except Exception:
                    # Deliberately best-effort, but no longer swallows
                    # KeyboardInterrupt / SystemExit.
                    print(line)
def preprocess(tweet):
    """Tokenize ``tweet`` and apply hashtag and substitution transformers.

    The substitution table is the abbreviation dictionary extended so that
    every entry under 'emoticons' and 'words' in the emoticon lexicon maps
    to a single space. Both JSON files are read on every call.
    """
    # Context managers close the JSON files instead of leaking the handles.
    with open("../other/abbreviations.json") as abbv_file:
        abbv_dict = json.load(abbv_file)
    with open("../other/emoticons.json") as lexica_file:
        emo_lexica_dict = json.load(lexica_file)

    for emoticon in emo_lexica_dict[u'emoticons']:
        abbv_dict[emoticon] = ' '
    for word in emo_lexica_dict[u'words']:
        abbv_dict[word] = ' '

    hash_transformer = Transformer.HashtagTransformer()
    sub_transformer = Transformer.SubstitutionTransformer(abbv_dict)
    preprocessor = Preprocessor([hash_transformer, sub_transformer])
    tweet = ' '.join(tokenize(tweet))
    return preprocessor.transform(tweet)
def tokenize_and_clean(msg, alignments):
    """Tokenize ``msg`` and lower-case every token in place.

    With ``alignments`` the full ``twokenize.tokenize`` is used and the
    result of ``toks.subset(inds)`` over all indices is returned; otherwise
    ``twokenize.simple_tokenize`` is used and a plain list is returned.
    """
    if alignments:
        toks = twokenize.tokenize(msg)
    else:
        toks = twokenize.simple_tokenize(msg)
    # Assign by index so the tokenizer's own container object is mutated.
    for pos, tok in enumerate(toks):
        toks[pos] = tok.lower()
    inds = range(len(toks))
    if not alignments:
        return [toks[i] for i in inds]
    return toks.subset(inds)
def normalize_tweet(text, lowercase=False, rm_digits=False, return_tokens=False):
    """Replace URLs with 'URL', tokenize, and optionally mask numbers.

    :param lowercase: lower-case the text before anything else.
    :param rm_digits: replace every ``NUM_PATTERN`` match with 'NUM'.
        (Previously this raised TypeError in token mode and was a no-op in
        string mode.)
    :param return_tokens: return the token list instead of a
        space-joined string.
    """
    if lowercase:
        text = text.lower()
    text = re.sub(URL_PATTERN, 'URL', text)
    tokens = twokenize.tokenize(text)
    if return_tokens:
        if rm_digits:
            tokens = [re.sub(NUM_PATTERN, 'NUM', tk) for tk in tokens]
        return tokens
    clean = ' '.join(tokens)
    if rm_digits:
        # re.sub returns a new string; the result must be kept.
        clean = re.sub(NUM_PATTERN, 'NUM', clean)
    return clean
def preprocess(tweet):
    """Tokenize ``tweet`` and apply hashtag and substitution transformers.

    The substitution table is the abbreviation dictionary extended so that
    every entry under 'emoticons' and 'words' in the emotion lexicon maps to
    a single space. Both JSON files are read on every call.
    """
    # Context managers close the JSON files instead of leaking the handles.
    with open("../other/abbreviations.json") as abbv_file:
        abbv_dict = json.load(abbv_file)
    with open("../other/emotions.json") as lexica_file:
        emo_lexica_dict = json.load(lexica_file)

    for emoticon in emo_lexica_dict[u'emoticons']:
        abbv_dict[emoticon] = ' '
    for word in emo_lexica_dict[u'words']:
        abbv_dict[word] = ' '

    hash_transformer = Transformer.HashtagTransformer()
    sub_transformer = Transformer.SubstitutionTransformer(abbv_dict)
    preprocessor = Preprocessor([hash_transformer, sub_transformer])
    tweet = ' '.join(tokenize(tweet))
    return preprocessor.transform(tweet)
def preproc(text):
    """Clean ``text`` and return it as a space-joined string without stop words.

    The text is lower-cased, passed through the replace* helpers in a fixed
    order, stripped, tokenized with twokenize, and filtered with
    ``removestop`` against ``clean_stop_words``.
    """
    cleaned = text.lower()
    for step in (replaceURLs, replaceUserMentions, replacenonalpha,
                 replacenumbers, replaceshort, replacespace):
        cleaned = step(cleaned)
    tokens = twokenize.tokenize(cleaned.strip())
    return ' '.join(removestop(tokens, clean_stop_words))
def __iter__(self):
    """Yield the tokens of each tweet stored in the gzip files of ``self.files``.

    Every line is parsed as JSON. When ``self.exclude_rts`` is set, tweets
    for which ``retweet_or_share`` is true or whose 'is_rt' field is truthy
    are skipped. The text goes through ``prepare_text`` and is lower-cased
    when ``self.downcase`` is set.
    """
    for fname in self.files:
        with gzip.open(fname) as f:
            for line in f:
                tweet = json.loads(line.strip())
                text = tweet['text']
                if self.exclude_rts and (retweet_or_share(text) or tweet['is_rt']):
                    continue
                prepared = prepare_text(text)
                yield tokenize(prepared.lower() if self.downcase else prepared)
def twitterTokenizeText(fn, output_fn):
    """Tokenize the file ``fn`` line by line and write the result to ``output_fn``.

    Each line is stripped, lower-cased, decoded as UTF-8 and has backticks
    replaced by spaces before tokenization; tokens are joined with single
    spaces. All lines are written at once, newline-separated and UTF-8
    encoded, followed by the newline ``print`` adds.
    """
    with open(fn, 'r') as input_file:
        tok_lines = [
            ' '.join(tokenize(raw.strip().lower().decode('utf8').replace('`', ' ')))
            for raw in input_file
        ]
    with open(output_fn, 'w') as output_file:
        print('\n'.join(tok_lines).encode('utf8'), file=output_file)
    print('done twitter tokenizing text....')
def tokenizeTweets(tweets):
    """Return a list of cleaned token lists, one per non-empty tweet.

    Tokens starting with '@', 'http' or 'www' are dropped; the rest have
    every character that is neither a word character nor whitespace removed
    and are lower-cased. Tweets left with no tokens are omitted.
    """
    skip_prefixes = ('@', 'http', 'www')
    cleaned_tweets = []
    for status in tweets:
        cleaned = []
        for token in tokenize(status):
            if token.startswith(skip_prefixes):
                continue
            cleaned.append(re.sub(r'[^\w\s]', '', token).lower())
        if cleaned:
            cleaned_tweets.append(cleaned)
    return cleaned_tweets
def process_line(s, clean_string=True, enable_tags = False):
    """
    Process a line token by token, optionally via NE chunks.

    The line is cleaned with ``clean_str`` when ``clean_string`` is true and
    then tokenized. With ``enable_tags`` the tokens are POS-tagged and
    NE-chunked and every chunk is expanded through ``process_chunk``;
    otherwise each token goes through ``process_token``. All results are
    lower-cased and UTF-8 encoded.
    """
    text = clean_str(s) if clean_string else s
    tokens = tokenize(text)
    if not enable_tags:
        return [process_token(tok).lower().encode('UTF-8') for tok in tokens]

    tree = nltk.ne_chunk(nltk.pos_tag(tokens), binary=False)
    words = []
    for chunk in tree:
        words += process_chunk(chunk)
    return [word.lower().encode('UTF-8') for word in words]
def parse_tweets(tweets):
    """Tokenize tweet JSON objects into records ready for later stages.

    For each tweet the text is cut just before the first 'RT' marker,
    tokenized, passed through ``char_reduction`` and ``es.expand``, and
    filtered: tokens containing '@', URLs, and punctuation that is not an
    emoticon are removed. Tweets left with no tokens are skipped.

    A tweet that raises during processing is reported and skipped
    (best effort, as before).

    :return: list of dicts with keys "token", "location" (the place's
        country or None), "json" (the original object) and "type" ("").
    """
    parsed_tweets = []
    for tweet_json in tweets:
        try:
            tweet_text = tweet_json['text']
            rt_at = tweet_text.find(u'RT')
            if rt_at != -1:
                # Clamp at 0: with 'RT' at the very start the old slice
                # end of -1 kept almost the whole retweet.
                tweet_text = tweet_text[0:max(rt_at - 1, 0)]
            tweet_token = tk.tokenize(tweet_text)
            tweet_token = [char_reduction(tok) for tok in tweet_token]
            tweet_token = [t for tok in tweet_token for t in es.expand(tok)
                           if (not (('@' in t) or (tk.Url_RE.search(t)) or
                                    (not emo.Emoticon_RE.search(t) and tk.Punct_re.search(t))))]
            if tweet_token != []:
                place = tweet_json['place']
                tweet_obj = {"token": tweet_token,
                             "location": place['country'] if place is not None else None,
                             "json": tweet_json,
                             "type": ""}
                parsed_tweets.append(tweet_obj)
        except Exception as e:
            print(e)
    return parsed_tweets
def read_tweets(self, filename, emo):
    """Read JSON-lines tweets and split them by whether they carry ``emo``.

    Each line must hold 'tweetid', 'text' and 'emotions'. The stripped text
    is tokenized and re-joined with ``SPACE``. Returns
    ``(emo_tweets, non_emo_tweets)``: tweets whose emotions include ``emo``
    and all the others.
    """
    emo_tweets, non_emo_tweets = [], []
    with codecs.open(filename, encoding='utf8') as tweet_file:
        for line in tweet_file:
            data = json.loads(line)
            # Not used further, but the lookup is kept so a record without
            # a tweet id still fails as it did before.
            _tweet_id = data['tweetid'].strip()
            text = data['text'].strip()
            emotions = data['emotions']
            tokens = tokenize(text)
            has_emotion = any(e == emo for e in emotions)
            target = emo_tweets if has_emotion else non_emo_tweets
            target.append(SPACE.join(tokens))
    return emo_tweets, non_emo_tweets
def __init__(self, line):
    """Parse one '","'-separated corpus line into a scored tweet.

    Field 0 encodes polarity ('"0' -> -1, '"2' -> 0, '"4' -> 1; any other
    value leaves ``senti`` unset), fields 1 and 2 are stored as id and date,
    and field 5 (minus its last character) is the text. The text is
    normalised, tokenized and POS-tagged; nouns, adjectives, adverbs and
    verbs are lemmatized and scored with ``sentiextractor``. Scores with an
    absolute value of at most 0.02 are stored as 0.
    """
    fields = line.split('","')
    polarity = fields[0]
    if polarity == '"0':
        self.senti = -1
    elif polarity == '"2':
        self.senti = 0
    elif polarity == '"4':
        self.senti = 1
    self.id = fields[1]
    self.date = fields[2]
    self.text = normalization(fields[5][:-1])
    tokens = tokenize(self.text)
    self.tokens = tokens

    # Map Penn Treebank tags onto WordNet POS letters.
    wordnet_tag = []
    for word, tag in nltk.pos_tag(tokens):
        if 'NN' in tag:
            wordnet_tag.append((word, 'n'))
        if 'JJ' in tag:
            wordnet_tag.append((word, 'a'))
        elif 'RB' in tag:
            wordnet_tag.append((word, 'r'))
        elif 'VB' in tag:
            wordnet_tag.append((word, 'v'))

    # lemmatized tokens are lemmatized and lowered
    self.ltoken_tag = []
    for word, pos in wordnet_tag:
        self.ltoken_tag.append((lemmatizer.lemmatize(word, pos).lower(), pos))

    self.tweet_senti_score = []
    for pair in self.ltoken_tag:
        score = sentiextractor.get_score(pair)
        self.tweet_senti_score.append(score if abs(score) > 0.02 else 0)
def analyze_tweets(tweets):
    """Keep the tweets that contain the token '#socrates' and were created today.

    A tweet qualifies when twokenize yields the exact token '#socrates' for
    its 'text' and ``orthodoxize_time`` maps its 'created_at' to today's
    date in "%Y%m%d" form.

    :param tweets: iterable of dicts with 'text' and 'created_at'.
    :return: list of the qualifying tweets, in input order.
    """
    import datetime

    # Register the bundled library directory only once; appending it on
    # every call would grow sys.path without bound.
    lib_dir = "../libs/tweetmotif/"
    if lib_dir not in sys.path:
        sys.path.append(lib_dir)
    import twokenize

    # Computed once so every tweet is compared against the same day.
    today = datetime.date.today().strftime("%Y%m%d")

    candidates = []
    for tweet in tweets:
        if '#socrates' not in twokenize.tokenize(tweet['text']):
            continue
        # filter out tweets not published today
        if orthodoxize_time(tweet['created_at'], "%Y%m%d") == today:
            candidates.append(tweet)
    return candidates
def run_zmp(clf, port, preprocess=False, verbose=False):
    '''Classify data coming from a ZMQ socket, replying to each request.

    Binds a REP socket on ``tcp://*:<port>`` and loops forever. Each message
    is optionally run through ``twokenize.tokenize`` and
    ``twokenize.preprocess``; an empty message is answered with
    ``default_class``, anything else with ``clf.predict([message])[0]``.
    '''
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    address = 'tcp://*:' + str(port)
    socket.bind(address)
    if verbose:
        print('ZMQ Service Running: on %s' % (address,))

    while True:
        # Wait for next request from client
        message = socket.recv()
        if preprocess:
            message = twokenize.preprocess(twokenize.tokenize(message))
        if message:
            reply = str(clf.predict([message])[0])
        else:
            # empty message: answer with the fallback class
            reply = str(default_class)
        socket.send(reply)
def __init__(self, text):
    """Normalize, tag, lemmatize and sentiment-score a piece of text.

    Attributes set:
        text: ``normalization(text)``.
        tokens: ``tokenize(self.text)``.
        not_words: negation words that flip the sign of a nearby score.
        ltoken_tag: ``(lowercased lemma, wordnet pos)`` pairs for tokens
            whose POS tag contains NN, JJ, RB or VB.
        tweet_senti_score: one score per ``ltoken_tag`` entry.  Scores
            whose absolute value is not above 0.02 are stored as 0; a
            remaining score is negated when either of the two preceding
            ``ltoken_tag`` lemmas is in ``not_words``.
    """
    self.text = normalization(text)
    self.tokens = tokenize(self.text)

    # Map POS tags onto WordNet POS letters.  The noun test is independent
    # of the adjective/adverb/verb chain.
    wordnet_tag = []
    for word, tag in nltk.pos_tag(self.tokens):
        if 'NN' in tag:
            wordnet_tag.append((word, 'n'))
        if 'JJ' in tag:
            wordnet_tag.append((word, 'a'))
        elif 'RB' in tag:
            wordnet_tag.append((word, 'r'))
        elif 'VB' in tag:
            wordnet_tag.append((word, 'v'))

    self.not_words = set([
        "not", "don't", "dont", "didn't", "didnt", "doesn't", "doesnt",
        "no", "never", "isn't", "isnt", "cant", "can't", "cannot",
        "wasnt", "wasn't", "weren't", "werent", "couldn't",
    ])

    # Lemmas are lowercased after lemmatization.
    self.ltoken_tag = [
        (lemmatizer.lemmatize(word, wn_pos).lower(), wn_pos)
        for word, wn_pos in wordnet_tag
    ]

    self.tweet_senti_score = []
    for i, lemma_pair in enumerate(self.ltoken_tag):
        each_score = sentiextractor.get_score(lemma_pair)
        if abs(each_score) > 0.02:
            # Look back at most two positions, nearest first, without
            # running off the front of the list.
            negated = any(
                self.ltoken_tag[j][0] in self.not_words
                for j in (i - 1, i - 2)
                if j >= 0
            )
            self.tweet_senti_score.append(-each_score if negated else each_score)
        else:
            self.tweet_senti_score.append(0)
entityMap[entity] = i i += 1 dict2label = {} for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)): (dictionary, label) = line.rstrip('\n').split(' ') dict2label[dictionary] = label nLines = 1 #tweet ='Accident - A4 Great West Rd about 100 M from Syon Lane near Gillette Cnr - Road has been opened with lane restricted.'+'\n'+'Correction Accident - A4 Great West Rd at Syon Lane (Gillette Cnr) Road was closed w/b, which has been opened with a lane one restriction.' #tweet ='Shepherds Bush Green remains closed for gasworks from Holland Park RBT, diversions in place - expect peak period delays (espec. Holland Rd)\nShepherds Bush Green is closed towards King Street College due to roadworks between Holland Rd Roundabout and West 12 Shopping Centre\nR.I.P to the possum on Holland Rd that car crushed you.Kroger making a comeback the one on holland rd gon be nice as hell\n@TeaQ09 @Moe_Diesel_Baby Whats going on in wagener on new holland rd????' #tweet='The A4 Ellesmere Rd has reopened at Sutton Court Rd following the earlier collision. Residual Qs remain back to junction 2 on the M4' tweet =sys.stdin.readline().strip() line = tweet.encode('utf-8') while line: words = twokenize.tokenize(line) seq_features = [] tags = [] goodCap = capClassifier.Classify(words) > 0.9 if posTagger: pos = posTagger.TagSentence(words) pos = [p.split(':')[0] for p in pos] # remove weights else: pos = None # Chunking the tweet if posTagger and chunkTagger: word_pos = zip(words, [p.split(':')[0] for p in pos]) chunk = chunkTagger.TagSentence(word_pos)
# Reads tab-separated records from stdin and builds LdaFeatures for each
# distinct text.
# NOTE(review): BASE_DIR is used as a directory prefix for the sys.path
# entries below although its value ends in ".jar" — confirm this is intended.
BASE_DIR = 'twitter_nlp.jar'
# The search path must be extended before the two imports that follow.
sys.path.append('%s/hbc/python' % (BASE_DIR))
sys.path.append('%s/python' % (BASE_DIR))
from LdaFeatures import LdaFeatures
from twokenize import tokenize

prevText = None
for line in sys.stdin:
    line = line.rstrip('\n')
    fields = line.split('\t')
    # Leading columns are addressed from the front, trailing ones from the end.
    sid = fields[0]
    text = fields[6]
    # NOTE(review): this tokenization is overwritten by the fields[-6]
    # assignment below before `words` is ever read.
    words = tokenize(text)
    # The last column is inverted to obtain the confidence.
    confidence = 1.0 / float(fields[-1])
    eType = fields[-2]
    entity = fields[-3]
    neTags = fields[-4].split(' ')
    pos = fields[-5].split(' ')
    words = fields[-6].split(' ')
    # Skip a record whose text equals the previous record's text (these come
    # from tweets with more than one entity).
    if prevText and prevText == text:
        continue
    prevText = text
    # NOTE(review): `options` is not defined in this section; presumably the
    # parsed command-line options — verify.
    features = LdaFeatures(words, neTags, windowSize=int(options.windowSize))
    for i in range(len(features.entities)):
        # Each entity is a (start, end) index pair into features.words.
        entity = ' '.join(features.words[features.entities[i][0]:features.entities[i][1]])
def parseOneTweet(line):
    """Tag one tweet and return it as a space-separated annotated string.

    The tweet is tokenized with ``twokenize.tokenize``.  Each token is
    emitted as ``word/tag``, followed by ``/pos``, ``/chunk`` and ``/event``
    when the corresponding tagger produced output.  Entity tags come from
    the ``ner`` process and, when ``llda`` is available, are refined into
    ``B-<label>`` / ``I-<label>`` (or ``O``) by sampling from it.

    Relies on module-level objects not defined here: ``capClassifier``,
    ``posTagger``, ``chunkTagger``, ``eventTagger``, ``Features``, ``fe``,
    ``ner``, ``llda``, ``vocab``, ``entityMap``, ``dictionaries``,
    ``dictMap`` and ``dict2label``.
    """
    words = twokenize.tokenize(line)
    seq_features = []
    tags = []

    goodCap = capClassifier.Classify(words) > 0.9

    if posTagger:
        pos = posTagger.TagSentence(words)
        # Strip only the final ":..." suffix (the weight), so a tag that
        # itself contains ':' keeps its leading part; a plain split(':')[0]
        # would not.
        #pos = [p.split(':')[0] for p in pos] # remove weights
        pos = [re.sub(r':[^:]*$', '', p) for p in pos] # remove weights
    else:
        pos = None

    # Chunking the tweet
    if posTagger and chunkTagger:
        # NOTE(review): pos already had its weights removed above; the extra
        # split(':')[0] here turns any tag containing ':' into its prefix —
        # confirm this is what the chunker expects.
        word_pos = zip(words, [p.split(':')[0] for p in pos])
        chunk = chunkTagger.TagSentence(word_pos)
        chunk = [c.split(':')[0] for c in chunk] # remove weights
    else:
        chunk = None

    #Event tags
    if posTagger and eventTagger:
        events = eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
        events = [e.split(':')[0] for e in events]
    else:
        events = None

    quotes = Features.GetQuotes(words)
    for i in range(len(words)):
        features = fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
        if quotes[i]:
            features.append("QUOTED")
        seq_features.append(" ".join(features))
    # One request line per tweet: per-token feature strings joined by tabs.
    ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

    # Read back exactly one tag line per token.
    for i in range(len(words)):
        tags.append(ner.stdout.readline().rstrip('\n').strip(' '))

    features = LdaFeatures(words, tags)

    #Extract and classify entities
    for i in range(len(features.entities)):
        # NOTE(review): `type` is never read and shadows the builtin.
        type = None
        wids = [str(vocab.GetID(x.lower())) for x in features.features[i] if vocab.HasWord(x.lower())]
        if llda and len(wids) > 0:
            # "-1" is sent when the entity string is not in entityMap.
            entityid = "-1"
            # NOTE(review): dict.has_key exists only in Python 2.
            if entityMap.has_key(features.entityStrings[i].lower()):
                entityid = str(entityMap[features.entityStrings[i].lower()])
            labels = dictionaries.GetDictVector(features.entityStrings[i])

            # No dictionary matched: allow every label.
            if sum(labels) == 0:
                labels = [1 for x in labels]
            llda.stdin.write("\t".join([entityid, " ".join(wids), " ".join([str(x) for x in labels])]) + "\n")
            sample = llda.stdout.readline().rstrip('\n')
            # Drops the first 4 and last 8 characters of the reply before
            # parsing the space-separated integer ids.
            labels = [dict2label[dictMap[int(x)]] for x in sample[4:len(sample)-8].split(' ')]

            # Pick the most frequent sampled label.
            count = {}
            for label in labels:
                count[label] = count.get(label, 0.0) + 1.0
            maxL = None
            maxP = 0.0
            for label in count.keys():
                # The divisor is the number of distinct labels, not the
                # number of samples; it is the same for every label, so the
                # chosen maximum is unaffected.
                p = count[label] / float(len(count))
                if p > maxP or maxL == None:
                    maxL = label
                    maxP = p

            # The comparison is against the string 'None' (a label value),
            # not the None object.
            if maxL != 'None':
                tags[features.entities[i][0]] = "B-%s" % (maxL)
                for j in range(features.entities[i][0]+1,features.entities[i][1]):
                    tags[j] = "I-%s" % (maxL)
            else:
                tags[features.entities[i][0]] = "O"
                for j in range(features.entities[i][0]+1,features.entities[i][1]):
                    tags[j] = "O"
        else:
            # No classifier or no known context words: generic entity tags.
            tags[features.entities[i][0]] = "B-ENTITY"
            for j in range(features.entities[i][0]+1,features.entities[i][1]):
                tags[j] = "I-ENTITY"

    # Append each available annotation layer as another "/"-separated field.
    output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
    if pos:
        output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
    if chunk:
        output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
    if events:
        output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]
    return " ".join(output)
def tokenize(tweet):
    """Tokenize *tweet* with twokenize and split contractions.

    Before tokenizing, every literal backslash-n sequence is padded with a
    space on each side and every ``@`` gets a leading space.  The token list
    is then passed through ``split_contractions``.
    """
    spaced = tweet.replace("\\n", " \\n ").replace("@", " @")
    return split_contractions(twokenize.tokenize(spaced))
#!/usr/bin/env python
# Tokenize text using twokenize
# Francisco Guzman
#
# Reads UTF-8 text from stdin one line at a time and writes the twokenize
# tokens of each line, joined by single spaces, to stdout as UTF-8.
# Undecodable or unencodable characters are dropped ('ignore').
# The previous shebang, "#!/usr/bin", named a directory rather than an
# interpreter, so the script could not be executed directly.
import sys

# twokenize is loaded from the local 'third-party' directory.
sys.path.append('third-party')
import twokenize as tok

for line in sys.stdin:
    tokenized = tok.tokenize(line.decode("utf-8", 'ignore'))
    sys.stdout.write(u" ".join(tokenized).encode("utf-8", 'ignore') + "\n")
def Geo_C(intput):
    """Return ``"<lon>+<lat>"`` for the first tweet that passes every filter.

    Tweets are read via ``iterate(raw=True, inputList=intput)`` and rejected
    when any of the following holds:

    * the source mentions "Buoy";
    * the user has 1000 or more followers or friends;
    * the text is empty or whitespace;
    * there is no ``geo`` point and the user location is empty or holds no
      ``lat, lon`` decimal pair (at least three decimals each);
    * the coordinates are (0, 0) or out of range;
    * it is a retweet (``retweeted_status`` set, or an ``RT`` token);
    * the screen name contains a tab (the record is dumped to stderr).

    Missing ``source``, ``text`` or ``user.location`` values are treated as
    empty strings rather than raising.

    Returns:
        The longitude and latitude joined by ``+``, or ``None`` when no
        tweet is accepted.
    """
    # NOTE(review): returning from inside the loop (first accepted tweet)
    # is reconstructed from flattened source — confirm against callers.
    one_coord = r'([-+]?\d{1,3}\.\d{3,})'
    separator = r', ?'
    lat_long = re.compile(one_coord + separator + one_coord, re.U)

    for _raw, tweet in iterate(raw=True, inputList=intput):
        source = lookup(tweet, 'source') or ''
        if "Buoy" in source:
            continue

        n_fol = lookup(tweet, 'user.followers_count') or 0
        n_fri = lookup(tweet, 'user.friends_count') or 0
        if not (n_fol < 1000 and n_fri < 1000):
            continue

        text = lookup(tweet, 'text') or ''
        if not text.strip():
            continue

        # Prefer the official geo point; otherwise look for a coordinate
        # pair written into the free-text user location.
        geo = lookup(tweet, 'geo')
        if geo and geo['type'] == 'Point':
            lat, lon = geo['coordinates']
        else:
            loc = (lookup(tweet, 'user.location') or '').strip()
            if not loc:
                continue
            m = lat_long.search(loc.encode('utf8'))
            if not m:
                continue
            lat, lon = m.groups()
        lat = float(lat)
        lon = float(lon)
        if (lat, lon) == (0, 0) or lat < -90 or lat > 90 or lon < -180 or lon > 180:
            continue

        # For our applications we usually want to kill retweets
        if lookup(tweet, 'retweeted_status'):
            continue
        toks = twokenize.tokenize(text)
        if any(tok == 'RT' for tok in toks):
            continue

        # Build a "SmallTweet" format record
        record = {
            'id': lookup(tweet, 'id'),
            'user': lookup(tweet, 'user.screen_name'),
            'date': tweet['created_at_datetime'].strftime("%Y-%m-%dT%H:%M:%S"),
            'text': text,
        }
        record['lonlat'] = [lon, lat]
        if '\t' in record['user']:
            sys.stderr.write("WTF\t" + json.dumps(record) + "\n")
            continue

        return str(record['lonlat'][0]) + '+' + str(record['lonlat'][1])

    return None
def _tokenize(self, tweet):
    """Return the twokenize tokens of *tweet*."""
    return twokenize.tokenize(tweet)