def perprocessing(tdic):
    """Preprocess each tweet: tokenize, lowercase, lemmatize, and collect POS features."""
    new_dic = {}
    POS_feature = []
    for tweet_id in tdic:
        gt = tdic[tweet_id][0]  # ground-truth label
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[tweet_id][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        telist = [word.lower() for word in text_tk]
        # lemma() returns (lemmatized tokens, POS tags); stemming was dropped in favor of it
        afterlemma = lemma(telist)
        telist = afterlemma[0]
        POS_feature.append(afterlemma[1])
        newtext = ' '.join(telist)
        # replace special tokens, e.g. URLs -> URLINK, sad emoticons -> SADFACE
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[tweet_id] = gt, newtext
    return new_dic, np.array(POS_feature)
def parse_text(tw_obj):
    """Extract the full text from a tweet object (including any quoted tweet),
    then normalize and tokenize it; user mentions and URLs are removed."""
    # use the extended tweet if present, otherwise the normal text
    if 'extended_tweet' in tw_obj:
        text = tw_obj['extended_tweet']['full_text']
    else:
        text = tw_obj['text']
    # append a quoted tweet's text, marked with %QUOTES%
    if tw_obj['is_quote_status'] and 'quoted_status' in tw_obj:
        qt_obj = tw_obj['quoted_status']
        if 'extended_tweet' in qt_obj:
            qt_text = qt_obj['extended_tweet']['full_text']
        else:
            qt_text = qt_obj['text']
        text = ''.join([text, ' %QUOTES% ', qt_text])
    text_norm = normalizeTextForTagger(replace_sp_tokens(text))
    # split the text into keyword tokens and drop stopwords
    text_tokens = get_tokens(text)
    text_tokens = [t for t in text_tokens if t not in stopwords]
    token_counts = dict(Counter(text_tokens))
    # text_tokens = [lemma(t) for t in text_tokens]
    return text, text_norm, text_tokens, token_counts
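# Hedged usage sketch for parse_text: the tweet dict below is a hand-built
# stand-in for a Twitter API payload, and the module-level helpers it relies on
# (replace_sp_tokens, get_tokens, stopwords) are assumed to be defined as above.
sample_tweet = {
    'text': 'Road closed after the storm, avoid Main St http://t.co/x1',
    'is_quote_status': False,
}
text, text_norm, text_tokens, token_counts = parse_text(sample_tweet)
# text is the raw (possibly quote-extended) text, text_norm the tagger-normalized
# form, text_tokens the stopword-filtered tokens, token_counts their frequencies.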
def perprocessing(tdic):
    """Preprocess each tweet: tokenize, lowercase, and Porter-stem."""
    new_dic = {}
    ps = nltk.stem.PorterStemmer()  # hoisted out of the loop; one instance is enough
    for tweet_id in tdic:
        gt = tdic[tweet_id][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[tweet_id][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        telist = [ps.stem(word.lower()) for word in text_tk]
        newtext = ' '.join(telist)
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[tweet_id] = gt, newtext
    return new_dic
def file_list(filepath, include_columns):
    ifile = open(filepath, "rb")
    reader = csv.reader(ifile)
    rownum = 0
    print("Reading Tweet")
    # patterns for hashtags, links, and @-mentions
    hashwords = re.compile(r"#\S*", re.I)
    linkwords = re.compile(r"http\S*", re.I)
    reference = re.compile(r"@\S*", re.I)
    listo = []
    for row in reader:
        if rownum == 0:
            header = row
        else:
            listi = []
            for col in include_columns:
                if col == 1:
                    # column 1 holds the tweet timestamp
                    time = datetime.datetime.strptime(row[col], "%a %b %d %H:%M:%S +0000 %Y")
                    listi.append(time)
                elif col == 6:
                    # column 6 holds the tweet text: strip links, hashtags, and mentions
                    query = str(row[col])
                    for res in re.finditer(linkwords, query):
                        query = query.replace(res.group(), "")
                    for res in re.finditer(hashwords, query):
                        query = query.replace(res.group(), "")
                    for res in re.finditer(reference, query):
                        query = query.replace(res.group(), "")
                    query = tk.squeezeWhitespace(query)
                    # re-encode latin-1 bytes as UTF-8 (Python 2 style)
                    query = tk.normalizeTextForTagger(query.decode('latin-1').encode("utf-8").decode('utf8'))
                    listi.append(query)
                else:
                    listi.append(row[col])
            listo.append(listi)
        rownum += 1
    return listo
def perprocessing(tdic):
    new_dic = {}
    for tweet_id in tdic:
        gt = tdic[tweet_id][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[tweet_id][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        newtext = ' '.join(text_tk)
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[tweet_id] = gt, newtext
    return new_dic
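# Hedged usage sketch for perprocessing: it expects a dict mapping tweet id to a
# (label, raw_text) pair; the ids, labels, and texts here are invented for
# illustration, and twokenize / textPreprocessor01 must be importable as above.
tdic = {
    '001': ('relevant', 'Flooding on 5th Ave right now http://t.co/ab1 :('),
    '002': ('irrelevant', 'best coffee everrr :)'),
}
cleaned = perprocessing(tdic)
# cleaned['001'] -> ('relevant', '<tokenized, normalized, token-replaced text>')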
def normalize(text):
    # lowercase for easier comparison
    text = text.lower()
    # squeeze repeated whitespace
    text = squeezeWhitespace(text)
    # decode common HTML entities and replace abbreviations or unreadable characters
    text = text.replace("&gt;", ">")
    text = text.replace("&amp;", "&")
    text = text.replace("w/", "with")
    text = text.replace('\u2019', "'")    # right single quote
    text = text.replace('\u2026', "...")  # ellipsis
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    # remove the search keyword 'springbreak' and its parts
    text = text.replace('springbreak', '')
    text = text.replace('spring', '')
    text = text.replace('break', '')
    return normalizeTextForTagger(text)
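# Illustrative call, assuming squeezeWhitespace and normalizeTextForTagger are
# imported from twokenize as elsewhere in this file; the input string is made up.
print(normalize('SpringBreak w/  friends!! http://t.co/ab1'))
# -> roughly 'with friends!!' : lowercased, whitespace squeezed, 'w/' expanded,
#    the URL removed, and the search keyword 'springbreak' stripped out.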
def perprocessing(tdic):
    """Preprocess each tweet with Porter stemming; uses the textPreprocessor02 variant."""
    new_dic = {}
    ps = nltk.stem.PorterStemmer()  # hoisted out of the loop; one instance is enough
    for tweet_id in tdic:
        gt = tdic[tweet_id][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[tweet_id][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        telist = [ps.stem(word.lower()) for word in text_tk]
        newtext = ' '.join(telist)
        newtext = textPreprocessor02.replaceall(newtext)
        new_dic[tweet_id] = gt, newtext
    return new_dic
def file_list(filepath, include_columns):
    ifile = open(filepath, "rb")
    reader = csv.reader(ifile)
    rownum = 0
    hashwords = re.compile(r"#\S*", re.I)
    linkwords = re.compile(r"http\S*", re.I)
    reference = re.compile(r"@\S*", re.I)
    listo = []
    for row in reader:
        if rownum == 0:
            header = row
        else:
            listi = []
            for col in include_columns:
                if col == 1:
                    time = datetime.datetime.strptime(
                        row[col], "%a %b %d %H:%M:%S +0000 %Y")
                    listi.append(time)
                elif col == 6:
                    query = str(row[col])
                    for res in re.finditer(linkwords, query):
                        query = query.replace(res.group(), "")
                    for res in re.finditer(hashwords, query):
                        query = query.replace(res.group(), "")
                    for res in re.finditer(reference, query):
                        query = query.replace(res.group(), "")
                    query = tk.squeezeWhitespace(query)
                    query = tk.normalizeTextForTagger(
                        query.decode('latin-1').encode("utf-8").decode('utf8'))
                    listi.append(query)
                else:
                    listi.append(row[col])
            listo.append(listi)
        rownum += 1
    return listo
def tokenize(tweet):
    return twokenize.tokenize(twokenize.normalizeTextForTagger(tweet))
def get_tokens(text):
    return tokenize(clean(replace_sp_tokens(normalizeTextForTagger(text))))
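# A minimal sketch of the pipeline above; clean and replace_sp_tokens are this
# project's own helpers, so the exact tokens produced are an assumption here.
tokens = get_tokens('@user check this out!! http://t.co/ab1 #wow')
# -> a list of tokens after special-token replacement, cleaning, and twokenize
#    tokenization, ready for the stopword filter used in parse_text above.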
def train_classifier(classpath):
    """Train a Naive Bayes relevance classifier from a labelled CSV of tweets."""
    ifile = open(classpath, "rb")
    reader = csv.reader(ifile)
    rownum = 0
    tweetTime = []
    tweetDesc = []
    tweetR = []      # relevance labels
    tweetToken = []  # stemmed tokens per tweet
    ps = PorterStemmer()
    punct_num = re.compile(r'[-.?!,":;()|0-9]')
    time_pat = re.compile(r"(\d{1,2}(.\d{1,2})|\d{1,2})(am|pm|AM|Am|PM|Pm)")
    date_pat = re.compile(r"\d{1,2}\/\d{1,2}")
    week_pat = re.compile(r"Sun|Mon|Tue|Wed|Thurs|Fri|Sat|sunday|monday|tuesday|wednesday|thursday|friday|saturday", re.I)
    for row in reader:
        if rownum == 0:
            header = row
        elif row[0].find("+0000") != -1:
            tweet = row[0].lower()
            tweetR.append(row[1])
            # the tweet text starts 10 characters past the start of '+0000'
            brk = tweet.index('+0000') + 10
            tweetTime.append(datetime.datetime.strptime(tweet[:brk], "%a %b %d %H:%M:%S +0000 %Y"))
            x = tweet[brk:]
            squeeze = tk.squeezeWhitespace(x)
            normal = tk.normalizeTextForTagger(squeeze.decode('utf8'))
            tweetDesc.append(normal)
            # flag time/date/weekday mentions as features, then strip them
            if time_pat.search(normal):
                normal = normal + " timepresent"
            if date_pat.search(normal):
                normal = normal + " datepresent"
            if week_pat.search(normal):
                normal = normal + " weekpresent"
            normal = re.sub(time_pat, '', normal)
            normal = re.sub(date_pat, '', normal)
            normal = re.sub(week_pat, '', normal)
            normal = punct_num.sub("", normal)
            tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
            b = tokenizer.tokenize(normal)
            b = [i for i in b if i not in stop]
            tweetToken.append([ps.stem(i) for i in b])
        rownum += 1
    ifile.close()

    # feature engineering
    documents = []
    all_words = []
    tweet_non = []
    tweet_rel = []
    for i in range(len(tweetR)):
        documents.append((tweetToken[i], tweetR[i]))
        all_words.extend(tweetToken[i])
        if tweetR[i] == 'Non-Relevant':
            tweet_non.extend(tweetToken[i])
        else:
            tweet_rel.extend(tweetToken[i])
    all_words_freq = nltk.FreqDist(all_words)
    rel_words_freq = nltk.FreqDist(tweet_rel)
    non_words_freq = nltk.FreqDist(tweet_non)

    # rank words by relevant/non-relevant count ratio with add-1 smoothing
    init_features = list(all_words_freq.keys())
    score_words = []
    for w in init_features:
        score_words.append([float(rel_words_freq[w] + 1) / float(non_words_freq[w] + 1), w])
    score_words = sorted(score_words, reverse=True)

    # random sample: 2000 indices (possibly repeated) for training, the rest for testing
    a = r.uniform(0, len(tweetToken), 2000)
    b = [int(i) for i in a]
    threshold = 0.7  # score cut-off for keeping a word as a feature (0.5-1.9 were tried)
    features_1 = [w for (score, w) in score_words if score > threshold]
    feature_score1 = []
    for i in range(len(tweetR)):
        feature_score1.append([find_features(tweetToken[i], features_1), tweetR[i]])
    trainingset = [feature_score1[i] for i in b]
    testset = [x for x in feature_score1 if x not in trainingset]

    # Naive Bayes
    naive = nltk.NaiveBayesClassifier.train(trainingset)
    accuracy = nltk.classify.accuracy(naive, testset)
    classifier = [naive, features_1, accuracy]
    return classifier
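# Hedged usage sketch: the CSV path is illustrative, and find_features is the
# project's own helper (used by train_classifier itself) for building feature dicts.
naive, features_1, accuracy = train_classifier('classified.csv')
print('held-out accuracy:', accuracy)
new_tokens = ['beach', 'parti', 'tonight']  # already Porter-stemmed tokens
print(naive.classify(find_features(new_tokens, features_1)))
# prints the predicted label, e.g. 'Non-Relevant'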
tweetR.append(row[1])
# add 10 to the index of '+0000' because the tweet text starts 10 characters later
brk = tweet.index('+0000') + 10
tweetTime.append(datetime.datetime.strptime(tweet[:brk], "%a %b %d %H:%M:%S +0000 %Y"))
TRACE(2, tweetTime)
x = tweet[brk:]
squeeze = tk.squeezeWhitespace(x)
TRACE(3, squeeze)
normal = tk.normalizeTextForTagger(squeeze.decode('utf8'))
TRACE(4, normal)
tweetDesc.append(normal)
TRACE(5, tweetDesc)
punct_num = re.compile(r'[-.?!,":;()|0-9]')
time_pat = re.compile(r"(\d{1,2}(.\d{1,2})|\d{1,2})(am|pm|AM|Am|PM|Pm)")
date_pat = re.compile(r"\d{1,2}\/\d{1,2}")
week_pat = re.compile(r"Sun|Mon|Tue|Wed|Thurs|Fri|Sat|sunday|monday|tuesday|wednesday|thursday|friday|saturday", re.I)
# TRACE(6, find_events(normal))
if time_pat.search(normal):