def add_to_faq(question, answer):
    """Store a question/answer pair and index the question's topic terms.

    The pair is inserted first, then the three values returned by
    ``topic.topic(question)`` are each inserted as a subject under the row
    number ``no_of_rows + 1``.
    """
    databaseOperation.insert_answer(question, answer)

    # NOTE(review): ``no_of_rows`` is read as a plain attribute here, not
    # called -- confirm it is a property/attribute and not a method.
    row_count = databaseOperation.no_of_rows

    subject, root, obj = topic.topic(question)

    target_row = row_count + 1
    for term in (subject, root, obj):
        databaseOperation.insert_subject(term, target_row)
def __init__(self):
    """Set up empty tweet storage, the stop-word table and the topic helper.

    ``self.stopwords`` maps every stop word to ``1``; it holds NLTK's English
    stop words plus a few extra tokens listed below.
    """
    self.tweetlist = []
    self.searchIndex = {}

    self.stopwords = {}
    self.stopwords.update(
        (word, 1) for word in nltk.corpus.stopwords.words('english'))

    # Extra tokens treated as stop words in addition to the NLTK list.
    for extra in ('@', 'rt', '#', 'http', 'tco', 'na', 'want', 'ta'):
        self.stopwords[extra] = 1

    self.topicobj = topic()
def cal_tf_toDB(self):
    """Compute term-frequency scores per user and store them in the database.

    For every user in ``self.user_dic`` the text of all stored tweets is
    joined, tokenised with ``topic.topic().process_sentence`` and turned into
    a ``{"term-<word>": count / total_words}`` mapping.  That mapping is
    written back as the ``tf_score`` field of each matching document in
    ``self.db[self.COLLECTION_UV]``.  Progress is printed every 500 users.
    """
    self.read_user_dic("../user_dic/user_dic_09_wids_2.json")
    count = 0
    start = time.clock()

    for user in self.user_dic:
        # Gather the text of every tweet stored for this user.
        tweets = [
            tweet["text"]
            for doc in self.db[self.COLLECTION_UV].find({"screen_name": user})
            for tweet in doc["tweets"]
        ]
        tweet_content = " ".join(tweets)

        mytopic = topic.topic()
        word_bag = mytopic.process_sentence(tweet_content)
        count_total = len(word_bag)

        # Raw occurrence count of each word.
        occurrences = {}
        for word in word_bag:
            occurrences[word] = occurrences.get(word, 0) + 1

        # Term frequency = occurrences / total words.  Keys are prefixed
        # with 'term-' in order to avoid special characters as key.
        tf_scores = {}
        for word in occurrences:
            tf_scores["term-" + word] = occurrences[word] / float(count_total)

        # Attach the scores to each of the user's documents and write back.
        for doc in self.db[self.COLLECTION_UV].find({'screen_name': user}):
            doc["tf_score"] = tf_scores
            self.db[self.COLLECTION_UV].update({"screen_name": user}, doc)

        count += 1
        if count % 500 == 0:
            end = time.clock()
            # Single parenthesised argument: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("%d users done... %.2f s.." % (count, (end - start)))
            start = time.clock()
from sklearn.metrics import classification_report from sklearn.feature_extraction import DictVectorizer import pickle import feature_extract import topic import heapq print 'Pickling out' pos_data=np.load('posproc.npy') neg_data=np.load('negproc.npy') print 'Number of sarcastic tweets :', len(pos_data) print 'Number of non-sarcastic tweets :', len(neg_data) print 'Training topics' topic_mod = topic.topic(nbtopic=200,alpha='symmetric') topic_mod.fit(np.concatenate((pos_data,neg_data))) print 'Feature eng' # label set cls_set = ['Non-Sarcastic','Sarcastic'] featuresets = [] index=0 for tweet in pos_data: if (np.mod(index,10000)==0): print "Positive tweet processed: ",index featuresets.append((feature_extract.dialogue_act_features(tweet,topic_mod),cls_set[1])) index+=1 index=0
def topic(self):
    """Build a ``topic`` object for the currently loaded section.

    Returns ``topic(self._sqldb, self._loadID)`` when ``self._loadID`` is
    truthy.  Otherwise an ``Exception('section-not-loaded')`` instance is
    returned (not raised).
    """
    if self._loadID:
        return topic(self._sqldb, self._loadID)

    # NOTE(review): the exception object is *returned* to the caller rather
    # than raised -- confirm callers check for it; kept as-is to preserve
    # the existing contract.
    return Exception('section-not-loaded')
from sklearn.metrics import classification_report, confusion_matrix from sklearn.svm import SVC from gensim.models import ldamodel import features import pickle comments = [] sarc_set = set() features_list_train = [] features_list_test = [] data = pd.read_csv('./train-balanced-sarcasm.csv') data.dropna(subset=['comment'], inplace=True) #topic model #topic_model = topic(numTopics = 200, alpha = 'symmetric', model = "topics.model", dictionary = "dictionary.model") topic_model = topic(numTopics=200, alpha='symmetric') topic_model.generate(data['comment']) ''' infile = open("feature_vec_train.p",'rb') feature_vec_train = pickle.load(infile) infile.close() infile = open("feature_vec_test.p",'rb') feature_vec_test = pickle.load(infile) infile.close() infile = open("y_train.p",'rb') feature_vec_test = pickle.load(infile) infile.close() infile = open("y_test.p",'rb')
import pickle
import os
import feature_extract
import topic

# Directory containing this module; all model artifacts live next to it.
_MODULE_DIR = os.path.dirname(os.path.realpath(__file__))

# Pickle data is binary, so the files must be opened with 'rb': text mode
# fails under Python 3 and can corrupt the stream on Windows under Python 2.
# The ``with`` blocks also guarantee the handles are closed if loading fails.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(os.path.join(_MODULE_DIR, 'vecdict.p'), 'rb') as fileObject1:
    vec = pickle.load(fileObject1)
with open(os.path.join(_MODULE_DIR, 'classif.p'), 'rb') as fileObject2:
    classifier = pickle.load(fileObject2)

# Topic model restored from the saved model and dictionary files.
topic_mod = topic.topic(
    model=os.path.join(_MODULE_DIR, 'topics.tp'),
    dicttp=os.path.join(_MODULE_DIR, 'topics_dict.tp'))


def tweetscore(sentence):
    """Return an integer score describing how sarcastic ``sentence`` is.

    Features are extracted with ``feature_extract.dialogue_act_features``,
    vectorised with the loaded ``vec`` and passed to
    ``classifier.decision_function``.  The raw score is squashed with a
    logistic function and rescaled, so the result is an ``int`` between
    -100 and 100.
    """
    features = feature_extract.dialogue_act_features(sentence, topic_mod)
    features_vec = vec.transform(features)
    score = classifier.decision_function(features_vec)[0]
    # sigmoid(score) lies in (0, 1); shift and scale it to (-100, 100).
    percentage = int(round(2.0 * (1.0 / (1.0 + np.exp(-score)) - 0.5) * 100.0))
    return percentage
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
import pickle
import feature_extract
import topic
import heapq

# Load the pre-processed sarcastic (positive) and non-sarcastic (negative)
# tweet arrays from disk.
print('Pickling out')
pos_data = np.load('posproc.npy')
neg_data = np.load('negproc.npy')
print('Number of sarcastic tweets :', len(pos_data))
print('Number of non-sarcastic tweets :', len(neg_data))

# Fit the topic model on both classes together.
print('Training topics')
topic_mod = topic.topic(nbtopic=200, alpha='symmetric')
topic_mod.fit(np.concatenate((pos_data, neg_data)))

print('Feature eng')
# label set
cls_set = ['Non-Sarcastic', 'Sarcastic']
featuresets = []

# Extract features for every sarcastic tweet, reporting progress every
# 10000 tweets.
index = 0
for tweet in pos_data:
    if index % 10000 == 0:
        print("Positive tweet processed: ", index)
    tweet_features = feature_extract.dialogue_act_features(tweet, topic_mod)
    featuresets.append((tweet_features, cls_set[1]))
    index += 1
if __name__ == '__main__': obj = util() if obj.isLogin(): print("您已经登陆") else: account = raw_input('请输入你的用户名\n> ') secret = raw_input("请输入你的密码\n> ") obj.login(secret, account) begin = int(time.time()) print u"正在抓取个人您知乎数据...." #获取个人主页token token = obj.getToken() print u"获取个人主页token:", token obj_topic = topic.topic(obj.session) obj_question = question.question() obj_answer = answer.answer() # topic_id = '19551432' # questions=obj_question.getQuestionsByXHR(topic_id,'hot',0,3200.29677322) # for question in questions: # print question[1] #questions = obj_question.getQuestionsByTopicId('19551432', 'hot') # topic_id='19551432' # # questions = obj_question.getQuestionsByTopicId(topic_id) # # # 创建话题文件夹(需要查表) # obj.cursor.execute("SELECT NAME FROM TOPIC WHERE LINK_ID = %s", int(topic_id)) # result = obj.cursor.fetchone()
from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import classification_report import feature_extract import topic folder = os.path.join(os.path.dirname(__file__), "../..") include_reddit = int(sys.argv[1]) classif_type = int(sys.argv[2]) print("Preparing...") sarc_data = np.load("sarc-processed.npy") non_data = np.load("nonsarc-processed.npy") print("Training topics...\n") topic_mod = topic.topic(nbtopic=200, alpha="symmetric") topic_mod.fit(np.concatenate((sarc_data, non_data))) print("Extracting features...") sarcs = json.loads( open(os.path.join(folder, "sarc-comments.json"), "r").read()) nonsarcs = json.loads( open(os.path.join(folder, "nonsarc-comments.json"), "r").read()) labels = ["Sarcastic", "Non-Sarcastic"] featuresets = [] i = 1 for k, v in sarcs.items(): print(i, "sarcastic comments processed of", len(sarcs), end="\r") featuresets.append(
words = re.sub(r'http:[\\/.a-z0-9]+\s?', '', text) print words words = re.sub(r'(@\w+\s?)|(@\s+)', '', words) print words words = re.sub(r'[\#\-\+\*\`\.\;\:\"\?\<\>\[\]\{\}\|\~\_\=]', '', words) print words words = re.sub(r'rt\s?', '', words) print words words = words.strip() token = nltk.word_tokenize(words) print token words = words.split() print words mytopic = topic.topic() print mytopic.process_sentence(text) # Tree manipulation # Extract phrases from a parsed (chunked) tree # Phrase = tag for the string phrase (sub-tree) to extract # Returns: List of deep copies; Recursive def ExtractPhrases( myTree, phrase): myPhrases = [] if (myTree.node == phrase): myPhrases.append( myTree.copy(True) ) for child in myTree: if (type(child) is Tree): list_of_phrases = ExtractPhrases(child, phrase)
it takes a sentence and returns a percentage which describes how sarcastic the input sentence is. """ import numpy as np import pickle import os import feature_extract import topic fileObject1 = open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'vecdict.p'), 'r') fileObject2= open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'classif.p'), 'r') vec = pickle.load(fileObject1) classifier = pickle.load(fileObject2) fileObject1.close() fileObject2.close() topic_mod = topic.topic(model=os.path.join(os.path.dirname(os.path.realpath(__file__)), 'topics.tp'),\ dicttp=os.path.join(os.path.dirname(os.path.realpath(__file__)), 'topics_dict.tp')) def tweetscore(sentence): features = feature_extract.dialogue_act_features(sentence,topic_mod) features_vec = vec.transform(features) score = classifier.decision_function(features_vec)[0] percentage = int(round(2.0*(1.0/(1.0+np.exp(-score))-0.5)*100.0)) return percentage