Example #1
import topic  # project-local: topic.topic() parses a question into (subject, root, object)
import databaseOperation  # project-local persistence helper (assumed from usage)


def add_to_faq(question, answer):
    # Persist the question/answer pair.
    databaseOperation.insert_answer(question, answer)

    noOfRows = databaseOperation.no_of_rows

    # Parse the question into a (subject, root verb, object) triple.
    subj, root, obj = topic.topic(question)

    # Index each component against the newly inserted row.
    databaseOperation.insert_subject(subj, noOfRows + 1)
    databaseOperation.insert_subject(root, noOfRows + 1)
    databaseOperation.insert_subject(obj, noOfRows + 1)
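
The same insert-then-index pattern can be shown end to end with the standard library; the sketch below substitutes sqlite3 for the project's databaseOperation wrapper and a hard-coded triple for topic.topic(), so every name beyond the snippet above is a stand-in:

# Minimal self-contained sketch of the insert-then-index pattern, using sqlite3.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE faq (id INTEGER PRIMARY KEY, question TEXT, answer TEXT)")
conn.execute("CREATE TABLE subjects (term TEXT, faq_id INTEGER)")

def add_to_faq_sqlite(question, answer, triple):
    # `triple` stands in for topic.topic(question): (subject, root, object).
    cur = conn.execute("INSERT INTO faq (question, answer) VALUES (?, ?)",
                       (question, answer))
    row_id = cur.lastrowid  # plays the role of noOfRows + 1
    for term in triple:
        conn.execute("INSERT INTO subjects (term, faq_id) VALUES (?, ?)",
                     (term, row_id))

add_to_faq_sqlite("How do I reset my password?",
                  "Use the account settings page.",
                  ("I", "reset", "password"))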
Example #2
	# Requires: import nltk (with the 'stopwords' corpus downloaded)
	# and the project-local topic class: from topic import topic
	def __init__(self):
		self.tweetlist = []
		self.searchIndex = {}
		# Stopword lookup: NLTK's English list plus Twitter-specific noise tokens.
		self.stopwords = {}
		for word in nltk.corpus.stopwords.words('english'):
			self.stopwords[word] = 1
		for extra in ('@', 'rt', '#', 'http', 'tco', 'na', 'want', 'ta'):
			self.stopwords[extra] = 1
		self.topicobj = topic()
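
Since this lookup is only ever used for membership tests, a set is the more natural structure; a minimal sketch, assuming the NLTK stopword corpus is already downloaded:

import nltk
# nltk.download('stopwords')  # one-time corpus download, if missing

# Same contents as the dict above, but as a set.
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords |= {'@', 'rt', '#', 'http', 'tco', 'na', 'want', 'ta'}

print('the' in stopwords, 'rt' in stopwords)  # True True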
Example #3
def cal_tf_toDB(self):
    # Requires: import time; project-local module `topic`.
    self.read_user_dic("../user_dic/user_dic_09_wids_2.json")
    count = 0
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for user in self.user_dic:
        # Gather all tweet texts for this user from MongoDB.
        tweets = []
        it = self.db[self.COLLECTION_UV].find({"screen_name": user})
        for i in it:
            for tweet in i["tweets"]:
                tweets.append(tweet["text"])
        tweet_content = " ".join(tweets)

        # Tokenize the concatenated tweets.
        mytopic = topic.topic()
        word_bag = mytopic.process_sentence(tweet_content)
        count_total = len(word_bag)

        # Count raw occurrences, then normalize to term frequency (TF).
        word_map = {}
        word_map_2 = {}
        for word in word_bag:
            if word not in word_map:
                word_map[word] = 1
            else:
                word_map[word] += 1
        for word in word_map:
            word_map[word] = word_map[word] / float(count_total)

        # To avoid special characters as keys, prefix every term with 'term-'.
        for word in word_map:
            word_map_2["term-" + word] = word_map[word]

        # Write the TF scores back onto the user's document.
        it = self.db[self.COLLECTION_UV].find({'screen_name': user})
        for i in it:
            i.update({"tf_score": word_map_2})
            # Legacy pymongo update(); replace_one() in pymongo >= 3.
            self.db[self.COLLECTION_UV].update({"screen_name": user}, i)

        count += 1
        if count % 500 == 0:
            end = time.perf_counter()
            print("%d users done... %.2f s.." % (count, (end - start)))
            start = time.perf_counter()
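
The count-then-normalize loops above are exactly what collections.Counter provides; a self-contained sketch of the same TF computation on a plain token list:

# Same TF computation as in cal_tf_toDB, using collections.Counter.
from collections import Counter

word_bag = ["good", "morning", "good", "night"]
counts = Counter(word_bag)
total = len(word_bag)

tf = {"term-" + word: n / total for word, n in counts.items()}
print(tf)  # {'term-good': 0.5, 'term-morning': 0.25, 'term-night': 0.25}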
Example #4
import numpy as np
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
import pickle
import feature_extract  # project-local
import topic            # project-local
import heapq

print('Pickling out')
pos_data = np.load('posproc.npy')
neg_data = np.load('negproc.npy')
print('Number of sarcastic tweets:', len(pos_data))
print('Number of non-sarcastic tweets:', len(neg_data))

print('Training topics')

topic_mod = topic.topic(nbtopic=200, alpha='symmetric')
topic_mod.fit(np.concatenate((pos_data, neg_data)))

print('Feature eng')
# label set
cls_set = ['Non-Sarcastic', 'Sarcastic']
featuresets = []

index = 0
for tweet in pos_data:
    if np.mod(index, 10000) == 0:
        print("Positive tweet processed: ", index)
    featuresets.append((feature_extract.dialogue_act_features(tweet, topic_mod), cls_set[1]))
    index += 1

index = 0
Example #5
    def topic(self):
        if not self._loadID:
            # Raise (not return) the exception so callers can't silently ignore it.
            raise Exception('section-not-loaded')

        return topic(self._sqldb, self._loadID)
Example #6
import pandas as pd  # needed for pd.read_csv below
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from gensim.models import ldamodel
from topic import topic  # project-local (import form assumed from usage below)
import features  # project-local
import pickle

comments = []
sarc_set = set()
features_list_train = []
features_list_test = []
data = pd.read_csv('./train-balanced-sarcasm.csv')
data.dropna(subset=['comment'], inplace=True)

#topic model
#topic_model = topic(numTopics = 200, alpha = 'symmetric', model = "topics.model", dictionary = "dictionary.model")
topic_model = topic(numTopics=200, alpha='symmetric')
topic_model.generate(data['comment'])
'''
infile = open("feature_vec_train.p",'rb')
feature_vec_train = pickle.load(infile)
infile.close()

infile = open("feature_vec_test.p",'rb')
feature_vec_test = pickle.load(infile)
infile.close()

infile = open("y_train.p",'rb')
y_train = pickle.load(infile)
infile.close()

infile = open("y_test.p",'rb')
Example #7
import numpy as np
import pickle
import os
import feature_extract  # project-local
import topic            # project-local

here = os.path.dirname(os.path.realpath(__file__))

# Pickled artifacts must be opened in binary mode ('rb'), not 'r'.
fileObject1 = open(os.path.join(here, 'vecdict.p'), 'rb')
fileObject2 = open(os.path.join(here, 'classif.p'), 'rb')

vec = pickle.load(fileObject1)         # fitted vectorizer
classifier = pickle.load(fileObject2)  # fitted classifier

fileObject1.close()
fileObject2.close()

topic_mod = topic.topic(model=os.path.join(here, 'topics.tp'),
                        dicttp=os.path.join(here, 'topics_dict.tp'))


def tweetscore(sentence):
    # Extract features, vectorize them, and take the classifier's decision score.
    features = feature_extract.dialogue_act_features(sentence, topic_mod)
    features_vec = vec.transform(features)
    score = classifier.decision_function(features_vec)[0]
    # Sigmoid-squash the raw score, then rescale into (-100, 100).
    percentage = int(round(2.0 * (1.0 / (1.0 + np.exp(-score)) - 0.5) * 100.0))

    return percentage
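
A hypothetical call, just to show the shape of the output; the sentences are made up and the actual numbers depend entirely on the pickled model:

# Hypothetical usage; results depend on the pickled vectorizer/classifier.
print(tweetscore("Oh great, another Monday."))    # e.g. a high positive score
print(tweetscore("The meeting starts at noon."))  # e.g. a score near zero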
Example #8
import numpy as np
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
import pickle
import feature_extract  # project-local
import topic            # project-local
import heapq

print('Pickling out')
pos_data = np.load('posproc.npy')
neg_data = np.load('negproc.npy')
print('Number of sarcastic tweets:', len(pos_data))
print('Number of non-sarcastic tweets:', len(neg_data))

print('Training topics')

topic_mod = topic.topic(nbtopic=200, alpha='symmetric')
topic_mod.fit(np.concatenate((pos_data, neg_data)))

print('Feature eng')
# label set
cls_set = ['Non-Sarcastic', 'Sarcastic']
featuresets = []

index = 0
for tweet in pos_data:
    if np.mod(index, 10000) == 0:
        print("Positive tweet processed: ", index)
    featuresets.append(
        (feature_extract.dialogue_act_features(tweet, topic_mod), cls_set[1]))
    index += 1
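
Each featureset pairs a feature dict with a label; DictVectorizer (imported above) is what turns such dicts into the numeric matrix a classifier consumes. A minimal sketch with made-up feature names:

from sklearn.feature_extraction import DictVectorizer

feats = [{'len': 5, 'has_emoji': 1}, {'len': 12, 'has_emoji': 0}]
vec = DictVectorizer()
X = vec.fit_transform(feats)         # sparse matrix, one row per dict
print(vec.get_feature_names_out())   # requires scikit-learn >= 1.0
print(X.toarray())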
Example #9
# Requires: import time; project-local modules util, topic, question, answer.
if __name__ == '__main__':
    obj = util()
    if obj.isLogin():
        print("You are already logged in")
    else:
        account = input('Please enter your username\n>  ')
        secret = input("Please enter your password\n>  ")
        obj.login(secret, account)

    begin = int(time.time())
    print("Fetching your personal Zhihu data....")
    # Fetch the profile-page token.
    token = obj.getToken()
    print("Profile-page token:", token)

    obj_topic = topic.topic(obj.session)
    obj_question = question.question()
    obj_answer = answer.answer()

    # topic_id = '19551432'
    # questions = obj_question.getQuestionsByXHR(topic_id, 'hot', 0, 3200.29677322)
    # for question in questions:
    #     print(question[1])
    # questions = obj_question.getQuestionsByTopicId('19551432', 'hot')
    # topic_id = '19551432'
    #
    # questions = obj_question.getQuestionsByTopicId(topic_id)
    #
    # # Create the topic folder (requires a table lookup).
    # obj.cursor.execute("SELECT NAME FROM TOPIC WHERE LINK_ID = %s", int(topic_id))
    # result = obj.cursor.fetchone()
Example #10
import os
import sys
import json
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import feature_extract  # project-local
import topic            # project-local

folder = os.path.join(os.path.dirname(__file__), "../..")

include_reddit = int(sys.argv[1])
classif_type = int(sys.argv[2])

print("Preparing...")
sarc_data = np.load("sarc-processed.npy")
non_data = np.load("nonsarc-processed.npy")

print("Training topics...\n")
topic_mod = topic.topic(nbtopic=200, alpha="symmetric")
topic_mod.fit(np.concatenate((sarc_data, non_data)))

print("Extracting features...")
sarcs = json.loads(
    open(os.path.join(folder, "sarc-comments.json"), "r").read())
nonsarcs = json.loads(
    open(os.path.join(folder, "nonsarc-comments.json"), "r").read())

labels = ["Sarcastic", "Non-Sarcastic"]
featuresets = []

i = 1
for k, v in sarcs.items():
    print(i, "sarcastic comments processed of", len(sarcs), end="\r")
    featuresets.append(
Example #12
import re
import nltk
from nltk.tree import Tree
import topic  # project-local

# nltk.download('punkt')  # one-time download, needed by word_tokenize
# `text` is assumed to be a raw tweet string, e.g.:
text = "rt @user check this out http://t.co/abc123 #cool"

# Strip URLs, @-mentions, punctuation noise, and the 'rt' marker.
words = re.sub(r'http:[\\/.a-z0-9]+\s?', '', text)
print(words)
words = re.sub(r'(@\w+\s?)|(@\s+)', '', words)
print(words)
words = re.sub(r'[\#\-\+\*\`\.\;\:\"\?\<\>\[\]\{\}\|\~\_\=]', '', words)
print(words)
words = re.sub(r'rt\s?', '', words)
print(words)
words = words.strip()

token = nltk.word_tokenize(words)
print(token)
words = words.split()
print(words)

mytopic = topic.topic()
print(mytopic.process_sentence(text))


# Tree manipulation

# Extract phrases from a parsed (chunked) tree
# Phrase = tag for the string phrase (sub-tree) to extract
# Returns: List of deep copies;  Recursive
def ExtractPhrases(myTree, phrase):
    myPhrases = []
    if myTree.label() == phrase:  # Tree.node was renamed to Tree.label() in NLTK 3
        myPhrases.append(myTree.copy(True))
    for child in myTree:
        if type(child) is Tree:
            list_of_phrases = ExtractPhrases(child, phrase)
            if list_of_phrases:
                myPhrases.extend(list_of_phrases)
    return myPhrases
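
A quick check with a hand-built parse tree (the sentence and bracketing are illustrative only):

# Illustrative usage of ExtractPhrases on a hand-built NLTK tree.
t = Tree.fromstring(
    "(S (NP (DT the) (NN dog)) (VP (VBD chased) (NP (DT a) (NN cat))))")
for subtree in ExtractPhrases(t, "NP"):
    print(subtree)
# (NP (DT the) (NN dog))
# (NP (DT a) (NN cat))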
Example #13
"""Takes a sentence and returns a percentage which describes how sarcastic
the input sentence is."""

import numpy as np
import pickle
import os
import feature_extract
import topic

here = os.path.dirname(os.path.realpath(__file__))

# Pickled artifacts must be opened in binary mode ('rb'), not 'r'.
fileObject1 = open(os.path.join(here, 'vecdict.p'), 'rb')
fileObject2 = open(os.path.join(here, 'classif.p'), 'rb')

vec = pickle.load(fileObject1)
classifier = pickle.load(fileObject2)

fileObject1.close()
fileObject2.close()

topic_mod = topic.topic(model=os.path.join(here, 'topics.tp'),
                        dicttp=os.path.join(here, 'topics_dict.tp'))

def tweetscore(sentence):
    features = feature_extract.dialogue_act_features(sentence, topic_mod)
    features_vec = vec.transform(features)
    score = classifier.decision_function(features_vec)[0]
    # Sigmoid-squash the raw score, then rescale into (-100, 100).
    percentage = int(round(2.0 * (1.0 / (1.0 + np.exp(-score)) - 0.5) * 100.0))

    return percentage
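
The scaling step maps any real-valued decision score into (-100, 100): a sigmoid squashes the score into (0, 1), subtracting 0.5 centers it, and multiplying by 2 * 100 rescales it. A worked check of the formula:

import numpy as np

def scale(score):
    # 2 * (sigmoid(score) - 0.5) * 100 -> value in (-100, 100)
    return int(round(2.0 * (1.0 / (1.0 + np.exp(-score)) - 0.5) * 100.0))

print(scale(0.0))   # 0   (on the decision boundary)
print(scale(2.0))   # 76  (confidently sarcastic)
print(scale(-2.0))  # -76 (confidently non-sarcastic)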