class RecBySNS(object):
    """Driver object for the recbysns annotation pipeline.

    Wires together the database, the NLPIR segmenter, the Renren client and
    the URL helper, and provides an interactive console loop for manually
    labelling entities (books/movies/videos) found in weibo statuses.
    """

    def __init__(self):
        self.db = Database()
        self.nlpir = PyNLPIR(self)
        self.renren = Renren(self)
        self.url = URL(self)
        # User-Agent sent by the crawling helpers (see Douban.book_search).
        self.UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv: 17.0) Gecko/17.0 Firefox/17.0"
        # POS tags whose words are never sentiment/entity candidates; each
        # entry is a regex matched against a segment's POS via is_blacklist_word.
        self.pos_blacklist_regexs = [
            "^emoticon$",
            "^title$",
            "^ude.",
            "^w.*",
            "^vshi",
            "^vyou",
            "^p.*",
            "^ule",
            "^m.*",
            "^cc",
            "^session$",
        ]

    def assign_recbysns_entity_sentiment(self):
        """Interactively annotate entities in weibo statuses.

        Scans statuses that contain a 《book/movie title》 or an http(s) URL,
        segments each status into sessions, collects candidate entities
        (known douban titles and video URLs), then prompts the operator on
        stdin for a type and a sentiment for every occurrence, inserting the
        answers into recbysns_entity.
        """
        # NOTE(review): the 12696/5 arguments look like an offset/limit pair
        # for select_table — confirm against Database.select_table.
        for status in self.db.select_table(
                "weibo_status",
                "text like '%%《%%》%%' or \
                text like '%%http://%%' or \
                text like '%%https://%%'",
                12696, 5):
            sessions = self.nlpir.segment_weibo_status(status["text"])
            i = 0
            # i is the session index; it is also stored in the DB row below.
            while i < len(sessions):
                session = sessions[i]
                entities = []
                session_text = ""
                for segment in session:
                    # Segments look like "word/POS"; strip the POS suffix to
                    # rebuild the plain session text shown to the annotator.
                    session_text += segment.rsplit("/", 1)[0]
                    if self.nlpir.get_POS(segment) == "title":
                        # A 《...》 title is a candidate only if douban knows
                        # it as a movie or a book.
                        title = re.match(u"《(.*?)》/title", segment).group(1)
                        if self.db.select_douban_movie_by_title(title) or \
                                self.db.select_douban_book_by_title(title):
                            entities.append(segment)
                    elif self.nlpir.get_POS(segment) == "url":
                        match = re.search(u"(http.*)/url", segment)
                        if match is None:
                            # Malformed url segment — log and skip it.
                            print "###########%s###########" % segment
                            continue
                        url = match.group(1)
                        # Resolve the (short) URL through the local url table.
                        url = self.db.select_recbysns_url_by_short_url(url)
                        if url is None:
                            # Unknown short URL — log and skip it.
                            print "***********%s***********" % segment
                            continue
                        # Only video links count as entities.
                        if self.url.is_video_url(url["origin_url"]):
                            entities.append(segment)
                # positions tracks the 0-based occurrence index of each
                # entity within this session (duplicates get 0, 1, 2, ...).
                positions = {}
                for entity in entities:
                    if entity in positions:
                        position = positions[entity] + 1
                        positions[entity] = position
                    else:
                        position = 0
                        positions[entity] = position
                    # NOTE(review): in the collapsed original the prompt/insert
                    # sequence reads as part of this loop body (one prompt per
                    # entity occurrence) — confirm against annotation history.
                    print status["text"]
                    print session_text
                    print entity
                    print "Type:"
                    # NB: shadows the `type` builtin for the rest of the loop.
                    type = int(sys.stdin.readline())
                    print "Sentiment:"
                    sentiment = int(sys.stdin.readline())
                    self.db.query(
                        "INSERT INTO recbysns_entity( \
                        entity, status_id, session, position, \
                        type, score) \
                        VALUES(%s, %s, %s, %s, %s, %s)",
                        (entity, status["id"], i, position, type, sentiment))
                    self.db.commit()
                i = i + 1

    def is_blacklist_word(self, word):
        """Return True if word's POS tag matches any blacklist regex."""
        for pos_blacklist_regex in self.pos_blacklist_regexs:
            if re.search(pos_blacklist_regex, self.nlpir.get_POS(word)):
                return True
        return False
class Douban(object):
    """Thin wrapper around the douban API client with HTML-scraping fallbacks.

    Unknown attributes are proxied to the underlying DoubanClient via
    __getattr__; book_search/movie_search scrape the douban web search pages
    (using a hard-coded session cookie) when the API is not usable.
    """

    def __init__(self):
        self.client = \
            DoubanClient('028bc5c2b034fb1c07a35148109ef154',
                         '2f42bec4d6a403b4',
                         'http://rec.jjyao.me',
                         'douban_basic_common,shuo_basic_r,shuo_basic_w')
        #self.client.auth_with_code('39076cd663a27f06')
        # Imported here (not at module top) — presumably to avoid a circular
        # import between this module and recbysns; verify before moving it.
        from recbysns import recbysns
        self.recbysns = recbysns
        self.db = Database()
        # Hard-coded logged-in session cookie used by the scraping fallbacks.
        self.cookie = 'bid="feXKjUDU9TI"; ue="*****@*****.**"; ll="118159"; ct=y; viewed="10001392_4025068_3098478_3302642_1434275_21760836_10537640_10522595_6799191"; dbcl2="51087586:lfMA83G6vyc"; ck="hW6b"; __utma=30149280.1374624205.1362216082.1368805078.1368838128.94; __utmb=30149280.31.10.1368838128; __utmc=30149280; __utmz=30149280.1368798357.92.42.utmcsr=movie.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/subject/11529526/; __utmv=30149280.5108; __utma=81379588.1127088638.1342424134.1368807092.1368838128.118; __utmb=81379588.52.8.1368843871481; __utmc=81379588; __utmz=81379588.1368798357.115.34.utmcsr=movie.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/subject/11529526/'

    def __getattr__(self, attr):
        # Proxy unknown attribute access to the douban client through
        # _Callable (defined elsewhere in the project).
        return _Callable(self, attr, getattr(self.client, attr))

    def book_search(self, **kw):
        """
        Fallback for book.search

        Scrapes the first two result pages (start=0 and 15) of the douban
        book search for kw['q'] and returns {'books': [...], 'total': n}.
        """
        keyword = kw['q']
        books = []
        for i in [0, 15]:
            url = 'http://book.douban.com/subject_search?start=%d&search_text=%s&cat=1001' % \
                (i, keyword.encode('utf-8'))
            html = pq(url=url, parser='html',
                      opener=lambda url, **kw: urllib2.urlopen(
                          urllib2.Request(url, headers={
                              'User-Agent': self.recbysns.UA,
                              'Cookie': self.cookie}),
                          timeout=10).read().decode('utf8', 'ignore'))
            for book in html('#content .subject-list .subject-item'):
                book = pq(book)
                collect_info = book('.info .collect-info span')
                # I have not read the book
                # (skip entries whose collect state says otherwise)
                if len(collect_info) > 1:
                    state = pq(collect_info[1]).text()
                    if state == u'经常读':
                        continue
                book_id = int(re.match('http://book.douban.com/subject/(\d+)/',
                                       book('.pic .nbg').attr('href')).group(1))
                title = book('.info h2 a').attr('title')
                image = book('.pic .nbg img').attr('src')
                pub = book('.info .pub').text()
                # `rating` starts as a pyquery node and is rebound to the
                # result dict below — intentional reuse, do not "fix".
                rating = pq(book('.info .star'))
                if rating('.rating_nums'):
                    numRaters = int(re.match(u'\((\d+)',
                                             rating('.pl').text()).group(1))
                    average = rating('.rating_nums').text()
                    rating = {"numRaters": numRaters, "average": average}
                else:
                    rating = {"numRaters": 0, "average": 0}
                books.append({'id': book_id, 'title': title, 'image': image,
                              'pub': pub, 'rating': rating})
        return {'books': books, 'total': len(books)}

    def movie_search(self, **kw):
        """
        Fallback for movie.search

        Scrapes the first douban movie search result page for kw['q'] and
        returns {'movies': [...], 'total': n}; each movie carries a list of
        alternative titles split on '/'.
        """
        keyword = kw['q']
        url = 'http://movie.douban.com/subject_search?search_text=%s&cat=1002' % \
            keyword.encode('utf-8')
        html = pq(url=url, parser='html',
                  opener=lambda url, **kw: urllib2.urlopen(
                      urllib2.Request(url, headers={
                          'User-Agent': self.recbysns.UA,
                          'Cookie': self.cookie}),
                      timeout=10).read())
        movies = []
        for movie in html('#content table .item'):
            movie = pq(movie)
            # NB: `id` shadows the builtin for the rest of this loop body.
            id = int(re.match('http://movie.douban.com/subject/(\d+)/',
                              movie('.nbg').attr('href')).group(1))
            image = movie('.nbg img').attr('src')
            pub = movie('.pl2>.pl').text()
            # Same node-then-dict rebinding of `rating` as in book_search.
            rating = pq(movie('.pl2 .star'))
            if rating and rating('.rating_nums').text():
                numRaters = int(re.match(u'\((\d+)',
                                         rating('.pl').text()).group(1))
                average = rating('.rating_nums').text()
                rating = {"numRaters": numRaters, "average": average}
            else:
                rating = {"numRaters": 0, "average": 0}
            titles = [title.strip()
                      for title in movie('.pl2>a').text().split('/')]
            movies.append({'id': id, 'titles': titles, 'image': image,
                           'pub': pub, 'rating': rating})
        return {'movies': movies, 'total': len(movies)}

    def crawl_book(self):
        """Fill in missing metadata for douban_book rows via book search.

        For each book with no img_url, searches douban by exact title and,
        on a title match, updates id/img_url/pub/raters_num/score in place.
        Errors are printed and the loop continues with the next book.
        """
        #start = 63831 %《%》%
        for book in self.db.select_table('douban_book', 'img_url is null'):
            try:
                # `douban` is presumably a module-level instance of this
                # class — confirm where it is defined.
                result = douban.book.search(q=book['title'])
                if int(result['total']) == 0:
                    print book['title']
                    continue
                # NB: `result` is rebound from the response dict to each
                # individual book entry inside this loop.
                for result in result['books']:
                    if result['title'] == book['title']:
                        self.db.query("UPDATE douban_book SET id = %s, \
                                      img_url = %s, pub = %s, \
                                      raters_num = %s, score = %s \
                                      WHERE id = %s",
                                      (result['id'], result['image'],
                                       result['pub'],
                                       result['rating']['numRaters'],
                                       result['rating']['average'],
                                       book['id']))
                        self.db.commit()
                        break
            except Exception as e:
                # Best-effort crawl: report the failing title and keep going.
                print book['title']
                print e

    def crawl_movie(self):
        """Fill in missing metadata for douban_movie rows via movie search.

        Same shape as crawl_book, but a movie matches when its title appears
        in the search result's alternative-titles list.
        """
        for movie in self.db.select_table('douban_movie', 'img_url is null'):
            try:
                result = douban.movie.search(q=movie['title'])
                if int(result['total']) == 0:
                    print movie['title']
                    continue
                for result in result['movies']:
                    if movie['title'] in result['titles']:
                        self.db.query("UPDATE douban_movie SET id = %s, \
                                      img_url = %s, pub = %s, \
                                      raters_num = %s, score = %s \
                                      WHERE id = %s",
                                      (result['id'], result['image'],
                                       result['pub'],
                                       result['rating']['numRaters'],
                                       result['rating']['average'],
                                       movie['id']))
                        self.db.commit()
                        break
            except Exception as e:
                # Best-effort crawl: report the failing title and keep going.
                print movie['title']
                print e
# coding=utf8
import codecs

from db import Database

# Entity type codes stored in recbysns_entity.type.
NER_BOOK = 0
NER_MAGAZINE = 1
NER_MOVIE = 2
NER_VIDEO = 3
NER_MUSIC = 4
NER_TV_PROGRAM = 5
NER_OTHERS = -1

# Sentiment polarity codes stored in recbysns_entity.score.
# NOTE: "NETURAL" is a long-standing typo kept for backward compatibility.
SA_POSITIVE = 1
SA_NETURAL = 0
SA_NEGATIVE = -1

db = Database()

# Emoticon lexicons, keyed on the annotated score in weibo_emoticon.
SA_POSITIVE_EMOTICONS = set([emoticon['emoticon'] for emoticon in
                             db.select_table('weibo_emoticon', 'score = 1')])
SA_NEGATIVE_EMOTICONS = set([emoticon['emoticon'] for emoticon in
                             db.select_table('weibo_emoticon', 'score = -1')])


def _load_word_set(path):
    """Read a UTF-8 word list (one word per line) into a set of stripped words.

    Uses a context manager so the file handle is closed even if decoding
    fails (the original open/readlines/close sequence leaked on error).
    """
    with codecs.open(path, 'r', 'utf8') as f:
        return set([word.strip() for word in f])


# Sentiment word lexicons and negation words, one word per line on disk.
SA_POSITIVE_WORDS = _load_word_set('data/recbysns_positive_words.txt')
SA_NEGATIVE_WORDS = _load_word_set('data/recbysns_negative_words.txt')
SA_NEGATIVES = _load_word_set('data/recbysns_negatives.txt')