def __init__(self):
    """Create the collaborators and the static configuration of the instance.

    The database handle is created first; the NLPIR, Renren and URL helpers
    are then constructed with this instance as their only argument, so the
    order of the assignments below is kept exactly as written.
    """
    self.db = Database()
    self.nlpir = PyNLPIR(self)
    self.renren = Renren(self)
    self.url = URL(self)

    # User-Agent string stored for the helpers/requests that need one.
    self.UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv: 17.0) Gecko/17.0 Firefox/17.0"

    # Regular expressions applied (via re.search) to a word's POS tag.
    # NOTE(review): the tags look like NLPIR/ICTCLAS part-of-speech codes
    # plus a few custom ones (emoticon, title, session) -- confirm.
    blacklist = [
        "^emoticon$",
        "^title$",
        "^ude.",
        "^w.*",
        "^vshi",
        "^vyou",
        "^p.*",
        "^ule",
        "^m.*",
        "^cc",
        "^session$",
    ]
    self.pos_blacklist_regexs = blacklist
class RecBySNS(object):
    """Glue object tying together the database, NLPIR, Renren and URL helpers.

    Besides holding the collaborators it offers an interactive labelling
    routine for entities found in weibo statuses and a POS-based word
    blacklist check.
    """

    def __init__(self):
        """Create the collaborators and the static configuration.

        The helpers receive this instance as their only argument, so the
        construction order below is significant and must not be shuffled.
        """
        self.db = Database()
        self.nlpir = PyNLPIR(self)
        self.renren = Renren(self)
        self.url = URL(self)
        self.UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv: 17.0) Gecko/17.0 Firefox/17.0"
        # Regular expressions matched (re.search) against a word's POS tag.
        # NOTE(review): presumably NLPIR/ICTCLAS tag names plus custom tags
        # (emoticon, title, session) -- confirm against the segmenter.
        self.pos_blacklist_regexs = [
            "^emoticon$",
            "^title$",
            "^ude.",
            "^w.*",
            "^vshi",
            "^vyou",
            "^p.*",
            "^ule",
            "^m.*",
            "^cc",
            "^session$",
        ]

    def assign_recbysns_entity_sentiment(self):
        """Interactively label the entities found in selected weibo statuses.

        For every selected status the text is segmented into sessions. Each
        session is scanned for entity segments (known Douban movie/book
        titles and short URLs that resolve to a video page). For every entity
        the status text, the session text and the entity are printed, then a
        type and a sentiment score are read as integers from standard input
        and stored in the ``recbysns_entity`` table. Inserts are committed
        once per session.

        Raises:
            ValueError: if a line read from standard input is not an integer.
        """
        # NOTE(review): the trailing 12696, 5 are passed straight through to
        # Database.select_table; presumably an offset/limit pair used to
        # resume labelling -- confirm against that method.
        statuses = self.db.select_table(
            "weibo_status",
            "text like '%%《%%》%%' or "
            "text like '%%http://%%' or "
            "text like '%%https://%%'",
            12696,
            5,
        )
        for status in statuses:
            sessions = self.nlpir.segment_weibo_status(status["text"])
            for session_index, session in enumerate(sessions):
                session_text, entities = self._collect_session_entities(session)
                # Zero-based occurrence counter per entity, so that repeated
                # mentions inside one session get distinct positions.
                positions = {}
                for entity in entities:
                    position = positions.get(entity, -1) + 1
                    positions[entity] = position
                    print(status["text"])
                    print(session_text)
                    print(entity)
                    print("Type:")
                    entity_type = int(sys.stdin.readline())
                    print("Sentiment:")
                    sentiment = int(sys.stdin.readline())
                    self.db.query(
                        "INSERT INTO recbysns_entity( "
                        "entity, status_id, session, position, "
                        "type, score) "
                        "VALUES(%s, %s, %s, %s, %s, %s)",
                        (entity, status["id"], session_index, position,
                         entity_type, sentiment),
                    )
                self.db.commit()

    def _collect_session_entities(self, session):
        """Return ``(session_text, entities)`` for one segmented session.

        ``session_text`` is the concatenation of every segment with its
        trailing ``/POS`` suffix removed; ``entities`` lists, in order, the
        raw segments recognised as a known title or a video URL.
        """
        words = []
        entities = []
        for segment in session:
            words.append(segment.rsplit("/", 1)[0])
            pos = self.nlpir.get_POS(segment)
            if pos == "title":
                if self._is_known_title_segment(segment):
                    entities.append(segment)
            elif pos == "url":
                if self._is_video_url_segment(segment):
                    entities.append(segment)
        return "".join(words), entities

    def _is_known_title_segment(self, segment):
        """Tell whether a ``title`` segment names a stored Douban movie or book."""
        match = re.match(u"《(.*?)》/title", segment)
        if match is None:
            # A segment tagged as title without the expected brackets used to
            # crash on ``None.group``; report it like an unparsable URL.
            print("###########%s###########" % segment)
            return False
        title = match.group(1)
        return bool(
            self.db.select_douban_movie_by_title(title)
            or self.db.select_douban_book_by_title(title)
        )

    def _is_video_url_segment(self, segment):
        """Tell whether a ``url`` segment resolves to a stored video URL."""
        match = re.search(u"(http.*)/url", segment)
        if match is None:
            print("###########%s###########" % segment)
            return False
        record = self.db.select_recbysns_url_by_short_url(match.group(1))
        if record is None:
            # Short URL that is not present in the database.
            print("***********%s***********" % segment)
            return False
        return bool(self.url.is_video_url(record["origin_url"]))

    def is_blacklist_word(self, word):
        """Return True when the POS tag of ``word`` matches a blacklist regex.

        The tag is obtained from ``self.nlpir.get_POS(word)`` and tested with
        ``re.search`` against every pattern in ``self.pos_blacklist_regexs``.
        """
        pos = self.nlpir.get_POS(word)
        return any(
            re.search(pattern, pos) for pattern in self.pos_blacklist_regexs
        )