示例#1
0
 def __init__(self):
     """Instantiate the Database, PyNLPIR, Renren and URL helpers and set defaults.

     Besides the helper objects, this stores the User-Agent string in
     ``self.UA`` and a list of regex source strings in
     ``self.pos_blacklist_regexs``.
     """
     # Pure literals are staged first; building them has no side effects.
     user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv: 17.0) Gecko/17.0 Firefox/17.0"
     blacklisted_pos_patterns = [
         "^emoticon$",
         "^title$",
         "^ude.",
         "^w.*",
         "^vshi",
         "^vyou",
         "^p.*",
         "^ule",
         "^m.*",
         "^cc",
         "^session$",
     ]

     # The helpers are created in a fixed order; three of them receive this
     # instance, so the order of the attribute assignments is kept stable.
     self.db = Database()
     self.nlpir = PyNLPIR(self)
     self.renren = Renren(self)
     self.url = URL(self)
     self.UA = user_agent
     self.pos_blacklist_regexs = blacklisted_pos_patterns
示例#2
0
class RecBySNS(object):
    """Interactive labelling of entities mentioned in Weibo statuses.

    Statuses are segmented into sessions by the NLPIR wrapper.  Segments
    tagged ``title`` that are known Douban movie or book titles, and segments
    tagged ``url`` that resolve to a video URL, are treated as entities.  For
    each entity the operator types a type and a sentiment on stdin, and the
    answer is stored in the ``recbysns_entity`` table.
    """

    def __init__(self):
        """Instantiate the helper objects and the POS-tag blacklist."""
        self.db = Database()
        self.nlpir = PyNLPIR(self)
        self.renren = Renren(self)
        self.url = URL(self)
        self.UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv: 17.0) Gecko/17.0 Firefox/17.0"
        # Regexes searched against a word's POS tag by is_blacklist_word().
        self.pos_blacklist_regexs = [
            "^emoticon$",
            "^title$",
            "^ude.",
            "^w.*",
            "^vshi",
            "^vyou",
            "^p.*",
            "^ule",
            "^m.*",
            "^cc",
            "^session$",
        ]

    def assign_recbysns_entity_sentiment(self):
        """Prompt the operator to label every entity in the selected statuses.

        Rows are read from ``weibo_status`` where the text contains a
        ``《...》`` title or an http/https link.  Each status is segmented into
        sessions, and every entity found in a session is labelled and
        inserted into ``recbysns_entity`` (one commit per entity).

        Raises:
            ValueError: if the operator enters something ``int()`` rejects.
        """
        # NOTE(review): 12696 and 5 are presumably the row offset and row
        # count of the batch to annotate -- confirm against
        # Database.select_table.
        statuses = self.db.select_table(
            "weibo_status",
            "text like '%%《%%》%%' or \
                                            text like '%%http://%%' or \
                                            text like '%%https://%%'",
            12696,
            5,
        )
        for status in statuses:
            sessions = self.nlpir.segment_weibo_status(status["text"])
            for session_index, session in enumerate(sessions):
                entities, session_text = self._collect_session_entities(session)
                self._annotate_entities(
                    status, session_index, session_text, entities
                )

    def _collect_session_entities(self, session):
        """Return ``(entities, session_text)`` for one segmented session.

        ``session_text`` is the concatenation of every segment with its
        trailing ``/POS`` suffix removed.  ``entities`` lists, in order and
        with duplicates, the segments that are known titles or video URLs.
        Malformed or unresolvable segments are reported on stdout and skipped.
        """
        entities = []
        text_parts = []
        for segment in session:
            text_parts.append(segment.rsplit("/", 1)[0])
            pos = self.nlpir.get_POS(segment)
            if pos == "title":
                match = re.match(u"《(.*?)》/title", segment)
                if match is None:
                    # A "title" tag without the expected 《...》 shape used to
                    # crash with AttributeError; report and skip it the same
                    # way malformed URL segments are handled below.
                    print("###########%s###########" % segment)
                    continue
                title = match.group(1)
                if self.db.select_douban_movie_by_title(title) or self.db.select_douban_book_by_title(title):
                    entities.append(segment)
            elif pos == "url":
                match = re.search(u"(http.*)/url", segment)
                if match is None:
                    print("###########%s###########" % segment)
                    continue
                url = self.db.select_recbysns_url_by_short_url(match.group(1))
                if url is None:
                    # The short URL has no stored record to resolve against.
                    print("***********%s***********" % segment)
                    continue
                if self.url.is_video_url(url["origin_url"]):
                    entities.append(segment)
        return entities, "".join(text_parts)

    def _annotate_entities(self, status, session_index, session_text, entities):
        """Ask the operator for type and sentiment of each entity and store them."""
        # Maps an entity to the index of its latest occurrence in this
        # session, so repeated mentions get positions 0, 1, 2, ...
        positions = {}
        for entity in entities:
            position = positions.get(entity, -1) + 1
            positions[entity] = position

            # Single-argument print(...) behaves the same as the Python 2
            # print statement, so the prompts are unchanged.
            print(status["text"])
            print(session_text)
            print(entity)
            print("Type:")
            entity_type = int(sys.stdin.readline())
            print("Sentiment:")
            sentiment = int(sys.stdin.readline())
            # The sentiment typed by the operator is stored in the "score" column.
            self.db.query(
                "INSERT INTO recbysns_entity( \
                                   entity, status_id, session, position, \
                                   type, score) \
                                   VALUES(%s, %s, %s, %s, %s, %s)",
                (entity, status["id"], session_index, position, entity_type, sentiment),
            )
            # Commit per entity so finished labels survive an aborted run.
            self.db.commit()

    def is_blacklist_word(self, word):
        """Return True if the POS tag of ``word`` matches any blacklist regex."""
        pos = self.nlpir.get_POS(word)
        return any(
            re.search(pattern, pos) for pattern in self.pos_blacklist_regexs
        )