Пример #1
0
    def __init__(self):
        """Load the jieba user dictionaries and index apps and signed comments.

        Four lookup tables are built from the rows returned by the two
        database handlers.  Column meanings are taken from the attribute
        names and are not verifiable from this method alone:

        * ``__appId_rating_mean_dict``: row[0] -> row[2] of every app row
          (presumably app id -> mean rating).
        * ``__commentId_comment_dict``: row[1] -> the whole signed-comment row.
        * ``__userName_commentIds_dict``: row[8] -> list of row[1] values.
        * ``__content_count_dict``: row[3] -> number of rows sharing it.
        """
        for dictName in ("user_defined_dict.txt", "sogoupinyin_dict.txt"):
            jieba.load_userdict(os.path.join(config.DICT_PATH, dictName))

        signedHandler = SignedCommentsDbHandler()
        appHandler = AppleAppDbHandler()

        ratingByApp = self.__appId_rating_mean_dict = {}
        for appRow in appHandler.queryAll():
            ratingByApp[appRow[0]] = appRow[2]

        commentRows = signedHandler.queryAll()
        commentById = self.__commentId_comment_dict = {}
        idsByUser = self.__userName_commentIds_dict = {}
        countByContent = self.__content_count_dict = {}
        for row in commentRows:
            commentId, content, userName = row[1], row[3], row[8]
            commentById[commentId] = row
            # Group comment ids per user, creating the list on first sight.
            idsByUser.setdefault(userName, []).append(commentId)
            # Count how often the exact same content occurs.
            countByContent[content] = countByContent.get(content, 0) + 1
Пример #2
0
 def __init__(self):
     """Prepare the HTTP headers, the review-entry regex and the DB handlers.

     The compiled pattern captures, in order, the ``updated``, ``id``,
     ``title``, ``content``, ``im:voteSum``, ``im:voteCount``, ``im:rating``,
     ``im:version`` and ``name`` fields of one ``<entry>`` element.
     """
     userAgent = 'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/57.0'
     self.__headers = {'User-Agent': userAgent}

     # re.S lets ``.*?`` span the line breaks inside an entry.
     entryRegex = (
         '<entry>.*?<updated>(.*?)</updated>.*?<id>(.*?)</id>.*?'
         '<title>(.*?)</title>.*?<content.*?>(.*?)</content>.*?<im:voteSum>(.*?)</im:voteSum>.*?'
         '<im:voteCount>(.*?)</im:voteCount>.*?<im:rating>(.*?)</im:rating>.*?'
         '<im:version>(.*?)</im:version>.*?<name>(.*?)</name>'
     )
     self.__pattern = re.compile(entryRegex, re.M | re.S)

     self.__appCommentsHandler = AppCommentsDbHandler()
     self.__appleAppHandler = AppleAppDbHandler()
Пример #3
0
class WebToDb(object):
    """Download App Store customer reviews and store them through the DB handlers."""

    def __init__(self):
        """Prepare the HTTP headers, the review-entry regex and the DB handlers.

        The compiled pattern captures, in order, the ``updated``, ``id``,
        ``title``, ``content``, ``im:voteSum``, ``im:voteCount``,
        ``im:rating``, ``im:version`` and ``name`` fields of one ``<entry>``.
        """
        userAgent = 'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/57.0'
        self.__headers = {'User-Agent': userAgent}

        # re.S lets ``.*?`` span the line breaks inside an entry.
        entryRegex = (
            '<entry>.*?<updated>(.*?)</updated>.*?<id>(.*?)</id>.*?'
            '<title>(.*?)</title>.*?<content.*?>(.*?)</content>.*?<im:voteSum>(.*?)</im:voteSum>.*?'
            '<im:voteCount>(.*?)</im:voteCount>.*?<im:rating>(.*?)</im:rating>.*?'
            '<im:version>(.*?)</im:version>.*?<name>(.*?)</name>'
        )
        self.__pattern = re.compile(entryRegex, re.M | re.S)

        self.__appCommentsHandler = AppCommentsDbHandler()
        self.__appleAppHandler = AppleAppDbHandler()

    def executeAll(self):
        """Update the comments of every app returned by the app handler.

        Returns:
            The sum of the per-app values returned by ``executeByAppId``.
        """
        total = 0
        for app in self.__appleAppHandler.queryAll():
            print('\n正在获取苹果应用: %s-%s 的最新评论······' % (app[0], app[1]))
            added = self.executeByAppId(app[0])
            print('新增%d条评论' % added)
            total += added
        return total

    def executeByAppId(self, appId):
        """Fetch review pages 1 to 10 for ``appId`` and insert every entry.

        Paging stops early as soon as an insert raises ``UserWarning`` or the
        page's entries cannot be processed at all.  Any other per-entry
        insert error is printed and the entry is skipped.

        Returns:
            The difference of the comment handler's ``count()`` after and
            before the run.
        """
        store = self.__appCommentsHandler
        baseline = store.count()
        for page in range(1, 11):
            url = "".join((
                "https://itunes.apple.com/rss/customerreviews/page=", str(page),
                "/id=", str(appId), "/sortby=mostrecent/xml?l=en&&cc=cn"))
            comments = MySpider(url, self.__headers, self.__pattern).getMsgs()
            try:
                for entry in comments:
                    # Captured fields followed by app_id and an empty isSpam.
                    record = list(entry) + [appId, str('')]
                    try:
                        store.insertAppComment(record)
                    except Exception as err:
                        if isinstance(err, UserWarning):
                            # Escalate to the outer handler to stop paging.
                            raise UserWarning('Outdated comments!')
                        print(err)
            except Exception as err:
                # UserWarning derives from Exception, so it lands here too.
                print(err, "Update next app's comment!")
                break
        return store.count() - baseline
Пример #4
0
class XlsxToDb(object):
    """Import signed comments from xlsx workbooks into the database."""

    def __init__(self):
        """Create the signed-comments and apple-app handlers."""
        self.__signedCommentsDbHandler = SignedCommentsDbHandler()
        self.__appleAppHandler = AppleAppDbHandler()

    def executeOneApp(self, fileName):
        """Import the comments of one xlsx file into the database.

        The file is looked up in ``config.RESOURCES_PATH/signedComments`` and
        only its first sheet is read.  Cell (row 2, column 1) supplies the app
        id.  Every row from 2 to ``max_row`` contributes columns 2 to 11, with
        the app id inserted at index 9.

        Any error while locating or opening the workbook is reported with a
        "file not found" message and the method returns without importing.
        """
        print(fileName)
        try:
            workbookPath = os.path.join(
                config.RESOURCES_PATH, 'signedComments', fileName)
            book = load_workbook(filename=workbookPath)
            sheet = book.get_sheet_by_name(book.get_sheet_names()[0])
        except Exception:
            print("未找到" + fileName + "文件")
            return

        appId = sheet.cell(row=2, column=1).value
        handler = self.__signedCommentsDbHandler

        imported = 0
        for rowNo in range(2, sheet.max_row + 1):
            record = [sheet.cell(row=rowNo, column=colNo).value
                      for colNo in range(2, 12)]
            # The app id goes just before the last of the ten values read.
            record.insert(9, appId)
            handler.insertSignedComment(record)
            imported += 1

        # Report how many rows were handed to the database handler.
        print('Total:%d' % imported)

    def executeAllApp(self):
        """Import the xlsx file of every app known to the app handler.

        File names are built as ``<row[0]>_<row[1]>.xlsx`` from each app row.
        """
        apps = list(self.__appleAppHandler.queryAll())
        print(len(apps))
        for app in apps:
            self.executeOneApp("_".join((app[0], app[1])) + ".xlsx")
Пример #5
0
def doUpdate():
    """Recompute and store the mean rating of every app.

    For each app row returned by ``AppleAppDbHandler.queryAll()`` the signed
    comments of that app are fetched and the mean of column 7 of those rows
    is written back through ``updateRatingMean``.  Apps without comments get
    a mean of ``0.``.
    """
    signedHandler = SignedCommentsDbHandler()
    appHandler = AppleAppDbHandler()

    for app in appHandler.queryAll():
        # Read by index rather than unpacking the row: other callers read a
        # third column from the same query, and two-name unpacking would
        # raise ValueError on such rows.
        appId, appName = app[0], app[1]
        print(appId, appName)

        comments = signedHandler.queryCommentsByAppId(appId)
        if comments:
            # Start from 0. so the division below is always a float division.
            ratingMean = sum((comment[7] for comment in comments), 0.) / len(comments)
        else:
            ratingMean = 0.
        appHandler.updateRatingMean(appId, ratingMean)
Пример #6
0
class WebToDb_2(object):
    """Threaded comment updater: splits the apps into batches, one thread each."""

    def __init__(self):
        """Create the apple-app and app-comments handlers."""
        self.__appleAppHandler = AppleAppDbHandler()
        self.__appCommentsHandler = AppCommentsDbHandler()

    def executeAll(self):
        """Update all apps with one ``ThreadOfWebToDb`` per batch of app ids.

        The batch size is ``len(apps) // 10 + 1``, so at most ten threads are
        started.  All threads are started before any is joined.

        Returns:
            The sum of ``countOfNewComments()`` over all threads.
        """
        apps = self.__appleAppHandler.queryAll()
        appTotal = len(apps)
        print("sizeOfAppleApps", appTotal)

        batchSize = appTotal // 10 + 1
        appIds = [app[0] for app in apps]
        # Consecutive slices of batchSize ids; the last one may be shorter.
        batches = [appIds[start:start + batchSize]
                   for start in range(0, appTotal, batchSize)]

        workers = [ThreadOfWebToDb(batch) for batch in batches]
        print("number of threads:{}".format(len(workers)))
        for worker in workers:
            worker.start()

        newComments = 0
        for worker in workers:
            worker.join()
            newComments += worker.countOfNewComments()
        return newComments
Пример #7
0
 def __init__(self):
     """Create the signed-comments and apple-app handlers used by this object."""
     signedHandler = SignedCommentsDbHandler()
     self.__signedCommentsDbHandler = signedHandler
     appHandler = AppleAppDbHandler()
     self.__appleAppHandler = appHandler
Пример #8
0
 def __init__(self):
     """Create the apple-app and app-comments handlers used by this object."""
     appHandler = AppleAppDbHandler()
     self.__appleAppHandler = appHandler
     commentsHandler = AppCommentsDbHandler()
     self.__appCommentsHandler = commentsHandler
Пример #9
0
 def __init__(self):
     """Create the three handler objects used by this object.

     NOTE(review): each handler presumably opens its own database
     connection -- confirm against the handler classes.
     """
     commentsHandler = AppCommentsDbHandler()
     self.__appCommentsHandler = commentsHandler
     appHandler = AppleAppDbHandler()
     self.__appleAppHandler = appHandler
     signedHandler = SignedCommentsDbHandler()
     self.__signedCommentsDbHandler = signedHandler
     self.__signedCommentsDbHandler = SignedCommentsDbHandler()
Пример #10
0
class DbToXlsx(object):
    """Export comments to xlsx files below ``config.RESOURCES_PATH/output``."""

    def __init__(self):
        """Create the handler objects the exports read from.

        NOTE(review): each handler presumably opens its own database
        connection -- confirm against the handler classes.
        """
        self.__appCommentsHandler = AppCommentsDbHandler()
        self.__appleAppHandler = AppleAppDbHandler()
        self.__signedCommentsDbHandler = SignedCommentsDbHandler()

    @staticmethod
    def __timeStamp():
        """Return the local time as ``year_mon_day_hour_min_sec``, not zero-padded."""
        now = time.localtime(time.time())
        # The first six struct_time fields are year, month, day, hour, min, sec.
        return "_".join(str(field) for field in now[:6])

    def __exportEachApp(self, dirSuffix, commentsHandler):
        """Create a time-stamped output directory and export one file per app.

        One ``ThreadExportCommentOfOneApp`` is built per app row with the row's
        index, the row, the directory and ``commentsHandler``.  All threads
        are started and then joined.
        """
        outDir = os.path.join(config.RESOURCES_PATH, 'output',
                              self.__timeStamp() + dirSuffix)
        os.mkdir(outDir)
        # Rows are presumably (app_id, app_name) pairs -- TODO confirm.
        apps = self.__appleAppHandler.queryAll()

        workers = []
        for index, app in enumerate(apps):
            print(app)
            workers.append(
                ThreadExportCommentOfOneApp(index, app, outDir, commentsHandler))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    def exportAllComments(self):
        """Write every row of the app-comments handler into one xlsx file.

        The file is named after the current local time and gets a header row
        followed by one sheet row per comment row.
        """
        targetPath = os.path.join(config.RESOURCES_PATH, 'output',
                                  self.__timeStamp() + '.xlsx')
        rows = self.__appCommentsHandler.queryAll()

        book = Workbook()
        sheet = book.active
        head = ('app_id', 'time', 'comment_id', 'title', 'content', 'voteSum', 'voteCount',
                'rating', 'version', 'user_name', 'isSpam', 'app_name')
        for column, title in enumerate(head, start=1):
            sheet.cell(row=1, column=column).value = title
        for row in rows:
            sheet.append(row)
        book.save(filename=targetPath)

    def exportCommentsEachApp(self):
        """Export the app comments, one thread per app, into a new ``...xlsx`` directory."""
        self.__exportEachApp('xlsx', self.__appCommentsHandler)

    def exportSignedCommentsEachApp(self):
        """Export the signed comments, one thread per app, into a new ``...Signedxlsx`` directory."""
        self.__exportEachApp('Signedxlsx', self.__signedCommentsDbHandler)