예제 #1
0
def youkuRelatedVideos():
    """Scrape up to 6 related videos for *searchStrAdd* from soku.com and
    store them as JSON in jimi_radar_video.related_video_youku.

    Relies on module-level globals: searchStrAdd (query string), id (row id),
    plus the ScrabHelper / SqlHelper project helpers.
    """
    limit = 6

    url = 'http://www.soku.com/search_video/q_' + searchStrAdd
    soup = ScrabHelper.getSoupFromURL(url, {}, 'utf8')

    # Take at most `limit` result links for now.
    links = soup.find_all('div', 'sk-vlist')[0].find_all('div',
                                                         'v-link',
                                                         limit=limit)

    # Iterate over the links actually found instead of range(limit): the page
    # may return fewer than `limit` results, which made the original loop
    # raise IndexError.
    resArr = [{'vid': link.find('a').attrs['_log_vid'],
               'title': link.find('a').attrs['title']}
              for link in links]

    resJson = {'data': resArr}

    # NOTE(review): string-built SQL is injection-prone if a scraped title
    # contains a quote; prefer a parameterized query if SqlHelper supports it.
    sql = "update jimi_radar_video set related_video_youku = '%s' where id=%d" % (
        json.dumps(resJson, ensure_ascii=False, encoding='UTF-8'), id)
    SqlHelper.ExecuteNonQuery(sql)


# youkuRelatedVideos()
예제 #2
0
def analysisAndSaveOnce(resultsAll, dict, fieldName):
    # 结果字典
    resultDictionary = {}
    for row in resultsAll:
        for word in dict:
            wordEncode = word.encode('utf8')
            resultHTML = row[0].encode('utf8')  # 不知道为什么
            pattern1 = re.compile(wordEncode)
            res = re.findall(pattern1, resultHTML)
            matchLength = res.__len__()

            if matchLength != 0:
                # print matchLength
                # print wordEncode
                if resultDictionary.get(wordEncode) == None:
                    resultDictionary[wordEncode] = matchLength
                else:
                    resultDictionary[wordEncode] += matchLength

    jsonStr = json.dumps(resultDictionary,
                         ensure_ascii=False,
                         encoding='UTF-8')

    sql = "update jimi_radar_dimensionmode set %s ='%s' where id=%d" % (
        fieldName, jsonStr, dmId)
    insertNum = SqlHelper.ExecuteNonQuery(sql)
    print jsonStr
    print sql
    print insertNum
예제 #3
0
def doSave(json):
    """Insert one scrape result row into jimi_radar_result.

    json -- dict-like payload (parameter name shadows the json module inside
    this function); expected keys: scrabId (int), processed_clue (str),
    scrab_result, data_time.
    """
    values = (json.get('scrabId'),
              json.get('processed_clue'),
              json.get('scrab_result'),
              json.get('data_time'),
              DateHelper.getDateNowStr())

    rowCount = SqlHelper.ExecuteNonQuery(
        "insert into jimi_radar_result (scrab_id,processed_clue,scrab_result,data_time,insert_time) values('%d','%s','%s','%s','%s')"
        % values)
예제 #4
0
def jd():
    url = 'https://sclub.jd.com/comment/productPageComments.action?productId=256035&score=0&sortType=3&page=0&pageSize=10&isShadowSku=0&callback=fetchJSON_comment98vv3934'
    jsonp = ScrabHelper.getHTMLFromURL(url, {}, 'gbk')
    loadJson = ScrabHelper.loads_jsonp(jsonp)
    hotCommentTagStatistics = loadJson['hotCommentTagStatistics']
    resArr = {}
    for stat in hotCommentTagStatistics:
        resArr[stat['name']] = stat['count']

    resJson = {'data': resArr}

    sql = "update jimi_radar_evaluate set jdyinxiang = '%s' where id=%d" % (
        json.dumps(resJson, ensure_ascii=False, encoding='UTF-8'), id)
    print sql
    SqlHelper.ExecuteNonQuery(sql)
예제 #5
0
def weiboYinXiang():
    resArr = {}
    url = 'http://s.weibo.com/impress?key=' + searchStr + '&cate=whole&isswitch=1&refer=tag&cuid=3235723984'
    soup = ScrabHelper.getSoupFromURL(url, {}, 'utf8')
    secs = soup.find_all('div', 'impress_label')[0].find_all('section')
    for sec in secs:
        aas = sec.find_all('a')
        length = len(aas)

        for a in aas:
            text = a.get_text()
            resArr[text] = (5 - length)

    resJson = {'data': resArr}

    sql = "update jimi_radar_evaluate set weiboyinxiang = '%s' where id=%d" % (
        json.dumps(resJson, ensure_ascii=False, encoding='UTF-8'), id)
    print sql
    SqlHelper.ExecuteNonQuery(sql)
예제 #6
0
def weixinYear1():
    """Store the one-year Sogou/WeChat search-result count for *searchStr*
    into jimi_radar_index.weixin_year1 (uses the global `id`)."""
    now = datetime.datetime.now()
    yearAgo = now - datetime.timedelta(days=365)
    # Keep only the YYYY-MM-DD part of each timestamp.
    dateNow = str(now).split(' ')[0]
    dateYearAgo = str(yearAgo).split(' ')[0]

    url = ('http://weixin.sogou.com/weixin?type=2&ie=utf8&query=' + searchStr +
           '&tsn=5&ft=' + dateYearAgo + '&et=' + dateNow +
           '&interation=null&wxid=&usip=null&from=tool')
    soup = ScrabHelper.getSoupFromURL(url, {}, 'utf8')

    # e.g. "...found about 403,000 results" -- grab the first comma-grouped
    # number from the result-count element.
    text = soup.find_all('div', 'mun')[0].get_text()
    num = numMinusComma(re.compile(r'[\d,]+').findall(text)[0])

    sql = "update jimi_radar_index set %s = '%s' where id=%d" % (
        'weixin_year1', num, id)
    SqlHelper.ExecuteNonQuery(sql)


# weixinYear1()
예제 #7
0
def baiduYear1():
    dateNow = datetime.datetime.now()
    dateYearAgo = dateNow + datetime.timedelta(days=-365)
    intNow = str(DateHelper.getDateInt(dateNow))
    yearAgoNow = str(DateHelper.getDateInt(dateYearAgo))

    print intNow
    print yearAgoNow

    stf = 'stf=' + yearAgoNow + ',' + intNow + '|stftype=1'
    stf = urllib.quote(stf)
    url = "http://www.baidu.com/s?wd=" + searchStr + "&gpc=" + stf
    data = requests.get(url).text
    soup = ScrabHelper.getSoupFromHtml(data)
    text = soup.find_all('div',
                         'nums')[0].get_text()  # 搜索工具百度为您找到相关结果约403,000个

    pat = re.compile(r'[\d,]+')
    num = pat.findall(text)[0]
    num = numMinusComma(num)

    sql = "update jimi_radar_index set %s = '%s' where id=%d" % ('baidu_year1',
                                                                 num, id)
    SqlHelper.ExecuteNonQuery(sql)
예제 #8
0
import urllib
import datetime
import re
import requests
from sqlHelper import SqlHelper
from scrabHelper import ScrabHelper
from dateHelper import DateHelper
from selenium import webdriver
import time

# --- module bootstrap: runs DB queries at import time -----------------------
# Dimension-mode row id used by all index/evaluate updates below.
dmId = 1
# Candidate keywords; only the first one is actually searched.
searchArr = ['小黑瓶', '兰蔻小黑瓶', '兰蔻 小黑瓶']
searchStr = searchArr[0]

# Ensure a jimi_radar_index row exists for (dmId, searchStr) ...
count = SqlHelper.ExecuteScalar(
    "select count(*) from jimi_radar_index where dm_id=%d and keyword='%s'" %
    (dmId, searchStr))

if count == 0:
    SqlHelper.ExecuteScalar(
        "insert into jimi_radar_index (dm_id,keyword,ctime) values (%d,'%s','%s')"
        % (dmId, searchStr, DateHelper.getDateNowStr()))

# ... then fetch its primary key. NOTE: deliberately shadows the builtin
# `id`; every scraper function in this module reads this global as the row id.
id = SqlHelper.ExecuteScalar(
    "select id from jimi_radar_index where dm_id=%d and keyword='%s'" %
    (dmId, searchStr))


def numMinusComma(num):
    """Return *num* with every comma separator removed, e.g. '403,000' -> '403000'."""
    return num.replace(',', '')
예제 #9
0
def tiaomao():
    url = 'https://rate.tmall.com/listTagClouds.htm?itemId=43165859354&isAll=true&isInner=true&t=1482481000827&callback=jsonp1575'
    jsonp = ScrabHelper.getHTMLFromURL(url, {}, 'gbk')

    # json = {
    # "tags": {
    # "dimenSum": 9,
    # "innerTagCloudList": "",
    # "rateSum": 177,
    # "structuredRateStatisticList": [],
    #     "tagClouds": [{"count": 39, "id": "10120", "posi": true, "tag": "服务好", "weight": 0}, {
    #         "count": 35,
    #         "id": "620",
    #         "posi": true,
    #         "tag": "质量好",
    #         "weight": 0
    #     }, {"count": 33, "id": "420", "posi": true, "tag": "物流快", "weight": 0}, {
    #         "count": 33,
    #         "id": "1020",
    #         "posi": true,
    #         "tag": "正品",
    #         "weight": 0
    #     }, {"count": 13, "id": "824", "posi": true, "tag": "保湿滋润", "weight": 0}, {
    #         "count": 9,
    #         "id": "4624",
    #         "posi": true,
    #         "tag": "吸收效果不错",
    #         "weight": 0
    #     }, {"count": 8, "id": "2524", "posi": true, "tag": "清洁度强", "weight": 0}, {
    #         "count": 4,
    #         "id": "124",
    #         "posi": true,
    #         "tag": "味道好闻",
    #         "weight": 0
    #     }, {"count": 3, "id": "1224", "posi": true, "tag": "控油", "weight": 0}],
    #     "userTagCloudList": [{
    #         "dimenName": "年龄",
    #         "id": 26,
    #         "tagScaleList": [{"count": 2, "index": 0, "proportion": 0.0, "scale": "18岁以下"}, {
    #             "count": 491,
    #             "index": 1,
    #             "proportion": 16.0,
    #             "scale": "18-24"
    #         }, {"count": 1011, "index": 2, "proportion": 33.0, "scale": "25-29"}, {
    #             "count": 1080,
    #             "index": 3,
    #             "proportion": 35.0,
    #             "scale": "30-40"
    #         }, {"count": 484, "index": 4, "proportion": 16.0, "scale": "40岁以上"}],
    #         "total": 3068
    #     }]
    # }
    # }
    loadjson = ScrabHelper.loads_jsonp(jsonp)

    # ******************************************************
    tagClouds = loadjson['tags']['tagClouds']
    resArr = {}
    for tagCloud in tagClouds:
        resArr[tagCloud['tag']] = tagCloud['count']

    resJson = {'data': resArr}

    sql = "update jimi_radar_evaluate set tianmaoyinxiang = '%s' where id=%d" % (
        json.dumps(resJson, ensure_ascii=False, encoding='UTF-8'), id)
    print sql
    SqlHelper.ExecuteNonQuery(sql)

    # ******************************************************
    tagScaleList = loadjson['tags']['userTagCloudList'][0]['tagScaleList']
    resArr = {}
    for tagScale in tagScaleList:
        resArr[tagScale['scale']] = tagScale['count']

    resJson = {'data': resArr}

    sql = "update jimi_radar_evaluate set tianmaoAge = '%s' where id=%d" % (
        json.dumps(resJson, ensure_ascii=False, encoding='UTF-8'), id)
    print sql
    SqlHelper.ExecuteNonQuery(sql)
예제 #10
0
def doSave(numName, num, playName, play):
    sql = "update jimi_radar_video set %s = '%s' , %s = '%s' where id=%d" % (
        numName, num, playName, play, id)
    print sql
    SqlHelper.ExecuteNonQuery(sql)
예제 #11
0
                    resultDictionary[wordEncode] += matchLength

    jsonStr = json.dumps(resultDictionary,
                         ensure_ascii=False,
                         encoding='UTF-8')

    sql = "update jimi_radar_dimensionmode set %s ='%s' where id=%d" % (
        fieldName, jsonStr, dmId)
    insertNum = SqlHelper.ExecuteNonQuery(sql)
    print jsonStr
    print sql
    print insertNum


# Which sites the current dimension-mode (dmId) has scraped.
scrab_json = SqlHelper.ExecuteScalar(
    "select scrab_json from jimi_radar_dimensionmode where id=" + str(dmId))
# Expected shape of the stored JSON:
# {"data": [{"scrabId": 1,
# "clue": 'http://cosme.pclady.com.cn/product/29669.html'
# }, {
# "scrabId": 2,
# "clue": 'http://product.kimiss.com/product/80696/'
# }]}
scrab_json = json.loads(scrab_json)
scrabArray = scrab_json['data']

# Dictionary (category) rows; each row carries a cate_json keyword list.
dtDict = SqlHelper.ExecuteDataTable(
    "select cate_json from jimi_radar_dict_cate where id<10 order by id")

# Keyword lists per category, e.g. {"data":["浓稠", "粘稠", "有点稠", "稠稠", "粘粘", "黏腻", "黏黏", "厚实", "厚重", "比较厚", "丰盈"]}
dictKeyWordArray = []