Exemplo n.º 1
0
def fetch_keywords():
    """
    Fetch the site-level keywords and insert them into the database.

    Requests ``ym + '/girls/all/'`` through the shared session ``ss``,
    decodes the body as GBK and treats the ``title`` attribute of every
    ``<a>`` directly inside ``<div class="listap">`` as one keyword.

    A keyword with no matching ``Keyword`` record is saved with level
    ``KeywordLevel.SITE`` and a ``refer`` URL made of ``ym`` followed by the
    anchor's ``href``. A keyword that already exists is left untouched.
    Anchors whose ``title`` is missing or blank are skipped.

    Returns:
        list: the unicode keyword names found on the page, in document
        order, including the ones that already existed in the database.
    """
    ss.headers['Referer'] = ym
    r = ss.get(ym + '/girls/all/')
    # Force GBK decoding of the body before it is read through r.text.
    r.encoding = 'gbk'
    tree = html.fromstring(r.text)
    links = tree.xpath('//div[@class="listap"]/a')
    print('Found %s keywords' % len(links))
    ret = []
    for link in links:
        title = link.get('title')
        msg = 'Processing keyword %s' % title
        # get() yields None for an anchor without a title; stripping that
        # would raise and abort the whole import, and a blank title would
        # create a keyword with an empty name.
        text = title.strip() if title else ''
        if not text:
            print(msg + ', skipped as it has no title')
            continue
        # Use the same unicode value for the lookup and for the stored name.
        name = unicode(text)
        keyword = Keyword.find_one({'name': name})
        if keyword:
            msg += ', skipped as existing'
        else:
            keyword = Keyword()
            keyword.name = name
            keyword.level = KeywordLevel.SITE
            # Default to '' so a missing href does not put the text "None"
            # into the stored URL.
            keyword.refer = u'%s%s' % (ym, link.get('href', ''))
            keyword.save()
        print(msg)
        ret.append(name)

    return ret
Exemplo n.º 2
0
def analyze_keyword(k):
    """
    Analyze a site-level keyword: fetch its Baidu index and the related
    long-tail keywords.

    Looks up the ``Keyword`` named ``k`` and returns early, after printing a
    message, when it does not exist or when it already has a positive
    ``baiduIndex`` or ``baiduResult``. Otherwise the 5118 page
    ``http://www.5118.com/seo/words/<k>`` is fetched through the shared
    session ``ss`` and every ``<dl>`` row of the result list is read:

    * the row whose name equals ``k`` updates the keyword's own
      ``baiduIndex`` / ``baiduResult`` (and ``total`` when the list holds
      more than two rows);
    * any other row is saved as a new ``KeywordLevel.LONG_TAIL`` keyword
      whose ``parentId`` is the analyzed keyword, unless a keyword with that
      name already exists.

    Rows with class ``dl-word`` are ignored, and rows lacking the expected
    name / index / result cells are skipped. Non-numeric index or result
    values are stored as 0.

    After the page has been processed the function sleeps for a random
    5-15 seconds; the early returns above do not sleep.

    Args:
        k: name of the keyword to analyze.

    Returns:
        None.
    """
    keyword = Keyword.find_one({'name': k})
    if not keyword:
        print('Keyword %s does not exist' % k)
        return
    if keyword.baiduIndex > 0 or keyword.baiduResult > 0:
        print('Keyword %s is imported before' % k)
        return
    print('Try to analyze keyword %s/%s' % (keyword._id, k))

    ss.headers['Referer'] = 'http://www.5118.com/'
    t = ss.get('http://www.5118.com/seo/words/%s' % url_quote(k)).text
    tree = html.fromstring(t)
    dls = tree.xpath('//div[@class="Fn-ui-list dig-list"]/dl')
    # Counts every <dl>, including the 'dl-word' rows skipped below.
    total = len(dls)
    for dl in dls:
        if dl.get('class', '') == 'dl-word':
            continue

        # xpath() returns lists; indexing an empty one would raise
        # IndexError and abort the run after some rows were already saved.
        titles = dl.xpath('./dd[1]//a[1]/@title')
        index_texts = dl.xpath('./dd[2]/text()')
        result_texts = dl.xpath('./dd[3]/text()')
        if not (titles and index_texts and result_texts):
            print('Skipped a row without the expected name/index/result cells')
            continue

        name = unicode(titles[0].strip())
        baidu_index = index_texts[0].strip()
        baidu_result = result_texts[0].strip()
        # Anything that is not a plain number is treated as 0.
        if not baidu_index.isdigit():
            baidu_index = 0
        if not baidu_result.isdigit():
            baidu_result = 0
        print('Found keyword: %s/%s/%s' % (name, baidu_index, baidu_result))

        if name == k:
            keyword.baiduIndex = int(baidu_index)
            keyword.baiduResult = int(baidu_result)
            # NOTE(review): the -2 presumably discounts the header row and
            # the keyword's own row -- confirm against the page layout.
            if total > 2:
                keyword.total = total - 2
            keyword.save()
        else:
            if Keyword.count({'name': name}) > 0:
                print('This keyword already exists')
                continue
            long_tail = Keyword()
            long_tail.name = name
            long_tail.level = KeywordLevel.LONG_TAIL
            long_tail.parentId = keyword._id
            long_tail.baiduIndex = int(baidu_index)
            long_tail.baiduResult = int(baidu_result)
            long_tail.save()

    # Random 5-15 s pause after each fetched page (presumably to throttle
    # requests to 5118 -- confirm).
    time.sleep(random.randint(5, 15))
Exemplo n.º 3
0
def analyze_keyword(app, keyword):
    """
    Analyze a site-level keyword: fetch its Baidu index and the related
    long-tail keywords. The data is currently scraped from 5118.

    Fetches ``http://www.5118.com/seo/words/<keyword.name>`` through the
    shared session ``ss`` and reads every ``<dl>`` row of the result list:

    * the row whose name equals ``keyword.name`` updates ``keyword``'s own
      ``baiduIndex`` / ``baiduResult`` (and ``total`` when the list holds
      more than two rows);
    * any other row is looked up by name; when no such ``Keyword`` exists a
      new ``KeywordLevel.LONG_TAIL`` one is created with ``parentId`` set to
      ``keyword._id``. In both cases its ``baiduIndex`` / ``baiduResult``
      are overwritten and the record is saved.

    Rows with class ``dl-word`` are ignored, and rows lacking the expected
    name / index / result cells are skipped with a warning. Non-numeric
    index or result values are stored as 0.

    Args:
        app: object exposing a ``logger`` used for progress messages
            (presumably the Flask application -- confirm with the caller).
        keyword: the ``Keyword`` record to analyze; its ``_id`` and ``name``
            are read and its statistics fields are updated in place.

    Returns:
        None.
    """
    log = app.logger
    log.info('Try to analyze keyword %s/%s', keyword._id, keyword.name)

    ss.headers['Referer'] = 'http://www.5118.com/'
    t = ss.get('http://www.5118.com/seo/words/%s' % url_quote(keyword.name)).text
    tree = html.fromstring(t)
    dls = tree.xpath('//div[@class="Fn-ui-list dig-list"]/dl')
    # Counts every <dl>, including the 'dl-word' rows skipped below.
    total = len(dls)
    for dl in dls:
        if dl.get('class', '') == 'dl-word':
            continue

        # xpath() returns lists; indexing an empty one would raise
        # IndexError and abort the run after some rows were already saved.
        titles = dl.xpath('./dd[1]//a[1]/@title')
        index_texts = dl.xpath('./dd[2]/text()')
        result_texts = dl.xpath('./dd[3]/text()')
        if not (titles and index_texts and result_texts):
            log.warning('Skipped a row without the expected name/index/result cells')
            continue

        name = unicode(titles[0].strip())
        baidu_index = index_texts[0].strip()
        baidu_result = result_texts[0].strip()
        # Anything that is not a plain number is treated as 0.
        if not baidu_index.isdigit():
            baidu_index = 0
        if not baidu_result.isdigit():
            baidu_result = 0
        log.info('Found keyword: %s/%s/%s', name, baidu_index, baidu_result)

        if name == keyword.name:
            keyword.baiduIndex = int(baidu_index)
            keyword.baiduResult = int(baidu_result)
            # NOTE(review): the -2 presumably discounts the header row and
            # the keyword's own row -- confirm against the page layout.
            if total > 2:
                keyword.total = total - 2
            keyword.save()
        else:
            long_tail = Keyword.find_one({'name': name})
            if not long_tail:
                # First sighting: create it as a long-tail child of `keyword`.
                long_tail = Keyword()
                long_tail.name = name
                long_tail.level = KeywordLevel.LONG_TAIL
                long_tail.parentId = keyword._id

            # New or existing, refresh the statistics with the scraped values.
            long_tail.baiduIndex = int(baidu_index)
            long_tail.baiduResult = int(baidu_result)
            long_tail.save()