def fetch_keywords():
    """Fetch the site-level keywords and insert the new ones into the database.

    Requests the ``/girls/all/`` page below ``ym`` (decoded as GBK), reads the
    ``title`` attribute of every ``<a>`` directly inside ``div.listap`` and
    saves each title that is not stored yet as a ``KeywordLevel.SITE``
    keyword whose ``refer`` is ``ym`` plus the link's ``href``.

    Links without a usable ``title`` are reported and skipped.

    Returns:
        list: The names (``unicode``) of all keywords found on the page, in
        page order, including those that already existed in the database.
    """
    # `ss` is presumably a shared requests session -- confirm at its definition.
    ss.headers['Referer'] = ym
    r = ss.get(ym + '/girls/all/')
    r.encoding = 'gbk'
    tree = html.fromstring(r.text)
    links = tree.xpath('//div[@class="listap"]/a')
    print('Found %s keywords' % len(links))

    ret = []
    for link in links:
        title = link.get('title')
        # An anchor may lack a title (get() returns None) or carry a blank
        # one; neither yields a keyword, and calling .strip() on None would
        # abort the whole import.
        if title is None or not title.strip():
            print('Skipping link without title: %s' % link.get('href'))
            continue

        msg = 'Processing keyword %s' % title
        text = title.strip()
        name = unicode(text)
        keyword = Keyword.find_one({'name': text})
        if keyword:
            msg += ', skipped as existing'
        else:
            keyword = Keyword()
            keyword.name = name
            keyword.level = KeywordLevel.SITE
            keyword.refer = u'%s%s' % (ym, link.get('href'))
            keyword.save()
        print(msg)
        ret.append(name)
    return ret
def analyze_keyword(k):
    """Analyze a site-level keyword: get its Baidu index and long-tail keywords.

    Looks up the stored keyword named ``k`` and scrapes its word page on
    5118.com. The row whose title equals ``k`` updates the keyword's own
    ``baiduIndex`` / ``baiduResult`` (and ``total``); every other row is
    inserted as a ``KeywordLevel.LONG_TAIL`` child unless a keyword with that
    name already exists.

    Nothing is fetched when the keyword is unknown or already has a positive
    ``baiduIndex`` or ``baiduResult``.

    Args:
        k: Name of the keyword to analyze.

    Returns:
        None.
    """
    keyword = Keyword.find_one({'name': k})
    if not keyword:
        print('Keyword %s does not exist' % k)
        return
    if keyword.baiduIndex > 0 or keyword.baiduResult > 0:
        print('Keyword %s is imported before' % k)
        return

    print('Try to analyze keyword %s/%s' % (keyword._id, k))
    ss.headers['Referer'] = 'http://www.5118.com/'
    t = ss.get('http://www.5118.com/seo/words/%s' % url_quote(k)).text
    tree = html.fromstring(t)
    dls = tree.xpath('//div[@class="Fn-ui-list dig-list"]/dl')
    total = len(dls)

    for dl in dls:
        if dl.get('class', '') == 'dl-word':
            continue

        # xpath() returns a (possibly empty) list; indexing [0] blindly would
        # raise IndexError on a row that lacks a cell and abort the rest of
        # the page, so such rows are skipped instead.
        titles = dl.xpath('./dd[1]//a[1]/@title')
        index_texts = dl.xpath('./dd[2]/text()')
        result_texts = dl.xpath('./dd[3]/text()')
        if not (titles and index_texts and result_texts):
            print('Skipping malformed keyword row')
            continue

        name = unicode(titles[0].strip())
        index_text = index_texts[0].strip()
        result_text = result_texts[0].strip()
        # Non-numeric cells count as 0.
        baidu_index = int(index_text) if index_text.isdigit() else 0
        baidu_result = int(result_text) if result_text.isdigit() else 0
        print('Found keyword: %s/%s/%s' % (name, baidu_index, baidu_result))

        if name == k:
            keyword.baiduIndex = baidu_index
            keyword.baiduResult = baidu_result
            if total > 2:
                # Presumably discounts two non-long-tail rows (e.g. a header
                # row and the keyword's own row) -- TODO confirm.
                keyword.total = total - 2
            keyword.save()
        else:
            if Keyword.count({'name': name}) > 0:
                print('This keyword already exists')
                continue
            long_tail = Keyword()
            long_tail.name = name
            long_tail.level = KeywordLevel.LONG_TAIL
            long_tail.parentId = keyword._id
            long_tail.baiduIndex = baidu_index
            long_tail.baiduResult = baidu_result
            long_tail.save()

    # Random 5-15 s pause after each analyzed keyword; presumably throttles
    # consecutive requests to 5118.com.
    # NOTE(review): placed once per call, after the row loop -- confirm.
    time.sleep(random.randint(5, 15))
def analyze_keyword(app, keyword):
    """Analyze a site-level keyword: get its Baidu index and long-tail keywords.

    The data is currently scraped from 5118.com. The row whose title equals
    ``keyword.name`` updates the keyword's own ``baiduIndex`` /
    ``baiduResult`` (and ``total``); every other row is stored as a
    ``KeywordLevel.LONG_TAIL`` keyword, created on first sight and refreshed
    with the latest numbers afterwards.

    Args:
        app: Object exposing ``logger`` (``app.logger.info`` is used for
            progress messages).
        keyword: Stored ``Keyword`` to analyze; its ``_id`` and ``name`` are
            read and its index fields are updated and saved.

    Returns:
        None.
    """
    app.logger.info('Try to analyze keyword %s/%s', keyword._id, keyword.name)
    ss.headers['Referer'] = 'http://www.5118.com/'
    t = ss.get('http://www.5118.com/seo/words/%s' % url_quote(keyword.name)).text
    tree = html.fromstring(t)
    dls = tree.xpath('//div[@class="Fn-ui-list dig-list"]/dl')
    total = len(dls)

    for dl in dls:
        if dl.get('class', '') == 'dl-word':
            continue

        # xpath() returns a (possibly empty) list; indexing [0] blindly would
        # raise IndexError on a row that lacks a cell and abort the rest of
        # the page, so such rows are skipped instead.
        titles = dl.xpath('./dd[1]//a[1]/@title')
        index_texts = dl.xpath('./dd[2]/text()')
        result_texts = dl.xpath('./dd[3]/text()')
        if not (titles and index_texts and result_texts):
            app.logger.info('Skipping malformed keyword row')
            continue

        name = unicode(titles[0].strip())
        index_text = index_texts[0].strip()
        result_text = result_texts[0].strip()
        # Non-numeric cells count as 0.
        baidu_index = int(index_text) if index_text.isdigit() else 0
        baidu_result = int(result_text) if result_text.isdigit() else 0
        app.logger.info('Found keyword: %s/%s/%s',
                        name, baidu_index, baidu_result)

        if name == keyword.name:
            keyword.baiduIndex = baidu_index
            keyword.baiduResult = baidu_result
            if total > 2:
                # Presumably discounts two non-long-tail rows (e.g. a header
                # row and the keyword's own row) -- TODO confirm.
                keyword.total = total - 2
            keyword.save()
        else:
            long_tail = Keyword.find_one({'name': name})
            if not long_tail:
                # Identity fields are set only on creation, so an existing
                # record keeps its level and parent.
                # NOTE(review): confirm this matches the intended extent of
                # the "not found" branch.
                long_tail = Keyword()
                long_tail.name = name
                long_tail.level = KeywordLevel.LONG_TAIL
                long_tail.parentId = keyword._id
            long_tail.baiduIndex = baidu_index
            long_tail.baiduResult = baidu_result
            long_tail.save()