示例#1
0
def get_specific_element(element, text):
    """
    extract one piece of information from an html page

    The extraction rule for ``element`` is looked up in
    ./__configure__/configure.xml; the lookup yields a pair
    (choice, pattern) that is handed to ``get_res`` together with the page.

    :param element: tag name of the rule to look up in the configure file
    :param text: the html source to extract from
    :return: whatever ``get_res`` returns for (choice, pattern, text)
    """
    parser = ConfigureParser('./__configure__/configure.xml')
    rule = parser.get_configure_by_tag_name(element)
    # the configured rule must unpack into exactly two items
    choice, pattern = rule
    return get_res(choice, pattern, text)
示例#2
0
def jd_search(keys):
    """
    crawl a number for every key and append it to the output file

    For each key the page returned by ``get_html(handle_type(key))`` is
    parsed with the xpath taken from the configure file; the first match
    is converted by ``num_trans``. If parsing or conversion fails the
    number falls back to '0'. One tab-separated line "key, number" is
    appended to the output file per key, so finished keys are already on
    disk if the run is interrupted.

    :param keys: iterable of keys (strings) to look up
    :return: None
    """
    conf = ConfigureParser('./__configure__/configure.xml')
    output_file = conf.get_configure_by_tag_name('output_file')
    xpath = conf.get_configure_by_tag_name('xpath')
    for key in keys:
        # computed once and reused for both the request and the log line
        target = handle_type(key)
        source = get_html(target)
        try:
            sel = Selector(text=source).xpath(xpath)
            num = str(num_trans(sel.extract()[0]))
        except Exception:
            # best effort: a missing or unparsable result is recorded as '0'.
            # ``Exception`` (not a bare except) keeps Ctrl-C and SystemExit
            # able to stop the crawl.
            num = '0'
        print('%s %s' % (target, num))
        # reopen in append mode for every key so progress is flushed to disk
        with open(output_file, 'ab') as f:
            f.write(key + '\t' + num + '\n')
示例#3
0
def get_keys():
    """
    collect the keys that still have to be processed

    Every stripped line of the configured input file is a key. When the
    configured output file already exists, each of its lines is read as
    "key<TAB>value" (split on the last tab) and the keys found there are
    removed from the result, so an interrupted run can be resumed.

    :return: list of keys from the input file, in file order, without the
             keys already present in the output file
    """
    conf = ConfigureParser('./__configure__/configure.xml')
    input_file = conf.get_configure_by_tag_name('input_file')
    output_file = conf.get_configure_by_tag_name('output_file')
    # the input file is needed in both cases, so read it exactly once
    with open(input_file, 'rb') as f:
        keys = [each.strip() for each in f]
    if not os.path.exists(output_file):
        return keys
    with open(output_file, 'rb') as f:
        # a set gives constant-time membership tests for the filter below
        done = set(each.strip().rsplit('\t', 1)[0] for each in f)
    return [each for each in keys if each not in done]
示例#4
0
文件: main.py 项目: Lion-Yang/spider
import time
import random
from __spider__.doubanJudge import DoubanJudge
from __configure__.ConfigureParser import ConfigureParser

if __name__ == '__main__':
    # read the input/output locations from the configure file
    conf = ConfigureParser('./__configure__/configure.xml')
    input_file = conf.get_configure_by_tag_name_simple('input_file')
    output_file = conf.get_configure_by_tag_name_simple('output_file')
    with open(input_file, 'r') as source:
        for line in source:
            # the key is everything before the last tab of the line
            head = line.rsplit('\t', 1)[0]
            keyword = head.strip()
            DoubanJudge(unicode(keyword), output_file).handle()
            # wait a random 3 to 5 seconds before handling the next key
            pause = random.randint(3, 5)
            time.sleep(pause)
示例#5
0
#!/usr/bin/python
# -*-coding:utf-8-*-
import urllib2
import json
import requests
import datetime
import re
import sys
from __util__.library import handle_data, mail_notification, TransException, my_retry
from __configure__.ConfigureParser import ConfigureParser
# Python 2 only: the interpreter removes sys.setdefaultencoding at startup;
# reloading sys brings it back so the process-wide default encoding used for
# implicit str/unicode conversions can be switched to utf8.
reload(sys)
sys.setdefaultencoding('utf8')

# Module-level configuration, loaded once at import time.
Configure = ConfigureParser('./__configure__/configure.xml')
# value of the 'admin' tag in the configure file
# NOTE(review): presumably the recipient for mail_notification - confirm
admin = Configure.get_configure_by_tag_name('admin')


def get_now_time():
    """
    return the date three days before the current local time

    The look-back exists because, per the original author's note, the
    QQ music rank list may be updated late.

    :return: a zero-padded date string in %Y-%m-%d form, such as: 2016-08-20
    """
    look_back = datetime.timedelta(days=3)
    reference = datetime.datetime.now() - look_back
    return reference.strftime('%Y-%m-%d')


def get_current_nums():
    """
    get the current number of the rank list
    :return: a string of year and the number such as: 2016_37, presenting the 37th rank list of 2016
示例#6
0
# -*-coding:utf-8-*-
import datetime
import time
from __util__.library import create_post_urls, mail_notification
from __configure__.ConfigureParser import ConfigureParser
from __spider__.spidermovie import handle_movie
from __spider__.spidermusic import handle_music
from __spider__.spiderseries import handle_series
from __spider__.netEaseMusic import handle_netease_music

# Module-level configuration, loaded once at import time.
Configure = ConfigureParser('./__configure__/configure.xml')
# Interval between two runs, read from the 'update_duration' tag of the
# configure file and passed to time.sleep() by the manage decorator.
# NOTE(review): eval() executes whatever expression the configure file
# contains; keep that file trusted, or replace eval with a restricted
# arithmetic parser if the value can come from an untrusted source.
interval = int(eval(Configure.get_configure_by_tag_name('update_duration')))


def manage(func):
    """
    a decorator that runs a function forever at a fixed interval

    The returned wrapper prints a start message, calls ``func`` with the
    arguments it was given, prints an end message and then sleeps for the
    module-level ``interval`` seconds before repeating. The wrapper never
    returns normally; the loop only ends when ``func`` (or the sleep)
    raises, and that exception propagates to the caller. The return value
    of ``func`` is discarded.

    :param func: the function being executed
    :return: the looping wrapper, carrying the name and docstring of func
    """
    import functools

    # functools.wraps keeps func's __name__/__doc__ on the wrapper so
    # decorated jobs stay identifiable in logs and tracebacks
    @functools.wraps(func)
    def wrapper(*arg, **args):
        while True:
            print("Start executing at {}".format(datetime.datetime.now()))
            func(*arg, **args)
            print("End executing at {}".format(datetime.datetime.now()))
            time.sleep(interval)
    return wrapper


@manage