示例#1
0
    def run_custom(self, input_path_list, output_path):
        """Chain the processing tools over the given input files.

        Stages run in this order: aggregate, convert, validate and -- unless
        ``self.convert_only`` is truthy -- tokenise, kwic (XML) and kwic
        (HTML). Each stage is started through its class-level ``run`` with a
        list of command-line style options in which the output file follows
        ``-o``; every stage after aggregation reads the previous stage's
        output file (validation reads the converted file and writes a log).

        Args:
            input_path_list: list of input paths handed to the aggregator.
            output_path: string prepended verbatim to every output file name
                (no path separator is inserted between the two).
        """
        rule = '-' * 20

        # Tool modules are imported inside their stage, so a module is only
        # loaded once the pipeline actually reaches it.

        # aggregate: all inputs -> aggregated.xml
        print(rule)
        aggregated = output_path + 'aggregated.xml'
        from aggregate import Aggregator
        Aggregator.run(input_path_list + ['-o', aggregated])

        # convert shorthands: aggregated.xml -> converted.xml
        print(rule)
        converted = output_path + 'converted.xml'
        from convert import Converter
        Converter.run([aggregated, '-o', converted])

        # validate conversion: converted.xml -> validation.log
        print(rule)
        from validate import Validator
        Validator.run([converted, '-o', output_path + 'validation.log'])

        if self.convert_only:
            return

        # tokenise: converted.xml -> tokenised.xml
        print(rule)
        tokenised = output_path + 'tokenised.xml'
        from tokenise import TEITokeniser
        TEITokeniser.run([converted, '-o', tokenised])

        # kwic.xml: tokenised.xml -> kwic.xml
        print(rule)
        kwic_xml = output_path + 'kwic.xml'
        from kwic import KWICList
        kwic_options = [tokenised, '-o', kwic_xml]
        if self.para_string:
            # Forward the optional parameter string through the '-r' switch.
            kwic_options += ['-r', self.para_string]
        KWICList.run(kwic_options)

        # kwic.html: kwic.xml -> kwic.html
        print(rule)
        kwic_html = output_path + 'kwic.html'
        from kwic_html import KwicHtml
        KwicHtml.run([kwic_xml, '-o', kwic_html])
示例#2
0
文件: check.py 项目: hedou/Spider-1
def main():
    """Validate proxies against a target and return the usable ones.

    Command-line arguments are parsed with docopt from the module docstring:
    ``<target>``, ``--timeout``, ``<thread_num>`` and ``<process_num>`` (the
    last three are converted to ``int``).

    Returns:
        The validator results whose ``"speed"`` is not greater than 8, sorted
        by ascending ``"speed"``.

    NOTE(review): ``ip_all`` is always empty here, so the validator is run on
    an empty list -- presumably the proxy-loading step is missing; confirm.
    """
    cli = docopt(__doc__, version='1.0.0')
    target = cli['<target>']
    timeout = int(cli['--timeout'])
    thread_num = int(cli['<thread_num>'])
    process_num = int(cli['<process_num>'])
    print('{} {} {} {}'.format(target, timeout, thread_num, process_num))

    # Validator takes the process count before the thread count.
    validator = Validator(target, timeout, process_num, thread_num)
    ip_all = []
    logging.info("Load proxy ip, total: %s", len(ip_all))
    checked = validator.run(ip_all)

    # Drop every entry whose speed exceeds 8.
    result = [entry for entry in checked if not entry["speed"] > 8]
    logging.info("validator run finished")
    logging.info(len(result))

    result.sort(key=lambda entry: entry["speed"])
    return result
示例#3
0
class Proxy(object):
    """Keep a MongoDB collection stocked with validated proxies.

    Proxies are fetched from an HTTP API, checked with a ``Validator`` and
    stored in the ``proxy_list`` collection of the ``proxy`` database.
    ``run`` repeats this for a fixed number of cycles.
    """

    # Validation results whose 'speed' is at or above this value are dropped.
    # NOTE(review): 'speed' looks like a response time in seconds -- confirm
    # against Validator.
    MAX_SPEED = 10

    def __init__(self,target,run_times,threadhold,cycle_time,timeout,thread_num,process_num):
        """Connect to MongoDB and set up the validator.

        Args:
            target: forwarded to ``Validator``.
            run_times: number of cycles ``run`` performs.
            threadhold: minimum number of stored proxies; ``run`` fetches new
                ones while the collection holds fewer than this.
            cycle_time: intended duration of one cycle, in seconds.
            timeout: forwarded to ``Validator``.
            thread_num: forwarded to ``Validator``.
            process_num: forwarded to ``Validator``.
        """
        # NOTE(review): the connection string (including credentials) is
        # hard-coded; consider moving it to configuration.
        self.client = pymongo.MongoClient('mongodb://*****:*****@115.28.36.253:27017/proxy')
        self.db = self.client.proxy
        self.collection = self.db.proxy_list
        # Validator takes the process count before the thread count.
        self.validate = Validator(target,timeout,process_num,thread_num)
        self.run_count = run_times
        self.threadhold = threadhold
        self.cycle_time = cycle_time #seconds

    @staticmethod
    def _parse_proxy_list(text):
        """Turn API response text into a list of ``{'ip', 'port'}`` dicts.

        Each usable line has the form ``ip:port``. Blank lines, stray ``\\r``
        characters and lines without both an ip and a port (for example a
        plain-text error message) are skipped instead of raising.
        """
        proxies = []
        for line in text.splitlines():
            parts = line.split(':')
            if len(parts) < 2:
                continue
            ip, port = parts[0].strip(), parts[1].strip()
            if ip and port:
                proxies.append({'ip': ip, 'port': port})
        return proxies

    @classmethod
    def _keep_fast(cls, results):
        """Return the entries of ``results`` whose 'speed' is below ``MAX_SPEED``."""
        return [one for one in results if one['speed'] < cls.MAX_SPEED]

    @staticmethod
    def _remaining_wait(cycle_time, elapsed):
        """Return the seconds left in a cycle, never less than zero.

        ``time.sleep`` raises ``ValueError`` for a negative duration, which
        would happen whenever a cycle runs longer than ``cycle_time``.
        """
        return max(0, cycle_time - elapsed)

    def get_proxy(self,num):
        """Request ``num`` proxies from the API.

        Returns:
            A list of ``{'ip': ..., 'port': ...}`` dicts (possibly empty).
        """
        tid = '555947027942665'
        url = 'http://tvp.daxiangdaili.com/ip/?tid={}&num={}&category=2&delay=5&protocol=https&ports=80,8080,3128'.format(tid, num)
        response = requests.get(url)
        ip_all = self._parse_proxy_list(response.text)
        if not ip_all:
            print('NO PROXY AVAILABLE')
        else:
            print('update proxy')
        return ip_all

    def insert(self,result):
        """Store every entry of ``result`` in the collection and print it."""
        for i in result:
            self.collection.insert(i)
            print(i)

    def get_length(self):
        """Return the number of documents currently in the collection."""
        return sum(1 for _ in self.collection.find())

    def check_db(self):
        """Re-validate all stored proxies, keeping only the fast ones.

        Every stored document is removed first; only those that pass
        re-validation are written back.
        """
        # Read the whole collection before deleting, so documents are not
        # removed from under the cursor that is still iterating them.
        stored = list(self.collection.find())
        for one in stored:
            self.collection.remove(one)
        survivors = self._keep_fast(self.validate.run(stored))
        if survivors:
            print('check db:')
            self.insert(survivors)

    def run(self,num):
        """Run the configured number of refresh cycles.

        On each cycle: re-check the stored proxies when the remaining cycle
        count is even, then, while the collection is below ``threadhold``,
        fetch ``num`` proxies at a time until at least one validates. Unless
        this was the last cycle, sleep for the rest of ``cycle_time``.
        """
        while self.run_count:
            if self.run_count%2==0:
                self.check_db()
            print('Run times count {}'.format(self.run_count))
            self.run_count -= 1
            start = time.time()
            while self.get_length() < self.threadhold:
                fresh = self._keep_fast(self.validate.run(self.get_proxy(num)))
                if fresh:
                    # One successful batch per cycle is enough.
                    self.insert(fresh)
                    break
                print('no valid')
            print('proxies num in db {}'.format(self.get_length()))
            if self.run_count:
                print('Already satisfied the setting,sleep....')
                # Clamp at zero: the cycle may already have overrun.
                time.sleep(self._remaining_wait(self.cycle_time, time.time() - start))
            else:
                print('Run time finished, quit.')