def run_custom(self, input_path_list, output_path):
    """Run the processing chain, each stage feeding the next one.

    Stages, in order: aggregate -> convert -> validate, and, unless
    ``self.convert_only`` is truthy, tokenise -> kwic.xml -> kwic.html.
    Every stage is started by calling ``run`` on its class with a
    command-line style argument list (``[input..., '-o', output]``).

    The stage modules are imported lazily, right before they are used, so
    a convert-only run never imports the tokeniser or the KWIC modules.

    Args:
        input_path_list: list of input paths handed to the aggregator.
        output_path: string prefix for every generated file; file names are
            appended by plain concatenation, so a directory must include its
            trailing separator.
    """
    separator = '-' * 20

    # Stage 1: aggregate all inputs into a single file.
    print(separator)
    aggregated_xml = output_path + 'aggregated.xml'
    from aggregate import Aggregator
    Aggregator.run(input_path_list + ['-o', aggregated_xml])

    # Stage 2: convert shorthands.
    print(separator)
    converted_xml = output_path + 'converted.xml'
    from convert import Converter
    Converter.run([converted_xml and aggregated_xml, '-o', converted_xml])

    # Stage 3: validate the conversion; only a log file is produced, so the
    # converted file stays the input of the next stage.
    print(separator)
    from validate import Validator
    Validator.run([converted_xml, '-o', output_path + 'validation.log'])

    if self.convert_only:
        return

    # Stage 4: tokenise the converted file.
    print(separator)
    tokenised_xml = output_path + 'tokenised.xml'
    from tokenise import TEITokeniser
    TEITokeniser.run([converted_xml, '-o', tokenised_xml])

    # Stage 5: kwic.xml, optionally parameterised through '-r'.
    print(separator)
    kwic_xml = output_path + 'kwic.xml'
    from kwic import KWICList
    kwic_args = [tokenised_xml, '-o', kwic_xml]
    if self.para_string:
        kwic_args.extend(['-r', self.para_string])
    KWICList.run(kwic_args)

    # Stage 6: kwic.html, rendered from kwic.xml.
    print(separator)
    kwic_html = output_path + 'kwic.html'
    from kwic_html import KwicHtml
    KwicHtml.run([kwic_xml, '-o', kwic_html])
def main():
    """Parse the command line, run the validator and return usable proxies.

    Reads ``<target>``, ``--timeout``, ``<thread_num>`` and ``<process_num>``
    from the docopt arguments built from the module docstring, echoes them,
    and runs a ``Validator`` over the proxy list (which is empty here).

    Returns:
        The validator results whose ``"speed"`` is not greater than 8,
        sorted by ascending ``"speed"``.
    """
    cli_args = docopt(__doc__, version='1.0.0')
    target = cli_args['<target>']
    timeout = int(cli_args['--timeout'])
    thread_num = int(cli_args['<thread_num>'])
    process_num = int(cli_args['<process_num>'])
    print('{} {} {} {}'.format(target, timeout, thread_num, process_num))

    # Note the argument order: process count comes before thread count.
    checker = Validator(target, timeout, process_num, thread_num)

    proxies = []
    logging.info("Load proxy ip, total: %s", len(proxies))
    checked = checker.run(proxies)

    # Keep every entry that is not slower than the cut-off.
    usable = [entry for entry in checked if not entry["speed"] > 8]

    logging.info("validator run finished")
    logging.info(len(usable))

    usable.sort(key=lambda entry: entry["speed"])
    return usable
class Proxy(object):
    """Keep a MongoDB collection stocked with validated proxies.

    Each cycle of :meth:`run` optionally re-validates what is stored, then
    fetches and validates fresh proxies while the collection holds fewer
    than ``threadhold`` entries, and finally sleeps for the remainder of
    ``cycle_time``.

    Args:
        target: passed through to ``Validator``.
        run_times: number of cycles :meth:`run` performs before returning.
        threadhold: refilling happens while the collection holds fewer
            entries than this.
        cycle_time: intended duration of one cycle, in seconds.
        timeout: passed through to ``Validator``.
        thread_num: passed through to ``Validator``.
        process_num: passed through to ``Validator``.
    """

    # Validation results whose 'speed' is not below this value are dropped.
    # NOTE(review): the unit of 'speed' is defined by Validator - confirm.
    MAX_SPEED = 10

    def __init__(self, target, run_times, threadhold, cycle_time, timeout,
                 thread_num, process_num):
        # NOTE(review): connection string (host and credentials) is
        # hard-coded; consider moving it to configuration.
        self.client = pymongo.MongoClient('mongodb://*****:*****@115.28.36.253:27017/proxy')
        self.db = self.client.proxy
        self.collection = self.db.proxy_list
        # Validator takes the process count before the thread count.
        self.validate = Validator(target, timeout, process_num, thread_num)
        self.run_count = run_times
        self.threadhold = threadhold
        self.cycle_time = cycle_time  # seconds

    @staticmethod
    def _parse_proxy_lines(text):
        """Turn a newline-separated ``host:port`` listing into proxy dicts.

        Lines that do not contain a colon, or whose host or port is empty
        after stripping whitespace, are skipped instead of raising, so a
        stray carriage return or an error message from the provider cannot
        abort the run.

        Returns:
            A list of ``{'ip': ..., 'port': ...}`` dicts (values are str).
        """
        proxies = []
        for line in text.split('\n'):
            parts = line.split(':')
            if len(parts) < 2:
                continue
            host, port = parts[0].strip(), parts[1].strip()
            if host and port:
                proxies.append({'ip': host, 'port': port})
        return proxies

    def _keep_fast(self, results):
        """Return the validation results whose 'speed' is below MAX_SPEED."""
        return [one for one in results if one['speed'] < self.MAX_SPEED]

    def get_proxy(self, num):
        """Request ``num`` proxies from the provider and parse the reply.

        Returns:
            A list of ``{'ip': ..., 'port': ...}`` dicts; empty when the
            reply contains no usable line.
        """
        tid = '555947027942665'
        url = 'http://tvp.daxiangdaili.com/ip/?tid={}&num={}&category=2&delay=5&protocol=https&ports=80,8080,3128'.format(tid, num)
        response = requests.get(url)
        ip_all = self._parse_proxy_lines(response.text)
        if not ip_all:
            print('NO PROXY AVAILABLE')
        else:
            print('update proxy')
        return ip_all

    def insert(self, result):
        """Store every entry of ``result`` in the collection and echo it."""
        for i in result:
            self.collection.insert(i)
            print(i)

    def get_length(self):
        """Return the number of documents currently in the collection."""
        return sum(1 for _ in self.collection.find())

    def check_db(self):
        """Re-validate all stored proxies and keep only the fast ones.

        Every document is removed from the collection, the whole batch is
        validated again, and the entries that pass the speed filter are
        inserted back.
        """
        # Read the cursor to the end before deleting, so documents are not
        # removed from under a cursor that is still being iterated.
        stored = list(self.collection.find())
        for one in stored:
            self.collection.remove(one)
        survivors = self._keep_fast(self.validate.run(stored))
        if survivors:
            print('check db:')
            self.insert(survivors)

    def run(self, num):
        """Run the maintenance cycles until ``run_count`` reaches zero.

        Args:
            num: number of proxies requested from the provider per fetch.
        """
        while self.run_count:
            # Stored proxies are re-validated on even counter values only.
            if self.run_count % 2 == 0:
                self.check_db()
            print('Run times count {}'.format(self.run_count))
            self.run_count -= 1
            start = time.time()
            while self.get_length() < self.threadhold:
                fresh = self._keep_fast(self.validate.run(self.get_proxy(num)))
                if fresh:
                    # One successful batch ends refilling for this cycle,
                    # even if the collection is still below the threshold.
                    self.insert(fresh)
                    break
                else:
                    print('no valid')
            print('proxies num in db {}'.format(self.get_length()))
            if self.run_count:
                print('Already satisfied the setting,sleep....')
                # A cycle that overran cycle_time would yield a negative
                # delay, which time.sleep rejects with ValueError.
                time.sleep(max(0.0, self.cycle_time - (time.time() - start)))
            else:
                print('Run time finished, quit.')