Example #1
def crawling():
    PROJECT_NAME = 'intranet'
    HOMEPAGE = 'http://intranet.iitg.ernet.in/'
    DOMAIN_NAME = getDomainName(HOMEPAGE)
    QUEUE_FILE = PROJECT_NAME + '/queue.txt'
    CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
    NOTLINKS_FILE = PROJECT_NAME + '/notLinks.txt'
    CANNOTOPEN_FILE = PROJECT_NAME + '/cannotOpen.txt'

    NUMBER_OF_THREADS = 8

    queue = Queue()
    spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)

    # Create worker threads (die when main exits)
    def createWorkers():
        for _ in range(NUMBER_OF_THREADS):
            t = threading.Thread(target=work)
            t.daemon = True
            t.start()

    # Do the next job in the queue
    def work():
        while True:
            url = queue.get()
            spider.crawlPage(threading.current_thread().name, url)
            queue.task_done()

    # Each queued link is a new job
    def createJobs():
        for link in fileToSet(QUEUE_FILE):
            queue.put(link)
        queue.join()
        crawl()


    # Check whether there are links left in the queue; if so, crawl them
    def crawl():
        queuedLinks = fileToSet(QUEUE_FILE)
        if len(queuedLinks) > 0:
            print(str(len(queuedLinks)) + ' links in queue')
            createJobs()

    createWorkers()
    crawl()
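
Example #1 references several names that are never defined in the snippet. Judging from the matching snake_case crawler in Example #54, the surrounding imports would look roughly like the sketch below; the camelCase helpers getDomainName and fileToSet are assumed to be this project's counterparts of get_domain_name and file_to_set.

# Sketch of the imports Example #1 appears to rely on (module names taken from Example #54;
# the camelCase helper names are assumptions based on how the snippet uses them).
import threading
from queue import Queue

from spider import spider          # spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME), spider.crawlPage(...)
from domain import getDomainName   # assumed counterpart of get_domain_name
from general import fileToSet      # assumed counterpart of file_to_set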
Example #2
 def test_marks_episode_as_do_not_download(self):
     mockDAL = Mock(spec=DAL.DAL)
     mockDAL.get_eps_for_show = MagicMock(return_value = {1: [1,2,3,4,5,6,7,8,9,10,11,12]})
     spiderbro = spider.spider(mockDAL)
     spiderbro.config.force_learn = True
     spiderbro.find_torrents_for_show("Constantine")
     mockDAL.mark_episode_do_not_download.assert_called_once_with("Constantine", 1, 13, unittest.mock.ANY, unittest.mock.ANY)
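
Examples #2, #41, #42, #44 through #46, #50, #52 and #55 read like methods of the same unittest suite. A minimal sketch of the scaffolding they assume follows; the test-class name is made up for illustration, while DAL and spider are the project's own modules (see Example #56).

# Sketch of the test-module scaffolding the SpiderBro test snippets assume.
# SpiderBroTests is a hypothetical name; the test body is copied from Example #52.
import unittest
import unittest.mock
from unittest.mock import Mock, MagicMock

import DAL
import spider


class SpiderBroTests(unittest.TestCase):
    def test_can_get_missing_episodes(self):
        mockDAL = Mock(spec=DAL.DAL)
        mockDAL.get_eps_for_show = MagicMock(return_value={1: [1, 2]})
        spiderbro = spider.spider(mockDAL)
        missing_episodes = spiderbro.get_missing_episodes("Life On Mars")
        self.assertDictEqual(missing_episodes, {1: [3, 4, 5, 6, 7, 8], 2: [-1]})


if __name__ == '__main__':
    unittest.main()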
Example #3
    def __init__(self, proxies={'http': 'http://127.0.0.1:8080',
        'https': 'http://127.0.0.1:8080'}):
        """
        Creates an instance of the ZAP api client.

        :Parameters:
           - `proxies`: dictionary of ZAP proxies to use.
           
        Note that all of the other classes in this directory are generated;
        new ones will need to be manually added to this file.
        """
        self.__proxies = proxies
        
        self.acsrf = acsrf(self)
        self.ajaxSpider = ajaxSpider(self)
        self.ascan = ascan(self)
        self.authentication = authentication(self)
        self.autoupdate = autoupdate(self)
        self.brk = brk(self)
        self.context = context(self)
        self.core = core(self)
        self.forcedUser = forcedUser(self)
        self.httpsessions = httpSessions(self)
        self.importLogFiles = importLogFiles(self)
        self.params = params(self)
        self.pnh = pnh(self)
        self.pscan = pscan(self)
        self.script = script(self)
        self.search = search(self)
        self.selenium = selenium(self)
        self.sessionManagement = sessionManagement(self)
        self.spider = spider(self)
        self.users = users(self)
Example #4
    def __init__(self, proxies={'http': 'http://127.0.0.1:8080',
        'https': 'http://127.0.0.1:8080'}):
        """
        Creates an instance of the ZAP api client.

        :Parameters:
           - `proxies`: dictionary of ZAP proxies to use.
           
        Note that all of the other classes in this directory are generated;
        new ones will need to be manually added to this file.
        """
        self.__proxies = proxies
        
        self.acsrf = acsrf(self)
        self.ajaxSpider = ajaxSpider(self)
        self.ascan = ascan(self)
        self.authentication = authentication(self)
        self.autoupdate = autoupdate(self)
        self.brk = brk(self)
        self.context = context(self)
        self.core = core(self)
        self.forcedUser = forcedUser(self)
        self.httpsessions = httpSessions(self)
        self.importLogFiles = importLogFiles(self)
        self.params = params(self)
        self.pnh = pnh(self)
        self.pscan = pscan(self)
        self.script = script(self)
        self.search = search(self)
        self.selenium = selenium(self)
        self.sessionManagement = sessionManagement(self)
        self.spider = spider(self)
        self.users = users(self)
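
Examples #3, #4 and #37 use a mutable dict as the default value for proxies; the later revisions of the same client in Examples #32 and #33 avoid that with the usual None default, roughly:

# Sketch of the safer default-argument idiom used in Examples #32/#33.
def __init__(self, proxies=None, apikey=None):
    self.__proxies = proxies or {
        'http': 'http://127.0.0.1:8080',
        'https': 'http://127.0.0.1:8080'
    }
    self.__apikey = apikey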
Example #5
 def __init__(self, num):
     self.num = num
     self.sp = spider.spider()
     if num == 4 or num == 5:
         self.sp.code = 'gb2312'
     self.html = ""
     self.engineName = r'.*?java.*?工程师.*?'
Example #6
def gather(q,urls):
	"""
	Parses each url in list urls and puts the item in queue q.
	"""
	for url in urls:
		miner = spider.spider()
		q.put(miner.parse(url),True)
Example #7
def get_match_ids():
    Requst = spider()
    res_match_ids = []
    next_id = '5584623680'
    last_id = next_id
    while True:
        url, params = gen_match_id_by_opendota(next_id)
        try:
            JsonMatches = Requst.GET(url, params)
            #print(JsonMatches)
            ids = get_all_match_ids(JsonMatches)
            add = 0
            for i in ids:
                if i not in res_match_ids:
                    res_match_ids.append(i)
                    add += 1
            print('get ids size = {}, add {} to res match ids, now size={}, now id ={}'.format(len(ids), add, len(res_match_ids), next_id))
            next_id = get_next_id(ids)
            if next_id is None:
                next_id = int(last_id) - 100
            last_id = next_id 
            
            if len(res_match_ids) > 50000:
                saveData('data/match_ids.data', res_match_ids)
        except Exception as exc:
            print('error:', exc)
Example #8
File: main.py  Project: THMAIL/kindle
def main():
    print('现在是:' + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # Crawl
    print '开始爬取'
    web = spider()
    web.start()
    print '爬取完成'
    # Convert to PDF
    print '开始转换'
    for title in web.titleSet:
        dirtopdf(title)
        dir_merge_pdf(title)
    print '转换完成'
    # Send to Kindle
    print '开始发送'
    files = os.listdir("pdf")
    for file in files:
        sendEMAIL(file)
    print '发送完成'

    print '开始清理'
    for title in web.titleSet:
        shutil.rmtree(title)
    for file in os.listdir("pdf"):
        os.remove("pdf/" + file)
    print '清理完成'

    print 'ALL DONE!'
Example #9
def test():

    # url="http://www.mm131.com/chemo/"
    # sp=spider()
    # sp.getWebList(url)

    sp = spider()
    sp.getcode('http://www.mm131.com/xiaohua/12.html')
    sp.getimgdict("http://www.mm131.com/qingchun/1.html")
Example #10
 def __init__(self, num):
     threading.Thread.__init__(self)
     self.setDaemon(True)
     self.sp = spider()
     self.nulcount = 1
     #self.headurl=source.urllist[num][1]
     self.headurl = ""
     self.stop = False
     self.num = num
Example #11
def main():
    sched = BlockingScheduler()
    sched.add_job(spider.spider, 'interval', seconds=21600)  # pass the callable itself, not the result of calling it
    print 'Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C')

    try:
        sched.start()
    except (KeyboardInterrupt, SystemExit):
        pass
Example #12
 def run(self):
     global timer_interval
     htmlMaker = HtmlMaker(self.REDIS_IP, self.REDIS_PORT, self.OUTPUT_PATH)
     sp = spider(self.LOG_ADDRESS,self.LOG_FORMAT,self.LOG_LEVEL,self.REDIS_IP,self.REDIS_PORT,
                 self.REDIS_FREQUENCE,self.SPIDER_KEYS)
     while self.run_status:
         sp.capture()
         htmlMaker.makeHtml()
         self.sleep(timer_interval)
Example #13
File: app.py  Project: JulyMood/Robot
def spider_task():
    _REDIS.setnx(_PAGE_INDEX, 1)
    page_index = _REDIS.get(_PAGE_INDEX)
    task = spider.spider()
    while task.start(page_index):
        page_index = _REDIS.incr(_PAGE_INDEX)
        logging.info('Crawl the index page %s to finish' % page_index)
    while task.start():
        logging.info('Crawl the first page to finish, sleep one day')
        time.sleep(60 * 60 * 24)
Example #14
def rumMain(now):
    ''' Main program '''
    t_bg = time.perf_counter()
    logger.info('[WeiBo System] start to run...')
    message_file_path = os.path.join(path_message, f'messeage_{now}.txt')

    spider(urls=[
        f'https://weibo.com/{ii}?is_all=1&stat_date={months}#feedtop'
        for ii in targets
    ],
           message_file_path=message_file_path)

    try:
        send_email(f'Weibo{now}', message_file_path)
        logger.info('[Email] send e-mail successfully...')
    except:
        logger.warning(f'[Email] failed to send {message_file_path}!!!')
    t_ed = time.perf_counter()
    logger.info(f'[WeiBo System] end to run, escape {t_ed - t_bg:.2f} secs...')
Example #15
 def test_login(self, stock):
     code = stock[0]
     market = stock[1]
     global path
     path = './data/stock/%s/xueqiu/raw/'%(code)
     url = 'http://xueqiu.com/stock/industry/stockList.json?type=1&code=%s%s&size=8&_=1433829008414'%(market,code)
     sp = spider() 
     print url
     stock_html = sp.getsource_xueqiu(url, 'utf-8')
     print stock_html
Example #16
	def preprocess(self, preproc_params=None):
		''' Pre-train the data with the provided preproc_params '''
		if preproc_params is None:
			preproc_params = self.preproc_params
			skip_flag = True
		elif self.preproc_params is None:
			self.preproc_params = preproc_params
			skip_flag = False
		else:
			skip_flag = True
		
		p = preproc_params  # alias
		# discrete is no-op only for in-core; otherwise, skip_flag if same params
		discrete = p['discrete']
		skip_flag &= p['i_reduced'] == self.preproc_params['i_reduced']  # assuming the new params' i_reduced is meant here
		skip_flag &= discrete == self.preproc_params['discrete']
		if not self.outofcore:
			self.Xd = self.data[0]
			self.Xc = self.data[1]
			self.y = self.data[2]
			self.w = self.data[3]
		elif not skip_flag:
			i_reduced = self.i_reduced
			with open("higgs/working/reduced_train.dat", 'rb') as f:
				for i in range(4*i_reduced):
					temp = pickle.load(f)
				self.Xd = pickle.load(f)
				self.Xc = pickle.load(f)
				self.y = pickle.load(f)
				self.w = pickle.load(f)
				temp = None  # prevent pickling memory leak
		if discrete:
			self.X = self.Xd
		else:
			self.X = self.Xc
		if not skip_flag:
			self.reload_cv()
		
		# skip_flag spider if same params
		skip_flag &= p["spider"] == self.preproc_params["spider"]
		if not skip_flag:
			spider_params = p["spider"]
			spider_params['metric'] = 'wminkowski'
			spider_params['w'] = np.max(self.X, axis=0)
			self.X, self.y, self.w = spider(self.X, self.y, self.w, **spider_params)
		
		# skip_flag feature_selection if same params
		skip_flag &= p["feature_selection"] == self.preproc_params["feature_selection"]
		if not skip_flag:
			fs_alg = p["feature_selection"]["algorithm"]
			fs_params = p["feature_selection"]["params"]
			fs = FeatureSelector(algorithm=fs_alg)
			fs.fit(self.Xd, self.y, **fs_params)
			self.X = fs.transform(self.X)
		
		self.preproc_params = p
Example #17
def test_topit(page_num=20, thread_num=10, limit=None,
               img_store_path="./pics/"):

    global sqlite
    global url_pool
    global g_workers
    global img_download_counter

    img_download_counter = 0

    URL_PREFIX = "http://www.topit.me/"

    def construct_root_url(num=2):
        url_prefix = URL_PREFIX
        url = lambda n: url_prefix + "?p=" + str(n)
        start_pages = map(url, range(num))
        start_pages = set(start_pages)
        return start_pages 

    DB_PATH = "./topit.db"
    sqlite = sqlite3.connect(DB_PATH)
    cur = sqlite.cursor()
    cur.execute("SELECT url FROM urls")
    urls = cur.fetchall()
    for row in urls:  # map() is lazy in Python 3, so iterate explicitly
        url_pool.add(row)
    del urls

    urls = construct_root_url(num=page_num)
    urls = list(urls)

    tasks = generate_init_tasks(urls=urls, img_store_path=img_store_path, )

    g_workers = []

    store_queue = queue.Queue()
    img_download_queue = queue.Queue()
    parse_queue = queue.Queue()
    download_queue = queue.Queue()

    spider_instance = spider(urls=urls,
                             store_queue=store_queue,
                             img_download_queue=img_download_queue,
                             download_queue=download_queue,
                             parse_queue=parse_queue,
                             limits=limit
                             )

    spider_instance._setup(tasks, download_queue)
    for i in range(thread_num):
        w = gevent.spawn(spider_instance.run,)
        w.working = None
        w.page = None
        g_workers.append(w)
    gevent.joinall(g_workers)
Example #18
 def application():
     from skywalking.trace.context import get_context
     get_context().put_correlation("correlation", "correlation")
     # @runnable(op="/test")
     # def post():
     #     requests.post("http://127.0.0.1:9092/users")
     #
     # from threading import Thread
     # t = Thread(target=post)
     # t.start()
     #
     # res = requests.post("http://127.0.0.1:9092/users")
     #
     # t.join()
     mysqldb = MysqlTaskConfig().get_instance()
     spider(69, 70, mysqldb, "xiaohongshu")
     from kafka import KafkaProducer
     producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'],
                              api_version=(1, 0, 1))
     producer.send('skywalking', b'xiaohongshu')
     return jsonify({"status": "okokokok"})
Example #19
def run(email, password):
    spider.set_repo('repo_file')
    bot = spider.spider('my_friends', email, password)
    bot.log.setLevel(20)
    rid, login_info = bot.login()
    if rid is None:
        print('spider login error. detail:{}'.format(login_info))
        return
    else:
        print('spider login success. rid={}'.format(rid))
    spider.spider.getNet2(bot, rid)
    return rid
Example #20
def run(email, password):
	spider.set_repo('repo_file')
	bot = spider.spider('my_friends', email, password)
	bot.log.setLevel(20)
	rid, login_info = bot.login()
	if rid is None:
		print('spider login error. detail:{}'.format(login_info))
		return
	else:
		print('spider login success. rid={}'.format(rid))
	spider.spider.getNet2(bot, rid)
	return rid
Example #21
def MuluPaQu():
    print "==================================="
    print MuluPaQuInf.url
    try:
        print MuluPaQuInf.url
        global count
        global Optimes  # must be declared before Optimes is read further down
        count = 0
        spider.spider_url.append(MuluPaQuInf.url)
        spider.dir_url.append(MuluPaQuInf.url.split('?')[0])
        path = os.path.dirname(os.path.realpath(__file__)) + "\\MLPQ_scan.txt"
        if os.path.exists(path):
            os.remove(path)
        print spider.spider_url
        flag = 0
        while len(spider.spider_url) > 0:
            try:
                spider(spider.spider_url.pop())
                flag = 1
            except:
                pass
        if flag == 1:
            result4 = list()
            data4 = file('./file/result/MLPQ_scan.txt')
            for line in data4:
                result4.append({"id": line[5:-1]})

            final = list()
            final.append({"content": result4, "ways": "MuluPaQu"})

            str1 = 'file\history\op' + str(Optimes) + '.txt'
            with open(str1, 'a+') as f:
                f.write(str(final).replace("\'", "\""))

            Optimes = Optimes + 1
            f = open('./file/optimes.txt', 'wb')
            f.write(str(Optimes))
            return jsonify(result4)
    except:
        print "--------------------Error----------------------"
Example #22
def get_data(locals):
    location = {
        "杭州": '080200',
        "上海": '020000',
        "北京": '010000',
        "广州": '030200',
        "深圳": '040000',
        '武汉': '180200',
        '宁波': '080300',
        "苏州": '070300',
        '南京': '070200',
        '长沙': '190200',
        '成都': '090200',
        '重庆': '060000',
        '昆明': '250200',
        '西安': '200200',
        '哈尔滨': '220200',
        '大连': '230300',
        '长春': '240200'
    }
    for local in locals:
        local_code = location[local]
        if not os.path.exists('data'):
            os.mkdir('data')
        file = 'data\\{}.csv'.format(local)
        with open(file, 'w') as f:
            f.close()

        for page in range(1, 2001):
            url = 'https://search.51job.com/list/{}' \
                  ',000000,0000,00,9,99,+,2,{}.html' \
                  '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&' \
                  'jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(local_code,page)
            with open('data\\t.text', 'w') as f:
                f.writelines(url)
                f.close()

            spider(url, file)
            print("保存成功!", end=" ")
            print("location:{},page={}".format(local, page))
Example #23
def main():

    if len(sys.argv) < 2:
        print "Specify URL"
        sys.exit(1)

    try:
        the_spider = spider(0, 10, True)
        the_spider.work(sys.argv[1])
    except:
        raise
Example #24
def orderSort(currentLink, workName, tag4link, class4link, tag4nextPage,
              class4nextPage, volume):
    link = currentLink
    while True:
        crawler = spider(link, workName)
        furtherLink = crawler.getLinks(tag4link, class4link)
        if furtherLink is not None:
            for i in furtherLink:
                fileGenerator(workName, i, volume)
        link = crawler.nextPage(tag4nextPage, class4nextPage)
        if link is None:
            break
        time.sleep(1)
Example #25
 def __init__(self):
     Frame.__init__(self)
     self.spider = spider.spider()
     self.text = Text()
     self.alertButton1 = Button(self,
                                text='单进程爬取网页',
                                command=lambda: thread_it(self.pachong))
     self.alertButton2 = Button(self,
                                text='多进程爬取网站',
                                command=lambda: thread_it(self.morepachong))
     self.nameInput = Entry(self, width=50)
     self.listbox = Listbox(self)
     self.quitButton = Button(self, text='Quit', command=self.quit)
Example #26
def main():
    init()

    spiders = []
    threadid = 2
    exitFlag = 0

    proxiesPool.update()  # fetch proxy IPs first

    proxyThread = spider.spider(  # start the proxy-pool thread
        updateProxiesPool, 1, exitFlag=exitFlag, pool=proxiesPool)
    proxyThread.start()
    spiders.append(proxyThread)

    for i in range(0, 30):  # start the user-crawling threads
        sp = spider.spider(crawlUppers,
                           threadid,
                           carry=upperInputTask,
                           output=upperOutputTask,
                           cfg=cfg,
                           pool=proxiesPool)
        sp.start()
        spiders.append(sp)
        threadid = threadid + 1

    # Main thread: insert results into the database
    while not upperInputTask.empty() or not upperOutputTask.empty():
        if not upperOutputTask.empty():
            upper = upperOutputTask.get()
            db.execute(
                sqlInsertUpperInfo,
                (upper.mid, upper.follower, upper.uname, upper.face,
                 time.strftime('%Y-%m-%d %H:%M:%S',
                               time.localtime(upper.regtime)), upper.sex,
                 upper.level, upper.vipType, upper.vipStatus))
    exitFlag = 1

    for i in spiders:
        i.join()
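
Note that Example #26 signals shutdown by rebinding the local exitFlag to 1 after its original value was passed into the threads; because the integer went in by value, the workers never observe the change. A common alternative is a shared threading.Event, sketched below under the assumption that the worker loop would be changed to poll it.

# Sketch: a shared Event object is visible to every worker, unlike a rebound local int.
import threading
import time

stop_event = threading.Event()

def worker():
    while not stop_event.is_set():
        time.sleep(0.1)  # crawl one unit of work here

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()

stop_event.set()   # every thread sees the same Event, so they all exit their loops
for t in threads:
    t.join()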
Example #27
def contenter(BigworkName, tag4pat, class4pat, tag4doc, class4doc, volume):
    folder = BigworkName + '_linker'
    workName = BigworkName + '_content'
    position = [0,0]
    logs = None

    try:
        file = open(workName+'.log', 'r')
        logs = file.readlines()
        file.close()
        logs = logs[-1]
        if '-------Programme Finished!-------' in logs:
            logs = 'end'
        else:
            logs = logs.split(' ')       # current finished position: file # line #
            logs = [int(logs[4]) - 1,int(logs[6]) - 1]
    except:
        pass

    if logs is None:
        logger(workName, "-------Start Programme!-------")
    elif logs == 'end':
        print("The program is finished!")
        return None
    else:
        logger(workName, "-------Restart Programme!-------")
        position = logs

    while True:
        link = filePicker(BigworkName, folder, position)
        if link is None:
            break
        else:
            link = link.replace('\n','')
        crawler = spider(link,workName)
        logger(workName, 'Current finished position: file ' + str(position[0] + 1) + ' line ' + str(position[1] + 1))
        text_patiant = crawler.getText(tag4pat, class4pat)[0]
        if text_patiant is not None:
            text_doctor = crawler.getText(tag4doc, class4doc)[0]
            if text_doctor is not None:
                textGenerator(workName, text_patiant, 'pat', position, volume, 1)
                textGenerator(workName, text_doctor, 'doc', position, volume)
        if position[1] == volume - 1:
            position[0] += 1
            position[1] = 0
        else:
            position[1] += 1
        time.sleep(1)
Example #28
def write_to_csv(city, area, random_delay):
    '''
    :param city: city name
    :param area: area name
    :return: writes the scraped data to ershoufang-city-area.csv
    '''
    city_ch = cities[city]
    area_ch = get_city_area(city)[area]
    print('Now writing {0}|{1}'.format(city_ch, area_ch), 'to csv')
    with open('ershoufang-{0}-{1}.csv'.format(city_ch, area_ch),
              'w') as csvfile:
        for info in spider(city, area, random_delay):
            print("Now wrting:", '|'.join(info[0:5]))
            csvfile.write('|'.join(info))
            csvfile.write("\n")
Example #29
File: get_info.py  Project: zxgdll/renren
def run(meth,orig_id=None):
    repo_mode, repo_name, user, passwd = init_config()
    spider.set_repo(repo_mode)
    tt = spider.spider(repo_name,user,passwd)
    tt.log.setLevel(20)
    my_rid, login_info = tt.login()
    if my_rid is None:
        print('spider login error. detail:{}'.format(login_info))
        if not input('continue for test?(1/0)'):
            return None
        else:
            my_rid='11111111'
    else:
        print('spider login success. rid={}'.format(my_rid))
    if orig_id is None:
        orig_id = my_rid
    meth(tt,orig_id)
Example #30
def main():
    global myRequests, headers
    # ssl._create_default_https_context = ssl._create_unverified_context
    headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36'}
    myRequests = requests.Session()
    myRequests.headers.update(headers)

    link = weixin()
    link.start()
    spi = spider()
    while 1:
        price = spi.start()
        print(price)
        # for price in prices:
        #     print (price[1])
        link.sendMsg('mountain blue', 'Daen', str(price))
        time.sleep(10)
Example #31
File: web.py  Project: chris-magic/spider
def get_info():
    url = request.form.get('url')

    if re.search(u"http", url) is None:
        url = "http://" + url

    error = None
    #spider(url)
    iteminfo = spider(url)

    if iteminfo == {}:
        error = "invalid item"
        return render_template("show_info.html", error=error)
    if iteminfo != {}:
        title = iteminfo['t'].decode('utf-8')
        price = iteminfo['p'].decode('utf-8')
        links = iteminfo['l']
        #return iteminfo[1]
        return render_template("show_info.html", title=title, price=price, links=links)
Example #32
    def __init__(self, proxies=None, apikey=None):
        """
        Creates an instance of the ZAP api client.

        :Parameters:
           - `proxies`: dictionary of ZAP proxies to use.

        Note that all of the other classes in this directory are generated;
        new ones will need to be manually added to this file.
        """
        self.__proxies = proxies or {
            'http': 'http://127.0.0.1:8080',
            'https': 'http://127.0.0.1:8080'
        }
        self.__apikey = apikey

        self.acsrf = acsrf(self)
        self.ajaxSpider = ajaxSpider(self)
        self.ascan = ascan(self)
        self.authentication = authentication(self)
        self.authorization = authorization(self)
        self.autoupdate = autoupdate(self)
        self.brk = brk(self)
        self.context = context(self)
        self.core = core(self)
        self.forcedUser = forcedUser(self)
        self.httpsessions = httpSessions(self)
        self.importLogFiles = importLogFiles(self)
        self.params = params(self)
        self.pnh = pnh(self)
        self.pscan = pscan(self)
        self.reveal = reveal(self)
        self.script = script(self)
        self.search = search(self)
        self.selenium = selenium(self)
        self.sessionManagement = sessionManagement(self)
        self.spider = spider(self)
        self.stats = stats(self)
        self.users = users(self)

        # not very nice, but prevents warnings when accessing the ZAP API via https
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
Example #33
    def __init__(self, proxies=None, apikey=None):
        """
        Creates an instance of the ZAP api client.

        :Parameters:
           - `proxies`: dictionary of ZAP proxies to use.

        Note that all of the other classes in this directory are generated;
        new ones will need to be manually added to this file.
        """
        self.__proxies = proxies or {
            'http': 'http://127.0.0.1:8080',
            'https': 'http://127.0.0.1:8080'
        }
        self.__apikey = apikey

        self.acsrf = acsrf(self)
        self.ajaxSpider = ajaxSpider(self)
        self.ascan = ascan(self)
        self.authentication = authentication(self)
        self.authorization = authorization(self)
        self.autoupdate = autoupdate(self)
        self.brk = brk(self)
        self.context = context(self)
        self.core = core(self)
        self.forcedUser = forcedUser(self)
        self.httpsessions = httpSessions(self)
        self.importLogFiles = importLogFiles(self)
        self.params = params(self)
        self.pnh = pnh(self)
        self.pscan = pscan(self)
        self.reveal = reveal(self)
        self.script = script(self)
        self.search = search(self)
        self.selenium = selenium(self)
        self.sessionManagement = sessionManagement(self)
        self.spider = spider(self)
        self.stats = stats(self)
        self.users = users(self)

        # not very nice, but prevents warnings when accessing the ZAP API via https
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
Example #34
 def spider_btn(self):
     '''
     Button handler that starts the spider
     '''
     try:
         # Clear the log and the table
         self.textBrowser_spider.clear()
         self.tableWidget.setRowCount(0)
         # Get the start/end ids and the save path
         start_id = self.lineEdit_start_id.text()
         end_id = self.lineEdit_end_id.text()
         filename = self.lineEdit_save.text()
         # Instantiate the spider object and start crawling
         self.spiderObj = spider(start_id, end_id, filename)
         self.spiderObj.sig_one_end.connect(self.spider_end)
         self.spiderObj.sig_item_end.connect(self.spider_item)
         self.spiderObj.sig_end.connect(self.spider_end)
         self.spiderObj.start()
     except Exception as e:
         print(e)
Example #35
def main():
    Config = config('conf/config.ini')
    MatchesDetailBaseUrl = Config.getMatchesUrl()
    
    Requst = spider()
    #get parsed matches id
    MatchesIds = readMatchIdFromFile('data/match_ids.data')
    AlreadyDoneIdFile = 'tmp/AlreadyDoneId.data'
    AlreadyDoneId = []
    try:
        AlreadyDoneId = readAlreadyDoneIds(AlreadyDoneIdFile)
    except:
        traceback.print_exc()
    
    ids = list(set(MatchesIds) - set(AlreadyDoneId))
    #get matches detail by matches id
    count = 0
    for item in ids:
        retry = 0
        while retry < 3:
            try:
                MatchID = item
                StoreFileName = 'data/matchesdata/{}.data'.format(MatchID)
                if os.path.exists(StoreFileName):
                    print('file {} exists.'.format(StoreFileName))
                    AlreadyDoneId.append(MatchID)
                    break
                print('start match_id={}'.format(MatchID))
                MatchesDetailUrl = changeMatchId(MatchesDetailBaseUrl, str(MatchID))
                print(MatchesDetailUrl)
                JsonMatchesDetail = getMatchesDetail(Requst, MatchesDetailUrl)            
                saveData(StoreFileName, JsonMatchesDetail)
                AlreadyDoneId.append(MatchID)
                if count %100 == 0:
                    saveData(AlreadyDoneIdFile, AlreadyDoneId)
                break
            except:
                traceback.print_exc()
                print('exception in {}, retry {} times'.format(MatchID,retry))
                retry+=1
        count += 1
Example #36
File: web.py  Project: xiangyuelin/spider
def get_info():
    url = request.form.get('url')

    if re.search(u"http", url) is None:
        url = "http://" + url

    error = None
    #spider(url)
    iteminfo = spider(url)

    if iteminfo == {}:
        error = "invalid item"
        return render_template("show_info.html", error=error)
    if iteminfo != {}:
        title = iteminfo['t'].decode('utf-8')
        price = iteminfo['p'].decode('utf-8')
        links = iteminfo['l']
        #return iteminfo[1]
        return render_template("show_info.html",
                               title=title,
                               price=price,
                               links=links)
Example #37
    def __init__(self, proxies={'http': 'http://127.0.0.1:8080',
        'https': 'http://127.0.0.1:8080'}):
        """
        Creates an instance of the ZAP api client.

        :Parameters:
           - `proxies`: dictionary of ZAP proxies to use.
           
        Note that all of the other classes in this directory are generated;
        new ones will need to be manually added to this file.
        """
        self.__proxies = proxies
        
        self.acsrf = acsrf(self)
        self.ascan = ascan(self)
        self.auth = auth(self)
        self.autoupdate = autoupdate(self)
        self.context = context(self)
        self.core = core(self)
        self.params = params(self)
        self.pscan = pscan(self)
        self.search = search(self)
        self.spider = spider(self)
Example #38
 def init(self):	    
     #  This should all be done by the manifest parser
     globals.marek = marek.marek(120, 100, 2)    # initialize the marek object        
     increment = 580 // int(globals.spider_number)  # integer division so range() gets an int step
     for i in range(20, 600, increment):            
         sprite = spider.spider(i, 420)          
         images = [engine.load_image('spider.png')]
         ani = em.Animation(images)            
         event = em.SpriteEvent("onIdle", None, ani, 0, 0)
         sprite.em.add(event)
         event = em.SpriteEvent("onWalkLeft", None, ani, 0, 0)
         sprite.em.add(event)
         event = em.SpriteEvent("onWalkRight", None, ani, 0, 0)
         sprite.em.add(event)
         event = em.SpriteEvent("onJump", None, ani, 0, 0)
         sprite.em.add(event)
         event = em.SpriteEvent("onFall", None, ani, 0, 0)
         sprite.em.add(event)            
         event = em.SpriteEvent("onShoot", None, ani, 0, 0)
         sprite.em.add(event)            
         event = em.Event("onCollide", None)
         sprite.em.add(event)            
         globals.spiders.append(sprite)  
     globals.camera = engine.Camera(globals.map, globals.marek, globals.window.width, globals.window.height)
Example #39
File: zhihu.py  Project: yjzx121/Spider
def weiboIdToUrl(url):
    html = spider(url)
    return re.search(r'私信</a>&nbsp;\<a href="/(.*?)/info"\>资料</a>', html,
                     re.S).group(1)
Example #40
File: zhihu.py  Project: yjzx121/Spider
    # COOKIES = weiboCookies
    # url = weiboURL
    # weiboIdSpider(spider(url + 'follow'))

    zhihuCookies = {
        "Cookie":
        "_ga=GA1.2.200227533.1448713358; " +
        "q_c1=3c8a6952ff6b451186e548a78e07e5f3|1448717055000|1448717055000; " +
        "_za=c4856f7e-2b10-4c6f-ac1c-7120243828b1; _xsrf=53de1f0fc43b2118b48cfe714e889872; __utmt=1; "
        +
        'cap_id="NmJkYTc0OWUzZTQ4NGQyY2E3MjQ2ZmI0NWU0Mzk1MzM=|1448868103|a22b3ff3843b0e08bf078ff3cabd69c590ffe399"; '
        + "__utma=51854390.200227533.1448713358.1448868015.1448868015.1; " +
        "__utmb=51854390.16.9.1448868181265; __utmc=51854390;" +
        "__utmz=51854390.1448868015.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; "
        +
        "__utmv=51854390.000--|2=registration_date=20130912=1^3=entry_date=20151128=1; "
        'z_c0="QUFBQXZRc2VBQUFYQUFBQVlRSlZUWGVHZzFhbTcyREhCeDYwdWRFRUx2RlZvemFOWm14ck9BPT0=|1448868215|cce7afe611ff3806d63d8d8b943b36af4c40302d"; '
        +
        'unlock_ticket="QUFBQXZRc2VBQUFYQUFBQVlRSlZUWDhBWEZiRXBVVTZxdFA0bTluOEVHMWFIY3pNU2dvWWVBPT0=|1448868215|f0ede700d045c55277c0f34f20a00f332f6be485"'
    }
    zhihuURL = 'http://www.zhihu.com/people/xie-ke-41/followees'

    # url = zhihuURL
    # COOKIES = zhihuCookies
    # zhihuIdSpider(spider(url))

    COOKIES = weiboCookies
    url = 'http://weibo.cn/2807748433/info'
    weiboInfoSpider(spider(url))
Example #41
 def test_gets_only_aired_eps(self):
     spiderbro = spider.spider()
     mockdate = date(2015, 1, 20) # only season ep 9 has aired at this point
     tvdb_episodes, status = spiderbro.get_tvdb_episodes('Constantine', mockdate)
     self.assertDictEqual(tvdb_episodes, {1:[1,2,3,4,5,6,7,8,9]})
Example #42
 def test_ignores_specials(self):
     spiderbro = spider.spider()
     tvdb_episodes = spiderbro.get_tvdb_episodes('Doctor Who (2005)')
     self.assertNotIn(0, tvdb_episodes)
Example #43
File: a.py  Project: zxyuling/z7z8
# -*- coding: utf-8 -*-
import re
import sys
import spider
name=1
def p(dom, url):
	global name
	if re.findall('400-188-6666', str(dom)):
		with open('F:\\webdeveloper\\z7z8\\88\\' + str(name) + '.html', 'wb') as f:
			f.write(dom)
		with open('F:\\webdeveloper\\z7z8\\88\\list.txt', 'a') as f1:
			f1.write(url)
		name = name + 1
print(sys.argv)
sp = spider.spider(sys.argv[1],int(sys.argv[2]),int(sys.argv[3]),p)
sp.start()
print(sp.pagelist())
Example #44
 def test_dont_mark_download_whole_season_if_season_not_finished_airing(self):
     mockDAL = Mock(spec=DAL.DAL)
     mockDAL.get_eps_for_show = MagicMock(return_value = {})
     spiderbro = spider.spider(mockDAL)
     tvdb_episodes = spiderbro.get_missing_episodes('Orphan Black')
     self.assertNotEqual(tvdb_episodes[3], [-1])
Example #45
 def test_mark_show_as_ended(self):
     mockDAL = Mock(spec=DAL.DAL)
     spiderbro = spider.spider(mockDAL)
     tvdb_episodes, status = spiderbro.get_tvdb_episodes('Life On Mars')
     mockDAL.mark_show_ended.assert_called_with('Life On Mars')
Example #46
 def test_updates_show_table(self):
     mockDAL = Mock(spec=DAL.DAL)
     mockDAL.get_eps_for_show = MagicMock(return_value = {1: [1, 2]})
     spiderbro = spider.spider(mockDAL)
     missing_episodes = spiderbro.get_missing_episodes("Life On Mars")
     mockDAL.update_show_table.assert_called_with('Life On Mars')
Example #47
def scrap(start_day=None, end_day=None, start=1, end=20):
	logger.clear()
	logger.begin(start_day, end_day, start)
	logger.log("Try to get expressions...", flush=True)
	if end is not None:
		expressions = expression.objects.filter(id__range=(start, end)).order_by('id')
	else:
		expressions = expression.objects.filter(id__range=(start, 3000)).order_by('id')

	s = spider()
	logger.log("Try to login...", flush=True)
	browser = s.login()
	cnt = 0
	file_path = ''
	for item in expressions:
		cnt += 1
		logger.log(u"第" + str(item.id) + u"个表达式:" + item.name, count=item.id, flush=True)
		# Check whether we are still logged in
		check_login = s.check_login(browser)
		if not json.loads(check_login)['success']:
			logger.log('check is not login , sleep 100s ,then try login again')
			time.sleep(100)
			browser = s.login()

		file_path = s.get_xls_by_expression(item.content, browser, start_day, end_day)
		if file_path is not None:
			file_path = os.path.normpath(file_path)
			#logger.log(file_path)
			rows = excel_table_byindex(file_path, include_name=False)
			# Delete the downloaded file
			os.remove(file_path)
			for row in rows:
				# Skip the header row
				if row == rows[0]:
					continue
				apply_num = row[0]
				# Check for duplicates
				p = patent.objects.filter(apply_number=apply_num)
				if len(p) > 0:
					logger.log("{0} update!".format(apply_num))
					p = p[0]
					records = excute_record.objects.filter(expression=item, time_stamp=row[6])
					if len(records) > 0:
						record = records[0]
					else:
						record = excute_record(expression=item, time_stamp=row[6])
						record.save()
					p.record = record
					p.apply_number = row[0]
					p.name = row[1]
					p.main_classify_code = row[2]
					p.classify_code = row[3]
					p.apply_man = row[4]
					p.invente_man = row[5]
					p.publicity_date = row[6]
					p.publicity_code = row[7]
					p.patent_agent = row[8]
					p.agent = row[9]
					p.aplly_date = row[10]
					p.address = row[11]
					p.priority = row[12]
					p.province_code = row[13]
					p.abstract = row[14]
					p.main_right = row[15]
					p.international_apply = row[16]
					p.international_publicity = row[17]
					p.enter_country_date = row[18]
					p.right_demand = row[20]
					p.valid_state = row[21]
					p.state_code = row[22]
					p.type = row[23]
					p.save()
					continue
				logger.log(apply_num)
				# Insert the record
				records = excute_record.objects.filter(expression=item, time_stamp=row[6])  # row[6] == publication date
				if len(records) > 0:
					# logger.log("record already exist !")
					record = records[0]
				else:
					record = excute_record(expression=item, time_stamp=row[6])  # row[6] == publication date
					record.save()
				p = patent(
					record=record,                    # associated execution record
					apply_number=row[0],              # application number
					name=row[1],                      # title
					main_classify_code=row[2],        # main classification code
					classify_code=row[3],             # classification code
					apply_man=row[4],                 # applicant (patentee)
					invente_man=row[5],               # inventor (designer)
					publicity_date=row[6],            # publication (announcement) date
					publicity_code=row[7],            # publication (announcement) number
					patent_agent=row[8],              # patent agency
					agent=row[9],                     # agent
					aplly_date=row[10],               # application date
					address=row[11],                  # address
					priority=row[12],                 # priority
					province_code=row[13],            # country/province code
					abstract=row[14],                 # abstract
					main_right=row[15],               # main claim
					international_apply=row[16],      # international application
					international_publicity=row[17],  # international publication
					enter_country_date=row[18],       # national-phase entry date
					right_demand=row[20],             # claims
					valid_state=row[21],              # legal status
					state_code=row[22],               # patent status code
					type=row[23],                     # patent type
				)
				try:
					p.save()
				except Exception, e:
					logger.log(str(e), flush=True)
					logger.log('failed to save patent!',flush=True)
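
If the ORM in Example #47 is Django's (the objects.filter/save calls suggest it is), the repeated "filter, check length, else create" blocks for excute_record could be collapsed with get_or_create, roughly:

# Sketch, assuming the Django ORM: fetch the execution record or create it in one call.
record, created = excute_record.objects.get_or_create(
    expression=item,
    time_stamp=row[6],  # row[6] == publication date
)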
Example #48
hos_level = {
	33:'三级甲等',
	30:'三级医院',
	20:'二级医院',
	10:'一级医院',
}
num = 0
for x in hos_class:	# iterate over hospital types
	for y in hos_level:	# iterate over hospital levels
		for z in hos_area:	# iterate over hospital regions
			# Fetch and parse the listing page
			hos_class_key = x
			hos_level_key = y
			hos_area_key = z
			url = "http://www.zj12580.cn/hos/all?page=1&pageSize=30&levelId=" + str(hos_level_key) + "&typeId=" + str(hos_class_key) + "&areaId=" + str(hos_area_key)
			html = spider(url)
			print hos_level[hos_level_key],hos_class[hos_class_key],hos_area[hos_area_key]
			# 进行页面解析
			soup = BeautifulSoup(html)
			content = soup.find('div',{"class":"left_hos_bottom"})
			hos_list = content.findAll("tr")
			# When the page has no valid content, skip to the next combination
			if len(hos_list) == 0:
				continue
			for n in range(len(hos_list)):
				hos_info = hos_list[n].find('p',{'class':'title'}).find('a')
				hos_name = hos_info.text.encode('utf8')		# hospital name
				hos_url = hos_info['href']			# hospital URL
				match_obj = re.match(r'.*hos/info/(\d{1,4})\?deptCode.*',hos_url)
				hos_offi_id = match_obj.group(1)		# hospital id on the official site
	#			sql = "insert into hospital(hos_name,class,level,region,hos_url)\
Example #49
from BeautifulSoup import BeautifulSoup

db = MySQLdb.connect("localhost","root","123456","guahao" )
cursor = db.cursor()
sql_charset = 'set names utf8'
cursor.execute(sql_charset)
sql = 'select hos_id,hos_url,hos_name from hospital '
cursor.execute(sql)
results = cursor.fetchall()
for row in results:
	hos_id = int(row[0])
	hos_url = row[1]
	hos_name = row[2]
	print hos_name
	if hos_url == 'http://www.bjguahao.gov.cn/comm/yyks-91.html':
		print '朝阳医院出现问题'
	else:
		html = spider.spider(hos_url)
		soup = BeautifulSoup(html)
		link_a = soup.find('a',{'rel':'#gm-ditu'})
		img_src = link_a.find('img')['src'].encode('utf8')
		match_obj = re.match(r'.*center=(\d{1,4}\.\d{4,7}),(\d{1,4}\.\d{5,7}).*',img_src)
		longitude = match_obj.group(1)
		latitude = match_obj.group(2)
		sql_2 = 'update hospital set longitude =%s,latitude=%s where hos_id = %d' % (longitude,latitude,hos_id)
		cursor.execute(sql_2)
		db.commit()
		print longitude,latitude
db.close()
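
Example #49 builds its UPDATE statement with % string formatting; a sketch of the same statement using MySQLdb's parameter binding instead (the driver uses %s placeholders for every value type):

# Sketch: let the MySQLdb driver quote the values rather than formatting them into the SQL string.
sql_2 = 'update hospital set longitude = %s, latitude = %s where hos_id = %s'
cursor.execute(sql_2, (longitude, latitude, hos_id))
db.commit()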

Example #50
 def test_saves_successful_search_in_database(self):
     mockDAL = Mock(spec=DAL.DAL)
     mockDAL.get_eps_for_show = MagicMock(return_value = {1: [1,2,3,4,5,6,7,8,9,10,11,12]})
     spiderbro = spider.spider(mockDAL)
     spiderbro.find_torrents_for_show("Constantine")
     mockDAL.mark_episode_for_download.assert_called_once_with("Constantine", 1, 13, unittest.mock.ANY, unittest.mock.ANY)
Example #51
 def spider(self):
     resultado = "Inicio"
     urlIni = self.par1.get()
     deep = self.par2.get()
     resultado = spider.spider(urlIni, deep)
     self.x.set(resultado)   
Example #52
 def test_can_get_missing_episodes(self):
     mockDAL = Mock(spec=DAL.DAL)
     mockDAL.get_eps_for_show = MagicMock(return_value = {1: [1, 2]})
     spiderbro = spider.spider(mockDAL)
     missing_episodes = spiderbro.get_missing_episodes("Life On Mars")
     self.assertDictEqual(missing_episodes, {1:[3,4,5,6,7,8], 2:[-1]})
Example #53
# -*- coding:utf-8 -*-

import spider

if __name__ == '__main__':
    word = input("Input key word: ")
    url = 'https://www.baidu.com/sf/vsearch?wd=' + word + '&pd=video'
    spider = spider.spider()
    urls = spider.parseHtml(spider.getHtml(url))
    spider.dowmloadVideos(urls)
Example #54
import threading
from queue import Queue
from spider import spider
from domain import *
from general import *

PROJECT_NAME = 'wikipedia'
HOMEPAGE = 'https://www.wikipedia.org'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 2
queue = Queue()
spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
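
The listing stops after work(); going by the camelCase version of the same crawler in Example #1, the rest of the driver presumably looks roughly like the sketch below (file_to_set is assumed to be the snake_case counterpart of fileToSet from the general module).

# Sketch of the remaining driver, mirroring Example #1.

# Each queued link is a new job
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()
    crawl()


# Check whether there are links left in the queue; if so, crawl them
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in queue')
        create_jobs()


create_workers()
crawl()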
Example #55
 def test_can_get_tvdbshow_episode_list(self):
     spiderbro = spider.spider()
     tvdb_episodes, status = spiderbro.get_tvdb_episodes('Life On Mars')
     self.assertDictEqual(tvdb_episodes, {1:[1,2,3,4,5,6,7,8], 2:[1,2,3,4,5,6,7,8]})
Example #56
# python 3 compatibility
import configuration
import DAL
import logging as log
import spider

configuration.setup_logging()
log.info("SpiderBro, two point oh!")

dal = DAL.DAL()
sbro = spider.spider(dal)

# TODO: implement cmdline args for 1 show, all shows, all shows airing this week

for show in dal.get_full_show_list():
    ga_list = sbro.find_torrents_for_show(show)
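
The TODO in Example #56 asks for command-line handling; a minimal argparse sketch for the single-show / all-shows choice follows. The flag names are assumptions, while dal and sbro come from the example itself.

# Sketch of the TODO'd command-line handling; --show/--all are hypothetical flag names.
import argparse

parser = argparse.ArgumentParser(description="SpiderBro torrent finder")
parser.add_argument("--show", help="find torrents for a single show")
parser.add_argument("--all", action="store_true", help="process every show in the database")
args = parser.parse_args()

if args.show:
    sbro.find_torrents_for_show(args.show)
elif args.all:
    for show in dal.get_full_show_list():
        sbro.find_torrents_for_show(show)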