Example #1
    def test_attributes(self):
        """Test the setting of attributes.

        1. Test default attributes.
        2. Test user-defined attributes. All arguments are different from the default settings.
        """

        # Test default attributes
        patterns = {
            'scheme_pattern': r'http|https',
            'domain_pattern': r'.*',
            'path_pattern': r'.*',
        }
        c = Crawler()
        ## Test the WorkingListManager
        self.assertTrue(isinstance(c.WLM, WorkingListManager))
        self.assertFalse(c.WLM.workExists())

        ## Test the UrlMatcher
        self.assertTrue(isinstance(c.UM, UrlMatcher))
        for k, p in patterns.items():
            self.assertEqual(getattr(c.UM, k), p)

        ## Test other attributes
        self.assertFalse(c.curUrl)
        self.assertFalse(c.extractors)
        self.assertTrue(c.autoAddInternalLinks)

        #----------------------------#

        # Test user-defined attributes
        extractors = [
            self.Extractor_1('title'),
            self.Extractor_2('para'),
            self.Extractor_3('gift1'),
            self.Extractor_4('giftTitles')
        ]
        patterns = {
            'scheme_pattern': 'http',
            'domain_pattern': 'www.yahoo.com',
            'path_pattern': r'/path/page.*$',
        }
        c = Crawler(workingList=self.testUrls,
                    **patterns,
                    extractors=extractors,
                    autoAddInternalLinks=False)

        ## Test the WorkingListManager
        self.assertTrue(c.WLM.workExists())
        self.assertEqual(c.WLM.records, deque(self.testUrls))

        ## Test the UrlMatcher
        self.assertTrue(isinstance(c.UM, UrlMatcher))
        for k, p in patterns.items():
            self.assertEqual(getattr(c.UM, k), p)

        ## Test other attributes
        self.assertFalse(c.curUrl)
        self.assertEqual(c.extractors, extractors)
        self.assertFalse(c.autoAddInternalLinks)
Example #2
 def set_crawlers(self):
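     # Load (or create) the page records for the old and new versions, then attach a crawler to each on its project's port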
     old_page = Page.get_or_create(self.db,
                                   self.project_config.project_name,
                                   self.versions.old,
                                   Url.clean_url(Constants.DOCKER_URL))
     new_page = Page.get_or_create(self.db,
                                   self.project_config.project_name,
                                   self.versions.new,
                                   Url.clean_url(Constants.DOCKER_URL))
     self.old_crawler = Crawler(old_page, self.old_project.port)
     self.new_crawler = Crawler(new_page, self.new_project.port)
Example #3
 def test_scarpe_keyword(self):
     lista = [
         "https://www.cucineluberoma.it",
         "https://lubecreomilano.it/",
     ]
     crw = Crawler()
     crw.scrape_keyword(lista)
Example #4
    def __init__(self):

        # Checks to see if a json file with the data for the week exists
        # TODO: Check if the file is up to date
        if os.path.isfile('../CafeAPI/data.json'):
            # Reads the data from the file into a variable
            with open('../CafeAPI/data.json', 'r') as f:
                self.base = json.load(f)
            print("Database: Retrieved data from file")
        else:
            # Run the Crawler up to 4 times for more stability in case of unstable internet
            for i in range(4):
                try:
                    # Release Crawler
                    with Crawler() as c:
                        # Navigate and collect data
                        c.nav()

                        # Set data to variable
                        self.base = c.get_info()

                        # Write the data to a file for future reference
                        with open('../Throwaway/CafeAPI/data.json', 'w') as f:
                            json.dump(self.base, f)
                    # Break if all of the above works successfully
                    print(f"Database: Retrieved data from Crawler on try #{i}")
                    break
                except Exception:
                    # This means that something failed and the program has to retry
                    print(
                        f"Database: Something went wrong, loading data retry #{i}"
                    )
                    pass

        print("Database: Initiated Data Collection")
Example #5
def step2_download_zipfiles():
	desktop_path = Helper.get_desktop_dir()
	directory = os.path.join(desktop_path, RAW_DATA_PATH)
	if not os.path.exists(directory):
	    os.makedirs(directory)

	db = Database()
	currency_list = db.get_currency_list()
	crawler_list = [Crawler(db) for x in range(THREAD_NUMBER)]

	lock = threading.RLock()

	def down_data(crawler):

	    while len(currency_list) > 0:

	        with lock:
	            currency = currency_list[0]
	            currency_list.remove(currency)

	        crawler.download_historical_data(currency["symbol"], currency["time"], directory)
	    crawler.quit()

	for crawler in crawler_list:
	    t = threading.Thread(target=down_data, args=(crawler, ))
	    t.start()
Example #6
def start():
	'''Start the crawling process: take one node from the waiting set at a time and crawl it.'''
	# Initialization
	mongo_peoples, redis_client = Init()

	# Wait while the set of nodes to crawl is still empty
	while redis_client.scard(waiting_set) == 0:  # empty
		# Wait for wait_time seconds
		time.sleep(wait_time)

	# Randomly pop one node from the waiting set
	node = redis_client.spop(waiting_set)
	urlToken = node

	# Crawl the profile of the user this node represents
	# printx('Preparing proxy ...')
	printx('Crawling the profile of user %s ...' % urlToken)
	try_cnt = try_limit
	while try_cnt > 0:
		try:
			c = Crawler(isCookie=False, timeout=socket_timeout)
			# Manually set the proxy IP
			ip = proxyip.get()
			c.set_proxyip(ip)

			people = get_Info(c, urlToken)
			if people is None:
				raise Exception('The crawled user profile is empty')
		except Exception as e:
			try_cnt -= 1
			print(e)
			printx('Failed to crawl the profile of user %s, %d attempts left' % (urlToken, try_cnt))
		else:
			break
Example #7
def load_more(url):
	crawler = Crawler()
	crawler.get(url)
	assert "Influencer Love | Fashion ID" in crawler.title, "TITLE INCORRECT"
	try:
		times_clicked = 0
		start = int(time())
		while True:
			# delete tab if we accidentally trip a twitter tab open
			if "Twitter" in crawler.getTitle():
				crawler.driver.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 'w')
				any_button = crawler.findElementsByXPath("//a")[0]
				any_button.send_keys(Keys.COMMAND + 'w')
				crawler.closeExtraTabs()
			
			# find load more button
			load_more_button = crawler.findElementByXPath("//a[@id='ctf-more']")
			crawler.highlight("//a[@id='ctf-more']")
			crawler.click(load_more_button)
			times_clicked += 1
			print('%s CLICKS' % times_clicked)
			crawler.closeExtraTabs()

	except Exception as e:
		print('EXCEPTION', e)
	crawler.close()
	end = int(time())
	print(start)
	print(end)
	print('TOTAL TIME ELAPSED: %s' % (end - start))
Example #8
def main():
    if len(sys.argv) > 1 and len(sys.argv) < 5:
        context = sys.argv[1]
        place = "./"
        clean = False

        if len(sys.argv) == 3:
            if sys.argv[2] == "--clean":
                clean = True
            else:
                place = sys.argv[2]
        elif len(sys.argv) == 4:
            place = sys.argv[2]
            if sys.argv[3] == "--clean":
                clean = True
            else:
                print("Unrecognized option '" + sys.argv[3] + "'")

        with Crawler(place, context) as spider:
            if not clean:
                if (os.path.exists(spider.listFileName)
                        and os.path.exists(spider.visitedFileName)):
                    spider.loadState(spider.listFileName,
                                     spider.visitedFileName)
                else:
                    print("Could not load one or more of the URL lists.")

            spider.recursivePull()
    else:
        printHelp()
Example #9
def random_page(url):
    crawler = Crawler()
    crawler.get(url)
    assert "Urban Dictionary" in crawler.title, "TITLE INCORRECT"
    try:
        # find random page
        random_button = crawler.findElementByXPath(
            "//a[@class='circle-button' and @href='/random.php']")
        crawler.highlight(
            "//a[@class='circle-button' and @href='/random.php']")
        crawler.click(random_button)

        # extract content
        content = {}
        content["word"] = crawler.findElementByXPath(
            "(//a[@class='word'])[1]").text
        crawler.highlight("(//a[@class='word'])[1]")
        content["meaning"] = crawler.findElementByXPath(
            "(//div[@class='meaning'])[1]").text
        crawler.highlight("(//div[@class='meaning'])[1]")
        content_dict = dumps(content)
        return content_dict
    except Exception as e:
        print('MISSING', e)
    crawler.close()
Example #10
    def __sync_thread(self):
        while True:
            try:
                resources_tags = self.auto_tagger.process(
                    Crawler(
                        self.get('black-list'),
                        self.get('white-list'),
                        self.get('crawled-resources'),
                    ).crawl())

                SyncAgent(
                    self.get('settings')['server'],
                    self.get('settings')['user-token'],
                    self.get('settings')['device-token'],
                ).sync(resources_tags)

            except Exception as new_exception:
                print('[ERROR]: When trying to sync: {0}'.format(
                    new_exception))

            else:
                self.get('crawled-resources').update(
                    set(resource for resource, _ in resources_tags))

            time.sleep(self.get('settings')['sync']['interval'])
Example #11
def test_blocks():
    """
    Check transactions in each of a random sample of blocks.

    Send a request to https://etherchain.org/api/block/:block/tx to get a list
    of all transactions that occurred in that block. Cross-reference with the
    transactions in the local block (in mongo).
    """
    c = Crawler.Crawler(start=False)
    client = c.mongo_client

    sample = random.sample(range(1, 1700000), 100)
    N = len(sample)

    # Track the number of times the number of transactions is different.
    wrong_blocks = list()
    num_error = "Incorrect number of transactions in {}% of {} blocks."

    blocks = client.find({"number": {"$in": sample}})
    for block in blocks:
        n = block["number"]
        uri = "https://etherchain.org/api/block/{}/tx".format(n)
        ethchain = json.loads(requests.get(uri).text)

        # Check the number of transactions in the block
        if len(ethchain["data"]) != len(block["transactions"]):
            wrong_blocks.append(n)

    wrong_nums = len(wrong_blocks)
    pprint.pprint(wrong_blocks)
    assert wrong_nums == 0, num_error.format(100.0 * wrong_nums / N, N)
Example #12
def check_with_cosine():
    url_file = open("categories/index/url.txt", encoding="utf-8")
    urls = url_file.read().split()
    login_vector_file = open("categories/login/vector.txt", encoding="utf-8")
    index_vector_file = open("categories/index/vector.txt", encoding="utf-8")
    register_vector_file = open("categories/register/vector.txt",
                                encoding="utf-8")
    login_vector = login_vector_file.read().split()
    for i in range(0, len(login_vector)):
        login_vector[i] = int(login_vector[i])
    indexVector = index_vector_file.read().split()
    registerVector = register_vector_file.read().split()
    crawler = Crawler()
    for url in urls:
        # url = "https://mail.sjtu.edu.cn/"
        try:
            words = open("categories/words.txt",
                         encoding='utf-8').read().split("\n")
            vector = crawler.word_frequency_statistics_by_url(url, words)
            print(login_vector)
            print(vector)
            print("Possibility of " + url + " being a login page is " +
                  str(cos(login_vector, vector)))
        except Exception:
            continue
Example #13
def runScan(target):

    crawler = Crawler()
    findings = {}

    print("Scanning: ", target)

    findings.clear()
    findings = {"target":target,"sqlinjection":[], "WeakPassword":[]}


    if not crawler.init(target):
        return

    crawler.crawl()
    crawler.findLoginPanel()

    AuthBypass.check_authbypass(crawler.loginFormEndpoints, findings)
    WeakPasswords.check_weak_passwords(crawler.loginFormEndpoints, findings)


    if len(crawler.loginFormEndpoints) > 0:
        findings["loginForm"]="yes"
    else:
        findings["loginForm"] = "no"

    sqli_scan_urls(crawler.uEndPoints, findings)
    sqli_scan_forms(crawler.fEndpoints, findings)
    CommonFunctions.save_findings(findings)
Example #14
def get_per_followerList(urlToken, page, sum_page):
    '''Crawl one page of the follower list.'''
    printx('Crawling page %d/%d ...' % (page, sum_page))
    try_cnt = try_limit

    follower_list = []

    while try_cnt > 0:
        try:
            # Set up the crawler
            c = Crawler(isCookie=False, timeout=socket_timeout)
            # Manually set the proxy IP
            ip = proxyip.get()
            c.set_proxyip(ip)

            # Parse the HTML of the current page
            url = '%s/people/%s/followers?page=%d' % (host, urlToken, page)
            html = c.get_html(url)
            s = BS(html, 'html.parser')

            # Collect every follower on the current page
            data = s.find('div', attrs={'id': 'data'})['data-state']
            data = json.loads(data)
            items = data['people']['followersByUser'][urlToken]['ids']
            for item in items:
                if item not in (None, False, True, '知乎用户'):  # skip the anonymous-user placeholder
                    follower_list.append(item)
        except Exception as e:
            try_cnt -= 1
            # printx(e)
            printx('Failed to crawl page %d for user %s, %d attempts left' % (page, urlToken, try_cnt))
        else:
            break
Example #15
def main(token, args):
    config = args.config
    storageConfig = config['storage']
    rootLogger.setLevel(args.log or config['general']['log_level'].upper())

    crawler = Crawler(config)
    RawEvent.BASE_TZ = config['crawler']['defaults']['timezone']

    repo, icsSha, jsonSha, jsonContent = loadFromGit(token, storageConfig)
    if not repo:
        return 1

    importEvents = json.loads(jsonContent or '[]')
    if importEvents and not args.ignore_previous_crawls:
        crawler.importJSON(importEvents)

    crawler.discover()
    crawler.resolve()

    exportedJSON = crawler.exportJSON(force=args.force_write)
    exportedICS = crawler.exportICS(force=args.force_write)

    if exportedJSON is False or exportedICS is False:
        logger.info('No new events')
        return 0

    res = storeToGit(repo, storageConfig, icsSha, exportedICS, jsonSha,
                     exportedJSON)
    return 0 if res else 1
Example #16
    def get_article(self, url):
        crawler = Crawler()
        # get html data from url
        web_data = crawler.get_page(url)
        soup = BeautifulSoup(web_data, 'html.parser')

        # remove link news 
        [e.extract() for e in soup('div', {'class':'link_news'})]

        # article title
        self.title = soup('h3', {'id':'articleTitle'})[0].text

        # create date and time of article
        date_time = soup('span', {'class':'t11'})[0].text.split()
        self.date = date_time[0]
        self.time = date_time[1]

        # press name
        press_logo = soup('div', {'class':'press_logo'})[0]
        self.press = press_logo.find('img')['alt']
        del press_logo

        # article contents
        self.contents = soup('div', {'id':'articleBodyContents'})[0].text
        self.contents = re.sub('[\n\r]', '', self.contents)
Example #17
class Main(Process):
    urlToCrawl = ''
    crawling = False
    crawler = Crawler()
    db = DbHandler()

    def __init__(self, url=None):
        global urlToCrawl
        if url is not None:
            urlToCrawl = url

    def start(self, url):
        global urlToCrawl, crawler
        urlToCrawl = url
        self.idle()

    def printen(self, url):
        print(url)

    def idle(self):
        while self.db.getCrawlstate('crawler')[0]:
            while not Main.crawling:
                status = self.db.getCrawlstate(urlToCrawl)
                if status[0]:
                    Main.crawling = True
                else:
                    time.sleep(1800)
            Main.crawling = False
            Main.crawler.startCrawler(urlToCrawl, status[1])
Example #18
    def test_getPageBs(self):
        """Test the method of getting the :obj:`BeautifulSoup` object from a web page.
        
        1. Test getting page bs object from an available page
        2. Test getting page bs object from an unavailable page
        """
        c = Crawler()

        # Test getting page bs object from an available page
        bs = c.getPageBs(self.testUrls[0])
        self.assertTrue(isinstance(
            bs, BeautifulSoup))  # Returned data type correctness
        self.assertEqual(
            self.strIO.getvalue(),
            f'Getting: {self.testUrls[0]}\n')  # Standard output correctness
        self.assertTrue(c.curUrl)  # Current url existence
        self.assertEqual(c.curUrl,
                         self.testUrls[0])  # Current url attribute correctness

        #----------------------------------------------------#

        # Test getting page bs object from an unavailable page
        sys.stdout = self.strIO = io.StringIO()
        bs = c.getPageBs('')
        self.assertFalse(c.curUrl)
        self.assertFalse(bs or isinstance(bs, BeautifulSoup))
        self.assertTrue(
            re.match(f'Getting: \nFailed to get : ', self.strIO.getvalue()))
Example #19
 def test_printInfo(self):
     """Test the :obj:``printInfo`` method.
     """
     c = Crawler(workingList=self.testUrls)
     c.printInfo('this is a message')
     expectedStr = f'Result:\nthis is a message\nRemained Work Amount: 3\n{"-"*20}\n'
     self.assertEqual(self.strIO.getvalue(),
                      expectedStr)  # Standard output string correctness
Example #20
def step1_get_currency_list():
	db = Database()
	crawler = Crawler(db) 
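	# Scrape the currency symbols and their listing times, then save them to the database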
	currency_list, time_list = crawler.get_currency_list_with_url(DEFAULT_SITE_URL + CURRENCYLIST_URL)
	currency_list_dict = [{"symbol": currency_list[index], "time": int(time_list[index])} for index in range(len(currency_list))]
	db.currency_list.insert_many(currency_list_dict)
	crawler.quit()
	db.close()
Example #21
def main():
    argparser = argparse.ArgumentParser(description="Scrapes a Web site and writes the generated HTML to disk for caching")
    argparser.add_argument('root', help='The starting point URL for the crawl (beginning with http:// or https://)')
    args = argparser.parse_args()

    assert args.root.startswith(('https://', 'http://'))
    policy = ScrapingPolicy(args.root)
    Crawler(policy).crawl()
Example #22
async def main():
    webpage_store = WebpageStore()
    webpage_processor = WebpageProcessor(webpage_store)
    crawler = Crawler(webpage_processor, max_depth=3, verbose=True)
    # initial_urls = ["https://en.wikipedia.org/wiki/Web_scraping"]
    # initial_urls = [f"https://swapi.co/api/people/{i}" for i in range(1, 3)]
    initial_urls = get_initial_urls()
    await crawler.run(initial_urls)
Example #23
 def run_crawler(self):
     while True:
         try:
             crawler = Crawler(
                 host=self.__host, port=self.__port, key=self.__key)
             crawler.run()
         except Exception:
             pass
         sleep(self.__cr_time)
Example #24
def download_articles_from(titles_list):
    crawler = Crawler()
    print("Starting download")
    pool = ThreadPoolExecutor(max_workers=5)
    for title in titles_list:
        pool.submit(crawler.search, title)

    pool.shutdown(wait=True)
    crawler.write_fails()
Example #25
 def test_download(self):
     crawler = Crawler()
     with open(r'C:\Users\matti\OneDrive\Desktop\lista.txt') as f:
         content = f.readlines()
     # you may also want to remove whitespace characters like `\n` at the end of each line
     content = [x.strip() for x in content]
     for x in content:
         print(x)
     crawler.scrape_photos("https://www.cucineluberoma.it/", content)
Example #26
    def test_extendWorkingList(self):
        """Test the method of extending the working list.

        1. Test extending an empty working list.
        2. Test extending a non-empty working list.
        """

        # Test extending an empty working list
        c = Crawler()
        self.assertFalse(c.WLM.records)
        c.extendWorkingList(self.testUrls[:2])
        self.assertTrue(c.WLM.records)
        self.assertEqual(c.WLM.records, deque(self.testUrls[:2]))

        # Test extending a non-empty working list
        c = Crawler(workingList=self.testUrls[:1])
        self.assertTrue(c.WLM.records)
        c.extendWorkingList(self.testUrls[1:2])
        self.assertEqual(c.WLM.records, deque(self.testUrls[:2]))
Example #27
 def setup_project(self, version: str, db: Session) -> bool:
     project = self.deploy_version(version)
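     # Register a page record and crawler only if the version deployed successfully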
     if project is not None:
         self.projects[version] = project
         page = Page.get_or_create(db, self.project_config.project_name,
                                   version,
                                   Url.clean_url(Constants.DOCKER_URL))
         self.crawlers[version] = Crawler(page, self.projects[version].port)
         return True
     return False
Example #28
def main() -> None:
    urls = None
    with open('./urls.json') as f:
        urls = json.loads(f.read())
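    # Crawl each entry from urls.json and record its sorted timetable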
    for i in urls:
        crawler = Crawler(i, urls[i])
        crawler.crawl()
        add_data(i, crawler.sorted_time_table)
    save()
    return
Example #29
	def build(self, keyWord, num, bfs=False):
		if bfs is False:
			queue = WebQueue()
		else:
			queue = BfsQueue()
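		# Seed the queue with the top-10 search results for the keyword before building the crawler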
		top10List = self.__getTop10(keyWord)
		for url in top10List:
			queue.offer(url, 0)
		return Crawler(num, queue)
		
Example #30
 def __init__(self, threads, tor=False):
     self.counter = 0
     self.threads = threads
     self.tor = tor
     self.q = (Queue.Queue(), Queue.Queue())
     self.qq = []
     for i in xrange(self.threads):
         self.qq.append((Queue.Queue(), Queue.Queue()))
         c = Crawler(self.qq[-1], self.tor)
         daemon_thread(c.run)