Example #1
def _crawl(crawler: Crawler, args: argparse.Namespace) -> int:
    """Crawl using the provided crawler.

    Args:
        crawler: The crawler object.
        args: The command line arguments.

    Returns:
        0 on success, else 1
    """
    failure_occurred = False
    try:
        crawler.crawl()
        _print_dead_links(crawler.dead_links)
    except CrawlerException as exception:
        logger.error(str(exception))
        failure_occurred = True
    except Exception as exception:
        failure_occurred = True
        # Broad except so that any unexpected error still yields a readable message
        logger.error("Error occurred while crawling")
        if args.show_exception_tb:  # keep the default output clean
            logger.exception(exception)

    return 1 if failure_occurred else 0
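
The function above relies on an args namespace that carries a show_exception_tb attribute. A minimal argparse sketch of how such a flag could be declared follows; the flag name and help text are assumptions rather than the project's actual CLI definition.

import argparse

def _build_parser() -> argparse.ArgumentParser:
    # Hypothetical CLI wiring that would produce args.show_exception_tb
    parser = argparse.ArgumentParser(description="Run the crawler")
    parser.add_argument(
        "--show-exception-tb",
        action="store_true",
        help="print the full traceback instead of only the error message",
    )
    return parser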
Example #2
def start():
    crawler = Crawler()
    crawler_results = crawler.crawl()
    for crawler_result in crawler_results:
        attribute_string = crawler_result.get('attribute_string')
        attribute_usd_price = crawler_result.get('attribute_usd_price')
        attribute = crawler_result.get('attribute')
        converter = Converter()
        print(attribute, converter.convert(attribute_usd_price, attribute_string))
Example #3
class LinkMiner:
    data = None

    def __init__(self, sources: list, targets: list):
        self.crawler = Crawler(sources=sources, targets=targets)
        self.graph = Digraph(strict=True, engine='circo')
        self.graph.graph_attr['overlap'] = 'false'

    def extract(self):
        self.data = self.crawler.run()
        nodes = Counter(self.data['nodes'])
        top = max(nodes.values())
        for node, count in nodes.items():
            scale = str(max(count, top // 4))
            self.graph.node(node, node, size=scale, fontsize=scale)
        for edge in self.data['edges']:
            self.graph.edge(edge['source'], edge['target'])

    def render(self, filename='untitled'):
        self.graph.render(f'{filename}.gv', view=True)

    def export_json(self, filename):
        string = json.dumps(self.data['edges'])
        with open(f'{filename}.json', 'w') as file:
            file.write(string)
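
For readers unfamiliar with the graphviz package used above, the node-sizing idea can be shown in a self-contained sketch with a hard-coded edge list standing in for the Crawler output; the data and filename are assumptions, and rendering requires the Graphviz binaries to be installed.

from collections import Counter
from graphviz import Digraph

# Hypothetical data standing in for self.crawler.run()
edges = [('a', 'b'), ('a', 'c'), ('b', 'c'), ('a', 'b')]
nodes = Counter(n for edge in edges for n in edge)

graph = Digraph(strict=True, engine='circo')
graph.graph_attr['overlap'] = 'false'
top = max(nodes.values())
for node, count in nodes.items():
    # Scale the label with how often the node appears, as extract() does
    graph.node(node, node, fontsize=str(max(count, top // 4)))
for source, target in edges:
    graph.edge(source, target)
graph.render('example.gv', view=False)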
Example #4
class ConverterTest(unittest.TestCase):
    def test_crawl(self):
        self.crawler = Crawler()
        self.assertIsInstance(self.crawler.crawl(), list)

if __name__ == '__main__':
    unittest.main()
Example #5
def create_app_test():

    file_hndlrs = configure_logger()

    mediator_q = Queue()

    clients = [
        Parser(Queue(), mediator_q, config),
        BotProtocol(Queue(), mediator_q, config),
        Crawler(Queue(), mediator_q, config, 10),
        CommandMessageHandler(Queue(), mediator_q, config),
    ]

    mediator = AppMediator(mediator_q, clients)
    mediator.start()

    for client in clients:
        client.start()
    reglament_thread = Thread(target=reglament_work, args=[mediator])
    reglament_thread.start()
    try:
        while True:
            time.sleep(30)
            if not mediator.is_alive():
                logger.error('Mediator died, restarting...')
                mediator = AppMediator(mediator_q, mediator.clients)
                reglament_thread = Thread(target=reglament_work,
                                          args=[mediator])
                reglament_thread.start()
                mediator.start()
            mediator.check_clients()
    finally:
        for file_hndl in file_hndlrs:
            file_hndl.close()
Example #6
File: run.py Project: takaiyuk/hatena-blog
def run_crawler():
    from src.const import NAME_URL_DICT
    from src.crawler import Crawler

    is_headless = args.headless
    for name in NAME_URL_DICT.keys():
        print(f"==={name}===")
        Crawler(name, is_headless).run()
Example #7
def main(argv):
    domain = ''
    try:
        opts, args = getopt.getopt(argv, "hd:", ["domain="])
    except getopt.GetoptError:
        print('app.py -d <domain>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('app.py -d <domain>')
            sys.exit()
        elif opt in ("-d", "--domain"):
            domain = arg
    max_depth = 3
    site_map_set = Crawler(domain, max_depth).crawl(0)
    logger.info('site map:')
    for endpoint in site_map_set:
        logger.info(endpoint)
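
The same option handling can be expressed with argparse, which generates the -h usage text automatically; a hedged equivalent of the getopt block above, returning only the domain:

import argparse

def parse_domain(argv):
    # Accepts -d/--domain and exits with a usage message when it is missing,
    # mirroring the getopt loop in the example
    parser = argparse.ArgumentParser(prog='app.py')
    parser.add_argument('-d', '--domain', required=True)
    return parser.parse_args(argv).domain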
Example #8
    def __init__(self):
        # Parse arguments
        if len(sys.argv) > 1:
            arg = sys.argv[1]
        else:
            arg = ask('Enter domain:')
            if not arg:
                exit()

        # Verify domain integrity
        if '://' in arg:
            parsed = urlsplit(arg)
        else:
            parsed = urlsplit('http://' + arg)
        if '.' not in parsed.netloc:
            pr('Invalid domain!', '!')
            exit()

        # Verify subdomain
        self.subdomain = self.base_domain = None
        pts = parsed.netloc.split('.')
        if len(pts) > 2:
            pr('Is this the subdomain you wish to use? ' + pts[0])
            if pause('agree', cancel=True):  # subdomain
                self.subdomain = pts[0]
                self.base_domain = '.'.join(pts[1:])
        if not self.subdomain:
            self.subdomain = 'www'
        if not self.base_domain:
            self.base_domain = parsed.netloc

        self.domain = parsed.netloc
        self.scheme = parsed.scheme if parsed.scheme else 'http'
        print()
        pr('Using domain: ' + fc + self.domain + fx)

        self.crawler = Crawler(self, parsed.path)
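
The 'http://' prefix added above matters because urlsplit treats a scheme-less string as a bare path and leaves netloc empty; a quick standard-library illustration:

from urllib.parse import urlsplit

print(urlsplit('example.com/page').netloc)         # '' -- no scheme, no netloc
print(urlsplit('http://example.com/page').netloc)  # 'example.com'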
Example #9
 def extract_test(self):
     crawler = Crawler(self.test_url, 1)
     extracted_urls = crawler.extract_links(self.test_url)
     self.assertTrue(len(extracted_urls) > 0)
Example #10
File: crawl.py Project: kkurian/craigsgigs
#!/usr/bin/env python

import logging

from src.crawler import Crawler

if '__main__' == __name__:
    logging.basicConfig(level=logging.DEBUG)
    Crawler().crawl()
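
basicConfig also takes a format argument if timestamps are wanted in the crawl log; a small variation on the call above (the format string is an assumption, not the project's):

import logging

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(name)s: %(message)s',
)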
Example #11
import argparse

from src import settings
from src.api import BildungsserverFeed, LocalXmlFeed, LocalRssFeed
from src.crawler import Crawler, SiemensCrawler, BildungsserverCrawler
from src.exceptions import ConfigurationError

if __name__ == '__main__':
    if settings.CRAWLER.lower() == 'bildungsserver':
        Crawler = BildungsserverCrawler
    elif settings.CRAWLER.lower() == 'siemens-stiftung':
        Crawler = SiemensCrawler
    else:
        raise ConfigurationError("settings.CRAWLER must be 'bildungsserver' or 'siemens-stiftung'.")
    dry_run = settings.DRY_RUN

    crawler = Crawler(dry_run=dry_run)
    crawler.crawl()
Example #12
import sys
import os
from termcolor import colored
from src.crawler import Crawler, CrawlerRequest

if len(sys.argv) < 2:
    print(colored('Please provide the input file as argument.', 'yellow'),
          colored('For eg:', 'yellow'))
    print(colored('$ python AMPCrawler.py /path/to/your/file.txt', 'green'))
    sys.exit(1)
elif not os.path.isfile(sys.argv[1]):
    print(colored('Make sure given file is on correct path', 'red'),
          colored('and file type is file', 'red'))
    sys.exit(1)

try:
    file = sys.argv[1]
    ampCrawl = Crawler(file)
    ampCrawl.run_crawler()
    ampCrawl.show_result()

    sys.exit(ampCrawl.exit_code())
except Exception as e:
    print("Crawler script failing with error:\n%s" % (e))
    sys.exit(1)
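
The same file check can be written with pathlib, since Path.is_file() already returns False for paths that do not exist; a minimal sketch that exits with the same status code:

import sys
from pathlib import Path

# Validate the command line argument before constructing the crawler
if len(sys.argv) < 2 or not Path(sys.argv[1]).is_file():
    print('Please provide an existing input file as argument.')
    sys.exit(1)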
Example #13
import argparse

from src.crawler import Crawler
from src.feeder import Feeder

from warnings import filterwarnings
import pymysql
filterwarnings('ignore', category=pymysql.Warning)

parser = argparse.ArgumentParser(description="Download all Pokemon Showdown's stats files, and fill a database with its stats.")
parser.add_argument("dbms", help="Database Management System", choices=["mysql"])
parser.add_argument("host", help="Database address")
parser.add_argument("user", help="Database user")
parser.add_argument("password", help="User password")
parser.add_argument("dbname", help="Database name")
group = parser.add_mutually_exclusive_group()
group.add_argument("-p", "--only-parse", "--skip-download", help="do not download any file from the internet and only use available local files to build the database", action="store_true")
# group.add_argument("-d", "--only-download", "--skip-parse", help="do not parse and do not store any file in a database, and only download files from the internet", action="store_true")
parser.add_argument("-F", "--folder", help="folder to use to download files into, and to parse from")
parser.add_argument("-f", "--file", help="only process a single specific file")
parser.add_argument("-v", "--verbose", help="be verbose", action="store_true")
args = parser.parse_args()

# Phase 1 : Download
print(args)
if not args.only_parse:
    crawler = Crawler('')
    crawler.run()

# Phase 2 : Parse
feeder = Feeder('stats')
feeder.feedAll(args.dbms, args.host, args.user, args.password, args.dbname)
Example #14
def main(browser, url, config):

    crawler = Crawler(browser, FreidaConfig(browser))
    crawler.scrape(url, 'freida_results_' + str(time.time())[:10] + '.csv')
Example #15
 def test_crawl(self):
     self.crawler = Crawler()
     self.assertIsInstance(self.crawler.crawl(), list)
Example #16
 def robots_test(self):
     crawler = Crawler(self.test_url, 1)
     extracted_links = crawler.extract_links(self.test_url)
     self.assertNotIn(self.robot_path, extracted_links)
     self.assertNotIn(self.robot_url, extracted_links)
Example #17
from src.crawler import Crawler
import time

TIME_LIMIT = 900  # time in seconds, after which the crawler is forcibly stopped.

if __name__ == '__main__':
    print('Running crawler...')

    crawler = Crawler('airbnb')
    crawler.start()

    print('Thread started, Ctrl-c to stop early.')

    try:
        time.sleep(TIME_LIMIT)
    except KeyboardInterrupt:
        print("** Killing crawler")
    else:
        print("** Times up, ending crawl.")
    finally:
        crawler.kill()

    crawler.wait_for_child()

    print('** Crawler finished')
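
The Crawler here exposes its own start/kill/wait_for_child API; the same time-limit pattern can be sketched with the standard library alone, using a threading.Event as the stop signal (the worker loop is a stand-in, not the project's crawler):

import threading
import time

stop = threading.Event()

def worker():
    # Stand-in for Crawler.start(): poll the stop flag instead of crawling
    while not stop.is_set():
        time.sleep(1)

thread = threading.Thread(target=worker, daemon=True)
thread.start()
try:
    time.sleep(5)              # shortened TIME_LIMIT for the sketch
except KeyboardInterrupt:
    print("** Killing crawler")
finally:
    stop.set()                 # plays the role of crawler.kill()
thread.join()                  # plays the role of crawler.wait_for_child()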
Example #18
 def __init__(self, sources: list, targets: list):
     self.crawler = Crawler(sources=sources, targets=targets)
     self.graph = Digraph(strict=True, engine='circo')
     self.graph.graph_attr['overlap'] = 'false'
Example #19
File: sqliv.py Project: Dev-MHM/SQLvi
from urlparse import urlparse

from src import std
from src import scanner
from src import reverseip
from src import serverinfo
from src.web import search
from src.crawler import Crawler

# search engine instance
bing = search.Bing()
google = search.Google()
yahoo = search.Yahoo()

# crawler instance
crawler = Crawler()


def singlescan(url):
    """instance to scan single targeted domain"""

    if urlparse(url).query != '':
        result = scanner.scan([url])
        if result != []:
            # scanner.scan already prints when a target is vulnerable,
            # so just return the result
            return result

        else:
            print ""  # move carriage return to newline
            std.stdout("no SQL injection vulnerability found")
Example #20
def scrape_yard():
    crawler = AsynchronousCrawler() if settings.isASynchronous else Crawler()
    crawler.start()

    scraper = MultiprocessScraper() if settings.isMultiprocess else Scraper()
    scraper.start()