import argparse

import youtube_dl
import eyed3 as metadata  # assumption: `metadata` is eyed3, whose load()/tag API matches the calls below


def main():
    parser = argparse.ArgumentParser(
        description="Help with the process of putting a song in Apple Music.")
    parser.add_argument(
        "youtube",
        help="The YouTube link to the song's video; must be surrounded in quotes")
    parser.add_argument("--title", nargs="+", default=None,
                        help="The title of the music")
    parser.add_argument("--artist", nargs="+", default=None,
                        help="The artist of the music")
    parser.add_argument("--thumbnail", action="store_true",
                        help="Whether the music should include art")
    parser.add_argument("--geckodriver", default=None,
                        help="Path to geckodriver; must be surrounded in quotes")
    arguments = parser.parse_args()

    # nargs="+" yields lists; join them back into strings, guarding against
    # the None default (the original joined unconditionally and would crash
    # when --title or --artist was omitted).
    if arguments.title:
        arguments.title = " ".join(arguments.title)
    if arguments.artist:
        arguments.artist = " ".join(arguments.artist)
    name = f"{arguments.artist} - {arguments.title}"

    # Download the best available audio and re-encode it as a 192 kbps MP3.
    options = {
        "format": "bestaudio/best",
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "mp3",
            "preferredquality": "192",
        }],
        "outtmpl": f"{name}.%(ext)s",
    }
    with youtube_dl.YoutubeDL(options) as youtube:
        youtube.download([arguments.youtube])

    # Tag the downloaded file with the supplied metadata.
    music = metadata.load(f"{name}.mp3")
    if arguments.title:
        music.tag.title = arguments.title
    if arguments.artist:
        music.tag.artist = arguments.artist
    if arguments.thumbnail and arguments.geckodriver:
        from scraper import Scraper
        scraper = Scraper(arguments.geckodriver, music)
        scraper.begin(name=name)
    music.tag.save()
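# A minimal usage sketch; the invocation below is illustrative and assumes
# this file is saved as download.py with youtube_dl and eyed3 installed:
#
#   python download.py "https://www.youtube.com/watch?v=..." \
#       --title Some Title --artist Some Artist \
#       --thumbnail --geckodriver "/path/to/geckodriver"

if __name__ == "__main__":
    main()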
# Assumed project-local imports; the module names are a guess based on the
# class names used below.
from graph import Graph
from scraper import Scraper

graph = Graph()
while True:  # fixed: `running` was never defined; only command 0 exits
    print "Commands:"
    print "0: quit"
    print "1 <Actor/Movie to parse>"
    print "2 <graph JSON path to load>"
    print "3 <command to execute>"
    command = raw_input("Command: ")  # renamed from `input`, which shadows the builtin
    print command
    if not command:
        continue
    if command[0] == '0':
        break
    if command[0] == '1':
        # Scrape starting from the given Wikipedia page, following up to
        # 50 links at one request per second.
        test = Scraper('https://en.wikipedia.org/wiki/' + command[2:], 50)
        test.set_speed(1)
        graph = test.begin()
    if command[0] == '2':
        graph.open_json(command[2:])
    if command[0] == '3':
        # Evaluate the rest of the line and print the result.
        exec("print " + command[2:])

'''
test_two = Scraper('https://en.wikipedia.org/wiki/Ryan_Reynolds', 30)
test_two.set_speed(1)
print str(Scraper.get_oldest_actors(graph, 5))
print str(Scraper.get_movies(graph, 2009))
print str(Scraper.get_actors(graph, 2009))
'''
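# Example session, for reference (command digit, a space, then the argument;
# the page and file names are hypothetical):
#
#   Command: 1 Ryan_Reynolds               -> scrape en.wikipedia.org/wiki/Ryan_Reynolds
#   Command: 2 saved_graph.json            -> load a previously saved graph
#   Command: 3 Scraper.get_actors(graph, 2009)  -> evaluate and print the expression
#   Command: 0                             -> quit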
import threading
import uuid
from time import strftime

import pika
import simplejson

from scraper import Scraper  # assumed project-local import


class ScraperWrapper(threading.Thread):

    def __init__(self, address='localhost', exchange='barkingowl', DEBUG=False):
        threading.Thread.__init__(self)
        self.uid = str(uuid.uuid4())
        self.address = address
        self.exchange = exchange
        self.DEBUG = DEBUG
        self.interval = 1

        # create scraper instance
        self.scraper = Scraper(uid=self.uid, DEBUG=DEBUG)
        self.scraping = False

        # setup message bus: one connection for publishing responses ...
        self.respcon = pika.BlockingConnection(pika.ConnectionParameters(
            host=self.address))
        self.respchan = self.respcon.channel()
        self.respchan.exchange_declare(exchange=self.exchange, type='fanout')

        # ... and one for consuming requests from an exclusive queue
        self.reqcon = pika.BlockingConnection(pika.ConnectionParameters(host=address))
        self.reqchan = self.reqcon.channel()
        self.reqchan.exchange_declare(exchange=exchange, type='fanout')
        result = self.reqchan.queue_declare(exclusive=True)
        queue_name = result.method.queue
        self.reqchan.queue_bind(exchange=exchange, queue=queue_name)
        self.reqchan.basic_consume(self.reqcallback, queue=queue_name, no_ack=True)

        if self.DEBUG:
            print "Scraper Wrapper INIT complete."

    def run(self):
        # set up callbacks
        self.scraper.setFinishedCallback(self.scraperFinishedCallback)
        self.scraper.setStartedCallback(self.scraperStartedCallback)
        self.scraper.setBroadcastDocCallback(self.scraperBroadcastDocCallback)

        # broadcast availability, then block consuming requests
        self.broadcastavailable()
        self.reqchan.start_consuming()

    def stop(self):
        self.scraper.stop()
        self.reqchan.stop_consuming()

    def broadcastavailable(self):
        if self.scraper.status['busy'] == True:
            # we are currently scraping, so we are not available - don't broadcast
            return
        isodatetime = strftime("%Y-%m-%d %H:%M:%S")
        packet = {
            'availabledatetime': str(isodatetime)
        }
        payload = {
            'command': 'scraper_available',
            'sourceid': self.uid,
            'destinationid': 'broadcast',
            'message': packet
        }
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)

        #
        # TODO: move this over to its own timer; no need to do it here.
        #
        if self.scraper.stopped():
            raise Exception("Scraper Wrapper Exiting")
        else:
            # re-arm the availability broadcast
            threading.Timer(self.interval, self.broadcastavailable).start()

    def broadcaststatus(self):
        isodatetime = strftime("%Y-%m-%d %H:%M:%S")
        packet = {
            'status': self.scraper.status,
            'urldata': self.scraper.status['urldata'],  # fixed: was self.status['urldata']
            'statusdatetime': str(isodatetime)
        }
        payload = {
            'command': 'scraper_status',
            'sourceid': self.uid,
            'destinationid': 'broadcast',
            'message': packet
        }
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)

    def broadcastsimplestatus(self):
        isodatetime = strftime("%Y-%m-%d %H:%M:%S")

        if self.scraper.status['urldata'] == {}:
            targeturl = 'null'
        else:
            targeturl = self.scraper.status['urldata']['targeturl']

        packet = {
            'busy': self.scraper.status['busy'],
            'linkcount': self.scraper.status['linkcount'],
            'processedlinkcount': len(self.scraper.status['processed']),
            'badlinkcount': len(self.scraper.status['badlinks']),
            'targeturl': targeturl,
            'statusdatetime': str(isodatetime)
        }
        payload = {
            'command': 'scraper_status_simple',
            'sourceid': self.uid,
            'destinationid': 'broadcast',
            'message': packet
        }
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)

    def scraperFinishedCallback(self, payload):
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)

    def scraperStartedCallback(self, payload):
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)

    def scraperBroadcastDocCallback(self, payload):
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)

    # message handler
    def reqcallback(self, ch, method, properties, body):
        try:
            response = simplejson.loads(body)
            if self.DEBUG:
                print "Processing Message:\n\t{0}".format(response['command'])

            if response['command'] == 'url_dispatch':
                if response['destinationid'] == self.uid:
                    if self.scraping == False:
                        self.scraper.seturldata(response['message'])
                        # launch the scraper thread once, then begin each dispatch
                        if self.scraper.started == False:
                            self.scraper.start()
                        self.scraper.begin()
                        self.scraping = True
            elif response['command'] == 'scraper_finished':
                if response['sourceid'] == self.scraper.uid:
                    self.scraping = False
            elif response['command'] == 'get_status':
                self.broadcaststatus()
            elif response['command'] == 'get_status_simple':
                self.broadcastsimplestatus()
            elif response['command'] == 'shutdown':
                if response['destinationid'] == self.uid:
                    print "[{0}] Shutdown Received".format(self.uid)
                    self.stop()
            elif response['command'] == 'global_shutdown':
                print "Global Shutdown Received"
                self.stop()
        except Exception:
            if self.DEBUG:
                print "Message Error"
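# A minimal dispatcher sketch showing the message format that reqcallback()
# above expects. Assumptions: RabbitMQ on localhost, the same 'barkingowl'
# fanout exchange, and a wrapper uid learned from a prior 'scraper_available'
# broadcast; dispatch_url() and the 'dispatcher' source id are hypothetical
# names used only for illustration.

def dispatch_url(wrapper_uid, targeturl, address='localhost', exchange='barkingowl'):
    connection = pika.BlockingConnection(pika.ConnectionParameters(host=address))
    channel = connection.channel()
    channel.exchange_declare(exchange=exchange, type='fanout')
    payload = {
        'command': 'url_dispatch',
        'sourceid': 'dispatcher',      # hypothetical sender id
        'destinationid': wrapper_uid,  # uid of the target ScraperWrapper
        'message': {
            'targeturl': targeturl,    # consumed by Scraper.seturldata()
        },
    }
    channel.basic_publish(exchange=exchange, routing_key='',
                          body=simplejson.dumps(payload))
    connection.close()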