Example #1
def scrape(url: str,
           staleOnly: bool = False,
           fallback: bool = False) -> sc.ScrapeMeta:
    if sc._staleOnly:
        util.logMessage(f'skitter.scrape: HERMES_STALE only {url}')
        return sc.scrape(url)

    if staleOnly:
        util.logMessage(f'skitter.scrape: staleOnly {url}')
        for c in reversed(priv.skitterClients):
            ce = c.cache(url)
            if ce is not None:
                return ce
        raise Exception(f'skitter.scrape: unable to staleOnly scrape: {url}')

    for c in priv.skitterClients:
        try:
            #util.logMessage(f'skitter.scrape: calling {c.ident}.scrape({url})')
            r = c.scrape(url)
            return r
        except Exception as e:
            util.logMessage(f'skitter.scrape: {c.ident}.scrape failed: {e}')
            pass

    if fallback:
        return sc.scrape(url)
    raise Exception(f'skitter.scrape: unable to scrape: {url}')
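A hedged usage sketch for this wrapper (the URL is illustrative; behaviour depends on how priv.skitterClients and the fallback sc module are configured):

# illustrative only: the URL and error handling are assumptions, not from the source
meta = scrape('https://example.com/story/123', fallback=True)

# ask for a cached copy only; the wrapper raises if no client has one cached
try:
    cached = scrape('https://example.com/story/123', staleOnly=True)
except Exception as e:
    print(f'no cached copy available: {e}')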
Example #2
def scrape_ec2():
    ec2_file = 'AWSinstances.json'
    try:
        scrape(ec2_file)
    except Exception as e:
        print "ERROR: Unable to scrape data: %s" % e
        print traceback.print_exc()
Example #3
def main():
    """Code for scraper.

    Actual scraping is turned off because the server doesn't like being pinged.

    """
    # url = "https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major"
    # # Use to make URL attribute of scholarship object usable
    # appendable_url = "https://www.scholarships.com"

    # # Setup output file
    # scan_time = date.today()
    # filename = 'scan_' + str(scan_time) + '.csv'
    # with open(filename, 'w', encoding='utf-8-sig') as f:
    #     w = csv.DictWriter(f, ['name', 'url', 'amount', 'deadline', 'description'])
    #     w.writeheader()

    # # get response
    # response = get_response(url)

    # soup = BeautifulSoup(response.content, 'html5lib')
    # url_table = soup.find(id="ullist")
    # url_list = url_table.find_all('a')
    # for link in url_list:
    #     get_scholarshipscom_details(link.get('href'), appendable_url, filename)

    #     # Wait 1 second between requests
    #     sleep(1)


    print("Pushing file into the database.", flush=True)
    scrape(environ['MYSQL_USER'], environ['MYSQL_PASSWORD'], "db", environ['MYSQL_DB_NAME'])

    print("done")
Example #4
 def test_ScrapedUser(self):
     scrape.scrape(self.username)
     output_text = io.StringIO()
     sys.stdout = output_text
     scrape.scrape(self.username)
     sys.stdout = sys.__stdout__
     self.assertEqual(output_text.getvalue(), 'My name is Kanish and my current city is Roorkee\n')
Example #5
 def export(self):
     urlName = self.varURLName.get()
     className = self.varClassName.get()
     csvName = self.varCSVName.get()
     tagType = self.varTAGType.get()
     scrape.WriteCSV(csvName)
     scrape.scrape(className, urlName, csvName, tagType)
Example #6
    def main(self):
        print(red + '''
  _____                     _  __    _             
 |  __ \\                   | |/ /   | |            
 | |__) | __ _____  ___   _| ' / ___| | _____ _ __ 
 |  ___/ '__/ _ \\ \\/ / | | |  < / _ \\ |/ / _ \\ '__|
 | |   | | | (_) >  <| |_| | . \\  __/   <  __/ |   
 |_|   |_|  \\___/_/\\_\\\\__, |_|\\_\\___|_|\\_\\___|_|   
                       __/ |                       
                      |___/                        \n''')
        print(blue + 'by Nexolyte\n')
        m = get('Main Menu\n' +\
                red + '[' + blue + '1' + red + '] - ' + white + 'Scrape\n' +\
                red + '[' + blue + '2' + red + '] - ' + white + 'Check\n' +\
                red + '[' + blue + 'e' + red + '] - ' + white + 'Exit\n')

        if m == '1':
            os.system('cls')
            scrape.scrape()
        elif m == '2':
            os.system('cls')
            check.check()
        elif m == 'e':
            os.system('cls')
            sys.exit(1)
        else:
            os.system('cls')
            error('Input not recognised. Please retype and try again.')
            self.main()
Example #7
    def main(self):
        print(red + """
    dBBBBBb dBBBBBb    dBBBBP`Bb  .BP dBP dBP dBBBBBb    dBBBP dBBBBBb   dBBBBBb  dBBBP dBBBBBb
       dB'     dBP   dBP.BP     .BP     dBP      dBP               BB       dB'            dBP
  dBBBP'  dBBBBK   dBP.BP    dBBK     dBP   dBBBBK'  dBBP     dBP BB   dBBBP' dBBP    dBBBBK
 dBP     dBP  BB  dBP.BP    dB'      dBP   dBP  BB  dBP      dBP  BB  dBP    dBP     dBP  BB
dBP     dBP  dB' dBBBBP    dB' dBP  dBP   dBP  dB' dBBBBP   dBBBBBBB dBP    dBBBBP  dBP  dB' v1.1"""
              )
        print(blue + 'by Xenex\n')
        m = get('Main Menu\n' +\
                red + '[' + blue + '1' + red + '] - ' + white + 'Scrape\n' +\
                red + '[' + blue + '2' + red + '] - ' + white + 'Check\n' +\
                red + '[' + blue + '3' + red + '] - ' + white + 'Exit\n')

        if m == '1':
            os.system('cls')
            scrape.scrape()
        elif m == '2':
            os.system('cls')
            check.check()
        elif m == '3':
            os.system('cls')
            sys.exit(1)
        else:
            os.system('cls')
            error('Input not recognised. Please retype and try again.')
            self.main()
Example #8
def main():
    """Shows basic usage of the Sheets API.
    Prints values from a sample spreadsheet.
    
    """

    # Authorization of google account
    creds = None

    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)

        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    # Scraping method starts here
    scrape.scrape(creds)
Example #9
def get_table():
    username = input("Username: "******"twu_website.html")
    make_table(selected_term)
Example #10
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = 'www/instances.json'
    try:
        scrape(data_file)
    except Exception as e:
        print("ERROR: Unable to scrape site data: %s" % e)
Example #11
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = 'www/instances.json'
    try:
        scrape(data_file)
    except Exception as e:
        print("ERROR: Unable to scrape site data: %s" % e)
Example #12
def scrape_ec2(c):
    """Scrape EC2 data from AWS and save to local file"""
    ec2_file = "www/instances.json"
    try:
        scrape(ec2_file)
    except Exception as e:
        print("ERROR: Unable to scrape EC2 data")
        traceback.print_exc()
Example #13
def scrape_ec2():
    """Scrape EC2 data from AWS and save to local file"""
    ec2_file = 'www/instances.json'
    try:
        scrape(ec2_file)
    except Exception as e:
        print "ERROR: Unable to scrape data: %s" % e
        print traceback.print_exc()
Example #14
def scrape_ec2():
    """Scrape EC2 data from AWS and save to local file"""
    ec2_file = 'www/instances.json'
    try:
        scrape(ec2_file)
    except Exception as e:
        print "ERROR: Unable to scrape data: %s" % e
        print traceback.print_exc()
Example #15
def scrape_go():
    print(
        f"---beginning scrape at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
    )
    for group in groups:
        scrape.scrape(group)
    print(
        f"---finished  scrape at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
    )
Example #16
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = 'www/instances.json'
    try:
        scrape(data_file)
    except Exception as e:
        print "ERROR: Unable to scrape site data: %s" % e
        print traceback.print_exc()
    render_html()
Example #17
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = 'www/instances.json'
    try:
        scrape(data_file)
    except Exception as e:
        print "ERROR: Unable to scrape site data: %s" % e
        print traceback.print_exc()
    render_html()
Example #18
def run():
    recreate_schema()
    scrape.scrape()
    scheduler.start()
    try:
        asyncio.get_event_loop().run_forever()
    except (KeyboardInterrupt, SystemExit):
        print("Shutting down. Please wait...")
        scheduler.shutdown(wait=True)
        exit(0)
Example #19
def main():
    from scrape import scrape
    from do_etl import do_etl

    try:
        scrape()
    except ValueError as e:
        print(e)
    finally:
        do_etl(inital_load=False)
Example #20
def upload_file():
    file = request.files['image']
    f = os.path.join(app.config['UPLOAD_FOLDER'], 'img.jpg')#file.filename)
    
    # add your custom code to check that the uploaded file is a valid image and not a malicious file (out-of-scope for this post)
    file.save(f)
    print('file uploaded successfully')
    text = recognize_text()
    print(text)
    print(scrape(text))
    return scrape(recognize_text())[0]
Example #21
 def fetch(self):
     out = scrape.scrape({
         "url": self.url,
         "css": 'div[data-tts="answers"]',
         "text": True
     })
     return out.strip()
Example #22
 def fetch(self):
     out = scrape.scrape({
         "url": self.url,
         "css": 'div.startupLogos a',
         "print_url": True,
     })
     return out
Example #23
def scraper():
    mars_info = scrape.scrape()

    #listings = mongo.db.listings
    #listings.update({}, listings_result, upsert = True)

    return redirect("/", code=302)
Example #24
def run():
    data = scrape()
    grouped_data = group(data)
    analysis(grouped_data)
    statistics(grouped_data)
    # define more tasks here
    print("Run Completed Successfully")
Example #25
def web_scrape():
    db.collection.remove({})
    mars_data = scrape.scrape()
    #print('----after getting data---')
    #print(mars_data)
    db.collection.insert_one(mars_data)
    return redirect("http://localhost:5000/", code=302)
Example #26
	def getCurrentInfo(self, fic: Fic) -> Fic:
		fic.url = self.baseUrl + str(fic.localId)
		url = fic.url.split('?')[0] + '?view_adult=true'
		# scrape fresh info
		data = scrape.scrape(url)

		return self.parseInfoInto(fic, data['raw'])
Example #27
def hello_world():
    # localhost:8000/
    file_url = scrape()
    file = requests.get(file_url).content
    path = basename('img.png')
    with open(path, "wb") as f:
        f.write(file)
    return send_file(path, mimetype='image/png')
Example #28
def new(username, include):
    try:
        if not tweets.find_one({"username": username}):
            tweets.insert_one({
                "username": username,
                "tweets": scrape(username)
            })
        obj = tweets.find_one({"username": username})

        model, rmodel = train([e['full_text'] for e in obj["tweets"]])
        if include:
            text = generate_with(model, rmodel, include)
        else:
            text = generate(model)
        return json.dumps({
            'success': True,
            'message': ' '.join(text),
            'name': obj['tweets'][0]['user']['name'],
            'avatar': obj['tweets'][0]['user']['profile_image_url_https']
        })
    except Exception:
        return json.dumps({
            'success': False,
            'message': 'Oops! An error occurred.',
        })
Example #29
 def post(self):
     year = int(self.request.get('year', 1990))
     term = int(self.request.get('term', 92))
     template = open('scrape_yearterm.html').read()
     try:
         scrape([year], [term])
         self.response.write('YearTerm {:04d}-{:02d} has successfully been added to the database.<br><br>'.format(year, term))
         year_value = str(year + 1 if term == 92 else year) # iterate year
         input_index = str([0, 92, 03, 14, 25, 39, 76].index(term)) # iterate term
     except Exception as e:
         traceback.print_exc() # only visible in terminal
         self.response.write('ERROR: {}<br><br>'.format(e))
         year_value = str(year) # preserve year
         input_index = str([0, 03, 14, 25, 39, 76, 92].index(term)) # preserve term
     content = template.replace('{YEAR_VALUE}', year_value).replace('{INPUT_INDEX}', input_index)
     self.response.write(content)
Example #30
def scraper():
    # Run the scrape function we made to pull all the data from the sources
    mars_data = scrape.scrape()
    # Update the collection with new data
    db.db.collection.update({}, mars_data, upsert=True)

    return redirect("/")
Example #31
	def getCurrentInfo(self, fic: Fic) -> Fic:
		url = self.constructUrl(fic.localId)
		# scrape fresh info
		data = scrape.scrape(url)

		edumpContent('<!-- {} -->\n{}'.format(url, data['raw']), 'sugarquill_ec')
		return self.parseInfoInto(fic, data['raw'])
Example #32
def get_scrape():
    mars_data = knife.scrape()

    mars_db = mongo.db.mars
    mars_db.update({}, mars_data, upsert=True)

    return redirect("http://localhost:5000/", code=302)
Example #33
def prediction(stock):
    if request.method == 'POST':
        # form = request.form
        stock = request.form['ticker']
        req = request
        print(req.form)
        ticker = request.form['ticker']
        ma1 = int(request.form['ma1'])
        ma2 = int(request.form['ma2'])
        from_date = request.form['from_date']
        to_date = request.form['to_date']
        crossover = ''

        # Parameters can now be passed through for calculations
        results = forecast(ma1,ma2,ticker,from_date,to_date)

        data = scrape(ticker)
        print(data.keys())
        cap = data['cap']
        price = data['price']
        day = data['day']
        week = data['week']
        month = data['month']
        quarter = data['quarter']
        headlines = data['headlines']
        trend = results['trend']
        value=Markup(results['html'])

        # img = f'predict.png'


        return render_template("dynamicForecast.html",from_date=from_date,to_date=to_date,ma1=ma1,ma2=ma2,ticker=ticker,crossover=crossover,trend=trend,cap=cap,price=price,day=day,week=week,month=month,quarter=quarter,value=value,headlines=headlines)   
        # return render_template('dynamicForecast.html',stock=stock)
    else:
        return render_template('dynamicForecast.html')
Example #34
 def get(self):
     titleLinkAssoc = scrape.scrape("http://www.metafilter.com/", "div.posttitle > a")
     formattedLinks = [
         "http://www.metafilter.com" + v for k, v in titleLinkAssoc.items()
     ]  # metafilter hosts their own content so you need to add http://www.metafilter to each link
     titles = [k for k, v in titleLinkAssoc.items()]
     formattedLinkAssoc = dict(zip(titles, formattedLinks))
     self.render("scraped.html", titleLinks=formattedLinkAssoc, site="MetaFilter")
Example #35
def analyze():
    url=request.form['url']
    text = scrape.scrape(url)
    # get end of url for naming pics
    i = url.rfind('/') + 1
    url = url[i:]
    polarity_url = analysis.get_sentiment_analysis(text, url)
    wordmap_url = analysis.get_wordmap(text, url)

    return render_template("analysis.html", polarity=polarity_url, wordmap=wordmap_url)
Example #36
def crawl(review_store, page=0):
    page = urllib2.urlopen("http://pitchfork.com/reviews/albums/" + ("" if page==0 else (str(page) + "/")))
    soup = BeautifulSoup(page)
    main_grid = soup.find("ul", {"class" : "object-grid"})
    
    for a_child in main_grid.findAll("a"): 
        shelve_key = a_child['href'].encode('ASCII', 'ignore').split('/')[-2]
        print(shelve_key)
        if not review_store.has_key(shelve_key):
            print("key not in store")
            review_store[shelve_key] = scrape("http://pitchfork.com"+a_child['href'])
Example #37
def scrape_controller(domain):

    base_url = "http://www.hm.com/us/products/search?"

    # get the input file and the last scraping position from the bucket
    inp_file, pos = get_inp_line()
    mark = 0

    # move the reader to the saved position
    while mark < pos:
        mark += 1

    # fetch the next line from the reader
    inp_line = get_next_line(mark)
    # inp_line = "categories=men&term=gingham shirts|type=Gingham,source=H&M"

    while inp_line != "EOF":

        search_q, db_entry = inp_line.split('|')

        # sort by new arrivals
        if not check_for_domain("Orders=newfrom_desc", search_q):
            search_q = search_q + "&" + "Orders=newfrom_desc"

        # in case none of the domains is present, prepend the given one
        if (not check_for_domain("categories=men", search_q)
                and not check_for_domain("categories=ladies", search_q)
                and not check_for_domain("categories=kids", search_q)
                and not check_for_domain("categories=sale", search_q)):
            search_q = domain + "&" + search_q
        # in case the given domain is already present
        elif check_for_domain(domain, search_q):
            print(" --")
        # in case of a different domain, skip this line
        else:
            continue

        q_url = base_url + search_q
        print(q_url)
        try:
            my_items = scrape(q_url)
            # save_to_db(my_items)
            # for item in my_items:
            #     print(item)
        except Exception:
            print("unable to scrape from " + q_url)

        # get the next input line from the input file
        inp_line = get_next_line(mark + 1)
        # update the position of the reader in SQS
        update_reader_pos(mark + 1)
        mark = mark + 1
Example #38
def get_communities(url, counties):
    """Returns a dict {id: name}"""
    communities = {}

    # could also be done with incomprehensible dict comprehension
    for county in counties:
        soup = scrape(url, county)
        pat = "{}\d+".format(county)
        options = filter_tags(soup, 'option', pat)
        communities.update(options)

    return communities
Example #39
def seatcheck(medium,username):
	seats = scrape(db.getTemp(username))
	if seats > 0:
		messenger.message(medium,username,"Good news! Your class has " + str(seats) + " open seats, so you can go sign up now! If you have the ID of another course that's closed that you'd like to track, let me know!")
		db.changeState(username,1)

	elif seats > -1000:
		messenger.message(medium,username,"You're all set. I'll monitor your course and message you here if a seat in your class opens up.")
		messenger.message(medium,username,"Anything else I can help you with? You can say 'commands' for a list of commands I understand.")
		temp = db.getTemp(username)
		db.addJob(username,temp)
		db.changeState(username,0)
	else:
		messenger.message(medium,username,"Couldn't figure out how many seats open. Is classfind down?")
		db.changeState(username,2)
Example #40
def add_package(package):
    cur = get_db().cursor()
    cur.execute("insert into game (package) values (?)", (package,))
    get_db().commit()
    id = cur.lastrowid
    res = scrape(id, package)
    name = res[0]
    desc = res[1]
    rating = float(res[2])
    activity = res[4]
    category = res[5]
    cur.execute("update game set name='%s',description='%s',rating=%f,activity='%s',category='%s' where id=%d" % (name, desc, rating, activity, category, int(id)))
    get_db().commit()
    cur.connection.close()
    return json.dumps(id)
Example #41
 def _scrapes(self, include_sub, include_dir, expose=True, alert_when_done=True):
     try:
         count = 0
         for x in scrape.scrape(self.settings, include_sub=include_sub, include_dir=include_dir):
             if isinstance(x, int):
                 count += x
                 continue
             if expose:
                 reveal(x)
     except requests.ConnectionError:
         tkMessageBox.askokcancel("Connection Error",
                                  "Could not connect to Reddit. Check your internet settings, "
                                  "and make sure Reddit isn't down.")
     else:
         tkMessageBox.askokcancel("", "Scrape Complete! %d files downloaded." % count)
Example #42
 def post(self):
     blob = self.get_argument("blob", None)
     url = self.get_argument("url", None)
     self.write("<style>i{color:#aaa;}</style>")
     if blob:
         for line in blob.split("\n"):
             line = line.strip()
             if len(line) > 5:
                 self.write(line+"<br>")
                 self.write("<i>%s, %s</i><br><br>" % classifier.classify(line))
     elif url:
         page_data = scrape(url)
         self.write("<p><b>source:</b> "+page_data["source"]+"</p>")
         self.write("<p><b>title:</b> "+page_data["title"]+"</p>")
         self.write("<p><b>url:</b> "+page_data["url"]+"</p>")
         self.write("<p><b>description:</b> "+page_data["description"]+"</p>")
         self.write("<hr>")
         for line in page_data["body"]:
             if len(line) > 5: # TODO drop any line < 2 words; NER for time
                 score = classifier.classify(line)
                 if score[1] > 0.5:
                     self.write(line+"<br>")
                     self.write("<i>%s, %s</i><br><br>" % score)
Example #43
def autograder(url):
    '''Accepts the URL for a recipe, and returns a dictionary of the
    parsed results in the correct format. See project sheet for
    details on correct format.'''
    ingredient_strings, step_strings = scrape.scrape(url)
# The ingredient template is 
#   name, quantity, measurement, descriptor, preparation, pre-preparation
    fin_ingredients = []
    for ingredient in ingredient_strings:
        name = unicode(parser.recognize_ingredient(ingredient))
        number = parser.recognize_number(ingredient)
        unit = parser.recognize_unit(ingredient)
        descriptors = [unicode(i) for i in
                parser.recognize_descriptors(ingredient)]
        fin_ingredients.append({"name": name, "quantity": number, "measurement":
            [unicode(unit)], "descriptor": descriptors})

    primary_method = None
    methods = set()
    for method in COOKING_METHODS.keys()[::-1]:
        for variation in COOKING_METHODS[method]:
            for step in step_strings:
                if variation in step:
                    methods.add(unicode(method))
                    primary_method = unicode(method)

    cookware_set = set()
    for cookware in COOKWARE:
        for variation in COOKWARE[cookware]:
            for step in step_strings:
                if variation in step:
                    cookware_set.add(unicode(cookware))

    return {"ingredients": list(fin_ingredients), "cooking methods": list(methods),
            "primary cooking method": primary_method, "cooking tools":
            list(cookware_set)}
Example #44
__author__ = 'rylan'

from scrape import scrape
from compare import compare
from printToHTML import printToHTML

# sourceOne = 'https://news.google.com/'
# sourceTwo = 'https://news.yahoo.com/'
sourceOne = raw_input('Please enter first newsource: ')
sourceTwo = raw_input('Please enter second newsource: ')

articleTitles = scrape(sourceOne, sourceTwo)
pairings = compare(articleTitles[0], articleTitles[1])
printToHTML(pairings)
Example #45
def get_counties(url):
    """Returns a dict {id: name}"""
    soup = scrape(url)
    pat = '^\d{2}$'
    return filter_tags(soup, 'option', pat)
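Together with Example #38, a hedged sketch of how these two helpers might be chained (the URL is an assumption; filter_tags and scrape come from the same module):

# illustrative chaining; the URL is an assumption
url = 'http://example.gov/election-results'
counties = get_counties(url)                  # {county_id: name}
communities = get_communities(url, counties)  # {community_id: name}
print(len(counties), 'counties,', len(communities), 'communities')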
Example #46
from scrape import getinfo,scrape,statusUpdate
from dbaccess import AuthDatabase
from interface import Interface
import datetime
import json

config = json.loads(open("/root/SBUCourseMonitor/config.json").read())
messenger = Interface(config)

db = AuthDatabase(config["database_addr"])

for job in db.getJobs():
	seats = scrape(job[2])
	if seats > 0:
		user = db.getUserByID(job[1])[0]
		info = getinfo(str(job[2]))
		messenger.message(user[1],user[2],"Knock, knock! You course " + info + " now has " + str(seats) + " open seats. Go sign up!")
		print datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') + "\t" + user[2] + "\t" + info
		db.deleteJob(job[0])
Example #47
 def get(self):
     titleLinkAssoc = scrape.scrape("http://www.digg.com/", "h2.story-title > a")
     self.render("scraped.html", titleLinks=titleLinkAssoc, site="Digg")
Example #48
 def get(self):
     titleLinkAssoc = scrape.scrape("http://www.slashdot.org/", "h2 > span > a")
     self.render("scraped.html", titleLinks=titleLinkAssoc, site="Slash Dot")
Example #49
import proxy
from scrape import scrape
from bs4 import BeautifulSoup
import urllib.request as req
url = "http://intranet.iitg.ernet.in/"
scrape(url)
print("done")
Example #50
 def get(self):
     titleLinkAssoc = scrape.scrape("http://www.techcrunch.com/", "h2.headline > a")
     self.render("scraped.html", titleLinks=titleLinkAssoc, site="Tech Crunch")
Example #51
def do_scrape():
    return Response(i_league_scraper.scrape(), mimetype="text/plain")
Example #52
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = "www/instances.json"
    scrape(data_file)
    render(data_file, "in/index.html.mako", "www/index.html")
Example #53
def parse_results(message, db_collection):
    """
    Function to parse the links drawn from an RSS feed.

    Parameters
    ----------

    message: pattern.web.Results.
                Object containing data on the parsed RSS feed. Each item
                represents a unique entry in the RSS feed and contains
                relevant information such as the URL and title of the
                story.

    db_collection: pymongo Collection.
                        Collection within MongoDB in which results are
                        stored.
    """
    global proxies, proxy_user, proxy_pass

    if proxies:
        proxy_choice = {'http': random.choice(proxies)}
        proxy_login = requests.auth.HTTPProxyAuth(proxy_user,
                                                  proxy_pass)
    else:
        proxy_choice = ''
        proxy_login = {}

    lang = message.get('lang')
    story_url = message.get('url')
    website = message.get('website')
    title = message.get('title')
    date = message.get('date')
    if lang == 'english':
        goose_extractor = Goose({'use_meta_language': False,
                                 'target_language': 'en',
                                 'enable_image_fetching': False})
    elif lang == 'arabic':
        from goose.text import StopWordsArabic
        goose_extractor = Goose({'stopwords_class': StopWordsArabic,
                                 'enable_image_fetching': False})
    else:
        print(lang)

    if 'bnn_' in website:
        # story_url gets clobbered here because it's being replaced by
        # the URL extracted from the bnn content.
        #TODO: Deprecate this for now since using GhostJS is weird.
        logging.info('A BNN story.')
#        text, meta, story_url = scrape.bnn_scrape(story_url, goose_extractor)
        text = ''
        pass
    else:
        text, meta = scrape.scrape(story_url, goose_extractor, proxy_choice,
                                   proxy_login)
    text = text.encode('utf-8')

    if text:
        cleaned_text = _clean_text(text, website)

        # TODO: Figure out where the title, URL, and date should come from
        # TODO: Might want to pull title straight from the story since the RSS
        # feed is borked sometimes.
        entry_id = connectors.add_entry(db_collection, cleaned_text, title,
                                        story_url, date, website, lang)
        if entry_id:
            try:
                logging.info('Added entry from {} with id {}. {}.'.format(story_url,
                                                                          entry_id,
                                                                          datetime.datetime.now()))
            except UnicodeDecodeError:
                logging.info('Added entry from {}. Unicode error for id'.format(story_url))
    else:
        logging.warning('No text from {}'.format(story_url))
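A hedged sketch of the message mapping and collection this function expects (field values and the Mongo connection are assumptions; the module-level proxies and connectors are presumed configured):

# illustrative inputs only; values and connection details are assumptions
import pymongo

message = {
    'lang': 'english',
    'url': 'http://example.com/some-story',
    'website': 'example_site',
    'title': 'Example headline',
    'date': '2015-01-01',
}
stories = pymongo.MongoClient()['rss']['stories']
parse_results(message, stories)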
Example #54
import pickle
import scrape

scrape.scrape()
dictPickle = open('barf','rb')
outputDict = pickle.load(dictPickle)
dictPickle.close()

##dictFile = open('barf.txt','w')
##dictFile.write(outputDict)
##dictFile.close()


Example #55
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = 'www/instances.json'
    scrape(data_file)
    render(data_file, 'in/index.html.mako', 'www/index.html')
Example #56
ATTR_DICT = {   'title' : '../csvs/exid_title.csv', 
                'date'  : '../csvs/exid_date.csv' ,
                'url'   : '../csvs/exid_url.csv'   }

def create_attr_csvs(index):
    '''
    creates csvs for title, date, url 
    input:
        index: dictionary of exhibit information
    output:
        writes csv files according to ATTR_DICT
    '''
    for attr in ATTR_DICT:
        with open(ATTR_DICT[attr],'w') as f:
            line = 'ex_id|' + attr + '\n'
            f.write(line)
            for museum_id in index:
                for ex_id in index[museum_id]:
                    line = '{}|{}\n'.format(str(ex_id), \
                        index[museum_id][ex_id][attr].encode('utf-8'))
                    
                    f.write(line)   
    
if __name__ == "__main__":
    index = scrape.scrape()
    if os.path.isfile('../pickled_search_object'):
        os.remove('../pickled_search_object')
    wd = build_word_dict(index)
    create_wordct_csv(wd)
    create_attr_csvs(index)
Example #57
def main(original_recipe):
    # urls = ['http://allrecipes.com/recipe/easy-meatloaf/',
    #         'http://allrecipes.com/Recipe/Easy-Garlic-Broiled-Chicken/',
    #         'http://allrecipes.com/Recipe/Baked-Lemon-Chicken-with-Mushroom-Sauce/',
    #         'http://allrecipes.com/Recipe/Meatball-Nirvana/']
    if original_recipe.url:
        scraped_ing, scraped_steps = scrape.scrape(original_recipe.url)

        # parse ingredient info, create objects
        ingredients = []
        for ingredient in scraped_ing:
            new_ing = parse_ingredient(ingredient)
            cursor = db.ingredients.find({"name":new_ing.name})
            i = 0
            for document in cursor:
                i += 1
            if i == 0:
                # add to DB
                db.ingredients.insert({"name":new_ing.name, "category":"????", "flag":"none"})
            ingredients.append(new_ing)

        steps = []
        for step in scraped_steps:
            #SPLIT STEP CONTENTS BEFORE PARSING
            if not step:
                continue # HANDLE EMPTY
        # for new_parser
            # parsed_steps = parse_step(step)
            # for p in parsed_steps:
            #     steps.append(p)
        #for new_parser
            step_sent = nltk.sent_tokenize(step)
            for sent in step_sent:
                if contains_procedure(sent) == 1:
                    new_proc = parse_step(sent)
                    steps.append(new_proc)
                elif contains_procedure(sent) > 1:
                    actions = double_action(sent)
                    if actions:
                        for a in actions:
                            new_proc = parse_step(a)
                            steps.append(new_proc)
                        if contains_procedure(sent) == 2:
                            break
                    clause = sent.split(';')
                    for c in clause:
                        if contains_procedure(c) == 1:
                            new_proc = parse_step(c)
                            steps.append(new_proc)
                        elif contains_procedure(c) > 1:
                            more_clause = c.split(',')
                            for more_c in more_clause:
                                if contains_procedure(more_c) == 1:
                                    new_proc = parse_step(more_c)
                                    steps.append(new_proc)
                                elif contains_procedure(more_c) > 1:
                                    actions = double_action(more_c)
                                    if actions:
                                        for a in actions:
                                            new_proc = parse_step(a)
                                            steps.append(new_proc)
                                        if contains_procedure(more_c) == 2:
                                            break
                                    else:
                                        new_proc = parse_step(more_c)
                                        steps.append(new_proc)

        original_recipe.in_list = ingredients
        original_recipe.pr_list = steps

    #call transform etc
    reconstruction.reconstruct(original_recipe)
    r = original_recipe
    try:
        transformed_recipe = transform.transform(r)
    except RuntimeError:
        return [original_recipe, Recipe()]

    #if transformed_recipe == original_recipe:
    #    print "There are no changes to be made"
    #else:
    reconstruction.reconstruct(transformed_recipe)
    return [original_recipe, transformed_recipe]
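A hedged sketch of invoking this pipeline (the Recipe class and its attributes are assumptions based on how main() uses them; the URL comes from the commented list above):

# illustrative only; Recipe and its url attribute are assumptions
r = Recipe()
r.url = 'http://allrecipes.com/recipe/easy-meatloaf/'
original, transformed = main(r)
print(len(original.in_list), 'ingredients parsed')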
Example #58
def parse(message):
    if(message.content.startswith('!blood')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(message.author.name)
        print(stamp)
        print(type(message.author.name))
        print(type(stamp))
        '''print('sending hello to ' + message.author.name + ' ' + stamp)'''
        return ('Is that blood I smell? ' + stamp)
    elif(message.content.startswith('!commands')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('sending command list ' + stamp)
        return (commands)
    elif(message.content.startswith('!changelog')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('sending changelog ' + stamp)
        return (changelog)
    elif(message.content.startswith('!source')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('sending source ' + stamp)
        return (source)
    if(message.content.startswith('!beg')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('Frozen Pizza ' + message.author.name + ' ' + stamp)
        return ('Can I have that Frozen Pizza? ' + stamp)
    if(message.content.startswith('!goodboy')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('Treat ' + message.author.name + ' ' + stamp)
        return ('Can I have my Treat now? ' + stamp)
    elif(message.content.startswith('!elwiki')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        searchterm = message.content[7:].strip()
        if (len(searchterm) == 0):
            print('no argument specified')
            return ('Tell me what to look for, and I shall deliver.')
        if (searchterm.lower().find('seris') != -1):
            print('not looking for seris')
            return ('Some old mistakes should not be touched upon. Mistakes are often a scab to an old, deep wound.')
        if (badword.has_profanity(searchterm)):
            return ('You should reconsider your words if you value your life, ' + message.author.mention())
        print('looking up ' + searchterm)
        r = requests.get('http://elwiki.net/wiki/index.php?search=' + searchterm, allow_redirects=False)
        print(r.status_code)
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        if (r.status_code == 302):
            answer = r.headers['Location']
            print(answer + ' sent on ' + stamp)
            return ('Page for ' + searchterm + ' : ' + answer)
        if (r.status_code == 200):
            print('scraping')
            answer = scrape.scrape(r.text)
            if(answer is None):
                return 'I could not find a match for that.'
            else:
                return ('First match for ' + searchterm + ' : ' + answer)
    elif(message.content.startswith('!babel')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up babel on ' + stamp)
        babelfeed = feedparser.parse('http://elwiki.net/babel/?feed=rss2')
        answer = babelfeed.entries[0]['title'] + ' ' + babelfeed.entries[0]['link']
        print(answer)
        return ('Last post on Babel - ' + answer)
    elif(message.content.startswith('!na')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up na on ' + stamp)
        nafeed = feedparser.parse('http://en.elswordonline.com/feed/')
        answer = nafeed.entries[0]['title'] + ' ' + nafeed.entries[0]['link']
        print(answer)
        return ('Last NA update - ' + answer)
    elif(message.content.startswith('!uk')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up uk on ' + stamp)
        ukfeed = feedparser.parse('http://board.en.elsword.gameforge.com/index.php?page=ThreadsFeed&format=rss2&boardID=8')
        answer = ukfeed.entries[0]['title'] + ' ' + ukfeed.entries[0]['link']
        print(answer)
        return ('Last UK update - ' + answer)
    elif(message.content.startswith('!void')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void on ' + stamp)
        return scrape.scrape_void()
    elif(message.content.startswith('!events')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void events and contests on ' + stamp)
        return scrape.vevent()
    elif(message.content.startswith('!promo')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void promotions on ' + stamp)
        return scrape.vpromotions()
    elif(message.content.startswith('!general')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void general topics on ' + stamp)
        return scrape.vgeneral()
    elif(message.content.startswith('!suggest')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void suggestions on ' + stamp)
        return scrape.vsuggestions()
    elif(message.content.startswith('!intro')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void Intro/Farewells on ' + stamp)
        return scrape.vintro()
    elif(message.content.startswith('!guild')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void guild topics on ' + stamp)
        return scrape.vguilds()
    elif(message.content.startswith('!shots')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('sending shots fired ' + stamp)
        return ('Hmm. It appears as if shots have been fired.')
    elif(message.content.startswith('!ibset')):
        searchterm = message.content[6:].strip()
        return elsgear.lookup(searchterm)
    elif(message.content.startswith('!google')):
        searchterm = message.content[7:].strip()
        if(len(searchterm) == 0):
            return ('Tell me what to look for, and I shall deliver.')
        if (badword.has_profanity(searchterm)):
            return ('You should reconsider your words if you value your life, ' + message.author.mention())
        return ('https://www.google.com/search?q=' + urllib.parse.quote_plus(searchterm))
    elif(message.content.startswith('!gimg')):
        searchterm = message.content[5:].strip()
        if(len(searchterm) == 0):
            return ('Tell me what to look for, and I shall deliver.')
        if (badword.has_profanity(searchterm)):
            return ('You should reconsider your words if you value your life, ' + message.author.mention())
        return ('https://www.google.com/search?q=' + urllib.parse.quote_plus(searchterm) + '&tbm=isch')
    elif(message.content.startswith('!youtube')):
        searchterm = message.content[8:].strip()
        if(len(searchterm) == 0):
            return ('Tell me what to look for, and I shall deliver.')
        if (badword.has_profanity(searchterm)):
            return ('You should reconsider your words if you value your life, ' + message.author.mention())
        return ('https://www.youtube.com/results?search_query=' + urllib.parse.quote_plus(searchterm))
#    elif(message.content.startswith('!hall')):
#        print('delivering event message (10-22-15)')
#        return(halloween)
    elif(message.content.startswith('!lenify')):
        msg = message.content[7:].strip()
        return(msg)
    elif(message.content.startswith('!roast')):
        print('delivering roast')
        response = 'http://i.imgur.com/rSMtLIM.gif'
        for mention in message.mentions:
            print('mentioning ' + mention.name)
            response += (' ' + mention.mention())
        return response
    elif(message.content.startswith('!salt')):
        print('delivering salt')
        response = ''
        for mention in message.mentions:
            print('mentioning ' + mention.name)
            response += (' ' + mention.mention())
        return response + '\n\n' + salt
    elif(message.content.startswith('!lyyin')):
        response = ''
        for mention in message.mentions:
            print('mentioning ' + mention.name)
            response += (mention.mention() + ' ')
        response += lyying
        return response
    else:
        return None
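A hedged sketch of the bot loop that might dispatch messages to parse() (the modern discord.py 2.x API is assumed here; the original appears to target an older client where mention() is callable):

# illustrative wiring only; token and intents configuration are assumptions
import discord

client = discord.Client(intents=discord.Intents.default())

@client.event
async def on_message(message):
    if message.author == client.user:
        return
    reply = parse(message)
    if reply is not None:
        await message.channel.send(reply)

client.run('BOT_TOKEN')  # placeholder token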