Example #1
def scrape(url: str,
           staleOnly: bool = False,
           fallback: bool = False) -> sc.ScrapeMeta:
    if sc._staleOnly:
        util.logMessage(f'skitter.scrape: HERMES_STALE only {url}')
        return sc.scrape(url)

    if staleOnly:
        util.logMessage(f'skitter.scrape: staleOnly {url}')
        for c in reversed(priv.skitterClients):
            ce = c.cache(url)
            if ce is not None:
                return ce
        raise Exception(f'skitter.scrape: unable to staleOnly scrape: {url}')

    for c in priv.skitterClients:
        try:
            #util.logMessage(f'skitter.scrape: calling {c.ident}.scrape({url})')
            r = c.scrape(url)
            return r
        except Exception as e:
            util.logMessage(f'skitter.scrape: {c.ident}.scrape failed: {e}')
            pass

    if fallback:
        return sc.scrape(url)
    raise Exception(f'skitter.scrape: unable to scrape: {url}')
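A hedged usage sketch for this wrapper (the URL is illustrative; behaviour depends on how priv.skitterClients and the fallback sc module are configured):

# illustrative only: the URL and error handling are assumptions, not from the source
meta = scrape('https://example.com/story/123', fallback=True)

# ask for a cached copy only; the wrapper raises if no client has one cached
try:
    cached = scrape('https://example.com/story/123', staleOnly=True)
except Exception as e:
    print(f'no cached copy available: {e}')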
Example #2
def scrape_ec2():
    ec2_file = 'AWSinstances.json'
    try:
        scrape(ec2_file)
    except Exception as e:
        print "ERROR: Unable to scrape data: %s" % e
        print traceback.print_exc()
Example #3
def main():
    """Code for scraper.

    Actual scraping is turned off because the server doesn't like being pinged.

    """
    # url = "https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major"
    # # Use to make URL attribute of scholarship object usable
    # appendable_url = "https://www.scholarships.com"

    # # Setup output file
    # scan_time = date.today()
    # filename = 'scan_' + str(scan_time) + '.csv'
    # with open(filename, 'w', encoding='utf-8-sig') as f:
    #     w = csv.DictWriter(f, ['name', 'url', 'amount', 'deadline', 'description'])
    #     w.writeheader()

    # # get response
    # response = get_response(url)

    # soup = BeautifulSoup(response.content, 'html5lib')
    # url_table = soup.find(id="ullist")
    # url_list = url_table.find_all('a')
    # for link in url_list:
    #     get_scholarshipscom_details(link.get('href'), appendable_url, filename)

    #     # Wait 1 second between requests
    #     sleep(1)


    print("Pushing file into the database.", flush=True)
    scrape(environ['MYSQL_USER'], environ['MYSQL_PASSWORD'], "db", environ['MYSQL_DB_NAME'])

    print("done")
Example #4
 def test_ScrapedUser(self):
     scrape.scrape(self.username)
     output_text = io.StringIO()
     sys.stdout = output_text
     scrape.scrape(self.username)
     sys.stdout = sys.__stdout__
     self.assertEqual(output_text.getvalue(), 'My name is Kanish and my current city is Roorkee\n')
Example #5
 def export(self):
     urlName = self.varURLName.get()
     className = self.varClassName.get()
     csvName = self.varCSVName.get()
     tagType = self.varTAGType.get()
     scrape.WriteCSV(csvName)
     scrape.scrape(className, urlName, csvName, tagType)
Example #6
    def main(self):
        print(red + '''
  _____                     _  __    _             
 |  __ \\                   | |/ /   | |            
 | |__) | __ _____  ___   _| ' / ___| | _____ _ __ 
 |  ___/ '__/ _ \\ \\/ / | | |  < / _ \\ |/ / _ \\ '__|
 | |   | | | (_) >  <| |_| | . \\  __/   <  __/ |   
 |_|   |_|  \\___/_/\\_\\\\__, |_|\\_\\___|_|\\_\\___|_|   
                       __/ |                       
                      |___/                        \n''')
        print(blue + 'by Nexolyte\n')
        m = get('Main Menu\n' +\
                red + '[' + blue + '1' + red + '] - ' + white + 'Scrape\n' +\
                red + '[' + blue + '2' + red + '] - ' + white + 'Check\n' +\
                red + '[' + blue + 'e' + red + '] - ' + white + 'Exit\n')

        if m == '1':
            os.system('cls')
            scrape.scrape()
        elif m == '2':
            os.system('cls')
            check.check()
        elif m == 'e':
            os.system('cls')
            sys.exit(1)
        else:
            os.system('cls')
            error('Input not recognised. Please retype and try again.')
            self.main()
Example #7
    def main(self):
        print(red + """
    dBBBBBb dBBBBBb    dBBBBP`Bb  .BP dBP dBP dBBBBBb    dBBBP dBBBBBb   dBBBBBb  dBBBP dBBBBBb
       dB'     dBP   dBP.BP     .BP     dBP      dBP               BB       dB'            dBP
  dBBBP'  dBBBBK   dBP.BP    dBBK     dBP   dBBBBK'  dBBP     dBP BB   dBBBP' dBBP    dBBBBK
 dBP     dBP  BB  dBP.BP    dB'      dBP   dBP  BB  dBP      dBP  BB  dBP    dBP     dBP  BB
dBP     dBP  dB' dBBBBP    dB' dBP  dBP   dBP  dB' dBBBBP   dBBBBBBB dBP    dBBBBP  dBP  dB' v1.1"""
              )
        print(blue + 'by Xenex\n')
        m = get('Main Menu\n' +\
                red + '[' + blue + '1' + red + '] - ' + white + 'Scrape\n' +\
                red + '[' + blue + '2' + red + '] - ' + white + 'Check\n' +\
                red + '[' + blue + '3' + red + '] - ' + white + 'Exit\n')

        if m == '1':
            os.system('cls')
            scrape.scrape()
        elif m == '2':
            os.system('cls')
            check.check()
        elif m == '3':
            os.system('cls')
            sys.exit(1)
        else:
            os.system('cls')
            error('Input not recognised. Please retype and try again.')
            self.main()
Example #8
def main():
    """Shows basic usage of the Sheets API.
    Prints values from a sample spreadsheet.
    
    """

    # Authorization of google account
    creds = None

    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)

        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    # Scraping method starts here
    scrape.scrape(creds)
Example #9
def get_table():
    username = input("Username: "******"twu_website.html")
    make_table(selected_term)
Example #10
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = 'www/instances.json'
    try:
        scrape(data_file)
    except Exception as e:
        print("ERROR: Unable to scrape site data: %s" % e)
Example #11
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = 'www/instances.json'
    try:
        scrape(data_file)
    except Exception as e:
        print("ERROR: Unable to scrape site data: %s" % e)
Example #12
def scrape_ec2(c):
    """Scrape EC2 data from AWS and save to local file"""
    ec2_file = "www/instances.json"
    try:
        scrape(ec2_file)
    except Exception as e:
        print("ERROR: Unable to scrape EC2 data")
        traceback.print_exc()
Example #13
def scrape_ec2():
    """Scrape EC2 data from AWS and save to local file"""
    ec2_file = 'www/instances.json'
    try:
        scrape(ec2_file)
    except Exception as e:
        print "ERROR: Unable to scrape data: %s" % e
        print traceback.print_exc()
Example #14
def scrape_ec2():
    """Scrape EC2 data from AWS and save to local file"""
    ec2_file = 'www/instances.json'
    try:
        scrape(ec2_file)
    except Exception as e:
        print "ERROR: Unable to scrape data: %s" % e
        print traceback.print_exc()
Example #15
def scrape_go():
    print(
        f"---beginning scrape at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
    )
    for group in groups:
        scrape.scrape(group)
    print(
        f"---finished  scrape at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
    )
Example #16
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = 'www/instances.json'
    try:
        scrape(data_file)
    except Exception as e:
        print "ERROR: Unable to scrape site data: %s" % e
        print traceback.print_exc()
    render_html()
Example #17
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = 'www/instances.json'
    try:
        scrape(data_file)
    except Exception as e:
        print "ERROR: Unable to scrape site data: %s" % e
        print traceback.print_exc()
    render_html()
Example #18
def run():
    recreate_schema()
    scrape.scrape()
    scheduler.start()
    try:
        asyncio.get_event_loop().run_forever()
    except (KeyboardInterrupt, SystemExit):
        print("Shutting down. Please wait...")
        scheduler.shutdown(wait=True)
        exit(0)
Example #19
def main():
    from scrape import scrape
    from do_etl import do_etl

    try:
        scrape()
    except ValueError as e:
        print(e)
    finally:
        do_etl(inital_load=False)
Example #20
def upload_file():
    file = request.files['image']
    f = os.path.join(app.config['UPLOAD_FOLDER'], 'img.jpg')#file.filename)
    
    # add your custom code to check that the uploaded file is a valid image and not a malicious file (out-of-scope for this post)
    file.save(f)
    print('file uploaded successfully')
    text = recognize_text()
    print(text)
    print(scrape(text))
    return scrape(recognize_text())[0]
Example #21
 def fetch(self):
     out = scrape.scrape({
         "url": self.url,
         "css": 'div[data-tts="answers"]',
         "text": True
     })
     return out.strip()
Example #22
 def fetch(self):
     out = scrape.scrape({
         "url": self.url,
         "css": 'div.startupLogos a',
         "print_url": True,
     })
     return out
Example #23
def scraper():
    mars_info = scrape.scrape()

    #listings = mongo.db.listings
    #listings.update({}, listings_result, upsert = True)

    return redirect("/", code=302)
Example #24
def run():
    data = scrape()
    grouped_data = group(data)
    analysis(grouped_data)
    statistics(grouped_data)
    # define more tasks here
    print("Run Completed Successfully")
Example #25
def web_scrape():
    db.collection.remove({})
    mars_data = scrape.scrape()
    #print('----after getting data---')
    #print(mars_data)
    db.collection.insert_one(mars_data)
    return redirect("http://localhost:5000/", code=302)
Example #26
	def getCurrentInfo(self, fic: Fic) -> Fic:
		fic.url = self.baseUrl + str(fic.localId)
		url = fic.url.split('?')[0] + '?view_adult=true'
		# scrape fresh info
		data = scrape.scrape(url)

		return self.parseInfoInto(fic, data['raw'])
Example #27
def hello_world():
    # localhost:8000/
    file_url = scrape()
    file = requests.get(file_url).content
    path = basename('img.png')
    with open(path, "wb") as f:
        f.write(file)
    return send_file(path, mimetype='image/png')
Example #28
def new(username, include):
    try:
        if not tweets.find_one({"username": username}):
            tweets.insert_one({
                "username": username,
                "tweets": scrape(username)
            })
        obj = tweets.find_one({"username": username})

        model, rmodel = train([e['full_text'] for e in obj["tweets"]])
        if include:
            text = generate_with(model, rmodel, include)
        else:
            text = generate(model)
        return json.dumps({
            'success': True,
            'message': ' '.join(text),
            'name': obj['tweets'][0]['user']['name'],
            'avatar': obj['tweets'][0]['user']['profile_image_url_https']
        })
    except Exception:
        return json.dumps({
            'success': False,
            'message': 'Oops! An error occurred.',
        })
Example #29
 def post(self):
     year = int(self.request.get('year', 1990))
     term = int(self.request.get('term', 92))
     template = open('scrape_yearterm.html').read()
     try:
         scrape([year], [term])
         self.response.write('YearTerm {:04d}-{:02d} has successfully been added to the database.<br><br>'.format(year, term))
         year_value = str(year + 1 if term == 92 else year) # iterate year
         input_index = str([0, 92, 03, 14, 25, 39, 76].index(term)) # iterate term
     except Exception as e:
         traceback.print_exc() # only visible in terminal
         self.response.write('ERROR: {}<br><br>'.format(e))
         year_value = str(year) # preserve year
         input_index = str([0, 03, 14, 25, 39, 76, 92].index(term)) # preserve term
     content = template.replace('{YEAR_VALUE}', year_value).replace('{INPUT_INDEX}', input_index)
     self.response.write(content)
Example #30
def scraper():
    # Run the scrape function we made to pull all the data from the sources
    mars_data = scrape.scrape()
    # Update the collection with new data
    db.db.collection.update({}, mars_data, upsert=True)

    return redirect("/")
Example #31
	def getCurrentInfo(self, fic: Fic) -> Fic:
		url = self.constructUrl(fic.localId)
		# scrape fresh info
		data = scrape.scrape(url)

		edumpContent('<!-- {} -->\n{}'.format(url, data['raw']), 'sugarquill_ec')
		return self.parseInfoInto(fic, data['raw'])
Example #32
def get_scrape():
    mars_data = knife.scrape()

    mars_db = mongo.db.mars
    mars_db.update({}, mars_data, upsert=True)

    return redirect("http://localhost:5000/", code=302)
Example #33
def prediction(stock):
    if request.method == 'POST':
        # form = request.form
        stock = request.form['ticker']
        req = request
        print(req.form)
        ticker = request.form['ticker']
        ma1 = int(request.form['ma1'])
        ma2 = int(request.form['ma2'])
        from_date = request.form['from_date']
        to_date = request.form['to_date']
        crossover = ''

        # Parameters can now be passed through for calculations
        results = forecast(ma1,ma2,ticker,from_date,to_date)

        data = scrape(ticker)
        print(data.keys())
        cap = data['cap']
        price = data['price']
        day = data['day']
        week = data['week']
        month = data['month']
        quarter = data['quarter']
        headlines = data['headlines']
        trend = results['trend']
        value=Markup(results['html'])

        # img = f'predict.png'


        return render_template("dynamicForecast.html",from_date=from_date,to_date=to_date,ma1=ma1,ma2=ma2,ticker=ticker,crossover=crossover,trend=trend,cap=cap,price=price,day=day,week=week,month=month,quarter=quarter,value=value,headlines=headlines)   
        # return render_template('dynamicForecast.html',stock=stock)
    else:
        return render_template('dynamicForecast.html')
Example #34
 def get(self):
     titleLinkAssoc = scrape.scrape("http://www.metafilter.com/", "div.posttitle > a")
     formattedLinks = [
         "http://www.metafilter.com" + v for k, v in titleLinkAssoc.items()
     ]  # metafilter hosts their own content so you need to add http://www.metafilter to each link
     titles = [k for k, v in titleLinkAssoc.items()]
     formattedLinkAssoc = dict(zip(titles, formattedLinks))
     self.render("scraped.html", titleLinks=formattedLinkAssoc, site="MetaFilter")
Example #35
def analyze():
    url=request.form['url']
    text = scrape.scrape(url)
    # get end of url for naming pics
    i = url.rfind('/') + 1
    url = url[i:]
    polarity_url = analysis.get_sentiment_analysis(text, url)
    wordmap_url = analysis.get_wordmap(text, url)

    return render_template("analysis.html", polarity=polarity_url, wordmap=wordmap_url)
Example #36
def crawl(review_store, page=0):
    page = urllib2.urlopen("http://pitchfork.com/reviews/albums/" + ("" if page==0 else (str(page) + "/")))
    soup = BeautifulSoup(page)
    main_grid = soup.find("ul", {"class" : "object-grid"})
    
    for a_child in main_grid.findAll("a"): 
        shelve_key = a_child['href'].encode('ASCII', 'ignore').split('/')[-2]
        print(shelve_key)
        if not review_store.has_key(shelve_key):
            print("key not in store")
            review_store[shelve_key] = scrape("http://pitchfork.com"+a_child['href'])
Example #37
def scrape_controller(domain):

    base_url = "http://www.hm.com/us/products/search?"

    # get the input file and the last scraping position from the bucket
    inp_file, pos = get_inp_line()
    mark = 0

    # move the reader to the saved position
    while mark < pos:
        mark += 1

    # fetch the next line from the reader
    inp_line = get_next_line(mark)
    # inp_line = "categories=men&term=gingham shirts|type=Gingham,source=H&M"

    while inp_line != "EOF":

        search_q, db_entry = inp_line.split('|')

        # sort by new arrivals
        if not check_for_domain("Orders=newfrom_desc", search_q):
            search_q = search_q + "&" + "Orders=newfrom_desc"

        # in case none of the domains is present, prepend the given one
        if (not check_for_domain("categories=men", search_q)
                and not check_for_domain("categories=ladies", search_q)
                and not check_for_domain("categories=kids", search_q)
                and not check_for_domain("categories=sale", search_q)):
            search_q = domain + "&" + search_q
        # in case the given domain is already present
        elif check_for_domain(domain, search_q):
            print(" --")
        # in case of a different domain, skip this line
        else:
            continue

        q_url = base_url + search_q
        print(q_url)
        try:
            my_items = scrape(q_url)
            # save_to_db(my_items)
            # for item in my_items:
            #     print(item)
        except Exception:
            print("unable to scrape from " + q_url)

        # get the next input line from the input file
        inp_line = get_next_line(mark + 1)
        # update the position of the reader in SQS
        update_reader_pos(mark + 1)
        mark = mark + 1
Example #38
def get_communities(url, counties):
    """Returns a dict {id: name}"""
    communities = {}

    # could also be done with incomprehensible dict comprehension
    for county in counties:
        soup = scrape(url, county)
        pat = "{}\d+".format(county)
        options = filter_tags(soup, 'option', pat)
        communities.update(options)

    return communities
Example #39
def seatcheck(medium,username):
	seats = scrape(db.getTemp(username))
	if seats > 0:
		messenger.message(medium,username,"Good news! Your class has " + str(seats) + " open seats, so you can go sign up now! If you have the ID of another course that's closed that you'd like to track, let me know!")
		db.changeState(username,1)

	elif seats > -1000:
		messenger.message(medium,username,"You're all set. I'll monitor your course and message you here if a seat in your class opens up.")
		messenger.message(medium,username,"Anything else I can help you with? You can say 'commands' for a list of commands I understand.")
		temp = db.getTemp(username)
		db.addJob(username,temp)
		db.changeState(username,0)
	else:
		messenger.message(medium,username,"Couldn't figure out how many seats open. Is classfind down?")
		db.changeState(username,2)
Example #40
def add_package(package):
    cur = get_db().cursor()
    cur.execute("insert into game (package) values (?)", (package,))
    get_db().commit()
    id = cur.lastrowid
    res = scrape(id, package)
    name = res[0]
    desc = res[1]
    rating = float(res[2])
    activity = res[4]
    category = res[5]
    cur.execute("update game set name='%s',description='%s',rating=%f,activity='%s',category='%s' where id=%d" % (name, desc, rating, activity, category, int(id)))
    get_db().commit()
    cur.connection.close()
    return json.dumps(id)
Example #41
 def _scrapes(self, include_sub, include_dir, expose=True, alert_when_done=True):
     try:
         count = 0
         for x in scrape.scrape(self.settings, include_sub=include_sub, include_dir=include_dir):
             if isinstance(x, int):
                 count += x
                 continue
             if expose:
                 reveal(x)
     except requests.ConnectionError:
         tkMessageBox.askokcancel("Connection Error",
                                  "Could not connect to Reddit. Check your internet settings, "
                                  "and make sure Reddit isn't down.")
     else:
         tkMessageBox.askokcancel("", "Scrape Complete! %d files downloaded." % count)
Example #42
 def post(self):
     blob = self.get_argument("blob", None)
     url = self.get_argument("url", None)
     self.write("<style>i{color:#aaa;}</style>")
     if blob:
         for line in blob.split("\n"):
             line = line.strip()
             if len(line) > 5:
                 self.write(line+"<br>")
                 self.write("<i>%s, %s</i><br><br>" % classifier.classify(line))
     elif url:
         page_data = scrape(url)
         self.write("<p><b>source:</b> "+page_data["source"]+"</p>")
         self.write("<p><b>title:</b> "+page_data["title"]+"</p>")
         self.write("<p><b>url:</b> "+page_data["url"]+"</p>")
         self.write("<p><b>description:</b> "+page_data["description"]+"</p>")
         self.write("<hr>")
         for line in page_data["body"]:
             if len(line) > 5: # TODO drop any line < 2 words; NER for time
                 score = classifier.classify(line)
                 if score[1] > 0.5:
                     self.write(line+"<br>")
                     self.write("<i>%s, %s</i><br><br>" % score)
Example #43
def autograder(url):
    '''Accepts the URL for a recipe, and returns a dictionary of the
    parsed results in the correct format. See project sheet for
    details on correct format.'''
    ingredient_strings, step_strings = scrape.scrape(url)
# The ingredient template is 
#   name, quantity, measurement, descriptor, preparation, pre-preparation
    fin_ingredients = []
    for ingredient in ingredient_strings:
        name = unicode(parser.recognize_ingredient(ingredient))
        number = parser.recognize_number(ingredient)
        unit = parser.recognize_unit(ingredient)
        descriptors = [unicode(i) for i in
                parser.recognize_descriptors(ingredient)]
        fin_ingredients.append({"name": name, "quantity": number, "measurement":
            [unicode(unit)], "descriptor": descriptors})

    primary_method = None
    methods = set()
    for method in COOKING_METHODS.keys()[::-1]:
        for variation in COOKING_METHODS[method]:
            for step in step_strings:
                if variation in step:
                    methods.add(unicode(method))
                    primary_method = unicode(method)

    cookware_set = set()
    for cookware in COOKWARE:
        for variation in COOKWARE[cookware]:
            for step in step_strings:
                if variation in step:
                    cookware_set.add(unicode(cookware))

    return {"ingredients": list(fin_ingredients), "cooking methods": list(methods),
            "primary cooking method": primary_method, "cooking tools":
            list(cookware_set)}
Example #44
__author__ = 'rylan'

from scrape import scrape
from compare import compare
from printToHTML import printToHTML

# sourceOne = 'https://news.google.com/'
# sourceTwo = 'https://news.yahoo.com/'
sourceOne = raw_input('Please enter first newsource: ')
sourceTwo = raw_input('Please enter second newsource: ')

articleTitles = scrape(sourceOne, sourceTwo)
pairings = compare(articleTitles[0], articleTitles[1])
printToHTML(pairings)
Example #45
def get_counties(url):
    """Returns a dict {id: name}"""
    soup = scrape(url)
    pat = '^\d{2}$'
    return filter_tags(soup, 'option', pat)
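Together with Example #38, a hedged sketch of how these two helpers might be chained (the URL is an assumption; filter_tags and scrape come from the same module):

# illustrative chaining; the URL is an assumption
url = 'http://example.gov/election-results'
counties = get_counties(url)                  # {county_id: name}
communities = get_communities(url, counties)  # {community_id: name}
print(len(counties), 'counties,', len(communities), 'communities')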
Example #46
from scrape import getinfo,scrape,statusUpdate
from dbaccess import AuthDatabase
from interface import Interface
import datetime
import json

config = json.loads(open("/root/SBUCourseMonitor/config.json").read())
messenger = Interface(config)

db = AuthDatabase(config["database_addr"])

for job in db.getJobs():
	seats = scrape(job[2])
	if seats > 0:
		user = db.getUserByID(job[1])[0]
		info = getinfo(str(job[2]))
		messenger.message(user[1],user[2],"Knock, knock! You course " + info + " now has " + str(seats) + " open seats. Go sign up!")
		print datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') + "\t" + user[2] + "\t" + info
		db.deleteJob(job[0])
Example #47
 def get(self):
     titleLinkAssoc = scrape.scrape("http://www.digg.com/", "h2.story-title > a")
     self.render("scraped.html", titleLinks=titleLinkAssoc, site="Digg")
Example #48
 def get(self):
     titleLinkAssoc = scrape.scrape("http://www.slashdot.org/", "h2 > span > a")
     self.render("scraped.html", titleLinks=titleLinkAssoc, site="Slash Dot")
Example #49
import proxy
from scrape import scrape
from bs4 import BeautifulSoup
import urllib.request as req
url = "http://intranet.iitg.ernet.in/"
scrape(url)
print("done")
Example #50
 def get(self):
     titleLinkAssoc = scrape.scrape("http://www.techcrunch.com/", "h2.headline > a")
     self.render("scraped.html", titleLinks=titleLinkAssoc, site="Tech Crunch")
Example #51
def do_scrape():
    return Response(i_league_scraper.scrape(), mimetype="text/plain")
Example #52
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = "www/instances.json"
    scrape(data_file)
    render(data_file, "in/index.html.mako", "www/index.html")
Example #53
def parse_results(message, db_collection):
    """
    Function to parse the links drawn from an RSS feed.

    Parameters
    ----------

    message: pattern.web.Results.
                Object containing data on the parsed RSS feed. Each item
                represents a unique entry in the RSS feed and contains
                relevant information such as the URL and title of the
                story.

    db_collection: pymongo Collection.
                        Collection within MongoDB in which results are
                        stored.
    """
    global proxies, proxy_user, proxy_pass

    if proxies:
        proxy_choice = {'http': random.choice(proxies)}
        proxy_login = requests.auth.HTTPProxyAuth(proxy_user,
                                                  proxy_pass)
    else:
        proxy_choice = ''
        proxy_login = {}

    lang = message.get('lang')
    story_url = message.get('url')
    website = message.get('website')
    title = message.get('title')
    date = message.get('date')
    if lang == 'english':
        goose_extractor = Goose({'use_meta_language': False,
                                 'target_language': 'en',
                                 'enable_image_fetching': False})
    elif lang == 'arabic':
        from goose.text import StopWordsArabic
        goose_extractor = Goose({'stopwords_class': StopWordsArabic,
                                 'enable_image_fetching': False})
    else:
        print(lang)

    if 'bnn_' in website:
        # story_url gets clobbered here because it's being replaced by
        # the URL extracted from the bnn content.
        #TODO: Deprecate this for now since using GhostJS is weird.
        logging.info('A BNN story.')
#        text, meta, story_url = scrape.bnn_scrape(story_url, goose_extractor)
        text = ''
        pass
    else:
        text, meta = scrape.scrape(story_url, goose_extractor, proxy_choice,
                                   proxy_login)
    text = text.encode('utf-8')

    if text:
        cleaned_text = _clean_text(text, website)

        # TODO: Figure out where the title, URL, and date should come from
        # TODO: Might want to pull title straight from the story since the RSS
        # feed is borked sometimes.
        entry_id = connectors.add_entry(db_collection, cleaned_text, title,
                                        story_url, date, website, lang)
        if entry_id:
            try:
                logging.info('Added entry from {} with id {}. {}.'.format(story_url,
                                                                          entry_id,
                                                                          datetime.datetime.now()))
            except UnicodeDecodeError:
                logging.info('Added entry from {}. Unicode error for id'.format(story_url))
    else:
        logging.warning('No text from {}'.format(story_url))
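A hedged sketch of the message mapping and collection this function expects (field values and the Mongo connection are assumptions; the module-level proxies and connectors are presumed configured):

# illustrative inputs only; values and connection details are assumptions
import pymongo

message = {
    'lang': 'english',
    'url': 'http://example.com/some-story',
    'website': 'example_site',
    'title': 'Example headline',
    'date': '2015-01-01',
}
stories = pymongo.MongoClient()['rss']['stories']
parse_results(message, stories)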
Example #54
import pickle
import scrape

scrape.scrape()
dictPickle = open('barf','rb')
outputDict = pickle.load(dictPickle)
dictPickle.close()

##dictFile = open('barf.txt','w')
##dictFile.write(outputDict)
##dictFile.close()


Example #55
def build():
    """Scrape AWS sources for data and build the site"""
    data_file = 'www/instances.json'
    scrape(data_file)
    render(data_file, 'in/index.html.mako', 'www/index.html')
Example #56
ATTR_DICT = {   'title' : '../csvs/exid_title.csv', 
                'date'  : '../csvs/exid_date.csv' ,
                'url'   : '../csvs/exid_url.csv'   }

def create_attr_csvs(index):
    '''
    creates csvs for title, date, url 
    input:
        index: dictionary of exhibit information
    output:
        writes csv files according to ATTR_DICT
    '''
    for attr in ATTR_DICT:
        with open(ATTR_DICT[attr],'w') as f:
            line = 'ex_id|' + attr + '\n'
            f.write(line)
            for museum_id in index:
                for ex_id in index[museum_id]:
                    line = '{}|{}\n'.format(str(ex_id), \
                        index[museum_id][ex_id][attr].encode('utf-8'))
                    
                    f.write(line)   
    
if __name__ == "__main__":
    index = scrape.scrape()
    if os.path.isfile('../pickled_search_object'):
        os.remove('../pickled_search_object')
    wd = build_word_dict(index)
    create_wordct_csv(wd)
    create_attr_csvs(index)
Example #57
def main(original_recipe):
    # urls = ['http://allrecipes.com/recipe/easy-meatloaf/',
    #         'http://allrecipes.com/Recipe/Easy-Garlic-Broiled-Chicken/',
    #         'http://allrecipes.com/Recipe/Baked-Lemon-Chicken-with-Mushroom-Sauce/',
    #         'http://allrecipes.com/Recipe/Meatball-Nirvana/']
    if original_recipe.url:
        scraped_ing, scraped_steps = scrape.scrape(original_recipe.url)

        # parse ingredient info, create objects
        ingredients = []
        for ingredient in scraped_ing:
            new_ing = parse_ingredient(ingredient)
            cursor = db.ingredients.find({"name":new_ing.name})
            i = 0
            for document in cursor:
                i += 1
            if i == 0:
                # add to DB
                db.ingredients.insert({"name":new_ing.name, "category":"????", "flag":"none"})
            ingredients.append(new_ing)

        steps = []
        for step in scraped_steps:
            #SPLIT STEP CONTENTS BEFORE PARSING
            if not step:
                continue # HANDLE EMPTY
        # for new_parser
            # parsed_steps = parse_step(step)
            # for p in parsed_steps:
            #     steps.append(p)
        #for new_parser
            step_sent = nltk.sent_tokenize(step)
            for sent in step_sent:
                if contains_procedure(sent) == 1:
                    new_proc = parse_step(sent)
                    steps.append(new_proc)
                elif contains_procedure(sent) > 1:
                    actions = double_action(sent)
                    if actions:
                        for a in actions:
                            new_proc = parse_step(a)
                            steps.append(new_proc)
                        if contains_procedure(sent) == 2:
                            break
                    clause = sent.split(';')
                    for c in clause:
                        if contains_procedure(c) == 1:
                            new_proc = parse_step(c)
                            steps.append(new_proc)
                        elif contains_procedure(c) > 1:
                            more_clause = c.split(',')
                            for more_c in more_clause:
                                if contains_procedure(more_c) == 1:
                                    new_proc = parse_step(more_c)
                                    steps.append(new_proc)
                                elif contains_procedure(more_c) > 1:
                                    actions = double_action(more_c)
                                    if actions:
                                        for a in actions:
                                            new_proc = parse_step(a)
                                            steps.append(new_proc)
                                        if contains_procedure(more_c) == 2:
                                            break
                                    else:
                                        new_proc = parse_step(more_c)
                                        steps.append(new_proc)

        original_recipe.in_list = ingredients
        original_recipe.pr_list = steps

    #call transform etc
    reconstruction.reconstruct(original_recipe)
    r = original_recipe
    try:
        transformed_recipe = transform.transform(r)
    except RuntimeError:
        return [original_recipe, Recipe()]

    #if transformed_recipe == original_recipe:
    #    print "There are no changes to be made"
    #else:
    reconstruction.reconstruct(transformed_recipe)
    return [original_recipe, transformed_recipe]
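A hedged sketch of invoking this pipeline (the Recipe class and its attributes are assumptions based on how main() uses them; the URL comes from the commented list above):

# illustrative only; Recipe and its url attribute are assumptions
r = Recipe()
r.url = 'http://allrecipes.com/recipe/easy-meatloaf/'
original, transformed = main(r)
print(len(original.in_list), 'ingredients parsed')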
Example #58
def parse(message):
    if(message.content.startswith('!blood')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(message.author.name)
        print(stamp)
        print(type(message.author.name))
        print(type(stamp))
        '''print('sending hello to ' + message.author.name + ' ' + stamp)'''
        return ('Is that blood I smell? ' + stamp)
    elif(message.content.startswith('!commands')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('sending command list ' + stamp)
        return (commands)
    elif(message.content.startswith('!changelog')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('sending changelog ' + stamp)
        return (changelog)
    elif(message.content.startswith('!source')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('sending source ' + stamp)
        return (source)
    if(message.content.startswith('!beg')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('Frozen Pizza ' + message.author.name + ' ' + stamp)
        return ('Can I have that Frozen Pizza? ' + stamp)
    if(message.content.startswith('!goodboy')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('Treat ' + message.author.name + ' ' + stamp)
        return ('Can I have my Treat now? ' + stamp)
    elif(message.content.startswith('!elwiki')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        searchterm = message.content[7:].strip()
        if (len(searchterm) == 0):
            print('no argument specified')
            return ('Tell me what to look for, and I shall deliver.')
        if (searchterm.lower().find('seris') != -1):
            print('not looking for seris')
            return ('Some old mistakes should not be touched upon. Mistakes are often a scab to an old, deep wound.')
        if (badword.has_profanity(searchterm)):
            return ('You should reconsider your words if you value your life, ' + message.author.mention())
        print('looking up ' + searchterm)
        r = requests.get('http://elwiki.net/wiki/index.php?search=' + searchterm, allow_redirects=False)
        print(r.status_code)
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        if (r.status_code == 302):
            answer = r.headers['Location']
            print(answer + ' sent on ' + stamp)
            return ('Page for ' + searchterm + ' : ' + answer)
        if (r.status_code == 200):
            print('scraping')
            answer = scrape.scrape(r.text)
            if(answer is None):
                return 'I could not find a match for that.'
            else:
                return ('First match for ' + searchterm + ' : ' + answer)
    elif(message.content.startswith('!babel')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up babel on ' + stamp)
        babelfeed = feedparser.parse('http://elwiki.net/babel/?feed=rss2')
        answer = babelfeed.entries[0]['title'] + ' ' + babelfeed.entries[0]['link']
        print(answer)
        return ('Last post on Babel - ' + answer)
    elif(message.content.startswith('!na')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up na on ' + stamp)
        nafeed = feedparser.parse('http://en.elswordonline.com/feed/')
        answer = nafeed.entries[0]['title'] + ' ' + nafeed.entries[0]['link']
        print(answer)
        return ('Last NA update - ' + answer)
    elif(message.content.startswith('!uk')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up uk on ' + stamp)
        ukfeed = feedparser.parse('http://board.en.elsword.gameforge.com/index.php?page=ThreadsFeed&format=rss2&boardID=8')
        answer = ukfeed.entries[0]['title'] + ' ' + ukfeed.entries[0]['link']
        print(answer)
        return ('Last UK update - ' + answer)
    elif(message.content.startswith('!void')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void on ' + stamp)
        return scrape.scrape_void()
    elif(message.content.startswith('!events')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void events and contests on ' + stamp)
        return scrape.vevent()
    elif(message.content.startswith('!promo')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void promotions on ' + stamp)
        return scrape.vpromotions()
    elif(message.content.startswith('!general')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void general topics on ' + stamp)
        return scrape.vgeneral()
    elif(message.content.startswith('!suggest')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void suggestions on ' + stamp)
        return scrape.vsuggestions()
    elif(message.content.startswith('!intro')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void Intro/Farewells on ' + stamp)
        return scrape.vintro()
    elif(message.content.startswith('!guild')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('looking up void guild topics on ' + stamp)
        return scrape.vguilds()
    elif(message.content.startswith('!shots')):
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('sending shots fired ' + stamp)
        return ('Hmm. It appears as if shots have been fired.')
    elif(message.content.startswith('!ibset')):
        searchterm = message.content[6:].strip()
        return elsgear.lookup(searchterm)
    elif(message.content.startswith('!google')):
        searchterm = message.content[7:].strip()
        if(len(searchterm) == 0):
            return ('Tell me what to look for, and I shall deliver.')
        if (badword.has_profanity(searchterm)):
            return ('You should reconsider your words if you value your life, ' + message.author.mention())
        return ('https://www.google.com/search?q=' + urllib.parse.quote_plus(searchterm))
    elif(message.content.startswith('!gimg')):
        searchterm = message.content[5:].strip()
        if(len(searchterm) == 0):
            return ('Tell me what to look for, and I shall deliver.')
        if (badword.has_profanity(searchterm)):
            return ('You should reconsider your words if you value your life, ' + message.author.mention())
        return ('https://www.google.com/search?q=' + urllib.parse.quote_plus(searchterm) + '&tbm=isch')
    elif(message.content.startswith('!youtube')):
        searchterm = message.content[8:].strip()
        if(len(searchterm) == 0):
            return ('Tell me what to look for, and I shall deliver.')
        if (badword.has_profanity(searchterm)):
            return ('You should reconsider your words if you value your life, ' + message.author.mention())
        return ('https://www.youtube.com/results?search_query=' + urllib.parse.quote_plus(searchterm))
#    elif(message.content.startswith('!hall')):
#        print('delivering event message (10-22-15)')
#        return(halloween)
    elif(message.content.startswith('!lenify')):
        msg = message.content[7:].strip()
        return(msg)
    elif(message.content.startswith('!roast')):
        print('delivering roast')
        response = 'http://i.imgur.com/rSMtLIM.gif'
        for mention in message.mentions:
            print('mentioning ' + mention.name)
            response += (' ' + mention.mention())
        return response
    elif(message.content.startswith('!salt')):
        print('delivering salt')
        response = ''
        for mention in message.mentions:
            print('mentioning ' + mention.name)
            response += (' ' + mention.mention())
        return response + '\n\n' + salt
    elif(message.content.startswith('!lyyin')):
        response = ''
        for mention in message.mentions:
            print('mentioning ' + mention.name)
            response += (mention.mention() + ' ')
        response += lyying
        return response
    else:
        return None
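A hedged sketch of the bot loop that might dispatch messages to parse() (the modern discord.py 2.x API is assumed here; the original appears to target an older client where mention() is callable):

# illustrative wiring only; token and intents configuration are assumptions
import discord

client = discord.Client(intents=discord.Intents.default())

@client.event
async def on_message(message):
    if message.author == client.user:
        return
    reply = parse(message)
    if reply is not None:
        await message.channel.send(reply)

client.run('BOT_TOKEN')  # placeholder token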