def scrape_register(cookies, module, year_id):
    page, _ = scraper.fetch(
        "%s?%s" % (SEARCH_URL, urllib.urlencode({
            'form_id': 3,
            'exclude': '',
            'year_id': year_id,
            'mnem': module,
        })),
        headers={'Cookie': cookies},
    )
    doc = scraper.parse(page)
    title = doc.find(scraper.path("h4", "a")).text
    for table in doc.findall(scraper.path("table")):
        if 'bordercolor' in table.keys():
            headings = [
                t.text for t in table.findall(scraper.path("th", "font", "b"))
            ]
            if headings != ['Name', 'Category', 'Course', 'Misc']:
                raise ValueError("unexpected register headings: %r" % headings)
            # One dict per data row; headings[:-1] drops the 'Misc' column
            # and cells without text are skipped.
            students = [
                dict(zip(headings[:-1],
                         (c.text.strip()
                          for c in row.findall(scraper.path("td", "font"))
                          if c.text)))
                for row in table.findall(scraper.path("tr"))[1:]
            ]
            return {
                'title': title,
                'students': students,
            }
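# The `scraper` module used throughout these snippets is not shown. Below
# is a minimal sketch of plausible fetch/parse/path helpers (Python 2,
# lxml), written as an assumption for illustration, not the real module.
import urllib2
import lxml.html

def fetch(url, data=None, headers=None):
    # Hypothetical: returns (body, headers), matching how scraper.fetch
    # is unpacked above.
    req = urllib2.Request(url, data, headers or {})
    resp = urllib2.urlopen(req)
    return resp.read(), resp.info()

def parse(page):
    return lxml.html.fromstring(page)

def path(*tags):
    # Builds an ElementPath descendant expression for find()/findall(),
    # e.g. path("th", "font", "b") -> ".//th//font//b"
    return ".//" + "//".join(tags)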
import argparse

def main():
    parser = argparse.ArgumentParser(
        description="Use frequencies to assign sentiment score.")
    parser.add_argument("stock_sym", help="stock symbol of company")
    parser.add_argument(
        "filename", help="name of JSON file with links to financial articles")
    args = parser.parse_args()
    return parse(args.filename)
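# Typical invocation of the CLI above (the script name is an assumption):
#   $ python sentiment.py AAPL articles.json
# Note that stock_sym is parsed but unused in this snippet; only filename
# reaches parse().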
def get(self):
    from google.appengine.api import urlfetch
    import scraper

    url = "http://www.marctracker.com/PublicView/status.jsp"
    result = urlfetch.fetch(url)
    if result.status_code == 200:
        response = scraper.parse(result.content)
        self.response.write(response)
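# For context, a get() method like the one above normally lives on a
# webapp2.RequestHandler. The class name and route below are assumptions
# made for illustration, not taken from the original app.
import webapp2

class StatusHandler(webapp2.RequestHandler):
    def get(self):
        pass  # body as defined above

app = webapp2.WSGIApplication([('/status', StatusHandler)], debug=True)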
def test_parser(self):
    result = scraper.parse(self.content)
    self.assertIsInstance(
        result, list,
        "Parse result is not a list; it must be a list of tuples.")
    self.assertIsInstance(
        result[0], tuple,
        "Parse result item is not a tuple; each row must be a 9-element tuple.")
    self.assertEqual(
        len(result[0]), 9,
        "Tuple size is not 9; each row must have 9 elements.")
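# A hypothetical setUp for the test above: self.content is assumed to be
# raw HTML read from a saved fixture so scraper.parse() runs on stable
# input. The class name and fixture path are illustrative only.
import unittest

class ParserTest(unittest.TestCase):
    def setUp(self):
        with open('fixtures/status_page.html') as f:
            self.content = f.read()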
def scrape():
    errors = []
    results = ""
    if request.method == "POST":
        try:
            target_url = request.form['urlname']
            results = parse(target_url)
        except Exception:
            errors.append("unable to get url...")
    return render_template('anotherpage.html', errors=errors, results=results)
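# The view above is presumably registered on a Flask app roughly like
# this; the URL rule and methods are assumptions, and in the real module
# these imports would sit at the top of the file.
from flask import Flask, render_template, request

app = Flask(__name__)
scrape = app.route('/scrape', methods=['GET', 'POST'])(scrape)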
def main():
    song = input("What's the song you want to visualize? ")
    artist = input("And who's it by? ")
    lyrics = scraper.parse(song, artist)
    mc = markov.build_mkch(lyrics)
    json_generator.build_json(mc)
    # The following call prints a text-based visualization of the Markov
    # chain (an adjacency list): each node (a unique word) is listed with
    # all of its edges (the words it links to), along with the
    # 'probability' that a word in the edge list follows the node.
    markov.print_mkch(mc)
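# A minimal sketch of the adjacency-list idea described in the comment
# above, assuming the lyrics arrive as a flat list of words; this is an
# illustration, and markov.build_mkch itself may differ.
from collections import defaultdict

def build_chain(words):
    chain = defaultdict(list)
    for cur, nxt in zip(words, words[1:]):
        chain[cur].append(nxt)  # repeats encode transition frequency
    return chain

# build_chain("the cat and the hat".split()) ->
#   {'the': ['cat', 'hat'], 'cat': ['and'], 'and': ['the']}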
from operator import itemgetter

def process(request, keyword, city):
    scraped_data = scraper.parse(keyword, city)
    salary_dict = []
    rating_dict = []
    if scraped_data is None:
        # No matches: return empty lists so callers can still unpack a pair.
        return salary_dict, rating_dict
    if scraped_data:
        for d in scraped_data:
            if len(d['Rating']) > 0:
                rating_dict.append({
                    'Company': d['Company'],
                    'Rating': d['Rating'],
                    'Url': d['Url'],
                    'Location': d['Location'],
                    'Salary': d['Salary'],
                    'Name': d['Name'],
                })
            if len(d['Salary']) > 0:
                salary_dict.append({
                    'Company': d['Company'],
                    'Rating': d['Rating'],
                    'Int_Salary': int(d['Salary'].split('-')[1][1:-1]),
                    'Salary': d['Salary'],
                    'Url': d['Url'],
                    'Location': d['Location'],
                    'Name': d['Name'],
                })
        rating_dict = sorted(rating_dict, key=itemgetter('Rating'), reverse=True)
        salary_dict = sorted(salary_dict, key=itemgetter('Int_Salary'), reverse=True)
        return salary_dict, rating_dict
    else:
        print "Your search for %s, in %s does not match any jobs" % (keyword, city)
        # Was an implicit None; callers expect a (salary, rating) pair.
        return salary_dict, rating_dict
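# Worked example of the Int_Salary extraction above, assuming a
# Glassdoor-style "$low-$high" range string (the exact input format is an
# assumption):
#   "$60K-$80K".split('-')[1]  -> "$80K"
#   "$80K"[1:-1]               -> "80"   (strips the "$" and the "K")
#   int("80")                  -> 80
# i.e. the sort key is the upper bound of the range, in thousands.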
if not (dump_ascii or dump_json):
    dump_ascii = True

# Assumed not to be set during option parsing above; without defaults the
# checks below can raise NameError when neither branch fires.
courses = modules = None

if "".join(arg.lower() for arg in args) in Courses:
    courses = "%0D%0A".join(map(urllib.quote_plus, Courses[args[0]]))
elif specify_courses:
    courses = "%0D%0A".join(map(urllib.quote_plus, args))

if courses:
    url = "%s;%s" % (TT_URL, COURSES_URL % {"courses": courses})
else:
    modules = "%0D%0A".join(args)
    url = "%s;%s" % (TT_URL, MODULES_URL % {"modules": modules})

if not (courses or modules):
    die_with_usage("", 1)

modules = scrape_timetable(scraper.parse(scraper.fetch(url)[0]))

if module_detail:
    for m in modules:
        data = {
            'year_id': '000110',
            'mnem': m['code'],
        }
        page, hdrs = scraper.fetch(MODULE_DETAIL_URL, data)
        m['detail'] = scrape_module_details(scraper.parse(page))

## dump scraped data
if dump_json:
    print json.dumps(modules)
elif dump_ascii:
    for module in modules:
        # bold module code via ANSI escapes
        print "\x1b[0;1m%s\x1b[0m" % module['code'], "--", module['title']
        for (act, data) in sorted(module['acts'].items()):
            print "\t%-13s" % (act,), \