def get_text_links():
    for entry in links:
        arr = links[entry]
        if "//" not in arr[0]:
            i = 0
            while i + 2 < len(arr):
                try:
                    title = scraper.scrape(arr[i + 2], arr[1])[0].strip().encode(
                        'ascii', 'ignore') + " " + entry
                except:
                    print arr
                    print i + 2
                data.loc[title] = ["", 0]
                data.loc[title]["Link"] = arr[i + 2]
                txts = scraper.scrape(arr[i + 2], arr[0])
                for txt in txts:
                    words = txt.split()
                    for w in words:
                        w = ''.join(c for c in w if c not in punctuation)
                        if w.lower() in keywords:
                            data.loc[title]["Score"] += 1
                i += 1
        else:
            print entry
def get_text_files():
    paths = glob.glob('files/*/*')
    for f in paths:
        source = f.split('/')[1]
        title = scraper.scrape_file(f, links[source][1])[0].strip().encode(
            'ascii', 'ignore') + ' ' + source
        data.loc[title] = ["", 0]
        for s in ["lexology", "natlawreview"]:
            if source == s:
                for link in links[s]:
                    if "//" in link:
                        t = scraper.scrape(link, links[s][1])[0].strip().encode(
                            'ascii', 'ignore')
                        if t[:10].lower() in title.lower():
                            data.loc[title]['Link'] = link
                            break
        if source == "law360":
            for link in links["law360"]:
                if "//" in link:
                    t = scraper.scrape(link, "h1")[0].strip().encode(
                        'ascii', 'ignore')
                    if t[:10].lower() in title.lower():
                        data.loc[title]['Link'] = link
                        break
        txts = scraper.scrape_file(f, links[source][0])
        for txt in txts:
            words = txt.split()
            for w in words:
                w = ''.join(c for c in w if c not in punctuation)
                if w.lower() in keywords:
                    data.loc[title]["Score"] += 1
def scrape_nnm():
    settings = player.load_settings()
    data_path = settings.torrents_path()

    hashes = []
    for torr in filesystem.listdir(filesystem.join(data_path, 'nnmclub')):
        if torr.endswith('.torrent'):
            try:
                from base import TorrentPlayer
                tp = TorrentPlayer()
                tp.AddTorrent(filesystem.join(data_path, 'nnmclub', torr))
                data = tp.GetLastTorrentData()
                if data:
                    hashes.append((data['announce'], data['info_hash'],
                                   torr.replace('.torrent', '.stat')))
            except BaseException as e:
                log.print_tb(e)

    for chunk in chunks(hashes, 32):
        import scraper
        try:
            seeds_peers = scraper.scrape(chunk[0][0], [i[1] for i in chunk])
        except RuntimeError as RunE:
            if '414 status code returned' in RunE.message:
                # Request URI too long: retry in smaller chunks
                for c in chunks(chunk, 16):
                    try:
                        seeds_peers = scraper.scrape(c[0][0], [i[1] for i in c])
                        process_chunk(c, data_path, seeds_peers)
                    except BaseException as e:
                        log.print_tb(e)
            continue
        except BaseException as e:
            log.print_tb(e)
            continue

        process_chunk(chunk, data_path, seeds_peers)
def main():
    parser = argparse.ArgumentParser(description='Bulk Saving Of Job Posts')
    parser.add_argument('-q', help='Search Term Query')
    parser.add_argument('-l', help='Location')
    parser.add_argument('-o', help='Output location')
    parser.add_argument('--json', help='Export Json File')
    parser.add_argument('--xlsx', help='Export Excel File')
    args = parser.parse_args()

    if args.l is None:
        print "Must include location with: -l 'location'"
        exit()
    if args.q is None:
        print "Must include search term with: -q 'job title'"
        exit()
    if args.q is not None and args.l is not None:
        output = ''
        if args.o is None:
            fileName = args.l.replace(' ', '-').replace(',', '') + '-' + args.q.replace(' ', '-')
            output = os.getcwd() + '/' + fileName + '.xlsx'
        else:
            output = args.o
        scraper.scrape({
            'location': args.l,
            'search': args.q,
            'output': output
        })
def concertScrape(limitOfNew):
    key = 'Dv4TzTBMtO5GJ57Dcrf0Jbxst8fEQHLx'
    secret = 'lgAN5MVcjmUM30rB'
    url = ('https://app.ticketmaster.com/discovery/v2/events.json?size=' +
           str(limitOfNew) + '&classificationName=concert&apikey=' + key)
    contents = urllib2.urlopen(url).read()
    data = json.loads(contents)
    for i in range(len(data["_embedded"]["events"])):
        try:
            event = data["_embedded"]["events"][i]
            # Get all necessary info from the JSON payload
            creator = event["promoter"]["name"]
            date = event["dates"]["start"]["localDate"]
            name = event["name"]
            url = event["images"][0]["url"]
            desc = event["url"]
            # Convert from unicode to str
            creator = scraper.uni_to_str(creator)
            date = scraper.uni_to_str(date)
            name = scraper.uni_to_str(name)
            url = scraper.uni_to_str(url)
            desc = scraper.uni_to_str(desc)
            print(creator, date, name, url, desc)
            scraper.scrape(creator, name, desc, date, "concert", url)
        except:
            continue
def main():
    creds = config.get_creds()
    sftp.download(creds.get("sftp_url"), creds.get("sftp_username"),
                  creds.get("sftp_password"), creds.get("localpath"))
    cleaner.clean(creds.get("localpath"))
    merge.merge(creds.get("localpath"))
    scraper.scrape(creds)
def main():
    if len(sys.argv) <= 1:
        printUsageAndExit()
    animes = sys.argv[1:]
    anime_urls = [utils.getPageUrl(anime) for anime in animes]
    s.scrape(anime_urls)
def predict(pages, team1, team2, m):
    # Make sure we have data on these teams
    if not isTop20(team1):
        print("Team 1 is not a top 20 team")
        return
    if not isTop20(team2):
        print("Team 2 is not a top 20 team")
        return
    if not isMap(m):
        print("The map is not in our pool")
        return
    if pages > 0:
        print("Scraping process will take some time, please be patient")
        print("**********Scraping Map Results now**********")
        scrape(pages)
        print("**********Scraping Map Stats now**********")
        scrape_map_stats()
        print("**********Filtering Data now**********")
        filterCSV()
    print("**********Generating Data now**********")
    # Make sure we have a "filtered_top20.csv" file to examine (in case the user doesn't scrape)
    if os.path.isfile('filtered_top20.csv'):
        data = getDataReady()
        tree = build_tree_id3(data)
        boolean = {True: team1, False: team2}
        print("{} would win.".format(boolean[classify(tree, userInputStats(team1, team2, m))]))
    else:
        print('"filtered_top20.csv" was not found. Please scrape for data before attempting to predict')
def test_scrape_raises_http_error():
    with mock.patch('scraper.urlopen') as urlopen_mock:
        urlopen_mock.side_effect = HTTPError('http://example.org', 404,
                                             'Not found', {}, mock.Mock())
        with pytest.raises(HTTPError) as exc:
            scraper.scrape('http://example.org')
        assert exc.value.code == 404
        assert exc.value.msg == 'Not found'
def lambda_handler(event, context):
    # Run scraping function
    scrape()
    # Query Spotify API, write into JSON file and upload it to S3
    query_spotify_api()
    print("The function ran successfully.")
def home():
    try:
        scraper.scrape()
        return 'Ran Successfully'
    except Exception as e:
        if app.config['DEBUG']:
            return str(e)
        else:
            return 'Error Encountered'
def print_headlines():
    # clear the list so we only get the latest headlines
    headlines.clear()
    # run the scraper over each paper
    for url in urls.values():
        scrape(url)
    # the function returns the final list
    return response
def fetchProfiles(initURL, maxcount):
    """Given the URL from where to initiate the crawling, it first fetches the
    webpage and sends it to the crawler for scraping data from the webpage.
    It also reads all the public profile urls present in the current page and
    adds them to the list. In subsequent iterations, it will fetch the LinkedIn
    profiles of people associated with these urls. The iteration continues for
    the number of times specified by maxcount."""
    count = 0
    links = set([initURL])
    waitinglist = list()
    start = datetime.now()
    while count < maxcount:
        count += 1
        while len(links) > 0:
            newreq = links.pop()
            if newreq not in waitinglist:
                # If the url hasn't been used already, add it to the waiting list
                waitinglist.append(newreq)
                break
        try:
            page = urllib2.urlopen(waitinglist[-1]).read()  # Fetch the web page from the url just appended
            scraper.scrape(page, waitinglist[-1])  # Send the page and the url for scraping
            if len(links) < 3:
                links.update(profileURL.findall(page))  # Get all the urls present in this web page
        except:
            pass
        links = set([link.strip('"') for link in links])  # String processing to remove quotes
        percentage = int(count * 100.0 / maxcount)  # Progress bar
        sys.stdout.write('\r' + '=' * percentage + '>' + ' ' * (101 - percentage) + str(percentage) + '%')
        sys.stdout.flush()
    print 'Fetched', count, 'profiles in', \
        (datetime.now() - start).total_seconds(), 'seconds'
    start = datetime.now()
    classifier.classify()  # Classify all profiles in the database [TODO: classify only updated portion of db]
    print 'Classified all profiles in database in', \
        (datetime.now() - start).total_seconds(), 'seconds'
    indexer.computeIndexes()  # Compute indexes for every profile in the database [TODO: same as above]
    print 'Calculated indexes for all profiles in database in', \
        (datetime.now() - start).total_seconds(), 'seconds'
def main():
    log.info("Starting Bidwire run")
    start = time.time()
    scraper.scrape()
    log.info("Scraping complete. Sending notifications.")
    new_bids = notifier.send_new_bids_notifications(EMAIL_RECIPIENTS)
    elapsed_secs = time.time() - start
    log.info("Notification sending complete. Sending debug email.")
    DebugEmail().send(new_bids, EMAIL_RECIPIENTS, elapsed_secs)
def main():
    log.info("Starting Bidwire run")
    start = time.time()
    scraper.scrape(SITE_CONFIG)
    log.info("Scraping complete. Sending notifications.")
    new_bids = notifier.send_new_notifications(SITE_CONFIG)
    elapsed_secs = time.time() - start
    log.info("Notification sending complete. Sending debug email.")
    DebugEmail().send(new_bids, SITE_CONFIG, elapsed_secs)
def scrape_companies_data(
        company_names: List[str],
        use_cache: bool = False,
        n: int = 2147483647,
        skip_companies: Set[str] = set()
) -> Tuple[List[Company], List[FailedCompanyError]]:
    errors = []
    output_data = []
    for i, company_name in enumerate(company_names):
        if i >= n:
            break
        if company_name in skip_companies:
            print(f'[INFO] Skip scraping {company_name}')
            continue
        try:
            company_id = company_name.replace(' ', '_').lower()
            company = Company(id=company_id)
            overview_url, reviews_url = scraper.get_glassdoor_urls(company_name)
            print('[INFO]', company_name, overview_url, reviews_url)
            if overview_url is None or reviews_url is None:
                raise Exception(
                    f'Cannot find both URLs for "{company_name}": {overview_url} {reviews_url}'
                )
            reviews_data = scraper.scrape(reviews_url,
                                          f'{company_name}_reviews.html',
                                          scraper.get_reviews_data)
            overview_data = scraper.scrape(overview_url,
                                           f'{company_name}_overview.html',
                                           scraper.get_overview_data)
            data = {
                'name': company_name,
                'overview_url': overview_url,
                'reviews_url': reviews_url,
                'linkedin_url': scraper.get_linkedin_url(company_name),
            }
            data.update(reviews_data)
            data.update(overview_data)
            company.update_data(data)
            output_data.append(company)
        except Exception as e:
            print(f'[FAIL] caught exception when parsing "{company_name}"')
            errors.append(
                FailedCompanyError(
                    company_name=company_name,
                    exception=e,
                ))
    return output_data, errors
def get(self, college, year, branch, low, high, semc):
    scraper.scrape(college, year, branch, low, high, semc)
    filename = 'ExcelFiles/' + '1' + college + year + branch + low + '-' + high
    extension = '.xls'
    zipf = zipfile.ZipFile('Results-Excel.zip', 'w', zipfile.ZIP_DEFLATED)
    files = [
        filename + extension,
        filename + 'GPA' + extension,
        filename + 'RANK' + extension
    ]
    for file in files:
        zipf.write(file)
    zipf.close()
    return send_from_directory('', 'Results-Excel.zip')
def start(parameters):
    try:
        db_connection = connector.connect(user='******', password='******',
                                          host='localhost', database='EAGLEEYE')
        scraper.scrape(parameters, db_connection)
    except:
        time.sleep(5)
        print("Database down, trying to connect...")
        start(parameters)
def test_scrape(self, source):
    test_limit = 3
    web_df = scrape(
        source=source,
        limit=test_limit,
        test=True,
        since=str(datetime.datetime.now().date() - datetime.timedelta(7)),
    )
    self.assertEqual(len(web_df), test_limit)
    web_df = scrape(source=source, limit=test_limit, test=True, since="2019-09-17")
    self.assertEqual(len(web_df), test_limit)
def update_model():
    print("Scraping posts...")
    scrape(500000)
    print("Building model...")
    with open('titles.txt', encoding='utf8') as f:
        titles = f.read()
    model = markovify.NewlineText(titles)
    print("Exporting model...")
    model_json = model.to_json()
    with open('model.json', 'w') as f:
        f.write(model_json)
    print("Done!")
def read_majors(game_id=int(db_game), year=int(db_year), base=None, current=False):
    set_readin_args(args)
    # slugs = ["genesis-5", "summit6", "shine2018", "tbh8", "summit7"]
    fails = []
    scrape_load = False
    slug_given = False
    if db_slug is None:
        if to_load_slugs:
            scrape_load = True
            if v >= 3 and year == int(db_year):
                print('Loading saved slugs...')
            slugs = load_slugs(game_id, year)
            if slugs == False or slugs == []:
                if v >= 3:
                    print('Saved slugs not found.')
                slugs = scraper.scrape(game_id, year, v)
                scrape_load = False
        else:
            slugs = scraper.scrape(game_id, year, v)
        fails = [event[1] for event in slugs if type(event) is tuple]
        slugs = [event for event in slugs if type(event) is str]
    elif type(db_slug) is list:
        slugs = db_slug
        slug_given = True
    else:
        slugs = [db_slug]
        slug_given = True

    if v >= 3 and not scrape_load and not slug_given:
        if len(slugs) <= 0:
            print('No slugs found for game %d in year %d:' % (game_id, year))
        else:
            print('Scraped the following slugs for game %d in year %d:' % (game_id, year))
            print(slugs)
    if not fails == [] and v > 0:
        print('The following majors could not be read (no smash.gg bracket found)')
        print(fails)
    if to_save_db and not scrape_load and not slug_given:
        save_slugs(slugs, game_id, year, to_save_db=to_save_db)

    return read_tourneys(slugs, ver=game_id, year=year, base=base, current=current)
def handle_scrape():
    username = request.authorization.username
    password = request.authorization.password
    sdate = request.args.get("sdate")
    edate = request.args.get("edate")
    try:
        scrape(username, password, sdate, edate)
    except NoSuchElementException as e:
        return jsonify({"status": "failure"}), 400
    return jsonify({"status": "success"}), 200
def renameAll(rootDir, metadataFile):
    try:
        season = os.path.basename(rootDir)
        metadata = scraper.scrape(open(metadataFile))
    except ValueError:
        print "Couldn't parse the season from the given directory."
        return 0
    files = os.listdir(rootDir)
    parsedFiles = []
    pattern = re.compile(r"([\w ]*) - (\d\d)x(\d\d)(.*)")
    for f in files:
        matches = pattern.search(f).groups()
        orgFile = os.path.join(rootDir, f)
        show = matches[0]
        season = int(matches[1])
        episode = int(matches[2])
        newName = '{0} - {1:0>2}x{2:0>2} - {3}{4}'.format(show, season, episode,
                                                          metadata[season][episode], matches[3])
        result = raw_input('Moving {0} to {1}. Continue? (y/n)'.format(f, newName))
        if result == 'y':
            os.rename(orgFile, os.path.join(rootDir, newName))
            parsedFiles.append({'file': orgFile, 'newPath': os.path.join(rootDir, newName)})
    return parsedFiles
def buildVideoIndex(url):
    data = scraper.scrape(url)
    nextLinkUrl = scraper.scrapeNextPageLink(url)
    for name, info_url, img, date in data:
        addLink(name, info_url, 3, img)
    if nextLinkUrl is not None:
        addDir("[Next Page >>]", nextLinkUrl, 2, '')
def search_scene():
    global search, running, show_books
    while search:
        # This is the search scene: an input box where the user types the book title
        screen.fill(GRAY)
        search_box.update()
        search_box.draw(screen)
        pygame.display.flip()
        for ev in pygame.event.get():
            if ev.type == pygame.QUIT:
                # Leaving the search scene should shut the whole thing down
                running = False
                search = False
            # The box class handles clicks on the box, enter, backspace and typed text
            search_box.handle_event(ev)
            if search_box.enter:
                search_book = search_box.rText
                show_books = True
                search = False
                for book_dict in scraper.scrape(search_book):
                    # Put the scraper results into the result lists
                    titles.append(book_dict["book"])
                    authors.append(book_dict["author"])
                    book_ids.append(book_dict["book_id"])
                    downloads.append(book_dict["downloads"])
    return images, titles
def scraper():
    # delete_all_potential_locations()
    loopnetListings = scrape()
    name_set = {}
    for index, row in loopnetListings.iterrows():
        address_map = {}
        address = row[0]
        address_map['address'] = address
        components = address.split(",")
        try:
            address_map['street'] = components[0]
            address_map['city'] = components[1]
            address_map['state'] = components[2]
        except:
            print("Exception: invalid format of address")
            continue
        name = row[1]
        if name_set.get(name) is None:
            name_set[name] = 1
        else:
            name = name + " " + str(name_set.get(name))
        lat, lon = get_lat_long(address)
        try:
            store_scraped_in_google(address_map, name, lat, lon)
        except:
            print("Exception: Could not store in Google")
def test_english_detection(self):
    from translation import Translator
    my_translator = Translator(None)
    result = scraper.scrape(
        "http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNFY1KzEAhaiZchzd5ulmoY4_4P8kA&url=http://vov.vn/Van-hoa/NSND-Thanh-Hoa-xuc-dong-hat-truoc-benh-nhan/228256.vov")
    self.assertFalse(result.get('unscrapable'))
    text_obj = process_resources.extract_clean_content(result['htmlContent'])
    self.assertFalse(my_translator.is_english(text_obj['content']))
def scrape_thread(cur2, pbar, count, qhashs, nth, total, ip="open.demonii.com"):
    db = MySQLdb.connect(**config.mysql)
    cur = db.cursor()
    last_commit = time.time()
    errno = 0
    try:
        l = qhashs.get(timeout=0)
        while True:
            try:
                for hash, info in scraper.scrape("udp://%s:1337/announce" % ip, l).items():
                    cur.execute("UPDATE torrents SET scrape_date=NOW(), seeders=%s, leechers=%s, downloads_count=%s WHERE hash=%s",
                                (info['seeds'], info['peers'], info['complete'], hash))
                if time.time() - last_commit > 30:
                    db.commit()
                    last_commit = time.time()
                pbar.update(min(pbar.currval + len(l), count))
                l = qhashs.get(timeout=0)
                errno = 0
            except (socket.timeout, socket.gaierror, socket.error):
                db.commit()
                time.sleep(0.1 * errno + 0.1)
                errno += 1
                if errno > 10:
                    raise
    except queue.Empty:
        pass
    finally:
        db.commit()
        cur.close()
        db.close()
def display_tables():
    max_results_per_state = 10
    state_set = ["California", "New York", "Washington", "Illinois", "Texas"]
    data = {"Job Title": [], "Company": [], "Location": []}
    dataframe = scraper.scrape(max_results_per_state, state_set, data)
    return render_template("table.html", dataframe=dataframe.to_html())
def get_torrent_stats(url):
    response = requests.get(url)
    data = bencodepy.decode(response.content)
    files = data[b'info'][b'files']
    size = 0
    for file in files:
        size += file[b'length']
    size = size * 1e-9
    size = round(size, 2)
    info_hash = hashlib.sha1(bencodepy.bencode(data[b"info"])).hexdigest()
    trackers_list = data[b'announce-list']
    stats = {'seeds': 0, 'peers': 0}
    for tracker_url in trackers_list:
        tracker_url = tracker_url[0].decode('utf-8')
        result = scrape(tracker_url, [info_hash])
        if not result:
            continue
        if result[info_hash]['seeds'] is None or result[info_hash]['peers'] is None:
            continue
        stats['seeds'] = max(stats['seeds'], result[info_hash]['seeds'])
        stats['peers'] = max(stats['peers'], result[info_hash]['peers'])
    stats['size_gb'] = size
    return stats
def scheduled():
    # Print the time the cron job was executed
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    print("Cron Job Executed at " + dt_string)
    # Find users who signed up for the cron job
    users = mongo.db.users.find({'cron': {"$exists": True}})
    # For each user who has signed up, find the jobs they are interested in
    for u in users:
        test = scraper.scrape()
        jobs = []
        employer = []
        links = []
        jobs, employer, links, count = test.search(u['cron']['cron_job'],
                                                   u['cron']['cron_loc'], True)
        j = jobbankapply.apply(links)
        emails, jobs, employer = j.run()
        # Find the user's CV and resume
        cv_data = u['cv']
        cv_data = cv_data.encode('latin-1', 'replace').decode('latin-1')
        resume = u['resume']
        resume = resume.encode('latin-1', 'replace').decode('latin-1')
        # Apply for jobs on behalf of the user
        j.email(emails, jobs, employer, cv_data, resume, u['id'], u['email'])
def run(args):
    styler = getStyle(args.style[0])
    if not styler:
        raise RuntimeError("Cannot find output style " + args.style + ".")
    print "Loaded Style: \t" + styler.name + "."

    print "Scraping Website"
    web = scrape(args.url, styler, cache=(not args.no_cache))
    if len(web[1]) == 0:
        raise RuntimeError("No valid pages found!")
    web = (styler.edit_book_metadata(web[0]), web[1], web[2])

    fout = title_fn(web[0]["author"] + " - " + web[0]["title"])
    if args.out and len(args.out) > 0 and len(args.out[0]) > 0:
        # If args.out is a directory, use our custom filename;
        # otherwise use the given filename.
        if args.out[0][-1] == "/":
            fout = args.out[0] + fout
        else:
            fout = args.out[0]
    if not fout.lower().endswith(".epub"):
        fout = fout + ".epub"

    if args.no_overwrite and path.exists(fout):
        print "Skipped! \"" + fout + "\" exists."
    else:
        print "Building ePub"
        epub(web, fout, styler, args)
        print "Done! Written output to \"" + fout + "\""
def async_handler():
    conn = sqlite3.connect('test2.db', check_same_thread=False)
    c = conn.cursor()
    while True:
        c.execute("SELECT token, period, request_id FROM test WHERE score = 'unready'")
        request = c.fetchone()
        if request is not None:
            print('Picking up request')
            request_id = request[2]
            date = request[1]
            token = request[0]
            dates = period(date.split(" ")[0], date.split(" ")[1])
            posts = scrape(dates, token)
            comments = get_comments(token, posts, dates, request_id)
            result = predict(comments)
            predictions_df = result[1]
            c.execute("UPDATE test SET score = ? where request_id = ?",
                      (result[0], request_id))
            c.execute("UPDATE test SET comments = ? where request_id = ?",
                      (predictions_df, request_id))
            conn.commit()
            print('Request handled')
            continue
        time.sleep(0.5)
def mooving(ticker):
    """
    Retrieve the dataframe created by the scrape function and run a moving
    average on it to predict tomorrow's prices.
    """
    df = scrape(ticker)
    df_cut = pd.DataFrame(df, columns=['date', 'closing', 'SMA', 'EMA_Short', 'EMA_Long'])
    df_cut.closing = np.around(df_cut.closing, decimals=2)
    window = round(len(df_cut) * 0.2)

    # Simple Moving Average
    preds = []
    for i in range(window):
        x = df_cut.closing[(len(df_cut) - 2 * window + i):(len(df_cut) - window + 1)].sum() + sum(preds)
        x_mean = x / window
        preds.append(np.around(x_mean, decimals=2))
        df_cut['SMA'][len(df_cut) - window + i] = preds[i]
    rms_sma = np.sqrt(mean_squared_error(
        np.array(df_cut['closing'][(len(df_cut) - window):]),
        np.array(df_cut['SMA'][(len(df_cut) - window):])))

    # Tomorrow's predicted price (weighted)
    num = 0
    denom = 0
    for j in range(window):
        num += j * (df_cut.closing[len(df_cut) - window + j])
        denom += j
    pred_weighted = num / denom
    print(df_cut)
    return pred_weighted
def all():
    a = scrape()
    return jsonify({
        "head": "from covid-19-generator",
        "status": 200,
        "body": a
    })
def mobile():
    stop = request.args.get("stop", 1, type=int)
    schedule = scrape(stop)
    if schedule:
        response = dict(meta=dict(status=200, message="OK"), data=schedule)
    else:
        abort(400)
    return render_template("m.html", path=response)
def api():
    stop = request.args.get("stop", 1, type=int)
    schedule = scrape(stop)
    if schedule:
        response = jsonify(meta=dict(status=200, message="OK"), data=schedule)
    else:
        abort(400)
    return response
def second_scrape():
    fir_url_box = list(first_scrape())
    print "i am in second"
    for ele in fir_url_box:
        print ele
        for item in scrape(ele, '//a[@class="img"]/@href'):
            yield item
def api(code):
    try:
        data = scrape(code)
    except:
        data = None
    if not data:
        abort(404, {'message': 'Couldn\'t retrieve data for program %s.' % code})
    return jsonify(meta=dict(status=200, message='OK'), data=data)
def single(port):
    try:
        wait_times = scrape(port)
    except KeyError:
        abort(404, {'message': 'Invalid `port` value.'})
    if not wait_times:
        abort(400)
    return jsonify(meta=dict(status=200, message='OK'), data=wait_times)
def scrape(docs_dir, process=None, article_version=None):
    if docs_dir is not None:
        import scraper
        mod = __import__(__name__)
        res = scraper.scrape(mod, doc=docs_dir, article_version=article_version)
        if process:
            res = process(res)
        import json
        res = json.dumps(res, indent=4, ensure_ascii=False)
        return res.encode('utf8')
def web_count(name, levels, out=None):
    folder = scrape(name, levels, out)
    file_list = folder_reader(folder)
    count = Counter()
    count.name = folder
    count.source = name
    if file_list is None:
        return
    for file in file_list:
        count.count(folder + "/" + file)
    if out is None:
        out = os.getcwd()
    writer(out + "/report.JSON", json_formulate(count))
def insert_movie():
    url = 'http://www.imdb.com/movies-coming-soon'
    url_exist = mongo.db.movies.find_one({'url': url})
    action = ''
    if url_exist is None:
        movies = mongo.db.movies.insert_many(scraper.scrape(url))
        action = 'Scrape movies and add to database'
    else:
        # Get movies from database
        action = 'Get movies from database'
    return action
def lookup(coursestring):
    courses = coursestring.split("+")
    entries_by_day = {}
    for course in courses:
        # check whether the user provided a valid day, otherwise use today
        day_param = request.args.get('day')
        if day_param is not None and verify_date_param(day_param):
            coursedata = scraper.scrape(course, day_param)
        else:
            coursedata = scraper.scrape(course)
        # this data is from one particular course;
        # we take the individual moments and put them in the dict, separated by day
        bucketadd(entries_by_day, "day", coursedata)

    # this will end up containing Weekday objects, which also contain the courses for that day
    weekdays_with_courses = []
    for date, entries in entries_by_day.items():
        weekdays_with_courses.append(Weekday(date, entries))

    # sort the Weekdays based on their weekindex, so monday comes first and sunday last
    sorted_data = sorted(weekdays_with_courses, key=lambda x: x.weekindex)
    print(sorted_data)
    return render_template("lookup.html", days=sorted_data)
def scrape_now(fn):
    debug(fn)
    tp = TorrentPlayer()
    tp.AddTorrent(fn)
    data = tp.GetLastTorrentData()
    debug(str(data))
    if data:
        hashes = [data['info_hash']]
        import scraper
        res = scraper.scrape(data['announce'], hashes)
        debug(str(res))
        return res[data['info_hash']]
    else:
        return {}
def scrape_thread(cur2, pbar, count, qhashs, nth, total, ips=["open.demonii.com"]):
    db = MySQLdb.connect(**config.mysql)
    cur = db.cursor()
    last_commit = time.time()
    errno = 0
    nip = len(ips)
    banip = collections.defaultdict(int)
    i = nth
    try:
        l = qhashs.get(timeout=0)
        while True:
            try:
                ip = ips[i % len(ips)]
                i += 1
                for hash, info in scraper.scrape("udp://%s:1337/announce" % ip, l).items():
                    cur.execute("UPDATE torrents SET scrape_date=NOW(), seeders=%s, leechers=%s, downloads_count=%s WHERE hash=%s",
                                (info['seeds'], info['peers'], info['complete'], hash))
                if time.time() - last_commit > 30:
                    db.commit()
                    last_commit = time.time()
                pbar.update(min(pbar.currval + len(l), count))
                l = qhashs.get(timeout=0)
                errno = 0
            except (socket.timeout, socket.gaierror, socket.error):
                db.commit()
                banip[ip] += 1
                if banip[ip] > 3:
                    try:
                        ips.remove(ip)
                    except ValueError:
                        pass
                    if not ips:
                        raise ValueError("all ips failed")
                time.sleep(0.1 * errno + 0.1)
                errno += 1
                if errno > nip * 3:
                    raise
    except (queue.Empty, ZeroDivisionError):
        pass
    except (socket.timeout, socket.gaierror, socket.error):
        qhashs.put(l)
    except (ValueError, RuntimeError) as e:
        print e
    finally:
        db.commit()
        cur.close()
        db.close()
def createURL(request):
    latest_fanfic_list = FanFic.objects.all().order_by('-pub_date')[:11]
    if request.method == 'POST':
        form = CreateURLForm(request.POST)
        if form.is_valid():
            new_fanfic = form.save(commit=False)
            # banti's code -- scraping from url
            d = scraper.scrape(form.cleaned_data['url'])
            new_fanfic.title = d['title']
            new_fanfic.author = d['author']
            new_fanfic.text = d['text']
            new_fanfic.fandom = d['fandom']
            if d['text'] == '':
                new_fanfic.text = d['summary']
            print "text of fanfic: " + new_fanfic.text
            # alyssa's code -- getting keywords
            kwlist = my_immortal_keyword_finder.getwords(new_fanfic.text)
            try:
                new_fanfic.profile = str(image_return.googlePrep(d['fandom']))
            except:
                print "F**K the profile picture"
            new_fanfic.save()
            for kw in kwlist:
                kw = kw.strip()
                # banti's code -- getting image urls
                try:
                    new_fanfic.keyword_set.create(key_word=kw,
                                                  image_url=str(image_return.googlePrep(kw)))
                except:
                    print kw + " is f****d"
            return HttpResponseRedirect('/fanfics/' + str(new_fanfic.id))  # Redirect after POST
    else:
        form = CreateURLForm()  # An unbound form
    return render(request, 'fanfics/createURL.html', {
        'form': form,
        'latest_fanfic_list': latest_fanfic_list
    })
def third_scrape():
    print "i am in third"
    for ele in second_scrape():
        for item in scrape(ele, '//a[contains(@href,"images")]/@href'):
            yield item
import csv
import scraper

with open('Alexs_workout_history.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        scraper.scrape(row[14])
""" Main Execution File for Scheduler """ from scraper import scrape from ical import make_calendar if __name__ == "__main__": make_calendar(scrape()) print("Your calendar has been saved to this directory as 'UNT_schedule.ics'.")
def call(self):
    generated_eif = scraper.scrape(feeds, doc=xml_path)[0]['article'][0]
    expected_eif = json.load(open(eif_file))
    self.assertEqual(byteify(expected_eif), byteify(generated_eif))
def inject_methods():
    this_dir = os.path.abspath(os.path.dirname(__file__))
    source_xml_dir = join(this_dir, 'JATS')
    source_eif_dir = join(this_dir, 'EIF')
    source_partial_dir = join(this_dir, 'EIF', 'partial')

    # returns a map of {fname: /path/to/fname, ...} for the given `parent` dir
    def path_map(parent):
        paths = map(lambda fname: join(parent, fname), os.listdir(parent))
        paths = filter(os.path.isfile, paths)
        return dict(zip(map(os.path.basename, paths), paths))

    # creates absolute paths to the EIF fixtures
    xml_path_list = path_map(source_xml_dir)
    eif_path_list = path_map(source_eif_dir)
    partial_eif_path_list = path_map(source_partial_dir)

    def xml_fname_to_eif(xml_fname, xml_path):
        return join(source_eif_dir, os.path.splitext(xml_fname)[0] + ".json")

    for xml_file, xml_path in xml_path_list.items():
        eif_file = xml_fname_to_eif(xml_file, xml_path)
        if not os.path.exists(eif_file):
            LOG.info('skipping %s, path `%s` not found', xml_file, eif_file)
            continue

        def _fn1(xml_path, eif_file):
            def call(self):
                generated_eif = scraper.scrape(feeds, doc=xml_path)[0]['article'][0]
                expected_eif = json.load(open(eif_file))
                self.assertEqual(byteify(expected_eif), byteify(generated_eif))
            return call

        slug = xml_file.replace('-', '_').replace(' ', '').replace('/', '_')
        setattr(TestContent, 'test_eif_%s' % slug, _fn1(xml_path, eif_file))

    # handle partials
    def xml_fname_to_eif_partial(xml_fname, xml_path):
        return join(source_partial_dir, os.path.splitext(xml_fname)[0] + "-match.json")

    for xml_file, xml_path in xml_path_list.items():
        eif_path = xml_fname_to_eif_partial(xml_file, xml_path)
        if not os.path.exists(eif_path):
            LOG.info('skipping %s, path `%s` not found', xml_file, eif_path)
            continue
        generated_eif = scraper.scrape(feeds, doc=xml_path)[0]['article'][0]
        # a list of maps with keys 'description' and 'data'
        eif_partial_tests = json.load(open(eif_path))
        for test in eif_partial_tests:
            if not test.has_key('description') or not test.has_key('data'):
                LOG.debug('description or data elements not found in file %r, skipping', eif_path)
                continue
            desc, expected_eif = test['description'], test['data']
            for element, expected_partial_eif in expected_eif.items():
                has_key = generated_eif.has_key(element)

                def _fn2(eif, expected_partial_eif):
                    def call(self):
                        self.assertTrue(has_all_keys(expected_partial_eif, ['description', 'data']))
                        self.assertEqual(byteify(expected_partial_eif), byteify(eif[element]))
                    return call

                slug = eif_path.replace('-', '_').replace(' ', '').replace('/', '_')
                setattr(TestContent, 'test_partial_%s' % slug, _fn2(xml_path, eif_file))
def setUp(self):
    self.ribeye_url = "http://www.bonappetit.com/recipe/salt-and-pepper-rib-eye"
    self.cauliflower_url = "http://www.bonappetit.com/recipe/roasted-cauliflower-with-lemon-parsley-dressing"
    self.ribeye_scrape = scraper.scrape(self.ribeye_url)
    self.cauliflower_scrape = scraper.scrape(self.cauliflower_url)
from pymongo import MongoClient
from scraper import scrape
import preprocess

if __name__ == '__main__':
    client = MongoClient()
    collection = client.twitter.tweets
    scrape('#Christmas', 150000, collection)
    preprocess.count_date(collection.find())
    preprocess.analyze_term(collection.find())
    preprocess.to_geojson(collection.find({'coordinates': {'$exists': 1}}))
    preprocess.construct_retweets_graph(collection.find())
    client.close()
def initLocationClassifier():
    """Initialize Location Classifier"""
    cities = open('data/indiancities', 'r').readlines()
    classes = [({'name': city.split()[0]}, city.split()[1]) for city in cities]
    return nltk.NaiveBayesClassifier.train(classes)


location_classifier = initLocationClassifier()


def classify():
    """Classify ALL the profiles in the database
    [TODO]: Allow classification to run only on a selected list of profiles"""
    for profile in dbinterface.collection.find():
        first_name = profile['first_name']
        locality = profile['locality'].split()[0]
        gender, area = None, None
        # Classifiers
        if not profile.has_key('gender') or not profile.has_key('area'):
            gender = gender_classifier.classify(gender_features(first_name))
            area = location_classifier.classify({'name': locality})
            dbinterface.collection.update({'public_profile_url': profile['public_profile_url']},
                                          {'$set': {'gender': gender, 'area': area}})


if __name__ == '__main__':
    page = open('reference.profile.2', 'r')
    import scraper
    resume = scraper.scrape(page, 'http://www.example.com/')
    classify()
def first_scrape():
    fir_url_box = scrape('http://ukiyo-e.org/', '//a[contains(@href,"artist")]/@href')
    print "i am in first"
    print fir_url_box
    return fir_url_box