def __init__(self, config_parser):
    # Connect to engine
    database_path = get_from_config_parser(config_parser, 'Database', 'path', 'database')
    database_debug = get_boolean_from_config_parser(config_parser, 'Database', 'debug', False)
    dir = os.path.dirname(database_path)
    if not os.path.exists(dir):
        mkdir(dir)
    sys.stderr.write('Connecting to database at "%s"\n' % database_path)
    self._engine = create_engine('sqlite:///%s' % database_path, echo=database_debug)
    # Start session
    Session = sessionmaker(bind=self._engine)
    self._session = Session()
    # Initialize feed storage
    self._feed_storage = FeedStorage(self._engine, self._session)
    # Initialize item storage
    self._item_storage = ItemStorage(self._engine, self._session)
    # A list of subprocess.Popen processes that will be maintained
    # by the Coffer object.
    self._external_processes = []
    # File storage (data dump)
    file_storage_path = get_from_config_parser(config_parser, 'FileStorage', 'path', 'datadump')
    max_block_size = get_int_from_config_parser(config_parser, 'FileStorage', 'max-block-size',
                                                file_storage.DEFAULT_MAX_BLOCK_SIZE)
    bzip2_path = get_from_config_parser(config_parser, 'FileStorage', 'bzip2-path', '/usr/bin/bzip2')
    self._file_storage = FileStorage(self._external_processes, file_storage_path,
                                     max_block_size, bzip2_path)
    # Content fetcher configuration
    self._fetcher = Fetcher(config_parser)
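A minimal sketch of a config object that would satisfy the options read by the constructor above; the section and option names are taken directly from the get_*_from_config_parser calls, while the concrete paths and values, and the Coffer class name (mentioned only in a comment), are placeholder assumptions:

import configparser

config_parser = configparser.ConfigParser()
config_parser.read_dict({
    'Database': {'path': 'coffer.db', 'debug': 'false'},
    'FileStorage': {
        'path': 'datadump',
        'max-block-size': '1048576',   # falls back to file_storage.DEFAULT_MAX_BLOCK_SIZE if omitted
        'bzip2-path': '/usr/bin/bzip2',
    },
})
# coffer = Coffer(config_parser)   # hypothetical class name; constructor shown above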
def serve_user_podcast(username):
    limit = request.args.get("limit")
    fetcher = Fetcher(request.environ['YOUTUBERSS_CONFIG'])
    podcast, upload_playlist = fetcher.get_user_data(username)
    return serve(fetcher, podcast, upload_playlist, limit)
def __init__(self, packageName, debug):
    # Variables for tests
    self.logger = logging.getLogger("tmLogger")
    if debug:
        self.logger.setLevel(logging.DEBUG)
    # Use self.packageName as a directory name
    self.packageName = "%s/" % packageName
    # Assign directories to be worked on
    self.workDir = "/tmp/testManager"
    self.repo = urlparse.urljoin("http://cekirdek.pardus.org.tr/~serbulent/test_guides/",
                                 self.packageName)
    self.saveDir = os.path.join("/tmp/testManager", self.packageName)
    self.filesDir = os.path.join(self.saveDir, "files")
    self.configFile = os.path.join(self.saveDir, "testProcess.conf")
    # Create test directories
    for item in (self.workDir, self.saveDir, self.filesDir):
        if not os.path.isdir(item):
            try:
                os.mkdir(item)
                debugMsg = "%s created" % item
                self.logger.debug(debugMsg)
            except OSError:
                errorMsg = "An error occurred when creating directory %s" % item
                self.logger.error(errorMsg)
    # Read package configuration
    self.fetcher = Fetcher(debug)
    url = urlparse.urljoin(self.repo, "testProcess.conf")
    self.fetcher.download(url, self.configFile)
    cfr = ConfReader(self.configFile)
    self.params = cfr.read()
    self.fetchFiles()
def run(limit=-1):
    if limit != -1 and store.count() > limit:
        print 'Limit: ' + str(limit) + ' Count: ' + str(store.count())
        print 'Voila, we are done!'
        import sys
        sys.exit()
        return
    if swap.current.is_empty():
        if swap.next.is_empty():
            print 'Somehow, we are done'
            return
        else:
            # we exchange current and next, run again
            swap.exchange()
            return run(limit)
    # get the url from the current (to-be-crawled) list of urls
    url = swap.current.pop()
    if not url or not valid_url(url):
        print '"%s" is not a valid url, giving up on it' % url
        return run(limit)
    f = Fetcher(url)
    next_urls = store.process(f.url(), f.child_urls())
    for url in next_urls:
        swap.next.push(url)
    return run(limit)
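Because run() recurses once per processed URL, long crawls can hit CPython's default recursion limit (about 1000 frames). A loop-based sketch of the same logic, assuming the same module-level store and swap objects and the valid_url() helper (it returns instead of calling sys.exit() when the limit is reached):

def run_iterative(limit=-1):
    # Loop form of run() above; same assumed store/swap interfaces.
    while True:
        if limit != -1 and store.count() > limit:
            break                      # the recursive version calls sys.exit() here
        if swap.current.is_empty():
            if swap.next.is_empty():
                break                  # nothing left anywhere
            swap.exchange()            # swap in the next frontier and keep going
            continue
        url = swap.current.pop()
        if not url or not valid_url(url):
            continue
        f = Fetcher(url)
        for next_url in store.process(f.url(), f.child_urls()):
            swap.next.push(next_url)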
def crawl_category(self):
    fetcher = Fetcher()
    kk = yield fetcher.fetch(
        "http://www.carters.com/%s?startRow=0&sz=all" % self.slug)
    page = kk.body
    self._process(page)
def start(self):
    url_queue = Queue.Queue()
    url_queue.put((self.root_request_info.url, 0))
    request_info = RequestInfo('', None, self.root_request_info.headers)
    fetcher = Fetcher()
    while not url_queue.empty():
        curr_url, depth = url_queue.get()
        # print 'url=%s, depth=%d' % (curr_url, depth)
        print curr_url
        if depth > self.depth_limit:
            continue
        depth += 1
        request_info.url = curr_url
        page_content = fetcher.request(request_info)
        ## parse page
        ## Content.parse(page_content)
        url_list = HtmlParser.extract_url(curr_url, page_content)
        if url_list:
            for url in url_list:
                url_queue.put((url, depth))
def loadGamelogs(self, year=None):
    """
    Loads gamelogs for the player for a given year

    Arguments:
    year : The season desired. Defaults to the current year if not specified
    """
    if year is None:
        year = datetime.datetime.now().year
    if year not in self.logs:
        self.logs[year] = []

    if 'primary_position' not in self:
        logger.error("no primary position attribute for %s" % self)
        return False

    url = Fetcher.MLB_PITCHER_URL if self['primary_position'] == 1 else Fetcher.MLB_BATTER_URL
    f = Fetcher(url, player_id=self.player_id, year=year)
    j = f.fetch()

    try:
        if self['primary_position'] == 1:
            parent = j['mlb_bio_pitching_last_10']['mlb_individual_pitching_game_log']['queryResults']
        else:
            if 'mlb_individual_hitting_last_x_total' in j:
                parent = j['mlb_individual_hitting_last_x_total']['mlb_individual_hitting_game_log']['queryResults']
            else:
                parent = j['mlb_bio_hitting_last_10']['mlb_individual_hitting_game_log']['queryResults']
    except KeyError, e:
        logger.error('no key for gamelogs found in %s' % f.url)
        return False
def __init__(self, year, month, day=None):
    """
    Constructor

    Arguments:
    year: The... year!
    month: The... month!
    day: The... day! (or None for all days of the month)
    """
    days = []
    if day is None:
        for d in xrange(1, calendar.mdays[month] + 1):
            days.append(datetime.date(year, month, d))
    else:
        days.append(datetime.date(year, month, day))

    begin = days[0]
    end = days[-1]

    f = Fetcher(Fetcher.MLB_TRANSACTION_URL,
                start=begin.strftime("%Y%m%d"),
                end=end.strftime("%Y%m%d"))
    try:
        obj = f.fetch()
        if obj['transaction_all']['queryResults']['totalSize'] == 0:
            return

        results = obj['transaction_all']['queryResults']['row']
        if type(results) is dict:
            self.append(results)
        else:
            for row in results:
                self.append(row)
    except (ValueError, KeyError), e:
        logger.error("ERROR %s on %s" % (e, f.url))
        pass
class Getter(metaclass=GetMetaclass):

    def __init__(self):
        self.fetcher = Fetcher()

    def get_info(self, callback, url):
        # getattr is the safer equivalent of the original eval-based dispatch
        crawl_func = getattr(self, callback)
        return crawl_func(url)

    def crawl_weixin(self, url):
        result = {}
        res = self.fetcher.get(url)
        soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf8')
        item_imgs = soup.select('#page-content img[data-src!=""]')
        imgs = [item_img['data-src'] for item_img in item_imgs]
        title = soup.select('h2#activity-name')[0].text.strip()
        result['url'] = url
        result['imgs'] = imgs
        result['title'] = title
        result['content'] = res.content
        return result

    def crawl_youdao(self, url):
        report = {}
        res = self.fetcher.get(url)
        soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf8')
        item_imgs = soup.select('.post-bd img[src!=""]')
        imgs = [item_img['src'] for item_img in item_imgs]
        report['url'] = url
        report['imgs'] = imgs
        report['content'] = res.content
        return report
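A hedged usage sketch of the Getter dispatch above; the URL is a placeholder, and GetMetaclass, Fetcher, and BeautifulSoup are assumed to come from the surrounding project:

getter = Getter()
article = getter.get_info('crawl_weixin', 'https://mp.weixin.qq.com/s/example')  # placeholder URL
print(article['title'], len(article['imgs']))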
class Tips(object):
    """ Manage Tips Events. """

    def __init__(self, enable):
        self.enable = enable
        self._tips = {}
        self._new_tips = set()
        self.lock = Lock()
        if self.enable:
            self.fetcher = Fetcher(self._tips, self.lock, self._new_tips)
            self.cleaner = Cleaner(self._tips, self.lock, self._new_tips)
            self.fetcher.start()
            self.cleaner.start()

    def tips(self):
        return self._tips.values()

    def new_tips(self):
        if self._new_tips:
            wait_free_acquire(self.lock)
            res = [self._tips[x] for x in self._new_tips]
            self._new_tips.clear()
            self.lock.release()
            return res
        else:
            return []

    def stop(self):
        if self.enable:
            self.fetcher.finnish()
            self.cleaner.finnish()
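A short usage sketch for the Tips wrapper above, assuming Fetcher and Cleaner are background workers that fill the shared _tips dictionary:

tips = Tips(enable=True)   # starts the fetcher and cleaner workers
fresh = tips.new_tips()    # drains and returns tips added since the last call
everything = tips.tips()   # current view of all known tips
tips.stop()                # asks both workers to finish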
def __fetch_daily_returns(self):
    fetcher = Fetcher(list(self.stockList))
    fetcher.fetch_history(self.startDate, self.endDate)
    self.df = fetcher.get_dataframe('Adj_Close')
    globalStats = GlobalStats(self.df)
    return globalStats.get_daily_returns()
def fetch(agfid, u_ip, d_ip, r_ip, a_ip, u_port, d_port, r_port, a_port,
          temp_file_dir, device_name):
    logging.basicConfig(
        filename="fetcher.log",
        filemode="w",
        format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
        datefmt="%d-%m-%Y %H:%M:%S",
        level=logging.DEBUG)
    logging.info('_r=true' + ';agfid=' + agfid +
                 ';a_port=' + str(a_port) +
                 ';u_port=' + str(u_port) +
                 ';r_port=' + str(r_port) +
                 ';temp_file_dir=' + temp_file_dir +
                 ';device_name=' + device_name + ';')
    params = {
        "rnode_ip": r_ip,
        "unode_ip": u_ip,
        "dnode_ip": d_ip,
        "unode_port": str(u_port),
        "rnode_port": str(r_port),
        "dnode_port": str(d_port),
        "adc_ip": a_ip,
        "adc_port": str(a_port)
    }
    fetcher = Fetcher(params)
    err = fetcher.start(agfid=agfid, device_name=device_name, temp_file_dir=temp_file_dir)
    if err is not None:
        click.echo("_r=false;message=" + err + ";")
        logging.error("_r=false;message=" + err + ";")
        if fetcher.ffmpeg is not None:
            fetcher.ffmpeg.kill()
def _get_category_page(self):
    fetcher = Fetcher()
    ret = yield fetcher.fetch('http://www.6pm.com/%s' % self.slug)
    body = PQ(ret.body)
    foo = body('.last a')[0].get('href')
    max_page = int(re.findall(r'-page(\d+)', foo)[0])
    for i in range(max_page):
        self._crawl_category_page(i)
class GlobalPlugin(globalPluginHandler.GlobalPlugin):

    def __init__(self):
        super(GlobalPlugin, self).__init__()
        # Creating the configuration directory
        configDir = os.path.join(config.getUserDefaultConfigPath(), "owm")
        if not os.path.exists(configDir):
            os.mkdir(configDir)
        # Add the settings in the NVDA menu
        self.prefsMenu = gui.mainFrame.sysTrayIcon.menu.GetMenuItems()[0].GetSubMenu()
        self.owmSettingsItem = self.prefsMenu.Append(wx.ID_ANY, "OWM Settings...", "Set OWM location")
        gui.mainFrame.sysTrayIcon.Bind(wx.EVT_MENU, self.onOWMSettings, self.owmSettingsItem)
        # Create the client to retrieve information from the API
        self.fetcher = Fetcher()
        self.fetcher.start()

    def script_announceOWMForecast(self, gesture):
        if self.fetcher.client is None:
            ui.message("Loading, please wait and try again in a few seconds...")
            return
        client = self.fetcher.client
        if client.error:
            ui.message("{0} {1}".format(client.statusCode, client.errorReason))
            self.fetcher.valid = False
            self.fetcher = Fetcher()
            self.fetcher.start()
        else:
            forecast = client.forecast
            message = forecast.getMessage()
            ui.message(message)
            log.info(message)

    def onOWMSettings(self, event):
        """Pop a dialog with OWM settings."""
        locations = locationList.retrieve()
        selected = configFile['location']
        locationName = locationList.get(selected).name
        locationValues = {}
        for location in locations:
            locationValues[location.id] = (location.name, location.country)
        dialog = LocationDialog(gui.mainFrame, -1, "Select OWM Location",
                                locationValues, locationName)
        gui.mainFrame.prePopup()
        ret = dialog.ShowModal()
        gui.mainFrame.postPopup()
        if ret == wx.ID_OK:
            log.info("Focused {0}, {1}".format(locationList.path,
                                               dialog.location.focusedLocationName))

    __gestures = {
        "kb:NVDA+w": "announceOWMForecast",
    }
def schedule_fetcher(self, cycle=FETCHER_CYCLE):
    """ Fetch proxies on a fixed schedule """
    fetcher = Fetcher()
    while True:
        print('Starting proxy fetch')
        fetcher.run()
        time.sleep(cycle)
def update(new=True, begin=0, end=0, ids=[]):
    fetcher = Fetcher()
    if new:
        count = fetcher.fetchNew()
    elif len(ids) == 0:
        count = fetcher.fetchAll(begin, end)
    else:
        count = fetcher.fetchSelected(ids)
    print "%d items updated" % count
def main():
    url_file = 'url.conf'
    url_list = get_url_list(url_file)
    cache = PriorityCache()
    fetchers = []
    for url in url_list:
        f = Fetcher(cache, url)
        fetchers.append(f)
    for f in fetchers:
        f.run()
class StockPriceData():

    def __init__(self, symbol, start, end=date.today(), data_len=5, scale="D"):
        usecols = ["open", "high", "low", "close", "volume"]
        self.__data_df = Fetcher().fetch(symbol, start, end)[usecols]
        if scale != "D":
            self.__data_df = self.__data_df.resample(scale, how={
                "open": 'first',
                "high": 'max',
                "low": 'min',
                "close": 'last',
                "volume": 'sum',
            })[:-1]
        self.__data_df_norm = self.__data_df.copy()
        self.__data_df_norm['open'] = MinMaxScaler().fit_transform(
            self.__data_df.open.values.reshape(-1, 1))
        self.__data_df_norm['high'] = MinMaxScaler().fit_transform(
            self.__data_df.high.values.reshape(-1, 1))
        self.__data_df_norm['low'] = MinMaxScaler().fit_transform(
            self.__data_df.low.values.reshape(-1, 1))
        self.__data_df_norm['close'] = MinMaxScaler().fit_transform(
            self.__data_df.close.values.reshape(-1, 1))
        self.__data_df_norm['volume'] = MinMaxScaler().fit_transform(
            self.__data_df.volume.values.reshape(-1, 1))
        self.data_len = data_len

    def denormalize(self, norm_value):
        origin_values = self.__data_df["close"].values.reshape(-1, 1)
        norm_value = norm_value.reshape(-1, 1)
        min_max_scaler = MinMaxScaler()
        min_max_scaler.fit_transform(origin_values)
        denorm_value = min_max_scaler.inverse_transform(norm_value)
        return denorm_value

    def get_train_datas(self, scale="D"):
        data = self.__data_df_norm.values
        data_x = np.array(list(chunks(data, self.data_len))[:-1])
        return data_x

    def get_train_targets(self):
        pass

    def get_test_datas(self):
        data = self.__data_df_norm.values
        data_x = np.array(list(chunks(data, self.data_len))[-1])
        return data_x

    def get_raw_datas(self):
        return self.__data_df.values

    def get_norm_datas(self):
        return self.__data_df_norm.values
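A hedged usage sketch for StockPriceData; the ticker symbol and dates are placeholders, and Fetcher, MinMaxScaler, and chunks are assumed to come from the surrounding module:

from datetime import date

prices = StockPriceData("2330.TW", date(2020, 1, 1), date(2020, 12, 31), data_len=5)  # placeholder symbol
windows = prices.get_train_datas()   # all normalized windows of length data_len except the last
latest = prices.get_test_datas()     # the most recent window
# predictions on the normalized scale can be mapped back with prices.denormalize()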
def loadYearlies(self):
    """ Loads yearly and career totals for a player """
    if self['primary_position'] == 1 and not self.force_batting:
        f = Fetcher(Fetcher.MLB_PITCHER_SUMMARY_URL, player_id=self.player_id)
    else:
        f = Fetcher(Fetcher.MLB_BATTER_SUMMARY_URL, player_id=self.player_id)

    j = f.fetch()

    # if the JSON object is empty, bail
    if len(j.keys()) == 0:
        return

    # get yearly totals
    if self['primary_position'] == 1 and not self.force_batting:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_season']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_season']['queryResults']

    if parent['totalSize'] > 0:
        records = parent['row']

        # accounting for player with only one row
        if type(records) is dict:
            records = [records]

        for row in records:
            log = {}
            for key, value in row.iteritems():
                log[key] = value

            # handle each season as a list, so players with
            # multiple team seasons get each team for that
            # year accounted for
            if row['season'] in self.totals:
                self.totals[row['season']].append(log)
            else:
                self.totals[row['season']] = [log]

    # get career totals
    if self['primary_position'] == 1 and not self.force_batting:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_career']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_career']['queryResults']

    if parent['totalSize'] > 0:
        for key, value in parent['row'].iteritems():
            self.career[key] = value
def load(self, loadRosters=False):
    """
    Calls MLB.com server and loads all team information

    Arguments:
    loadRosters : If true, rosters will automatically be loaded (more HTTP requests!)
    """
    f = Fetcher(Fetcher.MLB_LEAGUE_URL)

    for item in f.fetch():
        t = team.Team(item)

        if loadRosters:
            t.loadRoster()

        self.teams[t['team_code']] = t
def scrape_root(self, root, helper):
    """ Scrape a root URL """
    t0 = time.time()
    # Fetch the root URL and scrape all child URLs that refer
    # to the same domain suffix and we haven't seen before
    logging.info("Fetching root {0}".format(root.url))
    # Read the HTML document at the root URL
    html_doc = Fetcher.raw_fetch_url(root.url)
    if not html_doc:
        logging.warning("Unable to fetch root {0}".format(root.url))
        return
    # Parse the HTML document
    soup = Fetcher.make_soup(html_doc)
    # Obtain the set of child URLs to fetch
    fetch_set = Fetcher.children(root, soup)
    # Add the children whose URLs we don't already have to the
    # scraper articles table
    with SessionContext() as session:
        for url in fetch_set:
            if helper and helper.skip_url(url):
                # The helper doesn't want this URL
                continue
            # noinspection PyBroadException
            try:
                article = ArticleRow(url=url, root_id=root.id)
                # Leave article.scraped as NULL for later retrieval
                session.add(article)
                session.commit()
            except IntegrityError as e:
                # Article URL already exists in database:
                # roll back and continue
                session.rollback()
            except Exception as e:
                logging.warning(
                    "Roll back due to exception in scrape_root: {0}".format(e)
                )
                session.rollback()
    t1 = time.time()
    logging.info("Root scrape completed in {0:.2f} seconds".format(t1 - t0))
def main(argv):
    # myBrowser = Browser()
    isLogin = False
    myFetcher = Fetcher()
    username = ""
    password = ""
    domain = ""
    try:
        opts, args = getopt.getopt(argv, "u:p:d:", ["login", "domain="])
    except getopt.GetoptError:
        print "(mandatory)\n-d <domain> or --domain <domain>" \
              " \n(optional)\n--login followed by \n -u <username> -p <password> "
        exit()
    for opt, arg in opts:
        if opt == "--login":
            isLogin = True
        elif opt == "-u":
            username = arg
        elif opt == "-p":
            password = arg
        elif opt in ("-d", "--domain"):
            domain = arg
    if isLogin:
        myFetcher.setCredentials(username, password)
    myFetcher.login(domain)
    print myFetcher.getCookies()
def main():
    # Start the scheduler
    Scheduler.runScheduler()
    # Start the fetcher
    Fetcher.runFetcher(processNum=10)
    # Start the processor
    Processor.runProcessor()
    # print(u"cpu count: " + str(multiprocessing.cpu_count()))
    # for fetcher in multiprocessing.active_children():
    #     fetcher.join()
    #     print("child p.name:" + fetcher.name + "\tp.id" + str(fetcher.pid))
    for p in SafeQueue.processList:
        p.start()
    for p in SafeQueue.processList:
        p.join()
def _init_from_scrape(cls, url: Optional[str], enclosing_session: Optional[Session] = None):
    """ Scrape an article from its URL """
    if url is None:
        return None
    a = cls(url=url)
    with SessionContext(enclosing_session) as session:
        # Obtain a helper corresponding to the URL
        html, metadata, helper = Fetcher.fetch_url_html(url, session)
        if html is None:
            return a
        a._html = html
        if metadata is not None:
            a._heading = metadata.heading
            a._author = metadata.author
            a._timestamp = metadata.timestamp
            a._authority = metadata.authority
        a._scraped = datetime.utcnow()
        if helper is not None:
            helper = cast(Any, helper)
            a._scr_module = helper.scr_module
            a._scr_class = helper.scr_class
            a._scr_version = helper.scr_version
            a._root_id = helper.root_id
            a._root_domain = helper.domain
    return a
def run(self):
    """The starting point of a thread"""
    # print("Thread " + str(self.thread_id) + " started")
    while True:
        # print("get next URL")
        self.dash.print_cur_stat("Next_Url___", self.thread_id)
        value, current_url, current_dns = self.frontier.get_url(self.thread_id)
        if not current_url:
            # print("Empty Queue from thread " + str(self.thread_id))
            self.dash.print_cur_stat("Empty_Queue", self.thread_id)
            continue
        self.dash.print_cur_stat("Downloading", self.thread_id)
        code, links, content = Fetcher.fetch(current_url)
        if code == -1:
            # print("Refused from thread " + str(self.thread_id))
            self.dash.print_cur_stat("Refused_Url", self.thread_id)
            self.refused += 1
            self.dash.print_refused(str(self.refused), self.thread_id)
            continue
        self.dash.print_cur_stat("Valid_Url__", self.thread_id)
        # Crawling this link succeeded
        # print("URL got from thread " + str(self.thread_id))
        out_links = len(links)
        sz_parent = len(content)
        links_mod = []
        for i in range(len(links)):
            links_mod.append((links[i][0], links[i][1],
                              (out_links, sz_parent, len(links[i][0]), value)))
        self.dash.print_cur_stat("URL_Fetched", self.thread_id)
        self.crawled += 1
        self.dash.print_crawled(str(self.crawled), self.thread_id)
        # print("URL fetched from thread " + str(self.thread_id))
        self.frontier.push_to_serve(links_mod, self.thread_id)
        Storage.cache_crawled_url(current_url, current_dns, content, self.thread_id)
def __init__(self, target_url):
    self.url = target_url
    self.logger = logging.getLogger('SaveService')
    self.fetcher = Fetcher()
    self.getter = Getter()
    self.url_info = {}
    self.client = MongoHelper(config.store_db)
def main(args, session):
    logging.info('Deleting existing xeno-canto recordings')
    session.query(Recording).filter(Recording.source == 'xc').delete()

    fetcher = Fetcher(cache_group='xc_api',
                      pool_size=args.recording_load_jobs,
                      clear_cache=args.clear_recordings_cache)
    query = XcQuery({'nr': f'{args.start_xc_id}-{args.end_xc_id}'}, fetcher)
    first_page = query.fetch_page(1)
    num_pages = first_page['numPages']
    num_recordings = int(first_page['numRecordings'])
    logging.info(f'Found {num_pages} pages, {num_recordings} recordings')
    with multiprocessing.pool.ThreadPool(args.recording_load_jobs) as pool:
        for page in progress.percent(
                itertools.chain([first_page],
                                pool.imap(query.fetch_page, range(2, num_pages + 1))),
                num_pages):
            try:
                # Allow replacements in case the API shifts pages around
                # (it seems to do that, probably when new recordings are
                # added during the run).
                recordings = [_parse_recording(r) for r in page['recordings']]
                session.bulk_save_objects_with_replace(recordings)
            except Exception:
                logging.error(
                    f'Error parsing page:\n{json.dumps(page, indent=" ")}',
                    exc_info=True)
                raise
def fetch_prices(self):
    t = 6 * 3600 * int(time.time() / (6 * 3600)) + 6 * 3600
    js = Fetcher(json.loads).fetch(
        URL_PRICES.format(self.id, DATE_START.strftime("%Y-%m-%d"), t))
    self.rawdata = js['data']

    self.btc_series = [(datetime.strptime(k.split("T")[0], "%Y-%m-%d"), v['BTC'][0])
                       for k, v in self.rawdata.items()]
    if self.data:
        self.btc_series.append(
            (datetime.now(), self.data["quote"]["BTC"]["price"]))
    series_fill_zeroes(self.btc_series)
    normalize(self, "btc_series")

    self.usd_series = [(datetime.strptime(k.split("T")[0], "%Y-%m-%d"), v['USD'][0])
                       for k, v in self.rawdata.items()]
    if self.data:
        self.usd_series.append(
            (datetime.now(), self.data["quote"]["USD"]["price"]))
    series_fill_zeroes(self.usd_series)
    normalize(self, "usd_series")

    self.supply = []
    try:
        self.supply = [(datetime.strptime(k.split("T")[0], "%Y-%m-%d"),
                        10 * round(0.1 * div0(v['USD'][2], v['USD'][0])))
                       for k, v in self.rawdata.items()]
    except Exception:
        pass
    series_fill_zeroes(self.supply)
    normalize(self, "supply")
def handle(self):
    # self.request is the TCP socket connected to the client
    data = self.request.recv(1024).strip()
    if data.startswith("GET /favicon.ico"):
        return
    print '--------------data---------------- ' + data
    query_components = get_query_parameters(data)
    print "query %s" % query_components

    # date_days_ago = Fetcher.DEFAULT_DATE
    result = {"result": "empty"}
    if "days" in query_components:
        from fetcher import Fetcher
        date_days_ago = float(query_components["days"][0])
        result = Fetcher.get_ranked_pages(date_days_ago)
    elif "category" in query_components:
        sort = "score"
        if "sort" in query_components:
            sort = query_components["sort"][0]
        result = social_db.read_from_spikedate(query_components["category"][0], sort)
    elif "channel" in query_components:
        channel = query_components["channel"][0]
        result = get_channel_serving.sort_channel_by_field(channel, max_count=100,
                                                           remove_adult=False)
    elif "translate" in query_components:
        text = query_components["translate"][0]
        if self.token_expired is None or self.token_expired:
            self.update_token()
            self.token_expired = False
        if "from" in query_components and "to" in query_components:
            from_lang = query_components["from"][0]
            to_lang = query_components["to"][0]
            result = MTPythonSampleCode.translate(self.final_token, textToTranslate=text,
                                                  fromLangCode=from_lang, toLangCode=to_lang)
        else:
            result = MTPythonSampleCode.translate(self.final_token, text)
    self.request.sendall(json.dumps(result, encoding="utf-8", ensure_ascii=False))
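The branches above dispatch on the request's query string; a sketch of the requests this handler would route, reconstructed from the code (paths and values are placeholders):

# GET /?days=2                          -> Fetcher.get_ranked_pages(2.0)
# GET /?category=news&sort=score        -> social_db.read_from_spikedate("news", "score")
# GET /?channel=sports                  -> get_channel_serving.sort_channel_by_field("sports", ...)
# GET /?translate=hello&from=en&to=fr   -> MTPythonSampleCode.translate(final_token, textToTranslate="hello",
#                                                                       fromLangCode="en", toLangCode="fr")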
def _process_text(parser, session, text, all_names, xform):
    """ Low-level utility function to parse text and return the result of
        a transformation function (xform) for each sentence.
        Set all_names = True to get a comprehensive name register.
        Set all_names = False to get a simple name register.
        Set all_names = None to get no name register. """
    t0 = time.time()
    # Demarcate paragraphs in the input
    text = Fetcher.mark_paragraphs(text)
    # Tokenize the result
    toklist = list(tokenize_and_recognize(text, enclosing_session=session))
    t1 = time.time()
    pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)
    if all_names is None:
        register = None
    else:
        from query import create_name_register
        register = create_name_register(toklist, session, all_names=all_names)
    t2 = time.time()
    stats["tok_time"] = t1 - t0
    stats["parse_time"] = t2 - t1
    stats["total_time"] = t2 - t0
    return (pgs, stats, register)
def on_search(self, w):
    mode = self.mode.get_active()
    txt = self.entry.get_text()
    self.musiclist.set_loading(False)
    self.musiclist.empty_message = "Searching..."
    self.musiclist.get_model().clear()
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    itemgen = None
    if mode == 0:
        itemgen = lambda: jamaendo.search_artists(query=txt)
    elif mode == 1:
        itemgen = lambda: jamaendo.search_albums(query=txt)
    elif mode == 2:
        itemgen = lambda: jamaendo.search_tracks(query=txt)
    else:
        return
    self.fetcher = Fetcher(itemgen, self,
                           on_item=self.on_add_result,
                           on_ok=self.on_add_complete,
                           on_fail=self.on_add_complete)
    self.fetcher.start()
class Ranking:

    def __init__(self, data_dir, seed_urls, similarity_method):
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        self.fetcher = Fetcher(data_dir)
        if similarity_method == "cosine":
            self.similarity = Cosine_Similarity()
        elif similarity_method == "jaccard":
            self.similarity = Jaccard_Similarity()
        else:
            self.similarity = None
        self.K = max(len(seed_urls) / 2, 10)
        self.host = set()
        self.update_seeds(seed_urls)

    def update_seeds(self, seed_urls):
        '''Update seed urls in the current seed list. Fetch the seed urls'''
        new_seed_urls = []
        for url in seed_urls:
            host = URLUtility.get_tld(url)
            if host not in self.host:
                self.host.add(host)
                new_seed_urls.append(url)
        urls, text = self.fetcher.fetch_urls(new_seed_urls)
        self.similarity.update_seeds(urls, text)
        self.K = max(len(self.similarity.seed_pages.keys()) / 2, 10)

    def rank(self, urls):
        '''Rank the urls with respect to the seeds.
        Return the sorted ranking scores together with the urls'''
        # Fetch the urls
        urls, text = self.fetcher.fetch_urls(urls)
        # Rank
        knn_scores = []
        scores = self.similarity.compute_similarity(text)
        for i in xrange(len(urls)):
            sorted_scores = sorted(scores[i], reverse=True)
            knn_score = sum(sorted_scores[:self.K]) / float(min(self.K, len(sorted_scores)))
            knn_scores.append((urls[i], knn_score))
        knn_scores = sorted(knn_scores, key=lambda x: x[1])
        return knn_scores
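A hedged usage sketch of the Ranking class above; the data directory, seed URLs, and candidate URLs are placeholders, and Fetcher, the similarity classes, and URLUtility are assumed to come from the surrounding project:

ranker = Ranking("data/", ["http://example.com/seed1", "http://example.org/seed2"], "cosine")
scored = ranker.rank(["http://example.net/page1", "http://example.net/page2"])
for url, knn_score in scored:
    print url, knn_score   # lowest-scoring candidates come first, given the ascending sort in rank()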
class RadiosWindow(hildon.StackableWindow):

    def __init__(self):
        hildon.StackableWindow.__init__(self)
        self.fetcher = None
        self.radios = {}
        self.set_title("Radios")
        self.connect('destroy', self.on_destroy)
        # Results list
        self.panarea = hildon.PannableArea()
        self.radiolist = RadioList()
        self.radiolist.connect('row-activated', self.row_activated)
        self.panarea.add(self.radiolist)
        self.add(self.panarea)
        self.start_radio_fetcher()

    def on_destroy(self, wnd):
        if self.fetcher:
            self.fetcher.stop()
            self.fetcher = None

    def row_activated(self, treeview, path, view_column):
        name, _id = self.radiolist.get_radio_id(path)
        wnd = open_playerwindow()
        wnd.play_radio(name, _id)

    def start_radio_fetcher(self):
        if self.fetcher:
            self.fetcher.stop()
            self.fetcher = None
        self.fetcher = Fetcher(jamaendo.starred_radios, self,
                               on_item=self.on_radio_result,
                               on_ok=self.on_radio_complete,
                               on_fail=self.on_radio_complete)
        self.fetcher.start()

    def on_radio_result(self, wnd, item):
        if wnd is self:
            self.radios[item.ID] = item
            self.radiolist.add_radios([item])

    def on_radio_complete(self, wnd, error=None):
        if wnd is self:
            self.fetcher.stop()
            self.fetcher = None
def __init__(self, host: str, path: str, timestamp=datetime.datetime.now(),
             spreadsheet_id=SPREADSHEET_ID):
    self.host = host
    self.path = path
    self.renderer = SheetsRenderer(spreadsheet_id)
    self.fetcher: Fetcher = Fetcher(s, self.host)
    self.listing_cache: ListingCache = ListingCache("/tmp/cache-%s" % host, self.fetcher)
    self.emailer = Emailer(self.host, "/tmp/email_log")
    self.timestamp = timestamp
def loadRoster(self):
    """
    Calls MLB.com servers to obtain the complete roster for the team.
    If the call fails, the '_error' property is set.
    """
    f = Fetcher(Fetcher.MLB_ROSTER_URL, team_id=self['team_id'])
    j = f.fetch()

    if 'roster_40' not in j:
        self._error = "ERROR on %s: key roster_40 not found (cannot load 40 man roster)" % f.url
        return False

    parent = j['roster_40']['queryResults']
    if parent['totalSize'] > 0:
        for record in parent['row']:
            player_id = record['player_id']
            self.roster[player_id] = player.Player(player_id)
def _crawl_url(self, url):
    fetcher = Fetcher()
    ret = yield fetcher.fetch(url)
    body = PQ(ret.body)
    products = body('a.product')
    data = []
    for product in products:
        foo = PQ(product)
        origin_price = re.findall(r'\$([\d\.]+)', foo('.discount').text())
        if origin_price:
            origin_price = origin_price[0]
        sales_price = foo('.price-6pm').text().replace('$', '').strip()
        if not origin_price and not sales_price:
            continue
        title = '[%s] %s' % (foo('.brandName').text(), foo('.productName').text())
        data.append({
            'image': foo('.productImg').attr('src'),
            'link': parse_url('http://www.6pm.com' + foo('a').attr('href')),
            'title': title,
            'original_price': origin_price or sales_price,
            'sales_price': sales_price
        })
    data = {
        'website': '6pm',
        'currency': 'USD',
        'country': 'USA',
        'store_id': self.store_id,
        'data': json.dumps(data)
    }
    data.update(self._extra_kwargs)
    q = yield fetcher.fetch(
        'http://127.0.0.1:8000/ezlookup/deal/?key=998998998',
        method="POST", data=data)
def main():
    fm = FileManager()
    fe = Fetcher()
    folder_dir = fm.get_folder_dir()
    df = [fe.statistics(), fe.coutries_data(), fe.countries_historical_data()]
    filenames = [
        'statistics',
        'all_country_data',
        'all_country_historical_data'
    ]
    for i in range(3):
        filename = '{}/{}.csv'.format(folder_dir, filenames[i])
        df[i].to_csv(filename, index=False)
        print('{} is created successfully.'.format(filename))
    if len(argv) > 1:
        country = argv[1]
        df = [fe.coutry_data(country), fe.country_historical_data(country)]
        filenames = [
            '{}__country_data'.format(country),
            '{}__country_historical_data'.format(country)
        ]
        for i in range(2):
            filename = '{}/{}.csv'.format(folder_dir, filenames[i])
            df[i].to_csv(filename, index=False)
            print('{} is created successfully.'.format(filename))
def _scrape_single_article(self, d):
    """ Single article scraper that will be called by a process within a
        multiprocessing pool """
    try:
        helper = Fetcher._get_helper(d.root)
        if helper:
            self.scrape_article(d.url, helper)
    except Exception as e:
        print("Exception when scraping article at {0}: {1!r}".format(d.url, e))
def start_track_fetcher(self):
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    self.fetcher = Fetcher(lambda: self.playlist, self,
                           on_item=self.on_track_result,
                           on_ok=self.on_track_complete,
                           on_fail=self.on_track_complete)
    self.fetcher.start()
def command_line_runner():
    args = docopt.docopt(__doc__, version=__version__)

    if args["--clear-cache"]:
        if caching._clear_cache():
            exit("Cache cleared successfully.")
        else:
            exit("Clearing cache failed.")

    if args["--max-number"]:
        try:
            args["--max-number"] = int(args["--max-number"])
        except ValueError:
            exit(_yellow("--max-number value should be a number!"))

    fetcher = Fetcher(args)
    selected_pkg = fetcher.user_confirm()
    PkgbuildReview(selected_pkg, args)
    DiffReview(selected_pkg, args)
def start_feature_fetcher(self):
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    self.fetcher = Fetcher(self.featurefn, self,
                           on_item=self.on_feature_result,
                           on_ok=self.on_feature_complete,
                           on_fail=self.on_feature_complete)
    self.fetcher.start()
def urls2fetch(self, root, helper):
    """ Returns a set of URLs to fetch. If the scraper helper class has
        associated RSS feed URLs, these are used to acquire article URLs.
        Otherwise, the URLs are found by scraping the root website and
        searching for links to subpages. """
    fetch_set = set()
    feeds = None if helper is None else helper.feeds
    if feeds:
        for feed_url in feeds:
            logging.info("Fetching feed {0}".format(feed_url))
            try:
                d = feedparser.parse(feed_url)
            except Exception as e:
                logging.warning(
                    "Error fetching/parsing feed {0}: {1}".format(feed_url, str(e)))
                continue
            for entry in d.entries:
                if entry.link and not helper.skip_rss_entry(entry):
                    fetch_set.add(entry.link)
    else:
        # Fetch the root URL and scrape all child URLs
        # that refer to the same domain suffix
        logging.info("Fetching root {0}".format(root.url))
        # Read the HTML document at the root URL
        html_doc = Fetcher.raw_fetch_url(root.url)
        if not html_doc:
            logging.warning("Unable to fetch root {0}".format(root.url))
            return fetch_set
        # Parse the HTML document
        soup = Fetcher.make_soup(html_doc)
        # Obtain the set of child URLs to fetch
        fetch_set = Fetcher.children(root, soup)
    return fetch_set
def __init__(self, master):
    self.master = master
    self.ram = None
    self.fetcher = Fetcher()
    master.title('BoostPack v2 Installer')
    master.geometry('500x500')
    master.iconbitmap('assets/output_onlinepngtools_ZGQ_icon.ico')

    img = PhotoImage(file='assets/install.png')
    self.install = Button(master, image=img, command=self.install, borderwidth=0)
    self.install.image = img
    self.install.pack()

    img = PhotoImage(file='assets/uninstall.png')
    self.uninstall = Button(master, image=img, command=self.uninstall, borderwidth=0)
    self.uninstall.image = img
    self.uninstall.pack()

    img = PhotoImage(file='assets/update.png')
    self.update = Button(master, image=img, command=self.update_patcher, borderwidth=0)
    self.update.image = img
    self.update.pack()

    self.txt = Label(master, text='Ram')
    self.txt.pack()
    self.ram = Scale(master, from_=1, to=self.fetcher.get_ram, orient=HORIZONTAL,
                     command=self.set_ram)
    self.ram.pack()
def _start_radio_fetcher(self):
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    self.fetcher = Fetcher(lambda: jamaendo.get_radio_tracks(self.playlist.radio_id), self,
                           on_item=self._on_radio_result,
                           on_ok=self._on_radio_complete,
                           on_fail=self._on_radio_complete)
    self.fetcher.has_no_results = True
    self.fetcher.start()
def start_track_fetcher(self):
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    self.fetcher = Fetcher(
        lambda: jamaendo.get_tracks(self.album.ID), self,
        on_item=self.on_track_result,
        on_ok=self.on_track_complete,
        on_fail=self.on_track_complete,
    )
    self.fetcher.start()
def __init__(self, directory, api_key, username, max_vids):
    self.API_KEY = api_key
    self.USERNAME = username
    self.SAVE_FILE_PATH = os.path.normpath(directory + 'ytsubs_' + self.USERNAME.lower() + '.json')
    self.fetcher = Fetcher(self.USERNAME, self.API_KEY)
    self.MAX_VIDEOS = max_vids
    self.watched = []
    self.additions = []
    self.raw_videos = []
    self.load()
def urls2fetch(self, root, helper):
    """ Returns a set of URLs to fetch. If the scraper helper class has
        associated RSS feed URLs, these are used to acquire article URLs.
        Otherwise, the URLs are found by scraping the root website and
        searching for links to subpages. """
    fetch_set = set()
    feeds = helper.feeds
    if feeds:
        for feed_url in feeds:
            logging.info("Fetching feed {0}".format(feed_url))
            try:
                d = feedparser.parse(feed_url)
            except Exception as e:
                logging.warning(
                    "Error fetching/parsing feed {0}: {1}".format(feed_url, str(e))
                )
                continue
            for entry in d.entries:
                if entry.link and not helper.skip_rss_entry(entry):
                    fetch_set.add(entry.link)
    else:
        # Fetch the root URL and scrape all child URLs
        # that refer to the same domain suffix
        logging.info("Fetching root {0}".format(root.url))
        # Read the HTML document at the root URL
        html_doc = Fetcher.raw_fetch_url(root.url)
        if not html_doc:
            logging.warning("Unable to fetch root {0}".format(root.url))
            return fetch_set
        # Parse the HTML document
        soup = Fetcher.make_soup(html_doc)
        # Obtain the set of child URLs to fetch
        fetch_set = Fetcher.children(root, soup)
    return fetch_set
def __init__(self, master=None):
    Frame.__init__(self, master)
    self.fetch = Fetcher()
    self.grid(sticky=N+S+E+W)
    self.init_frame()
    self.createSearchEntry()
    self.createInfCanvas()
    self.createInfList()
    self.createDetailCanvas()
    self.createImageLabel()
    self.createDetailLabels()
def __init__(self, year, month, day=None):
    """
    Constructor

    Arguments:
    year: The... year!
    month: The... month!
    day: The... day! (or None for all days of the month)

    Schedule is a standard dictionary: each day is a key in the format of
    'YYYY-MM-DD', each value a list of game dictionaries.
    """
    days = []
    if day is None:
        for d in xrange(1, calendar.mdays[month] + 1):
            days.append(datetime.date(year, month, d))
    else:
        days.append(datetime.date(year, month, day))

    for d in days:
        key = d.strftime("%Y-%m-%d")
        if key not in self.keys():
            self[key] = []

        f = Fetcher(Fetcher.MLB_SCHEDULE_URL, date=d.strftime("%Y%m%d"))
        try:
            content = f.fetch(True)
            if len(content) == 0:
                continue

            # The schedule endpoint returns JavaScript-ish data;
            # massage it into valid JSON before parsing.
            content = re.sub(r'\t+', '\t', content)
            content = content.replace('"', '\\"')
            content = content.replace("'", "\"")
            content = re.sub(r'\t([\w,_]+):\s', r'"\1":', content)

            obj = json.loads(content)
            self[key] = obj
        except ValueError, e:
            print "ERROR %s on %s" % (e, f.url)
            pass
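A hedged usage sketch, assuming the constructor above belongs to a dict subclass named Schedule as its docstring describes:

sched = Schedule(2014, 6)            # every day in June 2014
games = sched['2014-06-01']          # list of game dictionaries for that date
single_day = Schedule(2014, 6, 15)   # just one day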
def _scrape_single_article(self, d):
    """ Single article scraper that will be called by a process within a
        multiprocessing pool """
    try:
        helper = Fetcher._get_helper(d.root)
        if helper:
            self.scrape_article(d.url, helper)
    except Exception as e:
        logging.warning(
            "[{2}] Exception when scraping article at {0}: {1!r}".format(
                d.url, e, d.seq
            )
        )
def loadYearlies(self):
    """ Loads yearly and career totals for a player """
    if self['primary_position'] == 1:
        f = Fetcher(Fetcher.MLB_PITCHER_SUMMARY_URL, player_id=self.player_id)
    else:
        f = Fetcher(Fetcher.MLB_BATTER_SUMMARY_URL, player_id=self.player_id)

    j = f.fetch()

    # if the JSON object is empty, bail
    if len(j.keys()) == 0:
        return

    # get yearly totals
    if self['primary_position'] == 1:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_season']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_season']['queryResults']

    if parent['totalSize'] > 0:
        records = parent['row']

        # accounting for player with only one row
        if type(records) is dict:
            records = [records]

        for row in records:
            log = {}
            for key, value in row.iteritems():
                log[key] = value
            self.totals[row['season']] = log

    # get career totals
    if self['primary_position'] == 1:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_career']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_career']['queryResults']

    if parent['totalSize'] > 0:
        for key, value in parent['row'].iteritems():
            self.career[key] = value
def start_tag_fetcher(self, item_id):
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    self.fetcher = Fetcher(lambda: jamaendo.get_tag_tracks(item_id), self,
                           on_item=self.on_tag_result,
                           on_ok=self.on_tag_complete,
                           on_fail=self.on_tag_complete)
    self.fetcher.taglist = []
    self.fetcher.start()
    banner = hildon.hildon_banner_show_information(self, '', "Getting tracks for tag")
    banner.set_timeout(2000)
def load(self, load_yearlies=False, id=None):
    """
    Calls MLB.com server and loads player information. If the call fails,
    the '_error' property is set.

    Arguments:
    id : The MLB.com player ID
    """
    if id is not None:
        self.player_id = id
    elif self.player_id is None:
        raise Exception('No player_id specified')
    self['player_id'] = self.player_id

    f = Fetcher(Fetcher.MLB_PLAYER_URL, player_id=self.player_id)
    j = f.fetch()

    try:
        records = j['player_info']['queryResults']['totalSize']
    except KeyError, e:
        msg = 'ERROR on %s: totalSize not returned for call' % f.url
        self._error = msg
        logger.error(msg)
        return False
def begin(self, url_list, within_domain=False, download_location=False, depth=False):
    self.unvisited_urls = url_list
    self.visited_urls = list(url_list)
    self.configure()
    cleaned_urls = []
    fetcher = Fetcher()
    while True:
        print '#unvisited urls ', len(self.unvisited_urls)
        for url in self.unvisited_urls:
            print 'crawling ', url
            domain_url = self.domain(url)
            html_content = self.fetch_content(url)
            soup = self.make_clean_soup(html_content)
            out_links = self.get_all_links(soup)
            if download_location:
                fetcher.fetch_soup_attributes(download_location, url, soup)
            cleaned_urls.extend(self.clean_urls(out_links, domain_url, within_domain))
        self.unvisited_urls = self.not_visited_urls(cleaned_urls)
        self.visited_urls.extend(self.unvisited_urls)
        if depth:
            if depth == 1:
                break
            else:
                depth -= 1