def __init__(self, config_parser):
    # Connect to engine
    database_path = get_from_config_parser(config_parser, 'Database', 'path', 'database')
    database_debug = get_boolean_from_config_parser(config_parser, 'Database', 'debug', False)
    dir = os.path.dirname(database_path)
    if not os.path.exists(dir):
        mkdir(dir)
    sys.stderr.write('Connecting to database at "%s"\n' % database_path)
    self._engine = create_engine('sqlite:///%s' % database_path, echo=database_debug)
    # Start session
    Session = sessionmaker(bind=self._engine)
    self._session = Session()
    # Initialize feed storage
    self._feed_storage = FeedStorage(self._engine, self._session)
    # Initialize item storage
    self._item_storage = ItemStorage(self._engine, self._session)
    # A list of subprocess.Popen processes that will be maintained
    # by the Coffer object.
    self._external_processes = []
    # File storage (data dump)
    file_storage_path = get_from_config_parser(config_parser, 'FileStorage', 'path', 'datadump')
    max_block_size = get_int_from_config_parser(config_parser, 'FileStorage', 'max-block-size',
                                                file_storage.DEFAULT_MAX_BLOCK_SIZE)
    bzip2_path = get_from_config_parser(config_parser, 'FileStorage', 'bzip2-path', '/usr/bin/bzip2')
    self._file_storage = FileStorage(self._external_processes, file_storage_path,
                                     max_block_size, bzip2_path)
    # Content fetcher configuration
    self._fetcher = Fetcher(config_parser)
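A minimal sketch of a config object that would satisfy the options read by the constructor above; the section and option names are taken directly from the get_*_from_config_parser calls, while the concrete paths and values, and the Coffer class name (mentioned only in a comment), are placeholder assumptions:

import configparser

config_parser = configparser.ConfigParser()
config_parser.read_dict({
    'Database': {'path': 'coffer.db', 'debug': 'false'},
    'FileStorage': {
        'path': 'datadump',
        'max-block-size': '1048576',   # falls back to file_storage.DEFAULT_MAX_BLOCK_SIZE if omitted
        'bzip2-path': '/usr/bin/bzip2',
    },
})
# coffer = Coffer(config_parser)   # hypothetical class name; constructor shown above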
def serve_user_podcast(username):
    limit = request.args.get("limit")
    fetcher = Fetcher(request.environ['YOUTUBERSS_CONFIG'])
    podcast, upload_playlist = fetcher.get_user_data(username)
    return serve(fetcher, podcast, upload_playlist, limit)
def __init__(self, packageName, debug):
    # Variables for tests
    self.logger = logging.getLogger("tmLogger")
    if debug:
        self.logger.setLevel(logging.DEBUG)
    # Use self.packageName as a directory name
    self.packageName = "%s/" % packageName
    # Assign directories to be worked on
    self.workDir = "/tmp/testManager"
    self.repo = urlparse.urljoin("http://cekirdek.pardus.org.tr/~serbulent/test_guides/",
                                 self.packageName)
    self.saveDir = os.path.join("/tmp/testManager", self.packageName)
    self.filesDir = os.path.join(self.saveDir, "files")
    self.configFile = os.path.join(self.saveDir, "testProcess.conf")
    # Create test directories
    for item in (self.workDir, self.saveDir, self.filesDir):
        if not os.path.isdir(item):
            try:
                os.mkdir(item)
                debugMsg = "%s created" % item
                self.logger.debug(debugMsg)
            except OSError:
                errorMsg = "An error occurred when creating directory %s" % item
                self.logger.error(errorMsg)
    # Read package configuration
    self.fetcher = Fetcher(debug)
    url = urlparse.urljoin(self.repo, "testProcess.conf")
    self.fetcher.download(url, self.configFile)
    cfr = ConfReader(self.configFile)
    self.params = cfr.read()
    self.fetchFiles()
def run(limit=-1):
    if limit != -1 and store.count() > limit:
        print 'Limit: ' + str(limit) + ' Count: ' + str(store.count())
        print 'Voila, we are done!'
        import sys
        sys.exit()
        return
    if swap.current.is_empty():
        if swap.next.is_empty():
            print 'Somehow, we are done'
            return
        else:
            # we exchange current and next, run again
            swap.exchange()
            return run(limit)
    # get the url from the current (to-be-crawled) list of urls
    url = swap.current.pop()
    if not url or not valid_url(url):
        print '"%s" is not a valid url, giving up on it' % url
        return run(limit)
    f = Fetcher(url)
    next_urls = store.process(f.url(), f.child_urls())
    for url in next_urls:
        swap.next.push(url)
    return run(limit)
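Because run() recurses once per processed URL, long crawls can hit CPython's default recursion limit (about 1000 frames). A loop-based sketch of the same logic, assuming the same module-level store and swap objects and the valid_url() helper (it returns instead of calling sys.exit() when the limit is reached):

def run_iterative(limit=-1):
    # Loop form of run() above; same assumed store/swap interfaces.
    while True:
        if limit != -1 and store.count() > limit:
            break                      # the recursive version calls sys.exit() here
        if swap.current.is_empty():
            if swap.next.is_empty():
                break                  # nothing left anywhere
            swap.exchange()            # swap in the next frontier and keep going
            continue
        url = swap.current.pop()
        if not url or not valid_url(url):
            continue
        f = Fetcher(url)
        for next_url in store.process(f.url(), f.child_urls()):
            swap.next.push(next_url)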
def crawl_category(self):
    fetcher = Fetcher()
    kk = yield fetcher.fetch(
        "http://www.carters.com/%s?startRow=0&sz=all" % self.slug)
    page = kk.body
    self._process(page)
def start(self):
    url_queue = Queue.Queue()
    url_queue.put((self.root_request_info.url, 0))
    request_info = RequestInfo('', None, self.root_request_info.headers)
    fetcher = Fetcher()
    while not url_queue.empty():
        curr_url, depth = url_queue.get()
        # print 'url=%s, depth=%d' % (curr_url, depth)
        print curr_url
        if depth > self.depth_limit:
            continue
        depth += 1
        request_info.url = curr_url
        page_content = fetcher.request(request_info)
        ## parse page
        ## Content.parse(page_content)
        url_list = HtmlParser.extract_url(curr_url, page_content)
        if url_list:
            for url in url_list:
                url_queue.put((url, depth))
def loadGamelogs(self, year=None):
    """
    Loads gamelogs for the player for a given year

    Arguments:
    year : The season desired. Defaults to the current year if not specified
    """
    if year is None:
        year = datetime.datetime.now().year
    if year not in self.logs:
        self.logs[year] = []

    if 'primary_position' not in self:
        logger.error("no primary position attribute for %s" % self)
        return False

    url = Fetcher.MLB_PITCHER_URL if self['primary_position'] == 1 else Fetcher.MLB_BATTER_URL
    f = Fetcher(url, player_id=self.player_id, year=year)
    j = f.fetch()

    try:
        if self['primary_position'] == 1:
            parent = j['mlb_bio_pitching_last_10']['mlb_individual_pitching_game_log']['queryResults']
        else:
            if 'mlb_individual_hitting_last_x_total' in j:
                parent = j['mlb_individual_hitting_last_x_total']['mlb_individual_hitting_game_log']['queryResults']
            else:
                parent = j['mlb_bio_hitting_last_10']['mlb_individual_hitting_game_log']['queryResults']
    except KeyError, e:
        logger.error('no key for gamelogs found in %s' % f.url)
        return False
def __init__(self, year, month, day=None):
    """
    Constructor

    Arguments:
    year: The... year!
    month: The... month!
    day: The... day! (or None for all days of the month)
    """
    days = []
    if day is None:
        for d in xrange(1, calendar.mdays[month] + 1):
            days.append(datetime.date(year, month, d))
    else:
        days.append(datetime.date(year, month, day))

    begin = days[0]
    end = days[-1]

    f = Fetcher(Fetcher.MLB_TRANSACTION_URL,
                start=begin.strftime("%Y%m%d"),
                end=end.strftime("%Y%m%d"))
    try:
        obj = f.fetch()
        if obj['transaction_all']['queryResults']['totalSize'] == 0:
            return

        results = obj['transaction_all']['queryResults']['row']
        if type(results) is dict:
            self.append(results)
        else:
            for row in results:
                self.append(row)
    except (ValueError, KeyError), e:
        logger.error("ERROR %s on %s" % (e, f.url))
        pass
class Getter(metaclass=GetMetaclass):

    def __init__(self):
        self.fetcher = Fetcher()

    def get_info(self, callback, url):
        # getattr is the safer equivalent of the original eval-based dispatch
        crawl_func = getattr(self, callback)
        return crawl_func(url)

    def crawl_weixin(self, url):
        result = {}
        res = self.fetcher.get(url)
        soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf8')
        item_imgs = soup.select('#page-content img[data-src!=""]')
        imgs = [item_img['data-src'] for item_img in item_imgs]
        title = soup.select('h2#activity-name')[0].text.strip()
        result['url'] = url
        result['imgs'] = imgs
        result['title'] = title
        result['content'] = res.content
        return result

    def crawl_youdao(self, url):
        report = {}
        res = self.fetcher.get(url)
        soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf8')
        item_imgs = soup.select('.post-bd img[src!=""]')
        imgs = [item_img['src'] for item_img in item_imgs]
        report['url'] = url
        report['imgs'] = imgs
        report['content'] = res.content
        return report
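A hedged usage sketch of the Getter dispatch above; the URL is a placeholder, and GetMetaclass, Fetcher, and BeautifulSoup are assumed to come from the surrounding project:

getter = Getter()
article = getter.get_info('crawl_weixin', 'https://mp.weixin.qq.com/s/example')  # placeholder URL
print(article['title'], len(article['imgs']))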
class Tips(object):
    """ Manage Tips Events. """

    def __init__(self, enable):
        self.enable = enable
        self._tips = {}
        self._new_tips = set()
        self.lock = Lock()
        if self.enable:
            self.fetcher = Fetcher(self._tips, self.lock, self._new_tips)
            self.cleaner = Cleaner(self._tips, self.lock, self._new_tips)
            self.fetcher.start()
            self.cleaner.start()

    def tips(self):
        return self._tips.values()

    def new_tips(self):
        if self._new_tips:
            wait_free_acquire(self.lock)
            res = [self._tips[x] for x in self._new_tips]
            self._new_tips.clear()
            self.lock.release()
            return res
        else:
            return []

    def stop(self):
        if self.enable:
            self.fetcher.finnish()
            self.cleaner.finnish()
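A short usage sketch for the Tips wrapper above, assuming Fetcher and Cleaner are background workers that fill the shared _tips dictionary:

tips = Tips(enable=True)   # starts the fetcher and cleaner workers
fresh = tips.new_tips()    # drains and returns tips added since the last call
everything = tips.tips()   # current view of all known tips
tips.stop()                # asks both workers to finish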
def __fetch_daily_returns(self):
    fetcher = Fetcher(list(self.stockList))
    fetcher.fetch_history(self.startDate, self.endDate)
    self.df = fetcher.get_dataframe('Adj_Close')
    globalStats = GlobalStats(self.df)
    return globalStats.get_daily_returns()
def fetch(agfid, u_ip, d_ip, r_ip, a_ip, u_port, d_port, r_port, a_port,
          temp_file_dir, device_name):
    logging.basicConfig(
        filename="fetcher.log",
        filemode="w",
        format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
        datefmt="%d-%m-%Y %H:%M:%S",
        level=logging.DEBUG)
    logging.info('_r=true' + ';agfid=' + agfid +
                 ';a_port=' + str(a_port) +
                 ';u_port=' + str(u_port) +
                 ';r_port=' + str(r_port) +
                 ';temp_file_dir=' + temp_file_dir +
                 ';device_name=' + device_name + ';')
    params = {
        "rnode_ip": r_ip,
        "unode_ip": u_ip,
        "dnode_ip": d_ip,
        "unode_port": str(u_port),
        "rnode_port": str(r_port),
        "dnode_port": str(d_port),
        "adc_ip": a_ip,
        "adc_port": str(a_port)
    }
    fetcher = Fetcher(params)
    err = fetcher.start(agfid=agfid, device_name=device_name, temp_file_dir=temp_file_dir)
    if err is not None:
        click.echo("_r=false;message=" + err + ";")
        logging.error("_r=false;message=" + err + ";")
        if fetcher.ffmpeg is not None:
            fetcher.ffmpeg.kill()
def _get_category_page(self):
    fetcher = Fetcher()
    ret = yield fetcher.fetch('http://www.6pm.com/%s' % self.slug)
    body = PQ(ret.body)
    foo = body('.last a')[0].get('href')
    max_page = int(re.findall(r'-page(\d+)', foo)[0])
    for i in range(max_page):
        self._crawl_category_page(i)
class GlobalPlugin(globalPluginHandler.GlobalPlugin):

    def __init__(self):
        super(GlobalPlugin, self).__init__()
        # Creating the configuration directory
        configDir = os.path.join(config.getUserDefaultConfigPath(), "owm")
        if not os.path.exists(configDir):
            os.mkdir(configDir)
        # Add the settings in the NVDA menu
        self.prefsMenu = gui.mainFrame.sysTrayIcon.menu.GetMenuItems()[0].GetSubMenu()
        self.owmSettingsItem = self.prefsMenu.Append(wx.ID_ANY, "OWM Settings...", "Set OWM location")
        gui.mainFrame.sysTrayIcon.Bind(wx.EVT_MENU, self.onOWMSettings, self.owmSettingsItem)
        # Create the client to retrieve information from the API
        self.fetcher = Fetcher()
        self.fetcher.start()

    def script_announceOWMForecast(self, gesture):
        if self.fetcher.client is None:
            ui.message("Loading, please wait and try again in a few seconds...")
            return
        client = self.fetcher.client
        if client.error:
            ui.message("{0} {1}".format(client.statusCode, client.errorReason))
            self.fetcher.valid = False
            self.fetcher = Fetcher()
            self.fetcher.start()
        else:
            forecast = client.forecast
            message = forecast.getMessage()
            ui.message(message)
            log.info(message)

    def onOWMSettings(self, event):
        """Pop a dialog with OWM settings."""
        locations = locationList.retrieve()
        selected = configFile['location']
        locationName = locationList.get(selected).name
        locationValues = {}
        for location in locations:
            locationValues[location.id] = (location.name, location.country)
        dialog = LocationDialog(gui.mainFrame, -1, "Select OWM Location",
                                locationValues, locationName)
        gui.mainFrame.prePopup()
        ret = dialog.ShowModal()
        gui.mainFrame.postPopup()
        if ret == wx.ID_OK:
            log.info("Focused {0}, {1}".format(locationList.path,
                                               dialog.location.focusedLocationName))

    __gestures = {
        "kb:NVDA+w": "announceOWMForecast",
    }
def schedule_fetcher(self, cycle=FETCHER_CYCLE):
    """ Fetch proxies on a fixed schedule """
    fetcher = Fetcher()
    while True:
        print('Starting proxy fetch')
        fetcher.run()
        time.sleep(cycle)
def update(new=True, begin=0, end=0, ids=[]):
    fetcher = Fetcher()
    if new:
        count = fetcher.fetchNew()
    elif len(ids) == 0:
        count = fetcher.fetchAll(begin, end)
    else:
        count = fetcher.fetchSelected(ids)
    print "%d items updated" % count
def main():
    url_file = 'url.conf'
    url_list = get_url_list(url_file)
    cache = PriorityCache()
    fetchers = []
    for url in url_list:
        f = Fetcher(cache, url)
        fetchers.append(f)
    for f in fetchers:
        f.run()
class StockPriceData():

    def __init__(self, symbol, start, end=date.today(), data_len=5, scale="D"):
        usecols = ["open", "high", "low", "close", "volume"]
        self.__data_df = Fetcher().fetch(symbol, start, end)[usecols]
        if scale != "D":
            self.__data_df = self.__data_df.resample(scale, how={
                "open": 'first',
                "high": 'max',
                "low": 'min',
                "close": 'last',
                "volume": 'sum',
            })[:-1]
        self.__data_df_norm = self.__data_df.copy()
        self.__data_df_norm['open'] = MinMaxScaler().fit_transform(
            self.__data_df.open.values.reshape(-1, 1))
        self.__data_df_norm['high'] = MinMaxScaler().fit_transform(
            self.__data_df.high.values.reshape(-1, 1))
        self.__data_df_norm['low'] = MinMaxScaler().fit_transform(
            self.__data_df.low.values.reshape(-1, 1))
        self.__data_df_norm['close'] = MinMaxScaler().fit_transform(
            self.__data_df.close.values.reshape(-1, 1))
        self.__data_df_norm['volume'] = MinMaxScaler().fit_transform(
            self.__data_df.volume.values.reshape(-1, 1))
        self.data_len = data_len

    def denormalize(self, norm_value):
        origin_values = self.__data_df["close"].values.reshape(-1, 1)
        norm_value = norm_value.reshape(-1, 1)
        min_max_scaler = MinMaxScaler()
        min_max_scaler.fit_transform(origin_values)
        denorm_value = min_max_scaler.inverse_transform(norm_value)
        return denorm_value

    def get_train_datas(self, scale="D"):
        data = self.__data_df_norm.values
        data_x = np.array(list(chunks(data, self.data_len))[:-1])
        return data_x

    def get_train_targets(self):
        pass

    def get_test_datas(self):
        data = self.__data_df_norm.values
        data_x = np.array(list(chunks(data, self.data_len))[-1])
        return data_x

    def get_raw_datas(self):
        return self.__data_df.values

    def get_norm_datas(self):
        return self.__data_df_norm.values
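A hedged usage sketch for StockPriceData; the ticker symbol and dates are placeholders, and Fetcher, MinMaxScaler, and chunks are assumed to come from the surrounding module:

from datetime import date

prices = StockPriceData("2330.TW", date(2020, 1, 1), date(2020, 12, 31), data_len=5)  # placeholder symbol
windows = prices.get_train_datas()   # all normalized windows of length data_len except the last
latest = prices.get_test_datas()     # the most recent window
# predictions on the normalized scale can be mapped back with prices.denormalize()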
def loadYearlies(self):
    """ Loads yearly and career totals for a player """
    if self['primary_position'] == 1 and not self.force_batting:
        f = Fetcher(Fetcher.MLB_PITCHER_SUMMARY_URL, player_id=self.player_id)
    else:
        f = Fetcher(Fetcher.MLB_BATTER_SUMMARY_URL, player_id=self.player_id)

    j = f.fetch()

    # if the JSON object is empty, bail
    if len(j.keys()) == 0:
        return

    # get yearly totals
    if self['primary_position'] == 1 and not self.force_batting:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_season']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_season']['queryResults']

    if parent['totalSize'] > 0:
        records = parent['row']

        # accounting for player with only one row
        if type(records) is dict:
            records = [records]

        for row in records:
            log = {}
            for key, value in row.iteritems():
                log[key] = value

            # handle each season as a list, so players with
            # multiple team seasons get each team for that
            # year accounted for
            if row['season'] in self.totals:
                self.totals[row['season']].append(log)
            else:
                self.totals[row['season']] = [log]

    # get career totals
    if self['primary_position'] == 1 and not self.force_batting:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_career']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_career']['queryResults']

    if parent['totalSize'] > 0:
        for key, value in parent['row'].iteritems():
            self.career[key] = value
def load(self, loadRosters=False):
    """
    Calls MLB.com server and loads all team information

    Arguments:
    loadRosters : If true, rosters will automatically be loaded (more HTTP requests!)
    """
    f = Fetcher(Fetcher.MLB_LEAGUE_URL)

    for item in f.fetch():
        t = team.Team(item)

        if loadRosters:
            t.loadRoster()

        self.teams[t['team_code']] = t
def scrape_root(self, root, helper):
    """ Scrape a root URL """
    t0 = time.time()
    # Fetch the root URL and scrape all child URLs that refer
    # to the same domain suffix and we haven't seen before
    logging.info("Fetching root {0}".format(root.url))
    # Read the HTML document at the root URL
    html_doc = Fetcher.raw_fetch_url(root.url)
    if not html_doc:
        logging.warning("Unable to fetch root {0}".format(root.url))
        return
    # Parse the HTML document
    soup = Fetcher.make_soup(html_doc)
    # Obtain the set of child URLs to fetch
    fetch_set = Fetcher.children(root, soup)
    # Add the children whose URLs we don't already have to the
    # scraper articles table
    with SessionContext() as session:
        for url in fetch_set:
            if helper and helper.skip_url(url):
                # The helper doesn't want this URL
                continue
            # noinspection PyBroadException
            try:
                article = ArticleRow(url=url, root_id=root.id)
                # Leave article.scraped as NULL for later retrieval
                session.add(article)
                session.commit()
            except IntegrityError as e:
                # Article URL already exists in database:
                # roll back and continue
                session.rollback()
            except Exception as e:
                logging.warning(
                    "Roll back due to exception in scrape_root: {0}".format(e)
                )
                session.rollback()
    t1 = time.time()
    logging.info("Root scrape completed in {0:.2f} seconds".format(t1 - t0))
def main(argv):
    # myBrowser = Browser()
    isLogin = False
    myFetcher = Fetcher()
    username = ""
    password = ""
    domain = ""
    try:
        opts, args = getopt.getopt(argv, "u:p:d:", ["login", "domain="])
    except getopt.GetoptError:
        print "(mandatory)\n-d <domain> or --domain <domain>" \
              " \n(optional)\n--login followed by \n -u <username> -p <password> "
        exit()
    for opt, arg in opts:
        if opt == "--login":
            isLogin = True
        elif opt == "-u":
            username = arg
        elif opt == "-p":
            password = arg
        elif opt in ("-d", "--domain"):
            domain = arg
    if isLogin:
        myFetcher.setCredentials(username, password)
    myFetcher.login(domain)
    print myFetcher.getCookies()
def main():
    # Start the scheduler
    Scheduler.runScheduler()
    # Start the fetcher
    Fetcher.runFetcher(processNum=10)
    # Start the processor
    Processor.runProcessor()
    # print(u"cpu count: " + str(multiprocessing.cpu_count()))
    # for fetcher in multiprocessing.active_children():
    #     fetcher.join()
    #     print("child p.name:" + fetcher.name + "\tp.id" + str(fetcher.pid))
    for p in SafeQueue.processList:
        p.start()
    for p in SafeQueue.processList:
        p.join()
def _init_from_scrape(cls, url: Optional[str], enclosing_session: Optional[Session] = None):
    """ Scrape an article from its URL """
    if url is None:
        return None
    a = cls(url=url)
    with SessionContext(enclosing_session) as session:
        # Obtain a helper corresponding to the URL
        html, metadata, helper = Fetcher.fetch_url_html(url, session)
        if html is None:
            return a
        a._html = html
        if metadata is not None:
            a._heading = metadata.heading
            a._author = metadata.author
            a._timestamp = metadata.timestamp
            a._authority = metadata.authority
        a._scraped = datetime.utcnow()
        if helper is not None:
            helper = cast(Any, helper)
            a._scr_module = helper.scr_module
            a._scr_class = helper.scr_class
            a._scr_version = helper.scr_version
            a._root_id = helper.root_id
            a._root_domain = helper.domain
    return a
def run(self):
    """The starting point of a thread"""
    # print("Thread " + str(self.thread_id) + " started")
    while True:
        # print("get next URL")
        self.dash.print_cur_stat("Next_Url___", self.thread_id)
        value, current_url, current_dns = self.frontier.get_url(self.thread_id)
        if not current_url:
            # print("Empty Queue from thread " + str(self.thread_id))
            self.dash.print_cur_stat("Empty_Queue", self.thread_id)
            continue
        self.dash.print_cur_stat("Downloading", self.thread_id)
        code, links, content = Fetcher.fetch(current_url)
        if code == -1:
            # print("Refused from thread " + str(self.thread_id))
            self.dash.print_cur_stat("Refused_Url", self.thread_id)
            self.refused += 1
            self.dash.print_refused(str(self.refused), self.thread_id)
            continue
        self.dash.print_cur_stat("Valid_Url__", self.thread_id)
        # Crawling this link succeeded
        # print("URL got from thread " + str(self.thread_id))
        out_links = len(links)
        sz_parent = len(content)
        links_mod = []
        for i in range(len(links)):
            links_mod.append((links[i][0], links[i][1],
                              (out_links, sz_parent, len(links[i][0]), value)))
        self.dash.print_cur_stat("URL_Fetched", self.thread_id)
        self.crawled += 1
        self.dash.print_crawled(str(self.crawled), self.thread_id)
        # print("URL fetched from thread " + str(self.thread_id))
        self.frontier.push_to_serve(links_mod, self.thread_id)
        Storage.cache_crawled_url(current_url, current_dns, content, self.thread_id)
def __init__(self, target_url):
    self.url = target_url
    self.logger = logging.getLogger('SaveService')
    self.fetcher = Fetcher()
    self.getter = Getter()
    self.url_info = {}
    self.client = MongoHelper(config.store_db)
def main(args, session):
    logging.info('Deleting existing xeno-canto recordings')
    session.query(Recording).filter(Recording.source == 'xc').delete()

    fetcher = Fetcher(cache_group='xc_api',
                      pool_size=args.recording_load_jobs,
                      clear_cache=args.clear_recordings_cache)
    query = XcQuery({'nr': f'{args.start_xc_id}-{args.end_xc_id}'}, fetcher)
    first_page = query.fetch_page(1)
    num_pages = first_page['numPages']
    num_recordings = int(first_page['numRecordings'])
    logging.info(f'Found {num_pages} pages, {num_recordings} recordings')
    with multiprocessing.pool.ThreadPool(args.recording_load_jobs) as pool:
        for page in progress.percent(
                itertools.chain([first_page],
                                pool.imap(query.fetch_page, range(2, num_pages + 1))),
                num_pages):
            try:
                # Allow replacements in case the API shifts pages around
                # (it seems to do that, probably when new recordings are
                # added during the run).
                recordings = [_parse_recording(r) for r in page['recordings']]
                session.bulk_save_objects_with_replace(recordings)
            except Exception:
                logging.error(
                    f'Error parsing page:\n{json.dumps(page, indent=" ")}',
                    exc_info=True)
                raise
def fetch_prices(self):
    t = 6 * 3600 * int(time.time() / (6 * 3600)) + 6 * 3600
    js = Fetcher(json.loads).fetch(
        URL_PRICES.format(self.id, DATE_START.strftime("%Y-%m-%d"), t))
    self.rawdata = js['data']

    self.btc_series = [(datetime.strptime(k.split("T")[0], "%Y-%m-%d"), v['BTC'][0])
                       for k, v in self.rawdata.items()]
    if self.data:
        self.btc_series.append(
            (datetime.now(), self.data["quote"]["BTC"]["price"]))
    series_fill_zeroes(self.btc_series)
    normalize(self, "btc_series")

    self.usd_series = [(datetime.strptime(k.split("T")[0], "%Y-%m-%d"), v['USD'][0])
                       for k, v in self.rawdata.items()]
    if self.data:
        self.usd_series.append(
            (datetime.now(), self.data["quote"]["USD"]["price"]))
    series_fill_zeroes(self.usd_series)
    normalize(self, "usd_series")

    self.supply = []
    try:
        self.supply = [(datetime.strptime(k.split("T")[0], "%Y-%m-%d"),
                        10 * round(0.1 * div0(v['USD'][2], v['USD'][0])))
                       for k, v in self.rawdata.items()]
    except Exception:
        pass
    series_fill_zeroes(self.supply)
    normalize(self, "supply")
def handle(self):
    # self.request is the TCP socket connected to the client
    data = self.request.recv(1024).strip()
    if data.startswith("GET /favicon.ico"):
        return
    print '--------------data---------------- ' + data
    query_components = get_query_parameters(data)
    print "query %s" % query_components

    # date_days_ago = Fetcher.DEFAULT_DATE
    result = {"result": "empty"}
    if "days" in query_components:
        from fetcher import Fetcher
        date_days_ago = float(query_components["days"][0])
        result = Fetcher.get_ranked_pages(date_days_ago)
    elif "category" in query_components:
        sort = "score"
        if "sort" in query_components:
            sort = query_components["sort"][0]
        result = social_db.read_from_spikedate(query_components["category"][0], sort)
    elif "channel" in query_components:
        channel = query_components["channel"][0]
        result = get_channel_serving.sort_channel_by_field(channel, max_count=100,
                                                           remove_adult=False)
    elif "translate" in query_components:
        text = query_components["translate"][0]
        if self.token_expired is None or self.token_expired:
            self.update_token()
            self.token_expired = False
        if "from" in query_components and "to" in query_components:
            from_lang = query_components["from"][0]
            to_lang = query_components["to"][0]
            result = MTPythonSampleCode.translate(self.final_token, textToTranslate=text,
                                                  fromLangCode=from_lang, toLangCode=to_lang)
        else:
            result = MTPythonSampleCode.translate(self.final_token, text)
    self.request.sendall(json.dumps(result, encoding="utf-8", ensure_ascii=False))
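The branches above dispatch on the request's query string; a sketch of the requests this handler would route, reconstructed from the code (paths and values are placeholders):

# GET /?days=2                          -> Fetcher.get_ranked_pages(2.0)
# GET /?category=news&sort=score        -> social_db.read_from_spikedate("news", "score")
# GET /?channel=sports                  -> get_channel_serving.sort_channel_by_field("sports", ...)
# GET /?translate=hello&from=en&to=fr   -> MTPythonSampleCode.translate(final_token, textToTranslate="hello",
#                                                                       fromLangCode="en", toLangCode="fr")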
def _process_text(parser, session, text, all_names, xform):
    """ Low-level utility function to parse text and return the result of
        a transformation function (xform) for each sentence.
        Set all_names = True to get a comprehensive name register.
        Set all_names = False to get a simple name register.
        Set all_names = None to get no name register. """
    t0 = time.time()
    # Demarcate paragraphs in the input
    text = Fetcher.mark_paragraphs(text)
    # Tokenize the result
    toklist = list(tokenize_and_recognize(text, enclosing_session=session))
    t1 = time.time()
    pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)
    if all_names is None:
        register = None
    else:
        from query import create_name_register
        register = create_name_register(toklist, session, all_names=all_names)
    t2 = time.time()
    stats["tok_time"] = t1 - t0
    stats["parse_time"] = t2 - t1
    stats["total_time"] = t2 - t0
    return (pgs, stats, register)
def on_search(self, w):
    mode = self.mode.get_active()
    txt = self.entry.get_text()
    self.musiclist.set_loading(False)
    self.musiclist.empty_message = "Searching..."
    self.musiclist.get_model().clear()
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    itemgen = None
    if mode == 0:
        itemgen = lambda: jamaendo.search_artists(query=txt)
    elif mode == 1:
        itemgen = lambda: jamaendo.search_albums(query=txt)
    elif mode == 2:
        itemgen = lambda: jamaendo.search_tracks(query=txt)
    else:
        return
    self.fetcher = Fetcher(itemgen, self,
                           on_item=self.on_add_result,
                           on_ok=self.on_add_complete,
                           on_fail=self.on_add_complete)
    self.fetcher.start()
class Ranking:

    def __init__(self, data_dir, seed_urls, similarity_method):
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        self.fetcher = Fetcher(data_dir)
        if similarity_method == "cosine":
            self.similarity = Cosine_Similarity()
        elif similarity_method == "jaccard":
            self.similarity = Jaccard_Similarity()
        else:
            self.similarity = None
        self.K = max(len(seed_urls) / 2, 10)
        self.host = set()
        self.update_seeds(seed_urls)

    def update_seeds(self, seed_urls):
        '''Update seed urls in the current seed list. Fetch the seed urls'''
        new_seed_urls = []
        for url in seed_urls:
            host = URLUtility.get_tld(url)
            if host not in self.host:
                self.host.add(host)
                new_seed_urls.append(url)
        urls, text = self.fetcher.fetch_urls(new_seed_urls)
        self.similarity.update_seeds(urls, text)
        self.K = max(len(self.similarity.seed_pages.keys()) / 2, 10)

    def rank(self, urls):
        '''Rank the urls with respect to the seeds.
        Return the sorted ranking scores together with the urls'''
        # Fetch the urls
        urls, text = self.fetcher.fetch_urls(urls)
        # Rank
        knn_scores = []
        scores = self.similarity.compute_similarity(text)
        for i in xrange(len(urls)):
            sorted_scores = sorted(scores[i], reverse=True)
            knn_score = sum(sorted_scores[:self.K]) / float(min(self.K, len(sorted_scores)))
            knn_scores.append((urls[i], knn_score))
        knn_scores = sorted(knn_scores, key=lambda x: x[1])
        return knn_scores
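A hedged usage sketch of the Ranking class above; the data directory, seed URLs, and candidate URLs are placeholders, and Fetcher, the similarity classes, and URLUtility are assumed to come from the surrounding project:

ranker = Ranking("data/", ["http://example.com/seed1", "http://example.org/seed2"], "cosine")
scored = ranker.rank(["http://example.net/page1", "http://example.net/page2"])
for url, knn_score in scored:
    print url, knn_score   # lowest-scoring candidates come first, given the ascending sort in rank()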
class RadiosWindow(hildon.StackableWindow):

    def __init__(self):
        hildon.StackableWindow.__init__(self)
        self.fetcher = None
        self.radios = {}
        self.set_title("Radios")
        self.connect('destroy', self.on_destroy)
        # Results list
        self.panarea = hildon.PannableArea()
        self.radiolist = RadioList()
        self.radiolist.connect('row-activated', self.row_activated)
        self.panarea.add(self.radiolist)
        self.add(self.panarea)
        self.start_radio_fetcher()

    def on_destroy(self, wnd):
        if self.fetcher:
            self.fetcher.stop()
            self.fetcher = None

    def row_activated(self, treeview, path, view_column):
        name, _id = self.radiolist.get_radio_id(path)
        wnd = open_playerwindow()
        wnd.play_radio(name, _id)

    def start_radio_fetcher(self):
        if self.fetcher:
            self.fetcher.stop()
            self.fetcher = None
        self.fetcher = Fetcher(jamaendo.starred_radios, self,
                               on_item=self.on_radio_result,
                               on_ok=self.on_radio_complete,
                               on_fail=self.on_radio_complete)
        self.fetcher.start()

    def on_radio_result(self, wnd, item):
        if wnd is self:
            self.radios[item.ID] = item
            self.radiolist.add_radios([item])

    def on_radio_complete(self, wnd, error=None):
        if wnd is self:
            self.fetcher.stop()
            self.fetcher = None
def __init__(self, host: str, path: str, timestamp=datetime.datetime.now(),
             spreadsheet_id=SPREADSHEET_ID):
    self.host = host
    self.path = path
    self.renderer = SheetsRenderer(spreadsheet_id)
    self.fetcher: Fetcher = Fetcher(s, self.host)
    self.listing_cache: ListingCache = ListingCache("/tmp/cache-%s" % host, self.fetcher)
    self.emailer = Emailer(self.host, "/tmp/email_log")
    self.timestamp = timestamp
def loadRoster(self):
    """
    Calls MLB.com servers to obtain the complete roster for the team.
    If the call fails, the '_error' property is set.
    """
    f = Fetcher(Fetcher.MLB_ROSTER_URL, team_id=self['team_id'])
    j = f.fetch()

    if 'roster_40' not in j:
        self._error = "ERROR on %s: key roster_40 not found (cannot load 40 man roster)" % f.url
        return False

    parent = j['roster_40']['queryResults']
    if parent['totalSize'] > 0:
        for record in parent['row']:
            player_id = record['player_id']
            self.roster[player_id] = player.Player(player_id)
def _crawl_url(self, url):
    fetcher = Fetcher()
    ret = yield fetcher.fetch(url)
    body = PQ(ret.body)
    products = body('a.product')
    data = []
    for product in products:
        foo = PQ(product)
        origin_price = re.findall(r'\$([\d\.]+)', foo('.discount').text())
        if origin_price:
            origin_price = origin_price[0]
        sales_price = foo('.price-6pm').text().replace('$', '').strip()
        if not origin_price and not sales_price:
            continue
        title = '[%s] %s' % (foo('.brandName').text(), foo('.productName').text())
        data.append({
            'image': foo('.productImg').attr('src'),
            'link': parse_url('http://www.6pm.com' + foo('a').attr('href')),
            'title': title,
            'original_price': origin_price or sales_price,
            'sales_price': sales_price
        })
    data = {
        'website': '6pm',
        'currency': 'USD',
        'country': 'USA',
        'store_id': self.store_id,
        'data': json.dumps(data)
    }
    data.update(self._extra_kwargs)
    q = yield fetcher.fetch(
        'http://127.0.0.1:8000/ezlookup/deal/?key=998998998',
        method="POST", data=data)
def main():
    fm = FileManager()
    fe = Fetcher()
    folder_dir = fm.get_folder_dir()
    df = [fe.statistics(), fe.coutries_data(), fe.countries_historical_data()]
    filenames = [
        'statistics',
        'all_country_data',
        'all_country_historical_data'
    ]
    for i in range(3):
        filename = '{}/{}.csv'.format(folder_dir, filenames[i])
        df[i].to_csv(filename, index=False)
        print('{} is created successfully.'.format(filename))
    if len(argv) > 1:
        country = argv[1]
        df = [fe.coutry_data(country), fe.country_historical_data(country)]
        filenames = [
            '{}__country_data'.format(country),
            '{}__country_historical_data'.format(country)
        ]
        for i in range(2):
            filename = '{}/{}.csv'.format(folder_dir, filenames[i])
            df[i].to_csv(filename, index=False)
            print('{} is created successfully.'.format(filename))
def _scrape_single_article(self, d):
    """ Single article scraper that will be called by a process within a
        multiprocessing pool """
    try:
        helper = Fetcher._get_helper(d.root)
        if helper:
            self.scrape_article(d.url, helper)
    except Exception as e:
        print("Exception when scraping article at {0}: {1!r}".format(d.url, e))
def start_track_fetcher(self):
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    self.fetcher = Fetcher(lambda: self.playlist, self,
                           on_item=self.on_track_result,
                           on_ok=self.on_track_complete,
                           on_fail=self.on_track_complete)
    self.fetcher.start()
def command_line_runner():
    args = docopt.docopt(__doc__, version=__version__)

    if args["--clear-cache"]:
        if caching._clear_cache():
            exit("Cache cleared successfully.")
        else:
            exit("Clearing cache failed.")

    if args["--max-number"]:
        try:
            args["--max-number"] = int(args["--max-number"])
        except ValueError:
            exit(_yellow("--max-number value should be a number!"))

    fetcher = Fetcher(args)
    selected_pkg = fetcher.user_confirm()
    PkgbuildReview(selected_pkg, args)
    DiffReview(selected_pkg, args)
def start_feature_fetcher(self):
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    self.fetcher = Fetcher(self.featurefn, self,
                           on_item=self.on_feature_result,
                           on_ok=self.on_feature_complete,
                           on_fail=self.on_feature_complete)
    self.fetcher.start()
def urls2fetch(self, root, helper):
    """ Returns a set of URLs to fetch. If the scraper helper class has
        associated RSS feed URLs, these are used to acquire article URLs.
        Otherwise, the URLs are found by scraping the root website and
        searching for links to subpages. """
    fetch_set = set()
    feeds = None if helper is None else helper.feeds
    if feeds:
        for feed_url in feeds:
            logging.info("Fetching feed {0}".format(feed_url))
            try:
                d = feedparser.parse(feed_url)
            except Exception as e:
                logging.warning(
                    "Error fetching/parsing feed {0}: {1}".format(feed_url, str(e)))
                continue
            for entry in d.entries:
                if entry.link and not helper.skip_rss_entry(entry):
                    fetch_set.add(entry.link)
    else:
        # Fetch the root URL and scrape all child URLs
        # that refer to the same domain suffix
        logging.info("Fetching root {0}".format(root.url))
        # Read the HTML document at the root URL
        html_doc = Fetcher.raw_fetch_url(root.url)
        if not html_doc:
            logging.warning("Unable to fetch root {0}".format(root.url))
            return fetch_set
        # Parse the HTML document
        soup = Fetcher.make_soup(html_doc)
        # Obtain the set of child URLs to fetch
        fetch_set = Fetcher.children(root, soup)
    return fetch_set
def __init__(self, master):
    self.master = master
    self.ram = None
    self.fetcher = Fetcher()
    master.title('BoostPack v2 Installer')
    master.geometry('500x500')
    master.iconbitmap('assets/output_onlinepngtools_ZGQ_icon.ico')

    img = PhotoImage(file='assets/install.png')
    self.install = Button(master, image=img, command=self.install, borderwidth=0)
    self.install.image = img
    self.install.pack()

    img = PhotoImage(file='assets/uninstall.png')
    self.uninstall = Button(master, image=img, command=self.uninstall, borderwidth=0)
    self.uninstall.image = img
    self.uninstall.pack()

    img = PhotoImage(file='assets/update.png')
    self.update = Button(master, image=img, command=self.update_patcher, borderwidth=0)
    self.update.image = img
    self.update.pack()

    self.txt = Label(master, text='Ram')
    self.txt.pack()
    self.ram = Scale(master, from_=1, to=self.fetcher.get_ram, orient=HORIZONTAL,
                     command=self.set_ram)
    self.ram.pack()
def _start_radio_fetcher(self):
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    self.fetcher = Fetcher(lambda: jamaendo.get_radio_tracks(self.playlist.radio_id), self,
                           on_item=self._on_radio_result,
                           on_ok=self._on_radio_complete,
                           on_fail=self._on_radio_complete)
    self.fetcher.has_no_results = True
    self.fetcher.start()
def start_track_fetcher(self):
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    self.fetcher = Fetcher(
        lambda: jamaendo.get_tracks(self.album.ID), self,
        on_item=self.on_track_result,
        on_ok=self.on_track_complete,
        on_fail=self.on_track_complete,
    )
    self.fetcher.start()
def __init__(self, directory, api_key, username, max_vids):
    self.API_KEY = api_key
    self.USERNAME = username
    self.SAVE_FILE_PATH = os.path.normpath(directory + 'ytsubs_' + self.USERNAME.lower() + '.json')
    self.fetcher = Fetcher(self.USERNAME, self.API_KEY)
    self.MAX_VIDEOS = max_vids
    self.watched = []
    self.additions = []
    self.raw_videos = []
    self.load()
def urls2fetch(self, root, helper):
    """ Returns a set of URLs to fetch. If the scraper helper class has
        associated RSS feed URLs, these are used to acquire article URLs.
        Otherwise, the URLs are found by scraping the root website and
        searching for links to subpages. """
    fetch_set = set()
    feeds = helper.feeds
    if feeds:
        for feed_url in feeds:
            logging.info("Fetching feed {0}".format(feed_url))
            try:
                d = feedparser.parse(feed_url)
            except Exception as e:
                logging.warning(
                    "Error fetching/parsing feed {0}: {1}".format(feed_url, str(e))
                )
                continue
            for entry in d.entries:
                if entry.link and not helper.skip_rss_entry(entry):
                    fetch_set.add(entry.link)
    else:
        # Fetch the root URL and scrape all child URLs
        # that refer to the same domain suffix
        logging.info("Fetching root {0}".format(root.url))
        # Read the HTML document at the root URL
        html_doc = Fetcher.raw_fetch_url(root.url)
        if not html_doc:
            logging.warning("Unable to fetch root {0}".format(root.url))
            return fetch_set
        # Parse the HTML document
        soup = Fetcher.make_soup(html_doc)
        # Obtain the set of child URLs to fetch
        fetch_set = Fetcher.children(root, soup)
    return fetch_set
def __init__(self, master=None):
    Frame.__init__(self, master)
    self.fetch = Fetcher()
    self.grid(sticky=N+S+E+W)
    self.init_frame()
    self.createSearchEntry()
    self.createInfCanvas()
    self.createInfList()
    self.createDetailCanvas()
    self.createImageLabel()
    self.createDetailLabels()
def __init__(self, year, month, day=None):
    """
    Constructor

    Arguments:
    year: The... year!
    month: The... month!
    day: The... day! (or None for all days of the month)

    Schedule is a standard dictionary: each day is a key in the format of
    'YYYY-MM-DD', each value a list of game dictionaries.
    """
    days = []
    if day is None:
        for d in xrange(1, calendar.mdays[month] + 1):
            days.append(datetime.date(year, month, d))
    else:
        days.append(datetime.date(year, month, day))

    for d in days:
        key = d.strftime("%Y-%m-%d")
        if key not in self.keys():
            self[key] = []

        f = Fetcher(Fetcher.MLB_SCHEDULE_URL, date=d.strftime("%Y%m%d"))
        try:
            content = f.fetch(True)
            if len(content) == 0:
                continue

            # The schedule endpoint returns JavaScript-ish data;
            # massage it into valid JSON before parsing.
            content = re.sub(r'\t+', '\t', content)
            content = content.replace('"', '\\"')
            content = content.replace("'", "\"")
            content = re.sub(r'\t([\w,_]+):\s', r'"\1":', content)

            obj = json.loads(content)
            self[key] = obj
        except ValueError, e:
            print "ERROR %s on %s" % (e, f.url)
            pass
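A hedged usage sketch, assuming the constructor above belongs to a dict subclass named Schedule as its docstring describes:

sched = Schedule(2014, 6)            # every day in June 2014
games = sched['2014-06-01']          # list of game dictionaries for that date
single_day = Schedule(2014, 6, 15)   # just one day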
def _scrape_single_article(self, d):
    """ Single article scraper that will be called by a process within a
        multiprocessing pool """
    try:
        helper = Fetcher._get_helper(d.root)
        if helper:
            self.scrape_article(d.url, helper)
    except Exception as e:
        logging.warning(
            "[{2}] Exception when scraping article at {0}: {1!r}".format(
                d.url, e, d.seq
            )
        )
def loadYearlies(self):
    """ Loads yearly and career totals for a player """
    if self['primary_position'] == 1:
        f = Fetcher(Fetcher.MLB_PITCHER_SUMMARY_URL, player_id=self.player_id)
    else:
        f = Fetcher(Fetcher.MLB_BATTER_SUMMARY_URL, player_id=self.player_id)

    j = f.fetch()

    # if the JSON object is empty, bail
    if len(j.keys()) == 0:
        return

    # get yearly totals
    if self['primary_position'] == 1:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_season']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_season']['queryResults']

    if parent['totalSize'] > 0:
        records = parent['row']

        # accounting for player with only one row
        if type(records) is dict:
            records = [records]

        for row in records:
            log = {}
            for key, value in row.iteritems():
                log[key] = value
            self.totals[row['season']] = log

    # get career totals
    if self['primary_position'] == 1:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_career']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_career']['queryResults']

    if parent['totalSize'] > 0:
        for key, value in parent['row'].iteritems():
            self.career[key] = value
def start_tag_fetcher(self, item_id):
    if self.fetcher:
        self.fetcher.stop()
        self.fetcher = None
    self.fetcher = Fetcher(lambda: jamaendo.get_tag_tracks(item_id), self,
                           on_item=self.on_tag_result,
                           on_ok=self.on_tag_complete,
                           on_fail=self.on_tag_complete)
    self.fetcher.taglist = []
    self.fetcher.start()
    banner = hildon.hildon_banner_show_information(self, '', "Getting tracks for tag")
    banner.set_timeout(2000)
def load(self, load_yearlies=False, id=None):
    """
    Calls MLB.com server and loads player information. If the call fails,
    the '_error' property is set.

    Arguments:
    id : The MLB.com player ID
    """
    if id is not None:
        self.player_id = id
    elif self.player_id is None:
        raise Exception('No player_id specified')
    self['player_id'] = self.player_id

    f = Fetcher(Fetcher.MLB_PLAYER_URL, player_id=self.player_id)
    j = f.fetch()

    try:
        records = j['player_info']['queryResults']['totalSize']
    except KeyError, e:
        msg = 'ERROR on %s: totalSize not returned for call' % f.url
        self._error = msg
        logger.error(msg)
        return False
def begin(self, url_list, within_domain=False, download_location=False, depth=False):
    self.unvisited_urls = url_list
    self.visited_urls = list(url_list)
    self.configure()
    cleaned_urls = []
    fetcher = Fetcher()
    while True:
        print '#unvisited urls ', len(self.unvisited_urls)
        for url in self.unvisited_urls:
            print 'crawling ', url
            domain_url = self.domain(url)
            html_content = self.fetch_content(url)
            soup = self.make_clean_soup(html_content)
            out_links = self.get_all_links(soup)
            if download_location:
                fetcher.fetch_soup_attributes(download_location, url, soup)
            cleaned_urls.extend(self.clean_urls(out_links, domain_url, within_domain))
        self.unvisited_urls = self.not_visited_urls(cleaned_urls)
        self.visited_urls.extend(self.unvisited_urls)
        if depth:
            if depth == 1:
                break
            else:
                depth -= 1