Example #1
    def crawl_category(self):
        fetcher = Fetcher()
        kk = yield fetcher.fetch(
            "http://www.carters.com/%s?startRow=0&sz=all" % self.slug)
        page = kk.body

        self._process(page)
Example #2
File: player.py Project: therg/py-mlb
    def loadGamelogs(self, year=None):
        """
        Loads gamelogs for the player for a given year

        Arguments:
        year : The season desired. Defaults to the current year if not specified
        """
        if year is None: year = datetime.datetime.now().year
        if year not in self.logs: self.logs[year] = []

        if 'primary_position' not in self:
            logger.error("no primary position attribute for " % self)
            return False

        url = Fetcher.MLB_PITCHER_URL if self['primary_position'] == 1 else Fetcher.MLB_BATTER_URL

        f = Fetcher(url, player_id=self.player_id, year=year)
        j = f.fetch()

        try:
            if self['primary_position'] == 1:
                parent = j['mlb_bio_pitching_last_10']['mlb_individual_pitching_game_log']['queryResults']
            else:
                if 'mlb_individual_hitting_last_x_total' in j:
                    parent = j['mlb_individual_hitting_last_x_total']['mlb_individual_hitting_game_log']['queryResults']
                else:
                    parent = j['mlb_bio_hitting_last_10']['mlb_individual_hitting_game_log']['queryResults']
        except KeyError, e:
            logger.error('no key for gamelogs found in %s' % f.url)
            return False
Example #3
	def __init__(self, year, month, day = None):
		"""
		Constructor
		
		Arguments:
		year: The... year!
		month: The... month!
		day: The... day! (or None for all days of the month)
		"""
		days = []
		
		if day is None:
			for d in xrange(1, calendar.mdays[month] + 1):
				days.append(datetime.date(year, month, d))
		else:
			days.append(datetime.date(year, month, day))

		begin = days[0]
		end = days[-1]

		f = Fetcher(Fetcher.MLB_TRANSACTION_URL, start=begin.strftime("%Y%m%d"), end=end.strftime("%Y%m%d"))
		try:
		 	obj = f.fetch()
			if obj['transaction_all']['queryResults']['totalSize'] == 0: return
			results = obj['transaction_all']['queryResults']['row']
			
			if type(results) is dict:
				self.append(results)
			else:
				for row in results:
					self.append(row)
		except (ValueError, KeyError), e:
			logger.error("ERROR %s on %s" % (e, f.url))
			pass
Example #4
    def run(self):
        """The starting point of a thread"""
        # print("Thread " + str(self.thread_id) + " started")
        while True:
            # print("get next URL")
            self.dash.print_cur_stat("Next_Url___", self.thread_id)
            value, current_url, current_dns = self.frontier.get_url(self.thread_id)
            if not current_url:
                # print("Empty Queue from thread " + str(self.thread_id))
                self.dash.print_cur_stat("Empty_Queue", self.thread_id)
                continue
            self.dash.print_cur_stat("Downloading", self.thread_id)
            code, links, content = Fetcher.fetch(current_url)
            if code == -1:
                # print("Refused from thread " + str(self.thread_id))
                self.dash.print_cur_stat("Refused_Url", self.thread_id)
                self.refused += 1
                self.dash.print_refused(str(self.refused), self.thread_id)
                continue
            self.dash.print_cur_stat("Valid_Url__", self.thread_id)
            # Crawling this link succeeded
            # print("URL got from thread " + str(self.thread_id))
            out_links = len(links)
            sz_parent = len(content)
            links_mod = []
            for i in range(len(links)):
                links_mod.append((links[i][0], links[i][1], (out_links, sz_parent, len(links[i][0]), value)))

            self.dash.print_cur_stat("URL_Fetched", self.thread_id)
            self.crawled += 1
            self.dash.print_crawled(str(self.crawled), self.thread_id)
            # print("URL fetched from thread " + str(self.thread_id))
            self.frontier.push_to_serve(links_mod, self.thread_id)
            Storage.cache_crawled_url(current_url, current_dns, content, self.thread_id)
Example #5
 def _get_category_page(self):
     fetcher = Fetcher()
     ret = yield fetcher.fetch('http://www.6pm.com/%s' % self.slug)
     body = PQ(ret.body)
     foo = body('.last a')[0].get('href')
     max_page = int(re.findall('-page(\d+)', foo)[0])
     for i in range(max_page):
         self._crawl_category_page(i)
Example #6
    def _crawl_url(self, url):
        fetcher = Fetcher()
        ret = yield fetcher.fetch(url)
        body = PQ(ret.body)
        products = body('a.product')

        data = []
        for product in products:
            foo = PQ(product)
            origin_price = re.findall('\$([\d\.]+)', foo('.discount').text())
            if origin_price:
                origin_price = origin_price[0]
            sales_price = foo('.price-6pm').text().replace('$', '').strip()

            if not origin_price and not sales_price:
                continue
            title = '[%s] %s' % (foo('.brandName').text(),
                                 foo('.productName').text())

            data.append({
                'image': foo('.productImg').attr('src'),
                'link': parse_url('http://www.6pm.com' + foo('a').attr('href')),
                'title': title,
                'original_price': origin_price or sales_price,
                'sales_price': sales_price
            })

        data = {
            'website': '6pm',
            'currency': 'USD',
            'country': 'USA',
            'store_id': self.store_id,
            'data': json.dumps(data)
        }
        data.update(self._extra_kwargs)

        q = yield fetcher.fetch(
            'http://127.0.0.1:8000/ezlookup/deal/?key=998998998',
            method="POST",
            data=data)
Example #7
File: player.py Project: therg/py-mlb
    def loadYearlies(self):
        """
        Loads yearly and career totals for a player
        """
        if self['primary_position'] == 1 and not self.force_batting:
            f = Fetcher(Fetcher.MLB_PITCHER_SUMMARY_URL, player_id=self.player_id)
        else:
            f = Fetcher(Fetcher.MLB_BATTER_SUMMARY_URL, player_id=self.player_id)

        j = f.fetch()

        # if the JSON object is empty, bail
        if len(j.keys()) == 0:
            return

        # get yearly totals
        if self['primary_position'] == 1 and not self.force_batting:
            parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_season']['queryResults']
        else:
            parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_season']['queryResults']

        if parent['totalSize'] > 0:
            records = parent['row']

            # accounting for player with only one row
            if type(records) is dict:
                records = [records]

            for row in records:
                log = {}
                for key, value in row.iteritems():
                    log[key] = value

                # handle each season as a list, so
                # players with multiple team seasons
                # get each team for that year
                # accounted for
                if row['season'] in self.totals:
                    self.totals[row['season']].append(log)
                else:
                    self.totals[row['season']] = [log]

        # get career totals
        if self['primary_position'] == 1 and not self.force_batting:
            parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_career']['queryResults']
        else:
            parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_career']['queryResults']

        if parent['totalSize'] > 0:
            for key, value in parent['row'].iteritems():
                self.career[key] = value
Example #8
File: league.py Project: Mpellet771/py-mlb
	def load(self, loadRosters = False):
		"""
		Calls MLB.com server and loads all team information

		Arguments:
		loadRosters : If true, rosters will automatically be loaded (more HTTP requests!)
		"""
		f = Fetcher(Fetcher.MLB_LEAGUE_URL)

		for item in f.fetch():
			t = team.Team(item)
			if loadRosters:
				t.loadRoster()
			self.teams[t['team_code']] = t
Example #9
    def load(self, loadRosters=False):
        """
		Calls MLB.com server and loads all team information

		Arguments:
		loadRosters : If true, rosters will automatically be loaded (more HTTP requests!)
		"""
        f = Fetcher(Fetcher.MLB_LEAGUE_URL)

        for item in f.fetch():
            t = team.Team(item)
            if loadRosters:
                t.loadRoster()
            self.teams[t['team_code']] = t
Example #10
 def run(self):
     """The starting point of a thread"""
     print("Thread " + str(self.thread_id) + " started")
     while True:
         uid, current_url = self.frontier.get_url(self.thread_id)
         if not current_url:
             break
         print("thread " + str(self.thread_id) + " Got a URL")
         code, _, content = Fetcher.fetch(current_url)
         if code == -1:
             print("Unable to fetch link from thread " + str(self.thread_id))
             DBDeleteCrawled(0, 'cache_crawled', self.thread_id, self.name, 0, uid).start()
             continue
         print("URL Refreshed from thread " + str(self.thread_id))
         DBCacheCrawledRevisit(0, 'cache_crawled', self.thread_id, self.name, 0, uid, content).start()
     print(self.name + " has finished revisiting")
Example #11
 def loadRoster(self):
     """
     Calls MLB.com servers to obtain the complete roster for the team. If call fails, '_error' property is set.
     """
     f = Fetcher(Fetcher.MLB_ROSTER_URL, team_id=self['team_id'])
     j = f.fetch()
     
     if 'roster_40' not in j:
         self._error = "ERROR on %s: key roster_40 not found (cannot load 40 man roster)" % (f.url)            
         return False
     
     parent = j['roster_40']['queryResults']
     
     if parent['totalSize'] > 0:
         for record in parent['row']:
             player_id = record['player_id']
             self.roster[player_id] = player.Player(player_id)
Example #12
    def loadRoster(self):
        """
		Calls MLB.com servers to obtain the complete roster for the team. If call fails, '_error' property is set.
		"""
        f = Fetcher(Fetcher.MLB_ROSTER_URL, team_id=self['team_id'])
        j = f.fetch()

        if 'roster_40' not in j:
            self._error = "ERROR on %s: key roster_40 not found (cannot load 40 man roster)" % (
                f.url)
            return False

        parent = j['roster_40']['queryResults']

        if parent['totalSize'] > 0:
            for record in parent['row']:
                player_id = record['player_id']
                self.roster[player_id] = player.Player(player_id)
Example #13
class BaseTask(object):
    __metaclass__ = TaskMeta

    def __init__(self):
        self.fetcher = Fetcher()

    def on_start(self):
        raise NotImplementedError

    @gen.coroutine
    def fetch(self, url, next=None, args=(), data_type="html", **kwargs):
        '''
        :param url: URL to fetch
        :param next: callback function
        :param args: arguments passed to the callback function after the data;
                     ignored if no callback function is specified.
        :param data_type: html/json/xml
        :return:
        '''
        ret = yield self.fetcher.fetch(url, **kwargs)

        if next:
            # Use callback
            next(Response(ret, data_type), *args)  # Wrap the result in a Response object
        else:
            # Use yield
            raise gen.Return(Response(ret, data_type))

    @gen.coroutine
    def save(self, data):
        '''Save to MongoDB; override this method to change the save behavior.
        :param data:
        :return:
        '''
        if not isinstance(data, dict):
            raise TypeError('data must be instance of dict')

        # add time stamp
        data.update({'_timestamp': datetime.datetime.utcnow()})

        result = yield self._db.insert(data)
        self._logger.info("Saved: " + str(result))
Example #14
def begin():
    try:
        f = Fetcher()
        fetch_result = f.fetch(kjxjr=False)
        p = Parser()
        parse_result = p.parse()
        print(parse_result)

        db = DB()
        db.insert(parse_result)
        r = Report()
        today = arrow.now()
        if today.format('dddd') == 'Friday':
            to_addr = [('*****@*****.**', '张磊'), ('*****@*****.**', '张永泉')]
            r.send_report(to_addr=to_addr)
        else:
            to_addr = [('*****@*****.**', '张磊'), ]
            r.send_report(to_addr=to_addr)
    except Exception as e:
        print(e)
Example #15
File: schedule.py Project: therg/py-mlb
    def __init__(self, year, month, day=None):
        """
        Constructor
        
        Arguments:
        year: The... year!
        month: The... month!
        day: The... day! (or None for all days of the month)
        
        Schedule is a standard dictionary: each day is a key in the format of 'YYYY-MM-DD', each value
        a list of game dictionaries.
        """
        days = []
        
        if day is None:
            for d in xrange(1, calendar.mdays[month] + 1):
                days.append(datetime.date(year, month, d))
        else:
            days.append(datetime.date(year, month, day))

        for d in days:
            key = d.strftime("%Y-%m-%d")
            if key not in self.keys():
                self[key] = []

            f = Fetcher(Fetcher.MLB_SCHEDULE_URL, date=d.strftime("%Y%m%d"))
            try:
                content = f.fetch(True)
                if len(content) == 0:
                    continue
                content = re.sub(r'\t+', '\t', content)
                content = content.replace('"', '\\"')
                content = content.replace("'", "\"")
                content = re.sub(r'\t([\w,_]+):\s', r'"\1":', content)
                obj = json.loads(content)
                self[key] = obj
            except ValueError, e:
                print "ERROR %s on %s" % (e, f.url)
                pass
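The try block above turns MLB.com's JavaScript-object-literal schedule payload into valid JSON: runs of tabs are collapsed, existing double quotes are escaped, single quotes become double quotes, and bare keys are quoted before json.loads is called. A toy illustration of those same substitutions, run on an invented sample string that only mimics the payload format:

import json
import re

content = "{\n\thome_team: 'SEA',\n\taway_team: 'ANA',\n\tgame_id: '2013/04/01/anamlb-seamlb-1'\n}"

content = re.sub(r'\t+', '\t', content)                  # collapse runs of tabs
content = content.replace('"', '\\"')                    # escape existing double quotes
content = content.replace("'", "\"")                     # single-quoted values -> JSON strings
content = re.sub(r'\t([\w,_]+):\s', r'"\1":', content)   # quote the bare keys
print(json.loads(content)['home_team'])                  # SEA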
Example #16
File: player.py Project: Mpellet771/py-mlb
	def loadYearlies(self):
		"""
		Loads yearly and career totals for a player
		"""
		if self['primary_position'] == 1:
			f = Fetcher(Fetcher.MLB_PITCHER_SUMMARY_URL, player_id=self.player_id)
		else:
			f = Fetcher(Fetcher.MLB_BATTER_SUMMARY_URL, player_id=self.player_id)
		
		j = f.fetch()
		
		# if the JSON object is empty, bail
		if len(j.keys()) == 0: return
		
		# get yearly totals
		if self['primary_position'] == 1:
			parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_season']['queryResults']
		else:
			parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_season']['queryResults']
		
		if parent['totalSize'] > 0:
			records = parent['row']

			# accounting for player with only one row
			if type(records) is dict: records = [records]
			
			for row in records:
				log = {}
				for key, value in row.iteritems(): log[key] = value
				self.totals[row['season']] = log
			
		# get career totals
		if self['primary_position'] == 1:
			parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_career']['queryResults']
		else:
			parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_career']['queryResults']
			
		if parent['totalSize'] > 0:
			for key, value in parent['row'].iteritems(): self.career[key] = value
Example #17
    def __init__(self, year, month, day=None):
        """
		Constructor
		
		Arguments:
		year: The... year!
		month: The... month!
		day: The... day! (or None for all days of the month)
		
		Schedule is a standard dictionary: each day is a key in the format of 'YYYY-MM-DD', each value
		a list of game dictionaries.
		"""
        days = []

        if day is None:
            for d in xrange(1, calendar.mdays[month] + 1):
                days.append(datetime.date(year, month, d))
        else:
            days.append(datetime.date(year, month, day))

        for d in days:
            key = d.strftime("%Y-%m-%d")
            if key not in self.keys():
                self[key] = []

            f = Fetcher(Fetcher.MLB_SCHEDULE_URL, date=d.strftime("%Y%m%d"))
            try:
                content = f.fetch(True)
                if len(content) == 0: continue
                content = re.sub(r'\t+', '\t', content)
                content = content.replace('"', '\\"')
                content = content.replace("'", "\"")
                content = re.sub(r'\t([\w,_]+):\s', r'"\1":', content)
                obj = json.loads(content)
                self[key] = obj
            except ValueError, e:
                print "ERROR %s on %s" % (e, f.url)
                pass
Example #18
File: player.py Project: therg/py-mlb
    def load(self, load_yearlies=False, id=None):
        """
        Calls MLB.com server and loads player information. If call fails, '_error' property is set.

        Arguments:
        id : The MLB.com player ID
        """
        if id is None and self.player_id is not None:
            id = self.player_id
            self['player_id'] = self.player_id
        else:
            raise Exception('No player_id specified')

        f = Fetcher(Fetcher.MLB_PLAYER_URL, player_id=self.player_id)
        j = f.fetch()

        try:
            records = j['player_info']['queryResults']['totalSize']
        except KeyError, e:
            msg = 'ERROR on %s: totalSize not returned for call' % f.url
            self._error = msg
            logger.error(msg)
            return False
Example #19
    def __init__(self, year, month, day=None):
        """
		Constructor
		
		Arguments:
		year: The... year!
		month: The... month!
		day: The... day! (or None for all days of the month)
		"""
        days = []

        if day is None:
            for d in xrange(1, calendar.mdays[month] + 1):
                days.append(datetime.date(year, month, d))
        else:
            days.append(datetime.date(year, month, day))

        begin = days[0]
        end = days[-1]

        f = Fetcher(Fetcher.MLB_TRANSACTION_URL,
                    start=begin.strftime("%Y%m%d"),
                    end=end.strftime("%Y%m%d"))
        try:
            obj = f.fetch()
            if obj['transaction_all']['queryResults']['totalSize'] == 0: return
            results = obj['transaction_all']['queryResults']['row']

            if type(results) is dict:
                self.append(results)
            else:
                for row in results:
                    self.append(row)
        except (ValueError, KeyError), e:
            logger.error("ERROR %s on %s" % (e, f.url))
            pass
Example #20
from fetcher import Fetcher
import sys

num = int( sys.argv[1] )
code = Fetcher.fetch( num )

print "----------------------------------------------------"
print 
print
print code
print "----------------------------------------------------"
Example #21
class Crawler:
    """ Scans through websites via HTTP GET requests and recursion.  """

    MAX_DEPTH = 10

    def __init__(self):
        self.fetcher = Fetcher()
        self.tree = Tree()
        self.documents = []
        self.invalid_documents = []

    def get_document_by_guid(self, guid):
        for document in self.documents:
            if document.guid == guid:
                return document

    def crawl(self, url, current_depth=0):
        if self.fetcher.url_valid(url):
            try:
                html = self.fetcher.fetch(url)
                parser = Parser(url, html)
                parser.parse()

                self.documents.append(parser.document)
                for token in parser.document.tokens:
                    self.tree.add(token.text, token)

                print(colored('Document crawled successfully.', 'green'))
                print('URL: %s' % (parser.document.url))
                print('Tokens: %s' % (len(parser.document.tokens)))
                print('Links: %s' % (len(parser.document.links)))
                print('\n')

                # the depth increases with each subsequent crawl call
                if current_depth <= Crawler.MAX_DEPTH:
                    for link in parser.document.links:
                        document = Document(link)
                        if document not in self.documents and document not in self.invalid_documents:
                            # NOTE: this does not fully guarantee that the same document will not be crawled again,
                            #       as many different hostnames can resolve to the same page...
                            self.crawl(link, current_depth + 1)

                            # if crawling has been successful, then append the link as crawled
                            parser.document.crawled_links.append(link)
            except (MissingSchema, InvalidSchema):
                self.handle_invalid_url(url, 'Invalid URL specified.')
            except NotFoundException:
                self.handle_invalid_url(url, 'Document not found.')
            except ConnectionError:
                self.handle_invalid_url(url, 'Error while crawling document.')
            except (SyntaxException, RecursionError):
                self.handle_invalid_url(url, 'Error while parsing document.')
            except TimeoutException:
                self.handle_invalid_url(url,
                                        'Document took too long to process.')
        else:
            self.handle_invalid_url(url, 'Invalid URL specified.')

    def handle_invalid_url(self, url, message):
        document = Document(url)
        document.valid = False

        if document not in self.invalid_documents:
            self.invalid_documents.append(document)

        print(colored(message, 'red'))
        print('URL: %s' % (url))
        print('\n')
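For comparison, here is a compact, self-contained sketch of the same crawling idea (a depth limit plus a visited set to avoid re-crawling) using requests and a crude regex link extractor; it is illustrative only and not a drop-in replacement for the Crawler/Parser classes above.

import re
import requests

MAX_DEPTH = 2
visited = set()


def crawl(url, depth=0):
    if url in visited or depth > MAX_DEPTH:
        return
    visited.add(url)
    try:
        html = requests.get(url, timeout=5).text
    except requests.RequestException:
        print('failed: %s' % url)
        return
    print('crawled: %s' % url)
    for link in re.findall(r'href="(https?://[^"]+)"', html):
        crawl(link, depth + 1)


crawl('http://example.com/')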
Example #22
class Coffer:
    def __init__(self,config_parser):
        # Connect to engine
        database_path  = get_from_config_parser(config_parser,'Database','path','database')
        database_debug = get_boolean_from_config_parser(config_parser,'Database','debug',False)
        dir = os.path.dirname(database_path)
        if not os.path.exists(dir):
            mkdir(dir)
        sys.stderr.write('Connecting to database at "%s"\n' % database_path)
        self._engine = create_engine('sqlite:///%s' % database_path,echo=database_debug)

        # Start session
        Session = sessionmaker(bind=self._engine)
        self._session = Session()
        # Initialize feed storage
        self._feed_storage = FeedStorage(self._engine,self._session)
        # Initialize item storage
        self._item_storage = ItemStorage(self._engine,self._session)
        # A list of subprocess.Popen processes that will be maintained
        # by the Coffer object.
        self._external_processes = []
        # File storage (data dump)
        file_storage_path = get_from_config_parser(config_parser,'FileStorage','path','datadump')
        max_block_size    = get_int_from_config_parser(config_parser,'FileStorage','max-block-size',
                                                       file_storage.DEFAULT_MAX_BLOCK_SIZE)
        bzip2_path = get_from_config_parser(config_parser,'FileStorage','bzip2-path','/usr/bin/bzip2')
        self._file_storage = FileStorage(self._external_processes,file_storage_path,
                                         max_block_size,bzip2_path)
        # Content fetcher configuration
        self._fetcher = Fetcher(config_parser)

    def clone_db_session(self):
        clone_session = sessionmaker(bind=self._engine)
        return clone_session()

    def finish(self):
        '''
        Waits for all external processes started by coffer to finish.
        '''
        sys.stderr.write('Waiting for sub-processes to finish..\n')
        for process in self._external_processes:
            process.wait()
        sys.stderr.write('  ..finished.\n\n')

    def check_processes(self):
        '''
        Checks if some of the external processes have finished and
        removes them from the external-process list if they have.
        '''
        end_i = len(self._external_processes)
        i     = 0
        while i < end_i:
            if self._external_processes[i].poll() is not None:
                del self._external_processes[i]
                end_i -= 1
            else:
                i += 1

    def run_command_shell(self):
        shell = CommandShell(self)
        shell.cmdloop()

    def get_feed_info(self,url):
        '''
        Obtain information on an RSS feed, given its URL. The
        information will be obtained directly from the URL,
        not from our database. This works for feeds regardless
        of whether they are stored in our database.
        '''
        feed_results = feedparser.parse(url)
        sys.stderr.write(str(feed_results))
        if 'title' in feed_results.feed:
            return feed_results.feed.title
        else:
            return None

    def current_items_feed(self,
                           session,
                           feed,
                           enable_ad_filter = False,
                           check_existence  = False,
                           debug_enabled    = False):
        '''
        Returns a generator for the list of current items, i.e. the
        current list of fresh items returned by all known feeds.
        @param enable_ad_filter: if True, advertisements will be filtered out
                       using the predefined regex
        @param check_existence: if True, only entries that are not already
                       stored in the items database will be returned.
        '''
        if enable_ad_filter and len(feed.ad_filters) > 0:
            exclude_pattern = re.compile(u'|'.join(feed.ad_filters))
        feed_results = feedparser.parse(feed.get_url())
        for entry in feed_results.entries:
            if 'link' not in entry.keys():
                sys.stderr.write((u'No link found in this item: "%s"\n' \
                                  % entry.title).encode('utf-8'))
                if debug_enabled:
                    sys.stderr.write('Keys:\n%s\n' % str(entry.keys()))
                continue
            if 'id' not in entry.keys():
                if debug_enabled:
                    sys.stderr.write((u'No entry id found in this item: "%s"\n' \
                                      % entry.title).encode('utf-8'))
                entry_id = entry.link
                if debug_enabled:
                    sys.stderr.write('Keys:\n%s\n' % str(entry.keys()))
                    sys.stderr.write((u'Using link [%s] instead of id.\n' \
                                      % entry_id).encode('utf-8'))
            else:
                entry_id = entry.id
            if check_existence:
                if self._item_storage.exists_in_session(session,entry_id):
                    continue
            if (not enable_ad_filter) or (len(feed.ad_filters) == 0) \
                   or (not exclude_pattern.search(entry.title)):
                yield (feed.get_id(),entry_id,entry)

    def current_items_in_session(self,
                                 session,
                                 enable_ad_filter = False,
                                 check_existence  = False,
                                 debug_enabled    = False):
        '''
        Returns a generator for the list of current items, i.e. the
        current list of fresh items returned by all known feeds.
        @param enable_ad_filter: if True, advertisements will be filtered out
                       using the predefined regex
        @param check_existence: if True, only entries that are not already
                       stored in the items database will be returned.
        '''
        for feed in self._feed_storage.feeds():
            for item in self.current_items_feed(session,feed,enable_ad_filter,check_existence,debug_enabled):
                yield item

    def current_items(self,
                      enable_ad_filter = False,
                      check_existence  = False,
                      debug_enabled    = False):
        '''
        Returns a generator for the list of current items, i.e. the
        current list of fresh items returned by all known feeds.
        @param enable_ad_filter: if True, advertisements will be filtered out
                       using the predefined regex
        @param check_existence: if True, only entries that are not already
                       stored in the items database will be returned.
        '''
        for feed in self._feed_storage.feeds():
            for item in self.current_items_feed(self._session,feed,enable_ad_filter,check_existence,debug_enabled):
                yield item

    def fetch_and_store(self,targets):
        '''
        Download target URLs and store them in the file storage.
        @param targets: A list of (feed-id,URL) pairs.
        '''
        text_objs_dict = self._fetcher.fetch(targets)
        self._file_storage.store_all(text_objs_dict)
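A small sketch of the ad-filter check used by current_items_feed() above: the feed's filter fragments (hypothetical values here) are OR-ed into a single regular expression, and any entry whose title matches is skipped.

import re

ad_filters = [u'Sponsored', u'Advertisement']   # hypothetical filter fragments
exclude_pattern = re.compile(u'|'.join(ad_filters))

titles = [u'Release notes for 1.2', u'Sponsored: try our new product']
for title in titles:
    if exclude_pattern.search(title):
        continue                                # treated as an advertisement
    print(title)                                # only the first title survives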
Example #23
from fetcher import Fetcher

num = 1

while True:
    x = raw_input( ">> " )
    if x == "q":
        submission = Fetcher.fetch( num )
        if submission:
            num += 1
        else:
            print "No new submission"

Example #24
 def _save(self, data):
     fetcher = Fetcher()
     q = yield fetcher.fetch(
         'http://127.0.0.1:8000/ezlookup/deal/?key=998998998',
         method="POST",
         data=data)