import re

from bs4 import BeautifulSoup as bs4
from requests import Request, Session
from requests.exceptions import RequestException


class Preview(Base):
    """Represents the visualization page where one can get the update date
    without needing captchas."""

    url = ('http://buscatextual.cnpq.br/buscatextual'
           '/preview.do?metodo=apresentar&id=')
    logger = BaseLogger.from_file(Base.base_name + 'Preview', logger_file)

    @classmethod
    def date(cls, short_id, user_agent=None):
        """Given a short id, open the Preview page and retrieve the date
        when the curriculum was last updated.

        Returns a string with the last updated date, or False if the date
        could not be retrieved.

        @param short_id: 10-character string that represents a curriculum
                         id for this webpage
        @type short_id: str
        @param user_agent: Optional param in order to set requests.Session's
                           user agent. If None is given, the default one
                           will be used, which identifies python's requests
                           library.
        @type user_agent: str
        """
        short_id = cls.check_param(short_id, '^[A-Z0-9]{10}$')
        url = cls.url + short_id
        request = Request('GET', url)
        response = False
        tries = 0
        cls.logger.info('Getting upDATE for {}'.format(short_id))
        # Retry on connection errors only, up to cls.max_tries times.
        while not response and tries < cls.max_tries:
            tries += 1
            cls.logger.info('Try: {}'.format(tries))
            with Session() as session:
                if user_agent:
                    session.headers['User-Agent'] = user_agent
                try:
                    prepped = session.prepare_request(request)
                    response = session.send(prepped, timeout=cls.timeout)
                except RequestException as e:
                    cls.logger.info('Error: {}'.format(e))
                    continue
                if response.status_code == 200:
                    # The date is rendered as dd/mm/yyyy inside the page's
                    # first span.
                    regex = re.compile(r'(\d{2}/){2}\d{4}')
                    soup = bs4(response.text, 'html.parser')
                    match = regex.search(soup.span.text) if soup.span else None
                    if match:
                        date_text = match.group()
                        cls.logger.info(
                            'Response 200: date: {}'.format(date_text))
                        return date_text
                cls.logger.info('Could not fetch Preview page.')
                return False
        return False
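
# A minimal usage sketch for Preview.date, assuming this module is importable
# as lattes.pages (as the client scripts below do). The short id is borrowed
# from the example data in better_cli and may not be a live record.
if __name__ == '__main__':
    last_update = Preview.date('K8185478E7')
    if last_update:
        print('Curriculum last updated on {}'.format(last_update))
    else:
        print('Could not retrieve the update date.')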
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from multiprocessing import Process

from lattes.pages import Curriculum, Xml
from lattes.config import BaseLogger

logger = BaseLogger.from_file('simple_client', file_name='simple_client.log')
ZIP_PATH = '/Users/josefson/Workspace/Python/cnpq/xmls'


def single_cored_example(short_ids):
    worker(short_ids)


def multi_cored_example(short_ids):
    """Simple enough multicore example."""
    chunk = 1
    short_ids = [short_ids[x:x + chunk]
                 for x in range(0, len(short_ids), chunk)]
    logger.info('Spawning processes')
    processes = []
    for split_list in short_ids:
        p = Process(target=worker, args=(split_list,))
        p.start()
        processes.append(p)
    # Wait for every worker process to finish before returning.
    for p in processes:
        p.join()


def worker(short_ids):
    """Run through a list of short_ids downloading their respective xmls."""
    for short_id in short_ids:
        logger.info('Getting curriculum for {}'.format(short_id))
        curriculum = Curriculum(short_id)
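
# Hedged usage sketch: runs the multicore example over ids taken from the
# better_cli script below. The __main__ guard matters here because
# multiprocessing re-imports this module in child processes on platforms
# that spawn (macOS/Windows); without it, each child would respawn workers.
if __name__ == '__main__':
    example_ids = ['K8185478E7', 'K4246690H2', 'K4138636E6']
    multi_cored_example(example_ids)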
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lattes.pages import Curriculum, Xml, Preview
from lattes.config import BaseLogger

logger = BaseLogger.from_file('better_cli', 'better_cli.log')
ZIP_PATH = '/home/elodin/Workspace/Python/cnpq/xmls'


def setup():
    """This function is just an example of how you could relate the data
    you already have together (short_id, long_id, update_date). The real
    benefit is in the main() function. Read its doc."""
    # Ids we already have from previous scraping
    short_ids = ['K8185478E7', 'K4246690H2', 'K4138636E6']
    long_ids = ['6380212729787758', '7639569152487589', '1024601314143406']
    updated_on = ['01/01/1995', None, '24/11/2009']
    # Recently scraped short_ids
    scraped_short_ids = ['K8185478E7', 'K4246690H2', 'K4138636E6',
                         'K4138281J4', 'K4130978D4', 'K4133929U0']

    class MyData:
        """Simple DataClass just to group the data I already have together."""

        def __init__(self, short_id, long_id=None, date=None):
            self.short_id = short_id
            self.long_id = long_id
            self.date = date

    # Dictionary/hashtable for instances of MyData where short_id is the key
    # (assumed completion: the original is cut off here, but the comment
    # above makes the intent clear).
    data = {}
    for short_id, long_id, date in zip(short_ids, long_ids, updated_on):
        data[short_id] = MyData(short_id, long_id, date)
    return data, scraped_short_ids
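
# Hypothetical sketch of the main() that setup()'s docstring points to; the
# original file is truncated before main(), so everything below is an
# assumption built on Preview.date and the data returned by setup(). The
# idea: only hit the captcha-protected Curriculum page for records whose
# update date actually changed.
def main():
    data, scraped_short_ids = setup()
    for short_id in scraped_short_ids:
        current_date = Preview.date(short_id)
        known = data.get(short_id)
        if known and known.date and known.date == current_date:
            logger.info('{} unchanged since {}, skipping.'.format(
                short_id, known.date))
            continue
        logger.info('{} is new or was updated, fetching.'.format(short_id))
        Curriculum(short_id)


if __name__ == '__main__':
    main()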