示例#1
0
class Preview(Base):
    """Represents the visualization page where one can get the update date
    without needing captchas."""

    url = ('http://buscatextual.cnpq.br/buscatextual'
           '/preview.do?metodo=apresentar&id=')
    logger = BaseLogger.from_file(Base.base_name + 'Preview', logger_file)

    @classmethod
    def date(cls, short_id, user_agent=None):
        """Given a short id, open the Preview page and retrieve the date when
        the curriculum was last updated.

        At most cls.max_tries requests are attempted; a request that raises
        or answers with a status other than 200 consumes one try.

        Returns String with the last updated date (dd/mm/yyyy) or False if
        the date could not be retrieved, either because every try failed or
        because the fetched page holds no date.

        @param short_id  : 10 character string that represents a curriculum id
                           for this webpage
        @type  short_id  : str

        @param user_agent: Optional param in order to set requests.Session's
                           user_agent. If None was given, the default one will
                           be used, signaling python's requests library.
        @type  user_agent: str
        """

        short_id = cls.check_param(short_id, '^[A-Z0-9]{10}$')
        url = cls.url + short_id
        request = Request('GET', url)
        # Compiled once: the pattern does not change between tries.
        date_regex = re.compile(r'(\d{2}/){2}\d{4}')
        tries = 0
        cls.logger.info('Getting upDATE for {}'.format(short_id))
        # The try budget alone bounds the loop. The truthiness of a
        # requests.Response reflects its status (4xx/5xx are falsy), so it
        # must not be used to decide whether to keep looping.
        while tries < cls.max_tries:
            tries += 1
            cls.logger.info('Try: {}'.format(tries))
            with Session() as session:
                if user_agent:
                    session.headers['User-Agent'] = user_agent
                try:
                    prepped = session.prepare_request(request)
                    response = session.send(prepped, timeout=cls.timeout)
                except RequestException as e:
                    cls.logger.info('Error: {}'.format(e))
                    continue
            if response.status_code != 200:
                continue
            soup = bs4(response.text, 'html.parser')
            span = soup.span
            # A page without a <span>, or whose first <span> holds no date,
            # yields False instead of raising AttributeError.
            match = date_regex.search(span.text) if span is not None else None
            if match is None:
                cls.logger.info('Response 200: no date found.')
                return False
            date_text = match.group()
            cls.logger.info('Response 200: date: {}'.format(date_text))
            return date_text
        cls.logger.info('Could not fetch Preview page.')
        return False
示例#2
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from multiprocessing import Process
from lattes.pages import Curriculum, Xml
from lattes.config import BaseLogger

# Module-level logger shared by the example functions below, built with the
# project's BaseLogger.from_file helper.
# NOTE(review): presumably writes to 'simple_client.log' -- confirm in BaseLogger.
logger = BaseLogger.from_file('simple_client', file_name='simple_client.log')
# Hard-coded, machine-specific absolute directory path.
# NOTE(review): presumably where downloaded xml archives go; it is not used in
# the visible part of this file -- confirm and adjust before running.
ZIP_PATH = '/Users/josefson/Workspace/Python/cnpq/xmls'


def single_cored_example(short_ids):
    """Handle every id of short_ids sequentially, in the calling process,
    by handing the whole list to worker()."""
    worker(short_ids)


def multi_cored_example(short_ids, chunk=1):
    """Simple enough multicore example.

    Splits short_ids into consecutive slices of at most `chunk` ids and
    starts one multiprocessing.Process per slice, each running worker() on
    its slice. The processes are started but not joined here.

    @param short_ids: sequence of short ids to distribute
    @type  short_ids: list

    @param chunk    : how many ids each process receives. Defaults to 1,
                      i.e. one process per id.
    @type  chunk    : int

    Raises ValueError if chunk is smaller than 1.
    """
    if chunk < 1:
        raise ValueError('chunk must be a positive integer')
    # Keep the caller's argument untouched; build the slices under a new name.
    batches = [
        short_ids[start:start + chunk]
        for start in range(0, len(short_ids), chunk)
    ]
    logger.info('Spawning processes')
    for batch in batches:
        p = Process(target=worker, args=(batch, ))
        p.start()


def worker(short_ids):
    """Go over short_ids one at a time: log the id being handled and build
    a Curriculum object for it.

    NOTE(review): presumably constructing Curriculum triggers the actual
    fetching; the resulting object is not used further in this snippet.
    """
    for current_id in short_ids:
        message = 'Getting curriculum for {}'.format(current_id)
        logger.info(message)
        curriculum = Curriculum(current_id)
示例#3
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lattes.pages import Curriculum, Xml, Preview
from lattes.config import BaseLogger

# Module-level logger for this example client, built with the project's
# BaseLogger.from_file helper.
# NOTE(review): the second argument is presumably the log file name -- confirm.
logger = BaseLogger.from_file('better_cli', 'better_cli.log')
# Hard-coded, machine-specific absolute directory path.
# NOTE(review): presumably where downloaded xml archives go -- confirm and
# adjust before running.
ZIP_PATH = '/home/elodin/Workspace/Python/cnpq/xmls'


def setup():
    """This function is just an example of how you could relate the data you
    already have together (short_id, long_id, update_date).
    The real benefit is in the main() funciton. Read its doc."""

    # Ids we already have from previous scraping
    short_ids = ['K8185478E7', 'K4246690H2', 'K4138636E6']
    long_ids = ['6380212729787758', '7639569152487589', '1024601314143406']
    updated_on = ['01/01/1995', None, '24/11/2009']

    # Recently scraped short_ids
    scraped_short_ids = ['K8185478E7', 'K4246690H2', 'K4138636E6',
                         'K4138281J4', 'K4130978D4', 'K4133929U0']

    class MyData:
        """Simple DataClass just to group the data i already have together."""
        def __init__(self, short_id, long_id=None, date=None):
            self.short_id = short_id
            self.long_id = long_id
            self.date = date

    # Dictionary/hashtable for instances of MyData where short_id is the key