示例#1
0
def test_check():
    """Smoke test: Requester.check() should run without raising."""

    requester = Requester()
    requester.check()

    # Reaching this point means no exception was raised above
    assert True
示例#2
0
文件: base.py 项目: njr175/ERP_SCANR
    def __init__(self):
        """Create an empty ERP-SCANR Base() object.

        Every container starts out empty and every counter starts at zero.
        """

        # Dictionary for storing db info
        self.db_info = {}

        # Type of terms in use (empty string until set)
        self.terms_type = ''

        # Lists of erps & terms to use, along with their labels & exclusions
        self.labels = []
        self.erps = []
        self.exclusions = []
        self.terms = []
        self.term_labels = []

        # Counters for the number of erps & terms
        self.n_erps = 0
        self.n_terms = 0

        # Requester object for handling URL calls
        self.req = Requester()

        # Date that data is collected (empty string until set)
        self.date = ''
示例#3
0
def test_check():
    """Smoke test: Requester.check() should run without raising."""

    requester = Requester()
    requester.check()

    # Reaching this point means no exception was raised above
    assert True
示例#4
0
def test_wait():
    """Smoke test: Requester.wait() should run without raising."""

    requester = Requester()
    requester.wait(0.01)

    # Reaching this point means no exception was raised above
    assert True
示例#5
0
def test_open():
    """Check that a Requester reports itself as active after open()."""

    requester = Requester()
    requester.open()

    assert requester.is_active
示例#6
0
def test_open():
    """Check that a Requester reports itself as active after open()."""

    requester = Requester()
    requester.open()

    assert requester.is_active
示例#7
0
def test_get_url():
    """Check that get_url() returns a truthy result for a known URL."""

    requester = Requester()
    page = requester.get_url('http://www.google.com')

    assert page
示例#8
0
def test_wait():
    """Smoke test: Requester.wait() should run without raising."""

    requester = Requester()
    requester.wait(0.01)

    # Reaching this point means no exception was raised above
    assert True
示例#9
0
def test_get_url():
    """Check that get_url() returns a truthy result for a known URL."""

    requester = Requester()
    page = requester.get_url('http://www.google.com')

    assert page
示例#10
0
def test_throttle():
    """Smoke test: Requester.throttle() should run without raising."""

    requester = Requester()

    # Record the current time as the time of the last request
    requester.time_last_req = time.time()
    requester.throttle()

    # Reaching this point means no exception was raised above
    assert True
示例#11
0
def test_close():
    """Check that a Requester is no longer active after open() then close()."""

    requester = Requester()
    requester.open()
    requester.close()

    assert not requester.is_active
示例#12
0
def test_throttle():
    """Smoke test: Requester.throttle() should run without raising."""

    requester = Requester()

    # Record the current time as the time of the last request
    requester.time_last_req = time.time()
    requester.throttle()

    # Reaching this point means no exception was raised above
    assert True
示例#13
0
文件: base.py 项目: njr175/ERP_SCANR
    def __init__(self):
        """Create an empty ERP-SCANR Base() object.

        Every container starts out empty and every counter starts at zero.
        """

        # Dictionary for storing db info
        self.db_info = {}

        # Type of terms in use (empty string until set)
        self.terms_type = ''

        # Lists of erps & terms to use, along with their labels & exclusions
        self.labels = []
        self.erps = []
        self.exclusions = []
        self.terms = []
        self.term_labels = []

        # Counters for the number of erps & terms
        self.n_erps = 0
        self.n_terms = 0

        # Requester object for handling URL calls
        self.req = Requester()

        # Date that data is collected (empty string until set)
        self.date = ''
示例#14
0
def test_close():
    """Check that a Requester is no longer active after open() then close()."""

    requester = Requester()
    requester.open()
    requester.close()

    assert not requester.is_active
示例#15
0
def test_requester():
    """Check that constructing a Requester yields a truthy object."""

    requester = Requester()

    assert requester
示例#16
0
文件: base.py 项目: njr175/ERP_SCANR
class Base(object):
    """Base class for ERPSC analyses.

    Holds the ERP search terms, their exclusion words, the secondary terms,
    and information about the database used for scraping.

    Attributes
    ----------
    db_info : dict
        Stores info about the database used for scraping data.
    terms_type : {'cognitive', 'disease'}
        Type of terms used.
    labels : list of str
        Label to reference each ERP.
    erps : list of list of str
        Name(s) for each ERP (used as search terms).
    exclusions : list of list of str
        Exclusion words for each ERP, used to avoid unwanted articles.
    terms : list of list of str
        Terms words.
    term_labels : list of str
        Labels for each term.
    n_erps : int
        Number of erps.
    n_terms : int
        Number of terms.
    req : Requester() object
        Object to handle URL requests.
    date : str
        Date data was collected.
    """

    def __init__(self):
        """Initialize ERP-SCANR Base() object."""

        # Initialize dictionary to store db info
        self.db_info = dict()

        # Initialize variable to keep track of term type used
        self.terms_type = str()

        # Initialize list of erps & term terms to use, including labels
        self.labels = list()
        self.erps = list()
        self.exclusions = list()
        self.terms = list()
        self.term_labels = list()

        # Initialize counters for numbers of terms
        self.n_erps = int()
        self.n_terms = int()

        # Requester object for handling URL calls
        self.req = Requester()

        # Initialize for date that data is collected
        self.date = str()

    def set_erps(self, erps):
        """Sets the given list of strings as erp terms to use.

        The first entry of each (type-checked) ERP is used as its label.

        Parameters
        ----------
        erps : list of str OR list of list of str
            List of ERP terms to be used.
        """

        # Unload previous terms if some are already loaded
        self.unload_erps()

        # Set given list as erp words
        # NOTE(review): _check_type presumably wraps a bare str in a list - confirm
        for erp in erps:
            erp = _check_type(erp)
            self.labels.append(erp[0])
            self.erps.append(erp)

        # Set the number of erps
        self.n_erps = len(erps)

    def set_erps_file(self):
        """Load ERP terms from a txt file."""

        # Unload previous terms if some are already loaded
        self.unload_erps()

        # Get erps from module data file
        labels = _terms_load_file('erp_labels')
        erps = _terms_load_file('erps')

        # Set the number of erps
        self.n_erps = len(erps)

        # Drop number indices (first 3 characters), add labels & erps (as list)
        for i in range(self.n_erps):
            self.labels.append(labels[i][3:])
            self.erps.append(erps[i][3:].split(','))

    def check_erps(self):
        """Print out the current list of erps."""

        # Print out header and all current ERPs
        print('List of ERPs used: \n')
        for lab, erp_lst in zip(self.labels, self.erps):
            print(lab + "\t : " + ", ".join(erp_lst))

    def unload_erps(self):
        """Unload the current set of ERP words."""

        # Check if ERPs are loaded, to empty them if so.
        if self.erps:

            # Print status that ERPs are being unloaded
            print('Unloading previous ERP words.')

            # Reset ERP variables to empty
            self.labels = list()
            self.erps = list()
            self.n_erps = int()

    def set_exclusions(self, exclusions):
        """Sets the given list of strings as exclusion words.

        Parameters
        ----------
        exclusions : list of str OR list of list of str
            List of exclusion words to be used.

        Raises
        ------
        InconsistentDataError
            If the number of exclusions does not match the number of erps.
            No exclusions are loaded in that case.
        """

        # Unload previous terms if some are already loaded
        self.unload_exclusions()

        # Check that the number of exclusions matches n_erps. This is done
        #   before loading, so that a mismatch does not leave invalid
        #   exclusions on the object (consistent with set_exclusions_file)
        if len(exclusions) != self.n_erps:
            raise InconsistentDataError(
                'Mismatch in number of exclusions and erps!')

        # Set given list as erp exclusion words
        for exclude in exclusions:
            self.exclusions.append(_check_type(exclude))

    def set_exclusions_file(self):
        """Load exclusion words from a txt file.

        Raises
        ------
        InconsistentDataError
            If the number of exclusions does not match the number of erps.
        """

        # Unload previous terms if some are already loaded
        self.unload_exclusions()

        # Get exclusion words from module data file
        exclusions = _terms_load_file('erps_exclude')

        # Check that the number of exclusions matches n_erps
        if len(exclusions) != self.n_erps:
            raise InconsistentDataError(
                'Mismatch in number of exclusions and erps!')

        # Drop number indices (first 3 characters), and set as list
        for exclude in exclusions:
            self.exclusions.append(exclude[3:].split(','))

    def check_exclusions(self):
        """Print out the current list of exclusion words."""

        # Print out header and all exclusion words
        print('List of exclusion words used: \n')
        for lab, excs in zip(self.labels, self.exclusions):
            print(lab + "\t : " + ", ".join(excs))

    def unload_exclusions(self):
        """Unload the current set of exclusion words."""

        # Check if exclusions are loaded. If so, print status and empty.
        if self.exclusions:

            # Print status that exclusion words are being unloaded
            print('Unloading previous exclusion words.')

            # Reset exclusions variables to empty
            self.exclusions = list()

    def set_terms(self, terms):
        """Sets the given list of strings as terms to use.

        Parameters
        ----------
        terms : list of str OR list of list of str
            List of terms to be used.
        """

        # Unload previous terms if some are already loaded
        self.unload_terms()

        # Set given list as the terms
        for term in terms:
            self.terms.append(_check_type(term))

        # Set the number of terms
        self.n_terms = len(terms)

    def set_terms_file(self, terms_type):
        """Load terms from a txt file.

        Parameters
        ----------
        terms_type : str
            Type of terms to load; passed on to select the file to load.
        """

        # Unload previous terms if some are already loaded
        self.unload_terms()

        # Set the type of terms
        self.terms_type = terms_type

        # Get terms from module data file
        terms = _terms_load_file(terms_type)

        # Set the number of terms
        self.n_terms = len(terms)

        # Set as list, and attach to object
        for term in terms:
            self.terms.append(term.split(','))

    def check_terms(self):
        """Print out the current list of terms."""

        # Print out header and all term words
        print('List of terms used: \n')
        for terms_ls in self.terms:
            print(", ".join(terms_ls))

    def unload_terms(self):
        """Unload the current set of terms."""

        # Check if terms are loaded, to empty them if so.
        if self.terms:

            # Print status that term words are being unloaded
            print('Unloading previous terms words.')

            # Reset term variables to empty
            self.terms_type = str()
            self.terms = list()
            self.n_terms = int()

    def get_term_labels(self):
        """Get term labels.

        The label of each term is the last entry of its list of words.
        """

        self.term_labels = [term[-1] for term in self.terms]

    def get_db_info(self, info_url):
        """Calls EInfo to get info and status of db to be used for scraping.

        Parameters
        ----------
        info_url : str
            URL to request db information from.
        """

        # Get the info page and parse with BeautifulSoup
        info_page = self.req.get_url(info_url)
        info_page_soup = BeautifulSoup(info_page.content, 'lxml')

        # Set list of fields to extract from eInfo
        fields = [
            'dbname', 'menuname', 'description', 'dbbuild', 'count',
            'lastupdate'
        ]

        # Extract basic information into a dictionary
        for field in fields:
            self.db_info[field] = extract(info_page_soup, field, 'str')
示例#17
0
文件: base.py 项目: njr175/ERP_SCANR
class Base(object):
    """Base class for ERPSC analyses.

    Holds the ERP search terms, their exclusion words, the secondary terms,
    and information about the database used for scraping.

    Attributes
    ----------
    db_info : dict
        Stores info about the database used for scraping data.
    terms_type : {'cognitive', 'disease'}
        Type of terms used.
    labels : list of str
        Label to reference each ERP.
    erps : list of list of str
        Name(s) for each ERP (used as search terms).
    exclusions : list of list of str
        Exclusion words for each ERP, used to avoid unwanted articles.
    terms : list of list of str
        Terms words.
    term_labels : list of str
        Labels for each term.
    n_erps : int
        Number of erps.
    n_terms : int
        Number of terms.
    req : Requester() object
        Object to handle URL requests.
    date : str
        Date data was collected.
    """

    def __init__(self):
        """Initialize ERP-SCANR Base() object."""

        # Initialize dictionary to store db info
        self.db_info = dict()

        # Initialize variable to keep track of term type used
        self.terms_type = str()

        # Initialize list of erps & term terms to use, including labels
        self.labels = list()
        self.erps = list()
        self.exclusions = list()
        self.terms = list()
        self.term_labels = list()

        # Initialize counters for numbers of terms
        self.n_erps = int()
        self.n_terms = int()

        # Requester object for handling URL calls
        self.req = Requester()

        # Initialize for date that data is collected
        self.date = str()

    def set_erps(self, erps):
        """Sets the given list of strings as erp terms to use.

        The first entry of each (type-checked) ERP is used as its label.

        Parameters
        ----------
        erps : list of str OR list of list of str
            List of ERP terms to be used.
        """

        # Unload previous terms if some are already loaded
        self.unload_erps()

        # Set given list as erp words
        # NOTE(review): _check_type presumably wraps a bare str in a list - confirm
        for erp in erps:
            erp = _check_type(erp)
            self.labels.append(erp[0])
            self.erps.append(erp)

        # Set the number of erps
        self.n_erps = len(erps)

    def set_erps_file(self):
        """Load ERP terms from a txt file."""

        # Unload previous terms if some are already loaded
        self.unload_erps()

        # Get erps from module data file
        labels = _terms_load_file('erp_labels')
        erps = _terms_load_file('erps')

        # Set the number of erps
        self.n_erps = len(erps)

        # Drop number indices (first 3 characters), add labels & erps (as list)
        for i in range(self.n_erps):
            self.labels.append(labels[i][3:])
            self.erps.append(erps[i][3:].split(','))

    def check_erps(self):
        """Print out the current list of erps."""

        # Print out header and all current ERPs
        print('List of ERPs used: \n')
        for lab, erp_lst in zip(self.labels, self.erps):
            print(lab + "\t : " + ", ".join(erp_lst))

    def unload_erps(self):
        """Unload the current set of ERP words."""

        # Check if ERPs are loaded, to empty them if so.
        if self.erps:

            # Print status that ERPs are being unloaded
            print('Unloading previous ERP words.')

            # Reset ERP variables to empty
            self.labels = list()
            self.erps = list()
            self.n_erps = int()

    def set_exclusions(self, exclusions):
        """Sets the given list of strings as exclusion words.

        Parameters
        ----------
        exclusions : list of str OR list of list of str
            List of exclusion words to be used.

        Raises
        ------
        InconsistentDataError
            If the number of exclusions does not match the number of erps.
            No exclusions are loaded in that case.
        """

        # Unload previous terms if some are already loaded
        self.unload_exclusions()

        # Check that the number of exclusions matches n_erps. This is done
        #   before loading, so that a mismatch does not leave invalid
        #   exclusions on the object (consistent with set_exclusions_file)
        if len(exclusions) != self.n_erps:
            raise InconsistentDataError('Mismatch in number of exclusions and erps!')

        # Set given list as erp exclusion words
        for exclude in exclusions:
            self.exclusions.append(_check_type(exclude))

    def set_exclusions_file(self):
        """Load exclusion words from a txt file.

        Raises
        ------
        InconsistentDataError
            If the number of exclusions does not match the number of erps.
        """

        # Unload previous terms if some are already loaded
        self.unload_exclusions()

        # Get exclusion words from module data file
        exclusions = _terms_load_file('erps_exclude')

        # Check that the number of exclusions matches n_erps
        if len(exclusions) != self.n_erps:
            raise InconsistentDataError('Mismatch in number of exclusions and erps!')

        # Drop number indices (first 3 characters), and set as list
        for exclude in exclusions:
            self.exclusions.append(exclude[3:].split(','))

    def check_exclusions(self):
        """Print out the current list of exclusion words."""

        # Print out header and all exclusion words
        print('List of exclusion words used: \n')
        for lab, excs in zip(self.labels, self.exclusions):
            print(lab + "\t : " + ", ".join(excs))

    def unload_exclusions(self):
        """Unload the current set of exclusion words."""

        # Check if exclusions are loaded. If so, print status and empty.
        if self.exclusions:

            # Print status that exclusion words are being unloaded
            print('Unloading previous exclusion words.')

            # Reset exclusions variables to empty
            self.exclusions = list()

    def set_terms(self, terms):
        """Sets the given list of strings as terms to use.

        Parameters
        ----------
        terms : list of str OR list of list of str
            List of terms to be used.
        """

        # Unload previous terms if some are already loaded
        self.unload_terms()

        # Set given list as the terms
        for term in terms:
            self.terms.append(_check_type(term))

        # Set the number of terms
        self.n_terms = len(terms)

    def set_terms_file(self, terms_type):
        """Load terms from a txt file.

        Parameters
        ----------
        terms_type : str
            Type of terms to load; passed on to select the file to load.
        """

        # Unload previous terms if some are already loaded
        self.unload_terms()

        # Set the type of terms
        self.terms_type = terms_type

        # Get terms from module data file
        terms = _terms_load_file(terms_type)

        # Set the number of terms
        self.n_terms = len(terms)

        # Set as list, and attach to object
        for term in terms:
            self.terms.append(term.split(','))

    def check_terms(self):
        """Print out the current list of terms."""

        # Print out header and all term words
        print('List of terms used: \n')
        for terms_ls in self.terms:
            print(", ".join(terms_ls))

    def unload_terms(self):
        """Unload the current set of terms."""

        # Check if terms are loaded, to empty them if so.
        if self.terms:

            # Print status that term words are being unloaded
            print('Unloading previous terms words.')

            # Reset term variables to empty
            self.terms_type = str()
            self.terms = list()
            self.n_terms = int()

    def get_term_labels(self):
        """Get term labels.

        The label of each term is the last entry of its list of words.
        """

        self.term_labels = [term[-1] for term in self.terms]

    def get_db_info(self, info_url):
        """Calls EInfo to get info and status of db to be used for scraping.

        Parameters
        ----------
        info_url : str
            URL to request db information from.
        """

        # Get the info page and parse with BeautifulSoup
        info_page = self.req.get_url(info_url)
        info_page_soup = BeautifulSoup(info_page.content, 'lxml')

        # Set list of fields to extract from eInfo
        fields = ['dbname', 'menuname', 'description', 'dbbuild', 'count', 'lastupdate']

        # Extract basic information into a dictionary
        for field in fields:
            self.db_info[field] = extract(info_page_soup, field, 'str')