Example #1
def test_get_url_real_drug_name():
    """
    Tests that the get_url function returns the correct url for a standard drug name ('dupixent')
    """
    scraper = EverydayHealthScraper()
    url = scraper.get_url('dupixent')
    assert url == 'https://www.everydayhealth.com/drugs/dupixent/reviews'
Example #2
def test_scrape_no_reviews():
    """
    Tests that the scrape function works for a page with no reviews
    """
    scraper = EverydayHealthScraper()
    scraper.scrape('https://www.everydayhealth.com/drugs/triprolidine/reviews')
    assert len(scraper.reviews) == 0
Example #3
def test_short_drug_name():
    """
    Tests that the get_url function does not search for drug names shorter than 4 characters
    """
    scraper = EverydayHealthScraper()
    url = scraper.get_url('ACE')
    assert not url
Example #4
def test_drug_name_with_space():
    """
    Tests that the get_url function returns the correct url for a drug name
    that includes a space
    """
    scraper = EverydayHealthScraper()
    url = scraper.get_url('Mucinex Allergy')
    assert url == 'https://www.everydayhealth.com/drugs/mucinex-allergy/reviews'
Example #5
def test_scrape_invalid_url_no_title():
    """
    Tests that when the scrape function is called on a url that lacks a title
    (an invalid url), the resulting AttributeError is handled and the function returns 0
    """
    scraper = EverydayHealthScraper()
    returned = scraper.scrape('https://www.everydayhealth.com/drugs/')
    assert returned == 0
Example #6
def test_url_fake_drug_name():
    """
    Tests that the get_url function returns 'None' for a drug name that does not have a
    review page
    """
    scraper = EverydayHealthScraper()
    url = scraper.get_url('garbage')
    assert not url
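Taken together, Examples #1, #3, #4, and #6 pin down the implied contract of get_url: slugify the drug name (lowercase, spaces to hyphens), refuse names shorter than 4 characters, and return None when no review page exists. A minimal sketch of that contract follows; the HEAD-request existence check is an assumption, not the project's actual lookup:

import requests

def get_url_sketch(drug_name):
    """Illustrative only: builds the review-page url the tests above expect."""
    if len(drug_name) < 4:
        return None  # Example #3: names shorter than 4 characters are not searched
    slug = drug_name.lower().replace(' ', '-')  # Example #4: 'Mucinex Allergy' -> 'mucinex-allergy'
    url = 'https://www.everydayhealth.com/drugs/{}/reviews'.format(slug)
    # Assumption: a HEAD request stands in for whatever existence check lets
    # the real scraper return a falsy value for fake names (Example #6)
    response = requests.head(url, allow_redirects=True)
    return url if response.status_code == 200 else None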
Example #7
def test_no_reviews():
    """
    Tests that the scrape_page function returns 'None' when no reviews are
    found on the page
    """
    scraper = EverydayHealthScraper()
    returned = scraper.scrape_page(
        'https://www.everydayhealth.com/drugs/triprolidine/reviews')
    assert not returned
Example #8
def test_incorrect_url():
    """
    Tests that the scrape_page function raises an AssertionError when a url is
    entered with an incorrect domain name (not 'https://everydayhealth.com...')
    """
    scraper = EverydayHealthScraper()
    with pytest.raises(AssertionError):
        scraper.scrape_page(
            'https://www.drugs.com/comments/aripiprazole/abilify.html?page=1')
Example #9
def test_everydayhealth_scrape():
    """Test everydayhealth scrape"""
    input_url = 'https://www.everydayhealth.com/drugs/citalopram/reviews'
    everydayhealth_scraper = EverydayHealthScraper()
    review_list = everydayhealth_scraper.scrape(input_url)
    assert len(review_list) > 5
    keys = list(review_list[-1].keys())
    assert 'comment' in keys
    assert 'rating' in keys
Example #10
def test_scrape_assert_title_error():
    """
    Tests that when the scrape function is called with an invalid url that does have a
    title, but the wrong one (it lacks the phrase 'Drug Reviews'), the resulting
    AssertionError is handled and the function returns 0
    """
    scraper = EverydayHealthScraper()
    returned = scraper.scrape('https://www.everydayhealth.com/drugs/')
    assert returned == 0
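Examples #5 and #10 together imply that scrape traps both failure modes internally (a missing title raises AttributeError, a wrong title fails an assertion) and falls back to returning 0. A self-contained mirror of that guard, where check_page_title is a hypothetical name and the real scrape presumably extracts the title from the fetched page:

def check_page_title(title):
    """Returns 0 on a missing or wrong page title, None if the guard passes."""
    try:
        assert 'Drug Reviews' in title.strip()  # None.strip() raises AttributeError (Example #5)
    except (AttributeError, AssertionError):
        return 0  # Example #10: wrong title fails the assertion; both cases return 0
    return None  # guard passed; the real scrape would continue collecting reviews

assert check_page_title(None) == 0              # Example #5: page has no title
assert check_page_title('Something Else') == 0  # Example #10: title lacks 'Drug Reviews'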
Example #11
def test_scrape_with_parameters():
    """
    Tests that, when calling the scrape function with a scraper of non-default parameters, the
    correct types of data are stored in the 'reviews' attribute
    """
    scraper = EverydayHealthScraper(collect_urls=True)
    scraper.scrape('https://www.everydayhealth.com/drugs/gabapentin/reviews')
    assert len(scraper.reviews) > 20
    data_collected = list(scraper.reviews[0].keys())
    assert len(data_collected) == 5
    assert 'url' in data_collected
Example #12
def test_scrape_page_with_parameters():
    """
    Tests to make sure that calling the scrape_page function on a scraper object with a
    non-default parameter (collect_urls=True) collects the correct types of data
    ('comment', 'rating', 'date', 'drug', and 'url')
    """
    scraper = EverydayHealthScraper(collect_urls=True)
    scraper.scrape_page(
        'https://www.everydayhealth.com/drugs/hydroxyzine/reviews')
    data_collected = list(scraper.reviews[0].keys())
    assert len(data_collected) == 5
    assert 'url' in data_collected
Example #13
def test_scrape_correct_review_data():
    """
    Tests to make sure that the last review in the scraped reviews list has
    the correct data when the scrape function is called
    (this data is from the oldest review of the drug)
    """
    scraper = EverydayHealthScraper(collect_urls=True)
    scraper.scrape(
        'https://www.everydayhealth.com/drugs/ciclopirox-topical/reviews')
    assert scraper.reviews[-1]['comment'][:10] == 'After OVER'
    assert scraper.reviews[-1]['comment'][-10:] == 'inally hav'
    assert scraper.reviews[-1]['rating'] == 5
    assert scraper.reviews[-1]['date'] == '5/22/2015 4:18:19 AM'
Example #14
def test_scrape_page_default_parameters():
    """
    Tests to make sure that calling the scrape_page function on a scraper object
    with a default parameter collects the correct types of data ('comment', 'rating',
    'date', and 'drug') and that the correct number of reviews were collected (20)
    """
    scraper = EverydayHealthScraper()
    scraper.scrape_page('https://www.everydayhealth.com/drugs/allegra/reviews')
    data_collected = list(scraper.reviews[0].keys())
    assert len(data_collected) == 4
    assert 'comment' in data_collected
    assert 'rating' in data_collected
    assert 'date' in data_collected
    assert 'drug' in data_collected
    assert len(scraper.reviews) == 20
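Examples #12 and #14 jointly fix the shape of a single review dict: four keys by default ('comment', 'rating', 'date', 'drug'), plus 'url' when collect_urls=True. A representative record, with illustrative field values and the date format following Example #13:

review = {
    'comment': 'Worked well for my allergies...',  # free-text review body
    'rating': 5,                                   # star rating
    'date': '5/22/2015 4:18:19 AM',                # timestamp string as scraped
    'drug': 'allegra',                             # drug the review belongs to
    'url': 'https://www.everydayhealth.com/drugs/allegra/reviews',  # only with collect_urls=True
}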
Example #15
    def collect(self, url, testing=False):
        """Scrapes drug reviews and saves them as dictionary property

        Args:
            url: WebMD URL where all the reviews are
        """
        if self.meta['locked']:
            print('Dataset locked. Please load a different dataset.')
            return

        self.meta['startTimestamp'] = time()
        self.meta['drugs'] = [self.drug_name]

        scraper = None
        if self.scraper == 'WebMD':
            scraper = WebMDScraper()
        elif self.scraper == 'EverydayHealth':
            scraper = EverydayHealthScraper()
        elif self.scraper == 'Drugs':
            scraper = DrugsScraper()
        elif self.scraper == 'DrugRatingz':
            scraper = DrugRatingzScraper()

        if testing:
            # Testing override: the positional arguments presumably disable
            # url collection and limit the scrape to a single page
            scraper = WebMDScraper(False, 1)

        self.reviews = scraper.scrape(url)
        self.meta['endTimestamp'] = time()
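A usage sketch for collect, assuming these methods live on a dataset-style class; the ReviewDataset name and its constructor arguments are hypothetical:

dataset = ReviewDataset(drug_name='citalopram', scraper='EverydayHealth')  # hypothetical class
dataset.collect('https://www.everydayhealth.com/drugs/citalopram/reviews')
print(len(dataset.reviews), 'reviews collected')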
Example #16
def test_scrape_default_parameter():
    """
    Tests that, when calling the scrape function with a scraper with default parameters,
    the correct types of data are stored in the 'reviews' attribute and that the
    correct number of reviews are collected (more than 20, this proves that it's
    scraping multiple pages)
    """
    scraper = EverydayHealthScraper()
    scraper.scrape('https://www.everydayhealth.com/drugs/gabapentin/reviews')
    assert len(scraper.reviews) > 20
    data_collected = list(scraper.reviews[0].keys())
    assert len(data_collected) == 4
    assert 'comment' in data_collected
    assert 'rating' in data_collected
    assert 'date' in data_collected
    assert 'drug' in data_collected
Example #17
def test_initialization_parameters():
    """
    Tests that when a new EverydayHealthScraper object is initialized with a
    non-default argument, the correct attributes are set (an empty list[dict]
    'reviews') and the collect_urls attribute (boolean) is set to True
    """
    scraper = EverydayHealthScraper(collect_urls=True)
    assert scraper.collect_urls
Example #18
def test_default_initialization():
    """
    Tests that when the new scraper object is initialized, the correct
    attributes are set (empty list[dict] 'reviews') and the attribute
    collect_urls (boolean) is false by default
    """
    scraper = EverydayHealthScraper()
    assert len(scraper.reviews) == 0
    assert not scraper.collect_urls
Example #19
def test_scrape_empty_reviews():
    """
    Tests that the scrape function discards reviews already stored in a scraper
    object's 'reviews' attribute instead of appending to them
    """
    scraper = EverydayHealthScraper()
    scraper.scrape('https://www.everydayhealth.com/drugs/phenadoz/reviews')
    num_reviews = len(scraper.reviews)
    scraper.scrape('https://www.everydayhealth.com/drugs/phenadoz/reviews')
    assert num_reviews == len(scraper.reviews)
Example #20
    def collect_drug_names(self, file_path, output_path):
        """Given list of drug names, collect urls for those review page on
            the scraper's website

        Args:
            file_path: input csv with list of drug names
            output_path: output csv with urls
        """
        if self.scraper == 'WebMD':
            scraper = WebMDScraper()
            scraper.get_drug_urls(file_path, output_path)
        elif self.scraper == 'EverydayHealth':
            scraper = EverydayHealthScraper()
            scraper.get_drug_urls(file_path, output_path)
        elif self.scraper == 'Drugs':
            scraper = DrugsScraper()
            scraper.get_drug_urls(file_path, output_path)
        elif self.scraper == 'DrugRatingz':
            scraper = DrugRatingzScraper()
            scraper.get_drug_urls(file_path, output_path)
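collect_drug_names produces the urls csv that collect_urls (Example #21) later consumes; that example's DictReader confirms the output columns are 'Drug' and 'URL', with 'Not found' marking drugs without a review page. The input layout below, a single column of drug names, is an assumption:

drug-names.csv (input, assumed layout):
Drug
citalopram
garbage

urls.csv (output, columns confirmed by Example #21):
Drug,URL
citalopram,https://www.everydayhealth.com/drugs/citalopram/reviews
garbage,Not found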
Example #21
    def collect_urls(self, file_path, start=0):
        """Scrape all reviews for all drugs urls in file

        Args:
            start: index to start at if continuing from previous run
        """
        if self.meta['locked']:
            print('Dataset locked. Please load a different dataset.')
            return

        scraper = None
        if self.scraper == 'WebMD':
            scraper = WebMDScraper()
        elif self.scraper == 'EverydayHealth':
            scraper = EverydayHealthScraper()
        elif self.scraper == 'Drugs':
            scraper = DrugsScraper()
        elif self.scraper == 'DrugRatingz':
            scraper = DrugRatingzScraper()

        urls = []

        with open(file_path) as csv_file:
            reader = csv.DictReader(csv_file)
            for row in reader:
                if row['URL'] != 'Not found':
                    urls.append({'name': row['Drug'], 'url': row['URL']})
        print('Found {} urls.'.format(len(urls)))

        if os.path.isfile(self.drug_name.lower() + '-dataset.pickle'):
            self.load()
        else:
            print('Saving meta...')
            drug_names = [x['name'] for x in urls]
            self.meta['drugs'] = drug_names
            self.meta['startTimestamp'] = time()
            self.save()

        # Loop through urls starting at start index
        for i in range(start, len(urls)):
            drug = urls[i]
            print('\n{} drugs left to scrape.'.format(len(urls) - i))
            print('Scraping {}...'.format(drug['name']))
            reviews = scraper.scrape(drug['url'])

            # If it's the first drug then replace self.reviews instead of appending
            if drug['name'] == urls[0]['name']:
                self.reviews = reviews
            else:
                self.reviews += reviews

            # Save our progress and let the user know the data is safe
            self.meta['endTimestamp'] = time()
            self.save()
            print('{} reviews saved. Safe to quit.'.format(drug['name']))

            # Let the user know what start index to use to continue later
            if i < len(urls) - 1:
                print('To continue run with parameter start={}'.format(i + 1))

        print('\nAll urls scraped!')
Example #22
def main():
    scraper = WebMDScraper()  # DrugsScraper(), DrugRatingzScraper(), and EverydayHealthScraper() do not work
    url = ""
    json_aggregrationReviews = {"website": "webmd.com"}
    json_aggregrationReviews["ratingSystem"] = "stars"
    json_aggregrationReviews["itemsNamesAggregration"] = input_list
    reviewsAggregrate = []
    for i in range(len(input_list)):
        json_reviews = {"name": input_list[i]}
        try:
            url = scraper.get_url(input_list[i])  # or any other drug name
            scraper.scrape(url)
            dataframe_reviews = pd.DataFrame.from_dict(scraper.reviews)
            json_reviews["averageEffectiveness"] = round(
                pd.DataFrame.from_records(
                    dataframe_reviews["rating"])["effectiveness"].mean(), 1)
            json_reviews["averageEaseOfUse"] = round(
                pd.DataFrame.from_records(
                    dataframe_reviews["rating"])["ease of use"].mean(), 1)
            json_reviews["averageSatisfaction"] = round(
                pd.DataFrame.from_records(
                    dataframe_reviews["rating"])["satisfaction"].mean(), 1)
            json_reviews["minRating"] = round(
                pd.DataFrame.from_records(
                    dataframe_reviews["rating"])["satisfaction"].min(), 1)
            json_reviews["maxRating"] = round(
                pd.DataFrame.from_records(
                    dataframe_reviews["rating"])["satisfaction"].max(), 1)
            json_reviews["reviews"] = scraper.reviews
        except Exception:
            print("Could not get " + input_list[i] + " from webmd website")
            webmd_names_errors.append(input_list[i])
        reviewsAggregrate.append(json_reviews)
    json_aggregrationReviews["aggregrateReviews"] = reviewsAggregrate

    with open("webmdresult.json", "w") as f:
        obj = json.dumps(json_aggregrationReviews, indent=4)
        f.write(obj)

    scraper2 = EverydayHealthScraper()
    json_aggregrationReviews = {"website": "everydayhealth.com"}
    json_aggregrationReviews["ratingSystem"] = "stars"
    json_aggregrationReviews["itemsNamesAggregration"] = input_list
    reviewsAggregrate = []
    for i in range(len(input_list)):
        json_reviews = {"name": input_list[i]}
        try:
            url = scraper2.get_url("Adderall")
            print(url)
            scraper2.scrape(url)
            dataframe_reviews = pd.DataFrame.from_dict(scraper2.reviews)
            json_reviews["averageRating"] = round(
                dataframe_reviews["rating"].mean(), 1)
            json_reviews["minRating"] = round(
                dataframe_reviews["rating"].min(), 1)
            json_reviews["maxRating"] = round(
                dataframe_reviews["rating"].max(), 1)
            json_reviews["reviews"] = scraper2.reviews
        except Exception:
            print("Could not get " + input_list[i] +
                  " from everydayhealthscraper website ")
            everydayhealth_names_errors.append(input_list[i])
        reviewsAggregrate.append(json_reviews)

    json_aggregrationReviews["aggregrateReviews"] = reviewsAggregrate

    with open("everydayhealth.json", "w") as f:
        obj = json.dumps(json_aggregrationReviews, indent=4)
        f.write(obj)

    if webmd_names_errors:
        print("I could not get from webmd " + str(webmd_names_errors))

    if everydayhealth_names_errors:
        print("I could not get from everydayhealth " +
              str(everydayhealth_names_errors))
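For reference, the webmdresult.json this main() writes has the following shape; the field names come straight from the code, while the numeric values are illustrative:

{
    "website": "webmd.com",
    "ratingSystem": "stars",
    "itemsNamesAggregration": ["Adderall"],
    "aggregrateReviews": [
        {
            "name": "Adderall",
            "averageEffectiveness": 4.1,
            "averageEaseOfUse": 4.3,
            "averageSatisfaction": 3.8,
            "minRating": 1.0,
            "maxRating": 5.0,
            "reviews": ["..."]
        }
    ]
}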
Example #23
def test_get_drug_urls_everydayhealth():
    """Tests that the get_drug_urls function writes an output csv of urls"""
    scraper = EverydayHealthScraper()
    scraper.get_drug_urls('test-drug-names.csv', 'urls.csv')
    assert os.path.exists('urls.csv')
    os.remove('urls.csv')
Example #24
def test_everydayhealth_max_pages():
    """Test everydayhealth max pages"""
    url = 'https://www.everydayhealth.com/drugs/citalopram/reviews'
    everydayhealth_scraper = EverydayHealthScraper()
    assert everydayhealth_scraper.max_pages(url) == 15
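Example #14 shows that a full page holds 20 reviews, so max_pages returning 15 for citalopram implies roughly 281-300 reviews in total. A trivial helper capturing that arithmetic; the helper name and the assumption that max_pages counts pages this way are mine:

import math

def estimate_max_pages(total_reviews, per_page=20):
    """Pages needed to hold total_reviews at 20 reviews per page (Example #14)."""
    return math.ceil(total_reviews / per_page)

assert estimate_max_pages(300) == 15  # consistent with Example #24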