Example #1
import pandas as pd
from nordvpn_switcher import initialize_VPN, rotate_VPN, terminate_VPN

# get_1001tracklists_track_data and exception_1001T are project-local helpers,
# assumed to be defined elsewhere in the source repository


def get_1001tracklists_data(dataframe):
    """
    Retrieve play data from 1001Tracklists.com.

    :param dataframe: A reference dataframe (with a 1001Tracklists track ID column).
    :return: The same dataframe with the total number of plays and unique DJ supports.
    """

    dataframe["1001T_TotPlays"] = 0
    dataframe["1001T_Supports"] = 0

    initialize_VPN(save=1, area_input=['random countries europe 20'])

    for idx, row in dataframe.iterrows():
        print(f'{idx} | {row["1001Tracklists_ID"]} | {row["Track_Name"]}')

        if (pd.notna(row['1001Tracklists_ID'])
                and row['1001Tracklists_ID'] not in exception_1001T):
            call = get_1001tracklists_track_data(row['1001Tracklists_ID'])

            # a string return value signals a blocked request:
            # rotate to a new IP and retry once
            if isinstance(call, str):
                print(call)
                rotate_VPN()
                data_1001tt = get_1001tracklists_track_data(
                    row['1001Tracklists_ID'])

            else:
                data_1001tt = call

            (dataframe.loc[idx, "1001T_Supports"],
             dataframe.loc[idx, "1001T_TotPlays"]) = data_1001tt

    terminate_VPN()

    return dataframe
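
A minimal sketch of how this function might be called; the column names match the ones used above, but the sample row is hypothetical:

import pandas as pd

# hypothetical reference dataframe; the real one comes from the project's own pipeline
tracks = pd.DataFrame({
    "Track_Name": ["Some Track"],
    "1001Tracklists_ID": ["example-track-id"],
})

tracks = get_1001tracklists_data(tracks)
print(tracks[["1001T_TotPlays", "1001T_Supports"]])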
Example #2
import pandas as pd
from time import sleep
from nordvpn_switcher import initialize_VPN, rotate_VPN, terminate_VPN

# soundcloud_scrapping is the scraper defined in Example #3


def get_soundcloud_data(data_frame):
    """
    Get data from SoundCloud (here, the play count for each track).

    :param data_frame: A dataframe with the SoundCloud links associated with each track.
    :return: The same dataframe with the total number of plays for each track.
    """

    df = data_frame.fillna("NONE")
    soundcloud_dict = {}
    tracks_plays = {}

    # map each SoundCloud link to the index of the track it belongs to
    for an_idx, a_row in df.iterrows():
        soundcloud_dict[a_row["Soundcloud_Link1"]] = [an_idx]
        soundcloud_dict[a_row["Soundcloud_Link2"]] = [an_idx]

    df1 = pd.DataFrame(soundcloud_dict).transpose().rename({0: "idx"}, axis=1) \
        .drop(index="NONE")

    tracks = list(df1.index)

    initialize_VPN(save=1, area_input=['complete rotation'])

    for idx, track_url in enumerate(tracks):
        print(f'{idx} | {track_url}')

        try:
            tracks_plays[track_url] = {
                'plays': soundcloud_scrapping(track_url)
            }

        except (ConnectionError, IndexError):
            # a blocked IP surfaces as a connection error or an empty parse result
            print('IP BLOCKED - Need Rotation')
            rotate_VPN()
            tracks_plays[track_url] = {
                'plays': soundcloud_scrapping(track_url)
            }

        sleep(1)

    terminate_VPN()

    df2 = pd.DataFrame(tracks_plays).transpose()

    # sum the plays over both links belonging to the same track
    concat = pd.merge(df1, df2, left_index=True, right_index=True)
    plays_sum = concat.groupby("idx").sum()

    final = data_frame.join(plays_sum, how='outer') \
        .rename({'plays': "Soundcloud_Plays"}, axis=1)

    final["Soundcloud_Plays"] = final["Soundcloud_Plays"].fillna(0)

    return final
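
As with Example #1, a hypothetical call; the link values are SoundCloud paths of the form artist/track, matching what soundcloud_scrapping expects:

import pandas as pd

# hypothetical input; the real dataframe comes from the project's own pipeline
tracks = pd.DataFrame({
    "Soundcloud_Link1": ["some-artist/some-track"],
    "Soundcloud_Link2": [None],
})

tracks = get_soundcloud_data(tracks)
print(tracks["Soundcloud_Plays"])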
Example #3
import re
from time import sleep

import requests
from bs4 import BeautifulSoup
from fake_headers import Headers  # assumed source of the Headers class used below
from nordvpn_switcher import rotate_VPN


def soundcloud_scrapping(soundcloud_url):
    """
    Scrape the play count of a SoundCloud track.

    :param soundcloud_url: The SoundCloud path of the track (appended to soundcloud.com).
    :return: The play count as an integer.
    """
    plays = 0

    page_link = f'https://soundcloud.com/{soundcloud_url}/'

    success = False
    n_fail = 0
    while not success:

        if n_fail < 3:

            try:
                page_response = requests.get(page_link,
                                             headers=Headers().generate())
                soup = BeautifulSoup(page_response.content, "html.parser")
                # the play count is exposed in a <meta property="soundcloud:play_count"> tag
                plays = str(
                    soup.find_all("meta", property="soundcloud:play_count")[0])
                plays = int(re.search('meta content="(.+?)"', plays).group(1))
                success = True

            except requests.exceptions.ConnectionError:
                n_fail += 1
                print(
                    "ConnectionError (from \"requests\"): retrying in 5 sec..."
                )
                sleep(5)

        else:
            print('IP SOFT-LOCKED - Need Rotation')
            rotate_VPN()
            n_fail = 0  # reset the counter so the request is retried after rotating

    return plays
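
Examples #1 to #3 all follow the same retry-then-rotate pattern: call the scraper, and when it keeps failing, rotate the VPN and try again. Below is a minimal generic sketch of that pattern; fetch_with_rotation is a hypothetical helper, not part of nordvpn_switcher:

from nordvpn_switcher import rotate_VPN


def fetch_with_rotation(fetch, max_failures=3):
    """Call fetch() until it succeeds, rotating the VPN after repeated failures."""
    failures = 0
    while True:
        try:
            return fetch()
        except Exception:  # in practice, catch the scraper's specific exceptions
            failures += 1
            if failures >= max_failures:
                rotate_VPN()  # get a fresh IP, then start counting again
                failures = 0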
Example #4
    # method excerpt; the full spider class appears in Example #5
    def parseReview(self, response):
        asin = response.meta['item']
        self.driver.get(response.url)
        try:
            # wait up to 5 seconds for the review block to appear
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-star-rating"]'
                )))
            # give the reviews a moment to render, then parse the page source
            time.sleep(1)
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            # build an lxml tree so XPath can be used on the parsed HTML
            dom = etree.HTML(str(soup))

            reviewTitle = dom.xpath(
                '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-title"]//text()'
            )
            reviewTitle = [i.strip() for i in reviewTitle if i.strip() != '']
            reviewRatings = dom.xpath(
                '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-star-rating"]//text()'
            )

            reviewText = dom.xpath(
                '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-body"]//text()'
            )
            reviewText = [i.strip() for i in reviewText if i.strip() != '']
            reviewDate = dom.xpath('//*[@data-hook="review-date"]/text()')

            for rtitle, rtext, rrating, rdate in zip(reviewTitle, reviewText,
                                                     reviewRatings,
                                                     reviewDate):
                dict_ = {
                    "asin": asin,
                    "reviewTitle": rtitle,
                    "reviewText": rtext,
                    "reviewRatings": rrating,
                    "reviewDate": rdate
                }
                with open("Reviews.json", "a") as fl:
                    json.dump(dict_, fl)
                    fl.write('\n')

            nextpage = dom.xpath(
                '//*[@class="a-last"]//@href')  # nextpage link

            self.counter = self.counter + 1

            if self.counter % 200 == 0:  # rotate the VPN after every 200 pages
                print('--- rotating VPN connection ---')
                settings = initialize_VPN(save=1,
                                          area_input=['complete rotation'])
                rotate_VPN(settings)

            if nextpage:  # follow the link to the next page of reviews
                nextpage = "https://www.amazon.com" + nextpage[0]

                yield Request(nextpage,
                              callback=self.parseReview,
                              dont_filter=True,
                              meta={'item': asin})
        except Exception:
            # skip products whose review page never shows the expected element
            pass
Example #5
class AmazonrSpider(scrapy.Spider):
    name = 'amazonr'
    allowed_domains = ['*']
    start_urls = ['http://amazon.com/']
    df = pd.read_csv(
        r'C:\Amazon Reviews scraper(part1)\Scraper\reviewsScraper\cleanedProducts.csv'
    )
    # (review-page URL, ASIN) pairs for the slice of products to scrape
    link_asin = list(zip(df.SeeAllReviews, df.asin))[500:1500]
    DRIVER_PATH = r"E:\ChromeDriver\chromedriver.exe"
    driver = webdriver.Chrome(executable_path=DRIVER_PATH)
    counter = 0
    settings = initialize_VPN(save=1, area_input=['complete rotation'])
    rotate_VPN(settings)

    def start_requests(self):
        for pair in self.link_asin:
            self.driver.get(pair[0])
            request = Request(self.driver.current_url,
                              callback=self.parseReview,
                              dont_filter=True,
                              meta={'item': pair[1]})
            yield request

    def parseReview(self, response):
        asin = response.meta['item']
        self.driver.get(response.url)
        try:
            # wait up to 5 seconds for the review block to appear
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-star-rating"]'
                )))
            # give the reviews a moment to render, then parse the page source
            time.sleep(1)
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            # build an lxml tree so XPath can be used on the parsed HTML
            dom = etree.HTML(str(soup))

            reviewTitle = dom.xpath(
                '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-title"]//text()'
            )
            reviewTitle = [i.strip() for i in reviewTitle if i.strip() != '']
            reviewRatings = dom.xpath(
                '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-star-rating"]//text()'
            )

            reviewText = dom.xpath(
                '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-body"]//text()'
            )
            reviewText = [i.strip() for i in reviewText if i.strip() != '']
            reviewDate = dom.xpath('//*[@data-hook="review-date"]/text()')

            for rtitle, rtext, rrating, rdate in zip(reviewTitle, reviewText,
                                                     reviewRatings,
                                                     reviewDate):
                dict_ = {
                    "asin": asin,
                    "reviewTitle": rtitle,
                    "reviewText": rtext,
                    "reviewRatings": rrating,
                    "reviewDate": rdate
                }
                with open("Reviews.json", "a") as fl:
                    json.dump(dict_, fl)
                    fl.write('\n')

            nextpage = dom.xpath(
                '//*[@class="a-last"]//@href')  # nextpage link

            self.counter = self.counter + 1

            if self.counter % 200 == 0:  # rotate the VPN after every 200 pages
                print('--- rotating VPN connection ---')
                settings = initialize_VPN(save=1,
                                          area_input=['complete rotation'])
                rotate_VPN(settings)

            if nextpage:  # follow the link to the next page of reviews
                nextpage = "https://www.amazon.com" + nextpage[0]

                yield Request(nextpage,
                              callback=self.parseReview,
                              dont_filter=True,
                              meta={'item': asin})
        except Exception:
            # skip products whose review page never shows the expected element
            pass
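
Assuming the usual Scrapy project layout, a spider like this would be launched from the project directory with Scrapy's command-line runner:

scrapy crawl amazonr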
Example #6
# helper functions (get_email, get_input_args, get_driver, get_answers, etc.)
# are project-local and assumed to be defined elsewhere in the source repository
def run():
    email_addr, verification_code_email, password = get_email()
    if not email_addr:
        return
    print(email_addr)

    use_vpn, status, url, code_sender = get_input_args()

    if use_vpn:
        initialize_VPN(save=1,
                       area_input=["random countries europe 30"],
                       skip_settings=1)
        rotate_VPN()

    driver = get_driver()
    answers = get_answers()

    while status <= 0:
        driver.get(url)

        if status == -1:
            claim_prize(driver, email_addr)

        print("current status: %s" % status)
        time.sleep(2)

        if use_vpn:
            try:
                switch_to_frame(driver)
                skip_button = WebDriverWait(driver, 5).until(
                    EC.visibility_of_element_located(
                        (By.CSS_SELECTOR, "button.bg-transparent")))
                button_click(driver, skip_button)
            except Exception:
                # the skip button may not appear; continue without it
                pass
        elif status != -1:
            switch_to_frame(driver)
            input_email_and_accept_terms(driver, email_addr)

            # Wait for email to arrive
            time.sleep(6)

            verification_code = get_verification_code(verification_code_email,
                                                      password, code_sender)

            switch_to_frame(driver)
            enter_verification_code(driver, verification_code)

        time.sleep(2)
        switch_to_frame2(driver)
        start_quiz(driver, status)

        if status > -2:
            status = game_loop(driver, answers)

        current_url = driver.current_url
        if "http" in current_url:
            url = current_url

        if status != -1:
            status = claim_prize(driver, email_addr)

    driver.close()
    terminate_VPN()
Example #7
from nordvpn_switcher import initialize_VPN, rotate_VPN, terminate_VPN
import time

settings = initialize_VPN(save=1, area_input=['Ireland'])

while True:
    rotate_VPN(settings, google_check=1)
    time.sleep(1800)  # e.g. rotate servers every 30 minutes
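
The snippet above imports terminate_VPN but never reaches it, since the loop runs forever. A minimal sketch of one way to guarantee cleanup when the script is stopped with Ctrl-C:

try:
    while True:
        rotate_VPN(settings, google_check=1)
        time.sleep(1800)  # rotate servers every 30 minutes
except KeyboardInterrupt:
    pass
finally:
    terminate_VPN()  # disconnect and clean up once the loop stops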
Example #8
import os
import time
import pandas as pd
from nordvpn_switcher import initialize_VPN, rotate_VPN, terminate_VPN
settings = initialize_VPN()
rotate_VPN(settings)
from auxiliary.unitedstateszipcode_scraper import ZipCodeUSA
zipCodeScraper = ZipCodeUSA()

donors = pd.read_csv(
    r"D:\Programming\Python\DonorsChoose\data\DonorsChoose\Donors.csv"
)  # raw string so the backslashes in the Windows path are not treated as escapes
#donors = donors.loc[donors["Donor State"] != 'other']

zipcodes = pd.Series(donors["Donor Zip"].unique(), name="zipcodes")
zipcodes = pd.to_numeric(zipcodes,
                         errors="coerce").dropna().astype(int).astype(str)
# pad 1- and 2-digit zip codes with trailing zeros so every code has at least 3 digits
for i in range(1, 3):
    zipcodes.loc[zipcodes.str.len() == i] = (
        zipcodes.loc[zipcodes.str.len() == i] + "0" * (3 - i)).values
zipcodes = zipcodes.unique()

zipcode_df = pd.DataFrame(index=range(len(zipcodes)),
                          columns=[
                              "id", "Population", "Population Density",
                              "Housing Units", "Median Home Value",
                              "Land Area", "Water Area",
                              "Occupied Housing Units",
                              "Median Household Income"
                          ])

additions = [
Example #9
from nordvpn_switcher import initialize_VPN, rotate_VPN
import time

#######################
## WINDOWS OR LINUX ###
#######################

# [1] save settings file as a variable

# this will guide you through a step-by-step setup, including a help menu with connection options
instructions = initialize_VPN()

for i in range(3):
    rotate_VPN(instructions)  # refer to the instructions variable here
    print('\nDo whatever you want here (e.g. scraping). Pausing for 10 seconds...\n')
    time.sleep(10)

# [2] if you'd like to skip the step-by-step menu (e.g. to automate your script fully
# without any human intervention), use the area_input parameter

# be aware: the area_input parameter expects a list, not a string
instructions = initialize_VPN(area_input=['Belgium,France,Netherlands'])

for i in range(3):
    rotate_VPN(instructions)  # refer to the instructions variable here
    print('\nDo whatever you want here (e.g. scraping). Pausing for 10 seconds...\n')
    time.sleep(10)
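
Across all of the examples above, area_input always receives a list of strings. A short recap of the forms that appear in this collection (only one call is active; the others are left as comments, since each call would start a new connection):

from nordvpn_switcher import initialize_VPN

settings = initialize_VPN(area_input=['Ireland'])  # a single country
# settings = initialize_VPN(area_input=['Belgium,France,Netherlands'])  # several countries
# settings = initialize_VPN(area_input=['random countries europe 20'])  # random European servers
# settings = initialize_VPN(area_input=['complete rotation'])  # rotate over every server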