def fetch_via_proxy(url):
    import time
    from http_request_randomizer.requests.proxy.requestProxy import RequestProxy
    req_proxy = RequestProxy()
    request = req_proxy.generate_proxied_request(url)
    if request is not None:
        time.sleep(5)
        return request
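A short usage sketch for the helper above; the IP-echo endpoint is the same one used in the later examples on this page, and the loop count is arbitrary.

# Assumes fetch_via_proxy (above) is defined in this module.
for _ in range(3):
    response = fetch_via_proxy('http://ipv4.icanhazip.com')
    if response is not None:
        # The 5-second sleep inside the helper spaces out consecutive proxied calls.
        print(response.text.strip())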
Example #2
def req_split(url):
    start = time.time()
    req_proxy = RequestProxy()
    #print("Initialization took: {0} sec".format((time.time() - start)))
    #print("Size: {0}".format(len(req_proxy.get_proxy_list())))
    #print("ALL = {0} ".format(list(map(lambda x: x.get_address(), req_proxy.get_proxy_list()))))
    request = req_proxy.generate_proxied_request(url)
    if request is not None:
        print("\t Response: ip={0}".format(u''.join(request.text).encode('utf-8')))
        print("-> Going to sleep..")
Example #3
def getInfo(url, artist, album):
    # Get the album page
    # An alternate way of doing things: use the rym search engine,
    # url = https://rateyourmusic.com/search?searchterm=ARTIST+ALBUM&type=l
    # search the page for the artist and album, then go to that page
    # (see the sketch after this function).
    url = url + artist + "/" + album + "/"
    try:
        req_proxy = RequestProxy()
        page = req_proxy.generate_proxied_request(
            url
        )  #PROBLEM, gets detected after a few different requests, unless you manually make it wait for multiple minutes between requests
        soup = BeautifulSoup(page.content, 'html.parser')
    except UnicodeDecodeError:
        print('UnicodeDecodeError! Skipping...')
        return

    # Get genres from page
    genre_text = str(soup.findAll("span", {"class": "release_pri_genres"}))
    # Get secondary genres from page
    sec_genre_text = str(soup.findAll("span", {"class": "release_sec_genres"}))
    # Clean up and compile all genres
    unclean_genres = re.findall(r']">.*?</a>', genre_text)
    unclean_sec_genres = re.findall(r']">.*?</a>', sec_genre_text)
    genres = []
    for genre in unclean_genres:
        genre = genre[3:-4]
        genres.append(genre)
    for genre in unclean_sec_genres:
        genre = genre[3:-4]
        genres.append(genre)

    # Get descriptors from page
    descriptor_text = str(
        soup.findAll("span", {"class": "release_pri_descriptors"}))
    descriptor_text = descriptor_text[37:-7]
    # Clean up and organize each descriptor
    unclean_descriptors = re.findall(r'.*?,', descriptor_text)
    descriptors = []
    for descriptor in unclean_descriptors:
        descriptor = descriptor[2:-1]
        descriptors.append(descriptor)

    # Print genres
    genres = ';'.join(genres)
    print(artist + '->' + album + ' genres:')
    print(genres)

    # Print descriptors
    descriptors = '; '.join(descriptor.title() for descriptor in descriptors)
    print(artist + '->' + album + ' descriptors:')
    print(descriptors)

    return genres, descriptors
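The alternate search-engine route described in the comments above could look roughly like the sketch below. The search_rym helper name and the release-link selector are assumptions for illustration, not part of the original script.

def search_rym(artist, album):
    from urllib.parse import quote_plus
    from bs4 import BeautifulSoup
    from http_request_randomizer.requests.proxy.requestProxy import RequestProxy

    # Build the search URL from the comment above: the artist and album form
    # the searchterm, and type=l restricts results to releases.
    query = quote_plus(artist + " " + album)
    search_url = "https://rateyourmusic.com/search?searchterm={0}&type=l".format(query)

    req_proxy = RequestProxy()
    page = req_proxy.generate_proxied_request(search_url)
    if page is None:
        return None

    soup = BeautifulSoup(page.content, 'html.parser')
    # Assumed selector: release results link to /release/album/...;
    # take the first match as the album page URL.
    link = soup.find("a", href=lambda h: h and h.startswith("/release/album/"))
    if link is None:
        return None
    return "https://rateyourmusic.com" + link["href"]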
Example #4
class RequestMaker:
    def __init__(self):
        self.req_proxy = RequestProxy()

    def _generate_proxied_request(self, url, params=None):
        if params is None:
            params = {}
        for _ in range(0, len(self.req_proxy.get_proxy_list())):
            proxy_response = self.req_proxy.generate_proxied_request(
                url, params=params)
            if proxy_response is not None:
                return proxy_response
        return None

    def get(self, url, params=None):
        proxy_response = self._generate_proxied_request(url, params)
        if proxy_response is None:
            raise RuntimeError(
                'Failed to generate proxied request for {}'.format(url))

        return proxy_response
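A minimal usage sketch for the wrapper above; the target URL and variable names are illustrative.

# Assumes RequestMaker (above) and RequestProxy are importable in this module.
maker = RequestMaker()
try:
    response = maker.get('http://ipv4.icanhazip.com')
    print(response.status_code, response.text.strip())
except RuntimeError as exc:
    # get() raises once every proxy in the list has been tried without success.
    print(exc)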
Example #5
#!/usr/bin/python
# -*- coding: utf-8 -*-
# pip install http-request-randomizer
#####################################
##KILL THE NET##
##############[LIBS]###################
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy
import sys
while 1:
    try:
        ip = sys.argv[1]
        api = 'http://api.hackertarget.com/reverseiplookup/?q='+ip
        req_proxy = RequestProxy()
        try:
            request = req_proxy.generate_proxied_request(api)
            if request:
                if 'error' in request.text or 'No DNS' in request.text:
                    break
                if 'API count exceeded' in request.text or 'Bad Request' in request.text:
                    continue
                else:
                    open(ip+'.txt','a').write(request.text+'\n')
                    open('ALL-SITES.txt','a').write(request.text+'\n')
                    break
        except:
            pass
    except Exception as e:
        print(e)
        break
Example #6
class YFHistoricalDataExtract(object):
    """
    Class for grabbing historical stock data from Yahoo Finance.  Utilizes
    the HTTP_Request_Randomizer library to make proxied requests so as to
    avoid IP bans from the relevant sources.

    <More Info Here!!!>
    """
    def __init__(self,
                 stock_file,
                 data_storage_dir="./historical_stock_data",
                 threads=10,
                 clear_existing=True):
        """
        Initializes the proxy server as well as the directories that all of
        the downloaded historical data will be stored in.

        Note: The directory structure may already exist and the data may already be there,
        and it does not always make sense to delete the old data and start again.  If the
        clear_existing flag is set, the existing directories are cleared.  The default is to
        clear the existing directories of historical data and start over.
        """

        self.proxy_server = RequestProxy()
        self.output_dir = data_storage_dir
        self.ticker_file = stock_file
        self.thread_limit = threads

        # If the user asks for it, clear the existing directory structure
        if clear_existing is True:
            self.clear_directories()

        # Check to see if the file containing ticker symbols exists
        if not os.path.exists(stock_file):
            raise BadTickerFile()

        # Try to make the directory structure that the data will be stored in
        self.setup_directories()

        try:
            os.makedirs("%s/dividends" % self.output_dir)
        except OSError:
            print "[Error]: Could not create directory structure."
            raise CannotCreateDirectory()

    def clear_directories(self):
        """
        Wipe the existing directory structure if it exists.
        """

        os.system("rm -rf %s" % self.output_dir)

    def setup_directories(self):
        if not os.path.exists(self.output_dir):
            try:
                os.makedirs(self.output_dir)
            except OSError as e:
                print "[ERROR]: %s" % str(e)
                raise CannotCreateDirectory()

        if not os.path.exists(self.output_dir + "/dividend_history"):
            try:
                os.makedirs(self.output_dir + "/dividend_history")
            except OSError as e:
                print("[ERROR]: %s" % str(e))
                raise CannotCreateDirectory()

    def get_historical_data(self):
        stock_file = open(self.ticker_file, "r")

        candidates_to_test = []

        pool = ThreadPool(self.thread_limit)

        for ticker in stock_file.readlines():
            candidates_to_test.append(ticker.strip())

        pool.map(self.read_ticker_historical, candidates_to_test)

    def read_ticker_historical(self, ticker_symbol):
        URL = "https://finance.yahoo.com/quote/%s/history/" % ticker_symbol
        response = None

        # Loop until you get a valid response
        while True:
            try:
                response = self.proxy_server.generate_proxied_request(
                    URL, req_timeout=5)
            except Exception as e:
                print "Exception: %s %s" % (ticker_symbol, str(e))
                return

            if response is None:
                continue

            if response.status_code == 200:
                break

        response_soup = BeautifulSoup(response.text, 'html5lib')

        # Find all rows in the historical data.
        response_soup = response_soup.find_all("tr")
        response_soup = response_soup[2:]

        json_history_file = open(
            "%s/%s.json" % (self.output_dir, ticker_symbol), "w")
        json_dividend_file = open(
            "%s/%s_dividend.json" %
            (self.output_dir + "/dividend_history", ticker_symbol), "w")

        historical_data = {
            'Date': [],
            'Open': [],
            'High': [],
            'Low': [],
            'Close': [],
            'Adj Close': [],
            'Volume': []
        }

        dividend_data = {'Date': [], 'Amount': []}

        for response in response_soup:
            filtered_response = response.find_all("td")

            if len(filtered_response) == 7:

                # Date
                historical_data["Date"].append(filtered_response[0].text)

                # Open
                historical_data["Open"].append(filtered_response[1].text)

                # High
                historical_data["High"].append(filtered_response[2].text)

                # Low
                historical_data["Low"].append(filtered_response[3].text)

                # Close
                historical_data["Close"].append(filtered_response[4].text)

                # Adj Close
                historical_data["Adj Close"].append(filtered_response[5].text)

                # Volume
                historical_data["Volume"].append(filtered_response[6].text)
            elif len(filtered_response) == 2:

                # Date
                dividend_data["Date"].append(filtered_response[0].text)

                # Dividend Amount
                amount = filtered_response[1].text.replace(" Dividend", "")
                dividend_data["Amount"].append(amount)
            else:
                continue

        json_history_file.write(json.dumps(historical_data))
        json_dividend_file.write(json.dumps(dividend_data))

        json_history_file.close()
        json_dividend_file.close()
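A minimal driver sketch for the class above, under the assumptions the class already makes (os, json, BeautifulSoup and ThreadPool imported at module level, and a ticker file with one symbol per line). The file and directory names here are illustrative, not from the original.

if __name__ == '__main__':
    # 'tickers.txt' is a hypothetical file listing one ticker symbol per line.
    extractor = YFHistoricalDataExtract(
        stock_file='tickers.txt',
        data_storage_dir='./historical_stock_data',
        threads=4,
        clear_existing=False)  # keep any previously downloaded data
    extractor.get_historical_data()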
Example #7
 def get_search_links(self, prodname):
     url = 'https://www.google.fr/search?q=reference%20"' + "%20".join(
         prodname.split(" ")) + '"'
     urllist = []
     headers = {
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
     }
     try:
         page = requests.get(url, headers=headers)
         soup = BeautifulSoup(page.content)
         for link in soup.find_all("a"):
             if (link.has_attr('href')):
                 if ("https://" in link['href']
                         and "webcache" not in link['href']
                         and "google." not in link['href']
                         and "youtube." not in link['href']):
                     templink = link['href'].split("&")[0]
                     if ("https:" in templink):
                         urllist.append("http" + templink.split("http")[1])
         if (len(urllist) == 0):
             itr = 0
             while itr < 5:
                 try:
                     req_proxy = RequestProxy()
                     request = req_proxy.generate_proxied_request(url)
                     if (request is not None
                             and request.status_code == 200):
                         soup = BeautifulSoup(request.content)
                         for link in soup.find_all("a"):
                             if (link.has_attr('href')):
                                 if ("https://" in link['href']
                                         and "webcache" not in link['href']
                                         and "google." not in link['href']
                                         and "youtube."
                                         not in link['href']):
                                     templink = link['href'].split("&")[0]
                                     if ("https:" in templink):
                                         urllist.append(
                                             "http" +
                                             templink.split("http")[1])
                         if (len(urllist) > 0):
                             itr = 6
                             break
                         else:
                             itr = itr + 1
                 except:
                     itr = itr + 1
                     continue
             if (len(urllist) == 0):
                 urllist = list(
                     search(query="%20".join(prodname.split(" ")),
                            tld="fr",
                            lang="fr",
                            num=10,
                            start=1,
                            stop=20))
         self.logger.info("Number of sites found:" + str(len(urllist)))
     except Exception as e:
         self.logger.info("Error:" + str(e))
         self.logger.info("Failed prod:" + prodname)
     return (urllist)
Example #8
    text = (codeOpen + artistName + spaceInput + hyphenInput + spaceInput +
            songName + '(' + mixName + ' Remix' + ')' + codeClose)
    newName = ('/storage/emulated/0/temp/' + artistName + hyphenInput +
               songName + hyphenInput + mixName + '.mp3')

print('\nWorking on Request: ' + query)

# baseURL

url = 'https://mp3cc.biz/search/f/' + query + '/'

# proxy_headersRequest

req_proxy = RequestProxy()

while not req_proxy.generate_proxied_request(url):
    print('\nNext proxy for "Base URL"')
print('\nConnected to "Base URL!"')

# saveToFile

with open('parse.txt', 'wb') as f:
    response = requests.get(url)
    f.write(response.content)

# parseFromFile

with open('parse.txt', 'r', encoding='UTF-8') as p:
    s = BeautifulSoup(p, 'html.parser')
Example #9
def read_one_pg(pageno):
    url = URL.format(pageno=pageno)
    req_proxy = RequestProxy()
    return req_proxy.generate_proxied_request(url)
Example #10
import time
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy

if __name__ == '__main__':

    start = time.time()
    req_proxy = RequestProxy()
    print("Initialization took: {0} sec".format((time.time() - start)))
    print("Size: {0}".format(len(req_proxy.get_proxy_list())))
    print("ALL = {0} ".format(list(map(lambda x: x.get_address(), req_proxy.get_proxy_list()))))

    test_url = 'http://ipv4.icanhazip.com'

    while True:
        start = time.time()
        request = req_proxy.generate_proxied_request(test_url)
        print("Proxied Request Took: {0} sec => Status: {1}".format((time.time() - start), request.__str__()))
        if request is not None:
            print("\t Response: ip={0}".format(u''.join(request.text).encode('utf-8')))
        print("Proxy List Size: {0}".format(len(req_proxy.get_proxy_list())))

        print("-> Going to sleep..")
        time.sleep(10)
Example #11
        i = (i - 1) % (len(SYMBOL))

    file.close()
    file = open(FILE_NAME, 'a', newline='', encoding='utf-8')
    csvfile = csv.DictWriter(file, FIELDS)

req_proxy = RequestProxy()

stocktwit_url = "https://api.stocktwits.com/api/2/streams/symbol/" + SYMBOL[
    token] + ".json?" + access_token[token]
if last_message_id[token] is not None:
    stocktwit_url += "max=" + str(last_message_id[token])

api_hits = 0
while True:
    response = req_proxy.generate_proxied_request(stocktwit_url)

    if response is not None:

        if response.status_code == 429:
            print("###############")
            print("REQUEST IP RATE LIMITED FOR {} seconds !!!".format(
                int(response.headers['X-RateLimit-Reset']) - int(time.time())))

        if response.status_code != 200:
            token = (token + 1) % (len(access_token))
            stocktwit_url = "https://api.stocktwits.com/api/2/streams/symbol/" + SYMBOL[token] + ".json?" + \
                            access_token[token] + "max=" + str(
                last_message_id[token])

            continue