Example #1
    def __init__(self,
                 access_key=None,
                 secret=None,
                 failed_table_name='parsed_simple_failed',
                 missing_table_name='parsed_simple_missing',
                 success_table_name='parsed_simple_good'):
        """
        If access_key and/or secret are not passed in, assumes we are accessing erenev's aws account and that the
        access info is stored as environment variables on the current server.

        Connection and Table are available to clients via self properties, in case clients wish to use those objects
        directly.
        """

        access_key = access_key or get_environment_variable(
            'VEN_S3_ACCESS_KEY')
        secret = secret or get_environment_variable('VEN_S3_SECRET')
        self.connection = boto.dynamodb2.connect_to_region(
            region_name='eu-west-1',
            aws_access_key_id=access_key,
            aws_secret_access_key=secret)

        self.failed_table = Table(failed_table_name,
                                  connection=self.connection)
        self.missing_table = Table(missing_table_name,
                                   connection=self.connection)
        self.success_table = Table(success_table_name,
                                   connection=self.connection)
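
A minimal usage sketch (the enclosing class is not shown in the snippet, so DynamoAccessor below is a hypothetical name; put_item and list_tables are real boto.dynamodb2 calls):

# Hypothetical usage; credentials fall back to VEN_S3_ACCESS_KEY / VEN_S3_SECRET.
accessor = DynamoAccessor()
accessor.success_table.put_item(data={'id': 'abc123', 'status': 'parsed'})
print(accessor.connection.list_tables())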
Example #2
 def with_ven_anonymizer(cls):
     return Browser(proxies={
         'http':
         get_environment_variable('VEN_ANONYMIZER_PROX_HTTP')
     },
                    auth=HTTPProxyAuth(
                        get_environment_variable('VEN_ANONYMIZER_LOGIN'),
                        get_environment_variable('VEN_ANONYMIZER_PASS')))
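
A one-line usage sketch (Browser and get_html_through_requests are shown in full in Example #10 below; the VEN_ANONYMIZER_* environment variables must be set):

browser = Browser.with_ven_anonymizer()
html = browser.get_html_through_requests('http://example.com')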
Example #3
 def __init__(self,
              to_addresses=None,
              sender=None,
              aws_key=None,
              aws_secret=None):
     self.sender = sender or 'Thor Stats <*****@*****.**>'
     self.to_addresses = to_addresses or '*****@*****.**'
     self.aws_key = aws_key or get_environment_variable('VEN_S3_ACCESS_KEY')
     self.aws_secret = aws_secret or get_environment_variable(
         'VEN_S3_SECRET')
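
A hedged sketch of how these fields might be consumed; the snippet does not show a sending method, so the send method below is an assumption, though boto.ses.connect_to_region and send_email are real boto calls:

    def send(self, subject, body):
        # Assumed send path (not in the original snippet): mail via Amazon SES,
        # using the credentials gathered in __init__.
        import boto.ses
        conn = boto.ses.connect_to_region('eu-west-1',
                                          aws_access_key_id=self.aws_key,
                                          aws_secret_access_key=self.aws_secret)
        return conn.send_email(source=self.sender, subject=subject,
                               body=body, to_addresses=self.to_addresses)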
Example #4
    def __init__(self, bucket_name, region=S3_DEFAULT_REGION):
        """
        Creates an instance with a handle on the S3 bucket corresponding to bucket_name. If bucket_name does not
        exist, it is created.

        :type bucket_name: str
        """
        self.conn = boto.s3.connect_to_region(
            region,
            aws_access_key_id=get_environment_variable('VEN_S3_ACCESS_KEY'),
            aws_secret_access_key=get_environment_variable('VEN_S3_SECRET'))
        if bucket_name not in [b.name for b in self.conn.get_all_buckets()]:
            self.conn.create_bucket(bucket_name, location=Location.EU)
        self.bucket = self.conn.get_bucket(bucket_name)
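
A minimal usage sketch of the bucket handle created above (Key is boto.s3.key.Key; s3 stands in for an instance of the enclosing class, whose name is not shown):

from boto.s3.key import Key

k = Key(s3.bucket)  # s3 is a hypothetical instance of the class above
k.key = 'reports/2014-01-01.json'
k.set_contents_from_string('{"ok": true}')
print(k.get_contents_as_string())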
Example #5
def datapath(filename=''):
    # NOTE: Tried os.path.join first, but noticed that if filename started with a /, we didn't get the MS_DATA root,
    # so for simplicity and (??) performance, chose to use concatenation.

    # protecting filename from None
    filename = filename or ''
    return get_environment_variable('MS_DATA') + filename
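
For example, with MS_DATA set to '/data/ms/' (a hypothetical value):

datapath('foo.csv')   # -> '/data/ms/foo.csv'
datapath('/foo.csv')  # -> '/data/ms//foo.csv' (the MS_DATA root survives, unlike with os.path.join)
datapath(None)        # -> '/data/ms/' (None is coerced to '')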
Example #6
    def __init__(self, account_id='7998744469', store=None, min_money=None,
                 max_money=None, logging_level=logging.DEBUG, chk_size=5000):
        """
        Pass in min and max bid (in Euros, NOT micros) if you want to override the GMoney defaults.
        store: storage that acts like an HDF5 store
        """
        self.conn = Connection(password=get_environment_variable('VEN_ADWORDS_PASSWORD'),
                               developer_token=get_environment_variable('VEN_ADWORDS_TOKEN'),
                               account_id=account_id)
        self.awq = AWQ(self.conn)
        self.gmoney = GMoney(min_money=min_money, max_money=max_money)
        self.ops = Operations(self.gmoney)
        self.mutations = Mutations(self.conn)

        self.chk_size = chk_size

        self.store = store

        self.logger = KhanLogger(level=logging_level, origin=self.__class__.__name__)

        # convenience properties to make it easier to call methods in succession without tracking job ids
        self.job_ids = None
        self.job_ids_completed = None
        self.job_ids_failed = None
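
A hedged construction sketch (AdwordsManager is a hypothetical name for the enclosing class; a pandas HDFStore is one thing that "acts like an HDF5 store"):

import pandas as pd

# Hypothetical usage; requires VEN_ADWORDS_PASSWORD and VEN_ADWORDS_TOKEN in the environment.
store = pd.HDFStore('adwords_jobs.h5')
manager = AdwordsManager(store=store, min_money=0.05, max_money=2.50)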
Example #7
File: s3.py Project: SRHerzog/ut
    def __init__(self,
                 bucket_name=None,
                 base_folder=None,
                 extension=None,
                 force_extension=False,
                 encoding=None,
                 access_key=None,
                 secret=None):
        """
        Creates an instance with a handle on the S3 bucket corresponding to bucket_name.

        If access_key and/or secret are not passed in, assumes we are accessing erenev's aws account and that the
        access info is stored as environment variables on the current server.

        Connection and bucket are available to clients via self properties, in case clients wish to use those objects
        directly.
        """
        assert not extension, "extension has not been implemented yet for S3."

        if access_key and not secret:
            if access_key == 'ut':
                access_key = get_environment_variable('VEN_AWS_ACCESS_KEY_ID')
                secret = get_environment_variable('VEN_AWS_SECRET_ACCESS_KEY')
            elif access_key == 'mon':
                access_key = get_environment_variable('MON_AWS_ACCESS_KEY_ID')
                secret = get_environment_variable('MON_AWS_SECRET_ACCESS_KEY')
            else:
                raise ValueError('I cannot recognize that access_key')
        else:  # fall back to the VEN defaults for whatever wasn't passed in
            #access_key = access_key or os.environ['MON_AWS_ACCESS_KEY_ID']
            #secret = secret or os.environ['MON_AWS_SECRET_ACCESS_KEY']
            access_key = access_key or get_environment_variable(
                'VEN_AWS_ACCESS_KEY_ID')
            secret = secret or get_environment_variable(
                'VEN_AWS_SECRET_ACCESS_KEY')

        # note - this calls the setter
        self.base_folder = base_folder
        self.extension = extension
        self.force_extension = force_extension
        self.encoding = encoding

        self.connection = S3Connection(access_key,
                                       secret,
                                       host='s3-eu-west-1.amazonaws.com')
        if bucket_name:
            self.bucket = self.connection.get_bucket(bucket_name)
        else:
            self.bucket = None
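
A usage sketch (S3Accessor is a hypothetical name for the enclosing class):

# The 'ut' shortcut reads VEN_AWS_ACCESS_KEY_ID / VEN_AWS_SECRET_ACCESS_KEY.
s3 = S3Accessor(bucket_name='my-bucket', access_key='ut')
print(s3.bucket)      # boto Bucket handle
print(s3.connection)  # boto S3Connection, exposed for direct use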
Example #8
__author__ = 'thorwhalen'

from ut.datapath import datapath
import pickle
import os
from ut.util.importing import get_environment_variable
import pandas as pd
import ut.pfile.to as file_to
import ut.pfile.name as pfile_name
import ut.pstr.to as str_to
from ut.pstr.trans import str_to_unicode_or_bust
#from os import environ # does this load the whole array? Can we just take MS_DATA instead?

try:
    MS_DATA = get_environment_variable('MS_DATA')
except KeyError:
    MS_DATA = ''

ENCODING_NONE = 0
ENCODING_UNICODE = 1
ENCODING_UTF8 = 2


class Local(object):
    def __init__(self,
                 relative_root=None,
                 extension='',
                 force_extension=False,
                 encoding='UTF-8',
                 mother_root=MS_DATA):
        if relative_root:
Example #9
__author__ = 'thor'

import ut.pfile.accessor as pfile_accessor
import os
from ut.util.importing import get_environment_variable
import pdb

PROJECT_PATH = get_environment_variable('PY_PROJ_FOLDER')

#import os
#PROJECT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))

# THOR_KHAN_CODE_PATH = "/D/Dropbox/dev/py/proj/khan/"
# MATT_KHAN_CODE_PATH = "/Users/mattjmorris/Dev/python/khan/"

pdbrc_file = '.pdbrc'

# constant .pdbrc aliases

# define pdbrc_prefix
pdbrc_prefix = "##### PDBRC PREFIX ###########################" + "\n"
pdbrc_prefix = pdbrc_prefix + "\n" + "# Print a dictionary, sorted. %1 is the dict, %2 is the prefix for the names."
pdbrc_prefix = pdbrc_prefix + "\n" + 'alias p_ for k in sorted(%1.keys()): print "%s%-15s= %-80.80s" % ("%2",k,repr(%1[k]))'
pdbrc_prefix = pdbrc_prefix + "\n" + ""
pdbrc_prefix = pdbrc_prefix + "\n" + "# Print the instance variables of a thing."
pdbrc_prefix = pdbrc_prefix + "\n" + "alias pi p_ %1.__dict__ %1."
pdbrc_prefix = pdbrc_prefix + "\n" + ""
pdbrc_prefix = pdbrc_prefix + "\n" + "# Print the instance variables of self."
pdbrc_prefix = pdbrc_prefix + "\n" + "alias ps pi self"
pdbrc_prefix = pdbrc_prefix + "\n" + ""
pdbrc_prefix = pdbrc_prefix + "\n" + "# Print the locals."
Example #10
class Browser(object):
    """ Base class for slurpers """

    # Load up scraper config stuff from config file upon loading class definition
    CONFIG = json.load(open(os.path.dirname(__file__) + "/config.json"))
    USER_AGENTS = CONFIG["USER_AGENTS"]
    HEADERS = CONFIG["HEADERS"]
    try:
        ANONYMIZER_AUTH = HTTPProxyAuth(
            get_environment_variable('VEN_ANONYMIZER_LOGIN'),
            get_environment_variable('VEN_ANONYMIZER_PASS'))
        ANONYMIZER_PROXIES = {
            'http': get_environment_variable('VEN_ANONYMIZER_PROX_HTTP')
        }  # ,'https': get_environment_variable('VEN_ANONYMIZER_PROX_HTTPS')}
    except KeyError:
        print(
            "VEN_ANONYMIZER_LOGIN, VEN_ANONYMIZER_PASS, and/or VEN_ANONYMIZER_PROX_HTTP missing from environment"
        )
    try:
        PROXYMESH_AUTH = requests.auth.HTTPProxyAuth(
            get_environment_variable('PROXYMESH_USER'),
            get_environment_variable('PROXYMESH_PASS'))
        PROXYMESH_PROXIES = {'http': 'http://us.proxymesh.com:31280'}
    except KeyError:
        print("PROXYMESH_USER and/or PROXYMESH_PASS missing from environment")

    def __init__(self, **kwargs):
        """
        Creates an instance that will use proxies and authorization if proxies & auth are provided.
        :type proxies: dict
        :type auth: requests.auth.HTTPProxyAuth
        """
        # CONFIG = json.load(open(os.path.dirname(__file__) + "/config.json"))
        default_kwargs = {
            'get_header_fun': 'header_with_random_firefox_user_agent',
            'random_agents': [],
            'header': {
                "User-Agent":
                "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "close",
                "DNT": "1"
            },
            'proxies': None,
            'auth': None,
            'timeout': 10.0
        }
        self = ut.util.pobj.set_attributes(self, kwargs, default_kwargs)
        # self.random_agents = []
        # self.proxies = proxies
        # self.auth = auth
        # self.timeout = 10.0
        # self.webdriver = None

    @classmethod
    def options(cls):
        print '''
            get_header_fun:
                'header_with_random_firefox_user_agent' (default)
                'fixed_header'
        '''

    def get_header(self):
        if self.get_header_fun == 'header_with_random_firefox_user_agent':
            return self.header_with_random_user_agent(
                filter_in_user_agents='^Mozilla.*')
        elif self.get_header_fun == 'fixed_header':
            return self.header
        else:
            raise ValueError("Unknown get_header_fun value")

    def header_with_random_user_agent(self, filter_in_user_agents=None):
        """
        Returns a header with a random user agent
        """
        headers = self.HEADERS.copy()
        headers['User-Agent'] = self.random_user_agent(
            filter_in_user_agents=filter_in_user_agents)
        return headers

    def random_user_agent(self, filter_in_user_agents=None):
        """
        Returns a random user agent from the full list of user agents.
        Cycles through all agents before re-sampling from the full list again.
        """
        if not self.random_agents:
            self.random_agents = self.USER_AGENTS[:]
            if filter_in_user_agents:
                self.random_agents = filter(
                    re.compile(filter_in_user_agents).search,
                    self.random_agents)
        random.shuffle(self.random_agents)
        return self.random_agents.pop()

    def get_html_through_tor_unfinished(self, url):
        import warnings
        warnings.warn("This hasn't really been coded yet...")
        proxy_support = urllib2.ProxyHandler({"http": "127.0.0.1:8118"})
        opener = urllib2.build_opener(proxy_support)
        opener.addheaders = [('User-agent', self.random_user_agent())]
        return opener.open(url).read()

    def get_html_through_requests(self, url, url_params={}, timeout=None):
        r = self.get_response_through_requests(url=url,
                                               url_params=url_params,
                                               timeout=timeout)
        # return the text if no error
        if not r.ok:
            raise ValueError('HTTP Error: {} for url {} (headers="{}")'.format(
                r.status_code, url, str(r.headers)))
        else:
            return r.text

    def get_response_through_requests(self, url, url_params={}, timeout=None):
        timeout = timeout or self.timeout
        header = self.header_with_random_user_agent()
        # get the content for the url
        r = requests.get(url,
                         headers=header,
                         params=url_params,
                         timeout=timeout,
                         proxies=self.proxies,
                         auth=self.auth)
        return r

    def get_html_through_selenium(self, url, url_params={}, timeout=None):
        ValueError("You haven't written this yet!!")
        pass
        # timeout = timeout or self.timeout
        # header = self.header_with_random_user_agent()
        # # get the content for the url
        # r = requests.get(url, headers=header, params=url_params,
        #                  timeout=timeout, proxies=self.proxies, auth=self.auth)
        # # return the text if no error
        # if not r.ok:
        #     raise ValueError('HTTP Error: {} for url {} (user-agent="{}"'.format(r.status_code, url, header['User-Agent'] ))
        # else:
        #     return r.text

    @classmethod
    def with_ven_anonymizer(cls):
        return Browser(proxies={
            'http':
            get_environment_variable('VEN_ANONYMIZER_PROX_HTTP')
        },
                       auth=HTTPProxyAuth(
                           get_environment_variable('VEN_ANONYMIZER_LOGIN'),
                           get_environment_variable('VEN_ANONYMIZER_PASS')))

    @classmethod
    def with_proxymesh(cls):
        return Browser(proxies=cls.PROXYMESH_PROXIES, auth=cls.PROXYMESH_AUTH)

    @classmethod
    def firefox_selenium(cls, **kwargs):
        default_kwargs = {'webdriver': webdriver.Firefox()}
        kwargs = dict(default_kwargs, **kwargs)
        return Browser(**kwargs)
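
A short usage sketch of Browser, using only methods defined above:

b = Browser(get_header_fun='fixed_header', timeout=5.0)
html = b.get_html_through_requests('http://example.com')

# Or route through proxymesh (requires PROXYMESH_USER / PROXYMESH_PASS):
pm = Browser.with_proxymesh()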
Example #11
class Amazon(object):
    url_template = dict()
    url_template['product_page'] = 'http://www.amazon.{country}/dp/{asin}/'
    url_template['product_reviews'] = 'http://www.amazon.{country}/product-reviews/{asin}/'

    regexp = dict()
    regexp['nreviews_re'] = {'com': re.compile('\d[\d,]*(?= customer review)'),
                             'co.uk': re.compile('\d[\d,]*(?= customer review)'),
                             'in': re.compile('\d[\d,]*(?= customer review)'),
                             'de': re.compile('\d[\d\.]*(?= Kundenrezens\w\w)')}
    regexp['no_reviews_re'] = {'com': re.compile('no customer reviews'),
                               'co.uk': re.compile('no customer reviews'),
                               'in': re.compile('no customer reviews'),
                               'de': re.compile('Noch keine Kundenrezensionen')}
    # regexp['average_rating_re'] = {'com': re.compile('')}
    default = dict()
    default['country'] = 'com'
    # default['requests_kwargs'] = {}
    default['requests_kwargs'] = {
        'proxies': {'http': 'http://us.proxymesh.com:31280'},
        'auth': requests.auth.HTTPProxyAuth(get_environment_variable('PROXYMESH_USER'),
                                            get_environment_variable('PROXYMESH_PASS'))
    }

    @classmethod
    def url(cls, what='product_page', **kwargs):
        kwargs = dict(Amazon.default, **kwargs)
        return cls.url_template[what].format(**kwargs)

    @classmethod
    def slurp(cls, what='product_page', **kwargs):
        kwargs = dict(Amazon.default, **kwargs)
        r = requests.get(Amazon.url(what=what, **kwargs), **Amazon.default['requests_kwargs'])
        if r.status_code == 200:
            return r.text
        else:  # try again and return no matter what
            r = requests.get(Amazon.url(what=what, **kwargs), **Amazon.default['requests_kwargs'])
            return r.text

    # @classmethod
    # def get_dynamic_book_info(cls, asin, **kwargs):
    #     html = Amazon.slurp(what='product_page', **kwargs)
    #     b = bs3_BeautifulSoup(b)


    @classmethod
    def get_info(cls, asin, country='co.uk', **kwargs):
        info = {'date': datetime.now()}
        info = dict(info, **{'sales_ranks': cls.get_sales_rank(asin=asin, country=country, **kwargs)})
        # info = dict(info, **{'num_of_reviews': cls.get_number_of_reviews(asin, country='co.uk', **kwargs)})
        return info

    @classmethod
    def get_sales_rank(cls, **kwargs):
        html = Amazon.slurp(what='product_page', **kwargs)
        sales_rank = [Amazon.parse_sales_rank(html, **kwargs)]
        sales_rank += Amazon.parse_sales_sub_rank(html, **kwargs)
        return sales_rank

    @classmethod
    def parse_sales_rank(cls, b, **kwargs):
        if not isinstance(b, bs3_BeautifulSoup):
            b = bs3_BeautifulSoup(b)
        t = b.find('li', attrs={'id': re.compile('SalesRank')})
        sales_rank_re = re.compile('(\d[\d,]+) in ([\w\ ]+)')
        tt = sales_rank_re.findall(t.text)
        return {'sales_rank': int(re.compile('\D').sub('', tt[0][0])),
                'sales_rank_category': tt[0][1].strip(' ')}

    @classmethod
    def parse_sales_sub_rank(cls, b, **kwargs):
        if not isinstance(b, bs3_BeautifulSoup):
            b = bs3_BeautifulSoup(b)
        t = b.find('li', attrs={'id': re.compile('SalesRank')})
        tt = t.findAll('li', 'zg_hrsr_item')
        sales_sub_rank = list()
        for tti in tt:
            d = dict()
            d['sales_rank'] = int(re.compile('\D').sub('', tti.find('span', 'zg_hrsr_rank').text))
            ttt = tti.find('span', 'zg_hrsr_ladder')
            ttt = ttt.text.split('&nbsp;')[1]
            d['sales_rank_category'] = ttt.split('&gt;')
            sales_sub_rank.append(d)
        return sales_sub_rank

    @classmethod
    def parse_avg_rating(cls, b, **kwargs):
        if not isinstance(b, bs3_BeautifulSoup):
            b = bs3_BeautifulSoup(b)
        t = b.find('span', 'reviewCountTextLinkedHistogram')
        return float(re.compile('[\d\.]+').findall(t['title'])[0])

    @classmethod
    def parse_product_title(cls, b, **kwargs):
        if not isinstance(b, bs3_BeautifulSoup):
            b = bs3_BeautifulSoup(b)
        t = b.find('div', attrs={'id': 'title'})
        return t.find('span', attrs={'id': 'productTitle'}).text

    @staticmethod
    def test_rating_scrape_with_vanessas_book():
        html = Amazon.slurp(what='product_page', country='co.uk', asin='1857886127')

    @staticmethod
    def get_number_of_reviews(asin, country, **kwargs):
        url = 'http://www.amazon.{country}/product-reviews/{asin}'.format(country=country, asin=asin)
        html = requests.get(url).text
        try:
            return int(re.compile('\D').sub('', Amazon.regexp['nreviews_re'][country].search(html).group(0)))
        except Exception:
            if Amazon.regexp['no_reviews_re'][country].search(html):
                return 0
            else:
                return None  # to distinguish from 0, and handle more cases if necessary
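
A usage sketch, reusing the ASIN from the test method above (the default requests_kwargs route through proxymesh, so PROXYMESH_USER / PROXYMESH_PASS must be set):

html = Amazon.slurp(what='product_page', asin='1857886127', country='co.uk')
print(Amazon.parse_sales_rank(html))
print(Amazon.get_number_of_reviews('1857886127', 'co.uk'))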
Example #12
    url_from_filename
'''

from ut.util.importing import get_environment_variable
import logging

import urllib2
import urlparse
from bs4 import BeautifulSoup
from BeautifulSoup import BeautifulSoup as bs3_BeautifulSoup

import requests
from selenium import webdriver

# from selenium.webdriver.common.keys import Keys

import ut
import ut.parse.util
from ut.parse.util import disp_html
import ut.util.log
import ut.webscrape.util
from ut.webscrape.util import filename_from_url
from ut.webscrape.util import url_from_filename
from ut.parse.util import open_html_in_firefox

logging.basicConfig(filename=get_environment_variable('DEFAULT_LOG_FILE'),
                    filemode='w',
                    level=logging.DEBUG)

print "logging in use:\n   %s" % get_environment_variable('DEFAULT_LOG_FILE')
Example #13
File: yboss.py Project: SRHerzog/ut
class Yboss(object):

    default_yboss_attrs = {
        'oauth_consumer_key': get_environment_variable('MON_YB_KEY'),
        'oauth_consumer_secret': get_environment_variable('MON_YB_SECRET'),
        'default_service': 'limitedweb',
        'default_params': {},
        'default_save_folder': os.getcwd()
    }

    def __init__(self, **kwargs):
        self.__dict__.update(Yboss.default_yboss_attrs)
        self.__dict__.update(kwargs)
        self.consumer = oauth2.Consumer(key=self.oauth_consumer_key,
                                        secret=self.oauth_consumer_secret)

    ####################################################################################
    ###### SLURPERS ####################################################################
    def slurp_raw(self, query, service=None, params=None):
        service, params = self.fill_with_defaults(service, params)
        url = self.url(query, service, params)

        request_params = {
            'oauth_version': '1.0',
            'oauth_nonce': oauth2.generate_nonce(),
            'oauth_timestamp': int(time.time()),
        }
        oauth_request = oauth2.Request(method='GET',
                                       url=url,
                                       parameters=request_params)
        oauth_request.sign_request(oauth2.SignatureMethod_HMAC_SHA1(),
                                   self.consumer, None)
        oauth_header = oauth_request.to_header(realm='yahooapis.com')

        # Get search results
        http = httplib2.Http()
        resp, content = http.request(url, 'GET', headers=oauth_header)
        return {'content': content, 'resp': resp}
        # return "{'resp': %s, 'content': %s}" % (resp, content)

    def slurp_content(self, query, service=None, params=None):
        resp_content = self.slurp_raw(query, service=service, params=params)
        return resp_content['content']

    def slurp_content_as_dict(self, query, service=None, params=None):
        return json.loads(
            self.slurp_content(query, service=service, params=params))

    def slurp_content_and_save(self,
                               query,
                               service=None,
                               params=None,
                               filepath=None):
        filepath = self.get_filepath_for_params(query=query,
                                                service=service,
                                                params=params,
                                                filepath=filepath)
        resp_content = self.slurp_raw(query, service=service, params=params)
        json.dump(resp_content['content'], open(filepath, 'w'))

    def slurp_df_and_save(self,
                          query,
                          service=None,
                          params=None,
                          filepath=None,
                          n_pages=1):
        filepath = self.get_filepath_for_params(query=query,
                                                service=service,
                                                params=params,
                                                filepath=filepath)
        df = self.slurp_results_df_multiple_pages(query=query,
                                                  service=service,
                                                  params=params,
                                                  n_pages=n_pages)
        pd.to_pickle(df, filepath)
        return df

    def get_df(self,
               query,
               service=None,
               params=None,
               filepath=None,
               n_pages=1,
               overwrite=False):
        filepath = self.get_filepath_for_params(query=query,
                                                service=service,
                                                params=params,
                                                filepath=filepath)
        if not overwrite and os.path.exists(filepath):
            return pd.read_pickle(filepath)
        else:
            return self.slurp_df_and_save(query=query,
                                          service=service,
                                          params=params,
                                          filepath=filepath,
                                          n_pages=n_pages)

    def slurp_results_df(self, query, service=None, params=None):
        content_dict = json.loads(
            self.slurp_content(query, service=service, params=params))
        content_dict = self.get_item(content_dict)
        return self.content_to_results_df(content_dict)

    def content_to_results_df(self, content_dict):
        df = pd.DataFrame(content_dict['results'])
        start_position = int(content_dict['start'])
        df['position'] = range(start_position, start_position + len(df))
        df._metadata = {'totalresults': int(content_dict['totalresults'])}
        return df

    def slurp_results_df_multiple_pages(self,
                                        query,
                                        service=None,
                                        params=None,
                                        n_pages=5):
        service, params = self.fill_with_defaults(service, params)
        df = pd.DataFrame()
        new_df = pd.DataFrame()
        for i in range(n_pages):
            # print "slurping %d/%d" % (i, n_pages-1)
            try:
                new_df = self.slurp_results_df(query,
                                               service=service,
                                               params=params)
                df = pd.concat([df, new_df])
            except Exception:
                break
            params['start'] += params['count']
        df._metadata = new_df._metadata
        return df

    ####################################################################################
    ###### CONTENT ACCESSORS ###########################################################

    def load_json_dict(self, filepath):
        filepath = self.get_filepath(filepath)
        return json.loads(json.load(open(filepath, 'r')))

    def get_service_results_getter(self, service):
        service = service or self.default_service
        return mk_fixed_coordinates_value_getter(['bossresponse', service])

    @classmethod
    def get_results(cls, content_dict):
        return Yboss.get_item(content_dict)['results']

    # def get_results

    @classmethod
    def get_totalresults(cls, content_dict):
        return int(Yboss.get_item(content_dict)['totalresults'])

    @classmethod
    def get_item(cls, content_dict):
        content_dict = content_dict['bossresponse']
        return content_dict[content_dict.keys()[0]]

    @classmethod
    def mk_fixed_coordinates_value_getter(cls, coord_list):
        return mk_fixed_coordinates_value_getter(['bossresponse'] + coord_list)

    ####################################################################################
    ###### UTILS #######################################################################
    def url(self, query, service=None, params=None):
        service = service or self.default_service
        params = params or self.default_params
        return yboss_search_root_url + self.rel_url(
            query=query, service=service, params=params)

    def rel_url(self, query, service=None, params=None):
        service = service or self.default_service
        params = params or self.default_params
        params = Yboss.mk_req_params(service, params)
        return "%s?q=%s%s" % (service, self.url_encode_str(query),
                              self.url_encode_params(params))

    def get_filename_for_query(self, query, service=None, params=None):
        return self.rel_url(query, service=service,
                            params=params).replace('?', '--')

    def get_filepath(self, filespec):
        if os.path.exists(filespec):
            file_path = filespec
        else:
            file_path = os.path.join(self.default_save_folder, filespec)
            if not os.path.exists(file_path):
                # assume it's a query, and derive what the filepath should be
                file_path = os.path.join(self.default_save_folder,
                                         self.get_filename_for_query(filespec))
        return file_path

    def get_filepath_for_params(self,
                                query,
                                service=None,
                                params=None,
                                filepath=None):
        filepath = filepath or self.default_save_folder
        if os.path.isdir(
                filepath
        ):  # if filepath is a directory, need to make a filename for it
            filepath = os.path.join(
                filepath,
                self.get_filename_for_query(query,
                                            service=service,
                                            params=params))
        return filepath

    def fill_with_defaults(self, service=None, params=None):
        service = service or self.default_service
        params = Yboss.mk_req_params(service, params)
        return service, params

    @classmethod
    def mk_req_params(cls, service, params=None):
        params = params or {}
        return dict(
            dict(default_universal_args, **service_default_req_args[service]),
            **params)

    @classmethod
    def url_encode_str(cls, s):
        return url_encode_yboss(s)

    @classmethod
    def url_encode_params(cls, params):
        u = ''
        for p, v in params.iteritems():
            if isinstance(v, basestring):
                u += '&%s=%s' % (p, v)
            else:
                u += '&%s=%s' % (p, str(v))
        return u

    @classmethod
    def print_some_resources(cls):
        print '''
            guide to yahoo BOSS: http://developer.yahoo.com/boss/search/boss_api_guide/index.html
            pricing (by service): http://developer.yahoo.com/boss/search/#pricing
            services: web, limitedweb, images, news, blogs, related
            response fields: http://developer.yahoo.com/boss/search/boss_api_guide/webv2_response.html
            market and languages: http://developer.yahoo.com/boss/search/boss_api_guide/supp_regions_lang.html
                '''

    @classmethod
    def process_df(cls, df):
        df['dispurl'] = df['dispurl'].map(YbossText.remove_html_bold)
        df['title'] = df['title'].apply(YbossText.html2text)
        df['abstract'] = df['abstract'].apply(YbossText.html2text)
        df = ms.daf.manip.reorder_columns_as(df, major_cols + minor_cols)
        df = df.reset_index(drop=True)
        return df
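
A usage sketch (requires MON_YB_KEY / MON_YB_SECRET in the environment; 'limitedweb' is the default service):

yb = Yboss()
df = yb.slurp_results_df('python web scraping')
print(df[['position', 'title']].head())
print(df._metadata['totalresults'])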
Example #14
File: yboss.py Project: SRHerzog/ut
        'style': 'raw'
    },  # age, sort, title, url
    'blogs': {
        'count': 20,
        'style': 'raw'
    },  # age, sort, count, title, url
    'related': {
        'count': 10
    },  # age, sort, count, title, url
    'images': {
        'count': 35
    }  # filter, queryfilter, dimensions, referurl, url
}

default_yboss_attrs = {
    'oauth_consumer_key': get_environment_variable('MON_YB_KEY'),
    'oauth_consumer_secret': get_environment_variable('MON_YB_SECRET'),
    'default_service': 'limitedweb',
    'default_params': {},
    'default_save_folder': os.getcwd()
}

service_list = ['limitedweb', 'web', 'blogs', 'news', 'related', 'images']

major_cols = [
    'query', 'position', 'title', 'abstract', 'dispurl',
    'num_of_slurped_results', 'author'
]
minor_cols = ['date', 'url', 'clickurl']