Example #1
def format_tweets(data):
    p(data.keys())
    p(data['search_metadata'])
    p(data['statuses'][0].keys())
    tweet_fields = ['created_at', 'from_user', 'id', 'text']
    tweets = DataFrame(data['statuses'], columns=tweet_fields)
    p(tweets)
    p(tweets.ix[7])
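A minimal self-contained sketch of the same pattern, runnable without the Twitter payload: pandas builds a DataFrame from a list of dicts, keeping only the requested columns. The sample records are invented for illustration, and .iloc is the modern replacement for the old .ix row lookup used above.

from pandas import DataFrame

statuses = [
    {'created_at': '2014-01-01', 'from_user': 'alice', 'id': 1, 'text': 'hi'},
    {'created_at': '2014-01-02', 'from_user': 'bob', 'id': 2, 'text': 'yo'},
]
tweets = DataFrame(statuses, columns=['created_at', 'from_user', 'id', 'text'])
print(tweets)          # columns appear in the requested order
print(tweets.iloc[0])  # positional row lookup, the modern spelling of .ix[0]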
Example #2
def main():
    '''
    HTML and WebAPI
    '''
    twitter_secret_path = study.ROOT_DIR + '/twitter_secret.json'
    with open(twitter_secret_path) as f:
        tw_secret = json.load(f)
        p(tw_secret)

    format_tweets(search_tweets(tw_secret))
    format_tweets(search_and_post_status(tw_secret))
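Examples #3 and #4 read the consumer credentials from twitter_secret.json via the keys api_key and api_secret_key. A hypothetical file with placeholder values could be produced like this:

import json

with open('twitter_secret.json', 'w') as f:
    json.dump({'api_key': 'YOUR-CONSUMER-KEY',
               'api_secret_key': 'YOUR-CONSUMER-SECRET'}, f, indent=2)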
Example #3
def search_and_post_status(tw_secret):
    """
    oauth by requests module
    """
    CONSUMER_KEY = tw_secret['api_key']
    CONSUMER_SECRET = tw_secret['api_secret_key']
    # request token
    request_token_url = 'https://api.twitter.com/oauth/request_token'
    auth = OAuth1(CONSUMER_KEY, CONSUMER_SECRET, callback_uri=u'oob')
    res = requests.post(request_token_url, auth=auth)
    request_token = dict(urlparse.parse_qsl(res.text))
    p(request_token)

    # access token
    authorize_url = 'https://api.twitter.com/oauth/authorize'
    access_token_url = 'https://api.twitter.com/oauth/access_token'
    # Authorize
    print 'Auth link:'
    print '{0}?oauth_token={1}'.format(authorize_url, request_token['oauth_token'])
    print
    oauth_verifier = unicode(raw_input('What is the PIN? '))
    auth = OAuth1(CONSUMER_KEY, CONSUMER_SECRET,
                  request_token['oauth_token'], request_token['oauth_token_secret'],
                  verifier=oauth_verifier)
    res = requests.post(access_token_url, auth=auth)
    access_token = dict(urlparse.parse_qsl(res.text))
    p(access_token)

    # search
    search_url = 'https://api.twitter.com/1.1/search/tweets.json'
    query = urllib.quote('python pandas')
    auth = OAuth1(CONSUMER_KEY, CONSUMER_SECRET,
                  access_token['oauth_token'], access_token['oauth_token_secret'])
    res = requests.get(search_url + '?q=' + query, auth=auth)
    tweets = json.loads(res.text)
    format_tweets(tweets)

    # post status
    update_url = 'https://api.twitter.com/1.1/statuses/update.json'
    data = {
        'status': 'This status is posted by requests module.',
    }
    res = requests.post(update_url, data=data, auth=auth)
    p(res.text)
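The same three-legged OAuth 1.0a dance can be written more compactly with requests_oauthlib's OAuth1Session, which carries the tokens between steps. A sketch, assuming the same consumer key/secret pair (on Python 3, raw_input becomes input):

from requests_oauthlib import OAuth1Session

def oauth_dance(consumer_key, consumer_secret):
    session = OAuth1Session(consumer_key, consumer_secret, callback_uri='oob')
    session.fetch_request_token('https://api.twitter.com/oauth/request_token')
    url = session.authorization_url('https://api.twitter.com/oauth/authorize')
    print('Auth link: ' + url)
    verifier = raw_input('What is the PIN? ')
    # returns a dict containing oauth_token and oauth_token_secret
    return session.fetch_access_token('https://api.twitter.com/oauth/access_token',
                                      verifier=verifier)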
Example #4
def search_tweets(tw_secret):
    '''
    Using twitter module
    '''
    MY_TWITTER_CREDS = os.path.expanduser(study.ROOT_DIR + '/.my_app_credentials')
    CONSUMER_KEY = tw_secret['api_key']
    CONSUMER_SECRET = tw_secret['api_secret_key']

    if not os.path.exists(MY_TWITTER_CREDS):
        twitter.oauth_dance('My App Name', CONSUMER_KEY, CONSUMER_SECRET, MY_TWITTER_CREDS)

    # oauth_token is the access token
    # oauth_secret is the access token secret
    oauth_token, oauth_secret = twitter.read_token_file(MY_TWITTER_CREDS)
    p(oauth_token)
    p(oauth_secret)

    auth = twitter.OAuth(oauth_token, oauth_secret, CONSUMER_KEY, CONSUMER_SECRET)
    t = twitter.Twitter(auth=auth)
    q = urllib.quote('python pandas')
    p(q)

    return t.search.tweets(q=q)
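A follow-up sketch, not in the original: the twitter module maps attribute access onto REST endpoints, so the same authenticated twitter.Twitter object returned above can also post a status through /1.1/statuses/update.json.

def post_status(t, text):
    # t is the authenticated twitter.Twitter instance from search_tweets
    return t.statuses.update(status=text)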
Example #5
def main():
    out_dir = os.path.dirname(__file__)

    ex1_path = study.DATA_DIR + '/ch06/ex1.csv'
    cat(ex1_path)

    df = pd.read_csv(ex1_path)
    p(df)
    p(pd.read_table(ex1_path, sep=','))

    p('headerless---------------------')
    ex2_path = study.DATA_DIR + '/ch06/ex2.csv'
    cat(ex2_path)
    names = ['a','b', 'c', 'd', 'message']
    p(pd.read_csv(ex2_path, header=None))
    p(pd.read_csv(ex2_path, names=names))
    p(pd.read_csv(ex2_path, names=names, index_col='message'))

    p('hierarchy index---------------------')
    mindex_path = study.DATA_DIR + '/ch06/csv_mindex.csv'
    cat(mindex_path)
    p(pd.read_csv(mindex_path, index_col=['key1', 'key2']))

    p('separate by regex-------------')
    ex3_path = study.DATA_DIR + '/ch06/ex3.csv'
    cat(ex3_path)
    p(pd.read_csv(ex3_path, sep=r'\s+'))

    p('skip rows-----------')
    ex4_path = study.DATA_DIR + '/ch06/ex4.csv'
    cat(ex4_path)
    p(pd.read_csv(ex4_path, skiprows=[0,2,3]))

    p('N/A------------------')
    ex5_path = study.DATA_DIR + '/ch06/ex5.csv'
    cat(ex5_path)
    result = pd.read_csv(ex5_path)
    p(result)
    p(pd.isnull(result))
    result = pd.read_csv(ex5_path, na_values=['NULL', '12']) # 12 is NA
    p(result)

    p('N/A dict------------------')
    sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
    p(sentinels)
    p(pd.read_csv(ex5_path, na_values=sentinels))

    p('6.1.1 read data chunk size---------------------')
    ex6_path = study.DATA_DIR + '/ch06/ex6.csv'
    p(pd.read_csv(ex6_path).count())
    p(pd.read_csv(ex6_path, nrows=5))
    chunker = pd.read_csv(ex6_path, chunksize=1000)
    p(chunker)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['key'].value_counts(), fill_value=0)
    tot = tot.order(ascending=False)  # order() was renamed sort_values() in later pandas
    p(tot[:10])

    p('6.1.2 write---------------------')
    data = pd.read_csv(ex5_path)
    p(data)

    ex5_out_path = out_dir + '/ex5_out.csv'
    data.to_csv(ex5_out_path)
    cat(ex5_out_path)  # show the file we just wrote

    data.to_csv(sys.stdout, index=False, header=False)
    print ''
    data.to_csv(sys.stdout, index=False, cols=list('abc'))  # 'cols' was renamed 'columns' in later pandas
    print ''

    p('Series--------------')
    tseries_out_path = out_dir + '/tseries_out.csv'
    dates = pd.date_range('1/1/2000', periods=7)
    ts = Series(np.arange(7), index=dates)
    ts.to_csv(tseries_out_path)
    cat(tseries_out_path)
    p(Series.from_csv(tseries_out_path, parse_dates=True))
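    # Series.from_csv was deprecated in later pandas; a read_csv equivalent
    # for the same two-column layout (an assumption) would be:
    #   pd.read_csv(tseries_out_path, header=None, index_col=0,
    #               parse_dates=True)[1]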

    p('6.1.3 csv-------------------------')
    ex7_path = study.DATA_DIR + '/ch06/ex7.csv'
    cat(ex7_path)
    f = open(ex7_path)
    reader = csv.reader(f)
    for line in reader:
        print line
    lines = list(csv.reader(open(ex7_path)))
    header, values = lines[0], lines[1:]
    data_dict = {h: v for h,v in zip(header, zip(*values))}
    p(data_dict)

    my_data_out_path = out_dir + '/mydata.csv'
    with open(my_data_out_path, 'w') as fp:
        writer = csv.writer(fp, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))
    cat(my_data_out_path)
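    # my_dialect is defined elsewhere in the source file; a definition in
    # the spirit of the book this follows (an assumption) would be:
    #   class my_dialect(csv.Dialect):
    #       lineterminator = '\n'
    #       delimiter = ';'
    #       quotechar = '"'
    #       quoting = csv.QUOTE_MINIMAL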

    p('6.1.4 JSON-------------------------')
    obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
             {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
    result = json.loads(obj)
    p(result)
    asjson = json.dumps(result)
    p(asjson)
    siblings = DataFrame(result['siblings'], columns=['name', 'age'])
    p(siblings)
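    # later pandas can also round-trip JSON directly (a sketch):
    #   asjson = siblings.to_json()
    #   pd.read_json(asjson)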

    p('6.1.4 XML/HTML Web Scraping-------------------------')
    url = ''  # e.g. 'http://finance.yahoo.com/q/op?s=AAPL+Options'
    if url:
        parsed = parse(urlopen(url))
        doc = parsed.getroot()
        p([lnk.get('href') for lnk in doc.findall('.//a')][-10:])

        tables = doc.findall('.//table')
        p(parse_options_data(tables[9])[:5])
        p(parse_options_data(tables[13])[:5])
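        # parse_options_data comes from elsewhere in the source file; the
        # book this follows builds it roughly like so (an assumption):
        #   def parse_options_data(table):
        #       rows = table.findall('.//tr')
        #       header = [c.text_content() for c in rows[0].findall('.//th')]
        #       data = [[c.text_content() for c in r.findall('.//td')]
        #               for r in rows[1:]]
        #       return TextParser(data, names=header).get_chunk()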

    p('6.1.5 Read XML-------------------------')
    xml_path = out_dir + '/Performance_MNR.xml'
    xml_content ="""
<INDICATOR>
    <INDICATOR_SEQ>373889</INDICATOR_SEQ>
    <PARENT_SEQ></PARENT_SEQ>
    <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
    <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
    <DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION>
    <PERIOD_YEAR>2011</PERIOD_YEAR>
    <PERIOD_MONTH>12</PERIOD_MONTH>
    <CATEGORY>Service Indicators</CATEGORY>
    <FREQUENCY>M</FREQUENCY>
    <DESIRED_CHANGE>U</DESIRED_CHANGE>
    <INDICATOR_UNIT>%</INDICATOR_UNIT>
    <DECIMAL_PLACES>1</DECIMAL_PLACES>
    <YTD_TARGET>97.00</YTD_TARGET>
    <YTD_ACTUAL></YTD_ACTUAL>
    <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
    <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
"""
    if not os.path.exists(xml_path):
        with open(xml_path, 'w') as f:
            f.write(xml_content)
    parsed = objectify.parse(open(xml_path))
    root = parsed.getroot()
    data = []
    skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
                   'DESIRED_SEQ', 'DECIMAL_PLACES']
    p(dir(root))
    for elt in root:  # the book iterates root.INDICATOR; this file's root is a single INDICATOR
        el_data = {}
        for child in elt.getchildren():
            if child.tag in skip_fields:
                continue
            el_data[child.tag] = child.pyval
        data.append(el_data)
    perf = DataFrame(data)
    p(perf)

    tag = '<a href="http://google.com">Google</a>'
    root = objectify.parse(StringIO.StringIO(tag)).getroot()
    p(root)
    p(root.get('href'))
    p(root.text)
Example #6
def main():
    """
    Binary data format
    """
    out_dir = os.path.dirname(__file__)

    ex1_path = study.DATA_DIR + '/ch06/ex1.csv'
    cat(ex1_path)
    frame = pd.read_csv(ex1_path)
    p(frame)

    out_pickle = out_dir + '/frame_pickle'
    # deprecated
    # frame.save(out_pickle)
    # pd.load(out_pickle)
    frame.to_pickle(out_pickle)
    p(pd.read_pickle(out_pickle))

    p('6.2.1 Hierarchical Data Format(HDF)----------------')
    h5_path = out_dir + '/mydata.h5'
    store = pd.HDFStore(h5_path)
    store['obj1'] = frame
    store['obj_col1'] = frame['a']
    p(store)
    p(store.obj1)
    store.close()  # release the HDF5 file handle when done

    p('6.2.2 Excel-------------------')
    xls_file = pd.ExcelFile(out_dir + '/data.xlsx')
    table = xls_file.parse('Sheet1')
    p(table)
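Later pandas wraps the ExcelFile + parse pair in a single call; assuming the same workbook layout, a standalone equivalent would be:

import os
import pandas as pd

out_dir = os.path.dirname(__file__)
table = pd.read_excel(out_dir + '/data.xlsx', 'Sheet1')  # ExcelFile + parse in one step
print(table)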
Example #7
def main():
    """
    Interacting with databases (SQLite and MongoDB)
    """
    out_dir = os.path.dirname(__file__)

    query = """
CREATE TABLE test
(
    a VARCHAR(20)
    , b VARCHAR(20)
    , c REAL
    , d INTEGER
);"""
    con = sqlite3.connect(out_dir + '/ch06-sqlite.db')
    try:
        con.execute(query)

        data = [('Atlanta', 'Georgia', 1.25, 6),
                ('Tallahassee', 'Florida', 2.6, 3),
                ('Sacramento', 'California', 1.7, 5)]
        stmt = 'INSERT INTO test VALUES(?, ?, ?, ?)'
        con.executemany(stmt, data)
        con.commit()
    except sqlite3.OperationalError as e:
        print e  # e.message is deprecated on Python 2.6+
        print traceback.format_exc()
    finally:
        p('finally')

    cursor = con.execute('select * from test')
    rows = cursor.fetchall()
    p(rows)
    p(cursor.description)
    p(DataFrame(rows, columns=zip(*cursor.description)[0]))
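    # note: zip() returns an iterator on Python 3, so the subscript above
    #   would become list(zip(*cursor.description))[0] there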
    p(sql.read_sql('select a, b, c from test', con))
    # deprecated
    # p(sql.read_frame('select a, b, c from test', con))
    con.close()

    p('6.4.1 MongoDB------------------')
    con = pymongo.Connection('localhost', port=27017)  # MongoClient in pymongo >= 3
    tweets = con.db.tweets

    columns = ['created_at', 'from_user', 'id', 'text']
    date_combine = datetime.combine(date(2005, 7, 14), time(12, 30))
    data = [
        [date.today().isoformat(),                     'a', 1, 'aa'],
        [str(date.today()),                            'b', 2, 'bb'],
        [date.today().strftime('%Y-%m-%d %H:%M:%S'),   'c', 3, 'cc'],
        [datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'd', 4, 'dd'],
        [date_combine.strftime('%Y-%m-%d %H:%M:%S'),   'd', 4, 'dd'],
    ]
    for d in data:
        r = dict(zip(columns, d))
        print r
        tweets.save(r)

    cursor = tweets.find({'from_user': '******'})
    p(DataFrame(list(cursor), columns=columns))
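pymongo later removed Connection in favor of MongoClient and save() in favor of insert_one(). A sketch of the same insert flow against pymongo >= 3, keeping the hypothetical db.tweets layout:

from pymongo import MongoClient

def save_tweets(records):
    client = MongoClient('localhost', 27017)
    coll = client.db.tweets  # database 'db', collection 'tweets'
    for r in records:
        coll.insert_one(dict(r))  # copy first; insert_one adds _id in place
    return coll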