def read_dir(datestr, print_date=False):
    """Read every file in one day's FTSE100 data directory and combine them.

    :param datestr: name of the per-date subdirectory under the 'FTSE100'
        data directory
    :param print_date: when true, print ``datestr`` before reading
    :return: result of ``pd.concat`` over the per-file ``read_file`` outputs,
        in the order ``os.listdir`` yields the files
    """
    if print_date:
        print(datestr)

    day_dir = mda.data_dir('FTSE100', datestr)
    frames = []
    for ticker_file in os.listdir(day_dir):
        frames.append(read_file(ticker_file, datestr))
    return pd.concat(frames)
def get_s3(download_date):
    """Create an S3Proxy configured for one download date.

    :param download_date: date string used both for the local data directory
        and for the 'raw/<date>/' key prefix in the 'ftse100' bucket
    :return: the constructed S3Proxy instance
    """
    return S3Proxy(
        bucket='ftse100',
        local_dir=mda.data_dir('FTSE100', download_date),
        prefix='raw/' + download_date + '/',
    )
def read():
    """Load the nyt1.csv .. nyt31.csv datasets into a single frame.

    Each file is read from the 'doing_data_science/dds_datasets' data
    directory and tagged with a ``Day`` column holding its file number
    before the frames are concatenated. Column names are lower-cased on
    the combined result.

    :return: the concatenated frame with lower-case column names
    """
    daily_frames = []
    for day in range(1, 32):
        csv_path = mda.data_dir(
            'doing_data_science', 'dds_datasets', 'nyt{}.csv'.format(day))
        daily_frames.append(pd.read_csv(csv_path).assign(Day=day))

    combined = pd.concat(daily_frames)
    combined.columns = [name.lower() for name in combined.columns]
    return combined
from __future__ import division import os import re import time import boto3 from tqdm import tqdm import mda from mda.io.google_finance import LseReader from mda.io.s3 import S3Proxy __author__ = 'mattmcd' dataLoc = mda.data_dir('FTSE100') def get_all(do_copy=False): """Get last 10 days of 1 minute intraday data from FTSE 100 Args: do_copy: copy downloaded files to S3 Returns: <none> Creates saved text files """ reader = LseReader() download_date = time.strftime("%Y%m%d") save_loc = os.path.join(dataLoc, download_date) if not os.path.isdir(save_loc): os.mkdir(save_loc)
def read_file(ticker_file, datestr):
    """Parse one saved ticker file from a day's FTSE100 data directory.

    :param ticker_file: file name inside the per-date directory; the text
        before its first '.' is passed to ``parse_text`` as the ticker
    :param datestr: name of the per-date subdirectory
    :return: whatever ``parse_text`` returns for the file's contents
    """
    file_path = mda.data_dir('FTSE100', datestr, ticker_file)
    # Everything before the first dot is treated as the ticker name.
    ticker_name = ticker_file.partition('.')[0]
    with open(file_path) as handle:
        contents = handle.read()
        return parse_text(contents, ticker=ticker_name)
import mda import urllib2 import numpy as np import pandas as pd import os ftseFile = mda.data_dir('FTSE100', 'FTSE100.csv') class LseReader: def __init__(self, interval=60, period=10): """Constructor :param interval: time in seconds between downloaded values :param period: period in days to download :return: LseReader """ self.ftse100 = pd.read_csv(ftseFile) self.prefixURL = 'https://www.google.com/finance/getprices?' self.interval = interval self.period = period def read_url(self, ticker, interval=None, period=None): """Read intraday history for selected ticker on LSE :param interval: time in seconds between downloaded values :param period: period in days to download :return: txt : downloaded price data as string :return: interval : interval in seconds between downloaded prices """ if not interval: interval = self.interval
# Exploratory analysis of session durations and 'TryItClick' event timing,
# split by app, from a pickled event log.
import numpy as np
import pandas as pd
import mda
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted location.
df = pd.read_pickle(mda.data_dir('Events_20161207_20161208.pkl'))

# First and last collector timestamp per session, and the elapsed seconds
# between them.
sessions = df.groupby('session_id')['collector_tstamp'].agg(['min', 'max'])
sessions['duration'] = (sessions['max'] - sessions['min']) / pd.Timedelta(
    1, 's')

# Attach each session's app_id; `sessions` is indexed by session_id here.
sessions = pd.merge(sessions, df[['session_id', 'app_id']].drop_duplicates(),
                    left_index=True, right_on='session_id')

# Map raw app ids to display names. Rows matching neither rule are left
# without an 'app' value. na=False makes a missing app_id count as
# "no match" instead of putting NaN into the boolean mask.
sessions.loc[sessions.app_id == 'phone', 'app'] = 'Wanda'
sessions.loc[sessions.app_id.str.contains('mondo', na=False), 'app'] = 'Mondo'

# Normalised histogram of durations between 10 and 600 seconds, one panel
# per app. `density` replaces the `normed` keyword, which matplotlib
# removed in 3.1.
g = sns.FacetGrid(sessions.query('10 < duration < 600'), col='app')
g.map(plt.hist, 'duration', density=True)

# Join the per-session columns back onto the events (merge uses the columns
# the two frames share) and compute seconds since the session's first event.
df = pd.merge(df, sessions)
df['t'] = (df['collector_tstamp'] - df['min']) / pd.Timedelta(1, 's')

# Flag 'TryItClick' events; rows with a missing se_category are treated as
# non-matching rather than yielding NaN in the mask.
df.loc[df.se_category.str.contains('TryItClick', na=False), 'event'] = 1

# Event flag against time-in-session for the first 600 seconds, per app.
g = sns.FacetGrid(df.query('t < 600'), row='app')
g.map(plt.scatter, 't', 'event', alpha=0.05)

# Normalised distribution of when flagged events occur within a session.
g = sns.FacetGrid(df.query('event == 1 and t < 600'), row='app')
g.map(plt.hist, 't', density=True)